@lojban/semantic-search-mcp 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/package.json +2 -4
- package/src/index.ts +164 -178
- package/src/storage.ts +74 -103
package/README.md
CHANGED
|
@@ -12,7 +12,7 @@ Use it in **Cursor**, **Claude Code**, or any IDE that supports MCP to search th
|
|
|
12
12
|
|
|
13
13
|
## How it works
|
|
14
14
|
|
|
15
|
-
- **Indexing**: Scans directories for `.txt`, `.md`, `.tsv`, `.csv`, `.json`, `.html`, `.xml`. Each non-empty line gets a vector embedding (via [Hugging Face Transformers.js](https://huggingface.co/docs/transformers.js), model `Xenova/all-MiniLM-L6-v2`) and is stored in a local SQLite database with [
|
|
15
|
+
- **Indexing**: Scans directories for `.txt`, `.md`, `.tsv`, `.csv`, `.json`, `.html`, `.xml`. Each non-empty line gets a vector embedding (via [Hugging Face Transformers.js](https://huggingface.co/docs/transformers.js), model `Xenova/all-MiniLM-L6-v2`) and is stored in a local SQLite database with [@dao-xyz/sqlite3-vec](https://www.npmjs.com/package/@dao-xyz/sqlite3-vec) (SQLite + sqlite-vec for Node and browser).
|
|
16
16
|
- **Search**: You send a natural-language query; the server embeds it and returns the closest lines by cosine similarity.
|
|
17
17
|
- **Storage**: Index is stored in your project's `.semantic-search/data/` (or set `SEMANTIC_SEARCH_DATA_DIR`). No cloud, no API keys.
|
|
18
18
|
|
package/package.json
CHANGED
|
@@ -1,21 +1,19 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@lojban/semantic-search-mcp",
|
|
3
|
-
"version": "1.0.0",
|
|
3
|
+
"version": "1.0.2",
|
|
4
4
|
"description": "Local-first MCP server for semantic search using transformers.js and SQLite",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"scripts": {
|
|
7
7
|
"dev": "tsx src/index.ts"
|
|
8
8
|
},
|
|
9
9
|
"dependencies": {
|
|
10
|
+
"@dao-xyz/sqlite3-vec": "^0.0.19",
|
|
10
11
|
"@huggingface/transformers": "^3.0.0",
|
|
11
12
|
"@modelcontextprotocol/sdk": "^1.0.0",
|
|
12
|
-
"better-sqlite3": "^11.0.0",
|
|
13
13
|
"glob": "^10.3.0",
|
|
14
|
-
"sqlite-vec": "^0.1.0",
|
|
15
14
|
"tsx": "^4.0.0"
|
|
16
15
|
},
|
|
17
16
|
"devDependencies": {
|
|
18
|
-
"@types/better-sqlite3": "^7.6.0",
|
|
19
17
|
"@types/node": "^20.0.0",
|
|
20
18
|
"typescript": "^5.0.0"
|
|
21
19
|
},
|
package/src/index.ts
CHANGED
|
@@ -7,210 +7,196 @@ import {
|
|
|
7
7
|
} from '@modelcontextprotocol/sdk/types.js';
|
|
8
8
|
import path from 'path';
|
|
9
9
|
import { getEmbedding, getBatchEmbeddings } from './embeddings.js';
|
|
10
|
-
import { VectorStorage, type SearchResult } from './storage.js';
|
|
10
|
+
import { createVectorStorage, type SearchResult } from './storage.js';
|
|
11
11
|
import { scanDirectories } from './scanner.js';
|
|
12
12
|
|
|
13
|
-
import { mkdirSync } from 'fs';
|
|
14
|
-
|
|
15
13
|
// Data dir: use env, or project cwd so each workspace has its own index when run via npx from Cursor
|
|
16
14
|
const dataDir =
|
|
17
15
|
process.env.SEMANTIC_SEARCH_DATA_DIR ||
|
|
18
16
|
path.join(process.cwd(), '.semantic-search', 'data');
|
|
19
17
|
const DB_PATH = path.join(dataDir, 'vectors.db');
|
|
20
18
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
// Initialize storage
|
|
24
|
-
const storage = new VectorStorage(DB_PATH);
|
|
19
|
+
async function main() {
|
|
20
|
+
const storage = await createVectorStorage(DB_PATH);
|
|
25
21
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
version: '1.0.0',
|
|
31
|
-
},
|
|
32
|
-
{
|
|
33
|
-
capabilities: {
|
|
34
|
-
tools: {},
|
|
22
|
+
const server = new Server(
|
|
23
|
+
{
|
|
24
|
+
name: 'semantic-search',
|
|
25
|
+
version: '1.0.0',
|
|
35
26
|
},
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
27
|
+
{
|
|
28
|
+
capabilities: {
|
|
29
|
+
tools: {},
|
|
30
|
+
},
|
|
31
|
+
}
|
|
32
|
+
);
|
|
33
|
+
|
|
34
|
+
server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
35
|
+
return {
|
|
36
|
+
tools: [
|
|
37
|
+
{
|
|
38
|
+
name: 'index_directories',
|
|
39
|
+
description: 'Scan directories and index all text file lines for semantic search. Each line gets a vector embedding.',
|
|
40
|
+
inputSchema: {
|
|
41
|
+
type: 'object',
|
|
42
|
+
properties: {
|
|
43
|
+
directories: {
|
|
44
|
+
type: 'array',
|
|
45
|
+
items: { type: 'string' },
|
|
46
|
+
description: 'List of directory paths to scan and index. Defaults to SEMANTIC_SEARCH_INDEX_DIRS (comma-separated) if unset.',
|
|
47
|
+
},
|
|
48
|
+
clear_existing: {
|
|
49
|
+
type: 'boolean',
|
|
50
|
+
description: 'Whether to clear the existing index before indexing (default: false)',
|
|
51
|
+
default: false,
|
|
52
|
+
},
|
|
58
53
|
},
|
|
54
|
+
required: [],
|
|
59
55
|
},
|
|
60
|
-
required: [],
|
|
61
56
|
},
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
57
|
+
{
|
|
58
|
+
name: 'search',
|
|
59
|
+
description: 'Search for lines semantically similar to the query. Returns the most relevant lines from indexed files.',
|
|
60
|
+
inputSchema: {
|
|
61
|
+
type: 'object',
|
|
62
|
+
properties: {
|
|
63
|
+
query: {
|
|
64
|
+
type: 'string',
|
|
65
|
+
description: 'The search query (natural language)',
|
|
66
|
+
},
|
|
67
|
+
limit: {
|
|
68
|
+
type: 'number',
|
|
69
|
+
description: 'Maximum number of results to return (default: 10)',
|
|
70
|
+
default: 10,
|
|
71
|
+
},
|
|
77
72
|
},
|
|
73
|
+
required: ['query'],
|
|
78
74
|
},
|
|
79
|
-
required: ['query'],
|
|
80
75
|
},
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
76
|
+
{
|
|
77
|
+
name: 'get_index_stats',
|
|
78
|
+
description: 'Get statistics about the current index (number of files and lines indexed)',
|
|
79
|
+
inputSchema: {
|
|
80
|
+
type: 'object',
|
|
81
|
+
properties: {},
|
|
82
|
+
},
|
|
88
83
|
},
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
};
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
84
|
+
],
|
|
85
|
+
};
|
|
86
|
+
});
|
|
87
|
+
|
|
88
|
+
server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
89
|
+
const { name, arguments: args } = request.params;
|
|
90
|
+
|
|
91
|
+
try {
|
|
92
|
+
switch (name) {
|
|
93
|
+
case 'index_directories': {
|
|
94
|
+
let directories = (args as { directories?: string[]; clear_existing?: boolean }).directories;
|
|
95
|
+
if (!directories?.length) {
|
|
96
|
+
const envDirs = process.env.SEMANTIC_SEARCH_INDEX_DIRS;
|
|
97
|
+
directories = envDirs ? envDirs.split(',').map((d) => d.trim()).filter(Boolean) : [];
|
|
98
|
+
}
|
|
99
|
+
if (!directories.length) {
|
|
100
|
+
throw new Error('No directories to index. Set directories in the request or SEMANTIC_SEARCH_INDEX_DIRS (comma-separated).');
|
|
101
|
+
}
|
|
102
|
+
const clearExisting = (args as { directories?: string[]; clear_existing?: boolean }).clear_existing ?? false;
|
|
103
|
+
|
|
104
|
+
if (clearExisting) {
|
|
105
|
+
storage.clear();
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
console.error(`Scanning ${directories.length} directories...`);
|
|
109
|
+
const lines = await scanDirectories(directories);
|
|
110
|
+
console.error(`Found ${lines.length} lines to index`);
|
|
111
|
+
|
|
112
|
+
const batchSize = 50;
|
|
113
|
+
let indexed = 0;
|
|
114
|
+
|
|
115
|
+
for (let i = 0; i < lines.length; i += batchSize) {
|
|
116
|
+
const batch = lines.slice(i, i + batchSize);
|
|
117
|
+
const texts = batch.map(l => l.content);
|
|
118
|
+
const embeddings = await getBatchEmbeddings(texts);
|
|
119
|
+
|
|
120
|
+
const batchData = batch.map((line, idx) => ({
|
|
121
|
+
filePath: line.filePath,
|
|
122
|
+
lineNumber: line.lineNumber,
|
|
123
|
+
content: line.content,
|
|
124
|
+
embedding: embeddings[idx],
|
|
125
|
+
}));
|
|
126
|
+
|
|
127
|
+
await storage.upsertLinesBatch(batchData);
|
|
128
|
+
indexed += batch.length;
|
|
129
|
+
console.error(`Indexed ${indexed}/${lines.length} lines`);
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
const stats = await storage.getStats();
|
|
133
|
+
return {
|
|
134
|
+
content: [
|
|
135
|
+
{
|
|
136
|
+
type: 'text',
|
|
137
|
+
text: JSON.stringify({
|
|
138
|
+
success: true,
|
|
139
|
+
indexed_lines: stats.totalLines,
|
|
140
|
+
indexed_files: stats.totalFiles,
|
|
141
|
+
message: `Successfully indexed ${stats.totalLines} lines from ${stats.totalFiles} files`,
|
|
142
|
+
}),
|
|
143
|
+
},
|
|
144
|
+
],
|
|
145
|
+
};
|
|
108
146
|
}
|
|
109
|
-
const clearExisting = (args as { directories?: string[]; clear_existing?: boolean }).clear_existing ?? false;
|
|
110
147
|
|
|
111
|
-
|
|
112
|
-
|
|
148
|
+
case 'search': {
|
|
149
|
+
const query = (args as { query: string; limit?: number }).query;
|
|
150
|
+
const limit = (args as { query: string; limit?: number }).limit ?? 10;
|
|
151
|
+
|
|
152
|
+
const queryEmbedding = await getEmbedding(query);
|
|
153
|
+
const results = await storage.search(queryEmbedding, limit);
|
|
154
|
+
|
|
155
|
+
return {
|
|
156
|
+
content: [
|
|
157
|
+
{
|
|
158
|
+
type: 'text',
|
|
159
|
+
text: JSON.stringify({
|
|
160
|
+
query,
|
|
161
|
+
results: results.map((r: SearchResult) => ({
|
|
162
|
+
file: r.file_path,
|
|
163
|
+
line: r.line_number,
|
|
164
|
+
content: r.content,
|
|
165
|
+
score: Math.round(r.score * 1000) / 1000,
|
|
166
|
+
})),
|
|
167
|
+
}),
|
|
168
|
+
},
|
|
169
|
+
],
|
|
170
|
+
};
|
|
113
171
|
}
|
|
114
172
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
const batchData = batch.map((line, idx) => ({
|
|
130
|
-
filePath: line.filePath,
|
|
131
|
-
lineNumber: line.lineNumber,
|
|
132
|
-
content: line.content,
|
|
133
|
-
embedding: embeddings[idx],
|
|
134
|
-
}));
|
|
135
|
-
|
|
136
|
-
storage.upsertLinesBatch(batchData);
|
|
137
|
-
indexed += batch.length;
|
|
138
|
-
console.error(`Indexed ${indexed}/${lines.length} lines`);
|
|
173
|
+
case 'get_index_stats': {
|
|
174
|
+
const stats = await storage.getStats();
|
|
175
|
+
return {
|
|
176
|
+
content: [
|
|
177
|
+
{
|
|
178
|
+
type: 'text',
|
|
179
|
+
text: JSON.stringify({
|
|
180
|
+
total_files: stats.totalFiles,
|
|
181
|
+
total_lines: stats.totalLines,
|
|
182
|
+
}),
|
|
183
|
+
},
|
|
184
|
+
],
|
|
185
|
+
};
|
|
139
186
|
}
|
|
140
187
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
content: [
|
|
144
|
-
{
|
|
145
|
-
type: 'text',
|
|
146
|
-
text: JSON.stringify({
|
|
147
|
-
success: true,
|
|
148
|
-
indexed_lines: stats.totalLines,
|
|
149
|
-
indexed_files: stats.totalFiles,
|
|
150
|
-
message: `Successfully indexed ${stats.totalLines} lines from ${stats.totalFiles} files`,
|
|
151
|
-
}),
|
|
152
|
-
},
|
|
153
|
-
],
|
|
154
|
-
};
|
|
188
|
+
default:
|
|
189
|
+
throw new Error(`Unknown tool: ${name}`);
|
|
155
190
|
}
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
const queryEmbedding = await getEmbedding(query);
|
|
163
|
-
|
|
164
|
-
// Search
|
|
165
|
-
const results = storage.search(queryEmbedding, limit);
|
|
166
|
-
|
|
167
|
-
return {
|
|
168
|
-
content: [
|
|
169
|
-
{
|
|
170
|
-
type: 'text',
|
|
171
|
-
text: JSON.stringify({
|
|
172
|
-
query,
|
|
173
|
-
results: results.map((r: SearchResult) => ({
|
|
174
|
-
file: r.file_path,
|
|
175
|
-
line: r.line_number,
|
|
176
|
-
content: r.content,
|
|
177
|
-
score: Math.round(r.score * 1000) / 1000,
|
|
178
|
-
})),
|
|
179
|
-
}),
|
|
180
|
-
},
|
|
181
|
-
],
|
|
182
|
-
};
|
|
183
|
-
}
|
|
184
|
-
|
|
185
|
-
case 'get_index_stats': {
|
|
186
|
-
const stats = storage.getStats();
|
|
187
|
-
return {
|
|
188
|
-
content: [
|
|
189
|
-
{
|
|
190
|
-
type: 'text',
|
|
191
|
-
text: JSON.stringify({
|
|
192
|
-
total_files: stats.totalFiles,
|
|
193
|
-
total_lines: stats.totalLines,
|
|
194
|
-
}),
|
|
195
|
-
},
|
|
196
|
-
],
|
|
197
|
-
};
|
|
198
|
-
}
|
|
199
|
-
|
|
200
|
-
default:
|
|
201
|
-
throw new Error(`Unknown tool: ${name}`);
|
|
191
|
+
} catch (error) {
|
|
192
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
193
|
+
return {
|
|
194
|
+
content: [{ type: 'text', text: JSON.stringify({ error: message }) }],
|
|
195
|
+
isError: true,
|
|
196
|
+
};
|
|
202
197
|
}
|
|
203
|
-
}
|
|
204
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
205
|
-
return {
|
|
206
|
-
content: [{ type: 'text', text: JSON.stringify({ error: message }) }],
|
|
207
|
-
isError: true,
|
|
208
|
-
};
|
|
209
|
-
}
|
|
210
|
-
});
|
|
198
|
+
});
|
|
211
199
|
|
|
212
|
-
// Start server
|
|
213
|
-
async function main() {
|
|
214
200
|
const transport = new StdioServerTransport();
|
|
215
201
|
await server.connect(transport);
|
|
216
202
|
console.error('Semantic Search MCP Server running on stdio');
|
package/src/storage.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
import
|
|
1
|
+
import pkg from '@dao-xyz/sqlite3-vec';
|
|
2
|
+
const { createDatabase } = pkg;
|
|
2
3
|
import path from 'path';
|
|
3
|
-
import * as sqliteVec from 'sqlite-vec';
|
|
4
|
+
import { mkdirSync } from 'fs';
|
|
4
5
|
|
|
5
6
|
const EMBEDDING_DIM = 384; // all-MiniLM-L6-v2 produces 384-dim vectors
|
|
6
7
|
|
|
@@ -19,20 +20,17 @@ export interface SearchResult {
|
|
|
19
20
|
score: number;
|
|
20
21
|
}
|
|
21
22
|
|
|
23
|
+
type DB = Awaited<ReturnType<typeof createDatabase>>;
|
|
24
|
+
|
|
22
25
|
export class VectorStorage {
|
|
23
|
-
private db:
|
|
24
|
-
|
|
25
|
-
constructor(
|
|
26
|
-
this.db =
|
|
27
|
-
|
|
28
|
-
// Load sqlite-vec extension
|
|
29
|
-
sqliteVec.load(this.db);
|
|
30
|
-
|
|
26
|
+
private db: DB;
|
|
27
|
+
|
|
28
|
+
constructor(db: DB) {
|
|
29
|
+
this.db = db;
|
|
31
30
|
this.init();
|
|
32
31
|
}
|
|
33
32
|
|
|
34
33
|
private init(): void {
|
|
35
|
-
// Create regular table for metadata
|
|
36
34
|
this.db.exec(`
|
|
37
35
|
CREATE TABLE IF NOT EXISTS lines (
|
|
38
36
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
@@ -41,11 +39,8 @@ export class VectorStorage {
|
|
|
41
39
|
content TEXT NOT NULL,
|
|
42
40
|
UNIQUE(file_path, line_number)
|
|
43
41
|
);
|
|
44
|
-
|
|
45
42
|
CREATE INDEX IF NOT EXISTS idx_file ON lines(file_path);
|
|
46
43
|
`);
|
|
47
|
-
|
|
48
|
-
// Create virtual table for vectors using sqlite-vec
|
|
49
44
|
this.db.exec(`
|
|
50
45
|
CREATE VIRTUAL TABLE IF NOT EXISTS vec_lines USING vec0(
|
|
51
46
|
line_id INTEGER PRIMARY KEY,
|
|
@@ -57,126 +52,90 @@ export class VectorStorage {
|
|
|
57
52
|
/**
|
|
58
53
|
* Insert or update a line with its embedding
|
|
59
54
|
*/
|
|
60
|
-
upsertLine(filePath: string, lineNumber: number, content: string, embedding: Float32Array): void {
|
|
61
|
-
const insertLine = this.db.prepare(
|
|
62
|
-
INSERT INTO lines (file_path, line_number, content)
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
const
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
this.db.prepare('DELETE FROM vec_lines WHERE line_id = ?').run(safeId);
|
|
76
|
-
|
|
77
|
-
const insertVec = this.db.prepare(`
|
|
78
|
-
INSERT INTO vec_lines (line_id, embedding)
|
|
79
|
-
VALUES (?, ?)
|
|
80
|
-
`);
|
|
81
|
-
|
|
82
|
-
insertVec.run(safeId, JSON.stringify(Array.from(embedding)));
|
|
55
|
+
async upsertLine(filePath: string, lineNumber: number, content: string, embedding: Float32Array): Promise<void> {
|
|
56
|
+
const insertLine = await this.db.prepare(
|
|
57
|
+
`INSERT INTO lines (file_path, line_number, content)
|
|
58
|
+
VALUES (?, ?, ?)
|
|
59
|
+
ON CONFLICT(file_path, line_number) DO UPDATE SET content = excluded.content`
|
|
60
|
+
);
|
|
61
|
+
insertLine.run([filePath, lineNumber, content]);
|
|
62
|
+
|
|
63
|
+
const sel = await this.db.prepare('SELECT id FROM lines WHERE file_path = ? AND line_number = ?');
|
|
64
|
+
const row = sel.get([filePath, lineNumber]) as { id: number } | undefined;
|
|
65
|
+
if (row == null) throw new Error('Failed to get line id');
|
|
66
|
+
const id = Math.trunc(Number(row.id));
|
|
67
|
+
|
|
68
|
+
(await this.db.prepare('DELETE FROM vec_lines WHERE line_id = ?')).run([id]);
|
|
69
|
+
(await this.db.prepare('INSERT INTO vec_lines (line_id, embedding) VALUES (?, ?)')).run([id, embedding.buffer]);
|
|
83
70
|
}
|
|
84
71
|
|
|
85
72
|
/**
|
|
86
73
|
* Batch insert lines for efficiency
|
|
87
74
|
*/
|
|
88
|
-
upsertLinesBatch(
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
const
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
deleteVec.run(safeId);
|
|
108
|
-
insertVec.run(safeId, JSON.stringify(Array.from(item.embedding)));
|
|
109
|
-
}
|
|
110
|
-
});
|
|
111
|
-
|
|
112
|
-
return insertMany(lines);
|
|
75
|
+
async upsertLinesBatch(
|
|
76
|
+
lines: Array<{ filePath: string; lineNumber: number; content: string; embedding: Float32Array }>
|
|
77
|
+
): Promise<void> {
|
|
78
|
+
const insertLine = await this.db.prepare(
|
|
79
|
+
`INSERT INTO lines (file_path, line_number, content)
|
|
80
|
+
VALUES (?, ?, ?)
|
|
81
|
+
ON CONFLICT(file_path, line_number) DO UPDATE SET content = excluded.content`
|
|
82
|
+
);
|
|
83
|
+
const selId = await this.db.prepare('SELECT id FROM lines WHERE file_path = ? AND line_number = ?');
|
|
84
|
+
const deleteVec = await this.db.prepare('DELETE FROM vec_lines WHERE line_id = ?');
|
|
85
|
+
const insertVec = await this.db.prepare('INSERT INTO vec_lines (line_id, embedding) VALUES (?, ?)');
|
|
86
|
+
|
|
87
|
+
for (const item of lines) {
|
|
88
|
+
insertLine.run([item.filePath, item.lineNumber, item.content]);
|
|
89
|
+
const row = selId.get([item.filePath, item.lineNumber]) as { id: number };
|
|
90
|
+
const id = Math.trunc(Number(row.id));
|
|
91
|
+
deleteVec.run([id]);
|
|
92
|
+
insertVec.run([id, item.embedding.buffer]);
|
|
93
|
+
}
|
|
113
94
|
}
|
|
114
95
|
|
|
115
96
|
/**
|
|
116
|
-
* Search for similar lines using sqlite-vec
|
|
97
|
+
* Search for similar lines using sqlite-vec cosine distance
|
|
117
98
|
*/
|
|
118
|
-
search(queryEmbedding: Float32Array, limit: number = 10): SearchResult[] {
|
|
119
|
-
const stmt = this.db.prepare(`
|
|
120
|
-
SELECT
|
|
99
|
+
async search(queryEmbedding: Float32Array, limit: number = 10): Promise<SearchResult[]> {
|
|
100
|
+
const stmt = await this.db.prepare(`
|
|
101
|
+
SELECT
|
|
121
102
|
l.file_path,
|
|
122
103
|
l.line_number,
|
|
123
104
|
l.content,
|
|
124
|
-
vec_distance_cosine(v.embedding, ?)
|
|
105
|
+
vec_distance_cosine(v.embedding, ?1) AS distance
|
|
125
106
|
FROM vec_lines v
|
|
126
107
|
INNER JOIN lines l ON v.line_id = l.id
|
|
127
108
|
ORDER BY distance
|
|
128
|
-
LIMIT ?
|
|
109
|
+
LIMIT ?2
|
|
129
110
|
`);
|
|
130
|
-
|
|
131
|
-
const rows = stmt.all(JSON.stringify(Array.from(queryEmbedding)), limit) as Array<{
|
|
111
|
+
const rows = stmt.all([queryEmbedding.buffer, limit]) as Array<{
|
|
132
112
|
file_path: string;
|
|
133
113
|
line_number: number;
|
|
134
114
|
content: string;
|
|
135
115
|
distance: number;
|
|
136
116
|
}>;
|
|
137
|
-
|
|
138
|
-
return rows.map(row => ({
|
|
117
|
+
return rows.map((row) => ({
|
|
139
118
|
file_path: row.file_path,
|
|
140
119
|
line_number: row.line_number,
|
|
141
120
|
content: row.content,
|
|
142
|
-
score: 1 - row.distance,
|
|
121
|
+
score: 1 - row.distance,
|
|
143
122
|
}));
|
|
144
123
|
}
|
|
145
124
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
const totalFiles = (filesStmt.get() as { count: number }).count;
|
|
154
|
-
const totalLines = (linesStmt.get() as { count: number }).count;
|
|
155
|
-
|
|
156
|
-
return { totalFiles, totalLines };
|
|
125
|
+
async getStats(): Promise<{ totalFiles: number; totalLines: number }> {
|
|
126
|
+
const filesRow = (await this.db.prepare('SELECT COUNT(DISTINCT file_path) AS count FROM lines')).get() as { count: number } | undefined;
|
|
127
|
+
const linesRow = (await this.db.prepare('SELECT COUNT(*) AS count FROM lines')).get() as { count: number } | undefined;
|
|
128
|
+
return {
|
|
129
|
+
totalFiles: filesRow?.count ?? 0,
|
|
130
|
+
totalLines: linesRow?.count ?? 0,
|
|
131
|
+
};
|
|
157
132
|
}
|
|
158
133
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
removeFile(filePath: string): void {
|
|
163
|
-
const deleteVecs = this.db.prepare(`
|
|
164
|
-
DELETE FROM vec_lines
|
|
165
|
-
WHERE line_id IN (SELECT id FROM lines WHERE file_path = ?)
|
|
166
|
-
`);
|
|
167
|
-
const deleteLines = this.db.prepare('DELETE FROM lines WHERE file_path = ?');
|
|
168
|
-
|
|
169
|
-
const transaction = this.db.transaction(() => {
|
|
170
|
-
deleteVecs.run(filePath);
|
|
171
|
-
deleteLines.run(filePath);
|
|
172
|
-
});
|
|
173
|
-
|
|
174
|
-
transaction();
|
|
134
|
+
async removeFile(filePath: string): Promise<void> {
|
|
135
|
+
(await this.db.prepare('DELETE FROM vec_lines WHERE line_id IN (SELECT id FROM lines WHERE file_path = ?)')).run([filePath]);
|
|
136
|
+
(await this.db.prepare('DELETE FROM lines WHERE file_path = ?')).run([filePath]);
|
|
175
137
|
}
|
|
176
138
|
|
|
177
|
-
/**
|
|
178
|
-
* Clear the entire index
|
|
179
|
-
*/
|
|
180
139
|
clear(): void {
|
|
181
140
|
this.db.exec('DELETE FROM vec_lines');
|
|
182
141
|
this.db.exec('DELETE FROM lines');
|
|
@@ -187,3 +146,15 @@ export class VectorStorage {
|
|
|
187
146
|
}
|
|
188
147
|
}
|
|
189
148
|
|
|
149
|
+
|
|
150
|
+
/**
|
|
151
|
+
* Create and open the vector storage (async). Use this instead of `new VectorStorage()`.
|
|
152
|
+
*/
|
|
153
|
+
export async function createVectorStorage(dbPath: string): Promise<VectorStorage> {
|
|
154
|
+
mkdirSync(path.dirname(dbPath), { recursive: true });
|
|
155
|
+
const db = await createDatabase({
|
|
156
|
+
database: dbPath,
|
|
157
|
+
});
|
|
158
|
+
await db.open();
|
|
159
|
+
return new VectorStorage(db);
|
|
160
|
+
}
|