@lojban/semantic-search-mcp 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/embeddings.ts +15 -8
- package/src/index.ts +13 -3
- package/src/scanner.ts +3 -9
- package/src/storage.ts +19 -10
package/package.json
CHANGED
package/src/embeddings.ts
CHANGED
|
@@ -24,23 +24,30 @@ export async function getEmbedding(text: string): Promise<Float32Array> {
|
|
|
24
24
|
return new Float32Array(output.data as ArrayLike<number>);
|
|
25
25
|
}
|
|
26
26
|
|
|
27
|
+
const EMBEDDING_DIM = 384; // all-MiniLM-L6-v2
|
|
28
|
+
|
|
27
29
|
/**
|
|
28
|
-
* Generate embeddings for multiple texts (batched for efficiency)
|
|
30
|
+
* Generate embeddings for multiple texts (batched for efficiency).
|
|
31
|
+
* Passes arrays to the pipeline so the model runs one forward pass per batch.
|
|
29
32
|
*/
|
|
30
33
|
export async function getBatchEmbeddings(texts: string[]): Promise<Float32Array[]> {
|
|
34
|
+
if (texts.length === 0) return [];
|
|
35
|
+
|
|
31
36
|
const ext = await getExtractor();
|
|
32
37
|
const results: Float32Array[] = [];
|
|
33
|
-
|
|
34
|
-
// Process in batches
|
|
35
|
-
const batchSize =
|
|
38
|
+
|
|
39
|
+
// Process in batches for memory; each batch is one model forward pass
|
|
40
|
+
const batchSize = 64;
|
|
36
41
|
for (let i = 0; i < texts.length; i += batchSize) {
|
|
37
42
|
const batch = texts.slice(i, i + batchSize);
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
43
|
+
const output = await ext(batch, { pooling: 'mean', normalize: true });
|
|
44
|
+
const data = output.data as Float32Array;
|
|
45
|
+
const dim = EMBEDDING_DIM;
|
|
46
|
+
for (let j = 0; j < batch.length; j++) {
|
|
47
|
+
results.push(new Float32Array(data.slice(j * dim, (j + 1) * dim)));
|
|
41
48
|
}
|
|
42
49
|
}
|
|
43
|
-
|
|
50
|
+
|
|
44
51
|
return results;
|
|
45
52
|
}
|
|
46
53
|
|
package/src/index.ts
CHANGED
|
@@ -109,13 +109,23 @@ async function main() {
|
|
|
109
109
|
const lines = await scanDirectories(directories);
|
|
110
110
|
console.error(`Found ${lines.length} lines to index`);
|
|
111
111
|
|
|
112
|
-
const batchSize =
|
|
112
|
+
const batchSize = 128;
|
|
113
113
|
let indexed = 0;
|
|
114
114
|
|
|
115
|
+
// Pipeline: compute embeddings for next batch while writing current batch to DB
|
|
116
|
+
let embedPromise: Promise<Float32Array[]> =
|
|
117
|
+
lines.length > 0
|
|
118
|
+
? getBatchEmbeddings(lines.slice(0, batchSize).map((l) => l.content))
|
|
119
|
+
: Promise.resolve([]);
|
|
120
|
+
|
|
115
121
|
for (let i = 0; i < lines.length; i += batchSize) {
|
|
116
122
|
const batch = lines.slice(i, i + batchSize);
|
|
117
|
-
const
|
|
118
|
-
|
|
123
|
+
const embeddings = await embedPromise;
|
|
124
|
+
|
|
125
|
+
if (i + batchSize < lines.length) {
|
|
126
|
+
const nextTexts = lines.slice(i + batchSize, i + batchSize * 2).map((l) => l.content);
|
|
127
|
+
embedPromise = getBatchEmbeddings(nextTexts);
|
|
128
|
+
}
|
|
119
129
|
|
|
120
130
|
const batchData = batch.map((line, idx) => ({
|
|
121
131
|
filePath: line.filePath,
|
package/src/scanner.ts
CHANGED
|
@@ -67,15 +67,9 @@ export async function scanDirectory(dirPath: string): Promise<FileLine[]> {
|
|
|
67
67
|
}
|
|
68
68
|
|
|
69
69
|
/**
|
|
70
|
-
* Scan multiple directories
|
|
70
|
+
* Scan multiple directories in parallel
|
|
71
71
|
*/
|
|
72
72
|
export async function scanDirectories(dirPaths: string[]): Promise<FileLine[]> {
|
|
73
|
-
const allLines: FileLine[] = [];
|
|
74
|
-
|
|
75
|
-
for (const dirPath of dirPaths) {
|
|
76
|
-
const lines = await scanDirectory(dirPath);
|
|
77
|
-
allLines.push(...lines);
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
return allLines;
|
|
73
|
+
const results = await Promise.all(dirPaths.map((dirPath) => scanDirectory(dirPath)));
|
|
74
|
+
return results.flat();
|
|
81
75
|
}
|
package/src/storage.ts
CHANGED
|
@@ -63,18 +63,20 @@ export class VectorStorage {
|
|
|
63
63
|
const sel = await this.db.prepare('SELECT id FROM lines WHERE file_path = ? AND line_number = ?');
|
|
64
64
|
const row = sel.get([filePath, lineNumber]) as { id: number } | undefined;
|
|
65
65
|
if (row == null) throw new Error('Failed to get line id');
|
|
66
|
-
const
|
|
66
|
+
const idInt = BigInt(row.id); // vec0 requires INTEGER primary key; BigInt binds as integer in better-sqlite3
|
|
67
67
|
|
|
68
|
-
(await this.db.prepare('DELETE FROM vec_lines WHERE line_id = ?')).run([
|
|
69
|
-
(await this.db.prepare('INSERT INTO vec_lines (line_id, embedding) VALUES (?, ?)')).run([
|
|
68
|
+
(await this.db.prepare('DELETE FROM vec_lines WHERE line_id = ?')).run([idInt]);
|
|
69
|
+
(await this.db.prepare('INSERT INTO vec_lines (line_id, embedding) VALUES (?, ?)')).run([idInt, embedding.buffer]);
|
|
70
70
|
}
|
|
71
71
|
|
|
72
72
|
/**
|
|
73
|
-
* Batch insert lines for efficiency
|
|
73
|
+
* Batch insert lines for efficiency (single transaction)
|
|
74
74
|
*/
|
|
75
75
|
async upsertLinesBatch(
|
|
76
76
|
lines: Array<{ filePath: string; lineNumber: number; content: string; embedding: Float32Array }>
|
|
77
77
|
): Promise<void> {
|
|
78
|
+
if (lines.length === 0) return;
|
|
79
|
+
|
|
78
80
|
const insertLine = await this.db.prepare(
|
|
79
81
|
`INSERT INTO lines (file_path, line_number, content)
|
|
80
82
|
VALUES (?, ?, ?)
|
|
@@ -84,12 +86,19 @@ export class VectorStorage {
|
|
|
84
86
|
const deleteVec = await this.db.prepare('DELETE FROM vec_lines WHERE line_id = ?');
|
|
85
87
|
const insertVec = await this.db.prepare('INSERT INTO vec_lines (line_id, embedding) VALUES (?, ?)');
|
|
86
88
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
const
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
89
|
+
this.db.exec('BEGIN');
|
|
90
|
+
try {
|
|
91
|
+
for (const item of lines) {
|
|
92
|
+
insertLine.run([item.filePath, item.lineNumber, item.content]);
|
|
93
|
+
const row = selId.get([item.filePath, item.lineNumber]) as { id: number };
|
|
94
|
+
const idInt = BigInt(row.id);
|
|
95
|
+
deleteVec.run([idInt]);
|
|
96
|
+
insertVec.run([idInt, item.embedding.buffer]);
|
|
97
|
+
}
|
|
98
|
+
this.db.exec('COMMIT');
|
|
99
|
+
} catch (e) {
|
|
100
|
+
this.db.exec('ROLLBACK');
|
|
101
|
+
throw e;
|
|
93
102
|
}
|
|
94
103
|
}
|
|
95
104
|
|