@lojban/semantic-search-mcp 1.0.12 → 1.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/embeddings.ts +2 -2
- package/src/index.ts +6 -3
- package/src/scanner.ts +16 -9
- package/src/storage.ts +4 -0
package/package.json
CHANGED
package/src/embeddings.ts
CHANGED
|
@@ -36,8 +36,8 @@ export async function getBatchEmbeddings(texts: string[]): Promise<Float32Array[
|
|
|
36
36
|
const ext = await getExtractor();
|
|
37
37
|
const results: Float32Array[] = [];
|
|
38
38
|
|
|
39
|
-
// Process in batches for memory; each batch is one model forward pass
|
|
40
|
-
const batchSize =
|
|
39
|
+
// Process in batches for memory; each batch is one model forward pass (smaller = lower peak RAM)
|
|
40
|
+
const batchSize = 32;
|
|
41
41
|
for (let i = 0; i < texts.length; i += batchSize) {
|
|
42
42
|
const batch = texts.slice(i, i + batchSize);
|
|
43
43
|
const output = await ext(batch, { pooling: 'mean', normalize: true });
|
package/src/index.ts
CHANGED
|
@@ -42,8 +42,8 @@ async function runBackgroundIndexing(
|
|
|
42
42
|
try {
|
|
43
43
|
let currentBatch: Array<{ filePath: string; lineNumber: number; content: string }> = [];
|
|
44
44
|
let processingPromise: Promise<void> | null = null;
|
|
45
|
-
const seenFiles = new Set<string>();
|
|
46
45
|
|
|
46
|
+
let batchesProcessed = 0;
|
|
47
47
|
const processBatch = async (
|
|
48
48
|
batch: Array<{ filePath: string; lineNumber: number; content: string }>
|
|
49
49
|
): Promise<void> => {
|
|
@@ -57,9 +57,12 @@ async function runBackgroundIndexing(
|
|
|
57
57
|
embedding: embeddings[idx],
|
|
58
58
|
}));
|
|
59
59
|
await storage.upsertLinesBatch(batchData);
|
|
60
|
-
for (const l of batch) seenFiles.add(l.filePath);
|
|
61
60
|
indexingState.linesIndexed += batch.length;
|
|
62
|
-
|
|
61
|
+
batchesProcessed++;
|
|
62
|
+
if (batchesProcessed === 1 || batchesProcessed % 10 === 0) {
|
|
63
|
+
const stats = await storage.getStats();
|
|
64
|
+
indexingState.filesIndexed = stats.totalFiles;
|
|
65
|
+
}
|
|
63
66
|
};
|
|
64
67
|
|
|
65
68
|
const yieldToEventLoop = (): Promise<void> =>
|
package/src/scanner.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { createReadStream, statSync } from 'fs';
|
|
2
|
-
import { glob } from 'glob';
|
|
2
|
+
import { globIterate } from 'glob';
|
|
3
3
|
import path from 'path';
|
|
4
4
|
import readline from 'readline';
|
|
5
5
|
|
|
@@ -18,6 +18,9 @@ const MIN_LINE_LENGTH = 5;
|
|
|
18
18
|
// Maximum file size to process (skip very large files)
|
|
19
19
|
const MAX_FILE_SIZE = 10 * 1024 * 1024; // 10MB
|
|
20
20
|
|
|
21
|
+
// Cap line length to avoid unbounded readline buffer (e.g. file with no newlines)
|
|
22
|
+
const MAX_LINE_LENGTH = 256 * 1024; // 256KB
|
|
23
|
+
|
|
21
24
|
/**
|
|
22
25
|
* Check if a file is a text file we should index
|
|
23
26
|
*/
|
|
@@ -27,17 +30,17 @@ function isTextFile(filePath: string): boolean {
|
|
|
27
30
|
}
|
|
28
31
|
|
|
29
32
|
/**
|
|
30
|
-
* Scan a directory for text files and yield lines
|
|
33
|
+
* Scan a directory for text files and yield lines.
|
|
34
|
+
* Uses globIterate so file paths are streamed one-by-one (no full list in RAM).
|
|
35
|
+
* Readline has maxLineLength to avoid huge single-line buffers.
|
|
31
36
|
*/
|
|
32
37
|
export async function* scanDirectory(dirPath: string): AsyncGenerator<FileLine> {
|
|
33
|
-
// Find all files in directory recursively
|
|
34
38
|
const pattern = path.join(dirPath, '**/*');
|
|
35
|
-
|
|
36
|
-
const files = await glob(pattern, { nodir: true, absolute: true });
|
|
37
|
-
|
|
38
|
-
for (const filePath of files) {
|
|
39
|
+
for await (const filePath of globIterate(pattern, { nodir: true, absolute: true }) as AsyncIterable<string>) {
|
|
39
40
|
if (!isTextFile(filePath)) continue;
|
|
40
41
|
|
|
42
|
+
let fileStream: ReturnType<typeof createReadStream> | null = null;
|
|
43
|
+
let rl: readline.Interface | null = null;
|
|
41
44
|
try {
|
|
42
45
|
const stats = statSync(filePath);
|
|
43
46
|
if (stats.size > MAX_FILE_SIZE) {
|
|
@@ -45,10 +48,11 @@ export async function* scanDirectory(dirPath: string): AsyncGenerator<FileLine>
|
|
|
45
48
|
continue;
|
|
46
49
|
}
|
|
47
50
|
|
|
48
|
-
|
|
49
|
-
|
|
51
|
+
fileStream = createReadStream(filePath);
|
|
52
|
+
rl = readline.createInterface({
|
|
50
53
|
input: fileStream,
|
|
51
54
|
crlfDelay: Infinity,
|
|
55
|
+
maxLineLength: MAX_LINE_LENGTH,
|
|
52
56
|
});
|
|
53
57
|
|
|
54
58
|
let lineNumber = 0;
|
|
@@ -65,6 +69,9 @@ export async function* scanDirectory(dirPath: string): AsyncGenerator<FileLine>
|
|
|
65
69
|
}
|
|
66
70
|
} catch (err) {
|
|
67
71
|
console.error(`Error reading file ${filePath}:`, err);
|
|
72
|
+
} finally {
|
|
73
|
+
rl?.close();
|
|
74
|
+
fileStream?.destroy();
|
|
68
75
|
}
|
|
69
76
|
}
|
|
70
77
|
}
|
package/src/storage.ts
CHANGED
|
@@ -31,6 +31,10 @@ export class VectorStorage {
|
|
|
31
31
|
}
|
|
32
32
|
|
|
33
33
|
private init(): void {
|
|
34
|
+
// Limit SQLite page cache to avoid unbounded RAM (negative = kibibytes)
|
|
35
|
+
try {
|
|
36
|
+
this.db.exec('PRAGMA cache_size = -65536'); // 64MB max
|
|
37
|
+
} catch {}
|
|
34
38
|
this.db.exec(`
|
|
35
39
|
CREATE TABLE IF NOT EXISTS lines (
|
|
36
40
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|