@lojban/semantic-search-mcp 1.0.9 → 1.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/package.json +1 -1
- package/src/index.ts +11 -13
- package/src/scanner.ts +21 -11
package/README.md
CHANGED
|
@@ -111,6 +111,8 @@ Indexing uses **adaptive batch size** based on free system RAM so the OS doesn
|
|
|
111
111
|
|
|
112
112
|
Example: `SEMANTIC_SEARCH_RESERVE_MB=800 SEMANTIC_SEARCH_MAX_BATCH=256` to leave more headroom and use smaller batches.
|
|
113
113
|
|
|
114
|
+
- **`SEMANTIC_SEARCH_GC`** — explicit GC after each batch is **on by default**, but it only takes effect when Node is run with `--expose-gc` (helps avoid OS freezes during long indexing). In an MCP config, use e.g. `"args": ["--expose-gc", "-y", "@lojban/semantic-search-mcp"]`. Set to `0` or `false` to disable.
|
|
115
|
+
|
|
114
116
|
## Example: Lojban dictionary gaps
|
|
115
117
|
|
|
116
118
|
1. Put your dictionary TSV (e.g. `jbo-eng.tsv`) in a folder.
|
package/package.json
CHANGED
package/src/index.ts
CHANGED
|
@@ -45,7 +45,9 @@ let currentJobId = 0;
|
|
|
45
45
|
const RESERVE_MB = Number(process.env.SEMANTIC_SEARCH_RESERVE_MB) || 400;
|
|
46
46
|
const RESERVE_BYTES = RESERVE_MB * 1024 * 1024;
|
|
47
47
|
const MIN_BATCH = Number(process.env.SEMANTIC_SEARCH_MIN_BATCH) || 32;
|
|
48
|
-
const MAX_BATCH = Number(process.env.SEMANTIC_SEARCH_MAX_BATCH) ||
|
|
48
|
+
const MAX_BATCH = Number(process.env.SEMANTIC_SEARCH_MAX_BATCH) || 128;
|
|
49
|
+
// Explicit GC after each batch (when --expose-gc is available). Default on; set SEMANTIC_SEARCH_GC=0 or false to disable.
|
|
50
|
+
const ENABLE_GC = process.env.SEMANTIC_SEARCH_GC !== '0' && process.env.SEMANTIC_SEARCH_GC !== 'false';
|
|
49
51
|
|
|
50
52
|
/** Rough bytes per indexed line in memory: line text + path + embedding (384 floats) + overhead */
|
|
51
53
|
const BYTES_PER_LINE_ESTIMATE = 4000;
|
|
@@ -127,9 +129,14 @@ async function startIndexing(
|
|
|
127
129
|
indexedCount += batchToProcess.length;
|
|
128
130
|
if (isCurrentJob()) indexStatus.indexedLines = indexedCount;
|
|
129
131
|
console.error(`Indexed ${indexedCount} lines...`);
|
|
132
|
+
// Run an explicit GC when enabled (the default) and Node was started with --expose-gc
|
|
133
|
+
if (ENABLE_GC && typeof (globalThis as { gc?: () => void }).gc === 'function') {
|
|
134
|
+
(globalThis as { gc: () => void }).gc();
|
|
135
|
+
}
|
|
130
136
|
};
|
|
131
137
|
|
|
132
|
-
|
|
138
|
+
// Single task queue: only one batch is processed at a time (no pipelining).
|
|
139
|
+
// We do not read the next batch until the current one is fully done, to avoid memory spikes and OS freezes.
|
|
133
140
|
let batchSize = getAdaptiveBatchSize();
|
|
134
141
|
console.error(`Adaptive batch size: ${batchSize} (free RAM: ${Math.round(os.freemem() / 1024 / 1024)}MB, reserve: ${RESERVE_MB}MB)`);
|
|
135
142
|
|
|
@@ -138,18 +145,12 @@ async function startIndexing(
|
|
|
138
145
|
|
|
139
146
|
currentBatch.push(line);
|
|
140
147
|
if (currentBatch.length >= batchSize) {
|
|
141
|
-
if (processingPromise) {
|
|
142
|
-
await processingPromise;
|
|
143
|
-
}
|
|
144
|
-
if (signal.aborted) break;
|
|
145
|
-
|
|
146
148
|
const batchToProcess = currentBatch;
|
|
147
149
|
currentBatch = [];
|
|
148
150
|
batchSize = getAdaptiveBatchSize();
|
|
149
151
|
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
});
|
|
152
|
+
await processBatch(batchToProcess);
|
|
153
|
+
if (signal.aborted) break;
|
|
153
154
|
}
|
|
154
155
|
}
|
|
155
156
|
|
|
@@ -158,9 +159,6 @@ async function startIndexing(
|
|
|
158
159
|
return;
|
|
159
160
|
}
|
|
160
161
|
|
|
161
|
-
if (processingPromise) {
|
|
162
|
-
await processingPromise;
|
|
163
|
-
}
|
|
164
162
|
if (currentBatch.length > 0) {
|
|
165
163
|
await processBatch(currentBatch);
|
|
166
164
|
}
|
package/src/scanner.ts
CHANGED
|
@@ -38,6 +38,7 @@ export async function* scanDirectory(dirPath: string): AsyncGenerator<FileLine>
|
|
|
38
38
|
for (const filePath of files) {
|
|
39
39
|
if (!isTextFile(filePath)) continue;
|
|
40
40
|
|
|
41
|
+
let fileStream: ReturnType<typeof createReadStream> | null = null;
|
|
41
42
|
try {
|
|
42
43
|
const stats = statSync(filePath);
|
|
43
44
|
if (stats.size > MAX_FILE_SIZE) {
|
|
@@ -45,26 +46,35 @@ export async function* scanDirectory(dirPath: string): AsyncGenerator<FileLine>
|
|
|
45
46
|
continue;
|
|
46
47
|
}
|
|
47
48
|
|
|
48
|
-
|
|
49
|
+
fileStream = createReadStream(filePath);
|
|
49
50
|
const rl = readline.createInterface({
|
|
50
51
|
input: fileStream,
|
|
51
52
|
crlfDelay: Infinity,
|
|
52
53
|
});
|
|
53
54
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
55
|
+
try {
|
|
56
|
+
let lineNumber = 0;
|
|
57
|
+
for await (const line of rl) {
|
|
58
|
+
lineNumber++;
|
|
59
|
+
const trimmed = line.trim();
|
|
60
|
+
if (trimmed.length >= MIN_LINE_LENGTH) {
|
|
61
|
+
yield {
|
|
62
|
+
filePath,
|
|
63
|
+
lineNumber,
|
|
64
|
+
content: trimmed,
|
|
65
|
+
};
|
|
66
|
+
}
|
|
64
67
|
}
|
|
68
|
+
} finally {
|
|
69
|
+
rl.close();
|
|
70
|
+
fileStream.destroy();
|
|
71
|
+
fileStream = null;
|
|
65
72
|
}
|
|
66
73
|
} catch (err) {
|
|
67
74
|
console.error(`Error reading file ${filePath}:`, err);
|
|
75
|
+
if (fileStream) {
|
|
76
|
+
fileStream.destroy();
|
|
77
|
+
}
|
|
68
78
|
}
|
|
69
79
|
}
|
|
70
80
|
}
|