@lojban/semantic-search-mcp 1.0.11 → 1.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -53
- package/package.json +1 -1
- package/src/embeddings.ts +2 -2
- package/src/index.ts +88 -217
- package/src/scanner.ts +24 -27
- package/src/storage.ts +4 -0
package/README.md
CHANGED
|
@@ -12,7 +12,7 @@ Use it in **Cursor**, **Claude Code**, or any IDE that supports MCP to search th
|
|
|
12
12
|
|
|
13
13
|
## How it works
|
|
14
14
|
|
|
15
|
-
- **Indexing**:
|
|
15
|
+
- **Indexing**: On startup, if `SEMANTIC_SEARCH_INDEX_DIRS` is set (comma-separated paths), the server scans those directories in the background for `.txt`, `.md`, `.tsv`, and `.csv` files. Each line that is at least 5 characters long after trimming whitespace gets a vector embedding (via [Hugging Face Transformers.js](https://huggingface.co/docs/transformers.js), model `Xenova/all-MiniLM-L6-v2`) and is stored in a local SQLite database with [@dao-xyz/sqlite3-vec](https://www.npmjs.com/package/@dao-xyz/sqlite3-vec) (SQLite + sqlite-vec for Node and browser). Indexing runs asynchronously and uses bounded memory, so the server stays responsive.
|
|
16
16
|
- **Search**: You send a natural-language query; the server embeds it and returns the closest lines by cosine similarity.
|
|
17
17
|
- **Storage**: Index is stored in your project's `.semantic-search/data/` (or set `SEMANTIC_SEARCH_DATA_DIR`). No cloud, no API keys.
|
|
18
18
|
|
|
@@ -44,22 +44,21 @@ The package is published as [**@lojban/semantic-search-mcp**](https://www.npmjs.
|
|
|
44
44
|
}
|
|
45
45
|
```
|
|
46
46
|
|
|
47
|
-
No `cwd` needed: the server stores its index in your **project directory** (`.semantic-search/data/`), so open your project in Cursor and the index is per-workspace. To use a fixed data directory instead, add `"env": { "SEMANTIC_SEARCH_DATA_DIR": "/path/to/data" }`.
|
|
47
|
+
No `cwd` needed: the server stores its index in your **project directory** (`.semantic-search/data/`), so open your project in Cursor and the index is per-workspace. To use a fixed data directory instead, add `"env": { "SEMANTIC_SEARCH_DATA_DIR": "/path/to/data" }`. To have the server index directories on startup, set `"env": { "SEMANTIC_SEARCH_INDEX_DIRS": "./dictionary,./glossary" }` (comma-separated paths).
|
|
48
48
|
|
|
49
|
-
2. **Restart Cursor** (or reload the window).
|
|
49
|
+
2. **Restart Cursor** (or reload the window). If `SEMANTIC_SEARCH_INDEX_DIRS` is set, indexing starts automatically in the background.
|
|
50
50
|
|
|
51
51
|
3. In chat or Composer, ask the AI to use the tools:
|
|
52
|
-
- **Index**: "Index the directory `./my-dictionary`" (or a list of paths). Optionally "clear existing index first."
|
|
53
52
|
- **Search**: "Search the index for …" or "Find entries similar to …"
|
|
54
|
-
- **Stats**: "How many lines/files are in the index?"
|
|
53
|
+
- **Stats**: "How many lines/files are in the index?" or "Is indexing still running?" — stats include progress and start time (locale-formatted) when indexing is in progress.
|
|
55
54
|
|
|
56
|
-
The AI will call `
|
|
55
|
+
The AI will call `search` and `get_index_stats` for you.
|
|
57
56
|
|
|
58
57
|
## Use in other AI IDEs (Claude Code, etc.)
|
|
59
58
|
|
|
60
59
|
Any environment that supports MCP over stdio can use this server. Run:
|
|
61
60
|
|
|
62
|
-
- **One-liner**: `npx -y @lojban/semantic-search-mcp` — dependencies are installed on first run; index is stored in the current working directory's `.semantic-search/data/`.
|
|
61
|
+
- **One-liner**: `npx -y @lojban/semantic-search-mcp` — dependencies are installed on first run; index is stored in the current working directory's `.semantic-search/data/`. Set env `SEMANTIC_SEARCH_INDEX_DIRS` (comma-separated paths) to index those directories on startup in the background. Tools: `search`, `get_index_stats`.
|
|
63
62
|
|
|
64
63
|
**From source**: Clone the repo, run `npm install` once, then use `"command": "npx", "args": ["tsx", "src/index.ts"], "cwd": "/path/to/semantic-search-mcp"` or `"command": "node", "args": ["/path/to/semantic-search-mcp/run.mjs"]` (no `cwd` needed with the latter). See [MCP_SETUP.md](MCP_SETUP.md) for details.
|
|
65
64
|
|
|
@@ -67,60 +66,21 @@ Any environment that supports MCP over stdio can use this server. Run:
|
|
|
67
66
|
|
|
68
67
|
| Tool | Description |
|
|
69
68
|
|------|-------------|
|
|
70
|
-
| `index_directories` | Scan one or more directories and index every line of supported text files. Pass `directories` (array of paths) or set env `SEMANTIC_SEARCH_INDEX_DIRS` (comma-separated). Optional: `clear_existing: true` to replace the index. |
|
|
71
69
|
| `search` | Semantic search: `query` (string), optional `limit` (default 10). Returns file path, line number, content, and similarity score. |
|
|
72
|
-
| `get_index_stats` | Returns total number of indexed files and lines. |
|
|
70
|
+
| `get_index_stats` | Returns total number of indexed files and lines. When indexing is running in the background, also returns an `indexing` object with progress: `started_at` (locale-formatted), `lines_indexed_so_far`, `files_indexed_so_far`, and `in_progress`. If the last indexing run failed, `indexing.error` contains the error message. |
|
|
73
71
|
|
|
74
|
-
### Indexing
|
|
72
|
+
### Indexing on startup
|
|
75
73
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
**In Cursor (natural language):**
|
|
79
|
-
|
|
80
|
-
- "Index these directories: `./dictionary`, `./glossary`, and `./notes`."
|
|
81
|
-
- "Index `./data/lojban-eng` and `/home/me/other-corpus` with clear_existing true."
|
|
82
|
-
- "Clear the index and re-index only `./tsv` and `./exports`."
|
|
83
|
-
|
|
84
|
-
**Under the hood** the tool receives:
|
|
85
|
-
|
|
86
|
-
```json
|
|
87
|
-
{
|
|
88
|
-
"directories": ["./dictionary", "./glossary", "./notes"],
|
|
89
|
-
"clear_existing": false
|
|
90
|
-
}
|
|
91
|
-
```
|
|
92
|
-
|
|
93
|
-
To replace the entire index with new content from several places:
|
|
94
|
-
|
|
95
|
-
```json
|
|
96
|
-
{
|
|
97
|
-
"directories": ["/path/to/dict1", "/path/to/dict2", "/path/to/corpus"],
|
|
98
|
-
"clear_existing": true
|
|
99
|
-
}
|
|
100
|
-
```
|
|
101
|
-
|
|
102
|
-
Paths can be anywhere on disk (e.g. different drives or projects); the server reads and indexes all supported text/TSV/CSV files under each directory recursively.
|
|
103
|
-
|
|
104
|
-
### Memory and batch size
|
|
105
|
-
|
|
106
|
-
Indexing uses **adaptive batch size** based on free system RAM so the OS doesn’t freeze on low-memory machines. The server reads `os.freemem()`, keeps a reserve (default 400MB), and caps batch size between 32 and 512 lines. You can tune this with env vars:
|
|
107
|
-
|
|
108
|
-
- **`SEMANTIC_SEARCH_RESERVE_MB`** — MB of RAM to keep free (default `400`).
|
|
109
|
-
- **`SEMANTIC_SEARCH_MIN_BATCH`** — minimum lines per batch (default `32`).
|
|
110
|
-
- **`SEMANTIC_SEARCH_MAX_BATCH`** — maximum lines per batch (default `512`).
|
|
111
|
-
|
|
112
|
-
Example: `SEMANTIC_SEARCH_RESERVE_MB=800 SEMANTIC_SEARCH_MAX_BATCH=256` to leave more headroom and use smaller batches.
|
|
113
|
-
|
|
114
|
-
- **`SEMANTIC_SEARCH_GC`** — explicit GC after each batch is **on by default** when Node is run with `--expose-gc` (helps avoid OS freezes during long indexing). In MCP use e.g. `"args": ["--expose-gc", "-y", "@lojban/semantic-search-mcp"]`. Set to `0` or `false` to disable.
|
|
74
|
+
Set the environment variable **`SEMANTIC_SEARCH_INDEX_DIRS`** to a comma-separated list of directories to index. When the MCP server starts, it begins indexing those directories in the background (async). The index is cleared and rebuilt each time the server starts. Use absolute paths or paths relative to the server's working directory. The server reads and indexes all supported text/TSV/CSV files under each directory recursively. Indexing uses bounded memory and yields to the event loop so the OS stays responsive.
|
|
115
75
|
|
|
116
76
|
## Example: Lojban dictionary gaps
|
|
117
77
|
|
|
118
|
-
1. Put your dictionary TSV (e.g. `jbo-eng.tsv`) in a folder.
|
|
119
|
-
2.
|
|
120
|
-
3.
|
|
78
|
+
1. Put your dictionary TSV (e.g. `jbo-eng.tsv`) in a folder (e.g. `./dictionary`).
|
|
79
|
+
2. Set `SEMANTIC_SEARCH_INDEX_DIRS=./dictionary` in your MCP config (or in the environment). Restart the server; indexing runs in the background.
|
|
80
|
+
3. In Cursor: "Search for entries similar to 'to cause to become warm' and limit 20."
|
|
121
81
|
4. Or: "Search for 'emotional state of joy' and show me what we have; then suggest word combinations the dictionary might be missing."
|
|
122
82
|
|
|
123
|
-
The index is stored in `.semantic-search/data/vectors.db` (or your project root).
|
|
83
|
+
The index is stored in `.semantic-search/data/vectors.db` under your project root (or in `vectors.db` inside `SEMANTIC_SEARCH_DATA_DIR`, if that is set). Restart the server to re-index when you add or change files.
|
|
124
84
|
|
|
125
85
|
## Development
|
|
126
86
|
|
package/package.json
CHANGED
package/src/embeddings.ts
CHANGED
|
@@ -36,8 +36,8 @@ export async function getBatchEmbeddings(texts: string[]): Promise<Float32Array[
|
|
|
36
36
|
const ext = await getExtractor();
|
|
37
37
|
const results: Float32Array[] = [];
|
|
38
38
|
|
|
39
|
-
// Process in batches for memory; each batch is one model forward pass
|
|
40
|
-
const batchSize =
|
|
39
|
+
// Process in batches for memory; each batch is one model forward pass (smaller = lower peak RAM)
|
|
40
|
+
const batchSize = 32;
|
|
41
41
|
for (let i = 0; i < texts.length; i += batchSize) {
|
|
42
42
|
const batch = texts.slice(i, i + batchSize);
|
|
43
43
|
const output = await ext(batch, { pooling: 'mean', normalize: true });
|
package/src/index.ts
CHANGED
|
@@ -5,10 +5,9 @@ import {
|
|
|
5
5
|
CallToolRequestSchema,
|
|
6
6
|
ListToolsRequestSchema,
|
|
7
7
|
} from '@modelcontextprotocol/sdk/types.js';
|
|
8
|
-
import os from 'node:os';
|
|
9
8
|
import path from 'path';
|
|
10
9
|
import { getEmbedding, getBatchEmbeddings } from './embeddings.js';
|
|
11
|
-
import { createVectorStorage, type SearchResult
|
|
10
|
+
import { createVectorStorage, type SearchResult } from './storage.js';
|
|
12
11
|
import { scanDirectories } from './scanner.js';
|
|
13
12
|
|
|
14
13
|
// Data dir: use env, or project cwd so each workspace has its own index when run via npx from Cursor
|
|
@@ -17,191 +16,81 @@ const dataDir =
|
|
|
17
16
|
path.join(process.cwd(), '.semantic-search', 'data');
|
|
18
17
|
const DB_PATH = path.join(dataDir, 'vectors.db');
|
|
19
18
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
directories: string[];
|
|
19
|
+
// Background indexing state (progress for get_index_stats)
|
|
20
|
+
const indexingState = {
|
|
21
|
+
inProgress: false,
|
|
22
|
+
startedAt: null as Date | null,
|
|
23
|
+
linesIndexed: 0,
|
|
24
|
+
filesIndexed: 0,
|
|
25
|
+
error: null as string | null,
|
|
28
26
|
};
|
|
29
27
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
startedAt: null,
|
|
33
|
-
finishedAt: null,
|
|
34
|
-
lastError: null,
|
|
35
|
-
indexedLines: 0,
|
|
36
|
-
indexedFiles: 0,
|
|
37
|
-
directories: [],
|
|
38
|
-
};
|
|
39
|
-
|
|
40
|
-
// Single "mutex": only one indexing job is allowed to run. Starting a new job aborts the previous one.
|
|
41
|
-
let currentIndexingAbortController: AbortController | null = null;
|
|
42
|
-
let currentJobId = 0;
|
|
43
|
-
|
|
44
|
-
// Adaptive batch size: reserve RAM so we don't freeze the OS (env overrides in bytes or MB)
|
|
45
|
-
const RESERVE_MB = Number(process.env.SEMANTIC_SEARCH_RESERVE_MB) || 400;
|
|
46
|
-
const RESERVE_BYTES = RESERVE_MB * 1024 * 1024;
|
|
47
|
-
const MIN_BATCH = Number(process.env.SEMANTIC_SEARCH_MIN_BATCH) || 32;
|
|
48
|
-
const MAX_BATCH = Number(process.env.SEMANTIC_SEARCH_MAX_BATCH) || 128;
|
|
49
|
-
// Explicit GC after each batch (when --expose-gc is available). Default on; set SEMANTIC_SEARCH_GC=0 or false to disable.
|
|
50
|
-
const ENABLE_GC = process.env.SEMANTIC_SEARCH_GC !== '0' && process.env.SEMANTIC_SEARCH_GC !== 'false';
|
|
51
|
-
|
|
52
|
-
/** Rough bytes per indexed line in memory: line text + path + embedding (384 floats) + overhead */
|
|
53
|
-
const BYTES_PER_LINE_ESTIMATE = 4000;
|
|
54
|
-
|
|
55
|
-
/**
|
|
56
|
-
* Compute batch size from current free system RAM. Keeps reserve free to avoid freezing the OS.
|
|
57
|
-
*/
|
|
58
|
-
function getAdaptiveBatchSize(): number {
|
|
59
|
-
const free = os.freemem();
|
|
60
|
-
const available = free > RESERVE_BYTES ? free - RESERVE_BYTES : Math.floor(free / 2);
|
|
61
|
-
const batch = Math.floor(available / BYTES_PER_LINE_ESTIMATE);
|
|
62
|
-
const clamped = Math.max(MIN_BATCH, Math.min(MAX_BATCH, batch));
|
|
63
|
-
return clamped;
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
/**
|
|
67
|
-
* Request indexing of directories. If another indexing job is running, it is aborted first.
|
|
68
|
-
* Then a new job is started (clears index and rebuilds).
|
|
69
|
-
*/
|
|
70
|
-
function requestIndexing(storage: VectorStorage, directories: string[]): void {
|
|
71
|
-
if (!directories.length) {
|
|
72
|
-
console.error('No directories to index. Set SEMANTIC_SEARCH_INDEX_DIRS (comma-separated paths).');
|
|
73
|
-
return;
|
|
74
|
-
}
|
|
28
|
+
// Batch size kept small to avoid high RAM usage during indexing
|
|
29
|
+
const INDEX_BATCH_SIZE = 256;
|
|
75
30
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
currentIndexingAbortController = null;
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
currentJobId += 1;
|
|
83
|
-
const jobId = currentJobId;
|
|
84
|
-
currentIndexingAbortController = new AbortController();
|
|
85
|
-
const signal = currentIndexingAbortController.signal;
|
|
86
|
-
|
|
87
|
-
indexStatus.isIndexing = true;
|
|
88
|
-
indexStatus.startedAt = Date.now();
|
|
89
|
-
indexStatus.finishedAt = null;
|
|
90
|
-
indexStatus.lastError = null;
|
|
91
|
-
indexStatus.directories = directories;
|
|
92
|
-
indexStatus.indexedLines = 0;
|
|
93
|
-
indexStatus.indexedFiles = 0;
|
|
94
|
-
|
|
95
|
-
void startIndexing(storage, directories, signal, jobId);
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
async function startIndexing(
|
|
99
|
-
storage: VectorStorage,
|
|
100
|
-
directories: string[],
|
|
101
|
-
signal: AbortSignal,
|
|
102
|
-
jobId: number
|
|
31
|
+
async function runBackgroundIndexing(
|
|
32
|
+
storage: Awaited<ReturnType<typeof createVectorStorage>>,
|
|
33
|
+
directories: string[]
|
|
103
34
|
): Promise<void> {
|
|
104
|
-
|
|
35
|
+
indexingState.inProgress = true;
|
|
36
|
+
indexingState.startedAt = new Date();
|
|
37
|
+
indexingState.linesIndexed = 0;
|
|
38
|
+
indexingState.filesIndexed = 0;
|
|
39
|
+
indexingState.error = null;
|
|
40
|
+
storage.clear();
|
|
105
41
|
|
|
106
42
|
try {
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
const processBatch = async (batchToProcess: any[]) => {
|
|
117
|
-
if (batchToProcess.length === 0) return;
|
|
118
|
-
const contents = batchToProcess.map((l) => l.content);
|
|
43
|
+
let currentBatch: Array<{ filePath: string; lineNumber: number; content: string }> = [];
|
|
44
|
+
let processingPromise: Promise<void> | null = null;
|
|
45
|
+
|
|
46
|
+
let batchesProcessed = 0;
|
|
47
|
+
const processBatch = async (
|
|
48
|
+
batch: Array<{ filePath: string; lineNumber: number; content: string }>
|
|
49
|
+
): Promise<void> => {
|
|
50
|
+
if (batch.length === 0) return;
|
|
51
|
+
const contents = batch.map((l) => l.content);
|
|
119
52
|
const embeddings = await getBatchEmbeddings(contents);
|
|
120
|
-
|
|
121
|
-
const batchData = batchToProcess.map((line, idx) => ({
|
|
53
|
+
const batchData = batch.map((line, idx) => ({
|
|
122
54
|
filePath: line.filePath,
|
|
123
55
|
lineNumber: line.lineNumber,
|
|
124
56
|
content: line.content,
|
|
125
57
|
embedding: embeddings[idx],
|
|
126
58
|
}));
|
|
127
|
-
|
|
128
59
|
await storage.upsertLinesBatch(batchData);
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
(globalThis as { gc: () => void }).gc();
|
|
60
|
+
indexingState.linesIndexed += batch.length;
|
|
61
|
+
batchesProcessed++;
|
|
62
|
+
if (batchesProcessed === 1 || batchesProcessed % 10 === 0) {
|
|
63
|
+
const stats = await storage.getStats();
|
|
64
|
+
indexingState.filesIndexed = stats.totalFiles;
|
|
135
65
|
}
|
|
136
66
|
};
|
|
137
67
|
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
let batchSize = getAdaptiveBatchSize();
|
|
141
|
-
console.error(`Adaptive batch size: ${batchSize} (free RAM: ${Math.round(os.freemem() / 1024 / 1024)}MB, reserve: ${RESERVE_MB}MB)`);
|
|
68
|
+
const yieldToEventLoop = (): Promise<void> =>
|
|
69
|
+
new Promise((resolve) => setImmediate(resolve));
|
|
142
70
|
|
|
143
71
|
for await (const line of scanDirectories(directories)) {
|
|
144
|
-
if (signal.aborted) break;
|
|
145
|
-
|
|
146
72
|
currentBatch.push(line);
|
|
147
|
-
if (currentBatch.length >=
|
|
73
|
+
if (currentBatch.length >= INDEX_BATCH_SIZE) {
|
|
74
|
+
if (processingPromise) await processingPromise;
|
|
148
75
|
const batchToProcess = currentBatch;
|
|
149
76
|
currentBatch = [];
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
await processBatch(batchToProcess);
|
|
153
|
-
if (signal.aborted) break;
|
|
77
|
+
processingPromise = processBatch(batchToProcess);
|
|
78
|
+
await yieldToEventLoop();
|
|
154
79
|
}
|
|
155
80
|
}
|
|
156
81
|
|
|
157
|
-
if (
|
|
158
|
-
|
|
159
|
-
return;
|
|
160
|
-
}
|
|
161
|
-
|
|
162
|
-
if (currentBatch.length > 0) {
|
|
163
|
-
await processBatch(currentBatch);
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
if (!isCurrentJob()) return;
|
|
82
|
+
if (processingPromise) await processingPromise;
|
|
83
|
+
if (currentBatch.length > 0) await processBatch(currentBatch);
|
|
167
84
|
|
|
168
85
|
const stats = await storage.getStats();
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
indexStatus.finishedAt = Date.now();
|
|
172
|
-
|
|
173
|
-
console.error(
|
|
174
|
-
`Finished indexing ${stats.totalLines} lines from ${stats.totalFiles} files in background job.`
|
|
175
|
-
);
|
|
86
|
+
indexingState.linesIndexed = stats.totalLines;
|
|
87
|
+
indexingState.filesIndexed = stats.totalFiles;
|
|
176
88
|
} catch (err) {
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
indexStatus.lastError = message;
|
|
180
|
-
indexStatus.finishedAt = Date.now();
|
|
181
|
-
}
|
|
182
|
-
console.error('Error during indexing job:', err);
|
|
89
|
+
indexingState.error = err instanceof Error ? err.message : String(err);
|
|
90
|
+
console.error('Background indexing error:', indexingState.error);
|
|
183
91
|
} finally {
|
|
184
|
-
|
|
185
|
-
indexStatus.isIndexing = false;
|
|
186
|
-
}
|
|
187
|
-
if (currentIndexingAbortController && currentJobId === jobId) {
|
|
188
|
-
currentIndexingAbortController = null;
|
|
189
|
-
}
|
|
190
|
-
}
|
|
191
|
-
}
|
|
192
|
-
|
|
193
|
-
function ensureInitialIndexing(storage: VectorStorage): void {
|
|
194
|
-
const envDirs = process.env.SEMANTIC_SEARCH_INDEX_DIRS;
|
|
195
|
-
const directories = envDirs ? envDirs.split(',').map((d) => d.trim()).filter(Boolean) : [];
|
|
196
|
-
|
|
197
|
-
if (!directories.length) {
|
|
198
|
-
console.error(
|
|
199
|
-
'Semantic Search MCP: SEMANTIC_SEARCH_INDEX_DIRS is not set; automatic indexing on startup is disabled.'
|
|
200
|
-
);
|
|
201
|
-
return;
|
|
92
|
+
indexingState.inProgress = false;
|
|
202
93
|
}
|
|
203
|
-
|
|
204
|
-
requestIndexing(storage, directories);
|
|
205
94
|
}
|
|
206
95
|
|
|
207
96
|
async function main() {
|
|
@@ -222,19 +111,9 @@ async function main() {
|
|
|
222
111
|
server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
223
112
|
return {
|
|
224
113
|
tools: [
|
|
225
|
-
{
|
|
226
|
-
name: 'index_directories',
|
|
227
|
-
description:
|
|
228
|
-
'Trigger background indexing of directories from SEMANTIC_SEARCH_INDEX_DIRS (comma-separated). Clears and rebuilds the index asynchronously.',
|
|
229
|
-
inputSchema: {
|
|
230
|
-
type: 'object',
|
|
231
|
-
properties: {},
|
|
232
|
-
},
|
|
233
|
-
},
|
|
234
114
|
{
|
|
235
115
|
name: 'search',
|
|
236
|
-
description:
|
|
237
|
-
'Search for lines semantically similar to the query. Returns the most relevant lines from indexed files.',
|
|
116
|
+
description: 'Search for lines semantically similar to the query. Returns the most relevant lines from indexed files.',
|
|
238
117
|
inputSchema: {
|
|
239
118
|
type: 'object',
|
|
240
119
|
properties: {
|
|
@@ -253,7 +132,7 @@ async function main() {
|
|
|
253
132
|
},
|
|
254
133
|
{
|
|
255
134
|
name: 'get_index_stats',
|
|
256
|
-
description: 'Get statistics
|
|
135
|
+
description: 'Get statistics about the current index (number of files and lines indexed). If indexing is running in the background, returns progress and start time (locale-formatted).',
|
|
257
136
|
inputSchema: {
|
|
258
137
|
type: 'object',
|
|
259
138
|
properties: {},
|
|
@@ -268,40 +147,6 @@ async function main() {
|
|
|
268
147
|
|
|
269
148
|
try {
|
|
270
149
|
switch (name) {
|
|
271
|
-
case 'index_directories': {
|
|
272
|
-
const envDirs = process.env.SEMANTIC_SEARCH_INDEX_DIRS;
|
|
273
|
-
const directories = envDirs ? envDirs.split(',').map((d) => d.trim()).filter(Boolean) : [];
|
|
274
|
-
if (!directories.length) {
|
|
275
|
-
throw new Error(
|
|
276
|
-
'No directories to index. Set SEMANTIC_SEARCH_INDEX_DIRS (comma-separated paths).'
|
|
277
|
-
);
|
|
278
|
-
}
|
|
279
|
-
|
|
280
|
-
// Abort any in-progress indexing and start a new job (clears and rebuilds).
|
|
281
|
-
requestIndexing(storage, directories);
|
|
282
|
-
|
|
283
|
-
const stats = await storage.getStats();
|
|
284
|
-
return {
|
|
285
|
-
content: [
|
|
286
|
-
{
|
|
287
|
-
type: 'text',
|
|
288
|
-
text: JSON.stringify({
|
|
289
|
-
success: true,
|
|
290
|
-
indexing: indexStatus.isIndexing,
|
|
291
|
-
indexed_lines: stats.totalLines,
|
|
292
|
-
indexed_files: stats.totalFiles,
|
|
293
|
-
started_at: indexStatus.startedAt,
|
|
294
|
-
finished_at: indexStatus.finishedAt,
|
|
295
|
-
last_error: indexStatus.lastError,
|
|
296
|
-
message: indexStatus.isIndexing
|
|
297
|
-
? `Indexing started in background. Currently ${stats.totalLines} lines from ${stats.totalFiles} files in index.`
|
|
298
|
-
: `Indexing completed. Indexed ${stats.totalLines} lines from ${stats.totalFiles} files.`,
|
|
299
|
-
}),
|
|
300
|
-
},
|
|
301
|
-
],
|
|
302
|
-
};
|
|
303
|
-
}
|
|
304
|
-
|
|
305
150
|
case 'search': {
|
|
306
151
|
const query = (args as { query: string; limit?: number }).query;
|
|
307
152
|
const limit = (args as { query: string; limit?: number }).limit ?? 10;
|
|
@@ -329,21 +174,41 @@ async function main() {
|
|
|
329
174
|
|
|
330
175
|
case 'get_index_stats': {
|
|
331
176
|
const stats = await storage.getStats();
|
|
177
|
+
const payload: {
|
|
178
|
+
total_files: number;
|
|
179
|
+
total_lines: number;
|
|
180
|
+
indexing?: {
|
|
181
|
+
in_progress: boolean;
|
|
182
|
+
started_at: string;
|
|
183
|
+
lines_indexed_so_far: number;
|
|
184
|
+
files_indexed_so_far: number;
|
|
185
|
+
error?: string;
|
|
186
|
+
};
|
|
187
|
+
} = {
|
|
188
|
+
total_files: stats.totalFiles,
|
|
189
|
+
total_lines: stats.totalLines,
|
|
190
|
+
};
|
|
191
|
+
if (indexingState.inProgress && indexingState.startedAt) {
|
|
192
|
+
payload.indexing = {
|
|
193
|
+
in_progress: true,
|
|
194
|
+
started_at: indexingState.startedAt.toLocaleString(),
|
|
195
|
+
lines_indexed_so_far: indexingState.linesIndexed,
|
|
196
|
+
files_indexed_so_far: indexingState.filesIndexed,
|
|
197
|
+
};
|
|
198
|
+
} else if (indexingState.error) {
|
|
199
|
+
payload.indexing = {
|
|
200
|
+
in_progress: false,
|
|
201
|
+
started_at: indexingState.startedAt?.toLocaleString() ?? '',
|
|
202
|
+
lines_indexed_so_far: indexingState.linesIndexed,
|
|
203
|
+
files_indexed_so_far: indexingState.filesIndexed,
|
|
204
|
+
error: indexingState.error,
|
|
205
|
+
};
|
|
206
|
+
}
|
|
332
207
|
return {
|
|
333
208
|
content: [
|
|
334
209
|
{
|
|
335
210
|
type: 'text',
|
|
336
|
-
text: JSON.stringify(
|
|
337
|
-
total_files: stats.totalFiles,
|
|
338
|
-
total_lines: stats.totalLines,
|
|
339
|
-
is_indexing: indexStatus.isIndexing,
|
|
340
|
-
indexed_lines: indexStatus.indexedLines,
|
|
341
|
-
indexed_files: indexStatus.indexedFiles,
|
|
342
|
-
started_at: indexStatus.startedAt,
|
|
343
|
-
finished_at: indexStatus.finishedAt,
|
|
344
|
-
last_error: indexStatus.lastError,
|
|
345
|
-
directories: indexStatus.directories,
|
|
346
|
-
}),
|
|
211
|
+
text: JSON.stringify(payload),
|
|
347
212
|
},
|
|
348
213
|
],
|
|
349
214
|
};
|
|
@@ -365,8 +230,14 @@ async function main() {
|
|
|
365
230
|
await server.connect(transport);
|
|
366
231
|
console.error('Semantic Search MCP Server running on stdio');
|
|
367
232
|
|
|
368
|
-
|
|
369
|
-
|
|
233
|
+
const envDirs = process.env.SEMANTIC_SEARCH_INDEX_DIRS;
|
|
234
|
+
const directories = envDirs ? envDirs.split(',').map((d) => d.trim()).filter(Boolean) : [];
|
|
235
|
+
if (directories.length > 0) {
|
|
236
|
+
console.error(`Starting background indexing for ${directories.length} directories...`);
|
|
237
|
+
runBackgroundIndexing(storage, directories).catch((err) => {
|
|
238
|
+
console.error('Background indexing failed:', err);
|
|
239
|
+
});
|
|
240
|
+
}
|
|
370
241
|
}
|
|
371
242
|
|
|
372
243
|
main().catch(console.error);
|
package/src/scanner.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { createReadStream, statSync } from 'fs';
|
|
2
|
-
import {
|
|
2
|
+
import { globIterate } from 'glob';
|
|
3
3
|
import path from 'path';
|
|
4
4
|
import readline from 'readline';
|
|
5
5
|
|
|
@@ -18,6 +18,9 @@ const MIN_LINE_LENGTH = 5;
|
|
|
18
18
|
// Maximum file size to process (skip very large files)
|
|
19
19
|
const MAX_FILE_SIZE = 10 * 1024 * 1024; // 10MB
|
|
20
20
|
|
|
21
|
+
// Cap line length to avoid unbounded readline buffer (e.g. file with no newlines)
|
|
22
|
+
const MAX_LINE_LENGTH = 256 * 1024; // 256KB
|
|
23
|
+
|
|
21
24
|
/**
|
|
22
25
|
* Check if a file is a text file we should index
|
|
23
26
|
*/
|
|
@@ -27,18 +30,17 @@ function isTextFile(filePath: string): boolean {
|
|
|
27
30
|
}
|
|
28
31
|
|
|
29
32
|
/**
|
|
30
|
-
* Scan a directory for text files and yield lines
|
|
33
|
+
* Scan a directory for text files and yield lines.
|
|
34
|
+
* Uses globIterate so file paths are streamed one-by-one (no full list in RAM).
|
|
35
|
+
* Readline has maxLineLength to avoid huge single-line buffers.
|
|
31
36
|
*/
|
|
32
37
|
export async function* scanDirectory(dirPath: string): AsyncGenerator<FileLine> {
|
|
33
|
-
// Find all files in directory recursively
|
|
34
38
|
const pattern = path.join(dirPath, '**/*');
|
|
35
|
-
|
|
36
|
-
const files = await glob(pattern, { nodir: true, absolute: true });
|
|
37
|
-
|
|
38
|
-
for (const filePath of files) {
|
|
39
|
+
for await (const filePath of globIterate(pattern, { nodir: true, absolute: true }) as AsyncIterable<string>) {
|
|
39
40
|
if (!isTextFile(filePath)) continue;
|
|
40
41
|
|
|
41
42
|
let fileStream: ReturnType<typeof createReadStream> | null = null;
|
|
43
|
+
let rl: readline.Interface | null = null;
|
|
42
44
|
try {
|
|
43
45
|
const stats = statSync(filePath);
|
|
44
46
|
if (stats.size > MAX_FILE_SIZE) {
|
|
@@ -47,34 +49,29 @@ export async function* scanDirectory(dirPath: string): AsyncGenerator<FileLine>
|
|
|
47
49
|
}
|
|
48
50
|
|
|
49
51
|
fileStream = createReadStream(filePath);
|
|
50
|
-
|
|
52
|
+
rl = readline.createInterface({
|
|
51
53
|
input: fileStream,
|
|
52
54
|
crlfDelay: Infinity,
|
|
55
|
+
maxLineLength: MAX_LINE_LENGTH,
|
|
53
56
|
});
|
|
54
57
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
};
|
|
66
|
-
}
|
|
58
|
+
let lineNumber = 0;
|
|
59
|
+
for await (const line of rl) {
|
|
60
|
+
lineNumber++;
|
|
61
|
+
const trimmed = line.trim();
|
|
62
|
+
if (trimmed.length >= MIN_LINE_LENGTH) {
|
|
63
|
+
yield {
|
|
64
|
+
filePath,
|
|
65
|
+
lineNumber,
|
|
66
|
+
content: trimmed,
|
|
67
|
+
};
|
|
67
68
|
}
|
|
68
|
-
} finally {
|
|
69
|
-
rl.close();
|
|
70
|
-
fileStream.destroy();
|
|
71
|
-
fileStream = null;
|
|
72
69
|
}
|
|
73
70
|
} catch (err) {
|
|
74
71
|
console.error(`Error reading file ${filePath}:`, err);
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
72
|
+
} finally {
|
|
73
|
+
rl?.close();
|
|
74
|
+
fileStream?.destroy();
|
|
78
75
|
}
|
|
79
76
|
}
|
|
80
77
|
}
|
package/src/storage.ts
CHANGED
|
@@ -31,6 +31,10 @@ export class VectorStorage {
|
|
|
31
31
|
}
|
|
32
32
|
|
|
33
33
|
private init(): void {
|
|
34
|
+
// Limit SQLite page cache to avoid unbounded RAM (negative = kibibytes)
|
|
35
|
+
try {
|
|
36
|
+
this.db.exec('PRAGMA cache_size = -65536'); // 64MB max
|
|
37
|
+
} catch {}
|
|
34
38
|
this.db.exec(`
|
|
35
39
|
CREATE TABLE IF NOT EXISTS lines (
|
|
36
40
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|