@lojban/semantic-search-mcp 1.0.5 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/index.ts +163 -40
- package/src/scanner.ts +29 -25
package/package.json
CHANGED
package/src/index.ts
CHANGED
|
@@ -7,7 +7,7 @@ import {
|
|
|
7
7
|
} from '@modelcontextprotocol/sdk/types.js';
|
|
8
8
|
import path from 'path';
|
|
9
9
|
import { getEmbedding, getBatchEmbeddings } from './embeddings.js';
|
|
10
|
-
import { createVectorStorage, type SearchResult } from './storage.js';
|
|
10
|
+
import { createVectorStorage, type SearchResult, type VectorStorage } from './storage.js';
|
|
11
11
|
import { scanDirectories } from './scanner.js';
|
|
12
12
|
|
|
13
13
|
// Data dir: use env, or project cwd so each workspace has its own index when run via npx from Cursor
|
|
@@ -16,6 +16,139 @@ const dataDir =
|
|
|
16
16
|
path.join(process.cwd(), '.semantic-search', 'data');
|
|
17
17
|
const DB_PATH = path.join(dataDir, 'vectors.db');
|
|
18
18
|
|
|
19
|
+
/**
 * Progress and outcome of the background indexing job, shared by the MCP tool
 * handlers that report indexing state.
 */
interface IndexStatus {
  /** True while an indexing job is running. */
  isIndexing: boolean;
  /** `Date.now()` timestamp taken when the latest job started; null before any job. */
  startedAt: number | null;
  /** `Date.now()` timestamp taken when the latest job ended; null while running or before any job. */
  finishedAt: number | null;
  /** Message of the most recent indexing error recorded for the latest job, or null. */
  lastError: string | null;
  /** Number of lines indexed so far by the latest job. */
  indexedLines: number;
  /** Number of files in the index, as reported by storage when the latest job finished. */
  indexedFiles: number;
  /** Directories handed to the latest job. */
  directories: string[];
}

// Single module-level status record; starts out describing "no job has run yet".
const indexStatus: IndexStatus = {
  isIndexing: false,
  startedAt: null,
  finishedAt: null,
  lastError: null,
  indexedLines: 0,
  indexedFiles: 0,
  directories: [],
};
|
|
38
|
+
|
|
39
|
+
/**
 * Rebuild the vector index for `directories` as a background job.
 *
 * Clears `storage`, streams lines from `scanDirectories`, embeds them in
 * batches and upserts each batch, publishing progress through the module-level
 * `indexStatus` record. Errors are caught, logged and stored in
 * `indexStatus.lastError` rather than propagated to the caller.
 *
 * @param storage - Vector store that is cleared and then repopulated.
 * @param directories - Directories to scan; an empty list only logs a hint.
 * @returns A promise that settles once the job has finished (or was not started).
 */
async function startIndexing(storage: VectorStorage, directories: string[]): Promise<void> {
  if (!directories.length) {
    console.error('No directories to index. Set SEMANTIC_SEARCH_INDEX_DIRS (comma-separated paths).');
    return;
  }

  // Only one job may run at a time; a second request is dropped, not queued.
  if (indexStatus.isIndexing) {
    console.error('Indexing already in progress, not starting a new job.');
    return;
  }

  indexStatus.isIndexing = true;
  indexStatus.startedAt = Date.now();
  indexStatus.finishedAt = null;
  indexStatus.lastError = null;
  // Copy so later mutation of the caller's array cannot change the reported status.
  indexStatus.directories = [...directories];
  indexStatus.indexedLines = 0;
  indexStatus.indexedFiles = 0;

  try {
    // Awaited in case clear() is asynchronous (the other storage calls here are
    // awaited); awaiting a synchronous result is harmless.
    await storage.clear();

    console.error(`Scanning ${directories.length} directories (background indexing)...`);

    // Shape of the lines consumed below.
    // NOTE(review): assumes the scanner's FileLine carries exactly these field types — confirm.
    type PendingLine = { filePath: string; lineNumber: number; content: string };

    let indexedCount = 0;
    const BATCH_SIZE = 512;
    let currentBatch: PendingLine[] = [];

    // Embed one batch, write it to storage and publish the running line count.
    const processBatch = async (batchToProcess: PendingLine[]): Promise<void> => {
      if (batchToProcess.length === 0) return;
      const contents = batchToProcess.map((l) => l.content);
      const embeddings = await getBatchEmbeddings(contents);

      const batchData = batchToProcess.map((line, idx) => ({
        filePath: line.filePath,
        lineNumber: line.lineNumber,
        content: line.content,
        embedding: embeddings[idx],
      }));

      await storage.upsertLinesBatch(batchData);
      indexedCount += batchToProcess.length;
      indexStatus.indexedLines = indexedCount;
      console.error(`Indexed ${indexedCount} lines...`);
    };

    // Pipelining: at most ONE batch is being embedded/stored while the scanner
    // keeps reading the lines of the next batch.
    let processingPromise: Promise<void> | null = null;

    for await (const line of scanDirectories(directories)) {
      currentBatch.push(line);
      if (currentBatch.length >= BATCH_SIZE) {
        // Wait for the previous batch before starting another one.
        if (processingPromise) {
          await processingPromise;
        }

        const batchToProcess = currentBatch;
        currentBatch = [];

        // Started but deliberately not awaited, so the loop can keep reading.
        processingPromise = processBatch(batchToProcess).catch((err: unknown) => {
          // Best-effort: later batches are still indexed, but the failure is
          // recorded so status consumers do not see a clean run.
          indexStatus.lastError = err instanceof Error ? err.message : String(err);
          console.error('Error in background batch processing:', err);
        });
      }
    }

    // Wait for the last pipelined batch.
    if (processingPromise) {
      await processingPromise;
    }

    // Flush the final, partially filled batch.
    if (currentBatch.length > 0) {
      await processBatch(currentBatch);
    }

    // Replace the running counters with the totals reported by storage.
    const stats = await storage.getStats();
    indexStatus.indexedFiles = stats.totalFiles;
    indexStatus.indexedLines = stats.totalLines;
    indexStatus.finishedAt = Date.now();

    console.error(
      `Finished indexing ${stats.totalLines} lines from ${stats.totalFiles} files in background job.`
    );
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    indexStatus.lastError = message;
    indexStatus.finishedAt = Date.now();
    console.error('Error during indexing job:', err);
  } finally {
    indexStatus.isIndexing = false;
  }
}
|
|
136
|
+
|
|
137
|
+
/**
 * Kick off the startup indexing job when SEMANTIC_SEARCH_INDEX_DIRS names at
 * least one directory; otherwise log that automatic indexing is disabled.
 *
 * The job itself is not awaited: this function returns as soon as it has been
 * started.
 *
 * @param storage - Vector store handed to the indexing job.
 */
async function ensureInitialIndexing(storage: VectorStorage): Promise<void> {
  const configured = process.env.SEMANTIC_SEARCH_INDEX_DIRS;

  // Comma-separated list; surrounding whitespace and empty entries are dropped.
  const directories: string[] = [];
  if (configured) {
    for (const entry of configured.split(',')) {
      const dir = entry.trim();
      if (dir) {
        directories.push(dir);
      }
    }
  }

  if (directories.length === 0) {
    console.error(
      'Semantic Search MCP: SEMANTIC_SEARCH_INDEX_DIRS is not set; automatic indexing on startup is disabled.'
    );
    return;
  }

  // Fire-and-forget; indexing runs in background.
  void startIndexing(storage, directories);
}
|
|
151
|
+
|
|
19
152
|
async function main() {
|
|
20
153
|
const storage = await createVectorStorage(DB_PATH);
|
|
21
154
|
|
|
@@ -36,7 +169,8 @@ async function main() {
|
|
|
36
169
|
tools: [
|
|
37
170
|
{
|
|
38
171
|
name: 'index_directories',
|
|
39
|
-
description:
|
|
172
|
+
description:
|
|
173
|
+
'Trigger background indexing of directories from SEMANTIC_SEARCH_INDEX_DIRS (comma-separated). Clears and rebuilds the index asynchronously.',
|
|
40
174
|
inputSchema: {
|
|
41
175
|
type: 'object',
|
|
42
176
|
properties: {},
|
|
@@ -44,7 +178,8 @@ async function main() {
|
|
|
44
178
|
},
|
|
45
179
|
{
|
|
46
180
|
name: 'search',
|
|
47
|
-
description:
|
|
181
|
+
description:
|
|
182
|
+
'Search for lines semantically similar to the query. Returns the most relevant lines from indexed files.',
|
|
48
183
|
inputSchema: {
|
|
49
184
|
type: 'object',
|
|
50
185
|
properties: {
|
|
@@ -63,7 +198,7 @@ async function main() {
|
|
|
63
198
|
},
|
|
64
199
|
{
|
|
65
200
|
name: 'get_index_stats',
|
|
66
|
-
description: 'Get statistics
|
|
201
|
+
description: 'Get statistics and progress for the current index (files, lines, progress state)',
|
|
67
202
|
inputSchema: {
|
|
68
203
|
type: 'object',
|
|
69
204
|
properties: {},
|
|
@@ -82,42 +217,14 @@ async function main() {
|
|
|
82
217
|
const envDirs = process.env.SEMANTIC_SEARCH_INDEX_DIRS;
|
|
83
218
|
const directories = envDirs ? envDirs.split(',').map((d) => d.trim()).filter(Boolean) : [];
|
|
84
219
|
if (!directories.length) {
|
|
85
|
-
throw new Error(
|
|
220
|
+
throw new Error(
|
|
221
|
+
'No directories to index. Set SEMANTIC_SEARCH_INDEX_DIRS (comma-separated paths).'
|
|
222
|
+
);
|
|
86
223
|
}
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
console.error(`Found ${lines.length} lines to index`);
|
|
92
|
-
|
|
93
|
-
const batchSize = 128;
|
|
94
|
-
let indexed = 0;
|
|
95
|
-
|
|
96
|
-
// Pipeline: compute embeddings for next batch while writing current batch to DB
|
|
97
|
-
let embedPromise: Promise<Float32Array[]> =
|
|
98
|
-
lines.length > 0
|
|
99
|
-
? getBatchEmbeddings(lines.slice(0, batchSize).map((l) => l.content))
|
|
100
|
-
: Promise.resolve([]);
|
|
101
|
-
|
|
102
|
-
for (let i = 0; i < lines.length; i += batchSize) {
|
|
103
|
-
const batch = lines.slice(i, i + batchSize);
|
|
104
|
-
const embeddings = await embedPromise;
|
|
105
|
-
|
|
106
|
-
if (i + batchSize < lines.length) {
|
|
107
|
-
const nextTexts = lines.slice(i + batchSize, i + batchSize * 2).map((l) => l.content);
|
|
108
|
-
embedPromise = getBatchEmbeddings(nextTexts);
|
|
109
|
-
}
|
|
110
|
-
|
|
111
|
-
const batchData = batch.map((line, idx) => ({
|
|
112
|
-
filePath: line.filePath,
|
|
113
|
-
lineNumber: line.lineNumber,
|
|
114
|
-
content: line.content,
|
|
115
|
-
embedding: embeddings[idx],
|
|
116
|
-
}));
|
|
117
|
-
|
|
118
|
-
await storage.upsertLinesBatch(batchData);
|
|
119
|
-
indexed += batch.length;
|
|
120
|
-
console.error(`Indexed ${indexed}/${lines.length} lines`);
|
|
224
|
+
|
|
225
|
+
// Trigger (or reuse) background indexing job.
|
|
226
|
+
if (!indexStatus.isIndexing) {
|
|
227
|
+
void startIndexing(storage, directories);
|
|
121
228
|
}
|
|
122
229
|
|
|
123
230
|
const stats = await storage.getStats();
|
|
@@ -127,9 +234,15 @@ async function main() {
|
|
|
127
234
|
type: 'text',
|
|
128
235
|
text: JSON.stringify({
|
|
129
236
|
success: true,
|
|
237
|
+
indexing: indexStatus.isIndexing,
|
|
130
238
|
indexed_lines: stats.totalLines,
|
|
131
239
|
indexed_files: stats.totalFiles,
|
|
132
|
-
|
|
240
|
+
started_at: indexStatus.startedAt,
|
|
241
|
+
finished_at: indexStatus.finishedAt,
|
|
242
|
+
last_error: indexStatus.lastError,
|
|
243
|
+
message: indexStatus.isIndexing
|
|
244
|
+
? `Indexing started in background. Currently ${stats.totalLines} lines from ${stats.totalFiles} files in index.`
|
|
245
|
+
: `Indexing completed. Indexed ${stats.totalLines} lines from ${stats.totalFiles} files.`,
|
|
133
246
|
}),
|
|
134
247
|
},
|
|
135
248
|
],
|
|
@@ -170,6 +283,13 @@ async function main() {
|
|
|
170
283
|
text: JSON.stringify({
|
|
171
284
|
total_files: stats.totalFiles,
|
|
172
285
|
total_lines: stats.totalLines,
|
|
286
|
+
is_indexing: indexStatus.isIndexing,
|
|
287
|
+
indexed_lines: indexStatus.indexedLines,
|
|
288
|
+
indexed_files: indexStatus.indexedFiles,
|
|
289
|
+
started_at: indexStatus.startedAt,
|
|
290
|
+
finished_at: indexStatus.finishedAt,
|
|
291
|
+
last_error: indexStatus.lastError,
|
|
292
|
+
directories: indexStatus.directories,
|
|
173
293
|
}),
|
|
174
294
|
},
|
|
175
295
|
],
|
|
@@ -191,6 +311,9 @@ async function main() {
|
|
|
191
311
|
const transport = new StdioServerTransport();
|
|
192
312
|
await server.connect(transport);
|
|
193
313
|
console.error('Semantic Search MCP Server running on stdio');
|
|
314
|
+
|
|
315
|
+
// Kick off initial background indexing when the MCP server is enabled.
|
|
316
|
+
await ensureInitialIndexing(storage);
|
|
194
317
|
}
|
|
195
318
|
|
|
196
319
|
main().catch(console.error);
|
package/src/scanner.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { createReadStream, statSync } from 'fs';
|
|
2
2
|
import { glob } from 'glob';
|
|
3
3
|
import path from 'path';
|
|
4
|
+
import readline from 'readline';
|
|
4
5
|
|
|
5
6
|
export interface FileLine {
|
|
6
7
|
filePath: string;
|
|
@@ -9,7 +10,7 @@ export interface FileLine {
|
|
|
9
10
|
}
|
|
10
11
|
|
|
11
12
|
// File extensions to index
|
|
12
|
-
const TEXT_EXTENSIONS = ['.txt', '.md', '.tsv', '.csv'
|
|
13
|
+
const TEXT_EXTENSIONS = ['.txt', '.md', '.tsv', '.csv'];
|
|
13
14
|
|
|
14
15
|
// Minimum line length to index (skip very short lines)
|
|
15
16
|
const MIN_LINE_LENGTH = 5;
|
|
@@ -26,50 +27,53 @@ function isTextFile(filePath: string): boolean {
|
|
|
26
27
|
}
|
|
27
28
|
|
|
28
29
|
/**
|
|
29
|
-
* Scan a directory for text files and
|
|
30
|
+
* Scan a directory for text files and yield lines
|
|
30
31
|
*/
|
|
31
|
-
export async function scanDirectory(dirPath: string):
|
|
32
|
-
const lines: FileLine[] = [];
|
|
33
|
-
|
|
32
|
+
export async function* scanDirectory(dirPath: string): AsyncGenerator<FileLine> {
  // Lazily yields one FileLine per indexable line, so callers can start
  // consuming before the whole directory has been read.

  // Find all files in directory recursively; nodir: true ensures we only get files.
  const pattern = path.join(dirPath, '**/*');
  const files = await glob(pattern, { nodir: true, absolute: true });

  for (const filePath of files) {
    if (!isTextFile(filePath)) continue;

    try {
      const stats = statSync(filePath);
      if (stats.size > MAX_FILE_SIZE) {
        console.error(`Skipping large file: ${filePath}`);
        continue;
      }

      const fileStream = createReadStream(filePath);
      const rl = readline.createInterface({
        input: fileStream,
        crlfDelay: Infinity,
      });

      try {
        // 1-based and counts every physical line, including the ones skipped below.
        let lineNumber = 0;
        for await (const line of rl) {
          lineNumber++;
          const trimmed = line.trim();
          // Lines shorter than MIN_LINE_LENGTH after trimming are not indexed.
          if (trimmed.length >= MIN_LINE_LENGTH) {
            yield {
              filePath,
              lineNumber,
              content: trimmed,
            };
          }
        }
      } finally {
        // Release the file handle even when the consumer stops iterating early
        // or reading fails part-way through the file.
        rl.close();
        fileStream.destroy();
      }
    } catch (err) {
      // A file that cannot be read is logged and skipped; scanning continues.
      console.error(`Error reading file ${filePath}:`, err);
    }
  }
}
|
|
68
71
|
|
|
69
72
|
/**
|
|
70
|
-
* Scan multiple directories
|
|
73
|
+
* Scan multiple directories and yield lines
|
|
71
74
|
*/
|
|
72
|
-
export async function scanDirectories(dirPaths: string[]):
|
|
73
|
-
const
|
|
74
|
-
|
|
75
|
+
export async function* scanDirectories(dirPaths: string[]): AsyncGenerator<FileLine> {
  // Directories are scanned one after another, in the order given; each line
  // produced by scanDirectory is passed straight through to the consumer.
  for (const dirPath of dirPaths) {
    for await (const fileLine of scanDirectory(dirPath)) {
      yield fileLine;
    }
  }
}
|