@lojban/semantic-search-mcp 1.0.6 → 1.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/index.ts +195 -63
- package/src/scanner.ts +1 -1
package/package.json
CHANGED
package/src/index.ts
CHANGED
|
@@ -7,7 +7,7 @@ import {
|
|
|
7
7
|
} from '@modelcontextprotocol/sdk/types.js';
|
|
8
8
|
import path from 'path';
|
|
9
9
|
import { getEmbedding, getBatchEmbeddings } from './embeddings.js';
|
|
10
|
-
import { createVectorStorage, type SearchResult } from './storage.js';
|
|
10
|
+
import { createVectorStorage, type SearchResult, type VectorStorage } from './storage.js';
|
|
11
11
|
import { scanDirectories } from './scanner.js';
|
|
12
12
|
|
|
13
13
|
// Data dir: use env, or project cwd so each workspace has its own index when run via npx from Cursor
|
|
@@ -16,6 +16,173 @@ const dataDir =
|
|
|
16
16
|
path.join(process.cwd(), '.semantic-search', 'data');
|
|
17
17
|
const DB_PATH = path.join(dataDir, 'vectors.db');
|
|
18
18
|
|
|
19
|
+
/**
 * Progress snapshot of the background indexing job, reported by the
 * `index_directories` and `get_index_stats` tools.
 */
type IndexStatus = {
  /** True from the moment a job is requested until the current job settles. */
  isIndexing: boolean;
  /** `Date.now()` timestamp taken when the current job was requested; null before the first job. */
  startedAt: number | null;
  /** `Date.now()` timestamp taken when the current job finished or failed; null while it runs. */
  finishedAt: number | null;
  /** Message of the error recorded for the current job, or null if none was recorded. */
  lastError: string | null;
  /** Lines indexed so far by the current job (replaced by the storage total on completion). */
  indexedLines: number;
  /** Files in the index; only filled in from storage stats once a job completes. */
  indexedFiles: number;
  /** Directories the current job was asked to index. */
  directories: string[];
};

// Module-wide status object. It is mutated in place by the indexing functions and
// read by the tool handlers, so its identity must never change.
const indexStatus: IndexStatus = {
  isIndexing: false,
  startedAt: null,
  finishedAt: null,
  lastError: null,
  indexedLines: 0,
  indexedFiles: 0,
  directories: [],
};
|
|
38
|
+
|
|
39
|
+
// Job exclusivity: at most one indexing job is meant to be live at a time.
// Requesting a new job aborts the controller of the previous one, and every job
// compares its own id against `currentJobId` before publishing status updates.
let currentIndexingAbortController: AbortController | null = null;
let currentJobId = 0;
|
|
42
|
+
|
|
43
|
+
/**
 * Kick off a fresh background indexing job for `directories`.
 *
 * Any job that is still running is signalled to abort first. The shared
 * `indexStatus` is then reset for the new job and `startIndexing` is launched
 * without being awaited, so this function returns immediately.
 *
 * @param storage - Vector store the job clears and repopulates.
 * @param directories - Paths to scan; when empty, an error is logged and nothing is started.
 */
function requestIndexing(storage: VectorStorage, directories: string[]): void {
  if (directories.length === 0) {
    console.error('No directories to index. Set SEMANTIC_SEARCH_INDEX_DIRS (comma-separated paths).');
    return;
  }

  // Tell the previous job (if any) to stop; it is replaced by the controller created below.
  currentIndexingAbortController?.abort();

  const controller = new AbortController();
  currentIndexingAbortController = controller;

  // A new id makes every older job fail its "am I still current?" check.
  currentJobId += 1;
  const jobId = currentJobId;

  // Reset the shared status in place so existing references keep observing it.
  const freshStatus: IndexStatus = {
    isIndexing: true,
    startedAt: Date.now(),
    finishedAt: null,
    lastError: null,
    indexedLines: 0,
    indexedFiles: 0,
    directories,
  };
  Object.assign(indexStatus, freshStatus);

  // Fire and forget: startIndexing reports its own failures through indexStatus.
  void startIndexing(storage, directories, controller.signal, jobId);
}
|
|
74
|
+
|
|
75
|
+
/**
 * Run one indexing job: clear the store, stream lines from `directories`, embed
 * them in batches of `BATCH_SIZE`, persist each batch, and finally publish the
 * storage totals to `indexStatus`.
 *
 * Reading and processing are pipelined: while one batch is being embedded and
 * stored, the scanner keeps filling the next one. At most one batch is in
 * flight at a time.
 *
 * The job stops cooperatively once `signal` is aborted, and it only touches
 * `indexStatus` / the shared abort controller while it is still the current job
 * (`currentJobId === jobId`), so a superseded job cannot overwrite the status of
 * its successor.
 *
 * Errors never propagate to the caller: a failing pipelined batch is logged and
 * skipped (its message is kept in `indexStatus.lastError`), any other failure
 * ends the job and is recorded the same way.
 *
 * @param storage - Vector store to clear and fill.
 * @param directories - Paths handed to `scanDirectories`.
 * @param signal - Abort signal of this job; aborted when a newer job is requested.
 * @param jobId - Identifier assigned to this job by `requestIndexing`.
 */
async function startIndexing(
  storage: VectorStorage,
  directories: string[],
  signal: AbortSignal,
  jobId: number
): Promise<void> {
  const isCurrentJob = (): boolean => currentJobId === jobId;

  try {
    if (signal.aborted) return;

    // Awaited in case clear() is asynchronous (the other storage calls here are);
    // awaiting a synchronous result is harmless. NOTE(review): confirm against VectorStorage.
    await storage.clear();
    console.error(`Scanning ${directories.length} directories (background indexing)...`);

    let indexedCount = 0;
    const BATCH_SIZE = 512;
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    let currentBatch: any[] = [];

    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const processBatch = async (batchToProcess: any[]): Promise<void> => {
      if (batchToProcess.length === 0) return;
      const contents = batchToProcess.map((l) => l.content);
      const embeddings = await getBatchEmbeddings(contents);

      // A newer job may have been requested while the embeddings were computed.
      // It starts by clearing the store, so writing this batch now would leave
      // stale rows from the superseded job in the new index.
      if (signal.aborted) return;

      const batchData = batchToProcess.map((line, idx) => ({
        filePath: line.filePath,
        lineNumber: line.lineNumber,
        content: line.content,
        embedding: embeddings[idx],
      }));

      await storage.upsertLinesBatch(batchData);
      indexedCount += batchToProcess.length;
      if (isCurrentJob()) indexStatus.indexedLines = indexedCount;
      console.error(`Indexed ${indexedCount} lines...`);
    };

    // Best effort: a failed pipelined batch is skipped and indexing continues, but the
    // first failure is surfaced through the status instead of being visible only in logs.
    const recordBatchFailure = (err: unknown): void => {
      console.error('Error in background batch processing:', err);
      if (isCurrentJob() && indexStatus.lastError === null) {
        indexStatus.lastError = err instanceof Error ? err.message : String(err);
      }
    };

    // The single batch currently being embedded/stored, if any.
    let processingPromise: Promise<void> | null = null;

    for await (const line of scanDirectories(directories)) {
      if (signal.aborted) break;

      currentBatch.push(line);
      if (currentBatch.length >= BATCH_SIZE) {
        // Only one batch may be in flight: wait for the previous one first.
        if (processingPromise) {
          await processingPromise;
        }
        if (signal.aborted) break;

        const batchToProcess = currentBatch;
        currentBatch = [];

        // Deliberately not awaited here so the scanner can keep reading the next batch.
        processingPromise = processBatch(batchToProcess).catch(recordBatchFailure);
      }
    }

    if (signal.aborted) {
      console.error('Indexing aborted (new job started or cancelled).');
      return;
    }

    // Drain the pipeline, then flush the trailing partial batch.
    if (processingPromise) {
      await processingPromise;
    }
    if (currentBatch.length > 0) {
      await processBatch(currentBatch);
    }

    if (!isCurrentJob()) return;

    const stats = await storage.getStats();
    indexStatus.indexedFiles = stats.totalFiles;
    indexStatus.indexedLines = stats.totalLines;
    indexStatus.finishedAt = Date.now();

    console.error(
      `Finished indexing ${stats.totalLines} lines from ${stats.totalFiles} files in background job.`
    );
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    if (isCurrentJob()) {
      indexStatus.lastError = message;
      indexStatus.finishedAt = Date.now();
    }
    console.error('Error during indexing job:', err);
  } finally {
    // Only the current job may release the shared state; a superseded job must not
    // clear the flag or the controller that now belong to its successor.
    if (isCurrentJob()) {
      indexStatus.isIndexing = false;
      currentIndexingAbortController = null;
    }
  }
}
|
|
171
|
+
|
|
172
|
+
/**
 * Start the initial background indexing job from the `SEMANTIC_SEARCH_INDEX_DIRS`
 * environment variable (comma-separated paths; entries are trimmed and empty
 * entries dropped). When no directory remains, a notice is logged and no job is
 * started.
 *
 * @param storage - Vector store passed on to `requestIndexing`.
 */
function ensureInitialIndexing(storage: VectorStorage): void {
  const directories = (process.env.SEMANTIC_SEARCH_INDEX_DIRS ?? '')
    .split(',')
    .map((entry) => entry.trim())
    .filter(Boolean);

  if (directories.length > 0) {
    requestIndexing(storage, directories);
    return;
  }

  console.error(
    'Semantic Search MCP: SEMANTIC_SEARCH_INDEX_DIRS is not set; automatic indexing on startup is disabled.'
  );
}
|
|
185
|
+
|
|
19
186
|
async function main() {
|
|
20
187
|
const storage = await createVectorStorage(DB_PATH);
|
|
21
188
|
|
|
@@ -36,7 +203,8 @@ async function main() {
|
|
|
36
203
|
tools: [
|
|
37
204
|
{
|
|
38
205
|
name: 'index_directories',
|
|
39
|
-
description:
|
|
206
|
+
description:
|
|
207
|
+
'Trigger background indexing of directories from SEMANTIC_SEARCH_INDEX_DIRS (comma-separated). Clears and rebuilds the index asynchronously.',
|
|
40
208
|
inputSchema: {
|
|
41
209
|
type: 'object',
|
|
42
210
|
properties: {},
|
|
@@ -44,7 +212,8 @@ async function main() {
|
|
|
44
212
|
},
|
|
45
213
|
{
|
|
46
214
|
name: 'search',
|
|
47
|
-
description:
|
|
215
|
+
description:
|
|
216
|
+
'Search for lines semantically similar to the query. Returns the most relevant lines from indexed files.',
|
|
48
217
|
inputSchema: {
|
|
49
218
|
type: 'object',
|
|
50
219
|
properties: {
|
|
@@ -63,7 +232,7 @@ async function main() {
|
|
|
63
232
|
},
|
|
64
233
|
{
|
|
65
234
|
name: 'get_index_stats',
|
|
66
|
-
description: 'Get statistics
|
|
235
|
+
description: 'Get statistics and progress for the current index (files, lines, progress state)',
|
|
67
236
|
inputSchema: {
|
|
68
237
|
type: 'object',
|
|
69
238
|
properties: {},
|
|
@@ -82,66 +251,13 @@ async function main() {
|
|
|
82
251
|
const envDirs = process.env.SEMANTIC_SEARCH_INDEX_DIRS;
|
|
83
252
|
const directories = envDirs ? envDirs.split(',').map((d) => d.trim()).filter(Boolean) : [];
|
|
84
253
|
if (!directories.length) {
|
|
85
|
-
throw new Error(
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
console.error(`Scanning ${directories.length} directories (streaming)...`);
|
|
90
|
-
|
|
91
|
-
let indexedCount = 0;
|
|
92
|
-
const BATCH_SIZE = 512;
|
|
93
|
-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
94
|
-
let currentBatch: any[] = [];
|
|
95
|
-
|
|
96
|
-
const processBatch = async (batchToProcess: any[]) => {
|
|
97
|
-
if (batchToProcess.length === 0) return;
|
|
98
|
-
const contents = batchToProcess.map((l) => l.content);
|
|
99
|
-
const embeddings = await getBatchEmbeddings(contents);
|
|
100
|
-
|
|
101
|
-
const batchData = batchToProcess.map((line, idx) => ({
|
|
102
|
-
filePath: line.filePath,
|
|
103
|
-
lineNumber: line.lineNumber,
|
|
104
|
-
content: line.content,
|
|
105
|
-
embedding: embeddings[idx],
|
|
106
|
-
}));
|
|
107
|
-
|
|
108
|
-
await storage.upsertLinesBatch(batchData);
|
|
109
|
-
indexedCount += batchToProcess.length;
|
|
110
|
-
console.error(`Indexed ${indexedCount} lines...`);
|
|
111
|
-
};
|
|
112
|
-
|
|
113
|
-
// Pipelining: Read next batch while processing current batch
|
|
114
|
-
// We allow ONE batch to be processed in parallel with reading the next one.
|
|
115
|
-
let processingPromise: Promise<void> | null = null;
|
|
116
|
-
|
|
117
|
-
for await (const line of scanDirectories(directories)) {
|
|
118
|
-
currentBatch.push(line);
|
|
119
|
-
if (currentBatch.length >= BATCH_SIZE) {
|
|
120
|
-
// If there's a previous batch still processing, wait for it
|
|
121
|
-
if (processingPromise) {
|
|
122
|
-
await processingPromise;
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
const batchToProcess = currentBatch;
|
|
126
|
-
currentBatch = [];
|
|
127
|
-
|
|
128
|
-
// Start processing this batch, but don't await it yet!
|
|
129
|
-
// This allows the loop to continue and read the next batch from disk.
|
|
130
|
-
processingPromise = processBatch(batchToProcess).catch((err) => {
|
|
131
|
-
console.error('Error in background batch processing:', err);
|
|
132
|
-
});
|
|
133
|
-
}
|
|
134
|
-
}
|
|
135
|
-
|
|
136
|
-
// Wait for the last async batch
|
|
137
|
-
if (processingPromise) {
|
|
138
|
-
await processingPromise;
|
|
254
|
+
throw new Error(
|
|
255
|
+
'No directories to index. Set SEMANTIC_SEARCH_INDEX_DIRS (comma-separated paths).'
|
|
256
|
+
);
|
|
139
257
|
}
|
|
140
258
|
|
|
141
|
-
//
|
|
142
|
-
|
|
143
|
-
await processBatch(currentBatch);
|
|
144
|
-
}
|
|
259
|
+
// Abort any in-progress indexing and start a new job (clears and rebuilds).
|
|
260
|
+
requestIndexing(storage, directories);
|
|
145
261
|
|
|
146
262
|
const stats = await storage.getStats();
|
|
147
263
|
return {
|
|
@@ -150,9 +266,15 @@ async function main() {
|
|
|
150
266
|
type: 'text',
|
|
151
267
|
text: JSON.stringify({
|
|
152
268
|
success: true,
|
|
269
|
+
indexing: indexStatus.isIndexing,
|
|
153
270
|
indexed_lines: stats.totalLines,
|
|
154
271
|
indexed_files: stats.totalFiles,
|
|
155
|
-
|
|
272
|
+
started_at: indexStatus.startedAt,
|
|
273
|
+
finished_at: indexStatus.finishedAt,
|
|
274
|
+
last_error: indexStatus.lastError,
|
|
275
|
+
message: indexStatus.isIndexing
|
|
276
|
+
? `Indexing started in background. Currently ${stats.totalLines} lines from ${stats.totalFiles} files in index.`
|
|
277
|
+
: `Indexing completed. Indexed ${stats.totalLines} lines from ${stats.totalFiles} files.`,
|
|
156
278
|
}),
|
|
157
279
|
},
|
|
158
280
|
],
|
|
@@ -193,6 +315,13 @@ async function main() {
|
|
|
193
315
|
text: JSON.stringify({
|
|
194
316
|
total_files: stats.totalFiles,
|
|
195
317
|
total_lines: stats.totalLines,
|
|
318
|
+
is_indexing: indexStatus.isIndexing,
|
|
319
|
+
indexed_lines: indexStatus.indexedLines,
|
|
320
|
+
indexed_files: indexStatus.indexedFiles,
|
|
321
|
+
started_at: indexStatus.startedAt,
|
|
322
|
+
finished_at: indexStatus.finishedAt,
|
|
323
|
+
last_error: indexStatus.lastError,
|
|
324
|
+
directories: indexStatus.directories,
|
|
196
325
|
}),
|
|
197
326
|
},
|
|
198
327
|
],
|
|
@@ -214,6 +343,9 @@ async function main() {
|
|
|
214
343
|
const transport = new StdioServerTransport();
|
|
215
344
|
await server.connect(transport);
|
|
216
345
|
console.error('Semantic Search MCP Server running on stdio');
|
|
346
|
+
|
|
347
|
+
// Kick off initial background indexing when the MCP server is enabled.
|
|
348
|
+
ensureInitialIndexing(storage);
|
|
217
349
|
}
|
|
218
350
|
|
|
219
351
|
main().catch(console.error);
|
package/src/scanner.ts
CHANGED
|
@@ -10,7 +10,7 @@ export interface FileLine {
|
|
|
10
10
|
}
|
|
11
11
|
|
|
12
12
|
// File extensions to index
|
|
13
|
-
const TEXT_EXTENSIONS = ['.txt', '.md', '.tsv', '.csv'
|
|
13
|
+
const TEXT_EXTENSIONS = ['.txt', '.md', '.tsv', '.csv'];
|
|
14
14
|
|
|
15
15
|
// Minimum line length to index (skip very short lines)
|
|
16
16
|
const MIN_LINE_LENGTH = 5;
|