@lojban/semantic-search-mcp 1.0.6 → 1.0.7

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lojban/semantic-search-mcp",
3
- "version": "1.0.6",
3
+ "version": "1.0.7",
4
4
  "description": "Local-first MCP server for semantic search using transformers.js and SQLite",
5
5
  "type": "module",
6
6
  "scripts": {
package/src/index.ts CHANGED
@@ -7,7 +7,7 @@ import {
7
7
  } from '@modelcontextprotocol/sdk/types.js';
8
8
  import path from 'path';
9
9
  import { getEmbedding, getBatchEmbeddings } from './embeddings.js';
10
- import { createVectorStorage, type SearchResult } from './storage.js';
10
+ import { createVectorStorage, type SearchResult, type VectorStorage } from './storage.js';
11
11
  import { scanDirectories } from './scanner.js';
12
12
 
13
13
  // Data dir: use env, or project cwd so each workspace has its own index when run via npx from Cursor
@@ -16,6 +16,139 @@ const dataDir =
16
16
  path.join(process.cwd(), '.semantic-search', 'data');
17
17
  const DB_PATH = path.join(dataDir, 'vectors.db');
18
18
 
19
+ type IndexStatus = {
20
+ isIndexing: boolean;
21
+ startedAt: number | null;
22
+ finishedAt: number | null;
23
+ lastError: string | null;
24
+ indexedLines: number;
25
+ indexedFiles: number;
26
+ directories: string[];
27
+ };
28
+
29
+ const indexStatus: IndexStatus = {
30
+ isIndexing: false,
31
+ startedAt: null,
32
+ finishedAt: null,
33
+ lastError: null,
34
+ indexedLines: 0,
35
+ indexedFiles: 0,
36
+ directories: [],
37
+ };
38
+
39
+ async function startIndexing(storage: VectorStorage, directories: string[]): Promise<void> {
40
+ if (!directories.length) {
41
+ console.error('No directories to index. Set SEMANTIC_SEARCH_INDEX_DIRS (comma-separated paths).');
42
+ return;
43
+ }
44
+
45
+ if (indexStatus.isIndexing) {
46
+ console.error('Indexing already in progress, not starting a new job.');
47
+ return;
48
+ }
49
+
50
+ indexStatus.isIndexing = true;
51
+ indexStatus.startedAt = Date.now();
52
+ indexStatus.finishedAt = null;
53
+ indexStatus.lastError = null;
54
+ indexStatus.directories = directories;
55
+ indexStatus.indexedLines = 0;
56
+ indexStatus.indexedFiles = 0;
57
+
58
+ try {
59
+ storage.clear();
60
+
61
+ console.error(`Scanning ${directories.length} directories (background indexing)...`);
62
+
63
+ let indexedCount = 0;
64
+ const BATCH_SIZE = 512;
65
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
66
+ let currentBatch: any[] = [];
67
+
68
+ const processBatch = async (batchToProcess: any[]) => {
69
+ if (batchToProcess.length === 0) return;
70
+ const contents = batchToProcess.map((l) => l.content);
71
+ const embeddings = await getBatchEmbeddings(contents);
72
+
73
+ const batchData = batchToProcess.map((line, idx) => ({
74
+ filePath: line.filePath,
75
+ lineNumber: line.lineNumber,
76
+ content: line.content,
77
+ embedding: embeddings[idx],
78
+ }));
79
+
80
+ await storage.upsertLinesBatch(batchData);
81
+ indexedCount += batchToProcess.length;
82
+ indexStatus.indexedLines = indexedCount;
83
+ console.error(`Indexed ${indexedCount} lines...`);
84
+ };
85
+
86
+ // Pipelining: Read next batch while processing current batch
87
+ // We allow ONE batch to be processed in parallel with reading the next one.
88
+ let processingPromise: Promise<void> | null = null;
89
+
90
+ for await (const line of scanDirectories(directories)) {
91
+ currentBatch.push(line);
92
+ if (currentBatch.length >= BATCH_SIZE) {
93
+ // If there's a previous batch still processing, wait for it
94
+ if (processingPromise) {
95
+ await processingPromise;
96
+ }
97
+
98
+ const batchToProcess = currentBatch;
99
+ currentBatch = [];
100
+
101
+ // Start processing this batch, but don't await it yet!
102
+ // This allows the loop to continue and read the next batch from disk.
103
+ processingPromise = processBatch(batchToProcess).catch((err) => {
104
+ console.error('Error in background batch processing:', err);
105
+ });
106
+ }
107
+ }
108
+
109
+ // Wait for the last async batch
110
+ if (processingPromise) {
111
+ await processingPromise;
112
+ }
113
+
114
+ // Process any remaining lines
115
+ if (currentBatch.length > 0) {
116
+ await processBatch(currentBatch);
117
+ }
118
+
119
+ const stats = await storage.getStats();
120
+ indexStatus.indexedFiles = stats.totalFiles;
121
+ indexStatus.indexedLines = stats.totalLines;
122
+ indexStatus.finishedAt = Date.now();
123
+
124
+ console.error(
125
+ `Finished indexing ${stats.totalLines} lines from ${stats.totalFiles} files in background job.`
126
+ );
127
+ } catch (err) {
128
+ const message = err instanceof Error ? err.message : String(err);
129
+ indexStatus.lastError = message;
130
+ indexStatus.finishedAt = Date.now();
131
+ console.error('Error during indexing job:', err);
132
+ } finally {
133
+ indexStatus.isIndexing = false;
134
+ }
135
+ }
136
+
137
+ async function ensureInitialIndexing(storage: VectorStorage): Promise<void> {
138
+ const envDirs = process.env.SEMANTIC_SEARCH_INDEX_DIRS;
139
+ const directories = envDirs ? envDirs.split(',').map((d) => d.trim()).filter(Boolean) : [];
140
+
141
+ if (!directories.length) {
142
+ console.error(
143
+ 'Semantic Search MCP: SEMANTIC_SEARCH_INDEX_DIRS is not set; automatic indexing on startup is disabled.'
144
+ );
145
+ return;
146
+ }
147
+
148
+ // Fire-and-forget; indexing runs in background.
149
+ void startIndexing(storage, directories);
150
+ }
151
+
19
152
  async function main() {
20
153
  const storage = await createVectorStorage(DB_PATH);
21
154
 
@@ -36,7 +169,8 @@ async function main() {
36
169
  tools: [
37
170
  {
38
171
  name: 'index_directories',
39
- description: 'Scan directories from SEMANTIC_SEARCH_INDEX_DIRS (comma-separated) and index all text file lines for semantic search. Each line gets a vector embedding. Always clears the existing index first.',
172
+ description:
173
+ 'Trigger background indexing of directories from SEMANTIC_SEARCH_INDEX_DIRS (comma-separated). Clears and rebuilds the index asynchronously.',
40
174
  inputSchema: {
41
175
  type: 'object',
42
176
  properties: {},
@@ -44,7 +178,8 @@ async function main() {
44
178
  },
45
179
  {
46
180
  name: 'search',
47
- description: 'Search for lines semantically similar to the query. Returns the most relevant lines from indexed files.',
181
+ description:
182
+ 'Search for lines semantically similar to the query. Returns the most relevant lines from indexed files.',
48
183
  inputSchema: {
49
184
  type: 'object',
50
185
  properties: {
@@ -63,7 +198,7 @@ async function main() {
63
198
  },
64
199
  {
65
200
  name: 'get_index_stats',
66
- description: 'Get statistics about the current index (number of files and lines indexed)',
201
+ description: 'Get statistics and progress for the current index (files, lines, progress state)',
67
202
  inputSchema: {
68
203
  type: 'object',
69
204
  properties: {},
@@ -82,65 +217,14 @@ async function main() {
82
217
  const envDirs = process.env.SEMANTIC_SEARCH_INDEX_DIRS;
83
218
  const directories = envDirs ? envDirs.split(',').map((d) => d.trim()).filter(Boolean) : [];
84
219
  if (!directories.length) {
85
- throw new Error('No directories to index. Set SEMANTIC_SEARCH_INDEX_DIRS (comma-separated paths).');
220
+ throw new Error(
221
+ 'No directories to index. Set SEMANTIC_SEARCH_INDEX_DIRS (comma-separated paths).'
222
+ );
86
223
  }
87
- storage.clear();
88
-
89
- console.error(`Scanning ${directories.length} directories (streaming)...`);
90
-
91
- let indexedCount = 0;
92
- const BATCH_SIZE = 512;
93
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
94
- let currentBatch: any[] = [];
95
-
96
- const processBatch = async (batchToProcess: any[]) => {
97
- if (batchToProcess.length === 0) return;
98
- const contents = batchToProcess.map((l) => l.content);
99
- const embeddings = await getBatchEmbeddings(contents);
100
-
101
- const batchData = batchToProcess.map((line, idx) => ({
102
- filePath: line.filePath,
103
- lineNumber: line.lineNumber,
104
- content: line.content,
105
- embedding: embeddings[idx],
106
- }));
107
-
108
- await storage.upsertLinesBatch(batchData);
109
- indexedCount += batchToProcess.length;
110
- console.error(`Indexed ${indexedCount} lines...`);
111
- };
112
224
 
113
- // Pipelining: Read next batch while processing current batch
114
- // We allow ONE batch to be processed in parallel with reading the next one.
115
- let processingPromise: Promise<void> | null = null;
116
-
117
- for await (const line of scanDirectories(directories)) {
118
- currentBatch.push(line);
119
- if (currentBatch.length >= BATCH_SIZE) {
120
- // If there's a previous batch still processing, wait for it
121
- if (processingPromise) {
122
- await processingPromise;
123
- }
124
-
125
- const batchToProcess = currentBatch;
126
- currentBatch = [];
127
-
128
- // Start processing this batch, but don't await it yet!
129
- // This allows the loop to continue and read the next batch from disk.
130
- processingPromise = processBatch(batchToProcess).catch((err) => {
131
- console.error('Error in background batch processing:', err);
132
- });
133
- }
134
- }
135
-
136
- // Wait for the last async batch
137
- if (processingPromise) {
138
- await processingPromise;
139
- }
140
-
141
- // Process any remaining lines
142
- if (currentBatch.length > 0) {
143
- await processBatch(currentBatch);
225
+ // Trigger (or reuse) background indexing job.
226
+ if (!indexStatus.isIndexing) {
227
+ void startIndexing(storage, directories);
144
228
  }
145
229
 
146
230
  const stats = await storage.getStats();
@@ -150,9 +234,15 @@ async function main() {
150
234
  type: 'text',
151
235
  text: JSON.stringify({
152
236
  success: true,
237
+ indexing: indexStatus.isIndexing,
153
238
  indexed_lines: stats.totalLines,
154
239
  indexed_files: stats.totalFiles,
155
- message: `Successfully indexed ${stats.totalLines} lines from ${stats.totalFiles} files`,
240
+ started_at: indexStatus.startedAt,
241
+ finished_at: indexStatus.finishedAt,
242
+ last_error: indexStatus.lastError,
243
+ message: indexStatus.isIndexing
244
+ ? `Indexing started in background. Currently ${stats.totalLines} lines from ${stats.totalFiles} files in index.`
245
+ : `Indexing completed. Indexed ${stats.totalLines} lines from ${stats.totalFiles} files.`,
156
246
  }),
157
247
  },
158
248
  ],
@@ -193,6 +283,13 @@ async function main() {
193
283
  text: JSON.stringify({
194
284
  total_files: stats.totalFiles,
195
285
  total_lines: stats.totalLines,
286
+ is_indexing: indexStatus.isIndexing,
287
+ indexed_lines: indexStatus.indexedLines,
288
+ indexed_files: indexStatus.indexedFiles,
289
+ started_at: indexStatus.startedAt,
290
+ finished_at: indexStatus.finishedAt,
291
+ last_error: indexStatus.lastError,
292
+ directories: indexStatus.directories,
196
293
  }),
197
294
  },
198
295
  ],
@@ -214,6 +311,9 @@ async function main() {
214
311
  const transport = new StdioServerTransport();
215
312
  await server.connect(transport);
216
313
  console.error('Semantic Search MCP Server running on stdio');
314
+
315
+ // Kick off initial background indexing when the MCP server is enabled.
316
+ await ensureInitialIndexing(storage);
217
317
  }
218
318
 
219
319
  main().catch(console.error);
package/src/scanner.ts CHANGED
@@ -10,7 +10,7 @@ export interface FileLine {
10
10
  }
11
11
 
12
12
  // File extensions to index
13
- const TEXT_EXTENSIONS = ['.txt', '.md', '.tsv', '.csv', '.json', '.html', '.xml', '.ts'];
13
+ const TEXT_EXTENSIONS = ['.txt', '.md', '.tsv', '.csv'];
14
14
 
15
15
  // Minimum line length to index (skip very short lines)
16
16
  const MIN_LINE_LENGTH = 5;