gitmem-mcp 1.4.3 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,328 @@
1
+ /**
2
+ * Document Index — Storage and search for indexed doc chunks
3
+ *
4
+ * Supports two backends:
5
+ * - Free tier: Local JSON file with BM25 keyword search
6
+ * - Pro/dev tier: In-memory vector index with embeddings
7
+ *
8
+ * Follows the same patterns as local-vector-search.ts and local-file-storage.ts
9
+ */
10
+ import * as fs from "fs";
11
+ import * as path from "path";
12
+ import { v4 as uuidv4 } from "uuid";
13
+ import { getGitmemDir } from "./gitmem-dir.js";
14
+ import { bm25Search } from "./bm25.js";
15
+ import { embed as generateEmbedding, isEmbeddingAvailable } from "./embedding.js";
16
// --- Local File Index ---
/** File name of the on-disk docs index; getIndexPath joins it onto the gitmem directory. */
const INDEX_FILE = "docs-index.json";
/** Serialized size in bytes (20MB) above which writeLocalIndex logs a warning. */
const MAX_INDEX_SIZE = 20 * 1024 ** 2; // 20MB
19
/**
 * Build the path of the local docs index file by joining the
 * gitmem directory with INDEX_FILE.
 *
 * @returns {string} Path to docs-index.json.
 */
function getIndexPath() {
    const baseDir = getGitmemDir();
    return path.join(baseDir, INDEX_FILE);
}
25
/**
 * Read the local index from disk.
 *
 * Never throws for a missing, unreadable or malformed file: in every such
 * case an empty index is returned so callers can start fresh.
 *
 * @returns {Array<object>} The stored chunks, or [] when the file is absent,
 *   cannot be read/parsed, or does not contain a JSON array.
 */
function readLocalIndex() {
    const indexPath = getIndexPath();
    if (!fs.existsSync(indexPath)) {
        return [];
    }
    let parsed;
    try {
        parsed = JSON.parse(fs.readFileSync(indexPath, "utf-8"));
    }
    catch {
        console.error("[doc-index] Failed to read docs-index.json, starting fresh");
        return [];
    }
    // Valid JSON is not necessarily an array (e.g. `null` or `{}`); every caller
    // immediately uses array methods, so anything else is treated as corrupt.
    if (!Array.isArray(parsed)) {
        console.error("[doc-index] docs-index.json does not contain an array, starting fresh");
        return [];
    }
    return parsed;
}
41
/**
 * Persist the index to disk as pretty-printed JSON.
 *
 * Creates the parent directory when it does not exist, drops the
 * `embedding` field from every chunk before serializing, and logs a
 * warning (but still writes) when the serialized index is larger than
 * MAX_INDEX_SIZE.
 *
 * @param {Array<object>} chunks - Chunks to store.
 */
function writeLocalIndex(chunks) {
    const target = getIndexPath();
    const parentDir = path.dirname(target);
    if (!fs.existsSync(parentDir)) {
        fs.mkdirSync(parentDir, { recursive: true });
    }
    // Embeddings are left out of the local file to save space
    const withoutEmbeddings = chunks.map((chunk) => {
        const { embedding: _omitted, ...persisted } = chunk;
        return persisted;
    });
    const serialized = JSON.stringify(withoutEmbeddings, null, 2);
    const isOversized = Buffer.byteLength(serialized, "utf-8") > MAX_INDEX_SIZE;
    if (isOversized) {
        console.error("[doc-index] Warning: docs-index.json exceeds 20MB");
    }
    fs.writeFileSync(target, serialized, "utf-8");
}
58
/**
 * In-memory vector index: a list of `{ chunk, embedding }` entries.
 * Replaced wholesale by rebuildVectorIndex and emptied by clearDocIndex
 * when no project is given.
 */
let vectorIndex = [];
59
/**
 * Similarity of two vectors, computed as their dot product.
 * This equals cosine similarity only when both inputs are already
 * normalized; no normalization is performed here.
 *
 * @param {ArrayLike<number>} a - First vector.
 * @param {ArrayLike<number>} b - Second vector.
 * @returns {number} The dot product, or 0 when the lengths differ.
 */
function cosineSimilarity(a, b) {
    const size = a.length;
    if (size !== b.length) {
        return 0;
    }
    let total = 0;
    let idx = 0;
    while (idx < size) {
        total += a[idx] * b[idx];
        idx += 1;
    }
    return total;
}
71
+ // --- Public API ---
72
/**
 * Index doc chunks into storage.
 *
 * - Removes old chunks for the same project + file_path
 * - Generates embeddings if available (pro/dev tier)
 * - Stores to local JSON file
 * - Rebuilds the in-memory vector index, keeping the vectors already held
 *   in memory for files that are not part of this call
 *
 * @param {Array<object>} chunks - Chunks to index. file_path, chunk_index,
 *   title, section_title, category, content and file_hash are copied from each.
 * @param {string} project - Project the chunks are stored under.
 * @param {{ batchSize?: number }} [options] - `batchSize` sets the loop stride
 *   (and therefore the progress-log cadence); anything that is not a number
 *   >= 1 falls back to 10.
 * @returns {Promise<{ indexed: number, embedded: number, errors: number }>}
 *   Number of chunks stored, number that received an embedding, and number
 *   of embedding calls that threw.
 */
export async function indexChunks(chunks, project, options = {}) {
    // A zero, negative or non-numeric stride would stall or corrupt the loop
    // counter below, so normalize it up front.
    const requestedBatch = Number(options.batchSize);
    const batchSize = Number.isFinite(requestedBatch) && requestedBatch >= 1
        ? Math.floor(requestedBatch)
        : 10;
    const now = new Date().toISOString();
    let embedded = 0;
    let errors = 0;
    // Create new indexed chunks, processing in batches for embedding
    const newChunks = [];
    for (let i = 0; i < chunks.length; i += batchSize) {
        for (const chunk of chunks.slice(i, i + batchSize)) {
            const indexed = {
                id: uuidv4(),
                file_path: chunk.file_path,
                chunk_index: chunk.chunk_index,
                title: chunk.title,
                section_title: chunk.section_title,
                category: chunk.category,
                content: chunk.content,
                file_hash: chunk.file_hash,
                project,
                indexed_at: now,
            };
            // Generate embedding if available
            if (isEmbeddingAvailable()) {
                try {
                    // Embed title + section + content for richer representation
                    const textToEmbed = [indexed.title, indexed.section_title, indexed.content]
                        .filter(Boolean)
                        .join(" | ");
                    const embedding = await generateEmbedding(textToEmbed);
                    if (embedding) {
                        indexed.embedding = embedding;
                        embedded++;
                    }
                }
                catch (err) {
                    // A failed embedding is counted but does not stop indexing;
                    // the chunk is still stored for keyword search.
                    console.error(`[doc-index] Embedding failed for ${chunk.file_path}:${chunk.chunk_index}:`, err instanceof Error ? err.message : err);
                    errors++;
                }
            }
            newChunks.push(indexed);
        }
        // Progress logging for large batches (every ~100 chunks)
        const processed = i + batchSize;
        if (chunks.length > 50 && processed < chunks.length && processed % 100 < batchSize) {
            console.error(`[doc-index] Progress: ${Math.min(processed, chunks.length)}/${chunks.length} chunks`);
        }
    }
    // Build set of file paths being re-indexed
    const reindexedPaths = new Set(chunks.map((c) => `${project}:${c.file_path}`));
    // The file on disk never carries embeddings (writeLocalIndex strips them),
    // so re-attach the vectors currently held in memory. Without this, the
    // rebuild below would drop the vectors of every file not in this call.
    const embeddingsById = new Map(vectorIndex.map((entry) => [entry.chunk.id, entry.embedding]));
    // Read the stored index only now, after the awaited embedding work, so the
    // merge is based on the most recent file contents.
    const kept = readLocalIndex()
        .filter((c) => !reindexedPaths.has(`${c.project}:${c.file_path}`))
        .map((c) => (embeddingsById.has(c.id) ? { ...c, embedding: embeddingsById.get(c.id) } : c));
    // Merge and write
    const merged = [...kept, ...newChunks];
    writeLocalIndex(merged);
    // Update in-memory vector index
    rebuildVectorIndex(merged);
    return { indexed: newChunks.length, embedded, errors };
}
147
/**
 * Replace the in-memory vector index with one entry per chunk that has a
 * non-empty array in its `embedding` field, then log the resulting size.
 *
 * @param {Array<object>} chunks - Chunks to build the index from; those
 *   without a usable embedding are skipped.
 */
function rebuildVectorIndex(chunks) {
    const hasUsableEmbedding = (candidate) => Array.isArray(candidate.embedding) && candidate.embedding.length > 0;
    vectorIndex = chunks.flatMap((candidate) => (hasUsableEmbedding(candidate)
        ? [{ chunk: candidate, embedding: candidate.embedding }]
        : []));
    const entryCount = vectorIndex.length;
    console.error(`[doc-index] Vector index rebuilt: ${entryCount} entries with embeddings`);
}
159
/**
 * Search indexed docs using semantic similarity (pro/dev) or BM25 (free).
 *
 * Vector search is used only when embeddings are available AND the
 * in-memory vector index has entries; otherwise BM25 keyword search runs.
 *
 * @param {string} query - Search text.
 * @param {{ match_count?: number, project?: string, category?: string }} [options]
 *   Passed through unchanged to the selected backend.
 * @returns {Promise<Array<object>>} Results from the selected backend.
 */
export async function searchDocs(query, options = {}) {
    // Try vector search first (pro/dev tier with embeddings)
    if (isEmbeddingAvailable() && vectorIndex.length > 0) {
        return vectorSearchDocs(query, options);
    }
    // Fall back to BM25 keyword search
    return bm25SearchDocs(query, options);
}
171
/**
 * Vector-based semantic search over doc chunks.
 *
 * Falls back to BM25 keyword search when the query embedding cannot be
 * produced, whether the embedding call returns nothing or throws.
 *
 * @param {string} query - Search text to embed.
 * @param {{ match_count?: number, project?: string, category?: string }} options
 *   `match_count` caps the number of results (default 5); `project` and
 *   `category`, when set, restrict candidates by exact match.
 * @returns {Promise<Array<object>>} Best matches first, each carrying the chunk
 *   fields plus `similarity` rounded to three decimals.
 */
async function vectorSearchDocs(query, options) {
    const matchCount = options.match_count || 5;
    // Generate query embedding. indexChunks treats the embedding call as one
    // that may throw, so guard it here too instead of failing the search.
    let queryEmbedding = null;
    try {
        queryEmbedding = await generateEmbedding(query);
    }
    catch (err) {
        console.error("[doc-index] Query embedding threw:", err instanceof Error ? err.message : err);
    }
    if (!queryEmbedding) {
        console.error("[doc-index] Query embedding failed, falling back to BM25");
        return bm25SearchDocs(query, options);
    }
    // Filter candidates
    let candidates = vectorIndex;
    if (options.project) {
        candidates = candidates.filter((e) => e.chunk.project === options.project);
    }
    if (options.category) {
        candidates = candidates.filter((e) => e.chunk.category === options.category);
    }
    // Score by cosine similarity (map creates a fresh array, so the in-place
    // sort below never reorders the shared vector index)
    const scored = candidates.map((entry) => ({
        chunk: entry.chunk,
        similarity: cosineSimilarity(queryEmbedding, entry.embedding),
    }));
    // Sort and take top k
    scored.sort((a, b) => b.similarity - a.similarity);
    return scored.slice(0, matchCount).map(({ chunk, similarity }) => ({
        id: chunk.id,
        file_path: chunk.file_path,
        chunk_index: chunk.chunk_index,
        title: chunk.title,
        section_title: chunk.section_title,
        category: chunk.category,
        content: chunk.content,
        similarity: Math.round(similarity * 1000) / 1000,
        project: chunk.project,
    }));
}
210
/**
 * Keyword (BM25) search over the chunks stored in the local index file
 * (free tier).
 *
 * @param {string} query - Search text.
 * @param {{ match_count?: number, project?: string, category?: string }} options
 *   `match_count` is the result limit handed to bm25Search (default 5);
 *   `project` and `category`, when set, restrict candidates by exact match.
 * @returns {Array<object>} One result per hit whose id maps back to a
 *   candidate chunk, carrying the chunk fields plus the hit's `similarity`.
 */
function bm25SearchDocs(query, options) {
    const limit = options.match_count || 5;
    // Narrow the stored chunks by project and category when requested
    let candidates = readLocalIndex();
    if (options.project) {
        candidates = candidates.filter((chunk) => chunk.project === options.project);
    }
    if (options.category) {
        candidates = candidates.filter((chunk) => chunk.category === options.category);
    }
    if (candidates.length === 0) {
        return [];
    }
    // One BM25 document per chunk; title weighs most, body text least
    const toBm25Doc = (chunk) => ({
        id: chunk.id,
        fields: [
            { text: chunk.title, boost: 3 },
            { text: chunk.section_title || "", boost: 2 },
            { text: chunk.category, boost: 1.5 },
            { text: chunk.content, boost: 1 },
        ],
    });
    const hits = bm25Search(query, candidates.map(toBm25Doc), limit);
    // Resolve each hit back to its chunk; hits with an unknown id are dropped
    const chunkById = new Map(candidates.map((chunk) => [chunk.id, chunk]));
    return hits
        .filter((hit) => Boolean(chunkById.get(hit.id)))
        .map((hit) => {
            const chunk = chunkById.get(hit.id);
            return {
                id: chunk.id,
                file_path: chunk.file_path,
                chunk_index: chunk.chunk_index,
                title: chunk.title,
                section_title: chunk.section_title,
                category: chunk.category,
                content: chunk.content,
                similarity: hit.similarity,
                project: chunk.project,
            };
        });
}
258
/**
 * Get index statistics, read from the local index file.
 *
 * @param {string} [project] - When given, only chunks of this project are counted.
 * @returns {{ total_chunks: number, total_files: number, files_indexed: string[],
 *   categories: Object<string, number>, project: string, has_embeddings: boolean }}
 *   `files_indexed` is sorted; `project` is "all" when no project was given;
 *   `has_embeddings` reflects whether the in-memory vector index has any
 *   entry (it is not filtered by project).
 */
export function getIndexStats(project) {
    const chunks = readLocalIndex();
    const filtered = project
        ? chunks.filter((c) => c.project === project)
        : chunks;
    const files = new Set(filtered.map((c) => c.file_path));
    // Count in a Map so that category names such as "constructor" or
    // "__proto__" cannot collide with properties inherited from Object.prototype.
    // Keys are stringified to match how they end up as object property names.
    const counts = new Map();
    for (const c of filtered) {
        const key = String(c.category);
        counts.set(key, (counts.get(key) ?? 0) + 1);
    }
    return {
        total_chunks: filtered.length,
        total_files: files.size,
        files_indexed: Array.from(files).sort(),
        categories: Object.fromEntries(counts),
        project: project || "all",
        has_embeddings: vectorIndex.length > 0,
    };
}
280
/**
 * Classify files against the stored index by comparing content hashes.
 *
 * @param {Iterable<[string, string]>} fileHashes - `[filePath, hash]` pairs
 *   for the current files (for example a Map).
 * @param {string} project - Only stored chunks of this project are compared.
 * @returns {{ changed: string[], unchanged: string[], new_files: string[] }}
 *   `new_files` holds paths with no stored hash, `changed` those whose stored
 *   hash differs, `unchanged` those whose stored hash is identical.
 */
export function getChangedFiles(fileHashes, project) {
    // Stored hash per file path; when a file has several chunks the last one wins
    const storedHashByPath = new Map();
    readLocalIndex()
        .filter((chunk) => chunk.project === project)
        .forEach((chunk) => {
            storedHashByPath.set(chunk.file_path, chunk.file_hash);
        });
    const result = { changed: [], unchanged: [], new_files: [] };
    for (const [filePath, hash] of fileHashes) {
        const storedHash = storedHashByPath.get(filePath);
        if (!storedHash) {
            result.new_files.push(filePath);
            continue;
        }
        const bucket = storedHash === hash ? result.unchanged : result.changed;
        bucket.push(filePath);
    }
    return result;
}
306
/**
 * Populate the in-memory vector index from the local index file
 * (intended for startup).
 *
 * NOTE(review): writeLocalIndex removes the `embedding` field before
 * persisting, so chunks written by this module carry no vectors when read
 * back and the rebuilt index would then be empty. Confirm that is intended.
 */
export function initDocVectorIndex() {
    rebuildVectorIndex(readLocalIndex());
}
313
/**
 * Clear the doc index for a project (or all).
 *
 * @param {string} [project] - Project whose chunks are removed; when omitted
 *   the whole index (file and in-memory vectors) is emptied.
 * @returns {number} Number of chunks removed from the local index file.
 */
export function clearDocIndex(project) {
    const existing = readLocalIndex();
    if (!project) {
        writeLocalIndex([]);
        vectorIndex = [];
        return existing.length;
    }
    const kept = existing.filter((c) => c.project !== project);
    writeLocalIndex(kept);
    // Chunks read from disk have no embeddings (writeLocalIndex strips them),
    // so rebuilding from them directly would wipe the vectors of every other
    // project. Re-attach the vectors currently held in memory first.
    const embeddingsById = new Map(vectorIndex.map((entry) => [entry.chunk.id, entry.embedding]));
    const keptWithEmbeddings = kept.map((c) => (embeddingsById.has(c.id)
        ? { ...c, embedding: embeddingsById.get(c.id) }
        : c));
    rebuildVectorIndex(keptWithEmbeddings);
    return existing.length - kept.length;
}
328
+ //# sourceMappingURL=doc-index.js.map
@@ -37,7 +37,6 @@ const SESSION_REQUIRED_TOOLS = new Set([
37
37
  const CONSEQUENTIAL_TOOLS = new Set([
38
38
  "create_learning", "gitmem-cl", "gm-scar",
39
39
  "create_decision", "gitmem-cd",
40
- "create_thread", "gitmem-ct", "gm-thread-new",
41
40
  "session_close", "gitmem-sc", "gm-close",
42
41
  ]);
43
42
  /**