gitmem-mcp 1.4.4 → 1.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/dist/hooks/format-utils.js +4 -0
- package/dist/server.js +13 -0
- package/dist/services/doc-chunker.d.ts +45 -0
- package/dist/services/doc-chunker.js +208 -0
- package/dist/services/doc-index.d.ts +88 -0
- package/dist/services/doc-index.js +328 -0
- package/dist/tools/definitions.d.ts +688 -0
- package/dist/tools/definitions.js +87 -0
- package/dist/tools/index-docs.d.ts +30 -0
- package/dist/tools/index-docs.js +163 -0
- package/dist/tools/prepare-context.js +7 -0
- package/dist/tools/recall.js +10 -1
- package/dist/tools/search-docs.d.ts +38 -0
- package/dist/tools/search-docs.js +94 -0
- package/dist/tools/search.js +11 -1
- package/dist/tools/session-close.js +45 -2
- package/dist/tools/session-start.js +14 -0
- package/package.json +1 -1
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Index — Storage and search for indexed doc chunks
|
|
3
|
+
*
|
|
4
|
+
* Supports two backends:
|
|
5
|
+
* - Free tier: Local JSON file with BM25 keyword search
|
|
6
|
+
* - Pro/dev tier: In-memory vector index with embeddings
|
|
7
|
+
*
|
|
8
|
+
* Follows the same patterns as local-vector-search.ts and local-file-storage.ts
|
|
9
|
+
*/
|
|
10
|
+
import * as fs from "fs";
|
|
11
|
+
import * as path from "path";
|
|
12
|
+
import { v4 as uuidv4 } from "uuid";
|
|
13
|
+
import { getGitmemDir } from "./gitmem-dir.js";
|
|
14
|
+
import { bm25Search } from "./bm25.js";
|
|
15
|
+
import { embed as generateEmbedding, isEmbeddingAvailable } from "./embedding.js";
|
|
16
|
+
// --- Local File Index ---
|
|
17
|
+
// File name of the on-disk docs index; getIndexPath() joins it onto the gitmem directory.
const INDEX_FILE = "docs-index.json";
|
|
18
|
+
// Size threshold (in bytes) for the serialized index. Exceeding it only logs a warning in writeLocalIndex().
const MAX_INDEX_SIZE = 20 * 1024 * 1024; // 20MB
|
|
19
|
+
/**
 * Build the path of the local docs index file.
 *
 * @returns {string} INDEX_FILE joined onto the directory returned by getGitmemDir().
 */
function getIndexPath() {
    const gitmemDir = getGitmemDir();
    return path.join(gitmemDir, INDEX_FILE);
}
|
|
25
|
+
/**
 * Read the local index from disk.
 *
 * Read and parse failures are caught here rather than propagated: a missing
 * file, an unreadable file, invalid JSON, or JSON whose top-level value is not
 * an array all yield an empty index. Callers immediately use array methods on
 * the result, so anything other than an array of chunk objects is rejected here.
 *
 * @returns {Array<object>} Stored chunk records, or [] when nothing usable is on disk.
 */
function readLocalIndex() {
    const indexPath = getIndexPath();
    if (!fs.existsSync(indexPath)) {
        return [];
    }
    try {
        const parsed = JSON.parse(fs.readFileSync(indexPath, "utf-8"));
        if (!Array.isArray(parsed)) {
            // Valid JSON but the wrong shape (e.g. {} or null) would crash every caller.
            console.error("[doc-index] docs-index.json is not an array, starting fresh");
            return [];
        }
        // Drop entries that cannot be chunk records so property reads on them are safe.
        return parsed.filter((entry) => entry !== null && typeof entry === "object");
    }
    catch {
        console.error("[doc-index] Failed to read docs-index.json, starting fresh");
        return [];
    }
}
|
|
41
|
+
/**
 * Persist the given chunks to the local index file as pretty-printed JSON.
 *
 * The parent directory is created when missing. The `embedding` property of
 * each chunk is omitted from the written file to save space. When the
 * serialized payload is larger than MAX_INDEX_SIZE a warning is logged, but
 * the file is still written.
 *
 * @param {Array<object>} chunks - Chunk records to store.
 * @returns {void}
 */
function writeLocalIndex(chunks) {
    const target = getIndexPath();
    const parentDir = path.dirname(target);
    if (!fs.existsSync(parentDir)) {
        fs.mkdirSync(parentDir, { recursive: true });
    }
    // Strip embeddings from local file to save space
    const persisted = chunks.map((chunk) => {
        const { embedding: _omitted, ...withoutEmbedding } = chunk;
        return withoutEmbedding;
    });
    const serialized = JSON.stringify(persisted, null, 2);
    const byteSize = Buffer.byteLength(serialized, "utf-8");
    if (byteSize > MAX_INDEX_SIZE) {
        console.error("[doc-index] Warning: docs-index.json exceeds 20MB");
    }
    fs.writeFileSync(target, serialized, "utf-8");
}
|
|
58
|
+
// In-memory vector index: { chunk, embedding } entries for chunks that carry a non-empty embedding array.
let vectorIndex = [];
|
|
59
|
+
/**
 * Similarity score of two vectors, computed as their dot product.
 *
 * The result equals the cosine similarity only when both inputs are already
 * unit-length; no normalization is performed here.
 *
 * @param {ArrayLike<number>} a - First vector.
 * @param {ArrayLike<number>} b - Second vector.
 * @returns {number} Sum of element-wise products, or 0 when the lengths differ.
 */
function cosineSimilarity(a, b) {
    const size = a.length;
    if (size !== b.length) {
        return 0;
    }
    let total = 0;
    let idx = 0;
    // Accumulate in ascending index order so floating-point results stay stable.
    while (idx < size) {
        total += a[idx] * b[idx];
        idx += 1;
    }
    return total;
}
|
|
71
|
+
// --- Public API ---
|
|
72
|
+
/**
 * Index doc chunks into storage.
 *
 * - Removes stored chunks that belong to the same project + file_path as any
 *   incoming chunk
 * - Generates an embedding per chunk when isEmbeddingAvailable() reports true
 * - Writes the merged index to the local JSON file
 * - Rebuilds the in-memory vector index from the merged chunks
 *
 * Embeddings are not present in the on-disk index (writeLocalIndex strips
 * them), so embeddings still held in the in-memory vector index are
 * re-attached to the chunks that are kept. Without that step, re-indexing one
 * file would drop every other file from the vector index.
 *
 * @param {Array<object>} chunks - Chunks to index; each is read for file_path,
 *   chunk_index, title, section_title, category, content and file_hash.
 * @param {string} project - Project identifier stored on every new chunk.
 * @param {{ batchSize?: number }} [options] - batchSize controls how chunks are
 *   grouped for progress logging; values that are not positive integers fall
 *   back to 10.
 * @returns {Promise<{ indexed: number, embedded: number, errors: number }>}
 *   Count of chunks indexed, of embeddings generated, and of embedding failures.
 */
export async function indexChunks(chunks, project, options = {}) {
    // A negative step would never terminate the batch loop, so only accept positive integers.
    const batchSize = Number.isInteger(options.batchSize) && options.batchSize > 0
        ? options.batchSize
        : 10;
    const now = new Date().toISOString();
    let embedded = 0;
    let errors = 0;
    // Read existing index
    const existing = readLocalIndex();
    // All incoming chunks are stored under `project`, so matching on the pair
    // (project, file_path) needs only the set of paths. This avoids ambiguous
    // string keys when either value contains the separator character.
    const reindexedPaths = new Set(chunks.map((c) => c.file_path));
    // Remove old chunks for files being re-indexed
    const kept = existing.filter((c) => !(c.project === project && reindexedPaths.has(c.file_path)));
    // Re-attach embeddings that exist only in memory so the rebuild below keeps them.
    const knownEmbeddings = new Map(vectorIndex.map((entry) => [entry.chunk.id, entry.embedding]));
    const keptWithEmbeddings = kept.map((c) => {
        const embedding = knownEmbeddings.get(c.id);
        return embedding ? { ...c, embedding } : c;
    });
    // Create new indexed chunks
    const newChunks = [];
    // Process in batches for embedding
    for (let i = 0; i < chunks.length; i += batchSize) {
        const batch = chunks.slice(i, i + batchSize);
        for (const chunk of batch) {
            const indexed = {
                id: uuidv4(),
                file_path: chunk.file_path,
                chunk_index: chunk.chunk_index,
                title: chunk.title,
                section_title: chunk.section_title,
                category: chunk.category,
                content: chunk.content,
                file_hash: chunk.file_hash,
                project,
                indexed_at: now,
            };
            // Generate embedding if available
            if (isEmbeddingAvailable()) {
                try {
                    // Embed title + section + content for richer representation
                    const textToEmbed = [
                        indexed.title,
                        indexed.section_title,
                        indexed.content,
                    ]
                        .filter(Boolean)
                        .join(" | ");
                    const embedding = await generateEmbedding(textToEmbed);
                    if (embedding) {
                        indexed.embedding = embedding;
                        embedded++;
                    }
                }
                catch (err) {
                    // A failed embedding is counted but does not stop indexing; the
                    // chunk is still stored and remains searchable by keyword.
                    console.error(`[doc-index] Embedding failed for ${chunk.file_path}:${chunk.chunk_index}:`, err instanceof Error ? err.message : err);
                    errors++;
                }
            }
            newChunks.push(indexed);
        }
        // Progress logging for large batches (every ~100 chunks)
        if (chunks.length > 50 && i + batchSize < chunks.length && (i + batchSize) % 100 < batchSize) {
            console.error(`[doc-index] Progress: ${Math.min(i + batchSize, chunks.length)}/${chunks.length} chunks`);
        }
    }
    // Merge and write
    const merged = [...keptWithEmbeddings, ...newChunks];
    writeLocalIndex(merged);
    // Update in-memory vector index
    rebuildVectorIndex(merged);
    return { indexed: newChunks.length, embedded, errors };
}
|
|
147
|
+
/**
 * Replace the in-memory vector index with entries derived from `chunks`.
 *
 * Only chunks whose `embedding` is a non-empty array are included; each entry
 * pairs the chunk with its embedding. The resulting entry count is logged.
 *
 * @param {Array<object>} chunks - Chunk records, with or without embeddings.
 * @returns {void}
 */
function rebuildVectorIndex(chunks) {
    const entries = [];
    for (const chunk of chunks) {
        const { embedding } = chunk;
        const usable = Array.isArray(embedding) && embedding.length > 0;
        if (usable) {
            entries.push({ chunk, embedding });
        }
    }
    vectorIndex = entries;
    console.error(`[doc-index] Vector index rebuilt: ${vectorIndex.length} entries with embeddings`);
}
|
|
159
|
+
/**
 * Search indexed docs.
 *
 * Uses vector search when embeddings are available and the in-memory vector
 * index has at least one entry; otherwise uses BM25 keyword search.
 *
 * @param {string} query - Search text.
 * @param {{ match_count?: number, project?: string, category?: string }} [options]
 *   Passed through unchanged to the selected search backend.
 * @returns {Promise<Array<object>>} Matching chunks with a `similarity` score.
 */
export async function searchDocs(query, options = {}) {
    // Try vector search first (pro/dev tier with embeddings)
    const canUseVectors = isEmbeddingAvailable() && vectorIndex.length > 0;
    if (canUseVectors) {
        return vectorSearchDocs(query, options);
    }
    // Fall back to BM25 keyword search
    return bm25SearchDocs(query, options);
}
|
|
171
|
+
/**
 * Vector-based search over the in-memory doc chunk index.
 *
 * Candidates are narrowed by `options.project` and `options.category` when
 * given, scored against the query embedding with cosineSimilarity(), sorted
 * by descending score, and truncated to `options.match_count` (default 5).
 *
 * Falls back to BM25 keyword search in two cases:
 * - no vector entries match the filters (the on-disk index may still contain
 *   matching chunks that have no in-memory embedding), or
 * - the query embedding could not be generated.
 *
 * @param {string} query - Search text.
 * @param {{ match_count?: number, project?: string, category?: string }} options
 * @returns {Promise<Array<object>>} Results with `similarity` rounded to three decimals.
 */
async function vectorSearchDocs(query, options) {
    const matchCount = options.match_count || 5;
    // Filter candidates first so no embedding call is spent when nothing can match.
    let candidates = vectorIndex;
    if (options.project) {
        candidates = candidates.filter((e) => e.chunk.project === options.project);
    }
    if (options.category) {
        candidates = candidates.filter((e) => e.chunk.category === options.category);
    }
    if (candidates.length === 0) {
        // The vector index has nothing for these filters; keyword search reads
        // the full on-disk index and may still find matches.
        return bm25SearchDocs(query, options);
    }
    // Generate query embedding
    const queryEmbedding = await generateEmbedding(query);
    if (!queryEmbedding) {
        console.error("[doc-index] Query embedding failed, falling back to BM25");
        return bm25SearchDocs(query, options);
    }
    // Score by cosine similarity
    const scored = candidates.map((entry) => ({
        chunk: entry.chunk,
        similarity: cosineSimilarity(queryEmbedding, entry.embedding),
    }));
    // Sort and take top k
    scored.sort((a, b) => b.similarity - a.similarity);
    const topK = scored.slice(0, matchCount);
    return topK.map(({ chunk, similarity }) => ({
        id: chunk.id,
        file_path: chunk.file_path,
        chunk_index: chunk.chunk_index,
        title: chunk.title,
        section_title: chunk.section_title,
        category: chunk.category,
        content: chunk.content,
        similarity: Math.round(similarity * 1000) / 1000,
        project: chunk.project,
    }));
}
|
|
210
|
+
/**
 * BM25 keyword search over the chunks stored in the local index file.
 *
 * Chunks are narrowed by `options.project` and `options.category` when given,
 * then handed to bm25Search() as documents with boosted fields
 * (title 3, section_title 2, category 1.5, content 1). Hits are mapped back
 * to the stored chunk by id; hits with no matching chunk are dropped.
 *
 * @param {string} query - Search text.
 * @param {{ match_count?: number, project?: string, category?: string }} options
 *   match_count defaults to 5.
 * @returns {Array<object>} Results carrying the `similarity` reported by bm25Search().
 */
function bm25SearchDocs(query, options) {
    const limit = options.match_count || 5;
    const stored = readLocalIndex();
    // Filter by project and category
    const candidates = stored.filter((c) => {
        const projectOk = !options.project || c.project === options.project;
        const categoryOk = !options.category || c.category === options.category;
        return projectOk && categoryOk;
    });
    if (candidates.length === 0) {
        return [];
    }
    // Build BM25 documents with field boosting
    const documents = candidates.map((c) => {
        const fields = [
            { text: c.title, boost: 3 },
            { text: c.section_title || "", boost: 2 },
            { text: c.category, boost: 1.5 },
            { text: c.content, boost: 1 },
        ];
        return { id: c.id, fields };
    });
    const hits = bm25Search(query, documents, limit);
    // Map back to DocSearchResult
    const chunkById = new Map(candidates.map((c) => [c.id, c]));
    const output = [];
    for (const hit of hits) {
        const match = chunkById.get(hit.id);
        if (!match) {
            continue;
        }
        output.push({
            id: match.id,
            file_path: match.file_path,
            chunk_index: match.chunk_index,
            title: match.title,
            section_title: match.section_title,
            category: match.category,
            content: match.content,
            similarity: hit.similarity,
            project: match.project,
        });
    }
    return output;
}
|
|
258
|
+
/**
 * Get index statistics, optionally limited to one project.
 *
 * Category counts are accumulated in a Map and converted to a plain object at
 * the end. Counting directly on a plain object would read inherited
 * properties for category names such as "constructor" or "toString" and
 * produce corrupted counts.
 *
 * @param {string} [project] - When given, only chunks of this project are counted.
 * @returns {{
 *   total_chunks: number,
 *   total_files: number,
 *   files_indexed: string[],
 *   categories: Object<string, number>,
 *   project: string,
 *   has_embeddings: boolean
 * }} Statistics; `project` is "all" when no project was given, and
 *   `has_embeddings` reflects the whole in-memory vector index (it is not
 *   filtered by project).
 */
export function getIndexStats(project) {
    const chunks = readLocalIndex();
    const filtered = project
        ? chunks.filter((c) => c.project === project)
        : chunks;
    const files = new Set(filtered.map((c) => c.file_path));
    const categoryCounts = new Map();
    for (const c of filtered) {
        // String() mirrors the key coercion a plain object would apply.
        const key = String(c.category);
        categoryCounts.set(key, (categoryCounts.get(key) || 0) + 1);
    }
    return {
        total_chunks: filtered.length,
        total_files: files.size,
        files_indexed: Array.from(files).sort(),
        categories: Object.fromEntries(categoryCounts),
        project: project || "all",
        has_embeddings: vectorIndex.length > 0,
    };
}
|
|
280
|
+
/**
 * Classify files against the stored index by comparing content hashes.
 *
 * A file is "new" when the project's stored chunks hold no (truthy) hash for
 * its path, "changed" when the stored hash differs from the supplied one, and
 * "unchanged" when they are equal.
 *
 * @param {Iterable<[string, string]>} fileHashes - Pairs of [file path, hash].
 * @param {string} project - Only stored chunks of this project are considered.
 * @returns {{ changed: string[], unchanged: string[], new_files: string[] }}
 */
export function getChangedFiles(fileHashes, project) {
    const storedHashByPath = new Map();
    for (const chunk of readLocalIndex()) {
        if (chunk.project === project) {
            storedHashByPath.set(chunk.file_path, chunk.file_hash);
        }
    }
    const result = { changed: [], unchanged: [], new_files: [] };
    for (const [filePath, hash] of fileHashes) {
        const storedHash = storedHashByPath.get(filePath);
        if (!storedHash) {
            result.new_files.push(filePath);
            continue;
        }
        const bucket = storedHash === hash ? result.unchanged : result.changed;
        bucket.push(filePath);
    }
    return result;
}
|
|
306
|
+
/**
 * Load the local index file and rebuild the in-memory vector index from it.
 *
 * NOTE(review): writeLocalIndex() omits `embedding` from every chunk it
 * saves, so a file written by this module appears to yield an empty vector
 * index here — confirm whether embeddings are expected to be restored some
 * other way.
 *
 * @returns {void}
 */
export function initDocVectorIndex() {
    rebuildVectorIndex(readLocalIndex());
}
|
|
313
|
+
/**
 * Clear the doc index for a project (or all projects).
 *
 * Without a project, the on-disk index is overwritten with an empty list and
 * the in-memory vector index is emptied. With a project, only that project's
 * chunks are removed from both places.
 *
 * The in-memory index is filtered directly rather than rebuilt from the
 * on-disk chunks: the stored chunks carry no embeddings (writeLocalIndex
 * strips them), so rebuilding from disk would also discard the embeddings of
 * every other project.
 *
 * @param {string} [project] - Project to clear; omit to clear everything.
 * @returns {number} Number of stored chunks removed.
 */
export function clearDocIndex(project) {
    const existing = readLocalIndex();
    if (!project) {
        writeLocalIndex([]);
        vectorIndex = [];
        return existing.length;
    }
    const kept = existing.filter((c) => c.project !== project);
    writeLocalIndex(kept);
    vectorIndex = vectorIndex.filter((entry) => entry.chunk.project !== project);
    console.error(`[doc-index] Vector index rebuilt: ${vectorIndex.length} entries with embeddings`);
    return existing.length - kept.length;
}
|
|
328
|
+
//# sourceMappingURL=doc-index.js.map
|