@mrxkun/mcfast-mcp 4.0.14 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/memory/bootstrap/agents-md.js +173 -0
- package/src/memory/index.js +26 -13
- package/src/memory/layers/curated-memory.js +324 -0
- package/src/memory/layers/daily-logs.js +236 -0
- package/src/memory/memory-engine.js +472 -452
- package/src/memory/stores/codebase-database.js +418 -0
- package/src/memory/stores/memory-database.js +425 -0
- package/src/memory/utils/markdown-chunker.js +242 -0
- package/src/memory/watchers/file-watcher.js +286 -20
- package/src/tools/memory_get.js +139 -100
- package/src/tools/memory_search.js +118 -86
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
/**
 * Memory Database
 * Index store for memory notes (Markdown files are the source of truth).
 * Uses FTS5 for full-text search + stored vectors for semantic search.
 */
|
|
6
|
+
|
|
7
|
+
import crypto from 'crypto';
import fs from 'fs/promises';
import path from 'path';

import Database from 'better-sqlite3';
|
|
10
|
+
|
|
11
|
+
/**
 * Memory Database
 *
 * SQLite index for memory notes (the Markdown files remain the source of
 * truth). Provides:
 *  - FTS5 full-text search over chunk content
 *  - stored embedding vectors for semantic search
 *  - an embedding cache keyed by content hash (avoids re-embedding)
 *  - per-file bookkeeping of what has been indexed
 *  - search-history logging for analytics
 */
export class MemoryDatabase {
  /**
   * @param {string|null} dbPath - Filesystem path of the SQLite database file.
   */
  constructor(dbPath = null) {
    this.dbPath = dbPath;
    this.db = null;
    this.isInitialized = false;
  }

  /**
   * Open the database, creating the parent directory and schema if needed.
   * Idempotent: repeated calls are no-ops.
   */
  async initialize() {
    if (this.isInitialized) return;

    // Create directory if not exists
    await fs.mkdir(path.dirname(this.dbPath), { recursive: true });

    this.db = new Database(this.dbPath);
    this.db.pragma('journal_mode = WAL');
    // BUG FIX: SQLite ignores ON DELETE CASCADE (embeddings -> chunks)
    // unless foreign-key enforcement is enabled on the connection.
    this.db.pragma('foreign_keys = ON');

    this.createTables();
    this.isInitialized = true;

    console.log(`[MemoryDatabase] Initialized at: ${this.dbPath}`);
  }

  /** Create all tables, indexes and triggers (all IF NOT EXISTS; idempotent). */
  createTables() {
    // Chunks table - stores content from Markdown files
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS chunks (
        id TEXT PRIMARY KEY,
        file_path TEXT NOT NULL,
        start_line INTEGER,
        end_line INTEGER,
        content TEXT NOT NULL,
        content_hash TEXT NOT NULL,
        chunk_type TEXT DEFAULT 'content',
        created_at INTEGER,
        updated_at INTEGER
      );

      CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file_path);
      CREATE INDEX IF NOT EXISTS idx_chunks_hash ON chunks(content_hash);
      CREATE INDEX IF NOT EXISTS idx_chunks_updated ON chunks(updated_at);
    `);

    // FTS5 virtual table for full-text search.
    // BUG FIX: the original declared content_rowid=id, but FTS5 rowids must
    // be integers while chunk ids are TEXT hashes, so every sync-trigger
    // insert failed with a datatype mismatch. Store the chunk id as an
    // UNINDEXED column of a plain FTS5 table instead (UNINDEXED keeps the id
    // out of the MATCH index).
    this.db.exec(`
      CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
        id UNINDEXED,
        content,
        tokenize='porter'
      );
    `);

    // Triggers to keep FTS5 in sync with the chunks table.
    // NOTE: the 'delete' command form used before is only valid for
    // external-content FTS5 tables; a plain table takes ordinary DELETEs.
    this.db.exec(`
      CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN
        INSERT INTO chunks_fts(id, content) VALUES (new.id, new.content);
      END;

      CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN
        DELETE FROM chunks_fts WHERE id = old.id;
      END;

      CREATE TRIGGER IF NOT EXISTS chunks_au AFTER UPDATE ON chunks BEGIN
        DELETE FROM chunks_fts WHERE id = old.id;
        INSERT INTO chunks_fts(id, content) VALUES (new.id, new.content);
      END;
    `);

    // Vectors table - stores embeddings
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS embeddings (
        chunk_id TEXT PRIMARY KEY,
        embedding BLOB NOT NULL,
        model TEXT,
        dimensions INTEGER,
        created_at INTEGER,
        FOREIGN KEY (chunk_id) REFERENCES chunks(id) ON DELETE CASCADE
      );

      CREATE INDEX IF NOT EXISTS idx_embeddings_model ON embeddings(model);
    `);

    // Embedding cache - avoid re-embedding unchanged content
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS embedding_cache (
        content_hash TEXT PRIMARY KEY,
        embedding BLOB NOT NULL,
        model TEXT,
        dimensions INTEGER,
        created_at INTEGER
      );
    `);

    // File metadata - track indexed files
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS files (
        path TEXT PRIMARY KEY,
        content_hash TEXT NOT NULL,
        last_modified INTEGER,
        file_size INTEGER,
        indexed_at INTEGER
      );

      CREATE INDEX IF NOT EXISTS idx_files_hash ON files(content_hash);
    `);

    // Search history - for analytics and optimization
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS search_history (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        query TEXT NOT NULL,
        query_hash TEXT,
        method TEXT,
        results_count INTEGER,
        duration_ms INTEGER,
        timestamp INTEGER
      );

      CREATE INDEX IF NOT EXISTS idx_search_timestamp ON search_history(timestamp);
    `);
  }

  // ========== Chunk Operations ==========

  /**
   * Insert or replace one chunk row.
   * @param {Object} chunk - Object with snake_case keys matching the chunks
   *   columns (id, file_path, start_line, end_line, content, content_hash,
   *   chunk_type, optionally created_at).
   */
  insertChunk(chunk) {
    const stmt = this.db.prepare(`
      INSERT OR REPLACE INTO chunks
      (id, file_path, start_line, end_line, content, content_hash, chunk_type, created_at, updated_at)
      VALUES ($id, $file_path, $start_line, $end_line, $content, $content_hash, $chunk_type, $created_at, $updated_at)
    `);

    const now = Date.now();
    return stmt.run({
      ...chunk,
      created_at: chunk.created_at || now,
      updated_at: now
    });
  }

  /** Delete every chunk belonging to one source file. */
  deleteChunksByFile(filePath) {
    const stmt = this.db.prepare('DELETE FROM chunks WHERE file_path = ?');
    return stmt.run(filePath);
  }

  /** @returns {Object|undefined} The chunk row, or undefined if absent. */
  getChunkById(chunkId) {
    return this.db.prepare('SELECT * FROM chunks WHERE id = ?').get(chunkId);
  }

  /** Chunks of one file, ordered by start_line. */
  getChunksByFile(filePath, limit = 100) {
    return this.db.prepare('SELECT * FROM chunks WHERE file_path = ? ORDER BY start_line LIMIT ?')
      .all(filePath, limit);
  }

  /** Most recently updated chunks, newest first. */
  getRecentChunks(limit = 100) {
    return this.db.prepare('SELECT * FROM chunks ORDER BY updated_at DESC LIMIT ?').all(limit);
  }

  // ========== Embedding Operations ==========

  /**
   * Insert or replace an embedding row.
   * @param {Object} embedding - { chunk_id, embedding (BLOB), model,
   *   dimensions, optionally created_at }.
   */
  insertEmbedding(embedding) {
    const stmt = this.db.prepare(`
      INSERT OR REPLACE INTO embeddings (chunk_id, embedding, model, dimensions, created_at)
      VALUES ($chunk_id, $embedding, $model, $dimensions, $created_at)
    `);
    return stmt.run({
      ...embedding,
      created_at: embedding.created_at || Date.now()
    });
  }

  /** @returns {Object|undefined} Embedding row for one chunk. */
  getEmbedding(chunkId) {
    return this.db.prepare('SELECT * FROM embeddings WHERE chunk_id = ?').get(chunkId);
  }

  /** All embeddings joined with their chunk content/location (for brute-force scan). */
  getAllEmbeddings() {
    return this.db.prepare(`
      SELECT e.*, c.content, c.file_path, c.start_line, c.end_line
      FROM embeddings e
      JOIN chunks c ON e.chunk_id = c.id
    `).all();
  }

  // ========== Embedding Cache ==========

  /** Look up a cached embedding by content hash. */
  getCachedEmbedding(contentHash) {
    return this.db.prepare('SELECT * FROM embedding_cache WHERE content_hash = ?').get(contentHash);
  }

  /** Store an embedding in the cache keyed by its content hash. */
  cacheEmbedding(contentHash, embedding, model, dimensions) {
    const stmt = this.db.prepare(`
      INSERT OR REPLACE INTO embedding_cache (content_hash, embedding, model, dimensions, created_at)
      VALUES (?, ?, ?, ?, ?)
    `);
    return stmt.run(contentHash, embedding, model, dimensions, Date.now());
  }

  // ========== File Tracking ==========

  /** Record (or update) indexing metadata for a file. */
  upsertFile(filePath, contentHash, lastModified, fileSize) {
    const stmt = this.db.prepare(`
      INSERT OR REPLACE INTO files (path, content_hash, last_modified, file_size, indexed_at)
      VALUES (?, ?, ?, ?, ?)
    `);
    return stmt.run(filePath, contentHash, lastModified, fileSize, Date.now());
  }

  /** @returns {Object|undefined} Tracking row for one file. */
  getFile(filePath) {
    return this.db.prepare('SELECT * FROM files WHERE path = ?').get(filePath);
  }

  /** True when the file was indexed with exactly this content hash. */
  isFileIndexed(filePath, contentHash) {
    const file = this.getFile(filePath);
    return file && file.content_hash === contentHash;
  }

  /** Remove the tracking row for a file (chunks are deleted separately). */
  deleteFile(filePath) {
    return this.db.prepare('DELETE FROM files WHERE path = ?').run(filePath);
  }

  // ========== Search Operations ==========

  /**
   * Full-text search using FTS5 (BM25 ranking).
   * @param {string} query - FTS5 MATCH expression.
   * @param {number} limit - Max rows returned.
   * @returns {{results: Array, metadata: Object}} Results carry a 0-1 `score`.
   */
  searchFTS(query, limit = 20) {
    const startTime = performance.now();

    // Use FTS5 rank for scoring (BM25)
    const stmt = this.db.prepare(`
      SELECT
        c.*,
        rank as bm25_score
      FROM chunks_fts fts
      JOIN chunks c ON fts.id = c.id
      WHERE chunks_fts MATCH ?
      ORDER BY rank
      LIMIT ?
    `);

    const results = stmt.all(query, limit);
    const duration = performance.now() - startTime;

    // Log search
    this.logSearch(query, 'fts', results.length, duration);

    return {
      results: results.map(r => {
        // BUG FIX: FTS5 rank is the NEGATIVE BM25 score (more negative =
        // better match), so the old Math.max(0, rank) collapsed every hit
        // to score 1. Map relevance = -rank onto (0, 1), better = higher.
        const relevance = Math.max(0, -r.bm25_score);
        return {
          ...r,
          score: relevance / (1 + relevance)
        };
      }),
      metadata: {
        method: 'fts5',
        duration: duration.toFixed(2) + 'ms',
        candidates: results.length
      }
    };
  }

  /**
   * Hybrid search: combine FTS5 + vector similarity.
   * Final score = 0.7 * normalized vector score + 0.3 * FTS text score;
   * results below 0.35 are dropped.
   * @param {string} query - FTS5 MATCH expression.
   * @param {Array} vectorResults - Pre-computed vector hits; each entry needs
   *   { chunk_id, file_path, start_line, end_line, content, similarity }.
   * @param {number} limit - Max rows returned.
   */
  searchHybrid(query, vectorResults, limit = 10) {
    const startTime = performance.now();
    // Over-fetch FTS candidates so the merge has enough overlap to rank.
    const candidateMultiplier = 4;
    const maxCandidates = limit * candidateMultiplier;

    // Get FTS results
    const ftsResults = this.searchFTS(query, maxCandidates);

    // Map chunk id -> FTS text score (already normalized to 0-1)
    const ftsMap = new Map();
    ftsResults.results.forEach(r => {
      ftsMap.set(r.id, r.score);
    });

    // Normalize vector scores to 0-1 against the best similarity seen
    let maxVectorScore = 0;
    vectorResults.forEach(r => {
      if (r.similarity > maxVectorScore) maxVectorScore = r.similarity;
    });

    // Combine results
    const combined = new Map();

    // Add vector results
    vectorResults.forEach(r => {
      const normalizedVectorScore = maxVectorScore > 0 ? r.similarity / maxVectorScore : 0;
      combined.set(r.chunk_id, {
        chunk_id: r.chunk_id,
        file_path: r.file_path,
        start_line: r.start_line,
        end_line: r.end_line,
        content: r.content,
        vectorScore: normalizedVectorScore,
        textScore: ftsMap.get(r.chunk_id) || 0,
        sources: ['vector']
      });
    });

    // Add FTS results not already present from the vector pass
    ftsResults.results.forEach(r => {
      if (combined.has(r.id)) {
        combined.get(r.id).textScore = r.score;
        combined.get(r.id).sources.push('fts');
      } else {
        combined.set(r.id, {
          chunk_id: r.id,
          file_path: r.file_path,
          start_line: r.start_line,
          end_line: r.end_line,
          content: r.content,
          vectorScore: 0,
          textScore: r.score,
          sources: ['fts']
        });
      }
    });

    // Calculate final scores (0.7 vector + 0.3 text)
    const results = Array.from(combined.values()).map(r => ({
      ...r,
      finalScore: (0.7 * r.vectorScore) + (0.3 * r.textScore)
    }));

    // Sort by final score and filter out weak matches
    results.sort((a, b) => b.finalScore - a.finalScore);
    const filtered = results.filter(r => r.finalScore >= 0.35).slice(0, limit);

    const duration = performance.now() - startTime;
    this.logSearch(query, 'hybrid', filtered.length, duration);

    return {
      results: filtered,
      metadata: {
        method: 'hybrid',
        duration: duration.toFixed(2) + 'ms',
        vectorWeight: 0.7,
        textWeight: 0.3,
        minScore: 0.35,
        candidates: results.length,
        returned: filtered.length
      }
    };
  }

  // ========== Search History ==========

  /**
   * Best-effort analytics logging of one search. Never throws.
   * BUG FIX: the original used `await import('crypto')` inside this
   * non-async method, which is a SyntaxError and broke the whole module
   * at load time. crypto is now imported at the top of the file.
   */
  logSearch(query, method, resultsCount, durationMs) {
    try {
      const stmt = this.db.prepare(`
        INSERT INTO search_history (query, query_hash, method, results_count, duration_ms, timestamp)
        VALUES (?, ?, ?, ?, ?, ?)
      `);

      const queryHash = crypto.createHash('md5').update(query).digest('hex');

      stmt.run(query, queryHash, method, resultsCount, Math.round(durationMs), Date.now());
    } catch (error) {
      // Silent fail - don't break search for logging
    }
  }

  /** Aggregated search stats (count, avg duration, avg results) per method. */
  getSearchStats(days = 7) {
    const since = Date.now() - (days * 24 * 60 * 60 * 1000);
    return this.db.prepare(`
      SELECT
        method,
        COUNT(*) as count,
        AVG(duration_ms) as avg_duration,
        AVG(results_count) as avg_results
      FROM search_history
      WHERE timestamp > ?
      GROUP BY method
    `).all(since);
  }

  // ========== Stats ==========

  /** Row counts for every store plus the database path. */
  getStats() {
    const files = this.db.prepare('SELECT COUNT(*) as count FROM files').get();
    const chunks = this.db.prepare('SELECT COUNT(*) as count FROM chunks').get();
    const embeddings = this.db.prepare('SELECT COUNT(*) as count FROM embeddings').get();
    const cache = this.db.prepare('SELECT COUNT(*) as count FROM embedding_cache').get();

    return {
      files: files.count,
      chunks: chunks.count,
      embeddings: embeddings.count,
      cacheEntries: cache.count,
      dbPath: this.dbPath
    };
  }

  // ========== Maintenance ==========

  /** Compact the database file and refresh the query planner statistics. */
  vacuum() {
    this.db.exec('VACUUM');
    this.db.exec('ANALYZE');
  }

  /** Drop every cached embedding. */
  clearCache() {
    this.db.exec('DELETE FROM embedding_cache');
  }

  /** Close the connection; initialize() may be called again afterwards. */
  close() {
    if (this.db) {
      this.db.close();
      this.isInitialized = false;
    }
  }
}

export default MemoryDatabase;
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
/**
 * Markdown Chunker
 * Splits Markdown into chunks with overlap.
 * Chunk size: ~400 tokens, overlap: 80 tokens
 */
|
|
6
|
+
|
|
7
|
+
import crypto from 'crypto';
|
|
8
|
+
|
|
9
|
+
/**
 * Markdown Chunker
 *
 * Splits Markdown into overlapping chunks for indexing/embedding.
 * Defaults target ~400 tokens per chunk (~1600 chars at ~4 chars/token)
 * with ~80 tokens (~320 chars) of overlap between consecutive chunks.
 */
export class MarkdownChunker {
  /**
   * @param {Object} options
   * @param {number} [options.chunkSize=1600] - Target chunk size in characters.
   * @param {number} [options.overlap=320] - Overlap carried into the next chunk, in characters.
   * @param {boolean} [options.respectHeaders=true] - Let chunkWithHeaders() keep header sections together.
   */
  constructor(options = {}) {
    // Target ~400 tokens (approx 1600 chars for English)
    this.chunkSize = options.chunkSize || 1600;
    // 80 tokens overlap (approx 320 chars)
    this.overlap = options.overlap || 320;
    // Respect headers - don't split within a header section if possible
    this.respectHeaders = options.respectHeaders !== false;
  }

  /**
   * Chunk Markdown content line-by-line with overlap.
   * @param {string} content - Markdown content
   * @param {string} filePath - Source file path
   * @returns {Array} Array of chunk objects (see createChunk)
   */
  chunk(content, filePath) {
    const lines = content.split('\n');
    const chunks = [];
    let currentChunk = [];
    let currentSize = 0;
    let startLine = 0;

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      const lineSize = line.length + 1; // +1 for newline

      // Check if adding this line would exceed chunk size
      if (currentSize + lineSize > this.chunkSize && currentChunk.length > 0) {
        // Save current chunk
        const chunkText = currentChunk.join('\n');
        chunks.push(this.createChunk(chunkText, filePath, startLine + 1, i, content));

        // Start new chunk seeded with overlap lines from the previous one
        const overlapLines = this.getOverlapLines(currentChunk);
        currentChunk = [...overlapLines, line];
        currentSize = overlapLines.join('\n').length + lineSize;
        startLine = i - overlapLines.length;
      } else {
        currentChunk.push(line);
        currentSize += lineSize;
      }
    }

    // Don't forget the last chunk
    if (currentChunk.length > 0) {
      const chunkText = currentChunk.join('\n');
      chunks.push(this.createChunk(chunkText, filePath, startLine + 1, lines.length, content));
    }

    return chunks;
  }

  /**
   * Take lines from the end of the previous chunk until the configured
   * overlap size (in characters) is reached.
   */
  getOverlapLines(previousChunk) {
    let overlapSize = 0;
    const overlapLines = [];

    for (let i = previousChunk.length - 1; i >= 0; i--) {
      const line = previousChunk[i];
      overlapLines.unshift(line);
      overlapSize += line.length + 1;

      if (overlapSize >= this.overlap) {
        break;
      }
    }

    return overlapLines;
  }

  /**
   * Create a chunk object.
   * @returns {Object} { id, filePath, startLine, endLine, content,
   *   contentHash, fullContentHash, tokenCount, chunkType }
   */
  createChunk(content, filePath, startLine, endLine, fullContent) {
    const hash = crypto.createHash('md5').update(content).digest('hex');
    const fullHash = crypto.createHash('md5').update(fullContent).digest('hex');

    return {
      id: this.generateChunkId(filePath, startLine, content),
      filePath,
      startLine,
      endLine,
      content: content.trim(),
      contentHash: hash,
      fullContentHash: fullHash,
      tokenCount: this.estimateTokenCount(content),
      chunkType: this.detectChunkType(content)
    };
  }

  /**
   * Generate a stable 16-hex-char chunk ID from path, start line and
   * the first 100 characters of content.
   */
  generateChunkId(filePath, startLine, content) {
    const hash = crypto.createHash('md5')
      .update(`${filePath}:${startLine}:${content.substring(0, 100)}`)
      .digest('hex')
      .substring(0, 16);
    return hash;
  }

  /**
   * Estimate token count (rough approximation, ~4 chars per token for English).
   */
  estimateTokenCount(text) {
    return Math.ceil(text.length / 4);
  }

  /**
   * Classify a chunk by its leading Markdown construct
   * (h1/h2/h3, code fence, list, numbered list, quote, or plain content).
   */
  detectChunkType(content) {
    const trimmed = content.trim();

    if (trimmed.startsWith('# ')) return 'h1';
    if (trimmed.startsWith('## ')) return 'h2';
    if (trimmed.startsWith('### ')) return 'h3';
    if (trimmed.startsWith('```')) return 'code';
    if (trimmed.startsWith('- ') || trimmed.startsWith('* ')) return 'list';
    if (/^\d+\./.test(trimmed)) return 'numbered';
    if (trimmed.startsWith('>')) return 'quote';

    return 'content';
  }

  /**
   * Header-aware chunking: accumulates whole header sections into chunks
   * so a header stays with its content. Sections larger than chunkSize
   * fall back to the plain line-based chunker.
   */
  chunkWithHeaders(content, filePath) {
    if (!this.respectHeaders) {
      return this.chunk(content, filePath);
    }

    const sections = this.splitByHeaders(content);
    const chunks = [];
    let currentChunk = [];
    let currentSize = 0;
    let startLine = 0;
    let lineOffset = 0;

    for (const section of sections) {
      const sectionSize = section.content.length;

      // If section is larger than chunk size, split it
      if (sectionSize > this.chunkSize) {
        // Save current chunk first
        if (currentChunk.length > 0) {
          const chunkText = currentChunk.join('\n\n');
          chunks.push(this.createChunk(chunkText, filePath, startLine + 1, lineOffset, content));
          currentChunk = [];
          currentSize = 0;
        }

        // Split the large section and re-base its line numbers onto
        // this document's coordinates.
        const sectionChunks = this.chunk(section.content, filePath);
        for (const chunk of sectionChunks) {
          chunk.startLine += lineOffset;
          chunk.endLine += lineOffset;
          chunks.push(chunk);
        }
      }
      // If adding this section would exceed chunk size, start new chunk
      else if (currentSize + sectionSize > this.chunkSize && currentChunk.length > 0) {
        const chunkText = currentChunk.join('\n\n');
        chunks.push(this.createChunk(chunkText, filePath, startLine + 1, lineOffset, content));

        currentChunk = [section.content];
        currentSize = sectionSize;
        startLine = lineOffset;
      } else {
        // BUG FIX: when the buffer is empty (e.g. right after an oversized
        // section was flushed above), the original kept a stale startLine,
        // so the next accumulated chunk reported wrong line numbers.
        // Re-anchor the start to the current offset.
        if (currentChunk.length === 0) {
          startLine = lineOffset;
        }
        currentChunk.push(section.content);
        currentSize += sectionSize;
      }

      lineOffset += section.content.split('\n').length;
    }

    // Don't forget the last chunk
    if (currentChunk.length > 0) {
      const chunkText = currentChunk.join('\n\n');
      chunks.push(this.createChunk(chunkText, filePath, startLine + 1, lineOffset, content));
    }

    return chunks;
  }

  /**
   * Split content into sections, one per Markdown header (#..######).
   * Any leading text before the first header forms its own level-0 section.
   * @returns {Array<{level: number, content: string}>}
   */
  splitByHeaders(content) {
    const lines = content.split('\n');
    const sections = [];
    let currentSection = [];
    let currentLevel = 0;

    for (const line of lines) {
      const headerMatch = line.match(/^(#{1,6})\s/);

      if (headerMatch) {
        // Save previous section
        if (currentSection.length > 0) {
          sections.push({
            level: currentLevel,
            content: currentSection.join('\n')
          });
        }

        // Start new section
        currentLevel = headerMatch[1].length;
        currentSection = [line];
      } else {
        currentSection.push(line);
      }
    }

    // Don't forget the last section
    if (currentSection.length > 0) {
      sections.push({
        level: currentLevel,
        content: currentSection.join('\n')
      });
    }

    return sections;
  }
}

export default MarkdownChunker;
|