@grec0/memory-bank-mcp 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +642 -420
- package/dist/common/chunker.js +166 -534
- package/dist/common/embeddingService.js +39 -51
- package/dist/common/fileScanner.js +123 -58
- package/dist/common/indexManager.js +185 -108
- package/dist/common/projectKnowledgeService.js +627 -0
- package/dist/common/setup.js +49 -0
- package/dist/common/utils.js +215 -0
- package/dist/common/vectorStore.js +80 -67
- package/dist/index.js +106 -14
- package/dist/operations/boardMemberships.js +186 -0
- package/dist/operations/boards.js +268 -0
- package/dist/operations/cards.js +426 -0
- package/dist/operations/comments.js +249 -0
- package/dist/operations/labels.js +258 -0
- package/dist/operations/lists.js +157 -0
- package/dist/operations/projects.js +102 -0
- package/dist/operations/tasks.js +238 -0
- package/dist/tools/analyzeCoverage.js +50 -67
- package/dist/tools/board-summary.js +151 -0
- package/dist/tools/card-details.js +106 -0
- package/dist/tools/create-card-with-tasks.js +81 -0
- package/dist/tools/generateProjectDocs.js +133 -0
- package/dist/tools/getProjectDocs.js +126 -0
- package/dist/tools/index.js +3 -0
- package/dist/tools/indexCode.js +4 -2
- package/dist/tools/searchMemory.js +4 -2
- package/dist/tools/workflow-actions.js +145 -0
- package/dist/tools/writeFile.js +2 -2
- package/package.json +2 -2
package/dist/common/chunker.js
CHANGED
@@ -1,73 +1,30 @@
 /**
  * @fileoverview Intelligent code chunker for Memory Bank
  * Fragments code intelligently using AST parsing when possible
+ * Uses token counting to respect embedding model limits
  */
 import * as fs from "fs";
 import { parse } from "@babel/parser";
 import traverseLib from "@babel/traverse";
 import * as crypto from "crypto";
-import {
+import { encode } from "gpt-tokenizer";
 // Handle traverse library export
 const traverse = typeof traverseLib === 'function' ? traverseLib : traverseLib.default;
-//
-
+// Constants for embedding model limits
+// text-embedding-3-small has 8192 token limit, use 7500 for safety margin
+const MAX_TOKENS_PER_CHUNK = 7500;
+const DEFAULT_CHUNK_OVERLAP_TOKENS = 200;
 /**
- *
+ * Counts tokens in a text using tiktoken-compatible tokenizer
  */
-function
-
-
-
-
-
-
-    else {
-        // Split into smaller chunks
-        const content = chunk.content;
-        const lines = content.split('\n');
-        let currentChunkLines = [];
-        let currentTokens = 0;
-        let startLine = chunk.startLine;
-        let partIndex = 1;
-        for (let i = 0; i < lines.length; i++) {
-            const line = lines[i];
-            const lineTokens = enc.encode(line + '\n').length;
-            if (currentTokens + lineTokens > maxTokens) {
-                // Push current chunk
-                if (currentChunkLines.length > 0) {
-                    const subContent = currentChunkLines.join('\n');
-                    result.push({
-                        ...chunk,
-                        id: `${chunk.id}-${partIndex}`,
-                        content: subContent,
-                        startLine: startLine,
-                        endLine: startLine + currentChunkLines.length - 1,
-                        name: chunk.name ? `${chunk.name} (Part ${partIndex})` : undefined
-                    });
-                    partIndex++;
-                    startLine += currentChunkLines.length;
-                    currentChunkLines = [];
-                    currentTokens = 0;
-                }
-            }
-            currentChunkLines.push(line);
-            currentTokens += lineTokens;
-        }
-        // Remaining
-        if (currentChunkLines.length > 0) {
-            const subContent = currentChunkLines.join('\n');
-            result.push({
-                ...chunk,
-                id: `${chunk.id}-${partIndex}`,
-                content: subContent,
-                startLine: startLine,
-                endLine: chunk.endLine, // Best effort
-                name: chunk.name ? `${chunk.name} (Part ${partIndex})` : undefined
-            });
-        }
-    }
+export function countTokens(text) {
+    try {
+        return encode(text).length;
+    }
+    catch {
+        // Fallback estimation: ~4 characters per token for code
+        return Math.ceil(text.length / 4);
     }
-    return result;
 }
 /**
  * Generates unique ID for a chunk based on content and metadata
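This first hunk carries the core of the 0.0.5 chunker change: character-based sizing gives way to real token counting via `gpt-tokenizer`, with hard limits captured as constants and `countTokens` exported with a character-based fallback. A minimal usage sketch follows; the deep-import path and sample string are assumptions for illustration, not documented package API:

```js
// Sketch: behavior of the exported countTokens helper (illustrative values).
import { countTokens } from "@grec0/memory-bank-mcp/dist/common/chunker.js";

const snippet = "export const add = (a, b) => a + b;";
console.log(countTokens(snippet)); // exact BPE count from gpt-tokenizer's encode()

// If encode() ever throws, the catch branch estimates ~4 characters per token,
// so callers always get a number back instead of an exception:
// Math.ceil(snippet.length / 4) === 9 for this 35-character string.
```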
@@ -120,6 +77,94 @@ function extractContext(content, language) {
     }
     return contextLines.join("\n");
 }
+/**
+ * Splits a chunk that exceeds the token limit into smaller chunks
+ */
+function splitLargeChunk(chunk, maxTokens, overlapTokens) {
+    const tokenCount = countTokens(chunk.content);
+    // If under limit, return as-is
+    if (tokenCount <= maxTokens) {
+        return [{ ...chunk, tokenCount }];
+    }
+    console.error(`Splitting large chunk: ${chunk.filePath} (${chunk.name || 'unnamed'}) - ${tokenCount} tokens exceeds ${maxTokens} limit`);
+    const subChunks = [];
+    const lines = chunk.content.split("\n");
+    let currentLines = [];
+    let currentTokens = 0;
+    let subChunkStartLine = chunk.startLine;
+    let subChunkIndex = 0;
+    for (let i = 0; i < lines.length; i++) {
+        const line = lines[i];
+        const lineTokens = countTokens(line + "\n");
+        // If single line exceeds max, we have to include it anyway (extreme edge case)
+        if (lineTokens > maxTokens && currentLines.length === 0) {
+            currentLines.push(line);
+            currentTokens = lineTokens;
+        }
+        else if (currentTokens + lineTokens > maxTokens && currentLines.length > 0) {
+            // Save current chunk
+            const content = currentLines.join("\n");
+            const actualTokens = countTokens(content);
+            subChunks.push({
+                id: generateChunkId(chunk.filePath, content, subChunkStartLine),
+                filePath: chunk.filePath,
+                content,
+                startLine: subChunkStartLine,
+                endLine: chunk.startLine + i - 1,
+                chunkType: chunk.chunkType,
+                name: chunk.name ? `${chunk.name}_part${subChunkIndex + 1}` : undefined,
+                language: chunk.language,
+                context: chunk.context,
+                tokenCount: actualTokens,
+            });
+            subChunkIndex++;
+            // Calculate overlap - try to include enough lines to reach overlapTokens
+            let overlapLines = [];
+            let overlapTokenCount = 0;
+            for (let j = currentLines.length - 1; j >= 0 && overlapTokenCount < overlapTokens; j--) {
+                overlapLines.unshift(currentLines[j]);
+                overlapTokenCount += countTokens(currentLines[j] + "\n");
+            }
+            currentLines = [...overlapLines, line];
+            currentTokens = overlapTokenCount + lineTokens;
+            subChunkStartLine = chunk.startLine + i - overlapLines.length;
+        }
+        else {
+            currentLines.push(line);
+            currentTokens += lineTokens;
+        }
+    }
+    // Save final sub-chunk
+    if (currentLines.length > 0) {
+        const content = currentLines.join("\n");
+        const actualTokens = countTokens(content);
+        subChunks.push({
+            id: generateChunkId(chunk.filePath, content, subChunkStartLine),
+            filePath: chunk.filePath,
+            content,
+            startLine: subChunkStartLine,
+            endLine: chunk.endLine,
+            chunkType: chunk.chunkType,
+            name: chunk.name ? `${chunk.name}_part${subChunkIndex + 1}` : undefined,
+            language: chunk.language,
+            context: chunk.context,
+            tokenCount: actualTokens,
+        });
+    }
+    console.error(`  Split into ${subChunks.length} sub-chunks`);
+    return subChunks;
+}
+/**
+ * Processes chunks to ensure none exceed the token limit
+ */
+function enforceTokenLimits(chunks, maxTokens, overlapTokens) {
+    const result = [];
+    for (const chunk of chunks) {
+        const splitChunks = splitLargeChunk(chunk, maxTokens, overlapTokens);
+        result.push(...splitChunks);
+    }
+    return result;
+}
 /**
  * Chunks TypeScript/JavaScript code using AST parsing
  */
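`splitLargeChunk` and `enforceTokenLimits` are module-private, so they cannot be called directly; the detail worth understanding is the overlap window: whenever a sub-chunk is emitted, its tail is replayed at the start of the next sub-chunk so context survives the boundary. Below is a standalone sketch of that rule only, with a toy ~4-chars-per-token stand-in for `countTokens`; it is not the package's API:

```js
// Toy stand-in, mirroring the fallback path of countTokens.
const approxTokens = (s) => Math.ceil(s.length / 4);

// Rebuild the tail of a finished chunk until ~overlapTokens are collected,
// exactly as the inner j-loop above does. The next sub-chunk then starts
// as [...tail, nextLine], sharing that many tokens with its predecessor.
function overlapTail(finishedLines, overlapTokens) {
    const tail = [];
    let tokens = 0;
    for (let j = finishedLines.length - 1; j >= 0 && tokens < overlapTokens; j--) {
        tail.unshift(finishedLines[j]);
        tokens += approxTokens(finishedLines[j] + "\n");
    }
    return tail;
}

console.log(overlapTail(["const a = 1;", "const b = 2;", "const c = 3;"], 5));
// -> the last one or two lines, depending on their token cost
```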
@@ -231,8 +276,9 @@ function chunkTypeScriptJavaScript(options) {
                 }
             },
         });
-        // If no chunks were extracted
-        if (chunks.length === 0
+        // If no chunks were extracted, treat as single chunk
+        if (chunks.length === 0) {
+            const tokenCount = countTokens(options.content);
             chunks.push({
                 id: generateChunkId(options.filePath, options.content, 1),
                 filePath: options.filePath,
@@ -242,15 +288,17 @@ function chunkTypeScriptJavaScript(options) {
                 chunkType: "file",
                 language: options.language,
                 context,
+                tokenCount,
             });
         }
     }
     catch (error) {
         console.error(`AST parsing failed for ${options.filePath}, falling back to fixed chunking: ${error}`);
         // Fallback to fixed chunking if AST parsing fails
-        return
+        return chunkByTokens(options);
     }
-
+    // Enforce token limits on all chunks
+    return enforceTokenLimits(chunks, options.maxTokens, options.chunkOverlapTokens);
 }
 /**
  * Chunks Python code using simple pattern matching
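These two hunks tighten the AST strategy's exit paths: a parse failure now falls back to `chunkByTokens`, the no-chunks case records a `tokenCount`, and successful runs pass through `enforceTokenLimits`. A hedged sketch of what a caller sees on the failure path (the import path, file path, and content are made up for illustration):

```js
import { chunkCode } from "@grec0/memory-bank-mcp/dist/common/chunker.js";

// Malformed source makes @babel/parser throw; the catch above logs the error
// and re-chunks the same content with chunkByTokens, so the call still
// returns usable chunks instead of propagating the parse failure.
const chunks = chunkCode({
    filePath: "src/broken.ts",     // hypothetical path
    content: "function broken( {", // deliberately unparseable
    language: "typescript",
});
console.log(chunks.length >= 1, chunks[0].tokenCount); // chunks still produced
```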
@@ -269,7 +317,7 @@ function chunkPython(options) {
     for (let i = 0; i < lines.length; i++) {
         const line = lines[i];
         const trimmed = line.trim();
-        const indent = line.length - line.
+        const indent = line.length - line.trimStart().length;
         // Detect function definition
         if (trimmed.startsWith("def ")) {
             // Save previous chunk if exists
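The only change in `chunkPython` is how indentation is measured: the indent is the raw line length minus the length after `trimStart()`. A quick check of the arithmetic:

```js
// indent = total length minus length after stripping leading whitespace
const line = "        def handler(self):";
const indent = line.length - line.trimStart().length;
console.log(indent); // 8
```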
@@ -378,477 +426,57 @@ function chunkPython(options) {
             context,
         });
     }
-
-
-/**
- * Chunks HTML/Vue/Svelte code by extracting script/style blocks
- */
-function chunkHtml(options) {
-    const chunks = [];
-    const content = options.content;
-    const context = extractContext(content, options.language);
-    // Helper to add chunks from other languages
-    const addSubChunks = (subContent, subLang, offsetLine) => {
-        // If language is not supported for semantic chunking, it will fall back to fixed size
-        // We need to adjust line numbers relative to the file
-        const subOptions = {
-            ...options,
-            content: subContent,
-            language: subLang,
-        };
-        // We use the main chunkCode router to handle the sub-content
-        // This allows reusing JS/TS/CSS logic
-        let subChunks = [];
-        if (subLang === "typescript" || subLang === "javascript" || subLang === "ts" || subLang === "js") {
-            subChunks = chunkTypeScriptJavaScript(subOptions);
-        }
-        else if (subLang === "css" || subLang === "scss" || subLang === "sass") {
-            subChunks = chunkCss(subOptions);
-        }
-        else {
-            subChunks = chunkByFixedSize(subOptions);
-        }
-        subChunks.forEach(chunk => {
-            chunk.startLine += offsetLine;
-            chunk.endLine += offsetLine;
-            // Regenerate ID to ensure it includes the correct line numbers and file context
-            chunk.id = generateChunkId(options.filePath, chunk.content, chunk.startLine);
-            chunks.push(chunk);
-        });
-    };
-    // 1. Extract <script> blocks
-    const scriptRegex = /<script\s*(?:lang=["']([\w-]+)["'])?\s*(?:setup)?\s*>([\s\S]*?)<\/script>/gi;
-    let match;
-    while ((match = scriptRegex.exec(content)) !== null) {
-        const langIdx = match[1] || "javascript"; // Default to JS
-        const scriptContent = match[2];
-        // Normalize language
-        let subLang = langIdx.toLowerCase();
-        if (subLang === "ts")
-            subLang = "typescript";
-        if (subLang === "js")
-            subLang = "javascript";
-        // Calculate start line
-        const preMatch = content.substring(0, match.index);
-        const startLine = preMatch.split("\n").length - 1; // 0-indexed adjustment for calc
-        addSubChunks(scriptContent, subLang, startLine);
-    }
-    // 2. Extract <style> blocks
-    const styleRegex = /<style\s*(?:lang=["']([\w-]+)["'])?\s*(?:scoped)?\s*>([\s\S]*?)<\/style>/gi;
-    while ((match = styleRegex.exec(content)) !== null) {
-        const langIdx = match[1] || "css"; // Default to CSS
-        const styleContent = match[2];
-        // Normalize language
-        let subLang = langIdx.toLowerCase();
-        // Calculate start line
-        const preMatch = content.substring(0, match.index);
-        const startLine = preMatch.split("\n").length - 1;
-        addSubChunks(styleContent, subLang, startLine);
-    }
-    // 3. Process the template/HTML structure (rest of file or specific template block)
-    // For Vue, we might look for <template>, for pure HTML it's the whole file
-    // For simplicity, we'll try to find <template> first, if not, treat whole file (minus script/style) as HTML structure
-    // But removing script/style from content to chunk remainder is complex with line numbers.
-    // Instead, we will just chunk the whole file as "html" fixed chunks,
-    // but we can be smarter: split by top-level tags if possible?
-    // Given complexity, falling back to fixed-size chunking for the *entire* file content
-    // but labeled as "template" might be redundant with the script/style chunks.
-    // Better approach: Regex for <template> block in Vue/Svelte
-    const templateRegex = /<template>([\s\S]*?)<\/template>/i;
-    const templateMatch = templateRegex.exec(content);
-    if (templateMatch) {
-        const templateContent = templateMatch[1];
-        const preMatch = content.substring(0, templateMatch.index);
-        const startLine = preMatch.split("\n").length - 1;
-        // Chunk template as HTML (fixed size for now, strict AST for HTML is hard without lib)
-        addSubChunks(templateContent, "html", startLine);
-    }
-    else if (options.language === "html") {
-        // For pure HTML files, just use fixed size chunking but exclude script/style if possible?
-        // Actually, letting it chunk the whole file by fixed size is a safe fallback for the "structure"
-        // The script/style chunks will strictly point to logic/styles.
-        // Overlapping coverage is acceptable.
-        // Let's rely on fixed partitioning for HTML content
-        const htmlChunks = chunkByFixedSize({
-            ...options,
-            language: "html"
-        });
-        // We only add these if we are sure we aren't duplicating too much logic?
-        // Actually duplication is fine, vector search handles it.
-        // But better to separate concerns.
-        chunks.push(...htmlChunks);
-    }
-    return chunks;
-}
-/**
- * Chunks CSS/SCSS code by parsing rule blocks
- */
-function chunkCss(options) {
-    const chunks = [];
-    const lines = options.content.split("\n");
-    const context = extractContext(options.content, options.language);
-    let currentChunk = [];
-    let chunkStartLine = 1;
-    let braceDepth = 0;
-    let inComment = false;
-    for (let i = 0; i < lines.length; i++) {
-        const line = lines[i];
-        const trimmed = line.trim();
-        if (trimmed.startsWith("/*") && !inComment)
-            inComment = true;
-        if (trimmed.endsWith("*/") && inComment)
-            inComment = false;
-        // Count braces to detect block boundaries
-        // Simple heuristic, might fail on complex strings containing braces
-        const openBraces = (line.match(/\{/g) || []).length;
-        const closeBraces = (line.match(/\}/g) || []).length;
-        braceDepth += openBraces - closeBraces;
-        currentChunk.push(line);
-        // If we are at root level (depth 0) and have content, and just closed a block or ended a property
-        if (braceDepth === 0 && !inComment && currentChunk.length > 0) {
-            const chunkContent = currentChunk.join("\n").trim();
-            // Don't chunk empty lines
-            if (chunkContent.length > 0 && chunkContent !== "}") {
-                // Only finalize chunk if it looks like a complete rule or directive
-                // i.e. ends with } or ;
-                if (chunkContent.endsWith("}") || chunkContent.endsWith(";")) {
-                    chunks.push({
-                        id: generateChunkId(options.filePath, chunkContent, chunkStartLine),
-                        filePath: options.filePath,
-                        content: chunkContent,
-                        startLine: chunkStartLine,
-                        endLine: i + 1,
-                        chunkType: "block", // CSS rule
-                        language: options.language,
-                        context,
-                    });
-                    currentChunk = [];
-                    chunkStartLine = i + 2; // Next line
-                }
-            }
-        }
-        // Safety break for very large chunks
-        if (currentChunk.join("\n").length > (options.maxChunkSize * 2)) {
-            // Force split if rule is too massive
-            const chunkContent = currentChunk.join("\n");
-            // Validate content before pushing
-            if (chunkContent.trim().length > 0 && chunkContent.trim() !== "}") {
-                chunks.push({
-                    id: generateChunkId(options.filePath, chunkContent, chunkStartLine),
-                    filePath: options.filePath,
-                    content: chunkContent,
-                    startLine: chunkStartLine,
-                    endLine: i + 1,
-                    chunkType: "block",
-                    language: options.language,
-                    context,
-                });
-            }
-            currentChunk = [];
-            chunkStartLine = i + 2;
-            braceDepth = 0; // Reset to avoid getting stuck
-        }
-    }
-    // Remaining
-    // Remaining
-    if (currentChunk.length > 0) {
-        const chunkContent = currentChunk.join("\n");
-        // Validate content before pushing
-        if (chunkContent.trim().length > 0 && chunkContent.trim() !== "}") {
-            chunks.push({
-                id: generateChunkId(options.filePath, chunkContent, chunkStartLine),
-                filePath: options.filePath,
-                content: chunkContent,
-                startLine: chunkStartLine,
-                endLine: lines.length,
-                chunkType: "block",
-                language: options.language,
-                context,
-            });
-        }
-    }
-    return chunks;
-}
-/**
- * Chunks JSON files by parsing structure
- */
-function chunkJson(options) {
-    const chunks = [];
-    // Context for JSON is usually not useful (just start of file)
-    const context = "";
-    try {
-        const json = JSON.parse(options.content);
-        if (Array.isArray(json)) {
-            // Chunk array items
-            json.forEach((item, index) => {
-                const itemStr = JSON.stringify(item, null, 2);
-                // We can't easily get exact lines from JSON.parse
-                // So we approximate or just treat as logical chunks without strict line mapping
-                // For semantic search, the content is what matters.
-                // Line numbers will be approximate (0-0 or 1-1) unless we re-search the string in content
-                // Let's try to locate the item in string roughly? expensive.
-                // We will just create chunks with content.
-                chunks.push({
-                    id: generateChunkId(options.filePath, itemStr, index), // index as salt
-                    filePath: options.filePath,
-                    content: itemStr,
-                    startLine: 1, // Unknown
-                    endLine: 1, // Unknown
-                    chunkType: "block",
-                    name: `[${index}]`,
-                    language: "json",
-                    context,
-                });
-            });
-        }
-        else if (typeof json === "object" && json !== null) {
-            // Chunk top-level keys
-            Object.keys(json).forEach((key) => {
-                const val = json[key];
-                const valStr = JSON.stringify(val, null, 2);
-                const chunkContent = `"${key}": ${valStr}`;
-                if (chunkContent.length > options.maxChunkSize) {
-                    // If value is huge, maybe we should recurse or fixed-chunk it?
-                    // For now, let's just push it.
-                }
-                chunks.push({
-                    id: generateChunkId(options.filePath, chunkContent, 0),
-                    filePath: options.filePath,
-                    content: chunkContent,
-                    startLine: 1,
-                    endLine: 1,
-                    chunkType: "block",
-                    name: key,
-                    language: "json",
-                    context,
-                });
-            });
-        }
-        else {
-            // Primitive, single chunk
-            chunks.push({
-                id: generateChunkId(options.filePath, options.content, 1),
-                filePath: options.filePath,
-                content: options.content,
-                startLine: 1,
-                endLine: options.content.split("\n").length,
-                chunkType: "file",
-                language: "json",
-            });
-        }
-    }
-    catch (e) {
-        // Fallback to fixed size if invalid JSON
-        return chunkByFixedSize(options);
-    }
-    return chunks;
-}
-/**
- * Chunks Java code (Spring Boot support) using brace tracking and regex
- */
-function chunkJava(options) {
-    const chunks = [];
-    const lines = options.content.split("\n");
-    const context = extractContext(options.content, options.language);
-    let currentChunk = [];
-    let chunkStartLine = 1;
-    let braceDepth = 0;
-    let inClass = false;
-    let inMethod = false;
-    let className;
-    let methodName;
-    let chunkBaseDepth = 0;
-    let annotations = [];
-    for (let i = 0; i < lines.length; i++) {
-        const line = lines[i];
-        const trimmed = line.trim();
-        // Skip comments for logic but include in chunk
-        const isComment = trimmed.startsWith("//") || trimmed.startsWith("/*") || trimmed.startsWith("*");
-        // Track strict brace depth
-        const openBraces = (line.match(/\{/g) || []).length;
-        const closeBraces = (line.match(/\}/g) || []).length;
-        // Check for annotations
-        if (trimmed.startsWith("@") && !isComment) {
-            if (currentChunk.length === 0 && annotations.length === 0) {
-                chunkStartLine = i + 1;
-            }
-            annotations.push(line);
-            // Annotations are part of the next chunk
-            currentChunk.push(line);
-            continue;
-        }
-        // Detect Class/Interface
-        const classMatch = trimmed.match(/(?:public|protected|private)?\s*(?:static)?\s*(?:class|interface|enum)\s+(\w+)/);
-        if (classMatch && !isComment) {
-            // If we are already in a chunk (e.g. previous class ended), push it
-            // But if we are just starting (annotations only), keep going
-            if (currentChunk.length > annotations.length && braceDepth === chunkBaseDepth) {
-                const content = currentChunk.join("\n");
-                chunks.push({
-                    id: generateChunkId(options.filePath, content, chunkStartLine),
-                    filePath: options.filePath,
-                    content,
-                    startLine: chunkStartLine,
-                    endLine: i,
-                    chunkType: inClass ? "class" : "file", // inner class
-                    name: className,
-                    language: options.language,
-                    context
-                });
-                currentChunk = [...annotations]; // Start new chunk with potential accumulated annotations
-                chunkStartLine = i + 1 - annotations.length;
-            }
-            else if (currentChunk.length === 0) {
-                chunkStartLine = i + 1;
-            }
-            inClass = true;
-            inMethod = false;
-            className = classMatch[1];
-            chunkBaseDepth = braceDepth;
-            annotations = [];
-        }
-        // Detect Method (heuristic: access modifier + type + name + (args) + {)
-        // Avoid control structures like if/for/while/switch/catch
-        const methodMatch = trimmed.match(/(?:public|protected|private)\s+(?:[\w<>?\[\]]+\s+)(\w+)\s*\(/);
-        const isControlFlow = /^(if|for|while|switch|catch|try)\b/.test(trimmed);
-        if (methodMatch && !isControlFlow && !isComment) {
-            // if we are inside a class, this is a method chunk
-            if (braceDepth === chunkBaseDepth + 1) { // Direct member of class
-                // Previous logical block (fields, etc) ends here
-                if (currentChunk.length > annotations.length) {
-                    const content = currentChunk.join("\n");
-                    chunks.push({
-                        id: generateChunkId(options.filePath, content, chunkStartLine),
-                        filePath: options.filePath,
-                        content,
-                        startLine: chunkStartLine,
-                        endLine: i,
-                        chunkType: "block",
-                        name: className, // Context of class
-                        language: options.language,
-                        context
-                    });
-                }
-                currentChunk = [...annotations];
-                chunkStartLine = i + 1 - annotations.length;
-                methodName = methodMatch[1];
-                inMethod = true;
-                annotations = [];
-            }
-        }
-        currentChunk.push(line);
-        braceDepth += openBraces - closeBraces;
-        // Check if block ended (method or class)
-        // We close the chunk if we return to the depth where we started THIS chunk
-        // But we need to handle the case where we just closed the class itself
-        // Logic: If we are in a method, and brace depth returns to class level -> method closed
-        if (inMethod && braceDepth === chunkBaseDepth + 1 && closeBraces > 0) {
-            const content = currentChunk.join("\n");
-            chunks.push({
-                id: generateChunkId(options.filePath, content, chunkStartLine),
-                filePath: options.filePath,
-                content,
-                startLine: chunkStartLine,
-                endLine: i + 1,
-                chunkType: "method",
-                name: methodName,
-                language: options.language,
-                context
-            });
-            currentChunk = [];
-            inMethod = false;
-            methodName = undefined;
-            chunkStartLine = i + 2;
-        }
-        // If brace depth returns to chunkBaseDepth -> class closed
-        else if (inClass && braceDepth === chunkBaseDepth && closeBraces > 0) {
-            const content = currentChunk.join("\n");
-            chunks.push({
-                id: generateChunkId(options.filePath, content, chunkStartLine),
-                filePath: options.filePath,
-                content,
-                startLine: chunkStartLine,
-                endLine: i + 1,
-                chunkType: "class",
-                name: className,
-                language: options.language,
-                context
-            });
-            currentChunk = [];
-            inClass = false;
-            className = undefined;
-            chunkStartLine = i + 2;
-        }
-        // Safety break for very large chunks
-        if (currentChunk.join("\n").length > (options.maxChunkSize * 3)) {
-            // If a single method is massive, we have to split it.
-            // enforceTokenLimits will handle strict splitting, but we should probably
-            // force a commit here to avoid memory pressure if it's crazy huge
-        }
-        if (closeBraces > 0 && annotations.length > 0)
-            chunks.push(...[]); // no-op just to use variable
-        if (openBraces > 0)
-            annotations = []; // Clear annotations if we opened a brace (they were consumed)
-    }
-    // Remaining content
-    if (currentChunk.length > 0) {
-        const content = currentChunk.join("\n");
-        if (content.trim().length > 0) {
-            chunks.push({
-                id: generateChunkId(options.filePath, content, chunkStartLine),
-                filePath: options.filePath,
-                content,
-                startLine: chunkStartLine,
-                endLine: lines.length,
-                chunkType: "file",
-                language: options.language,
-                context
-            });
-        }
-    }
-    // Fallback if regex failed to find anything
-    if (chunks.length === 0) {
-        return chunkByFixedSize(options);
-    }
-    return chunks;
+    // Enforce token limits on all chunks
+    return enforceTokenLimits(chunks, options.maxTokens, options.chunkOverlapTokens);
 }
 /**
- * Chunks code by
+ * Chunks code by token count with overlap (replacement for chunkByFixedSize)
  */
-function
+function chunkByTokens(options) {
     const chunks = [];
     const lines = options.content.split("\n");
     const context = extractContext(options.content, options.language);
     let currentLines = [];
-    let
+    let currentTokens = 0;
     let chunkStartLine = 1;
     for (let i = 0; i < lines.length; i++) {
         const line = lines[i];
-
-
-
-        if (currentSize >= options.maxChunkSize) {
+        const lineTokens = countTokens(line + "\n");
+        // If we've reached max tokens
+        if (currentTokens + lineTokens > options.maxTokens && currentLines.length > 0) {
             const content = currentLines.join("\n");
+            const actualTokens = countTokens(content);
             chunks.push({
                 id: generateChunkId(options.filePath, content, chunkStartLine),
                 filePath: options.filePath,
                 content,
                 startLine: chunkStartLine,
-                endLine: i
+                endLine: i,
                 chunkType: "block",
                 language: options.language,
                 context,
+                tokenCount: actualTokens,
             });
-            // Calculate overlap
-
-
-
-
+            // Calculate overlap in lines (approximate)
+            let overlapLines = [];
+            let overlapTokenCount = 0;
+            for (let j = currentLines.length - 1; j >= 0 && overlapTokenCount < options.chunkOverlapTokens; j--) {
+                overlapLines.unshift(currentLines[j]);
+                overlapTokenCount += countTokens(currentLines[j] + "\n");
+            }
+            currentLines = [...overlapLines, line];
+            currentTokens = overlapTokenCount + lineTokens;
+            chunkStartLine = i + 1 - overlapLines.length;
+        }
+        else {
+            currentLines.push(line);
+            currentTokens += lineTokens;
         }
     }
     // Add remaining content as final chunk
     if (currentLines.length > 0) {
         const content = currentLines.join("\n");
+        const actualTokens = countTokens(content);
         chunks.push({
             id: generateChunkId(options.filePath, content, chunkStartLine),
             filePath: options.filePath,
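This large hunk removes the HTML/Vue/Svelte, CSS, JSON, and Java strategies entirely and completes `chunkByTokens`, which replaces the character-count cutting of `chunkByFixedSize` with token-count cutting plus a token-budgeted overlap. A sketch of driving it through the exported `chunkCode` router; the import path, file path, language tag, and small limits are illustrative assumptions:

```js
import { chunkCode } from "@grec0/memory-bank-mcp/dist/common/chunker.js";

// Languages without a dedicated strategy now flow through chunkByTokens:
// cuts happen near maxTokens, and each new chunk replays ~chunkOverlapTokens
// of the previous chunk's tail so context survives the boundary.
const content = Array.from({ length: 5000 }, (_, i) => `note line ${i}`).join("\n");
const chunks = chunkCode({
    filePath: "docs/notes.md",   // hypothetical file
    content,
    language: "markdown",        // no dedicated strategy -> token-based path
    maxTokens: 512,              // far below the 7500 default, for illustration
    chunkOverlapTokens: 64,
});
console.log(chunks.map(c => `${c.startLine}-${c.endLine}: ${c.tokenCount} tokens`));
```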
@@ -858,10 +486,18 @@ function chunkByFixedSize(options) {
             chunkType: "block",
             language: options.language,
             context,
+            tokenCount: actualTokens,
         });
     }
     return chunks;
 }
+/**
+ * Legacy function for backwards compatibility
+ * @deprecated Use chunkByTokens instead
+ */
+function chunkByFixedSize(options) {
+    return chunkByTokens(options);
+}
 /**
  * Main chunking function - routes to appropriate strategy based on language
  */
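The deprecated `chunkByFixedSize` shim keeps 0.0.3-era call sites working: the legacy `maxChunkSize`/`chunkOverlap` options are still accepted and mapped into `fullOptions` (next hunk), but cut points are now decided by `maxTokens`/`chunkOverlapTokens`, defaulting to 7500/200. A sketch with illustrative values:

```js
import { chunkCode } from "@grec0/memory-bank-mcp/dist/common/chunker.js";

// Old-style options still run; they no longer decide the cut points.
const someText = "const x = 1;\n".repeat(2000);
const legacyChunks = chunkCode({
    filePath: "legacy/file.txt", // hypothetical
    content: someText,
    language: "text",
    maxChunkSize: 1000,          // legacy: accepted for compatibility only
    chunkOverlap: 200,           // legacy: accepted for compatibility only
});
// Cutting is governed by the token options' defaults: 7500 / 200 tokens.
```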
@@ -870,50 +506,46 @@ export function chunkCode(options) {
         filePath: options.filePath,
         content: options.content,
         language: options.language,
+        maxTokens: options.maxTokens || MAX_TOKENS_PER_CHUNK,
+        chunkOverlapTokens: options.chunkOverlapTokens || DEFAULT_CHUNK_OVERLAP_TOKENS,
+        // Legacy options mapping
         maxChunkSize: options.maxChunkSize || 1000,
         chunkOverlap: options.chunkOverlap || 200,
     };
-    // Force fixed-size chunking for minified files to prevent context length errors
-    if (fullOptions.filePath.includes(".min.")) {
-        const rawChunks = chunkByFixedSize(fullOptions);
-        return enforceTokenLimits(rawChunks);
-    }
     // Route to appropriate chunking strategy
-    let chunks = [];
     if (fullOptions.language === "typescript" || fullOptions.language === "javascript") {
-
+        return chunkTypeScriptJavaScript(fullOptions);
     }
     else if (fullOptions.language === "python") {
-
-    }
-    else if (["html", "vue", "svelte"].includes(fullOptions.language)) {
-        chunks = chunkHtml(fullOptions);
-    }
-    else if (["css", "scss", "sass", "less"].includes(fullOptions.language)) {
-        chunks = chunkCss(fullOptions);
-    }
-    else if (fullOptions.language === "json") {
-        chunks = chunkJson(fullOptions);
-    }
-    else if (fullOptions.language === "java") {
-        chunks = chunkJava(fullOptions);
+        return chunkPython(fullOptions);
     }
     else {
-        // For other languages, use
-
+        // For other languages, use token-based chunking
+        return chunkByTokens(fullOptions);
     }
-    return enforceTokenLimits(chunks);
 }
 /**
  * Chunks a file by reading it from disk
  */
-export function chunkFile(filePath, language,
+export function chunkFile(filePath, language, maxTokens, chunkOverlapTokens) {
     const content = fs.readFileSync(filePath, "utf-8");
     return chunkCode({
         filePath,
         content,
         language,
-
-
+        maxTokens,
+        chunkOverlapTokens,
     });
 }
+/**
+ * Utility to check if content would fit in a single embedding
+ */
+export function wouldFitInSingleEmbedding(content, maxTokens = MAX_TOKENS_PER_CHUNK) {
+    return countTokens(content) <= maxTokens;
+}
+/**
+ * Get the maximum tokens allowed per chunk
+ */
+export function getMaxTokensPerChunk() {
+    return MAX_TOKENS_PER_CHUNK;
+}
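The tail of the diff widens `chunkFile`'s signature to accept token options and adds two small exported utilities. A usage sketch combining them; the deep-import path and the scanned file path are assumptions:

```js
import {
    chunkFile,
    wouldFitInSingleEmbedding,
    getMaxTokensPerChunk,
} from "@grec0/memory-bank-mcp/dist/common/chunker.js";

// Pre-check small content to skip splitting work entirely.
const content = "const x = 1;\n";
if (wouldFitInSingleEmbedding(content)) {
    console.log(`fits in one embedding (limit: ${getMaxTokensPerChunk()} tokens)`);
}

// Or chunk straight from disk; maxTokens/chunkOverlapTokens are optional
// here and fall back to the 7500/200 defaults inside chunkCode.
const chunks = chunkFile("src/index.js", "javascript"); // hypothetical path
console.log(`${chunks.length} chunks, largest ${Math.max(...chunks.map(c => c.tokenCount))} tokens`);
```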