@grec0/memory-bank-mcp 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,73 +1,30 @@
1
1
  /**
2
2
  * @fileoverview Intelligent code chunker for Memory Bank
3
3
  * Fragments code intelligently using AST parsing when possible
4
+ * Uses token counting to respect embedding model limits
4
5
  */
5
6
  import * as fs from "fs";
6
7
  import { parse } from "@babel/parser";
7
8
  import traverseLib from "@babel/traverse";
8
9
  import * as crypto from "crypto";
9
- import { getEncoding } from "js-tiktoken";
10
+ import { encode } from "gpt-tokenizer";
10
11
  // Handle traverse library export
11
12
  const traverse = typeof traverseLib === 'function' ? traverseLib : traverseLib.default;
12
- // Initialize tokenizer
13
- const enc = getEncoding("cl100k_base");
13
+ // Constants for embedding model limits
14
+ // text-embedding-3-small has 8192 token limit, use 7500 for safety margin
15
+ const MAX_TOKENS_PER_CHUNK = 7500;
16
+ const DEFAULT_CHUNK_OVERLAP_TOKENS = 200;
14
17
  /**
15
- * Enforces token limits on chunks, splitting them if necessary
18
+ * Counts tokens in a text using tiktoken-compatible tokenizer
16
19
  */
17
- function enforceTokenLimits(chunks, maxTokens = 8000) {
18
- const result = [];
19
- for (const chunk of chunks) {
20
- const tokens = enc.encode(chunk.content);
21
- if (tokens.length <= maxTokens) {
22
- result.push(chunk);
23
- }
24
- else {
25
- // Split into smaller chunks
26
- const content = chunk.content;
27
- const lines = content.split('\n');
28
- let currentChunkLines = [];
29
- let currentTokens = 0;
30
- let startLine = chunk.startLine;
31
- let partIndex = 1;
32
- for (let i = 0; i < lines.length; i++) {
33
- const line = lines[i];
34
- const lineTokens = enc.encode(line + '\n').length;
35
- if (currentTokens + lineTokens > maxTokens) {
36
- // Push current chunk
37
- if (currentChunkLines.length > 0) {
38
- const subContent = currentChunkLines.join('\n');
39
- result.push({
40
- ...chunk,
41
- id: `${chunk.id}-${partIndex}`,
42
- content: subContent,
43
- startLine: startLine,
44
- endLine: startLine + currentChunkLines.length - 1,
45
- name: chunk.name ? `${chunk.name} (Part ${partIndex})` : undefined
46
- });
47
- partIndex++;
48
- startLine += currentChunkLines.length;
49
- currentChunkLines = [];
50
- currentTokens = 0;
51
- }
52
- }
53
- currentChunkLines.push(line);
54
- currentTokens += lineTokens;
55
- }
56
- // Remaining
57
- if (currentChunkLines.length > 0) {
58
- const subContent = currentChunkLines.join('\n');
59
- result.push({
60
- ...chunk,
61
- id: `${chunk.id}-${partIndex}`,
62
- content: subContent,
63
- startLine: startLine,
64
- endLine: chunk.endLine, // Best effort
65
- name: chunk.name ? `${chunk.name} (Part ${partIndex})` : undefined
66
- });
67
- }
68
- }
20
+ export function countTokens(text) {
21
+ try {
22
+ return encode(text).length;
23
+ }
24
+ catch {
25
+ // Fallback estimation: ~4 characters per token for code
26
+ return Math.ceil(text.length / 4);
69
27
  }
70
- return result;
71
28
  }
72
29
  /**
73
30
  * Generates unique ID for a chunk based on content and metadata
@@ -120,6 +77,94 @@ function extractContext(content, language) {
120
77
  }
121
78
  return contextLines.join("\n");
122
79
  }
80
+ /**
81
+ * Splits a chunk that exceeds the token limit into smaller chunks
82
+ */
83
+ function splitLargeChunk(chunk, maxTokens, overlapTokens) {
84
+ const tokenCount = countTokens(chunk.content);
85
+ // If under limit, return as-is
86
+ if (tokenCount <= maxTokens) {
87
+ return [{ ...chunk, tokenCount }];
88
+ }
89
+ console.error(`Splitting large chunk: ${chunk.filePath} (${chunk.name || 'unnamed'}) - ${tokenCount} tokens exceeds ${maxTokens} limit`);
90
+ const subChunks = [];
91
+ const lines = chunk.content.split("\n");
92
+ let currentLines = [];
93
+ let currentTokens = 0;
94
+ let subChunkStartLine = chunk.startLine;
95
+ let subChunkIndex = 0;
96
+ for (let i = 0; i < lines.length; i++) {
97
+ const line = lines[i];
98
+ const lineTokens = countTokens(line + "\n");
99
+ // If single line exceeds max, we have to include it anyway (extreme edge case)
100
+ if (lineTokens > maxTokens && currentLines.length === 0) {
101
+ currentLines.push(line);
102
+ currentTokens = lineTokens;
103
+ }
104
+ else if (currentTokens + lineTokens > maxTokens && currentLines.length > 0) {
105
+ // Save current chunk
106
+ const content = currentLines.join("\n");
107
+ const actualTokens = countTokens(content);
108
+ subChunks.push({
109
+ id: generateChunkId(chunk.filePath, content, subChunkStartLine),
110
+ filePath: chunk.filePath,
111
+ content,
112
+ startLine: subChunkStartLine,
113
+ endLine: chunk.startLine + i - 1,
114
+ chunkType: chunk.chunkType,
115
+ name: chunk.name ? `${chunk.name}_part${subChunkIndex + 1}` : undefined,
116
+ language: chunk.language,
117
+ context: chunk.context,
118
+ tokenCount: actualTokens,
119
+ });
120
+ subChunkIndex++;
121
+ // Calculate overlap - try to include enough lines to reach overlapTokens
122
+ let overlapLines = [];
123
+ let overlapTokenCount = 0;
124
+ for (let j = currentLines.length - 1; j >= 0 && overlapTokenCount < overlapTokens; j--) {
125
+ overlapLines.unshift(currentLines[j]);
126
+ overlapTokenCount += countTokens(currentLines[j] + "\n");
127
+ }
128
+ currentLines = [...overlapLines, line];
129
+ currentTokens = overlapTokenCount + lineTokens;
130
+ subChunkStartLine = chunk.startLine + i - overlapLines.length;
131
+ }
132
+ else {
133
+ currentLines.push(line);
134
+ currentTokens += lineTokens;
135
+ }
136
+ }
137
+ // Save final sub-chunk
138
+ if (currentLines.length > 0) {
139
+ const content = currentLines.join("\n");
140
+ const actualTokens = countTokens(content);
141
+ subChunks.push({
142
+ id: generateChunkId(chunk.filePath, content, subChunkStartLine),
143
+ filePath: chunk.filePath,
144
+ content,
145
+ startLine: subChunkStartLine,
146
+ endLine: chunk.endLine,
147
+ chunkType: chunk.chunkType,
148
+ name: chunk.name ? `${chunk.name}_part${subChunkIndex + 1}` : undefined,
149
+ language: chunk.language,
150
+ context: chunk.context,
151
+ tokenCount: actualTokens,
152
+ });
153
+ }
154
+ console.error(` Split into ${subChunks.length} sub-chunks`);
155
+ return subChunks;
156
+ }
157
+ /**
158
+ * Processes chunks to ensure none exceed the token limit
159
+ */
160
+ function enforceTokenLimits(chunks, maxTokens, overlapTokens) {
161
+ const result = [];
162
+ for (const chunk of chunks) {
163
+ const splitChunks = splitLargeChunk(chunk, maxTokens, overlapTokens);
164
+ result.push(...splitChunks);
165
+ }
166
+ return result;
167
+ }
123
168
  /**
124
169
  * Chunks TypeScript/JavaScript code using AST parsing
125
170
  */
@@ -231,8 +276,9 @@ function chunkTypeScriptJavaScript(options) {
231
276
  }
232
277
  },
233
278
  });
234
- // If no chunks were extracted or file is small, treat as single chunk
235
- if (chunks.length === 0 || options.content.length <= options.maxChunkSize) {
279
+ // If no chunks were extracted, treat as single chunk
280
+ if (chunks.length === 0) {
281
+ const tokenCount = countTokens(options.content);
236
282
  chunks.push({
237
283
  id: generateChunkId(options.filePath, options.content, 1),
238
284
  filePath: options.filePath,
@@ -242,15 +288,17 @@ function chunkTypeScriptJavaScript(options) {
242
288
  chunkType: "file",
243
289
  language: options.language,
244
290
  context,
291
+ tokenCount,
245
292
  });
246
293
  }
247
294
  }
248
295
  catch (error) {
249
296
  console.error(`AST parsing failed for ${options.filePath}, falling back to fixed chunking: ${error}`);
250
297
  // Fallback to fixed chunking if AST parsing fails
251
- return chunkByFixedSize(options);
298
+ return chunkByTokens(options);
252
299
  }
253
- return chunks;
300
+ // Enforce token limits on all chunks
301
+ return enforceTokenLimits(chunks, options.maxTokens, options.chunkOverlapTokens);
254
302
  }
255
303
  /**
256
304
  * Chunks Python code using simple pattern matching
@@ -269,7 +317,7 @@ function chunkPython(options) {
269
317
  for (let i = 0; i < lines.length; i++) {
270
318
  const line = lines[i];
271
319
  const trimmed = line.trim();
272
- const indent = line.length - line.trimLeft().length;
320
+ const indent = line.length - line.trimStart().length;
273
321
  // Detect function definition
274
322
  if (trimmed.startsWith("def ")) {
275
323
  // Save previous chunk if exists
@@ -378,477 +426,57 @@ function chunkPython(options) {
378
426
  context,
379
427
  });
380
428
  }
381
- return chunks;
382
- }
383
- /**
384
- * Chunks HTML/Vue/Svelte code by extracting script/style blocks
385
- */
386
- function chunkHtml(options) {
387
- const chunks = [];
388
- const content = options.content;
389
- const context = extractContext(content, options.language);
390
- // Helper to add chunks from other languages
391
- const addSubChunks = (subContent, subLang, offsetLine) => {
392
- // If language is not supported for semantic chunking, it will fall back to fixed size
393
- // We need to adjust line numbers relative to the file
394
- const subOptions = {
395
- ...options,
396
- content: subContent,
397
- language: subLang,
398
- };
399
- // We use the main chunkCode router to handle the sub-content
400
- // This allows reusing JS/TS/CSS logic
401
- let subChunks = [];
402
- if (subLang === "typescript" || subLang === "javascript" || subLang === "ts" || subLang === "js") {
403
- subChunks = chunkTypeScriptJavaScript(subOptions);
404
- }
405
- else if (subLang === "css" || subLang === "scss" || subLang === "sass") {
406
- subChunks = chunkCss(subOptions);
407
- }
408
- else {
409
- subChunks = chunkByFixedSize(subOptions);
410
- }
411
- subChunks.forEach(chunk => {
412
- chunk.startLine += offsetLine;
413
- chunk.endLine += offsetLine;
414
- // Regenerate ID to ensure it includes the correct line numbers and file context
415
- chunk.id = generateChunkId(options.filePath, chunk.content, chunk.startLine);
416
- chunks.push(chunk);
417
- });
418
- };
419
- // 1. Extract <script> blocks
420
- const scriptRegex = /<script\s*(?:lang=["']([\w-]+)["'])?\s*(?:setup)?\s*>([\s\S]*?)<\/script>/gi;
421
- let match;
422
- while ((match = scriptRegex.exec(content)) !== null) {
423
- const langIdx = match[1] || "javascript"; // Default to JS
424
- const scriptContent = match[2];
425
- // Normalize language
426
- let subLang = langIdx.toLowerCase();
427
- if (subLang === "ts")
428
- subLang = "typescript";
429
- if (subLang === "js")
430
- subLang = "javascript";
431
- // Calculate start line
432
- const preMatch = content.substring(0, match.index);
433
- const startLine = preMatch.split("\n").length - 1; // 0-indexed adjustment for calc
434
- addSubChunks(scriptContent, subLang, startLine);
435
- }
436
- // 2. Extract <style> blocks
437
- const styleRegex = /<style\s*(?:lang=["']([\w-]+)["'])?\s*(?:scoped)?\s*>([\s\S]*?)<\/style>/gi;
438
- while ((match = styleRegex.exec(content)) !== null) {
439
- const langIdx = match[1] || "css"; // Default to CSS
440
- const styleContent = match[2];
441
- // Normalize language
442
- let subLang = langIdx.toLowerCase();
443
- // Calculate start line
444
- const preMatch = content.substring(0, match.index);
445
- const startLine = preMatch.split("\n").length - 1;
446
- addSubChunks(styleContent, subLang, startLine);
447
- }
448
- // 3. Process the template/HTML structure (rest of file or specific template block)
449
- // For Vue, we might look for <template>, for pure HTML it's the whole file
450
- // For simplicity, we'll try to find <template> first, if not, treat whole file (minus script/style) as HTML structure
451
- // But removing script/style from content to chunk remainder is complex with line numbers.
452
- // Instead, we will just chunk the whole file as "html" fixed chunks,
453
- // but we can be smarter: split by top-level tags if possible?
454
- // Given complexity, falling back to fixed-size chunking for the *entire* file content
455
- // but labeled as "template" might be redundant with the script/style chunks.
456
- // Better approach: Regex for <template> block in Vue/Svelte
457
- const templateRegex = /<template>([\s\S]*?)<\/template>/i;
458
- const templateMatch = templateRegex.exec(content);
459
- if (templateMatch) {
460
- const templateContent = templateMatch[1];
461
- const preMatch = content.substring(0, templateMatch.index);
462
- const startLine = preMatch.split("\n").length - 1;
463
- // Chunk template as HTML (fixed size for now, strict AST for HTML is hard without lib)
464
- addSubChunks(templateContent, "html", startLine);
465
- }
466
- else if (options.language === "html") {
467
- // For pure HTML files, just use fixed size chunking but exclude script/style if possible?
468
- // Actually, letting it chunk the whole file by fixed size is a safe fallback for the "structure"
469
- // The script/style chunks will strictly point to logic/styles.
470
- // Overlapping coverage is acceptable.
471
- // Let's rely on fixed partitioning for HTML content
472
- const htmlChunks = chunkByFixedSize({
473
- ...options,
474
- language: "html"
475
- });
476
- // We only add these if we are sure we aren't duplicating too much logic?
477
- // Actually duplication is fine, vector search handles it.
478
- // But better to separate concerns.
479
- chunks.push(...htmlChunks);
480
- }
481
- return chunks;
482
- }
483
- /**
484
- * Chunks CSS/SCSS code by parsing rule blocks
485
- */
486
- function chunkCss(options) {
487
- const chunks = [];
488
- const lines = options.content.split("\n");
489
- const context = extractContext(options.content, options.language);
490
- let currentChunk = [];
491
- let chunkStartLine = 1;
492
- let braceDepth = 0;
493
- let inComment = false;
494
- for (let i = 0; i < lines.length; i++) {
495
- const line = lines[i];
496
- const trimmed = line.trim();
497
- if (trimmed.startsWith("/*") && !inComment)
498
- inComment = true;
499
- if (trimmed.endsWith("*/") && inComment)
500
- inComment = false;
501
- // Count braces to detect block boundaries
502
- // Simple heuristic, might fail on complex strings containing braces
503
- const openBraces = (line.match(/\{/g) || []).length;
504
- const closeBraces = (line.match(/\}/g) || []).length;
505
- braceDepth += openBraces - closeBraces;
506
- currentChunk.push(line);
507
- // If we are at root level (depth 0) and have content, and just closed a block or ended a property
508
- if (braceDepth === 0 && !inComment && currentChunk.length > 0) {
509
- const chunkContent = currentChunk.join("\n").trim();
510
- // Don't chunk empty lines
511
- if (chunkContent.length > 0 && chunkContent !== "}") {
512
- // Only finalize chunk if it looks like a complete rule or directive
513
- // i.e. ends with } or ;
514
- if (chunkContent.endsWith("}") || chunkContent.endsWith(";")) {
515
- chunks.push({
516
- id: generateChunkId(options.filePath, chunkContent, chunkStartLine),
517
- filePath: options.filePath,
518
- content: chunkContent,
519
- startLine: chunkStartLine,
520
- endLine: i + 1,
521
- chunkType: "block", // CSS rule
522
- language: options.language,
523
- context,
524
- });
525
- currentChunk = [];
526
- chunkStartLine = i + 2; // Next line
527
- }
528
- }
529
- }
530
- // Safety break for very large chunks
531
- if (currentChunk.join("\n").length > (options.maxChunkSize * 2)) {
532
- // Force split if rule is too massive
533
- const chunkContent = currentChunk.join("\n");
534
- // Validate content before pushing
535
- if (chunkContent.trim().length > 0 && chunkContent.trim() !== "}") {
536
- chunks.push({
537
- id: generateChunkId(options.filePath, chunkContent, chunkStartLine),
538
- filePath: options.filePath,
539
- content: chunkContent,
540
- startLine: chunkStartLine,
541
- endLine: i + 1,
542
- chunkType: "block",
543
- language: options.language,
544
- context,
545
- });
546
- }
547
- currentChunk = [];
548
- chunkStartLine = i + 2;
549
- braceDepth = 0; // Reset to avoid getting stuck
550
- }
551
- }
552
- // Remaining
553
- // Remaining
554
- if (currentChunk.length > 0) {
555
- const chunkContent = currentChunk.join("\n");
556
- // Validate content before pushing
557
- if (chunkContent.trim().length > 0 && chunkContent.trim() !== "}") {
558
- chunks.push({
559
- id: generateChunkId(options.filePath, chunkContent, chunkStartLine),
560
- filePath: options.filePath,
561
- content: chunkContent,
562
- startLine: chunkStartLine,
563
- endLine: lines.length,
564
- chunkType: "block",
565
- language: options.language,
566
- context,
567
- });
568
- }
569
- }
570
- return chunks;
571
- }
572
- /**
573
- * Chunks JSON files by parsing structure
574
- */
575
- function chunkJson(options) {
576
- const chunks = [];
577
- // Context for JSON is usually not useful (just start of file)
578
- const context = "";
579
- try {
580
- const json = JSON.parse(options.content);
581
- if (Array.isArray(json)) {
582
- // Chunk array items
583
- json.forEach((item, index) => {
584
- const itemStr = JSON.stringify(item, null, 2);
585
- // We can't easily get exact lines from JSON.parse
586
- // So we approximate or just treat as logical chunks without strict line mapping
587
- // For semantic search, the content is what matters.
588
- // Line numbers will be approximate (0-0 or 1-1) unless we re-search the string in content
589
- // Let's try to locate the item in string roughly? expensive.
590
- // We will just create chunks with content.
591
- chunks.push({
592
- id: generateChunkId(options.filePath, itemStr, index), // index as salt
593
- filePath: options.filePath,
594
- content: itemStr,
595
- startLine: 1, // Unknown
596
- endLine: 1, // Unknown
597
- chunkType: "block",
598
- name: `[${index}]`,
599
- language: "json",
600
- context,
601
- });
602
- });
603
- }
604
- else if (typeof json === "object" && json !== null) {
605
- // Chunk top-level keys
606
- Object.keys(json).forEach((key) => {
607
- const val = json[key];
608
- const valStr = JSON.stringify(val, null, 2);
609
- const chunkContent = `"${key}": ${valStr}`;
610
- if (chunkContent.length > options.maxChunkSize) {
611
- // If value is huge, maybe we should recurse or fixed-chunk it?
612
- // For now, let's just push it.
613
- }
614
- chunks.push({
615
- id: generateChunkId(options.filePath, chunkContent, 0),
616
- filePath: options.filePath,
617
- content: chunkContent,
618
- startLine: 1,
619
- endLine: 1,
620
- chunkType: "block",
621
- name: key,
622
- language: "json",
623
- context,
624
- });
625
- });
626
- }
627
- else {
628
- // Primitive, single chunk
629
- chunks.push({
630
- id: generateChunkId(options.filePath, options.content, 1),
631
- filePath: options.filePath,
632
- content: options.content,
633
- startLine: 1,
634
- endLine: options.content.split("\n").length,
635
- chunkType: "file",
636
- language: "json",
637
- });
638
- }
639
- }
640
- catch (e) {
641
- // Fallback to fixed size if invalid JSON
642
- return chunkByFixedSize(options);
643
- }
644
- return chunks;
645
- }
646
- /**
647
- * Chunks Java code (Spring Boot support) using brace tracking and regex
648
- */
649
- function chunkJava(options) {
650
- const chunks = [];
651
- const lines = options.content.split("\n");
652
- const context = extractContext(options.content, options.language);
653
- let currentChunk = [];
654
- let chunkStartLine = 1;
655
- let braceDepth = 0;
656
- let inClass = false;
657
- let inMethod = false;
658
- let className;
659
- let methodName;
660
- let chunkBaseDepth = 0;
661
- let annotations = [];
662
- for (let i = 0; i < lines.length; i++) {
663
- const line = lines[i];
664
- const trimmed = line.trim();
665
- // Skip comments for logic but include in chunk
666
- const isComment = trimmed.startsWith("//") || trimmed.startsWith("/*") || trimmed.startsWith("*");
667
- // Track strict brace depth
668
- const openBraces = (line.match(/\{/g) || []).length;
669
- const closeBraces = (line.match(/\}/g) || []).length;
670
- // Check for annotations
671
- if (trimmed.startsWith("@") && !isComment) {
672
- if (currentChunk.length === 0 && annotations.length === 0) {
673
- chunkStartLine = i + 1;
674
- }
675
- annotations.push(line);
676
- // Annotations are part of the next chunk
677
- currentChunk.push(line);
678
- continue;
679
- }
680
- // Detect Class/Interface
681
- const classMatch = trimmed.match(/(?:public|protected|private)?\s*(?:static)?\s*(?:class|interface|enum)\s+(\w+)/);
682
- if (classMatch && !isComment) {
683
- // If we are already in a chunk (e.g. previous class ended), push it
684
- // But if we are just starting (annotations only), keep going
685
- if (currentChunk.length > annotations.length && braceDepth === chunkBaseDepth) {
686
- const content = currentChunk.join("\n");
687
- chunks.push({
688
- id: generateChunkId(options.filePath, content, chunkStartLine),
689
- filePath: options.filePath,
690
- content,
691
- startLine: chunkStartLine,
692
- endLine: i,
693
- chunkType: inClass ? "class" : "file", // inner class
694
- name: className,
695
- language: options.language,
696
- context
697
- });
698
- currentChunk = [...annotations]; // Start new chunk with potential accumulated annotations
699
- chunkStartLine = i + 1 - annotations.length;
700
- }
701
- else if (currentChunk.length === 0) {
702
- chunkStartLine = i + 1;
703
- }
704
- inClass = true;
705
- inMethod = false;
706
- className = classMatch[1];
707
- chunkBaseDepth = braceDepth;
708
- annotations = [];
709
- }
710
- // Detect Method (heuristic: access modifier + type + name + (args) + {)
711
- // Avoid control structures like if/for/while/switch/catch
712
- const methodMatch = trimmed.match(/(?:public|protected|private)\s+(?:[\w<>?\[\]]+\s+)(\w+)\s*\(/);
713
- const isControlFlow = /^(if|for|while|switch|catch|try)\b/.test(trimmed);
714
- if (methodMatch && !isControlFlow && !isComment) {
715
- // if we are inside a class, this is a method chunk
716
- if (braceDepth === chunkBaseDepth + 1) { // Direct member of class
717
- // Previous logical block (fields, etc) ends here
718
- if (currentChunk.length > annotations.length) {
719
- const content = currentChunk.join("\n");
720
- chunks.push({
721
- id: generateChunkId(options.filePath, content, chunkStartLine),
722
- filePath: options.filePath,
723
- content,
724
- startLine: chunkStartLine,
725
- endLine: i,
726
- chunkType: "block",
727
- name: className, // Context of class
728
- language: options.language,
729
- context
730
- });
731
- }
732
- currentChunk = [...annotations];
733
- chunkStartLine = i + 1 - annotations.length;
734
- methodName = methodMatch[1];
735
- inMethod = true;
736
- annotations = [];
737
- }
738
- }
739
- currentChunk.push(line);
740
- braceDepth += openBraces - closeBraces;
741
- // Check if block ended (method or class)
742
- // We close the chunk if we return to the depth where we started THIS chunk
743
- // But we need to handle the case where we just closed the class itself
744
- // Logic: If we are in a method, and brace depth returns to class level -> method closed
745
- if (inMethod && braceDepth === chunkBaseDepth + 1 && closeBraces > 0) {
746
- const content = currentChunk.join("\n");
747
- chunks.push({
748
- id: generateChunkId(options.filePath, content, chunkStartLine),
749
- filePath: options.filePath,
750
- content,
751
- startLine: chunkStartLine,
752
- endLine: i + 1,
753
- chunkType: "method",
754
- name: methodName,
755
- language: options.language,
756
- context
757
- });
758
- currentChunk = [];
759
- inMethod = false;
760
- methodName = undefined;
761
- chunkStartLine = i + 2;
762
- }
763
- // If brace depth returns to chunkBaseDepth -> class closed
764
- else if (inClass && braceDepth === chunkBaseDepth && closeBraces > 0) {
765
- const content = currentChunk.join("\n");
766
- chunks.push({
767
- id: generateChunkId(options.filePath, content, chunkStartLine),
768
- filePath: options.filePath,
769
- content,
770
- startLine: chunkStartLine,
771
- endLine: i + 1,
772
- chunkType: "class",
773
- name: className,
774
- language: options.language,
775
- context
776
- });
777
- currentChunk = [];
778
- inClass = false;
779
- className = undefined;
780
- chunkStartLine = i + 2;
781
- }
782
- // Safety break for very large chunks
783
- if (currentChunk.join("\n").length > (options.maxChunkSize * 3)) {
784
- // If a single method is massive, we have to split it.
785
- // enforceTokenLimits will handle strict splitting, but we should probably
786
- // force a commit here to avoid memory pressure if it's crazy huge
787
- }
788
- if (closeBraces > 0 && annotations.length > 0)
789
- chunks.push(...[]); // no-op just to use variable
790
- if (openBraces > 0)
791
- annotations = []; // Clear annotations if we opened a brace (they were consumed)
792
- }
793
- // Remaining content
794
- if (currentChunk.length > 0) {
795
- const content = currentChunk.join("\n");
796
- if (content.trim().length > 0) {
797
- chunks.push({
798
- id: generateChunkId(options.filePath, content, chunkStartLine),
799
- filePath: options.filePath,
800
- content,
801
- startLine: chunkStartLine,
802
- endLine: lines.length,
803
- chunkType: "file",
804
- language: options.language,
805
- context
806
- });
807
- }
808
- }
809
- // Fallback if regex failed to find anything
810
- if (chunks.length === 0) {
811
- return chunkByFixedSize(options);
812
- }
813
- return chunks;
429
+ // Enforce token limits on all chunks
430
+ return enforceTokenLimits(chunks, options.maxTokens, options.chunkOverlapTokens);
814
431
  }
815
432
  /**
816
- * Chunks code by fixed size with overlap
433
+ * Chunks code by token count with overlap (replacement for chunkByFixedSize)
817
434
  */
818
- function chunkByFixedSize(options) {
435
+ function chunkByTokens(options) {
819
436
  const chunks = [];
820
437
  const lines = options.content.split("\n");
821
438
  const context = extractContext(options.content, options.language);
822
439
  let currentLines = [];
823
- let currentSize = 0;
440
+ let currentTokens = 0;
824
441
  let chunkStartLine = 1;
825
442
  for (let i = 0; i < lines.length; i++) {
826
443
  const line = lines[i];
827
- currentLines.push(line);
828
- currentSize += line.length + 1; // +1 for newline
829
- // If we've reached max chunk size
830
- if (currentSize >= options.maxChunkSize) {
444
+ const lineTokens = countTokens(line + "\n");
445
+ // If we've reached max tokens
446
+ if (currentTokens + lineTokens > options.maxTokens && currentLines.length > 0) {
831
447
  const content = currentLines.join("\n");
448
+ const actualTokens = countTokens(content);
832
449
  chunks.push({
833
450
  id: generateChunkId(options.filePath, content, chunkStartLine),
834
451
  filePath: options.filePath,
835
452
  content,
836
453
  startLine: chunkStartLine,
837
- endLine: i + 1,
454
+ endLine: i,
838
455
  chunkType: "block",
839
456
  language: options.language,
840
457
  context,
458
+ tokenCount: actualTokens,
841
459
  });
842
- // Calculate overlap
843
- const overlapLines = Math.floor(options.chunkOverlap / 50); // Approximate lines
844
- currentLines = currentLines.slice(-overlapLines);
845
- currentSize = currentLines.reduce((sum, l) => sum + l.length + 1, 0);
846
- chunkStartLine = i + 1 - overlapLines + 1;
460
+ // Calculate overlap in lines (approximate)
461
+ let overlapLines = [];
462
+ let overlapTokenCount = 0;
463
+ for (let j = currentLines.length - 1; j >= 0 && overlapTokenCount < options.chunkOverlapTokens; j--) {
464
+ overlapLines.unshift(currentLines[j]);
465
+ overlapTokenCount += countTokens(currentLines[j] + "\n");
466
+ }
467
+ currentLines = [...overlapLines, line];
468
+ currentTokens = overlapTokenCount + lineTokens;
469
+ chunkStartLine = i + 1 - overlapLines.length;
470
+ }
471
+ else {
472
+ currentLines.push(line);
473
+ currentTokens += lineTokens;
847
474
  }
848
475
  }
849
476
  // Add remaining content as final chunk
850
477
  if (currentLines.length > 0) {
851
478
  const content = currentLines.join("\n");
479
+ const actualTokens = countTokens(content);
852
480
  chunks.push({
853
481
  id: generateChunkId(options.filePath, content, chunkStartLine),
854
482
  filePath: options.filePath,
@@ -858,10 +486,18 @@ function chunkByFixedSize(options) {
858
486
  chunkType: "block",
859
487
  language: options.language,
860
488
  context,
489
+ tokenCount: actualTokens,
861
490
  });
862
491
  }
863
492
  return chunks;
864
493
  }
494
+ /**
495
+ * Legacy function for backwards compatibility
496
+ * @deprecated Use chunkByTokens instead
497
+ */
498
+ function chunkByFixedSize(options) {
499
+ return chunkByTokens(options);
500
+ }
865
501
  /**
866
502
  * Main chunking function - routes to appropriate strategy based on language
867
503
  */
@@ -870,50 +506,46 @@ export function chunkCode(options) {
870
506
  filePath: options.filePath,
871
507
  content: options.content,
872
508
  language: options.language,
509
+ maxTokens: options.maxTokens || MAX_TOKENS_PER_CHUNK,
510
+ chunkOverlapTokens: options.chunkOverlapTokens || DEFAULT_CHUNK_OVERLAP_TOKENS,
511
+ // Legacy options mapping
873
512
  maxChunkSize: options.maxChunkSize || 1000,
874
513
  chunkOverlap: options.chunkOverlap || 200,
875
514
  };
876
- // Force fixed-size chunking for minified files to prevent context length errors
877
- if (fullOptions.filePath.includes(".min.")) {
878
- const rawChunks = chunkByFixedSize(fullOptions);
879
- return enforceTokenLimits(rawChunks);
880
- }
881
515
  // Route to appropriate chunking strategy
882
- let chunks = [];
883
516
  if (fullOptions.language === "typescript" || fullOptions.language === "javascript") {
884
- chunks = chunkTypeScriptJavaScript(fullOptions);
517
+ return chunkTypeScriptJavaScript(fullOptions);
885
518
  }
886
519
  else if (fullOptions.language === "python") {
887
- chunks = chunkPython(fullOptions);
888
- }
889
- else if (["html", "vue", "svelte"].includes(fullOptions.language)) {
890
- chunks = chunkHtml(fullOptions);
891
- }
892
- else if (["css", "scss", "sass", "less"].includes(fullOptions.language)) {
893
- chunks = chunkCss(fullOptions);
894
- }
895
- else if (fullOptions.language === "json") {
896
- chunks = chunkJson(fullOptions);
897
- }
898
- else if (fullOptions.language === "java") {
899
- chunks = chunkJava(fullOptions);
520
+ return chunkPython(fullOptions);
900
521
  }
901
522
  else {
902
- // For other languages, use fixed-size chunking
903
- chunks = chunkByFixedSize(fullOptions);
523
+ // For other languages, use token-based chunking
524
+ return chunkByTokens(fullOptions);
904
525
  }
905
- return enforceTokenLimits(chunks);
906
526
  }
907
527
  /**
908
528
  * Chunks a file by reading it from disk
909
529
  */
910
- export function chunkFile(filePath, language, maxChunkSize, chunkOverlap) {
530
+ export function chunkFile(filePath, language, maxTokens, chunkOverlapTokens) {
911
531
  const content = fs.readFileSync(filePath, "utf-8");
912
532
  return chunkCode({
913
533
  filePath,
914
534
  content,
915
535
  language,
916
- maxChunkSize,
917
- chunkOverlap,
536
+ maxTokens,
537
+ chunkOverlapTokens,
918
538
  });
919
539
  }
540
+ /**
541
+ * Utility to check if content would fit in a single embedding
542
+ */
543
+ export function wouldFitInSingleEmbedding(content, maxTokens = MAX_TOKENS_PER_CHUNK) {
544
+ return countTokens(content) <= maxTokens;
545
+ }
546
+ /**
547
+ * Get the maximum tokens allowed per chunk
548
+ */
549
+ export function getMaxTokensPerChunk() {
550
+ return MAX_TOKENS_PER_CHUNK;
551
+ }