@softerist/heuristic-mcp 2.1.47 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. package/.agent/workflows/code-review.md +60 -0
  2. package/.prettierrc +7 -0
  3. package/ARCHITECTURE.md +105 -170
  4. package/CONTRIBUTING.md +32 -113
  5. package/GEMINI.md +73 -0
  6. package/LICENSE +21 -21
  7. package/README.md +161 -54
  8. package/config.json +876 -75
  9. package/debug-pids.js +27 -0
  10. package/eslint.config.js +36 -0
  11. package/features/ann-config.js +37 -26
  12. package/features/clear-cache.js +28 -19
  13. package/features/find-similar-code.js +142 -66
  14. package/features/hybrid-search.js +253 -93
  15. package/features/index-codebase.js +1455 -394
  16. package/features/lifecycle.js +813 -180
  17. package/features/register.js +58 -52
  18. package/index.js +450 -306
  19. package/lib/cache-ops.js +22 -0
  20. package/lib/cache-utils.js +68 -0
  21. package/lib/cache.js +1392 -587
  22. package/lib/call-graph.js +165 -50
  23. package/lib/cli.js +154 -0
  24. package/lib/config.js +462 -121
  25. package/lib/embedding-process.js +77 -0
  26. package/lib/embedding-worker.js +545 -30
  27. package/lib/ignore-patterns.js +61 -59
  28. package/lib/json-worker.js +14 -0
  29. package/lib/json-writer.js +344 -0
  30. package/lib/logging.js +88 -0
  31. package/lib/memory-logger.js +13 -0
  32. package/lib/project-detector.js +13 -17
  33. package/lib/server-lifecycle.js +38 -0
  34. package/lib/settings-editor.js +645 -0
  35. package/lib/tokenizer.js +207 -104
  36. package/lib/utils.js +273 -198
  37. package/lib/vector-store-binary.js +592 -0
  38. package/mcp_config.example.json +13 -0
  39. package/package.json +13 -2
  40. package/scripts/clear-cache.js +6 -17
  41. package/scripts/download-model.js +14 -9
  42. package/scripts/postinstall.js +5 -5
  43. package/search-configs.js +36 -0
  44. package/test/ann-config.test.js +179 -0
  45. package/test/ann-fallback.test.js +6 -6
  46. package/test/binary-store.test.js +69 -0
  47. package/test/cache-branches.test.js +120 -0
  48. package/test/cache-errors.test.js +264 -0
  49. package/test/cache-extra.test.js +300 -0
  50. package/test/cache-helpers.test.js +205 -0
  51. package/test/cache-hnsw-failure.test.js +40 -0
  52. package/test/cache-json-worker.test.js +190 -0
  53. package/test/cache-worker.test.js +102 -0
  54. package/test/cache.test.js +443 -0
  55. package/test/call-graph.test.js +103 -4
  56. package/test/clear-cache.test.js +69 -68
  57. package/test/code-review-workflow.test.js +50 -0
  58. package/test/config.test.js +418 -0
  59. package/test/coverage-gap.test.js +497 -0
  60. package/test/coverage-maximizer.test.js +236 -0
  61. package/test/debug-analysis.js +107 -0
  62. package/test/embedding-model.test.js +173 -103
  63. package/test/embedding-worker-extra.test.js +272 -0
  64. package/test/embedding-worker.test.js +158 -0
  65. package/test/features.test.js +139 -0
  66. package/test/final-boost.test.js +271 -0
  67. package/test/final-polish.test.js +183 -0
  68. package/test/final.test.js +95 -0
  69. package/test/find-similar-code.test.js +191 -0
  70. package/test/helpers.js +92 -11
  71. package/test/helpers.test.js +46 -0
  72. package/test/hybrid-search-basic.test.js +62 -0
  73. package/test/hybrid-search-branch.test.js +202 -0
  74. package/test/hybrid-search-callgraph.test.js +229 -0
  75. package/test/hybrid-search-extra.test.js +81 -0
  76. package/test/hybrid-search.test.js +484 -71
  77. package/test/index-cli.test.js +520 -0
  78. package/test/index-codebase-batch.test.js +119 -0
  79. package/test/index-codebase-branches.test.js +585 -0
  80. package/test/index-codebase-core.test.js +1032 -0
  81. package/test/index-codebase-edge-cases.test.js +254 -0
  82. package/test/index-codebase-errors.test.js +132 -0
  83. package/test/index-codebase-gap.test.js +239 -0
  84. package/test/index-codebase-lines.test.js +151 -0
  85. package/test/index-codebase-watcher.test.js +259 -0
  86. package/test/index-codebase-zone.test.js +259 -0
  87. package/test/index-codebase.test.js +371 -69
  88. package/test/index-memory.test.js +220 -0
  89. package/test/indexer-detailed.test.js +176 -0
  90. package/test/integration.test.js +148 -92
  91. package/test/json-worker.test.js +50 -0
  92. package/test/lifecycle.test.js +541 -0
  93. package/test/master.test.js +198 -0
  94. package/test/perfection.test.js +349 -0
  95. package/test/project-detector.test.js +65 -0
  96. package/test/register.test.js +262 -0
  97. package/test/tokenizer.test.js +55 -93
  98. package/test/ultra-maximizer.test.js +116 -0
  99. package/test/utils-branches.test.js +161 -0
  100. package/test/utils-extra.test.js +116 -0
  101. package/test/utils.test.js +131 -0
  102. package/test/verify_fixes.js +76 -0
  103. package/test/worker-errors.test.js +96 -0
  104. package/test/worker-init.test.js +102 -0
  105. package/test/worker_throttling.test.js +93 -0
  106. package/tools/scripts/benchmark-search.js +95 -0
  107. package/tools/scripts/cache-stats.js +71 -0
  108. package/tools/scripts/manual-search.js +34 -0
  109. package/vitest.config.js +19 -9
package/lib/utils.js CHANGED
@@ -1,31 +1,35 @@
1
- import crypto from "crypto";
2
- import path from "path";
3
- import { estimateTokens, getChunkingParams, getModelTokenLimit } from "./tokenizer.js";
1
+ import crypto from 'crypto';
2
+ import path from 'path';
3
+ import { estimateTokens, getChunkingParams } from './tokenizer.js';
4
4
 
5
5
  // Re-export tokenizer utilities
6
- export { estimateTokens, getChunkingParams, getModelTokenLimit, MODEL_TOKEN_LIMITS } from "./tokenizer.js";
7
-
8
- /**
9
- * Calculate cosine similarity between two vectors
10
- */
11
- export function cosineSimilarity(a, b) {
12
- let dot = 0, normA = 0, normB = 0;
13
- for (let i = 0; i < a.length; i++) {
14
- dot += a[i] * b[i];
15
- normA += a[i] * a[i];
16
- normB += b[i] * b[i];
17
- }
18
- return dot / (Math.sqrt(normA) * Math.sqrt(normB));
19
- }
6
+ export {
7
+ estimateTokens,
8
+ getChunkingParams,
9
+ getModelTokenLimit,
10
+ MODEL_TOKEN_LIMITS,
11
+ } from './tokenizer.js';
20
12
 
21
13
  /**
22
14
  * Fast similarity for normalized vectors (dot product)
23
15
  */
24
16
  export function dotSimilarity(a, b) {
17
+ if (a.length !== b.length) return 0;
25
18
  let dot = 0;
26
- for (let i = 0; i < a.length; i++) {
19
+ let i = 0;
20
+ const len = a.length;
21
+ const m = len % 4;
22
+
23
+ while (i < m) {
27
24
  dot += a[i] * b[i];
25
+ i++;
26
+ }
27
+
28
+ while (i < len) {
29
+ dot += a[i] * b[i] + a[i + 1] * b[i + 1] + a[i + 2] * b[i + 2] + a[i + 3] * b[i + 3];
30
+ i += 4;
28
31
  }
32
+
29
33
  return dot;
30
34
  }
31
35
 
@@ -33,9 +37,118 @@ export function dotSimilarity(a, b) {
33
37
  * Generate hash for file content to detect changes
34
38
  */
35
39
  export function hashContent(content) {
36
- return crypto.createHash("md5").update(content).digest("hex");
40
+ return crypto.createHash('md5').update(content).digest('hex');
37
41
  }
38
42
 
43
+ // Language-specific patterns for function/class detection
44
+ const patterns = {
45
+ // JavaScript/TypeScript
46
+ js: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
47
+ jsx: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
48
+ ts: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
49
+ tsx: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
50
+ mjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
51
+ cjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
52
+
53
+ // Python
54
+ py: /^(class|def|async\s+def)\s+\w+/,
55
+ pyw: /^(class|def|async\s+def)\s+\w+/,
56
+ pyx: /^(cdef|cpdef|def|class)\s+\w+/, // Cython
57
+
58
+ // Java/Kotlin/Scala
59
+ java: /^(public|private|protected)?\s*(static\s+)?(class|interface|enum|void|int|String|boolean)\s+\w+/,
60
+ kt: /^(class|interface|object|fun|val|var)\s+\w+/,
61
+ kts: /^(class|interface|object|fun|val|var)\s+\w+/,
62
+ scala: /^(class|object|trait|def|val|var)\s+\w+/,
63
+
64
+ // C/C++
65
+ c: /^(struct|enum|union|void|int|char|float|double)\s+\w+/,
66
+ cpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
67
+ cc: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
68
+ cxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
69
+ h: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
70
+ hpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
71
+ hxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
72
+
73
+ // C#
74
+ cs: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
75
+ csx: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
76
+
77
+ // Go
78
+ go: /^(func|type|const|var)\s+\w+/,
79
+
80
+ // Rust
81
+ rs: /^(pub\s+)?(fn|struct|enum|trait|impl|const|static|mod)\s+\w+/,
82
+
83
+ // PHP
84
+ php: /^(class|interface|trait|function|const)\s+\w+/,
85
+ phtml: /^(<\?php|class|interface|trait|function)\s*/,
86
+
87
+ // Ruby
88
+ rb: /^(class|module|def)\s+\w+/,
89
+ rake: /^(class|module|def|task|namespace)\s+\w+/,
90
+
91
+ // Swift
92
+ swift: /^(class|struct|enum|protocol|func|var|let|extension)\s+\w+/,
93
+
94
+ // R
95
+ r: /^(\w+)\s*(<-|=)\s*function/,
96
+ R: /^(\w+)\s*(<-|=)\s*function/,
97
+
98
+ // Lua
99
+ lua: /^(function|local\s+function)\s+\w+/,
100
+
101
+ // Shell scripts
102
+ sh: /^(\w+\s*\(\)|function\s+\w+)/,
103
+ bash: /^(\w+\s*\(\)|function\s+\w+)/,
104
+ zsh: /^(\w+\s*\(\)|function\s+\w+)/,
105
+ fish: /^function\s+\w+/,
106
+
107
+ // CSS/Styles
108
+ css: /^(\.|#|@media|@keyframes|@font-face|\w+)\s*[{,]/,
109
+ scss: /^(\$\w+:|@mixin|@function|@include|\.|#|@media)\s*/,
110
+ sass: /^(\$\w+:|=\w+|\+\w+|\.|#|@media)\s*/,
111
+ less: /^(@\w+:|\.|#|@media)\s*/,
112
+ styl: /^(\$\w+\s*=|\w+\(|\.|#)\s*/,
113
+
114
+ // Markup/HTML
115
+ html: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
116
+ htm: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
117
+ xml: /^(<\w+|\s*<!\[CDATA\[)/,
118
+ svg: /^(<svg|<g|<path|<defs|<symbol)\b/,
119
+
120
+ // Config files
121
+ json: /^(\s*"[\w-]+"\s*:\s*[[{])/,
122
+ yaml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
123
+ yml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
124
+ toml: /^(\[\[?\w+\]?\]?|\w+\s*=)/,
125
+ ini: /^(\[\w+\]|\w+\s*=)/,
126
+ env: /^[A-Z_][A-Z0-9_]*=/,
127
+
128
+ // Makefile
129
+ makefile: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,
130
+ mk: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,
131
+
132
+ // Docker
133
+ dockerfile: /^(FROM|RUN|CMD|LABEL|EXPOSE|ENV|ADD|COPY|ENTRYPOINT|VOLUME|USER|WORKDIR|ARG|ONBUILD|STOPSIGNAL|HEALTHCHECK|SHELL)\s+/i,
134
+
135
+ // Documentation
136
+ md: /^(#{1,6}\s+|```|\*{3}|_{3})/,
137
+ mdx: /^(#{1,6}\s+|```|import\s+|export\s+)/,
138
+ txt: /^.{50,}/, // Split on long paragraphs
139
+ rst: /^(={3,}|-{3,}|~{3,}|\.\.\s+\w+::)/,
140
+
141
+ // Database
142
+ sql: /^(CREATE|ALTER|INSERT|UPDATE|DELETE|SELECT|DROP|GRANT|REVOKE|WITH|DECLARE|BEGIN|END)\s+/i,
143
+
144
+ // Perl
145
+ pl: /^(sub|package|use|require)\s+\w+/,
146
+ pm: /^(sub|package|use|require)\s+\w+/,
147
+
148
+ // Vim
149
+ vim: /^(function|command|autocmd|let\s+g:)\s*/,
150
+ };
151
+
39
152
  /**
40
153
  * Intelligent chunking with token limit awareness
41
154
  * Tries to split by function/class boundaries while respecting token limits
@@ -46,118 +159,29 @@ export function hashContent(content) {
46
159
  * @returns {Array<{text: string, startLine: number, endLine: number, tokenCount: number}>}
47
160
  */
48
161
  export function smartChunk(content, file, config) {
49
- const lines = content.split("\n");
162
+ const lines = content.split('\n');
50
163
  const chunks = [];
51
- const ext = path.extname(file);
52
-
53
- // Get model-specific chunking parameters
54
- const { targetTokens, overlapTokens } = getChunkingParams(config.embeddingModel);
55
-
56
- // Language-specific patterns for function/class detection
57
- const patterns = {
58
- // JavaScript/TypeScript
59
- js: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
60
- jsx: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
61
- ts: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
62
- tsx: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
63
- mjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
64
- cjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
65
-
66
- // Python
67
- py: /^(class|def|async\s+def)\s+\w+/,
68
- pyw: /^(class|def|async\s+def)\s+\w+/,
69
- pyx: /^(cdef|cpdef|def|class)\s+\w+/, // Cython
70
-
71
- // Java/Kotlin/Scala
72
- java: /^(public|private|protected)?\s*(static\s+)?(class|interface|enum|void|int|String|boolean)\s+\w+/,
73
- kt: /^(class|interface|object|fun|val|var)\s+\w+/,
74
- kts: /^(class|interface|object|fun|val|var)\s+\w+/,
75
- scala: /^(class|object|trait|def|val|var)\s+\w+/,
76
-
77
- // C/C++
78
- c: /^(struct|enum|union|void|int|char|float|double)\s+\w+/,
79
- cpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
80
- cc: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
81
- cxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
82
- h: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
83
- hpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
84
- hxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
85
-
86
- // C#
87
- cs: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
88
- csx: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
89
-
90
- // Go
91
- go: /^(func|type|const|var)\s+\w+/,
92
-
93
- // Rust
94
- rs: /^(pub\s+)?(fn|struct|enum|trait|impl|const|static|mod)\s+\w+/,
95
-
96
- // PHP
97
- php: /^(class|interface|trait|function|const)\s+\w+/,
98
- phtml: /^(<\?php|class|interface|trait|function)\s*/,
99
-
100
- // Ruby
101
- rb: /^(class|module|def)\s+\w+/,
102
- rake: /^(class|module|def|task|namespace)\s+\w+/,
103
-
104
- // Swift
105
- swift: /^(class|struct|enum|protocol|func|var|let|extension)\s+\w+/,
106
-
107
- // R
108
- r: /^(\w+)\s*(<-|=)\s*function/,
109
- R: /^(\w+)\s*(<-|=)\s*function/,
110
-
111
- // Lua
112
- lua: /^(function|local\s+function)\s+\w+/,
113
-
114
- // Shell scripts
115
- sh: /^(\w+\s*\(\)|function\s+\w+)/,
116
- bash: /^(\w+\s*\(\)|function\s+\w+)/,
117
- zsh: /^(\w+\s*\(\)|function\s+\w+)/,
118
- fish: /^function\s+\w+/,
119
-
120
- // CSS/Styles
121
- css: /^(\.|#|@media|@keyframes|@font-face|\w+)\s*[{,]/,
122
- scss: /^(\$\w+:|@mixin|@function|@include|\.|#|@media)\s*/,
123
- sass: /^(\$\w+:|=\w+|\+\w+|\.|#|@media)\s*/,
124
- less: /^(@\w+:|\.|\#|@media)\s*/,
125
- styl: /^(\$\w+\s*=|\w+\(|\.|\#)\s*/,
126
-
127
- // Markup/HTML
128
- html: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
129
- htm: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
130
- xml: /^(<\w+|\s*<!\[CDATA\[)/,
131
- svg: /^(<svg|<g|<path|<defs|<symbol)\b/,
132
-
133
- // Config files
134
- json: /^(\s*"[\w-]+"\s*:\s*[\[{])/,
135
- yaml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
136
- yml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
137
- toml: /^(\[\[?\w+\]?\]?|\w+\s*=)/,
138
- ini: /^(\[\w+\]|\w+\s*=)/,
139
- env: /^[A-Z_][A-Z0-9_]*=/,
140
-
141
- // Documentation
142
- md: /^(#{1,6}\s+|```|\*{3}|_{3})/,
143
- mdx: /^(#{1,6}\s+|```|import\s+|export\s+)/,
144
- txt: /^.{50,}/, // Split on long paragraphs
145
- rst: /^(={3,}|-{3,}|~{3,}|\.\.\s+\w+::)/,
146
-
147
- // Database
148
- sql: /^(CREATE|ALTER|INSERT|UPDATE|DELETE|SELECT|DROP|GRANT|REVOKE|WITH|DECLARE|BEGIN|END)\s+/i,
149
-
150
- // Perl
151
- pl: /^(sub|package|use|require)\s+\w+/,
152
- pm: /^(sub|package|use|require)\s+\w+/,
153
-
154
- // Vim
155
- vim: /^(function|command|autocmd|let\s+g:)\s*/,
156
- };
157
-
158
- const langPattern = patterns[ext.slice(1)] || patterns.js;
164
+ const ext = path.extname(file).toLowerCase();
165
+ const base = path.basename(file).toLowerCase();
166
+
167
+ // Get model-specific chunking parameters with optional user overrides
168
+ let { maxTokens, targetTokens, overlapTokens } = getChunkingParams(config.embeddingModel);
169
+ if (config.maxTokens) maxTokens = config.maxTokens;
170
+ if (config.targetTokens) targetTokens = config.targetTokens;
171
+ if (config.overlapTokens) overlapTokens = config.overlapTokens;
172
+
173
+ let langPattern = patterns[ext.slice(1)];
174
+ if (!langPattern) {
175
+ if (base === 'dockerfile') langPattern = patterns.dockerfile;
176
+ else if (base === 'makefile') langPattern = patterns.makefile;
177
+ else if (base.startsWith('.env')) langPattern = patterns.env;
178
+ }
179
+ if (!langPattern || typeof langPattern.test !== 'function') {
180
+ langPattern = patterns.js; // Default fallback
181
+ }
159
182
  let currentChunk = [];
160
183
  let chunkStartLine = 0;
184
+
161
185
  let currentTokenCount = 0;
162
186
 
163
187
  // Track bracket depth for better boundary detection
@@ -168,97 +192,145 @@ export function smartChunk(content, file, config) {
168
192
  let inComment = false;
169
193
  let stringChar = null; // ' or " or `
170
194
 
195
+ const splitOversizedLine = (line, lineTokens) => {
196
+ const charsPerToken = line.length / Math.max(1, lineTokens);
197
+ const segmentSize = Math.max(100, Math.floor(charsPerToken * targetTokens)); // Min 100 chars
198
+ const segments = [];
199
+
200
+ for (let start = 0; start < line.length; start += segmentSize) {
201
+ segments.push(line.slice(start, start + segmentSize));
202
+ }
203
+
204
+ return segments;
205
+ };
206
+
171
207
  for (let i = 0; i < lines.length; i++) {
172
208
  const line = lines[i];
173
209
  const lineTokens = estimateTokens(line);
174
- const trimmed = line.trim();
210
+
211
+ let j = 0;
175
212
 
176
213
  // Simple state tracking for heuristics (not a full parser)
177
214
  if (inComment) {
178
215
  // Look for end of block comment
179
- if (line.includes('*/')) {
180
- const parts = line.split('*/');
181
- // If there's content after the comment, process it (simplified)
182
- if (parts[parts.length - 1].trim().length > 0) {
183
- inComment = false;
184
- // Recursive call or continue logic would be better, but for heuristic this is fine
185
- // We just assume the line is mixed and skip granular checks
186
- } else {
187
- inComment = false;
188
- }
216
+ const endIdx = line.indexOf('*/');
217
+ if (endIdx !== -1) {
218
+ inComment = false;
219
+ j = endIdx + 2;
220
+ } else {
221
+ // Skip whole line
222
+ j = line.length;
189
223
  }
190
- } else {
191
- for (let j = 0; j < line.length; j++) {
192
- const char = line[j];
193
- const nextChar = line[j+1];
194
-
195
- if (inString) {
196
- if (char === '\\') {
197
- j++; // Skip escaped char
198
- } else if (char === stringChar) {
199
- inString = false;
200
- stringChar = null;
201
- }
202
- } else {
203
- // Check for comment start
204
- if (char === '/' && nextChar === '*') {
205
- inComment = true;
206
- j++;
207
- // Check if it ends on same line
208
- if (line.indexOf('*/', j) !== -1) {
209
- inComment = false;
210
- j = line.indexOf('*/', j) + 1;
211
- } else {
212
- break; // Rest of line is comment
213
- }
214
- } else if (char === '/' && nextChar === '/') {
215
- break; // Skip rest of line (line comment)
216
- } else if (char === '\'' || char === '"' || char === '`') {
217
- inString = true;
218
- stringChar = char;
224
+ }
225
+
226
+ const scanLine = j < line.length ? line.slice(j) : '';
227
+ const trimmed = scanLine.trim();
228
+
229
+ for (; j < line.length; j++) {
230
+ const char = line[j];
231
+ const nextChar = line[j + 1];
232
+
233
+ if (inString) {
234
+ if (char === '\\') {
235
+ j++; // Skip escaped char
236
+ } else if (char === stringChar) {
237
+ inString = false;
238
+ stringChar = null;
239
+ }
240
+ } else {
241
+ // Check for comment start
242
+ if (char === '/' && nextChar === '*') {
243
+ inComment = true;
244
+ j++;
245
+ // Check if it ends on same line
246
+ const endIdx = line.indexOf('*/', j);
247
+ if (endIdx !== -1) {
248
+ inComment = false;
249
+ j = endIdx + 1;
219
250
  } else {
220
- // Only count brackets if not in string or comment
221
- if (char === '{') braceDepth++;
222
- else if (char === '}') braceDepth = Math.max(0, braceDepth - 1);
223
- else if (char === '[') bracketDepth++;
224
- else if (char === ']') bracketDepth = Math.max(0, bracketDepth - 1);
225
- else if (char === '(') parenDepth++;
226
- else if (char === ')') parenDepth = Math.max(0, parenDepth - 1);
251
+ break; // Rest of line is comment
227
252
  }
253
+ } else if (char === '/' && nextChar === '/') {
254
+ break; // Skip rest of line (line comment)
255
+ } else if (char === "'" || char === '"' || char === '`') {
256
+ inString = true;
257
+ stringChar = char;
258
+ } else {
259
+ // Only count brackets if not in string or comment
260
+ if (char === '{') braceDepth++;
261
+ else if (char === '}') braceDepth = Math.max(0, braceDepth - 1);
262
+ else if (char === '[') bracketDepth++;
263
+ else if (char === ']') bracketDepth = Math.max(0, bracketDepth - 1);
264
+ else if (char === '(') parenDepth++;
265
+ else if (char === ')') parenDepth = Math.max(0, parenDepth - 1);
228
266
  }
229
267
  }
230
268
  }
231
269
 
270
+ // Split lines that are too large to ever fit in a single chunk
271
+ if (lineTokens > maxTokens) {
272
+ if (currentChunk.length > 0) {
273
+ const chunkText = currentChunk.join('\n');
274
+ if (chunkText.trim().length > 20) {
275
+ chunks.push({
276
+ text: chunkText,
277
+ startLine: chunkStartLine + 1,
278
+ endLine: i,
279
+ tokenCount: currentTokenCount,
280
+ });
281
+ }
282
+ }
283
+
284
+ const parts = splitOversizedLine(line, lineTokens);
285
+ for (const part of parts) {
286
+ if (part.trim().length <= 20) continue;
287
+ chunks.push({
288
+ text: part,
289
+ startLine: i + 1,
290
+ endLine: i + 1,
291
+ tokenCount: estimateTokens(part),
292
+ });
293
+ }
294
+
295
+ currentChunk = [];
296
+ currentTokenCount = 0;
297
+ chunkStartLine = i + 1;
298
+ continue;
299
+ }
300
+
232
301
  // Check if adding this line would exceed token limit
233
- const wouldExceedLimit = (currentTokenCount + lineTokens) > targetTokens;
302
+ const wouldExceedLimit = currentTokenCount + lineTokens > targetTokens;
234
303
 
235
304
  // Check if this is a good split point using multiple heuristics
236
305
  const matchesPattern = langPattern.test(trimmed);
237
- const atTopLevel = braceDepth === 0 && bracketDepth === 0 && parenDepth === 0 && !inString && !inComment;
238
- const startsAtColumn0 = line.length > 0 && /^\S/.test(line);
306
+ const atTopLevel =
307
+ braceDepth === 0 && bracketDepth === 0 && parenDepth === 0 && !inString && !inComment;
308
+ const startsAtColumn0 = scanLine.length > 0 && /^\S/.test(scanLine);
239
309
  const isEmptyLine = trimmed.length === 0;
240
- const prevWasEmpty = i > 0 && currentChunk.length > 0 && currentChunk[currentChunk.length - 1].trim().length === 0;
241
- const isCommentStart = /^\s*(\/\*\*|\/\/\s*[-=]{3,}|#\s*[-=]{3,})/.test(line);
310
+ const prevWasEmpty =
311
+ i > 0 && currentChunk.length > 0 && currentChunk[currentChunk.length - 1].trim().length === 0;
312
+ const isCommentStart = /^\s*(\/\*\*|\/\/\s*[-=]{3,}|#\s*[-=]{3,})/.test(scanLine);
242
313
 
243
- const isGoodSplitPoint = currentChunk.length > 3 && (
244
- (matchesPattern && (atTopLevel || braceDepth <= 1)) ||
245
- (atTopLevel && startsAtColumn0 && !isEmptyLine) ||
246
- (prevWasEmpty && (matchesPattern || isCommentStart))
247
- );
314
+ const isGoodSplitPoint =
315
+ currentChunk.length > 3 &&
316
+ ((matchesPattern && (atTopLevel || braceDepth <= 1)) ||
317
+ (atTopLevel && startsAtColumn0 && !isEmptyLine) ||
318
+ (prevWasEmpty && (matchesPattern || isCommentStart)));
248
319
 
249
- const shouldSplit = wouldExceedLimit || (isGoodSplitPoint && currentTokenCount > targetTokens * 0.6);
320
+ const shouldSplit =
321
+ wouldExceedLimit || (isGoodSplitPoint && currentTokenCount > targetTokens * 0.6);
250
322
 
251
323
  // Avoid splitting in weird states if possible
252
324
  const safeToSplit = (braceDepth <= 1 && !inString) || wouldExceedLimit;
253
325
 
254
326
  if (shouldSplit && safeToSplit && currentChunk.length > 0) {
255
- const chunkText = currentChunk.join("\n");
327
+ const chunkText = currentChunk.join('\n');
256
328
  if (chunkText.trim().length > 20) {
257
329
  chunks.push({
258
330
  text: chunkText,
259
331
  startLine: chunkStartLine + 1,
260
332
  endLine: i,
261
- tokenCount: currentTokenCount
333
+ tokenCount: currentTokenCount,
262
334
  });
263
335
  }
264
336
 
@@ -282,19 +354,22 @@ export function smartChunk(content, file, config) {
282
354
 
283
355
  currentChunk.push(line);
284
356
  currentTokenCount += lineTokens;
357
+
358
+ if (chunks.length >= (config.maxChunksPerFile || 1000)) {
359
+ // Hard limit to prevent memory explosion on minified/data files
360
+ break;
361
+ }
285
362
  }
286
363
 
287
364
  // Add remaining chunk
288
- if (currentChunk.length > 0) {
289
- const chunkText = currentChunk.join("\n");
290
- if (chunkText.trim().length > 20) {
291
- chunks.push({
292
- text: chunkText,
293
- startLine: chunkStartLine + 1,
294
- endLine: lines.length,
295
- tokenCount: currentTokenCount
296
- });
297
- }
365
+ const chunkText = currentChunk.join('\n');
366
+ if (chunkText.trim().length > 20) {
367
+ chunks.push({
368
+ text: chunkText,
369
+ startLine: chunkStartLine + 1,
370
+ endLine: lines.length,
371
+ tokenCount: currentTokenCount,
372
+ });
298
373
  }
299
374
 
300
375
  return chunks;