@softerist/heuristic-mcp 2.1.47 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent/workflows/code-review.md +60 -0
- package/.prettierrc +7 -0
- package/ARCHITECTURE.md +105 -170
- package/CONTRIBUTING.md +32 -113
- package/GEMINI.md +73 -0
- package/LICENSE +21 -21
- package/README.md +161 -54
- package/config.json +876 -75
- package/debug-pids.js +27 -0
- package/eslint.config.js +36 -0
- package/features/ann-config.js +37 -26
- package/features/clear-cache.js +28 -19
- package/features/find-similar-code.js +142 -66
- package/features/hybrid-search.js +253 -93
- package/features/index-codebase.js +1455 -394
- package/features/lifecycle.js +813 -180
- package/features/register.js +58 -52
- package/index.js +450 -306
- package/lib/cache-ops.js +22 -0
- package/lib/cache-utils.js +68 -0
- package/lib/cache.js +1392 -587
- package/lib/call-graph.js +165 -50
- package/lib/cli.js +154 -0
- package/lib/config.js +462 -121
- package/lib/embedding-process.js +77 -0
- package/lib/embedding-worker.js +545 -30
- package/lib/ignore-patterns.js +61 -59
- package/lib/json-worker.js +14 -0
- package/lib/json-writer.js +344 -0
- package/lib/logging.js +88 -0
- package/lib/memory-logger.js +13 -0
- package/lib/project-detector.js +13 -17
- package/lib/server-lifecycle.js +38 -0
- package/lib/settings-editor.js +645 -0
- package/lib/tokenizer.js +207 -104
- package/lib/utils.js +273 -198
- package/lib/vector-store-binary.js +592 -0
- package/mcp_config.example.json +13 -0
- package/package.json +13 -2
- package/scripts/clear-cache.js +6 -17
- package/scripts/download-model.js +14 -9
- package/scripts/postinstall.js +5 -5
- package/search-configs.js +36 -0
- package/test/ann-config.test.js +179 -0
- package/test/ann-fallback.test.js +6 -6
- package/test/binary-store.test.js +69 -0
- package/test/cache-branches.test.js +120 -0
- package/test/cache-errors.test.js +264 -0
- package/test/cache-extra.test.js +300 -0
- package/test/cache-helpers.test.js +205 -0
- package/test/cache-hnsw-failure.test.js +40 -0
- package/test/cache-json-worker.test.js +190 -0
- package/test/cache-worker.test.js +102 -0
- package/test/cache.test.js +443 -0
- package/test/call-graph.test.js +103 -4
- package/test/clear-cache.test.js +69 -68
- package/test/code-review-workflow.test.js +50 -0
- package/test/config.test.js +418 -0
- package/test/coverage-gap.test.js +497 -0
- package/test/coverage-maximizer.test.js +236 -0
- package/test/debug-analysis.js +107 -0
- package/test/embedding-model.test.js +173 -103
- package/test/embedding-worker-extra.test.js +272 -0
- package/test/embedding-worker.test.js +158 -0
- package/test/features.test.js +139 -0
- package/test/final-boost.test.js +271 -0
- package/test/final-polish.test.js +183 -0
- package/test/final.test.js +95 -0
- package/test/find-similar-code.test.js +191 -0
- package/test/helpers.js +92 -11
- package/test/helpers.test.js +46 -0
- package/test/hybrid-search-basic.test.js +62 -0
- package/test/hybrid-search-branch.test.js +202 -0
- package/test/hybrid-search-callgraph.test.js +229 -0
- package/test/hybrid-search-extra.test.js +81 -0
- package/test/hybrid-search.test.js +484 -71
- package/test/index-cli.test.js +520 -0
- package/test/index-codebase-batch.test.js +119 -0
- package/test/index-codebase-branches.test.js +585 -0
- package/test/index-codebase-core.test.js +1032 -0
- package/test/index-codebase-edge-cases.test.js +254 -0
- package/test/index-codebase-errors.test.js +132 -0
- package/test/index-codebase-gap.test.js +239 -0
- package/test/index-codebase-lines.test.js +151 -0
- package/test/index-codebase-watcher.test.js +259 -0
- package/test/index-codebase-zone.test.js +259 -0
- package/test/index-codebase.test.js +371 -69
- package/test/index-memory.test.js +220 -0
- package/test/indexer-detailed.test.js +176 -0
- package/test/integration.test.js +148 -92
- package/test/json-worker.test.js +50 -0
- package/test/lifecycle.test.js +541 -0
- package/test/master.test.js +198 -0
- package/test/perfection.test.js +349 -0
- package/test/project-detector.test.js +65 -0
- package/test/register.test.js +262 -0
- package/test/tokenizer.test.js +55 -93
- package/test/ultra-maximizer.test.js +116 -0
- package/test/utils-branches.test.js +161 -0
- package/test/utils-extra.test.js +116 -0
- package/test/utils.test.js +131 -0
- package/test/verify_fixes.js +76 -0
- package/test/worker-errors.test.js +96 -0
- package/test/worker-init.test.js +102 -0
- package/test/worker_throttling.test.js +93 -0
- package/tools/scripts/benchmark-search.js +95 -0
- package/tools/scripts/cache-stats.js +71 -0
- package/tools/scripts/manual-search.js +34 -0
- package/vitest.config.js +19 -9
package/lib/utils.js
CHANGED
|
@@ -1,31 +1,35 @@
|
|
|
1
|
-
import crypto from
|
|
2
|
-
import path from
|
|
3
|
-
import { estimateTokens, getChunkingParams
|
|
1
|
+
import crypto from 'crypto';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
import { estimateTokens, getChunkingParams } from './tokenizer.js';
|
|
4
4
|
|
|
5
5
|
// Re-export tokenizer utilities
|
|
6
|
-
export {
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
let dot = 0, normA = 0, normB = 0;
|
|
13
|
-
for (let i = 0; i < a.length; i++) {
|
|
14
|
-
dot += a[i] * b[i];
|
|
15
|
-
normA += a[i] * a[i];
|
|
16
|
-
normB += b[i] * b[i];
|
|
17
|
-
}
|
|
18
|
-
return dot / (Math.sqrt(normA) * Math.sqrt(normB));
|
|
19
|
-
}
|
|
6
|
+
export {
|
|
7
|
+
estimateTokens,
|
|
8
|
+
getChunkingParams,
|
|
9
|
+
getModelTokenLimit,
|
|
10
|
+
MODEL_TOKEN_LIMITS,
|
|
11
|
+
} from './tokenizer.js';
|
|
20
12
|
|
|
21
13
|
/**
|
|
22
14
|
* Fast similarity for normalized vectors (dot product)
|
|
23
15
|
*/
|
|
24
16
|
export function dotSimilarity(a, b) {
|
|
17
|
+
if (a.length !== b.length) return 0;
|
|
25
18
|
let dot = 0;
|
|
26
|
-
|
|
19
|
+
let i = 0;
|
|
20
|
+
const len = a.length;
|
|
21
|
+
const m = len % 4;
|
|
22
|
+
|
|
23
|
+
while (i < m) {
|
|
27
24
|
dot += a[i] * b[i];
|
|
25
|
+
i++;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
while (i < len) {
|
|
29
|
+
dot += a[i] * b[i] + a[i + 1] * b[i + 1] + a[i + 2] * b[i + 2] + a[i + 3] * b[i + 3];
|
|
30
|
+
i += 4;
|
|
28
31
|
}
|
|
32
|
+
|
|
29
33
|
return dot;
|
|
30
34
|
}
|
|
31
35
|
|
|
@@ -33,9 +37,118 @@ export function dotSimilarity(a, b) {
|
|
|
33
37
|
* Generate hash for file content to detect changes
|
|
34
38
|
*/
|
|
35
39
|
export function hashContent(content) {
|
|
36
|
-
return crypto.createHash(
|
|
40
|
+
return crypto.createHash('md5').update(content).digest('hex');
|
|
37
41
|
}
|
|
38
42
|
|
|
43
|
+
// Language-specific patterns for function/class detection
|
|
44
|
+
const patterns = {
|
|
45
|
+
// JavaScript/TypeScript
|
|
46
|
+
js: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
|
|
47
|
+
jsx: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
|
|
48
|
+
ts: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
|
|
49
|
+
tsx: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
|
|
50
|
+
mjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
|
|
51
|
+
cjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
|
|
52
|
+
|
|
53
|
+
// Python
|
|
54
|
+
py: /^(class|def|async\s+def)\s+\w+/,
|
|
55
|
+
pyw: /^(class|def|async\s+def)\s+\w+/,
|
|
56
|
+
pyx: /^(cdef|cpdef|def|class)\s+\w+/, // Cython
|
|
57
|
+
|
|
58
|
+
// Java/Kotlin/Scala
|
|
59
|
+
java: /^(public|private|protected)?\s*(static\s+)?(class|interface|enum|void|int|String|boolean)\s+\w+/,
|
|
60
|
+
kt: /^(class|interface|object|fun|val|var)\s+\w+/,
|
|
61
|
+
kts: /^(class|interface|object|fun|val|var)\s+\w+/,
|
|
62
|
+
scala: /^(class|object|trait|def|val|var)\s+\w+/,
|
|
63
|
+
|
|
64
|
+
// C/C++
|
|
65
|
+
c: /^(struct|enum|union|void|int|char|float|double)\s+\w+/,
|
|
66
|
+
cpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
|
|
67
|
+
cc: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
|
|
68
|
+
cxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
|
|
69
|
+
h: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
|
|
70
|
+
hpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
|
|
71
|
+
hxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
|
|
72
|
+
|
|
73
|
+
// C#
|
|
74
|
+
cs: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
|
|
75
|
+
csx: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
|
|
76
|
+
|
|
77
|
+
// Go
|
|
78
|
+
go: /^(func|type|const|var)\s+\w+/,
|
|
79
|
+
|
|
80
|
+
// Rust
|
|
81
|
+
rs: /^(pub\s+)?(fn|struct|enum|trait|impl|const|static|mod)\s+\w+/,
|
|
82
|
+
|
|
83
|
+
// PHP
|
|
84
|
+
php: /^(class|interface|trait|function|const)\s+\w+/,
|
|
85
|
+
phtml: /^(<\?php|class|interface|trait|function)\s*/,
|
|
86
|
+
|
|
87
|
+
// Ruby
|
|
88
|
+
rb: /^(class|module|def)\s+\w+/,
|
|
89
|
+
rake: /^(class|module|def|task|namespace)\s+\w+/,
|
|
90
|
+
|
|
91
|
+
// Swift
|
|
92
|
+
swift: /^(class|struct|enum|protocol|func|var|let|extension)\s+\w+/,
|
|
93
|
+
|
|
94
|
+
// R
|
|
95
|
+
r: /^(\w+)\s*(<-|=)\s*function/,
|
|
96
|
+
R: /^(\w+)\s*(<-|=)\s*function/,
|
|
97
|
+
|
|
98
|
+
// Lua
|
|
99
|
+
lua: /^(function|local\s+function)\s+\w+/,
|
|
100
|
+
|
|
101
|
+
// Shell scripts
|
|
102
|
+
sh: /^(\w+\s*\(\)|function\s+\w+)/,
|
|
103
|
+
bash: /^(\w+\s*\(\)|function\s+\w+)/,
|
|
104
|
+
zsh: /^(\w+\s*\(\)|function\s+\w+)/,
|
|
105
|
+
fish: /^function\s+\w+/,
|
|
106
|
+
|
|
107
|
+
// CSS/Styles
|
|
108
|
+
css: /^(\.|#|@media|@keyframes|@font-face|\w+)\s*[{,]/,
|
|
109
|
+
scss: /^(\$\w+:|@mixin|@function|@include|\.|#|@media)\s*/,
|
|
110
|
+
sass: /^(\$\w+:|=\w+|\+\w+|\.|#|@media)\s*/,
|
|
111
|
+
less: /^(@\w+:|\.|#|@media)\s*/,
|
|
112
|
+
styl: /^(\$\w+\s*=|\w+\(|\.|#)\s*/,
|
|
113
|
+
|
|
114
|
+
// Markup/HTML
|
|
115
|
+
html: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
|
|
116
|
+
htm: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
|
|
117
|
+
xml: /^(<\w+|\s*<!\[CDATA\[)/,
|
|
118
|
+
svg: /^(<svg|<g|<path|<defs|<symbol)\b/,
|
|
119
|
+
|
|
120
|
+
// Config files
|
|
121
|
+
json: /^(\s*"[\w-]+"\s*:\s*[[{])/,
|
|
122
|
+
yaml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
|
|
123
|
+
yml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
|
|
124
|
+
toml: /^(\[\[?\w+\]?\]?|\w+\s*=)/,
|
|
125
|
+
ini: /^(\[\w+\]|\w+\s*=)/,
|
|
126
|
+
env: /^[A-Z_][A-Z0-9_]*=/,
|
|
127
|
+
|
|
128
|
+
// Makefile
|
|
129
|
+
makefile: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,
|
|
130
|
+
mk: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,
|
|
131
|
+
|
|
132
|
+
// Docker
|
|
133
|
+
dockerfile: /^(FROM|RUN|CMD|LABEL|EXPOSE|ENV|ADD|COPY|ENTRYPOINT|VOLUME|USER|WORKDIR|ARG|ONBUILD|STOPSIGNAL|HEALTHCHECK|SHELL)\s+/i,
|
|
134
|
+
|
|
135
|
+
// Documentation
|
|
136
|
+
md: /^(#{1,6}\s+|```|\*{3}|_{3})/,
|
|
137
|
+
mdx: /^(#{1,6}\s+|```|import\s+|export\s+)/,
|
|
138
|
+
txt: /^.{50,}/, // Split on long paragraphs
|
|
139
|
+
rst: /^(={3,}|-{3,}|~{3,}|\.\.\s+\w+::)/,
|
|
140
|
+
|
|
141
|
+
// Database
|
|
142
|
+
sql: /^(CREATE|ALTER|INSERT|UPDATE|DELETE|SELECT|DROP|GRANT|REVOKE|WITH|DECLARE|BEGIN|END)\s+/i,
|
|
143
|
+
|
|
144
|
+
// Perl
|
|
145
|
+
pl: /^(sub|package|use|require)\s+\w+/,
|
|
146
|
+
pm: /^(sub|package|use|require)\s+\w+/,
|
|
147
|
+
|
|
148
|
+
// Vim
|
|
149
|
+
vim: /^(function|command|autocmd|let\s+g:)\s*/,
|
|
150
|
+
};
|
|
151
|
+
|
|
39
152
|
/**
|
|
40
153
|
* Intelligent chunking with token limit awareness
|
|
41
154
|
* Tries to split by function/class boundaries while respecting token limits
|
|
@@ -46,118 +159,29 @@ export function hashContent(content) {
|
|
|
46
159
|
* @returns {Array<{text: string, startLine: number, endLine: number, tokenCount: number}>}
|
|
47
160
|
*/
|
|
48
161
|
export function smartChunk(content, file, config) {
|
|
49
|
-
const lines = content.split(
|
|
162
|
+
const lines = content.split('\n');
|
|
50
163
|
const chunks = [];
|
|
51
|
-
const ext = path.extname(file);
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
pyx: /^(cdef|cpdef|def|class)\s+\w+/, // Cython
|
|
70
|
-
|
|
71
|
-
// Java/Kotlin/Scala
|
|
72
|
-
java: /^(public|private|protected)?\s*(static\s+)?(class|interface|enum|void|int|String|boolean)\s+\w+/,
|
|
73
|
-
kt: /^(class|interface|object|fun|val|var)\s+\w+/,
|
|
74
|
-
kts: /^(class|interface|object|fun|val|var)\s+\w+/,
|
|
75
|
-
scala: /^(class|object|trait|def|val|var)\s+\w+/,
|
|
76
|
-
|
|
77
|
-
// C/C++
|
|
78
|
-
c: /^(struct|enum|union|void|int|char|float|double)\s+\w+/,
|
|
79
|
-
cpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
|
|
80
|
-
cc: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
|
|
81
|
-
cxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
|
|
82
|
-
h: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
|
|
83
|
-
hpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
|
|
84
|
-
hxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
|
|
85
|
-
|
|
86
|
-
// C#
|
|
87
|
-
cs: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
|
|
88
|
-
csx: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
|
|
89
|
-
|
|
90
|
-
// Go
|
|
91
|
-
go: /^(func|type|const|var)\s+\w+/,
|
|
92
|
-
|
|
93
|
-
// Rust
|
|
94
|
-
rs: /^(pub\s+)?(fn|struct|enum|trait|impl|const|static|mod)\s+\w+/,
|
|
95
|
-
|
|
96
|
-
// PHP
|
|
97
|
-
php: /^(class|interface|trait|function|const)\s+\w+/,
|
|
98
|
-
phtml: /^(<\?php|class|interface|trait|function)\s*/,
|
|
99
|
-
|
|
100
|
-
// Ruby
|
|
101
|
-
rb: /^(class|module|def)\s+\w+/,
|
|
102
|
-
rake: /^(class|module|def|task|namespace)\s+\w+/,
|
|
103
|
-
|
|
104
|
-
// Swift
|
|
105
|
-
swift: /^(class|struct|enum|protocol|func|var|let|extension)\s+\w+/,
|
|
106
|
-
|
|
107
|
-
// R
|
|
108
|
-
r: /^(\w+)\s*(<-|=)\s*function/,
|
|
109
|
-
R: /^(\w+)\s*(<-|=)\s*function/,
|
|
110
|
-
|
|
111
|
-
// Lua
|
|
112
|
-
lua: /^(function|local\s+function)\s+\w+/,
|
|
113
|
-
|
|
114
|
-
// Shell scripts
|
|
115
|
-
sh: /^(\w+\s*\(\)|function\s+\w+)/,
|
|
116
|
-
bash: /^(\w+\s*\(\)|function\s+\w+)/,
|
|
117
|
-
zsh: /^(\w+\s*\(\)|function\s+\w+)/,
|
|
118
|
-
fish: /^function\s+\w+/,
|
|
119
|
-
|
|
120
|
-
// CSS/Styles
|
|
121
|
-
css: /^(\.|#|@media|@keyframes|@font-face|\w+)\s*[{,]/,
|
|
122
|
-
scss: /^(\$\w+:|@mixin|@function|@include|\.|#|@media)\s*/,
|
|
123
|
-
sass: /^(\$\w+:|=\w+|\+\w+|\.|#|@media)\s*/,
|
|
124
|
-
less: /^(@\w+:|\.|\#|@media)\s*/,
|
|
125
|
-
styl: /^(\$\w+\s*=|\w+\(|\.|\#)\s*/,
|
|
126
|
-
|
|
127
|
-
// Markup/HTML
|
|
128
|
-
html: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
|
|
129
|
-
htm: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
|
|
130
|
-
xml: /^(<\w+|\s*<!\[CDATA\[)/,
|
|
131
|
-
svg: /^(<svg|<g|<path|<defs|<symbol)\b/,
|
|
132
|
-
|
|
133
|
-
// Config files
|
|
134
|
-
json: /^(\s*"[\w-]+"\s*:\s*[\[{])/,
|
|
135
|
-
yaml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
|
|
136
|
-
yml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
|
|
137
|
-
toml: /^(\[\[?\w+\]?\]?|\w+\s*=)/,
|
|
138
|
-
ini: /^(\[\w+\]|\w+\s*=)/,
|
|
139
|
-
env: /^[A-Z_][A-Z0-9_]*=/,
|
|
140
|
-
|
|
141
|
-
// Documentation
|
|
142
|
-
md: /^(#{1,6}\s+|```|\*{3}|_{3})/,
|
|
143
|
-
mdx: /^(#{1,6}\s+|```|import\s+|export\s+)/,
|
|
144
|
-
txt: /^.{50,}/, // Split on long paragraphs
|
|
145
|
-
rst: /^(={3,}|-{3,}|~{3,}|\.\.\s+\w+::)/,
|
|
146
|
-
|
|
147
|
-
// Database
|
|
148
|
-
sql: /^(CREATE|ALTER|INSERT|UPDATE|DELETE|SELECT|DROP|GRANT|REVOKE|WITH|DECLARE|BEGIN|END)\s+/i,
|
|
149
|
-
|
|
150
|
-
// Perl
|
|
151
|
-
pl: /^(sub|package|use|require)\s+\w+/,
|
|
152
|
-
pm: /^(sub|package|use|require)\s+\w+/,
|
|
153
|
-
|
|
154
|
-
// Vim
|
|
155
|
-
vim: /^(function|command|autocmd|let\s+g:)\s*/,
|
|
156
|
-
};
|
|
157
|
-
|
|
158
|
-
const langPattern = patterns[ext.slice(1)] || patterns.js;
|
|
164
|
+
const ext = path.extname(file).toLowerCase();
|
|
165
|
+
const base = path.basename(file).toLowerCase();
|
|
166
|
+
|
|
167
|
+
// Get model-specific chunking parameters with optional user overrides
|
|
168
|
+
let { maxTokens, targetTokens, overlapTokens } = getChunkingParams(config.embeddingModel);
|
|
169
|
+
if (config.maxTokens) maxTokens = config.maxTokens;
|
|
170
|
+
if (config.targetTokens) targetTokens = config.targetTokens;
|
|
171
|
+
if (config.overlapTokens) overlapTokens = config.overlapTokens;
|
|
172
|
+
|
|
173
|
+
let langPattern = patterns[ext.slice(1)];
|
|
174
|
+
if (!langPattern) {
|
|
175
|
+
if (base === 'dockerfile') langPattern = patterns.dockerfile;
|
|
176
|
+
else if (base === 'makefile') langPattern = patterns.makefile;
|
|
177
|
+
else if (base.startsWith('.env')) langPattern = patterns.env;
|
|
178
|
+
}
|
|
179
|
+
if (!langPattern || typeof langPattern.test !== 'function') {
|
|
180
|
+
langPattern = patterns.js; // Default fallback
|
|
181
|
+
}
|
|
159
182
|
let currentChunk = [];
|
|
160
183
|
let chunkStartLine = 0;
|
|
184
|
+
|
|
161
185
|
let currentTokenCount = 0;
|
|
162
186
|
|
|
163
187
|
// Track bracket depth for better boundary detection
|
|
@@ -168,97 +192,145 @@ export function smartChunk(content, file, config) {
|
|
|
168
192
|
let inComment = false;
|
|
169
193
|
let stringChar = null; // ' or " or `
|
|
170
194
|
|
|
195
|
+
const splitOversizedLine = (line, lineTokens) => {
|
|
196
|
+
const charsPerToken = line.length / Math.max(1, lineTokens);
|
|
197
|
+
const segmentSize = Math.max(100, Math.floor(charsPerToken * targetTokens)); // Min 100 chars
|
|
198
|
+
const segments = [];
|
|
199
|
+
|
|
200
|
+
for (let start = 0; start < line.length; start += segmentSize) {
|
|
201
|
+
segments.push(line.slice(start, start + segmentSize));
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
return segments;
|
|
205
|
+
};
|
|
206
|
+
|
|
171
207
|
for (let i = 0; i < lines.length; i++) {
|
|
172
208
|
const line = lines[i];
|
|
173
209
|
const lineTokens = estimateTokens(line);
|
|
174
|
-
|
|
210
|
+
|
|
211
|
+
let j = 0;
|
|
175
212
|
|
|
176
213
|
// Simple state tracking for heuristics (not a full parser)
|
|
177
214
|
if (inComment) {
|
|
178
215
|
// Look for end of block comment
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
} else {
|
|
187
|
-
inComment = false;
|
|
188
|
-
}
|
|
216
|
+
const endIdx = line.indexOf('*/');
|
|
217
|
+
if (endIdx !== -1) {
|
|
218
|
+
inComment = false;
|
|
219
|
+
j = endIdx + 2;
|
|
220
|
+
} else {
|
|
221
|
+
// Skip whole line
|
|
222
|
+
j = line.length;
|
|
189
223
|
}
|
|
190
|
-
}
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
} else {
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
} else if (char === '\'' || char === '"' || char === '`') {
|
|
217
|
-
inString = true;
|
|
218
|
-
stringChar = char;
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
const scanLine = j < line.length ? line.slice(j) : '';
|
|
227
|
+
const trimmed = scanLine.trim();
|
|
228
|
+
|
|
229
|
+
for (; j < line.length; j++) {
|
|
230
|
+
const char = line[j];
|
|
231
|
+
const nextChar = line[j + 1];
|
|
232
|
+
|
|
233
|
+
if (inString) {
|
|
234
|
+
if (char === '\\') {
|
|
235
|
+
j++; // Skip escaped char
|
|
236
|
+
} else if (char === stringChar) {
|
|
237
|
+
inString = false;
|
|
238
|
+
stringChar = null;
|
|
239
|
+
}
|
|
240
|
+
} else {
|
|
241
|
+
// Check for comment start
|
|
242
|
+
if (char === '/' && nextChar === '*') {
|
|
243
|
+
inComment = true;
|
|
244
|
+
j++;
|
|
245
|
+
// Check if it ends on same line
|
|
246
|
+
const endIdx = line.indexOf('*/', j);
|
|
247
|
+
if (endIdx !== -1) {
|
|
248
|
+
inComment = false;
|
|
249
|
+
j = endIdx + 1;
|
|
219
250
|
} else {
|
|
220
|
-
//
|
|
221
|
-
if (char === '{') braceDepth++;
|
|
222
|
-
else if (char === '}') braceDepth = Math.max(0, braceDepth - 1);
|
|
223
|
-
else if (char === '[') bracketDepth++;
|
|
224
|
-
else if (char === ']') bracketDepth = Math.max(0, bracketDepth - 1);
|
|
225
|
-
else if (char === '(') parenDepth++;
|
|
226
|
-
else if (char === ')') parenDepth = Math.max(0, parenDepth - 1);
|
|
251
|
+
break; // Rest of line is comment
|
|
227
252
|
}
|
|
253
|
+
} else if (char === '/' && nextChar === '/') {
|
|
254
|
+
break; // Skip rest of line (line comment)
|
|
255
|
+
} else if (char === "'" || char === '"' || char === '`') {
|
|
256
|
+
inString = true;
|
|
257
|
+
stringChar = char;
|
|
258
|
+
} else {
|
|
259
|
+
// Only count brackets if not in string or comment
|
|
260
|
+
if (char === '{') braceDepth++;
|
|
261
|
+
else if (char === '}') braceDepth = Math.max(0, braceDepth - 1);
|
|
262
|
+
else if (char === '[') bracketDepth++;
|
|
263
|
+
else if (char === ']') bracketDepth = Math.max(0, bracketDepth - 1);
|
|
264
|
+
else if (char === '(') parenDepth++;
|
|
265
|
+
else if (char === ')') parenDepth = Math.max(0, parenDepth - 1);
|
|
228
266
|
}
|
|
229
267
|
}
|
|
230
268
|
}
|
|
231
269
|
|
|
270
|
+
// Split lines that are too large to ever fit in a single chunk
|
|
271
|
+
if (lineTokens > maxTokens) {
|
|
272
|
+
if (currentChunk.length > 0) {
|
|
273
|
+
const chunkText = currentChunk.join('\n');
|
|
274
|
+
if (chunkText.trim().length > 20) {
|
|
275
|
+
chunks.push({
|
|
276
|
+
text: chunkText,
|
|
277
|
+
startLine: chunkStartLine + 1,
|
|
278
|
+
endLine: i,
|
|
279
|
+
tokenCount: currentTokenCount,
|
|
280
|
+
});
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
const parts = splitOversizedLine(line, lineTokens);
|
|
285
|
+
for (const part of parts) {
|
|
286
|
+
if (part.trim().length <= 20) continue;
|
|
287
|
+
chunks.push({
|
|
288
|
+
text: part,
|
|
289
|
+
startLine: i + 1,
|
|
290
|
+
endLine: i + 1,
|
|
291
|
+
tokenCount: estimateTokens(part),
|
|
292
|
+
});
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
currentChunk = [];
|
|
296
|
+
currentTokenCount = 0;
|
|
297
|
+
chunkStartLine = i + 1;
|
|
298
|
+
continue;
|
|
299
|
+
}
|
|
300
|
+
|
|
232
301
|
// Check if adding this line would exceed token limit
|
|
233
|
-
const wouldExceedLimit =
|
|
302
|
+
const wouldExceedLimit = currentTokenCount + lineTokens > targetTokens;
|
|
234
303
|
|
|
235
304
|
// Check if this is a good split point using multiple heuristics
|
|
236
305
|
const matchesPattern = langPattern.test(trimmed);
|
|
237
|
-
const atTopLevel =
|
|
238
|
-
|
|
306
|
+
const atTopLevel =
|
|
307
|
+
braceDepth === 0 && bracketDepth === 0 && parenDepth === 0 && !inString && !inComment;
|
|
308
|
+
const startsAtColumn0 = scanLine.length > 0 && /^\S/.test(scanLine);
|
|
239
309
|
const isEmptyLine = trimmed.length === 0;
|
|
240
|
-
const prevWasEmpty =
|
|
241
|
-
|
|
310
|
+
const prevWasEmpty =
|
|
311
|
+
i > 0 && currentChunk.length > 0 && currentChunk[currentChunk.length - 1].trim().length === 0;
|
|
312
|
+
const isCommentStart = /^\s*(\/\*\*|\/\/\s*[-=]{3,}|#\s*[-=]{3,})/.test(scanLine);
|
|
242
313
|
|
|
243
|
-
const isGoodSplitPoint =
|
|
244
|
-
|
|
245
|
-
(
|
|
246
|
-
|
|
247
|
-
|
|
314
|
+
const isGoodSplitPoint =
|
|
315
|
+
currentChunk.length > 3 &&
|
|
316
|
+
((matchesPattern && (atTopLevel || braceDepth <= 1)) ||
|
|
317
|
+
(atTopLevel && startsAtColumn0 && !isEmptyLine) ||
|
|
318
|
+
(prevWasEmpty && (matchesPattern || isCommentStart)));
|
|
248
319
|
|
|
249
|
-
const shouldSplit =
|
|
320
|
+
const shouldSplit =
|
|
321
|
+
wouldExceedLimit || (isGoodSplitPoint && currentTokenCount > targetTokens * 0.6);
|
|
250
322
|
|
|
251
323
|
// Avoid splitting in weird states if possible
|
|
252
324
|
const safeToSplit = (braceDepth <= 1 && !inString) || wouldExceedLimit;
|
|
253
325
|
|
|
254
326
|
if (shouldSplit && safeToSplit && currentChunk.length > 0) {
|
|
255
|
-
const chunkText = currentChunk.join(
|
|
327
|
+
const chunkText = currentChunk.join('\n');
|
|
256
328
|
if (chunkText.trim().length > 20) {
|
|
257
329
|
chunks.push({
|
|
258
330
|
text: chunkText,
|
|
259
331
|
startLine: chunkStartLine + 1,
|
|
260
332
|
endLine: i,
|
|
261
|
-
tokenCount: currentTokenCount
|
|
333
|
+
tokenCount: currentTokenCount,
|
|
262
334
|
});
|
|
263
335
|
}
|
|
264
336
|
|
|
@@ -282,19 +354,22 @@ export function smartChunk(content, file, config) {
|
|
|
282
354
|
|
|
283
355
|
currentChunk.push(line);
|
|
284
356
|
currentTokenCount += lineTokens;
|
|
357
|
+
|
|
358
|
+
if (chunks.length >= (config.maxChunksPerFile || 1000)) {
|
|
359
|
+
// Hard limit to prevent memory explosion on minified/data files
|
|
360
|
+
break;
|
|
361
|
+
}
|
|
285
362
|
}
|
|
286
363
|
|
|
287
364
|
// Add remaining chunk
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
});
|
|
297
|
-
}
|
|
365
|
+
const chunkText = currentChunk.join('\n');
|
|
366
|
+
if (chunkText.trim().length > 20) {
|
|
367
|
+
chunks.push({
|
|
368
|
+
text: chunkText,
|
|
369
|
+
startLine: chunkStartLine + 1,
|
|
370
|
+
endLine: lines.length,
|
|
371
|
+
tokenCount: currentTokenCount,
|
|
372
|
+
});
|
|
298
373
|
}
|
|
299
374
|
|
|
300
375
|
return chunks;
|