@softerist/heuristic-mcp 3.0.17 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/utils.js CHANGED
@@ -2,7 +2,7 @@ import crypto from 'crypto';
2
2
  import path from 'path';
3
3
  import { estimateTokens, getChunkingParams } from './tokenizer.js';
4
4
 
5
- // Re-export tokenizer utilities
5
+
6
6
  export {
7
7
  estimateTokens,
8
8
  getChunkingParams,
@@ -10,19 +10,10 @@ export {
10
10
  MODEL_TOKEN_LIMITS,
11
11
  } from './tokenizer.js';
12
12
 
13
- // Minimum text length for a chunk to be considered valid (avoids tiny fragments)
13
+
14
14
  import { MIN_CHUNK_TEXT_LENGTH } from './constants.js';
15
15
 
16
- /**
17
- * Fast similarity for normalized vectors (dot product).
18
- * Uses loop unrolling for performance on large vectors.
19
- * NOTE: For very large codebases (10k+ chunks), consider WebAssembly SIMD
20
- * for ~2-4x speedup on 768-dim vectors.
21
- * @param {Float32Array} a - First normalized vector
22
- * @param {Float32Array} b - Second normalized vector
23
- * @returns {number} Dot product similarity score (-1 to 1 for normalized vectors)
24
- * @throws {Error} If vectors are null/undefined or have different dimensions
25
- */
16
+
26
17
  export function dotSimilarity(a, b) {
27
18
  if (!a || !b) {
28
19
  throw new Error(
@@ -54,16 +45,14 @@ export function dotSimilarity(a, b) {
54
45
  return dot;
55
46
  }
56
47
 
57
- /**
58
- * Generate hash for file content to detect changes
59
- */
48
+
60
49
  export function hashContent(content) {
61
50
  return crypto.createHash('md5').update(content).digest('hex');
62
51
  }
63
52
 
64
- // Language-specific patterns for function/class detection
53
+
65
54
  const patterns = {
66
- // JavaScript/TypeScript
55
+
67
56
  js: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
68
57
  jsx: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
69
58
  ts: /^(export\s+)?(async\s+)?(function|class|const|let|var|interface|type)\s+\w+/,
@@ -71,18 +60,18 @@ const patterns = {
71
60
  mjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
72
61
  cjs: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
73
62
 
74
- // Python
63
+
75
64
  py: /^(class|def|async\s+def)\s+\w+/,
76
65
  pyw: /^(class|def|async\s+def)\s+\w+/,
77
- pyx: /^(cdef|cpdef|def|class)\s+\w+/, // Cython
66
+ pyx: /^(cdef|cpdef|def|class)\s+\w+/,
78
67
 
79
- // Java/Kotlin/Scala
68
+
80
69
  java: /^(public|private|protected)?\s*(static\s+)?(class|interface|enum|void|int|String|boolean)\s+\w+/,
81
70
  kt: /^(class|interface|object|fun|val|var)\s+\w+/,
82
71
  kts: /^(class|interface|object|fun|val|var)\s+\w+/,
83
72
  scala: /^(class|object|trait|def|val|var)\s+\w+/,
84
73
 
85
- // C/C++
74
+
86
75
  c: /^(struct|enum|union|void|int|char|float|double)\s+\w+/,
87
76
  cpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
88
77
  cc: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
@@ -91,54 +80,54 @@ const patterns = {
91
80
  hpp: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
92
81
  hxx: /^(class|struct|namespace|template|void|int|bool)\s+\w+/,
93
82
 
94
- // C#
83
+
95
84
  cs: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
96
85
  csx: /^(public|private|protected)?\s*(static\s+)?(class|interface|struct|enum|void|int|string|bool)\s+\w+/,
97
86
 
98
- // Go
87
+
99
88
  go: /^(func|type|const|var)\s+\w+/,
100
89
 
101
- // Rust
90
+
102
91
  rs: /^(pub\s+)?(fn|struct|enum|trait|impl|const|static|mod)\s+\w+/,
103
92
 
104
- // PHP
93
+
105
94
  php: /^(class|interface|trait|function|const)\s+\w+/,
106
95
  phtml: /^(<\?php|class|interface|trait|function)\s*/,
107
96
 
108
- // Ruby
97
+
109
98
  rb: /^(class|module|def)\s+\w+/,
110
99
  rake: /^(class|module|def|task|namespace)\s+\w+/,
111
100
 
112
- // Swift
101
+
113
102
  swift: /^(class|struct|enum|protocol|func|var|let|extension)\s+\w+/,
114
103
 
115
- // R
104
+
116
105
  r: /^(\w+)\s*(<-|=)\s*function/,
117
106
  R: /^(\w+)\s*(<-|=)\s*function/,
118
107
 
119
- // Lua
108
+
120
109
  lua: /^(function|local\s+function)\s+\w+/,
121
110
 
122
- // Shell scripts
111
+
123
112
  sh: /^(\w+\s*\(\)|function\s+\w+)/,
124
113
  bash: /^(\w+\s*\(\)|function\s+\w+)/,
125
114
  zsh: /^(\w+\s*\(\)|function\s+\w+)/,
126
115
  fish: /^function\s+\w+/,
127
116
 
128
- // CSS/Styles
117
+
129
118
  css: /^(\.|#|@media|@keyframes|@font-face|\w+)\s*[{,]/,
130
119
  scss: /^(\$\w+:|@mixin|@function|@include|\.|#|@media)\s*/,
131
120
  sass: /^(\$\w+:|=\w+|\+\w+|\.|#|@media)\s*/,
132
121
  less: /^(@\w+:|\.|#|@media)\s*/,
133
122
  styl: /^(\$\w+\s*=|\w+\(|\.|#)\s*/,
134
123
 
135
- // Markup/HTML
124
+
136
125
  html: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
137
126
  htm: /^(<(div|section|article|header|footer|nav|main|aside|form|table|template|script|style)\b)/i,
138
127
  xml: /^(<\w+|\s*<!\[CDATA\[)/,
139
128
  svg: /^(<svg|<g|<path|<defs|<symbol)\b/,
140
129
 
141
- // Config files
130
+
142
131
  json: /^(\s*"[\w-]+"\s*:\s*[[{])/,
143
132
  yaml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
144
133
  yml: /^(\w[\w-]*:\s*[|>]?$|\w[\w-]*:\s*$)/,
@@ -146,48 +135,40 @@ const patterns = {
146
135
  ini: /^(\[\w+\]|\w+\s*=)/,
147
136
  env: /^[A-Z_][A-Z0-9_]*=/,
148
137
 
149
- // Makefile
138
+
150
139
  makefile: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,
151
140
  mk: /^([A-Za-z0-9_./-]+)\s*:(?!=)/,
152
141
 
153
- // Docker
142
+
154
143
  dockerfile:
155
144
  /^(FROM|RUN|CMD|LABEL|EXPOSE|ENV|ADD|COPY|ENTRYPOINT|VOLUME|USER|WORKDIR|ARG|ONBUILD|STOPSIGNAL|HEALTHCHECK|SHELL)\s+/i,
156
145
 
157
- // Documentation
146
+
158
147
  md: /^(#{1,6}\s+|```|\*{3}|_{3})/,
159
148
  mdx: /^(#{1,6}\s+|```|import\s+|export\s+)/,
160
- txt: /^.{50,}/, // Split on long paragraphs
149
+ txt: /^.{50,}/,
161
150
  rst: /^(={3,}|-{3,}|~{3,}|\.\.\s+\w+::)/,
162
151
 
163
- // Database
152
+
164
153
  sql: /^(CREATE|ALTER|INSERT|UPDATE|DELETE|SELECT|DROP|GRANT|REVOKE|WITH|DECLARE|BEGIN|END)\s+/i,
165
154
 
166
- // Perl
155
+
167
156
  pl: /^(sub|package|use|require)\s+\w+/,
168
157
  pm: /^(sub|package|use|require)\s+\w+/,
169
158
 
170
- // Vim
159
+
171
160
  vim: /^(function|command|autocmd|let\s+g:)\s*/,
172
161
  };
173
162
 
174
- /**
175
- * Intelligent chunking with token limit awareness
176
- * Tries to split by function/class boundaries while respecting token limits
177
- *
178
- * @param {string} content - File content to chunk
179
- * @param {string} file - File path (for language detection)
180
- * @param {object} config - Configuration object with embeddingModel
181
- * @returns {Array<{text: string, startLine: number, endLine: number, tokenCount: number}>}
182
- */
163
+
183
164
  export function smartChunk(content, file, config) {
184
165
  const lines = content.split('\n');
185
166
  const chunks = [];
186
167
  const ext = path.extname(file).toLowerCase();
187
168
  const base = path.basename(file).toLowerCase();
188
- const SPECIAL_TOKENS = 2; // [CLS] + [SEP] accounted once per chunk
169
+ const SPECIAL_TOKENS = 2;
189
170
 
190
- // Get model-specific chunking parameters with optional user overrides
171
+
191
172
  let { maxTokens, targetTokens, overlapTokens } = getChunkingParams(config.embeddingModel);
192
173
  if (config.maxTokens) maxTokens = config.maxTokens;
193
174
  if (config.targetTokens) targetTokens = config.targetTokens;
@@ -200,25 +181,25 @@ export function smartChunk(content, file, config) {
200
181
  else if (base.startsWith('.env')) langPattern = patterns.env;
201
182
  }
202
183
  if (!langPattern || typeof langPattern.test !== 'function') {
203
- langPattern = patterns.js; // Default fallback
184
+ langPattern = patterns.js;
204
185
  }
205
186
  let currentChunk = [];
206
187
  let chunkStartLine = 0;
207
- let lineTokenCounts = []; // Cache token counts for overlap calculation
188
+ let lineTokenCounts = [];
208
189
 
209
190
  let currentTokenCount = 0;
210
191
 
211
- // Track bracket depth for better boundary detection
192
+
212
193
  let bracketDepth = 0;
213
194
  let braceDepth = 0;
214
195
  let parenDepth = 0;
215
196
  let inString = false;
216
197
  let inComment = false;
217
- let stringChar = null; // ' or " or `
198
+ let stringChar = null;
218
199
 
219
200
  const splitOversizedLine = (line, lineTokens) => {
220
201
  const charsPerToken = line.length / Math.max(1, lineTokens);
221
- const segmentSize = Math.max(100, Math.floor(charsPerToken * targetTokens)); // Min 100 chars
202
+ const segmentSize = Math.max(100, Math.floor(charsPerToken * targetTokens));
222
203
  const segments = [];
223
204
 
224
205
  for (let start = 0; start < line.length; start += segmentSize) {
@@ -234,15 +215,15 @@ export function smartChunk(content, file, config) {
234
215
 
235
216
  let j = 0;
236
217
 
237
- // Simple state tracking for heuristics (not a full parser)
218
+
238
219
  if (inComment) {
239
- // Look for end of block comment
220
+
240
221
  const endIdx = line.indexOf('*/');
241
222
  if (endIdx !== -1) {
242
223
  inComment = false;
243
224
  j = endIdx + 2;
244
225
  } else {
245
- // Skip whole line
226
+
246
227
  j = line.length;
247
228
  }
248
229
  }
@@ -256,31 +237,31 @@ export function smartChunk(content, file, config) {
256
237
 
257
238
  if (inString) {
258
239
  if (char === '\\') {
259
- j++; // Skip escaped char
240
+ j++;
260
241
  } else if (char === stringChar) {
261
242
  inString = false;
262
243
  stringChar = null;
263
244
  }
264
245
  } else {
265
- // Check for comment start
246
+
266
247
  if (char === '/' && nextChar === '*') {
267
248
  inComment = true;
268
249
  j++;
269
- // Check if it ends on same line
250
+
270
251
  const endIdx = line.indexOf('*/', j);
271
252
  if (endIdx !== -1) {
272
253
  inComment = false;
273
254
  j = endIdx + 1;
274
255
  } else {
275
- break; // Rest of line is comment
256
+ break;
276
257
  }
277
258
  } else if (char === '/' && nextChar === '/') {
278
- break; // Skip rest of line (line comment)
259
+ break;
279
260
  } else if (char === "'" || char === '"' || char === '`') {
280
261
  inString = true;
281
262
  stringChar = char;
282
263
  } else {
283
- // Only count brackets if not in string or comment
264
+
284
265
  if (char === '{') braceDepth++;
285
266
  else if (char === '}') braceDepth = Math.max(0, braceDepth - 1);
286
267
  else if (char === '[') bracketDepth++;
@@ -291,7 +272,7 @@ export function smartChunk(content, file, config) {
291
272
  }
292
273
  }
293
274
 
294
- // Split lines that are too large to ever fit in a single chunk
275
+
295
276
  if (lineTokens + SPECIAL_TOKENS > maxTokens) {
296
277
  if (currentChunk.length > 0) {
297
278
  const chunkText = currentChunk.join('\n');
@@ -324,11 +305,11 @@ export function smartChunk(content, file, config) {
324
305
  continue;
325
306
  }
326
307
 
327
- // Check if adding this line would exceed token limit
308
+
328
309
  const effectiveTokenCount = currentTokenCount + SPECIAL_TOKENS;
329
310
  const wouldExceedLimit = currentTokenCount + lineTokens + SPECIAL_TOKENS > targetTokens;
330
311
 
331
- // Check if this is a good split point using multiple heuristics
312
+
332
313
  const matchesPattern = langPattern.test(trimmed);
333
314
  const atTopLevel =
334
315
  braceDepth === 0 && bracketDepth === 0 && parenDepth === 0 && !inString && !inComment;
@@ -347,7 +328,7 @@ export function smartChunk(content, file, config) {
347
328
  const shouldSplit =
348
329
  wouldExceedLimit || (isGoodSplitPoint && effectiveTokenCount > targetTokens * 0.6);
349
330
 
350
- // Avoid splitting in weird states if possible
331
+
351
332
  const safeToSplit = (braceDepth <= 1 && !inString) || wouldExceedLimit;
352
333
 
353
334
  if (shouldSplit && safeToSplit && currentChunk.length > 0) {
@@ -363,8 +344,8 @@ export function smartChunk(content, file, config) {
363
344
 
364
345
  let overlapLines = [];
365
346
  let overlapTokensCount = 0;
366
- let overlapStartOffset = 0; // Track how many lines back we went
367
- const MAX_OVERLAP_ITERATIONS = 50; // Absolute limit to prevent unbounded loops
347
+ let overlapStartOffset = 0;
348
+ const MAX_OVERLAP_ITERATIONS = 50;
368
349
  let overlapIterations = 0;
369
350
  for (
370
351
  let k = currentChunk.length - 1;
@@ -372,12 +353,12 @@ export function smartChunk(content, file, config) {
372
353
  k--
373
354
  ) {
374
355
  overlapIterations++;
375
- // Use cached token count instead of re-estimating
356
+
376
357
  const lineT = lineTokenCounts[k] ?? 0;
377
- // Guard against infinite loops: if lineT is 0, count the line but don't loop forever
358
+
378
359
  if (lineT <= 0) {
379
- // Include zero-token lines (e.g., empty lines) but limit to prevent infinite spin
380
- // Also guard with overlapStartOffset < 20 to prevent excessive lines even if under 10 in overlapLines
360
+
361
+
381
362
  if (overlapLines.length < 10 && overlapStartOffset < 20) {
382
363
  overlapLines.unshift(currentChunk[k]);
383
364
  overlapStartOffset++;
@@ -394,12 +375,12 @@ export function smartChunk(content, file, config) {
394
375
  }
395
376
 
396
377
  currentChunk = overlapLines;
397
- // Rebuild lineTokenCounts for the overlap lines
378
+
398
379
  lineTokenCounts = overlapLines.map(l => estimateTokens(l, { includeSpecialTokens: false }));
399
380
  currentTokenCount = overlapTokensCount;
400
- // The new chunk starts from where the overlap begins in the original file
401
- // i is the current line we're about to process, overlap lines are from before
402
- // Ensure non-negative to handle edge cases where overlapStartOffset > i
381
+
382
+
383
+
403
384
  chunkStartLine = Math.max(0, i - overlapStartOffset);
404
385
  }
405
386
 
@@ -408,12 +389,12 @@ export function smartChunk(content, file, config) {
408
389
  currentTokenCount += lineTokens;
409
390
 
410
391
  if (chunks.length >= (config.maxChunksPerFile || 1000)) {
411
- // Hard limit to prevent memory explosion on minified/data files
392
+
412
393
  break;
413
394
  }
414
395
  }
415
396
 
416
- // Add remaining chunk
397
+
417
398
  const chunkText = currentChunk.join('\n');
418
399
  if (chunkText.trim().length > MIN_CHUNK_TEXT_LENGTH) {
419
400
  chunks.push({
@@ -164,7 +164,7 @@ export class BinaryVectorStore {
164
164
  try {
165
165
  await this.vectorsHandle.close();
166
166
  } catch {
167
- // ignore close errors
167
+
168
168
  }
169
169
  }
170
170
  this.vectorsHandle = null;
@@ -172,7 +172,7 @@ export class BinaryVectorStore {
172
172
  try {
173
173
  fsSync.closeSync(this.vectorsFd);
174
174
  } catch {
175
- // ignore close errors
175
+
176
176
  }
177
177
  }
178
178
  this.vectorsFd = null;
@@ -180,7 +180,7 @@ export class BinaryVectorStore {
180
180
  try {
181
181
  await this.contentHandle.close();
182
182
  } catch {
183
- // ignore close errors
183
+
184
184
  }
185
185
  }
186
186
  this.contentHandle = null;
@@ -276,7 +276,7 @@ export class BinaryVectorStore {
276
276
  try {
277
277
  fsSync.closeSync(vectorsFd);
278
278
  } catch {
279
- // ignore close errors
279
+
280
280
  }
281
281
  }
282
282
  throw err;
@@ -330,8 +330,8 @@ export class BinaryVectorStore {
330
330
  this.dim
331
331
  );
332
332
  } else if (Number.isInteger(this.vectorsFd)) {
333
- // Use Buffer.alloc (not allocUnsafe) for safety - prevents potential
334
- // information leak if read is partial or fails silently
333
+
334
+
335
335
  const buffer = Buffer.alloc(byteLength);
336
336
  const bytesRead = fsSync.readSync(this.vectorsFd, buffer, 0, byteLength, offset);
337
337
  if (bytesRead === byteLength) {
@@ -592,7 +592,7 @@ export class BinaryVectorStore {
592
592
  vectorPos += vectorBuffer.length;
593
593
 
594
594
  if (entry.contentLength > 0) {
595
- // Re-fetch content to avoid holding all strings in memory
595
+
596
596
  const val = await resolveContent(chunk, sourceIndex);
597
597
  const contentBuffer = Buffer.from(val, 'utf-8');
598
598
  await contentHandle.write(contentBuffer, 0, contentBuffer.length, contentPos);