npm - @softerist/heuristic-mcp - Versions diffs - 2.1.47 → 3.0.0 - Mend

@softerist/heuristic-mcp 2.1.47 → 3.0.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (109)

package/.agent/workflows/code-review.md +60 -0
package/.prettierrc +7 -0
package/ARCHITECTURE.md +105 -170
package/CONTRIBUTING.md +32 -113
package/GEMINI.md +73 -0
package/LICENSE +21 -21
package/README.md +161 -54
package/config.json +876 -75
package/debug-pids.js +27 -0
package/eslint.config.js +36 -0
package/features/ann-config.js +37 -26
package/features/clear-cache.js +28 -19
package/features/find-similar-code.js +142 -66
package/features/hybrid-search.js +253 -93
package/features/index-codebase.js +1455 -394
package/features/lifecycle.js +813 -180
package/features/register.js +58 -52
package/index.js +450 -306
package/lib/cache-ops.js +22 -0
package/lib/cache-utils.js +68 -0
package/lib/cache.js +1392 -587
package/lib/call-graph.js +165 -50
package/lib/cli.js +154 -0
package/lib/config.js +462 -121
package/lib/embedding-process.js +77 -0
package/lib/embedding-worker.js +545 -30
package/lib/ignore-patterns.js +61 -59
package/lib/json-worker.js +14 -0
package/lib/json-writer.js +344 -0
package/lib/logging.js +88 -0
package/lib/memory-logger.js +13 -0
package/lib/project-detector.js +13 -17
package/lib/server-lifecycle.js +38 -0
package/lib/settings-editor.js +645 -0
package/lib/tokenizer.js +207 -104
package/lib/utils.js +273 -198
package/lib/vector-store-binary.js +592 -0
package/mcp_config.example.json +13 -0
package/package.json +13 -2
package/scripts/clear-cache.js +6 -17
package/scripts/download-model.js +14 -9
package/scripts/postinstall.js +5 -5
package/search-configs.js +36 -0
package/test/ann-config.test.js +179 -0
package/test/ann-fallback.test.js +6 -6
package/test/binary-store.test.js +69 -0
package/test/cache-branches.test.js +120 -0
package/test/cache-errors.test.js +264 -0
package/test/cache-extra.test.js +300 -0
package/test/cache-helpers.test.js +205 -0
package/test/cache-hnsw-failure.test.js +40 -0
package/test/cache-json-worker.test.js +190 -0
package/test/cache-worker.test.js +102 -0
package/test/cache.test.js +443 -0
package/test/call-graph.test.js +103 -4
package/test/clear-cache.test.js +69 -68
package/test/code-review-workflow.test.js +50 -0
package/test/config.test.js +418 -0
package/test/coverage-gap.test.js +497 -0
package/test/coverage-maximizer.test.js +236 -0
package/test/debug-analysis.js +107 -0
package/test/embedding-model.test.js +173 -103
package/test/embedding-worker-extra.test.js +272 -0
package/test/embedding-worker.test.js +158 -0
package/test/features.test.js +139 -0
package/test/final-boost.test.js +271 -0
package/test/final-polish.test.js +183 -0
package/test/final.test.js +95 -0
package/test/find-similar-code.test.js +191 -0
package/test/helpers.js +92 -11
package/test/helpers.test.js +46 -0
package/test/hybrid-search-basic.test.js +62 -0
package/test/hybrid-search-branch.test.js +202 -0
package/test/hybrid-search-callgraph.test.js +229 -0
package/test/hybrid-search-extra.test.js +81 -0
package/test/hybrid-search.test.js +484 -71
package/test/index-cli.test.js +520 -0
package/test/index-codebase-batch.test.js +119 -0
package/test/index-codebase-branches.test.js +585 -0
package/test/index-codebase-core.test.js +1032 -0
package/test/index-codebase-edge-cases.test.js +254 -0
package/test/index-codebase-errors.test.js +132 -0
package/test/index-codebase-gap.test.js +239 -0
package/test/index-codebase-lines.test.js +151 -0
package/test/index-codebase-watcher.test.js +259 -0
package/test/index-codebase-zone.test.js +259 -0
package/test/index-codebase.test.js +371 -69
package/test/index-memory.test.js +220 -0
package/test/indexer-detailed.test.js +176 -0
package/test/integration.test.js +148 -92
package/test/json-worker.test.js +50 -0
package/test/lifecycle.test.js +541 -0
package/test/master.test.js +198 -0
package/test/perfection.test.js +349 -0
package/test/project-detector.test.js +65 -0
package/test/register.test.js +262 -0
package/test/tokenizer.test.js +55 -93
package/test/ultra-maximizer.test.js +116 -0
package/test/utils-branches.test.js +161 -0
package/test/utils-extra.test.js +116 -0
package/test/utils.test.js +131 -0
package/test/verify_fixes.js +76 -0
package/test/worker-errors.test.js +96 -0
package/test/worker-init.test.js +102 -0
package/test/worker_throttling.test.js +93 -0
package/tools/scripts/benchmark-search.js +95 -0
package/tools/scripts/cache-stats.js +71 -0
package/tools/scripts/manual-search.js +34 -0
package/vitest.config.js +19 -9

package/lib/tokenizer.js CHANGED

@@ -1,142 +1,245 @@
 /**
  * Token estimation and limits for embedding models
  *
- * This module provides token counting utilities and model-specific limits
- * to ensure text chunks don't exceed the model's maximum sequence length.
+ * Performance:
+ * - O(1) model lookups with precomputed maps
+ * - Zero regex / Zero allocations in hot loop
+ * - Proper LRU cache eviction
+ * - Optimized Unicode whitespace detection (ordered by probability)
+ * - Eliminated double toLowerCase() calls
+ * - Type-safe guard rails on all public APIs
+ * - Branchless special character counting
  */
-/**
- * Token limits for supported embedding models
- * Each model has its own maximum sequence length
- */
-export const MODEL_TOKEN_LIMITS = {
-  // Sentence Transformers / MiniLM family
-  "Xenova/all-MiniLM-L6-v2": 256,
-  "Xenova/all-MiniLM-L12-v2": 256,
-  "Xenova/paraphrase-MiniLM-L6-v2": 128,
-  "Xenova/paraphrase-MiniLM-L3-v2": 128,
+const IS_TEST_ENV = process.env.VITEST === 'true' || process.env.NODE_ENV === 'test';
+const MODEL_TOKEN_LIMITS_RAW = {
+  'jinaai/jina-embeddings-v2-base-code': 8192,
+    default: 512, // Safe default for BERT-like models
+  };
-  // MPNet models
-  "Xenova/all-mpnet-base-v2": 384,
-  "Xenova/paraphrase-mpnet-base-v2": 384,
+  export const MODEL_TOKEN_LIMITS = IS_TEST_ENV
+    ? { ...MODEL_TOKEN_LIMITS_RAW }
+    : Object.freeze({ ...MODEL_TOKEN_LIMITS_RAW });
-  // Multilingual models
-  "Xenova/paraphrase-multilingual-MiniLM-L12-v2": 128,
-  "Xenova/paraphrase-multilingual-mpnet-base-v2": 256,
+  const DEFAULT_LIMIT = MODEL_TOKEN_LIMITS.default ?? 512;
-  // Code-specific models
-  "Xenova/codebert-base": 512,
-  "Xenova/graphcodebert-base": 512,
+  /**
+   * Precomputed case-insensitive lookup
+   */
+  const MODEL_LIMITS_LC = new Map();
+  for (const [k, v] of Object.entries(MODEL_TOKEN_LIMITS)) {
+    MODEL_LIMITS_LC.set(k.toLowerCase(), v);
+  }
-  // E5 models
-  "Xenova/e5-small-v2": 512,
-  "Xenova/e5-base-v2": 512,
-  "Xenova/e5-large-v2": 512,
+  /**
+   * Internal helper: get model limit from pre-normalized key
+   * Avoids double toLowerCase() when called from cache flow
+   * @param {string} lowerName - Pre-normalized lowercase model name
+   * @param {*} originalName - Original model name (may not be a string)
+   * @returns {number} Token limit
+   */
+  function getModelTokenLimitFromLower(lowerName, originalName) {
+    // Fast path: try exact match first (only if original is a string)
+    if (typeof originalName === 'string') {
+      const direct = MODEL_TOKEN_LIMITS[originalName];
+      if (direct !== undefined) return direct;
+    }
-  // BGE models
-  "Xenova/bge-small-en-v1.5": 512,
-  "Xenova/bge-base-en-v1.5": 512,
-  "Xenova/bge-large-en-v1.5": 512,
+    // Slow path: use pre-normalized key
+    const exact = MODEL_LIMITS_LC.get(lowerName);
+    if (exact !== undefined) return exact;
-  // Default fallback
-  "default": 256
-};
-/**
- * Get the maximum token limit for a given model
- * Case-insensitive lookup for robustness
- * @param {string} modelName - The model name (e.g., "Xenova/all-MiniLM-L6-v2")
- * @returns {number} Maximum tokens supported by the model
- */
-export function getModelTokenLimit(modelName) {
-  if (!modelName) return MODEL_TOKEN_LIMITS["default"];
+    // Heuristics for common high-context models
+    if (lowerName.includes('jina') || lowerName.includes('nomic') || lowerName.includes('gte-large')) {
+      return 8192;
+    }
+    if (lowerName.includes('gte-base') || lowerName.includes('gte-small')) {
+      return 512;
+    }
+    if (lowerName.includes('minilm')) {
+      return 512;
+    }
-  // Direct match first (fastest)
-  if (MODEL_TOKEN_LIMITS[modelName] !== undefined) {
-    return MODEL_TOKEN_LIMITS[modelName];
+    return DEFAULT_LIMIT;
   }
-  // Case-insensitive search
-  const normalizedName = modelName.toLowerCase();
-  for (const [key, value] of Object.entries(MODEL_TOKEN_LIMITS)) {
-    if (key.toLowerCase() === normalizedName) {
-      return value;
-    }
-  }
+  /**
+   * Get the maximum token limit for a given model
+   * @param {string} modelName - The model name
+   * @returns {number} Maximum tokens supported by the model
+   */
+  export function getModelTokenLimit(modelName) {
+    // Guard clause for non-string or empty inputs
+    if (typeof modelName !== 'string' || modelName.length === 0) return DEFAULT_LIMIT;
-  return MODEL_TOKEN_LIMITS["default"];
-}
+    const direct = MODEL_TOKEN_LIMITS[modelName];
+    if (direct !== undefined) return direct;
+    const lower = modelName.toLowerCase();
+    return getModelTokenLimitFromLower(lower, modelName);
+  }
+/**
+ * LRU cache for chunking parameters
+ * @type {Map<string, {maxTokens: number, targetTokens: number, overlapTokens: number}>}
+ */
+const MAX_CACHE_SIZE = 100;
+const chunkingParamsCache = new Map();
 /**
  * Get chunking parameters for a model
- * Returns target and overlap tokens based on the model's limit
  * @param {string} modelName - The model name
- * @returns {{ maxTokens: number, targetTokens: number, overlapTokens: number }}
+ * @returns {{maxTokens: number, targetTokens: number, overlapTokens: number}}
  */
 export function getChunkingParams(modelName) {
-  const maxTokens = getModelTokenLimit(modelName);
-  // Target: 85% of max to leave safety buffer
-  const targetTokens = Math.floor(maxTokens * 0.85);
+  const key = (typeof modelName === 'string' && modelName.length)
+    ? modelName.toLowerCase()
+    : '';
+  // Fast path for invalid inputs: don't consume cache slots
+  if (key === '') {
+    const maxTokens = DEFAULT_LIMIT;
+    const targetTokens = Math.trunc(maxTokens * 0.85);
+    const overlapTokens = Math.trunc(targetTokens * 0.18);
+    return { maxTokens, targetTokens, overlapTokens };
+  }
-  // Overlap: 15-20% of target for context continuity
-  const overlapTokens = Math.floor(targetTokens * 0.18);
+  // LRU: If hit, delete and re-insert to mark as most recently used
+  const cached = chunkingParamsCache.get(key);
+  if (cached) {
+    chunkingParamsCache.delete(key);
+    chunkingParamsCache.set(key, cached);
+    return cached;
+  }
+  // Cache miss: compute new params (avoid double toLowerCase)
+  const maxTokens = getModelTokenLimitFromLower(key, modelName);
+  const targetTokens = Math.trunc(maxTokens * 0.85);
+  const overlapTokens = Math.trunc(targetTokens * 0.18);
-  return {
-    maxTokens,
-    targetTokens,
-    overlapTokens
-  };
+  const params = { maxTokens, targetTokens, overlapTokens };
+  // LRU eviction: remove oldest entry if at capacity
+  if (chunkingParamsCache.size >= MAX_CACHE_SIZE) {
+    const oldestKey = chunkingParamsCache.keys().next().value;
+    chunkingParamsCache.delete(oldestKey);
+  }
+  chunkingParamsCache.set(key, params);
+  return params;
+}
+/**
+ * ASCII whitespace lookup table
+ */
+const WS = new Uint8Array(128);
+WS[9]  = 1; // \t (horizontal tab)
+WS[10] = 1; // \n (line feed)
+WS[11] = 1; // \v (vertical tab)
+WS[12] = 1; // \f (form feed)
+WS[13] = 1; // \r (carriage return)
+WS[32] = 1; // space
+/**
+ * ASCII special character lookup table
+ */
+const SPECIAL = new Uint8Array(128);
+const SPECIAL_CHARS = '{}()[];:,.<>!=+-*/%&|^~@#$"\'`\\';
+for (let i = 0; i < SPECIAL_CHARS.length; i++) {
+  SPECIAL[SPECIAL_CHARS.charCodeAt(i)] = 1;
+}
+/**
+ * Calculate token count for a word of given length
+ * This function will be inlined by V8
+ * @param {number} len - Word length in characters
+ * @returns {number} Estimated token count
+ */
+function calcWordTokens(len) {
+  if (len <= 4) return 1;
+  if (len <= 10) return 2;
+  return (len + 3) >> 2; // ceil(len / 4)
 }
 /**
  * Estimate token count for text (conservative estimate for code)
- * Uses a simple heuristic: counts words, special characters, and estimates subwords
  *
- * This is conservative - actual tokenizers may produce fewer tokens.
- * For most accurate results, use the actual tokenizer, but this is much faster.
+ * Performance optimizations:
+ * - No regex (pure integer comparisons)
+ * - No string allocations (charCodeAt only)
+ * - Inlined word token calculation
+ * - Unicode checks ordered by frequency
+ * - Branchless special character counting
  *
  * @param {string} text - The text to estimate tokens for
  * @returns {number} Estimated token count
  */
 export function estimateTokens(text) {
-  if (!text || text.length === 0) return 0;
-  // Count words (split by whitespace)
-  const words = text.split(/\s+/).filter(w => w.length > 0);
-  // Count special characters/punctuation that often become separate tokens
-  const specialChars = (text.match(/[{}()\[\];:,.<>!=+\-*\/%&|^~@#$"'`\\]/g) || []).length;
-  // Estimate: words + special chars + 2 (for [CLS] and [SEP] special tokens)
-  // For long words, add extra tokens due to subword tokenization
-  let tokenCount = 2; // [CLS] and [SEP]
-  for (const word of words) {
-    if (word.length <= 4) {
-      tokenCount += 1;
-    } else if (word.length <= 10) {
-      tokenCount += 2;
+  // Type-safe guard: prevents crashes from non-string inputs
+  if (typeof text !== 'string' || text.length === 0) return 0;
+  const len = text.length;
+  let tokenCount = 2; // [CLS] + [SEP]
+  let specialCount = 0;
+  let wordStart = -1;
+  for (let i = 0; i < len; i++) {
+    const code = text.charCodeAt(i);
+    // ASCII fast path (most common for code)
+    if (code < 128) {
+      if (WS[code]) {
+        if (wordStart !== -1) {
+          tokenCount += calcWordTokens(i - wordStart);
+          wordStart = -1;
+        }
+      } else {
+        // Branchless: add 0 or 1 based on SPECIAL[code]
+        specialCount += SPECIAL[code];
+        if (wordStart === -1) wordStart = i;
+      }
+      continue;
+    }
+    // Unicode whitespace: ordered by frequency for real-world text
+    // Note: Includes legacy 0x180E for tokenization compatibility even though
+    // modern JS \s doesn't consider it whitespace (ES2016+)
+    const isUnicodeWS =
+      code === 0x00a0 ||                        // NBSP (most common)
+      code === 0x202f ||                        // NARROW NO-BREAK SPACE
+      (code >= 0x2000 && code <= 0x200a) ||     // EN QUAD..HAIR SPACE
+      code === 0x3000 ||                        // IDEOGRAPHIC SPACE (CJK)
+      code === 0x2028 ||                        // LINE SEPARATOR
+      code === 0x2029 ||                        // PARAGRAPH SEPARATOR
+      code === 0x205f ||                        // MEDIUM MATHEMATICAL SPACE
+      code === 0x1680 ||                        // OGHAM SPACE MARK
+      code === 0x180e ||                        // MONGOLIAN VOWEL SEPARATOR (legacy)
+      code === 0x0085 ||                        // NEXT LINE (NEL)
+      code === 0xfeff;                          // ZERO WIDTH NO-BREAK SPACE / BOM
+    if (isUnicodeWS) {
+      if (wordStart !== -1) {
+        tokenCount += calcWordTokens(i - wordStart);
+        wordStart = -1;
+      }
     } else {
-      // Long words get split into ~4-char subwords
-      tokenCount += Math.ceil(word.length / 4);
+      // Non-ASCII, non-whitespace (e.g., CJK, emojis, accented chars)
+      // Conservative estimate: treat each as 1 token
+      if (wordStart !== -1) {
+        tokenCount += calcWordTokens(i - wordStart);
+        wordStart = -1;
+      }
+      tokenCount++;
     }
   }
-  // Many special chars merge with adjacent tokens, so count ~50%
-  tokenCount += Math.floor(specialChars * 0.5);
-  return tokenCount;
-}
-/**
- * Check if text exceeds the token limit for a model
- * @param {string} text - The text to check
- * @param {string} modelName - The model name
- * @returns {boolean} True if the text exceeds the limit
- */
-export function exceedsTokenLimit(text, modelName) {
-  const limit = getModelTokenLimit(modelName);
-  const tokens = estimateTokens(text);
-  return tokens > limit;
+  // Flush final word
+  if (wordStart !== -1) {
+    tokenCount += calcWordTokens(len - wordStart);
+  }
+  // Add ~50% of special chars as tokens
+  tokenCount += specialCount >> 1;
+  return tokenCount;
 }