npm - sweet-search - Versions diffs - 2.4.2 → 2.5.2 - Mend

sweet-search 2.4.2 → 2.5.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (46)

package/core/cli.js +43 -5
package/core/embedding/embedding-cache.js +266 -18
package/core/embedding/embedding-service.js +45 -9
package/core/graph/graph-expansion.js +52 -12
package/core/graph/graph-extractor.js +30 -1
package/core/indexing/ast-chunker.js +331 -16
package/core/indexing/chunking/chunk-builder.js +34 -1
package/core/indexing/index-codebase-v21.js +31 -2
package/core/indexing/index.js +6 -3
package/core/indexing/indexer-ann.js +45 -6
package/core/indexing/indexer-build.js +9 -1
package/core/indexing/indexer-phases.js +6 -4
package/core/indexing/indexing-file-policy.js +140 -0
package/core/indexing/li-skip-policy.js +11 -220
package/core/infrastructure/codebase-repository.js +21 -0
package/core/infrastructure/config/embedding.js +20 -1
package/core/infrastructure/config/graph.js +2 -2
package/core/infrastructure/config/ranking.js +10 -0
package/core/infrastructure/config/vector-store.js +1 -1
package/core/infrastructure/coreml-cascade.js +236 -30
package/core/infrastructure/coreml-cascade.json +25 -0
package/core/infrastructure/index.js +17 -0
package/core/infrastructure/init-config.js +216 -0
package/core/infrastructure/language-patterns/registry-core.js +18 -0
package/core/infrastructure/model-registry.js +12 -0
package/core/infrastructure/native-inference.js +143 -51
package/core/infrastructure/tree-sitter-provider.js +92 -2
package/core/ranking/cascaded-scorer.js +6 -2
package/core/ranking/file-kind-ranking.js +264 -0
package/core/ranking/late-interaction-index.js +10 -4
package/core/ranking/late-interaction-policy.js +304 -0
package/core/search/context-expander.js +267 -28
package/core/search/index.js +4 -0
package/core/search/search-cli.js +3 -1
package/core/search/search-pattern.js +4 -3
package/core/search/search-postprocess.js +189 -8
package/core/search/search-read-semantic.js +734 -0
package/core/search/search-read.js +481 -0
package/core/search/search-server.js +153 -5
package/core/search/sweet-search.js +133 -16
package/core/start-server.js +13 -2
package/mcp/server.js +41 -0
package/mcp/tool-handlers.js +117 -6
package/package.json +9 -7
package/scripts/init.js +386 -5
package/scripts/uninstall.js +152 -6

package/core/indexing/indexing-file-policy.js ADDED Viewed

@@ -0,0 +1,140 @@
+/**
+ * Shared indexing file policy.
+ *
+ * Embedding discovery, BM25/sparse artifacts, and late-interaction indexing
+ * must agree on the same project include/exclude policy. This module provides
+ * the chunk-level checks that happen after discovery has already applied the
+ * shared include/exclude globs, max file size, and gitignore alignment.
+ */
+import { existsSync, readFileSync } from 'fs';
+import { minimatch } from 'minimatch';
+import { loadProjectConfig } from '../infrastructure/config/index.js';
+const MM_OPTS = { dot: true, nocase: false }; // minimatch options shared by every glob test below: let patterns match dot-prefixed names; keep matching case-sensitive
+function normalizePath(p) { // Return `p` with every backslash replaced by a forward slash; used before glob matching.
+  return p.replace(/\\/g, '/');
+}
+const _excludesByRoot = new Map(); // memo: cache key -> resolved `exclude` glob array
+function getExcludes(projectRoot) { // Return the project config's `exclude` globs for `projectRoot`, loading the config at most once per key.
+  const key = projectRoot || '__cwd__'; // every falsy root shares this one sentinel key
+  let cached = _excludesByRoot.get(key);
+  if (cached) return cached; // arrays are truthy, so a cached empty list is also served from here
+  const config = loadProjectConfig(projectRoot || process.cwd()); // falsy root falls back to the current working directory
+  cached = Array.isArray(config.exclude) ? config.exclude : []; // a missing or non-array `exclude` becomes an empty list
+  _excludesByRoot.set(key, cached);
+  return cached;
+}
+function resetCache() { // Forget both memoized pattern sources; not exported directly, reachable via `_internals.resetCache`.
+  _excludesByRoot.clear();
+  _cachedExtraPatterns = null; // null makes the next loadExtraPatternsFromFile() call resolve the file again
+}
+const GENERATED_MARKERS = [ // Substrings that chunkLooksGenerated() looks for (exact, case-sensitive) in the first 500 characters of a chunk.
+  '@generated',
+  'Code generated by',
+  'DO NOT EDIT',
+  'AUTO-GENERATED FILE',
+  'AUTOGENERATED FILE',
+  'This file is automatically generated',
+];
+let _cachedExtraPatterns = null; // null = not resolved yet; an array (possibly empty) = already resolved
+function loadExtraPatternsFromFile() { // Return the extra glob patterns listed in the file named by SWEET_SEARCH_LI_SKIP_FILE, memoized after the first call.
+  if (_cachedExtraPatterns !== null) return _cachedExtraPatterns;
+  const path = process.env.SWEET_SEARCH_LI_SKIP_FILE;
+  if (!path || !existsSync(path)) { // variable unset or file missing: remember "no extra patterns"
+    _cachedExtraPatterns = [];
+    return _cachedExtraPatterns;
+  }
+  try {
+    const lines = readFileSync(path, 'utf8').split(/\r?\n/); // one pattern per line, LF or CRLF
+    _cachedExtraPatterns = lines
+      .map((l) => l.trim())
+      .filter((l) => l && !l.startsWith('#')); // drop blank lines and lines starting with `#`
+  } catch { // a read failure is treated as "no extra patterns", and that result is cached as well
+    _cachedExtraPatterns = [];
+  }
+  return _cachedExtraPatterns;
+}
+export function isExcludedByConfig(filePath, projectRoot) { // True when `filePath` matches a config `exclude` glob or an extra glob from SWEET_SEARCH_LI_SKIP_FILE.
+  if (!filePath) return false; // an empty or missing path is never reported as excluded
+  const p = normalizePath(filePath); // backslashes become forward slashes before matching
+  const excludes = getExcludes(projectRoot);
+  for (const g of excludes) {
+    if (typeof g === 'string' && minimatch(p, g, MM_OPTS)) return true; // non-string config entries are ignored
+  }
+  const extras = loadExtraPatternsFromFile();
+  for (const g of extras) {
+    if (minimatch(p, g, MM_OPTS)) return true;
+  }
+  return false;
+}
+export function chunkLooksGenerated(text) { // True when any GENERATED_MARKERS entry occurs within the first 500 characters of `text`.
+  if (!text) return false; // empty or missing text cannot carry a marker
+  const head = text.slice(0, 500); // only the head of the chunk is scanned
+  for (const marker of GENERATED_MARKERS) {
+    if (head.includes(marker)) return true; // plain case-sensitive substring test
+  }
+  return false;
+}
+export function applyIndexingChunkPolicy(chunks, options = {}) { // Split `chunks` into kept/skipped using one decision per file; returns { kept, skipped, stats }. Reads `options.projectRoot`.
+  if (process.env.SWEET_SEARCH_LI_SKIP_DISABLE === '1' || !Array.isArray(chunks)) { // policy switched off, or input is not an array: nothing is filtered
+    return { kept: chunks || [], skipped: [], stats: emptyStats() }; // NOTE(review): a truthy non-array `chunks` is handed back as `kept` unchanged — confirm callers expect that
+  }
+  const { projectRoot } = options;
+  const fileFirstReason = new Map(); // file -> 'excluded' | 'generated' | null
+  for (const chunk of chunks) {
+    if (!chunk?.file) continue; // entries without a `file` are never classified, so they end up in `kept`
+    if (fileFirstReason.has(chunk.file)) continue; // only the first chunk seen for a file decides for that whole file
+    let reason = null;
+    if (isExcludedByConfig(chunk.file, projectRoot)) {
+      reason = 'excluded';
+    } else {
+      const text = chunk.text || chunk.content || ''; // `text` wins over `content`; both missing means empty
+      if (chunkLooksGenerated(text)) reason = 'generated';
+    }
+    fileFirstReason.set(chunk.file, reason);
+  }
+  const kept = [];
+  const skipped = [];
+  const stats = emptyStats();
+  for (const chunk of chunks) {
+    const reason = chunk?.file ? fileFirstReason.get(chunk.file) : null;
+    if (reason) {
+      skipped.push(chunk);
+      stats[reason]++; // per-reason counters count chunks, not files
+      stats.totalSkipped++;
+    } else {
+      kept.push(chunk);
+    }
+  }
+  stats.skippedFiles = new Set(skipped.map((c) => c.file).filter(Boolean)).size; // distinct files among skipped chunks
+  stats.keptFiles = new Set(kept.map((c) => c.file).filter(Boolean)).size; // NOTE(review): `c.file` is not null-safe, unlike `chunk?.file` above — a null/undefined entry in `chunks` would reach `kept` and throw here
+  return { kept, skipped, stats };
+}
+function emptyStats() { // Build a fresh, zeroed stats object for applyIndexingChunkPolicy().
+  return {
+    excluded: 0, // chunks skipped with reason 'excluded'
+    generated: 0, // chunks skipped with reason 'generated'
+    totalSkipped: 0, // all skipped chunks
+    skippedFiles: 0, // distinct files among skipped chunks
+    keptFiles: 0, // distinct files among kept chunks
+  };
+}
+export const _internals = { // Internal handles bundled into one export: the marker list and the cache reset function.
+  GENERATED_MARKERS,
+  resetCache,
+};

package/core/indexing/li-skip-policy.js CHANGED Viewed

@@ -1,225 +1,16 @@
 /**
- * Late Interaction skip policy.
+ * Late Interaction skip policy compatibility exports.
  *
- * The embedding indexer and the LI reranker share a SINGLE unified skip list —
- * the `exclude` globs loaded by `loadProjectConfig()` from
- * `core/infrastructure/config/search.js`. That list is the authoritative
- * source of truth for vendored / build-output / lock / minified / secret /
- * binary noise, AND it includes any user extensions from
- * `.sweet-search.config.json`. The embed indexer at
- * `core/indexing/indexer-utils.js` passes the same list to fast-glob; we
- * match it here with `minimatch`.
- *
- * This module adds TWO things on top of the unified glob check that globs
- * alone cannot express:
- *
- *   1. `chunkLooksGenerated` — content-based detection of auto-generated files
- *      (starting with `// @generated` / `Code generated by` / `DO NOT EDIT`).
- *   2. `applyLiSkipPolicy` per-file token budget — a resource guard for the
- *      slow LI encoder so one huge JSON/grammar file can't blow the budget.
- *      This is a latency decision, not a semantic one.
- *
- * Configurable via:
- *   - SWEET_SEARCH_LI_SKIP_DISABLE=1     → disable all skip policy (LI everything)
- *   - SWEET_SEARCH_LI_SKIP_FILE=<path>   → extra GLOB patterns, one per line
- *                                          (# comments allowed; blank lines ignored)
- *   - SWEET_SEARCH_LI_MAX_FILE_TOKENS=N  → per-file token cap (default: 50_000)
- */
-import { existsSync, readFileSync } from 'fs';
-import { minimatch } from 'minimatch';
-import { loadProjectConfig } from '../infrastructure/config/index.js';
-const MM_OPTS = { dot: true, nocase: false };
-function normalizePath(p) {
-  return p.replace(/\\/g, '/');
-}
-/**
- * Cache the resolved exclude list keyed by projectRoot so we don't reload
- * `.sweet-search.config.json` on every chunk. Keyed by projectRoot string;
- * `null`/`undefined` collapse to a default `cwd` key.
- */
-const _excludesByRoot = new Map();
-function getExcludes(projectRoot) {
-  const key = projectRoot || '__cwd__';
-  let cached = _excludesByRoot.get(key);
-  if (cached) return cached;
-  const config = loadProjectConfig(projectRoot || process.cwd());
-  cached = Array.isArray(config.exclude) ? config.exclude : [];
-  _excludesByRoot.set(key, cached);
-  return cached;
-}
-/**
- * Test-only: clear the excludes cache. Not exported through the public
- * surface; reachable via `_internals.resetCache()`.
- */
-function resetCache() {
-  _excludesByRoot.clear();
-  _cachedExtraPatterns = null;
-}
-/**
- * Markers that indicate a file is auto-generated. We only check the FIRST
- * ~500 characters of a chunk's text since real generated headers always
- * appear at the top.
- */
-const GENERATED_MARKERS = [
-  '@generated',
-  'Code generated by',
-  'DO NOT EDIT',
-  'AUTO-GENERATED FILE',
-  'AUTOGENERATED FILE',
-  'This file is automatically generated',
-];
-const DEFAULT_MAX_FILE_TOKENS = 50_000;
-let _cachedExtraPatterns = null;
-function loadExtraPatternsFromFile() {
-  if (_cachedExtraPatterns !== null) return _cachedExtraPatterns;
-  const path = process.env.SWEET_SEARCH_LI_SKIP_FILE;
-  if (!path || !existsSync(path)) {
-    _cachedExtraPatterns = [];
-    return _cachedExtraPatterns;
-  }
-  try {
-    const lines = readFileSync(path, 'utf8').split(/\r?\n/);
-    _cachedExtraPatterns = lines
-      .map((l) => l.trim())
-      .filter((l) => l && !l.startsWith('#'));
-  } catch {
-    _cachedExtraPatterns = [];
-  }
-  return _cachedExtraPatterns;
-}
-/**
- * Check if a file path matches any exclude glob from the unified
- * `loadProjectConfig(projectRoot).exclude` list, plus any extra globs loaded
- * from `SWEET_SEARCH_LI_SKIP_FILE`.
- *
- * @param {string} filePath
- * @param {string} [projectRoot]
- */
-export function isExcludedByConfig(filePath, projectRoot) {
-  if (!filePath) return false;
-  const p = normalizePath(filePath);
-  const excludes = getExcludes(projectRoot);
-  for (const g of excludes) {
-    if (typeof g === 'string' && minimatch(p, g, MM_OPTS)) return true;
-  }
-  const extras = loadExtraPatternsFromFile();
-  for (const g of extras) {
-    if (minimatch(p, g, MM_OPTS)) return true;
-  }
-  return false;
-}
-/**
- * Check if a chunk's text starts with a generated-file marker.
- * Only inspects the first ~500 chars (these markers always appear at the top).
- */
-export function chunkLooksGenerated(text) {
-  if (!text) return false;
-  const head = text.slice(0, 500);
-  for (const marker of GENERATED_MARKERS) {
-    if (head.includes(marker)) return true;
-  }
-  return false;
-}
-/**
- * Filter a chunk list down to those eligible for late-interaction encoding.
- *
- * In the normal pipeline the embed indexer has already rejected most glob
- * matches at file-discovery time, so the glob check here is defense-in-depth:
- * it catches chunks produced by alternate code paths or config drift. The
- * content-based `chunkLooksGenerated` and the per-file token cap are the
- * checks that actually do new work.
- *
- * @param {Array<{file: string, text?: string, content?: string}>} chunks
- * @param {{ projectRoot?: string }} [options]
- * @returns {{ kept: Array, skipped: Array, stats: object }}
+ * The shared implementation lives in `indexing-file-policy.js` so embedding,
+ * sparse/BM25 artifacts, and LI all depend on one file-policy source.
  */
-export function applyLiSkipPolicy(chunks, options = {}) {
-  if (process.env.SWEET_SEARCH_LI_SKIP_DISABLE === '1' || !Array.isArray(chunks)) {
-    return { kept: chunks || [], skipped: [], stats: emptyStats() };
-  }
-  const { projectRoot } = options;
-  const maxFileTokens = parseInt(process.env.SWEET_SEARCH_LI_MAX_FILE_TOKENS || '', 10) > 0
-    ? parseInt(process.env.SWEET_SEARCH_LI_MAX_FILE_TOKENS, 10)
-    : DEFAULT_MAX_FILE_TOKENS;
-  // First pass: per-file token totals + per-chunk reasons.
-  const fileTokenTotal = new Map();
-  const fileFirstReason = new Map(); // file → reason ('excluded', 'generated', 'huge', null)
-  for (const chunk of chunks) {
-    if (!chunk?.file) continue;
-    if (fileFirstReason.has(chunk.file)) continue; // already classified
-    let reason = null;
-    if (isExcludedByConfig(chunk.file, projectRoot)) {
-      reason = 'excluded';
-    } else {
-      const text = chunk.text || chunk.content || '';
-      if (chunkLooksGenerated(text)) reason = 'generated';
-    }
-    fileFirstReason.set(chunk.file, reason);
-  }
-  // Per-file token totals (only for files not already classified for skip)
-  for (const chunk of chunks) {
-    if (!chunk?.file) continue;
-    if (fileFirstReason.get(chunk.file)) continue; // already skipped
-    const text = chunk.text || chunk.content || '';
-    // Cheap char/4 estimate; exact tokenization isn't needed here.
-    const est = Math.ceil(text.length / 4);
-    fileTokenTotal.set(chunk.file, (fileTokenTotal.get(chunk.file) || 0) + est);
-  }
-  for (const [file, total] of fileTokenTotal) {
-    if (total > maxFileTokens) fileFirstReason.set(file, 'huge');
-  }
-  // Second pass: split chunks into kept/skipped buckets using the file decisions.
-  const kept = [];
-  const skipped = [];
-  const stats = emptyStats();
-  for (const chunk of chunks) {
-    const reason = chunk?.file ? fileFirstReason.get(chunk.file) : null;
-    if (reason) {
-      skipped.push(chunk);
-      stats[reason]++;
-      stats.totalSkipped++;
-    } else {
-      kept.push(chunk);
-    }
-  }
-  // Distinct file counts (helpful for the summary line)
-  stats.skippedFiles = new Set(skipped.map((c) => c.file).filter(Boolean)).size;
-  stats.keptFiles = new Set(kept.map((c) => c.file).filter(Boolean)).size;
-  return { kept, skipped, stats };
-}
-function emptyStats() {
-  return {
-    excluded: 0,
-    generated: 0,
-    huge: 0,
-    totalSkipped: 0,
-    skippedFiles: 0,
-    keptFiles: 0,
-  };
-}
+export {
+  isExcludedByConfig,
+  chunkLooksGenerated,
+  _internals,
+} from './indexing-file-policy.js';
-// Exported for tests
-export const _internals = {
-  GENERATED_MARKERS,
-  DEFAULT_MAX_FILE_TOKENS,
-  resetCache,
-};
+export {
+  applyIndexingChunkPolicy as applyLiSkipPolicy,
+} from './indexing-file-policy.js';

package/core/infrastructure/codebase-repository.js CHANGED Viewed

@@ -78,6 +78,27 @@ export class CodebaseRepository {
     }
   }
+  /**
+   * Return all chunk metadata rows for a single file_path.
+   * Used by sweet-search read / read-semantic for symbol-aware metadata
+   * and for in-file candidate enumeration. Returns empty array if the file
+   * is not indexed, the DB is missing, or the table doesn't exist yet.
+   *
+   * Rows are read from the `vectors` table and ordered by `id`. Every
+   * exception raised while opening the database or running the query is
+   * caught and reported as an empty array, so callers cannot tell
+   * "no rows" apart from "lookup failed".
+   *
+   * @param {string} filePath - Project-relative file path as stored in vectors.file_path
+   * @returns {Array<{id, file_path, text, metadata}>}
+   */
+  getChunksByFilePath(filePath) {
+    if (!filePath) return []; // empty or missing path: skip the database entirely
+    try {
+      const db = this._open(); // NOTE(review): _open() is defined outside this hunk — confirm it owns the connection lifetime, since nothing is closed here
+      return db.prepare(
+        'SELECT id, file_path, text, metadata FROM vectors WHERE file_path = ? ORDER BY id'
+      ).all(filePath); // the path is bound as a statement parameter, not spliced into the SQL text
+    } catch {
+      return [];
+    }
+  }
   /**
    * Full vector scan in an ephemeral connection (no persistent state).
    * Used by the O(N) fallback path — opens, scans, closes immediately.

package/core/infrastructure/config/embedding.js CHANGED Viewed

@@ -245,8 +245,27 @@ export const EMBEDDING_CONFIG = {
     enabled: true,
     maxSize: 1000,
     vocabularyPath: DB_PATHS.vocabulary,
-    autoExpand: process.env.SWEET_SEARCH_VOCAB_AUTO_EXPAND !== '0',
+    // Whether `getEmbedding` consults the persistent query-vocabulary
+    // cache before calling the live model. Disable to force fresh
+    // model output on every query — required for reproducible
+    // benchmarks against a populated vocab file. Reads only; writes
+    // are gated separately by `autoExpand` below.
+    useVocabulary: process.env.SWEET_SEARCH_VOCAB_USE !== '0'
+      && process.env.SWEET_SEARCH_VOCAB_USE !== 'false',
+    // Whether queries that fire ≥ `expansionThreshold` times within a
+    // process are auto-promoted into the persistent vocabulary file.
+    autoExpand: process.env.SWEET_SEARCH_VOCAB_AUTO_EXPAND !== '0'
+      && process.env.SWEET_SEARCH_VOCAB_AUTO_EXPAND !== 'false',
     expansionThreshold: 3,
+    // Hard cap on auto-expanded vocabulary size. Once reached, new
+    // auto-promotions are skipped; explicit `addToVocabulary` /
+    // `expandVocabulary` calls still write through. Override with
+    // `SWEET_SEARCH_VOCAB_MAX_TERMS` (range 1..1e6).
+    maxTerms: (() => {
+      const v = parseInt(process.env.SWEET_SEARCH_VOCAB_MAX_TERMS || '', 10);
+      if (Number.isFinite(v) && v > 0 && v <= 1_000_000) return v;
+      return 10_000;
+    })(),
   },
   // All available providers for fallback

package/core/infrastructure/config/graph.js CHANGED Viewed

@@ -10,7 +10,7 @@
 export const HCGS_CONFIG = {
   // Summary generation
-  enabled: true,
+  enabled: false,
   // Hierarchy levels (bottom-up order)
   levels: ['function', 'method', 'field', 'class', 'interface', 'enum', 'package', 'file'],
@@ -54,7 +54,7 @@ export const HCGS_CONFIG = {
   cacheEnabled: true,
   // Token savings: return summary first, full code on "expand"
-  returnSummaryFirst: true,
+  returnSummaryFirst: false,
   summaryTokenBudget: 150,      // tokens per result in summary mode
   fullCodeTokenBudget: 1000,    // tokens per result in expanded mode
 };

package/core/infrastructure/config/ranking.js CHANGED Viewed

@@ -179,6 +179,14 @@ export const LATE_INTERACTION_CONFIG = {
       backboneDim: 768,                      // raw ModernBERT hidden size
       tokenDimension: 128,                   // final output after projection
       projectionPaths: ['1_Dense/model.safetensors'],  // 768→128 single stage
+      // Per-stage `out_features`. Length must equal projectionPaths.length.
+      // Consumed by the native Rust LI loader to validate safetensors shapes.
+      projectionDims: [128],
+      // Registry key for the FP32-safetensors variant of this model used by
+      // the native (candle / Metal / CUDA) inference path. The ORT-side INT8
+      // path uses the parent key (`lateon-code`) directly. See
+      // `core/infrastructure/native-inference.js::resolveNativeLiVariant`.
+      nativeRegistryKey: 'lateon-code-fp32',
       maxQueryLength: 256,
       get maxDocLength() {
         const env = parseInt(process.env.SWEET_SEARCH_LI_MAX_DOC_LENGTH || '', 10);
@@ -193,6 +201,8 @@ export const LATE_INTERACTION_CONFIG = {
       backboneDim: 256,                      // raw ModernBERT hidden size
       tokenDimension: 48,                    // final output after 2-stage projection
       projectionPaths: ['1_Dense/model.safetensors', '2_Dense/model.safetensors'],  // 256→512→48
+      projectionDims: [512, 48],
+      nativeRegistryKey: 'lateon-code-edge-fp32',
       maxQueryLength: 256,
       get maxDocLength() {
         const env = parseInt(process.env.SWEET_SEARCH_LI_MAX_DOC_LENGTH || '', 10);

package/core/infrastructure/config/vector-store.js CHANGED Viewed

@@ -64,7 +64,7 @@ export const BINARY_HNSW_CONFIG = {
     stage1Candidates: 1000,  // Binary HNSW retrieves top 1000
     stage2Candidates: 200,   // Int8 rescores top 200 (legacy fixed, used as maxStage2 fallback)
     stage2_5Candidates: 200, // Float rescore pool size (legacy fixed, used as maxStage2_5 fallback)
-    stage3Candidates: 20,    // Reranker sees top 20
+    stage3Candidates: 30,    // Reranker sees top 30 (validated 2026-05-03 on GenCodeSearchNet n=6000: +1.52pp R@10, +0.34pp MRR vs 20, no per-language regression)
     // Phase 1 flag: batched normalized-dot Stage 2 scoring.
     // When false, falls back to per-candidate int8CosineSimilarity.