npm - sweet-search - Versions diffs - 2.4.2 → 2.5.2 - Mend

sweet-search 2.4.2 → 2.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

package/core/cli.js +43 -5
package/core/embedding/embedding-cache.js +266 -18
package/core/embedding/embedding-service.js +45 -9
package/core/graph/graph-expansion.js +52 -12
package/core/graph/graph-extractor.js +30 -1
package/core/indexing/ast-chunker.js +331 -16
package/core/indexing/chunking/chunk-builder.js +34 -1
package/core/indexing/index-codebase-v21.js +31 -2
package/core/indexing/index.js +6 -3
package/core/indexing/indexer-ann.js +45 -6
package/core/indexing/indexer-build.js +9 -1
package/core/indexing/indexer-phases.js +6 -4
package/core/indexing/indexing-file-policy.js +140 -0
package/core/indexing/li-skip-policy.js +11 -220
package/core/infrastructure/codebase-repository.js +21 -0
package/core/infrastructure/config/embedding.js +20 -1
package/core/infrastructure/config/graph.js +2 -2
package/core/infrastructure/config/ranking.js +10 -0
package/core/infrastructure/config/vector-store.js +1 -1
package/core/infrastructure/coreml-cascade.js +236 -30
package/core/infrastructure/coreml-cascade.json +25 -0
package/core/infrastructure/index.js +17 -0
package/core/infrastructure/init-config.js +216 -0
package/core/infrastructure/language-patterns/registry-core.js +18 -0
package/core/infrastructure/model-registry.js +12 -0
package/core/infrastructure/native-inference.js +143 -51
package/core/infrastructure/tree-sitter-provider.js +92 -2
package/core/ranking/cascaded-scorer.js +6 -2
package/core/ranking/file-kind-ranking.js +264 -0
package/core/ranking/late-interaction-index.js +10 -4
package/core/ranking/late-interaction-policy.js +304 -0
package/core/search/context-expander.js +267 -28
package/core/search/index.js +4 -0
package/core/search/search-cli.js +3 -1
package/core/search/search-pattern.js +4 -3
package/core/search/search-postprocess.js +189 -8
package/core/search/search-read-semantic.js +734 -0
package/core/search/search-read.js +481 -0
package/core/search/search-server.js +153 -5
package/core/search/sweet-search.js +133 -16
package/core/start-server.js +13 -2
package/mcp/server.js +41 -0
package/mcp/tool-handlers.js +117 -6
package/package.json +9 -7
package/scripts/init.js +386 -5
package/scripts/uninstall.js +152 -6

package/core/infrastructure/tree-sitter-provider.js CHANGED Viewed

@@ -69,6 +69,28 @@ const BOUNDARY_TYPES = new Set([
   'class_specifier', 'namespace_definition',
 ]);
+// AST node types that represent function/class bodies. Used by
+// _extractSignature() to find where the declaration's body starts so
+// the signature span is everything before it (decorators + name +
+// parameters + return type, excluding body).
+const BODY_TYPES = new Set([
+  // JS/TS, Java, Go, Rust, Kotlin, Swift, C#, Ruby (sometimes)
+  'block', 'statement_block', 'class_body', 'function_body',
+  // C / C++ — function bodies; `field_declaration_list` is presumably the struct/class member list (TODO confirm against the grammar)
+  'compound_statement', 'field_declaration_list',
+  // Python uses `block` (already covered) but `:` precedes it, so the `:` ends up inside the signature span
+  // PHP — function/method body
+  'compound_statement_php',
+  // Swift / Kotlin — sometimes labelled differently
+  'enum_class_body', 'enum_body', 'interface_body',
+  // Rust impl/trait bodies
+  'declaration_list',
+]);
+// Maximum signature length (chars) after whitespace normalization.
+// Longer signatures are cut so the result is exactly this long, `…` being the last char.
+const MAX_SIGNATURE_LENGTH = 200;
 // Map tree-sitter node type -> our chunk type label
 const NODE_TYPE_MAP = {
   'function_declaration': 'function',
@@ -410,12 +432,23 @@ export class TreeSitterProvider {
   /**
    * Parse file content into semantic chunks using the cAST recursive algorithm.
    * Returns array of chunk objects or null if tree-sitter can't handle it.
+   *
+   * Header-aware budget (research-only ablation, May 2026): set
+   * SWEET_SEARCH_CHUNK_HEADER_OVERHEAD=N to subtract N chars from the
+   * cAST max chunk size, leaving room for the embedding-text headers
+   * (path / parent / symbol / language ≈ 50–100 chars) without spilling
+   * past the embedding cap. Default 0 = byte-identical to shipped. The
+   * audit motivating this lever lives in eval/results/chunk-overflow-audit.md.
    */
   async parseFileToChunks(content, languageId, options = {}) {
     const tree = await this.parse(content, languageId);
     if (!tree) return null;
-    const maxChunkSize = options.maxChunkSize || 2000;
+    const headerOverhead = (() => {
+      const v = parseInt(process.env.SWEET_SEARCH_CHUNK_HEADER_OVERHEAD || '', 10);
+      return Number.isFinite(v) && v >= 0 ? v : 0;
+    })();
+    const maxChunkSize = (options.maxChunkSize || 2000) - headerOverhead;
     this._chunkCounter = 0;
     const children = this._getChildren(tree.rootNode);
@@ -467,6 +500,7 @@ export class TreeSitterProvider {
         const firstBoundary = buffer.find(n => BOUNDARY_TYPES.has(n.type));
         const name = firstBoundary ? this._extractNodeName(firstBoundary) : null;
         const type = firstBoundary ? (NODE_TYPE_MAP[firstBoundary.type] || 'code') : 'code';
+        const signature = firstBoundary ? this._extractSignature(firstBoundary, content) : null;
         chunks.push({
           chunkId: this._nextChunkId(),
@@ -478,6 +512,7 @@ export class TreeSitterProvider {
           endLine: buffer[buffer.length - 1].endPosition.row,
           type,
           name: name || (buffer.length === 1 ? null : null),
+          signature,
         });
       }
       buffer = [];
@@ -536,6 +571,7 @@ export class TreeSitterProvider {
               endLine: node.endPosition.row,
               type: NODE_TYPE_MAP[node.type] || 'code',
               name: this._extractNodeName(node),
+              signature: this._extractSignature(node, content),
             });
           }
         }
@@ -546,6 +582,60 @@ export class TreeSitterProvider {
     return chunks;
   }
+  /**
+   * Extract a compact, single-line signature for a boundary AST node.
+   *
+   * Strategy: find the first body-like child (block / statement_block /
+   * compound_statement / class_body / declaration_list / …), and return
+   * the source span [node.startIndex, body.startIndex) with whitespace
+   * normalized to single spaces. The `body` field is tried first and is
+   * accepted only when its type is in BODY_TYPES; otherwise the direct
+   * children are scanned in order for the first BODY_TYPES member.
+   * If no body child is found (e.g. declarations without a body,
+   * abstract methods, interface members), or the body starts at the
+   * node's own start offset, the whole node text is used instead.
+   *
+   * The normalized text is capped at MAX_SIGNATURE_LENGTH characters;
+   * longer text is cut and ends with `…`.
+   *
+   * Returns null when `node` or `content` is falsy, when `node.type` is
+   * not in BOUNDARY_TYPES, or when the normalized text is empty.
+   *
+   * Used by the `signature` R1 embedding-text variant. Intentionally
+   * does NOT alter `text`, `li_text`, or `li_greedy_text` — signature
+   * surface is research-only on `embedding_text`.
+   *
+   * NOTE(review): `startIndex` / `endIndex` are used directly as
+   * `content.substring()` offsets — this assumes they are string
+   * offsets into `content`; confirm for sources with non-ASCII text.
+   *
+   * @param {object} node - AST node; reads `type`, `startIndex`,
+   *   `endIndex`, `childCount`, `child(i)` and, when present,
+   *   `childForFieldName('body')`.
+   * @param {string} content - source text the node offsets refer to.
+   * @returns {string|null} single-line signature, or null (see above).
+   */
+  _extractSignature(node, content) {
+    if (!node || !content) return null;
+    if (!BOUNDARY_TYPES.has(node.type)) return null;
+    let bodyStart = null;
+    // Try field-name lookup first (works for most modern grammars).
+    const bodyField = node.childForFieldName?.('body');
+    if (bodyField && BODY_TYPES.has(bodyField.type)) {
+      bodyStart = bodyField.startIndex;
+    } else {
+      // Fall back to scanning children for a body-shaped child.
+      for (let i = 0; i < node.childCount; i++) {
+        const child = node.child(i);
+        if (BODY_TYPES.has(child.type)) {
+          bodyStart = child.startIndex;
+          break;
+        }
+      }
+    }
+    let raw;
+    if (bodyStart != null && bodyStart > node.startIndex) {
+      raw = content.substring(node.startIndex, bodyStart);
+    } else {
+      // No body found, or the body starts where the node starts —
+      // declaration only (e.g. abstract method, type alias). Take the whole node text.
+      raw = content.substring(node.startIndex, node.endIndex);
+    }
+    // Normalize: collapse runs of whitespace (including newlines) to a
+    // single space, drop leading/trailing whitespace.
+    const normalized = raw.replace(/\s+/g, ' ').trim();
+    if (!normalized) return null;
+    if (normalized.length <= MAX_SIGNATURE_LENGTH) return normalized;
+    return normalized.slice(0, MAX_SIGNATURE_LENGTH - 1) + '…'; // keep total length at MAX_SIGNATURE_LENGTH
+  }
   /** Extract symbol name from an AST node */
   _extractNodeName(node) {
     // Try field name first (most reliable)
@@ -662,4 +752,4 @@ export function resetTreeSitterProvider() {
 }
 // Re-export constants for testing
-export { GRAMMAR_MAP, IDENT_TYPES, BOUNDARY_TYPES, NODE_TYPE_MAP, TAGS_QUERIES, CAPTURE_TO_ENTITY_TYPE };
+export { GRAMMAR_MAP, IDENT_TYPES, BOUNDARY_TYPES, BODY_TYPES, MAX_SIGNATURE_LENGTH, NODE_TYPE_MAP, TAGS_QUERIES, CAPTURE_TO_ENTITY_TYPE };

package/core/ranking/cascaded-scorer.js CHANGED Viewed

@@ -121,11 +121,15 @@ function partitionByTokenAvailability(candidates, liIndex) {
   if (!liIndex) {
     return { withTokens: [], withoutTokens: [...candidates] };
   }
-  const available = liIndex.hasTokens(candidates.map(c => c.id || c.entity_id));
+  // Graph-expanded candidates have entity_id-based public ids that don't
+  // match LI-indexed chunk ids; they carry the resolved chunk id under
+  // _liChunkId. Honour it so expanded candidates can participate in MaxSim.
+  const lookupId = (c) => c._liChunkId || c.id || c.entity_id;
+  const available = liIndex.hasTokens(candidates.map(lookupId));
   const withTokens = [];
   const withoutTokens = [];
   for (const c of candidates) {
-    (available.has(c.id || c.entity_id) ? withTokens : withoutTokens).push(c);
+    (available.has(lookupId(c)) ? withTokens : withoutTokens).push(c);
   }
   return { withTokens, withoutTokens };
 }

package/core/ranking/file-kind-ranking.js ADDED Viewed

@@ -0,0 +1,264 @@
+/**
+ * Intent-aware file-kind ranking (conservative variant).
+ *
+ * Background: real-codebase miss analysis found that documentation, test, and
+ * TypeScript-declaration files often outrank the implementation file users
+ * were actually looking for on multi-file codebases. The first version of
+ * this rule (commit f6fcfd1) lifted graph-2hop R@1 from 47.46 % → 64.41 %
+ * but catastrophically regressed GenCodeSearchNet under the dense profile
+ * (full-6 000 dense run: MRR@10 84.4 % → 47.4 %, Recall@5 92.0 % → 48.4 %).
+ * Root cause: the legacy LI-rerank pipeline assembles
+ * `results = [...liScored, ...tail]`, where `liScored` carries MaxSim
+ * scores that are sometimes *lower* (in absolute value) than the int8
+ * cosine scores already on the un-reranked tail. The concatenated list is
+ * therefore not globally score-monotonic. The old helper unconditionally
+ * spread and re-sorted *all* results by `score`, which floated the
+ * int8-only tail above the LI-reranked head and undid the rerank — even
+ * when every multiplier was 1 (GenCodeSearchNet is a single-source
+ * corpus, so no docs/tests/types kind ever matches there).
+ *
+ * Conservative variant fixes both regressions with three guards:
+ *
+ *   1. Confident-intent gating. `classifyFileKindIntent` now returns
+ *      `'unknown'` for queries with no implementation-seeking signal. Only
+ *      explicit `'implementation'` intent triggers demotion. `'unknown'`,
+ *      `'docs'`, `'tests'`, `'types'` are no-ops.
+ *
+ *   2. Structural skip. The rule looks at the top-N candidates (default 30).
+ *      If the window has zero docs/tests/types files (single-source corpus
+ *      like GCSN) or zero implementation files (nothing to promote), the
+ *      input is returned untouched. No re-sort, no new objects.
+ *
+ *   3. Window-bounded re-sort. When the rule does fire, only the top-N
+ *      window is re-ranked. The tail — where the rerank/non-rerank score-
+ *      scale boundary usually lives — is concatenated unchanged. This
+ *      keeps mixed-scale damage contained.
+ *
+ * Disable at runtime with `SWEET_SEARCH_FILE_KIND_RANKING=0`. Tune the soft
+ * factor with `SWEET_SEARCH_FILE_KIND_FACTOR` (default 0.85; range (0, 1]).
+ * Tune the window with `SWEET_SEARCH_FILE_KIND_WINDOW` (default 30).
+ */
+const DOCS_RE  = /\.md$|\.mdx$|\.rst$|(?:^|\/)docs?\//i; // .md / .mdx / .rst files, or any path with a doc/ or docs/ directory segment
+const TESTS_RE = /(?:^|\/)tests?\/|(?:^|\/)spec\/|\.test\.[a-z0-9]+$|_test\.[a-z0-9]+$|\.spec\.[a-z0-9]+$|_spec\.[a-z0-9]+$/i; // test/, tests/ or spec/ directory segments, or .test.* / _test.* / .spec.* / _spec.* file-name endings
+const TYPES_RE = /\.d\.ts$|(?:^|\/)types\//i; // .d.ts files, or any path with a types/ directory segment
+// Strong implementation-seeking signals. A query that fires one of these is
+// confidently asking for source code (classifyFileKindIntent checks the types/docs/tests patterns first); anything else is treated as `'unknown'`.
+// Curated to cover the validated guard-set queries plus common phrasings,
+// without matching pure descriptive corpus prose like "Convert XML to URL List".
+const IMPL_INTENT_RE = new RegExp(
+  '\\b(' + [
+    // English wh-questions about location/behaviour
+    'where', 'how does', 'how do',
+    // Definition / implementation phrasing
+    'implements?', 'implementation', 'defines?', 'definition', 'declared?',
+    // Code-structure nouns
+    'function', 'functions', 'method', 'methods', 'class', 'classes',
+    'constructor', 'module', 'library', 'crate', 'package',
+    // Verbs that strongly signal a code unit
+    'dispatch(?:es|er)?', 'handles?', 'handler', 'handlers',
+    'parses?', 'parser', 'parsers',
+    'router?', 'routes?', 'routing',
+    'register(?:s|ed|ing)?',
+    'builds?', 'builder', 'builders',
+    'generat(?:es?|or|ors|ed|ing)',
+    'creat(?:es?|or|ed|ion|ing)',
+    'loads?', 'loader',
+    'writes?', 'writer',
+    'reads?', 'reader',
+    'sends?', 'receives?',
+    'computes?', 'computed',
+    'encodes?', 'encoder', 'decodes?', 'decoder',
+    'transforms?', 'transformer',
+    'invokes?', 'calls?', 'returns?',
+    'valid(?:ate|ates|ator|ation)',
+    'serial(?:ize|izes|izer)', 'deserial(?:ize|izes|izer)',
+    'wrap(?:s|per|ped|ping)?',
+    'matchers?', 'matches?',
+    'printers?', 'prints?',
+    'searchers?', 'searches?',
+    // Specific terms common in real-repo guard queries
+    'callback', 'callbacks',
+    'factory', 'factories',
+    'controller', 'controllers',
+    'middleware',
+    'fallback', 'fallbacks',
+    'entrypoint', 'entry-point', 'main',
+    'init', 'initialise', 'initialize', 'initialiser', 'initializer',
+    'kernel', 'engine',
+    'wrapper', 'wrappers',
+    'singleton',
+    'factory', // NOTE(review): already listed a few lines above; the repeat is redundant in the alternation
+    'decorator', 'decorators',
+    'closure', 'closures',
+  ].join('|') + ')\\b', // alternatives are joined into one group anchored by word boundaries on both sides
+  'i',
+);
+const DOCS_INTENT_RE  = /\b(doc|docs|documentation|readme|guide|tutorial|reference|example)\b/i; // query words treated as docs-seeking
+const TESTS_INTENT_RE = /\b(test|tests|spec|specs|fixture|fixtures|mock|mocks)\b/i; // query words treated as tests-seeking
+const TYPES_INTENT_RE = /\b(type|types|interface|declaration|signature|typings|typedef)\b/i; // query words treated as types-seeking
+/**
+ * Detect the file kind from a result path.
+ *
+ * The path patterns are tried in the order docs → tests → types and the
+ * first match wins, so e.g. a `.md` file under `tests/` is reported as
+ * `'docs'`. A missing, empty, or non-string path is reported as
+ * `'implementation'`, as is any path that matches none of the patterns.
+ *
+ * @param {string} filePath - path to classify (patterns are case-insensitive)
+ * @returns {'docs'|'tests'|'types'|'implementation'}
+ */
+export function detectFileKind(filePath) {
+  if (!filePath || typeof filePath !== 'string') return 'implementation';
+  if (DOCS_RE.test(filePath))  return 'docs';
+  if (TESTS_RE.test(filePath)) return 'tests';
+  if (TYPES_RE.test(filePath)) return 'types';
+  return 'implementation';
+}
+/**
+ * Detect file-kind intent of a query along the docs/tests/types/implementation
+ * axis. Conservative: a query with no implementation-seeking signal returns
+ * `'unknown'`, and the helper treats `'unknown'` as a no-op (just like the
+ * docs/tests/types intents).
+ *
+ * The intent patterns are checked in the order types → docs → tests →
+ * implementation and the first one that matches decides the result, so a
+ * query containing both a types word and an implementation word is
+ * classified as `'types'`.
+ *
+ * @param {string} query - raw query text; null, undefined and '' yield
+ *                         `'unknown'`. Other values are lower-cased via
+ *                         `.toLowerCase()`, so a string is expected.
+ * @returns {'docs'|'tests'|'types'|'implementation'|'unknown'}
+ */
+export function classifyFileKindIntent(query) {
+  const q = (query || '').toLowerCase();
+  if (!q) return 'unknown';
+  // Type-seeking trumps test-seeking when both fire (existing convention); docs sits between them, implementation comes last.
+  if (TYPES_INTENT_RE.test(q)) return 'types';
+  if (DOCS_INTENT_RE.test(q))  return 'docs';
+  if (TESTS_INTENT_RE.test(q)) return 'tests';
+  if (IMPL_INTENT_RE.test(q))  return 'implementation';
+  return 'unknown';
+}
+function resolveFilePath(r) { // Return the first truthy path-like field of result `r`, or '' when none is set (also for null/undefined `r`).
+  return r?.file // top-level fields are tried first: file, file_path, path
+    || r?.file_path
+    || r?.path
+    || r?.metadata?.file // then the same three names under r.metadata
+    || r?.metadata?.file_path
+    || r?.metadata?.path
+    || '';
+}
+function envOff() { // True when SWEET_SEARCH_FILE_KIND_RANKING is exactly '0' or 'false' (strict, case-sensitive comparison).
+  return process.env.SWEET_SEARCH_FILE_KIND_RANKING === '0'
+      || process.env.SWEET_SEARCH_FILE_KIND_RANKING === 'false';
+}
+function envFactor(name, fallback) { // Read env var `name` as a float multiplier; returns `fallback` unless it parses to a finite value in (0, 1].
+  const v = process.env[name];
+  if (!v) return fallback; // unset or empty string
+  const n = Number.parseFloat(v);
+  return Number.isFinite(n) && n > 0 && n <= 1 ? n : fallback;
+}
+function envWindow(name, fallback) { // Read env var `name` as a base-10 integer; returns `fallback` unless it parses to a finite value > 0.
+  const v = process.env[name];
+  if (!v) return fallback; // unset or empty string
+  const n = Number.parseInt(v, 10);
+  return Number.isFinite(n) && n > 0 ? n : fallback;
+}
+const DEFAULT_FACTOR = 0.85; // fallback score multiplier when SWEET_SEARCH_FILE_KIND_FACTOR is unset or out of range
+const DEFAULT_WINDOW = 30; // fallback top-N window size when SWEET_SEARCH_FILE_KIND_WINDOW is unset or invalid
+/**
+ * Apply intent-aware file-kind score multipliers, then re-sort the top-N
+ * window descending. The original array is not mutated.
+ *
+ * Demotion fires only when:
+ *   - intent === 'implementation' (confident, NOT 'unknown'), AND
+ *   - the top-N window contains at least one docs/tests/types candidate, AND
+ *   - the top-N window contains at least one implementation candidate.
+ *
+ * In every other case the original `results` array is returned unchanged
+ * (same reference, no copy, no re-sort) — this is critical so the helper is
+ * a structural no-op on single-source corpora (GCSN) and on cascades whose
+ * top-N has no demotable competition.
+ *
+ * @param {Array} results - search results carrying .score and a file-path
+ *                          field (.file / .file_path / .path / .metadata.*).
+ * @param {Object} [opts]
+ * @param {string} [opts.query]            - raw query (used to infer intent
+ *                                            if opts.intent isn't supplied)
+ * @param {'docs'|'tests'|'types'|'implementation'|'unknown'} [opts.intent]
+ *                                            - explicit intent override
+ * @param {number} [opts.docFactor]        - default from env / 0.85
+ * @param {number} [opts.testFactor]       - default from env / 0.85
+ * @param {number} [opts.typeFactor]       - default from env / 0.85
+ * @param {number} [opts.window]           - top-N window for analysis +
+ *                                            bounded re-sort (default 30)
+ * @returns {Array} either the original `results` (no-op) or a new array
+ *                  whose head is sorted by adjusted score and whose tail is
+ *                  the unchanged input tail. Stable on ties.
+ */
+export function applyFileKindRanking(results, opts = {}) {
+  if (envOff()) return results;
+  if (!Array.isArray(results) || results.length === 0) return results;
+  const intent = opts.intent != null
+    ? opts.intent
+    : classifyFileKindIntent(opts.query || '');
+  // Conservative gate: only confident 'implementation' intent fires.
+  if (intent !== 'implementation') return results;
+  const window = opts.window != null
+    ? opts.window
+    : envWindow('SWEET_SEARCH_FILE_KIND_WINDOW', DEFAULT_WINDOW);
+  const windowSize = Math.min(window, results.length);
+  // Walk the window once: classify kinds and check for competition.
+  const kinds = new Array(windowSize);
+  let demotableCount = 0;
+  let implCount = 0;
+  for (let i = 0; i < windowSize; i++) {
+    const k = detectFileKind(resolveFilePath(results[i]));
+    kinds[i] = k;
+    if (k === 'docs' || k === 'tests' || k === 'types') demotableCount++;
+    else if (k === 'implementation') implCount++;
+  }
+  // Structural skip: nothing to demote, or nothing to promote.
+  if (demotableCount === 0 || implCount === 0) return results;
+  const factor = envFactor('SWEET_SEARCH_FILE_KIND_FACTOR', DEFAULT_FACTOR);
+  const docFactor  = opts.docFactor  != null ? opts.docFactor  : factor;
+  const testFactor = opts.testFactor != null ? opts.testFactor : factor;
+  const typeFactor = opts.typeFactor != null ? opts.typeFactor : factor;
+  const reranked = new Array(windowSize);
+  for (let i = 0; i < windowSize; i++) {
+    const r = results[i];
+    const kind = kinds[i];
+    let mult = 1;
+    if (kind === 'docs')  mult = docFactor;
+    else if (kind === 'tests') mult = testFactor;
+    else if (kind === 'types') mult = typeFactor;
+    const baseScore = (typeof r.score === 'number') ? r.score : 0;
+    reranked[i] = {
+      ...r,
+      _fileKindOrigScore: baseScore,
+      _fileKindMult: mult,
+      _fileKindKind: kind,
+      _fileKindOrigIndex: i,
+      score: baseScore * mult,
+    };
+  }
+  // Stable sort: descending score, tie-break on original index.
+  reranked.sort((a, b) => {
+    const d = (b.score || 0) - (a.score || 0);
+    return d !== 0 ? d : a._fileKindOrigIndex - b._fileKindOrigIndex;
+  });
+  for (const r of reranked) delete r._fileKindOrigIndex;
+  // Concatenate unchanged tail. The cascade's CE/MaxSim score-scale
+  // boundary typically lives near rank `ceTopK`, so leaving rank
+  // `windowSize`+ untouched contains the damage from any cross-scale
+  // re-sort that might happen inside the window.
+  if (windowSize === results.length) return reranked;
+  return reranked.concat(results.slice(windowSize));
+}

package/core/ranking/late-interaction-index.js CHANGED Viewed

@@ -1414,10 +1414,16 @@ export class LateInteractionIndex {
     // don't support importance weighting, so we must use the JS-tier weighted path.
     const nativeScored = new Set();
+    // Resolve a doc-lookup ID for each candidate. Graph-expanded candidates
+    // carry `_liChunkId` (a chunk id pointing into the LI index) while their
+    // public `id` is the entity id from the code graph. Honouring _liChunkId
+    // lets expanded candidates participate in MaxSim rerank.
+    const docIdOf = (c) => c._liChunkId || c.id;
     if (useFlatPath && !this.useTokenWeights) {
       const groups = { bit4: [], perToken: [], perDoc: [] };
       for (const candidate of toScore) {
-        const doc = this.documents.get(candidate.id);
+        const doc = this.documents.get(docIdOf(candidate));
         if (!doc) continue;
         if (doc.quantBits === 4 && doc.minArray && doc.tokenNorms) {
           groups.bit4.push({ candidate, doc });
@@ -1453,7 +1459,7 @@ export class LateInteractionIndex {
     // Try WASM fused kernels first (avoids JS-side dequant), fall back to JS dequant + wasmMaxSimF32.
     for (const candidate of toScore) {
       if (nativeScored.has(candidate.id)) continue;
-      const doc = this.documents.get(candidate.id);
+      const doc = this.documents.get(docIdOf(candidate));
       if (!doc) { pushFallback(candidate); continue; }
       if (useFlatPath) {
@@ -1488,7 +1494,7 @@ export class LateInteractionIndex {
         }
         // JS dequant → WASM f32 or JS fallback
-        const flatData = this.getTokensFlat(candidate.id);
+        const flatData = this.getTokensFlat(docIdOf(candidate));
         if (flatData) {
           pushScored(candidate, this.maxSimScoreFlat(
             effectiveQueryTokens, flatData.flat, flatData.numTokens, flatData.dim,
@@ -1498,7 +1504,7 @@ export class LateInteractionIndex {
           pushFallback(candidate);
         }
       } else {
-        const docTokens = this.getTokens(candidate.id);
+        const docTokens = this.getTokens(docIdOf(candidate));
         if (docTokens) {
           pushScored(candidate, this.maxSimScore(effectiveQueryTokens, docTokens, pruneOpts));
         } else {