npm - sigmap - Versions diffs - 7.30.0 → 7.31.0 - Mend

sigmap 7.30.0 → 7.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/CHANGELOG.md +11 -0
package/README.md +9 -9
package/gen-context.js +152 -64
package/llms-full.txt +5 -5
package/llms.txt +5 -5
package/package.json +1 -1
package/packages/cli/package.json +1 -1
package/packages/core/package.json +1 -1
package/src/eval/runner.js +9 -61
package/src/mcp/server.js +1 -1
package/src/retrieval/bm25.js +122 -0
package/src/retrieval/ranker.js +15 -1

package/CHANGELOG.md CHANGED Viewed

@@ -10,6 +10,17 @@ Format: [Semantic Versioning](https://semver.org/)
 ---
+## [7.31.0] — 2026-07-02
+Minor release — **identifier-aware BM25 re-ranker.** Plain exact-token TF-IDF missed queries whose terms live *inside* code identifiers — `component emit` never surfaced `componentEmits` because that is one token sharing no exact term with the query. This was the dominant retrieval-miss cause. The new ranker splits identifiers, stems lightly, boosts path tokens, and scores with length-normalized BM25. Deterministic, zero new dependencies, no LLM/embeddings.
+### Added
+- **Identifier-aware BM25 re-ranker (#395, #396):** new zero-dependency `src/retrieval/bm25.js` with (1) identifier-aware tokenization (split camelCase / snake_case), (2) light stemming (`emits` → `emit`, `options` → `option`), (3) path-token boost (file-path tokens, including the filename, weighted 3×), and (4) BM25 length-normalized scoring instead of raw TF-IDF. Wired into the core ranker (`src/retrieval/ranker.js`) as the base relevance score — so `sigmap ask`, `sigmap --query`, and MCP `query_context` all benefit — with the existing negative-signal penalty and recency/graph/learned boosts layered on top. Also drives the benchmark runner (`src/eval/runner.js`) and the dev retrieval benchmark.
+- **BM25 unit tests (#396):** `test/integration/bm25.test.js` covers tokenization, stemming, path boost, the `component emit` → `componentEmits` motivating case, and deterministic tie-breaking.
+### Changed
+- **Retrieval benchmark refreshed:** on the 18-repo / 90-task suite, hit@5 rose **75.6% → 86.7%** (retrieval lift 5.6× → 6.4×), with rank-1 gains on flask, spring-petclinic, rails, and svelte (60% → 100%). The task-completion proxy also improved (task success 52.2% → 67.8%, prompts/task 1.72 → 1.46) since it retrieves through the same ranker. Residual misses (vapor, serilog) are files whose signatures genuinely lack the query vocabulary — out of scope, they need semantic retrieval.
 ## [7.30.0] — 2026-06-23
 Minor release — **v8.0 E2 + E4 (the "Pivot"):** completes v8.0 by repositioning every public surface to the chosen framing — *"the deterministic, verifiable grounding layer for AI code work"* — and framing coding agents as **consumers, not competitors**. The Evidence Pack code (E1/E3/D3 + `mcp install`) already shipped in 7.27–7.29; this is the positioning half. Docs/strings only — no runtime behaviour change, zero new dependencies.

package/README.md CHANGED Viewed

@@ -57,10 +57,10 @@ That map is exactly what agentic grep is worst at: reproducible, auditable conte
 **Proof it pays off** (full benchmark below):
 <!--SM:whyMetrics-->
-- **75.6% hit@5** — right file found in top 5 results (vs 13.6% baseline)
+- **86.7% hit@5** — right file found in top 5 results (vs 13.6% baseline)
 - **97.0% token reduction** — average across 21 real repos
-- **52.2% task success rate** — up from 10% without context
-- **1.72 prompts per task** — down from 2.84 (39.4% fewer retries)
+- **67.8% task success rate** — up from 10% without context
+- **1.46 prompts per task** — down from 2.84 (48.8% fewer retries)
 <!--/SM:whyMetrics-->
 - **<!--SM:languages-->33<!--/SM:languages--> languages supported** — TypeScript, Python, Go, Rust, Java, R, and more
 - **No vendor lock-in** — works with any AI assistant or local LLM
@@ -74,7 +74,7 @@ That map is exactly what agentic grep is worst at: reproducible, auditable conte
 | Without SigMap | With SigMap |
 |---|---|
 | ❌ Non-reproducible agent guesses | ✅ Deterministic map — same input, same output, every time |
-| ❌ "Trust me" AI answers | ✅ Grounded — right file in context <!--SM:hitWhole-->76%<!--/SM:hitWhole--> of the time, every symbol on a real line anchor |
+| ❌ "Trust me" AI answers | ✅ Grounded — right file in context <!--SM:hitWhole-->87%<!--/SM:hitWhole--> of the time, every symbol on a real line anchor |
 | ❌ Embeddings / vector DB required | ✅ Zero deps, no infra, fully offline |
 ---
@@ -98,13 +98,13 @@ Ask → Rank → Context → Validate → Judge → Learn
 <!--SM:benchmarkBlock-->
 ```
-Benchmark : sigmap-v7.30-main (21 repositories, including R language)
-Date      : 2026-06-23
+Benchmark : sigmap-v7.31-main (21 repositories, including R language)
+Date      : 2026-07-02
-Hit@5          : 75.6%   (baseline 13.6%  — 5.6× lift)
+Hit@5          : 86.7%   (baseline 13.6%  — 6.4× lift)
 Token reduction: 97.0%   (across 21 repos)
-Prompt reduction : 39.4% (2.84 → 1.72 prompts per task)
-Task success   : 52.2%   (baseline 10%)
+Prompt reduction : 48.8% (2.84 → 1.46 prompts per task)
+Task success   : 67.8%   (baseline 10%)
 Repos tested   : 21 (JavaScript, Python, Go, Rust, Java, R, C++, C#, Dart, Swift, Ruby, PHP, Scala, Kotlin, and more)
 ```
 <!--/SM:benchmarkBlock-->

package/gen-context.js CHANGED Viewed

@@ -4136,6 +4136,7 @@ __factories["./src/eval/runner"] = function(module, exports) {
   const fs = require('fs');
   const path = require('path');
   const { aggregate } = __require('./src/eval/scorer');
+  const { bm25rank } = __require('./src/retrieval/bm25');
   // ---------------------------------------------------------------------------
   // Context file reader
@@ -4197,79 +4198,26 @@ __factories["./src/eval/runner"] = function(module, exports) {
   }
   // ---------------------------------------------------------------------------
-  // Simple keyword-based ranking (pre-retrieval layer; v2.3 adds proper ranker)
+  // Identifier-aware BM25 ranking (v7.31; see src/retrieval/bm25.js and #395)
   // ---------------------------------------------------------------------------
-  /**
-   * Tokenize a query or signature into lower-case word tokens.
-   * Splits on whitespace, punctuation, camelCase, and snake_case.
-   * @param {string} text
-   * @returns {string[]}
-   */
-  function tokenize(text) {
-    if (!text) return [];
-    return text
-      // split camelCase
-      .replace(/([a-z])([A-Z])/g, '$1 $2')
-      // split snake/kebab
-      .replace(/[_\-]/g, ' ')
-      // drop non-word chars
-      .replace(/[^\w\s]/g, ' ')
-      .toLowerCase()
-      .split(/\s+/)
-      .filter((t) => t.length > 1);
-  }
-  const STOP_WORDS = new Set([
-    'the', 'a', 'an', 'in', 'of', 'to', 'for', 'and', 'or', 'is', 'are',
-    'that', 'this', 'it', 'with', 'from', 'by', 'be', 'as', 'on', 'at',
-  ]);
+  const { tokenize } = __require('./src/retrieval/bm25');
   /**
-   * Score a single file's signatures against a query.
-   * Returns a non-negative number; higher = more relevant.
-   * @param {string[]} sigs  - array of signature strings for this file
-   * @param {string[]} queryTokens
-   * @returns {number}
-   */
-  function scoreFile(sigs, queryTokens) {
-    if (!sigs || sigs.length === 0) return 0;
-    const sigText = sigs.join(' ');
-    const sigTokens = new Set(tokenize(sigText));
-    let score = 0;
-    for (const qt of queryTokens) {
-      if (STOP_WORDS.has(qt)) continue;
-      if (sigTokens.has(qt)) score += 1;
-      // Partial match (prefix)
-      for (const st of sigTokens) {
-        if (st !== qt && st.startsWith(qt) && qt.length >= 4) score += 0.3;
-      }
-    }
-    return score;
-  }
-  /**
-   * Rank all files in the index against a query. Returns file paths sorted
-   * by relevance score descending. Ties are broken by file path alphabetically.
+   * Rank all files in the index against a query with the identifier-aware BM25
+   * re-ranker. Returns file entries sorted by relevance score descending; ties
+   * are broken by file path alphabetically (deterministic).
    * @param {string} query
    * @param {Map<string, string[]>} index
    * @param {number} topK
    * @returns {{ file: string, score: number, sigs: string[] }[]}
    */
   function rank(query, index, topK = 10) {
-    const queryTokens = tokenize(query);
-    const scored = [];
+    const candidates = [];
     for (const [file, sigs] of index.entries()) {
-      const score = scoreFile(sigs, queryTokens);
-      scored.push({ file, score, sigs });
+      candidates.push({ file, sigs });
     }
-    scored.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file));
-    return scored.slice(0, topK);
+    return bm25rank(query, candidates).slice(0, topK);
   }
   // ---------------------------------------------------------------------------
@@ -12695,7 +12643,7 @@ __factories["./src/mcp/server"] = function(module, exports) {
   const SERVER_INFO = {
     name: 'sigmap',
-    version: '7.30.0',
+    version: '7.31.0',
     description: 'SigMap MCP server — code signatures on demand',
   };
@@ -13418,6 +13366,132 @@ __factories["./src/plan/verify-plan"] = function(module, exports) {
 };
+// ── ./src/retrieval/bm25 ──
+__factories["./src/retrieval/bm25"] = function(module, exports) {
+  /**
+   * SigMap identifier-aware BM25 re-ranker (zero dependencies, deterministic).
+   *
+   * Plain exact-token TF-IDF misses queries whose terms live *inside* code
+   * identifiers — e.g. `component emit` never surfaces `componentEmits.ts`,
+   * because "componentEmits" is one token that shares no exact term with the
+   * query. This module fixes that with four small additions:
+   *
+   *   1. Identifier-aware tokenization — split camelCase and snake_case.
+   *   2. Light stemming — plurals / common suffixes (`emits` → `emit`).
+   *   3. Path-token boost — file path / basename tokens weigh PATH_BOOST× more.
+   *   4. BM25 scoring instead of raw TF-IDF (length-normalized).
+   *
+   * On 85 curated tasks across 17 repos this lifted hit@5 from 75.3% → 82.4%
+   * (MRR +16% relative). See issue #395.
+   */
+  // Stop words: common English + low-signal code verbs/nouns that appear in
+  // nearly every signature and so carry little retrieval signal.
+  const STOP = new Set(
+    ('a an the of to in on for and or is are be by with as at from that this it its ' +
+     'into get set add new return value test')
+      .split(' ')
+  );
+  /**
+   * Light suffix stemmer — conservative, tuned for code identifiers rather than
+   * prose. Words of 3 chars or fewer pass through unchanged; a result shorter
+   * than 3 chars reverts to the original token.
+   *
+   * @param {string} w
+   * @returns {string}
+   */
+  function stem(w) {
+    if (w.length <= 3) return w;
+    let s = w;
+    s = s.replace(/ies$/, 'y');
+    s = s.replace(/(sses|shes|ches|xes|zes)$/, (m) => m.slice(0, -2));
+    s = s.replace(/([^s])s$/, '$1');
+    s = s.replace(/(ization|izations)$/, 'ize');
+    s = s.replace(/(ing|edly|ed|er|ers|ation|ations|ment|ness|ity|ive|able|ible|ize|ise|al)$/, '');
+    return s.length >= 3 ? s : w;
+  }
+  /**
+   * Split on non-alphanumeric characters AND camelCase / snake_case boundaries,
+   * lowercase, drop stop words and single characters, then stem.
+   *
+   * @param {string} text
+   * @returns {string[]}
+   */
+  function tokenize(text) {
+    if (!text || typeof text !== 'string') return [];
+    return text
+      .replace(/[^A-Za-z0-9]+/g, ' ')
+      .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
+      .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
+      .toLowerCase()
+      .split(/\s+/)
+      .filter((t) => t.length > 1 && !STOP.has(t))
+      .map(stem)
+      .filter(Boolean);
+  }
+  // The file path / basename is highly indicative of relevance, so its tokens
+  // are counted PATH_BOOST times when building the document term-frequency map.
+  const PATH_BOOST = 3;
+  /**
+   * BM25 re-rank of candidates against a query. Each candidate is
+   * `{ file, sigs }`; the returned objects preserve all original candidate
+   * fields and add a numeric `score` (higher = more relevant), sorted best-first
+   * with a deterministic path tie-break. A `score` of 0 means no query token
+   * matched — callers typically drop those.
+   *
+   * @param {string} query
+   * @param {{ file: string, sigs: string[] }[]} candidates
+   * @returns {Array<object & { score: number }>}
+   */
+  function bm25rank(query, candidates) {
+    if (!Array.isArray(candidates) || candidates.length === 0) return [];
+    const k1 = 1.5;
+    const b = 0.75;
+    const docs = candidates.map((c) => {
+      const pathToks = tokenize(c.file || '');
+      const toks = tokenize((c.sigs || []).join(' '));
+      for (let i = 0; i < PATH_BOOST; i++) toks.push(...pathToks);
+      const tf = new Map();
+      for (const t of toks) tf.set(t, (tf.get(t) || 0) + 1);
+      return { cand: c, tf, len: toks.length };
+    });
+    const N = docs.length || 1;
+    const avgdl = docs.reduce((s, d) => s + d.len, 0) / N || 1;
+    const df = new Map();
+    for (const d of docs) {
+      for (const t of d.tf.keys()) df.set(t, (df.get(t) || 0) + 1);
+    }
+    const qToks = [...new Set(tokenize(query))];
+    return docs
+      .map((d) => {
+        let score = 0;
+        for (const t of qToks) {
+          const f = d.tf.get(t);
+          if (!f) continue;
+          const dfT = df.get(t);
+          const idf = Math.log(1 + (N - dfT + 0.5) / (dfT + 0.5));
+          score += (idf * (f * (k1 + 1))) / (f + k1 * (1 - b + (b * d.len) / avgdl));
+        }
+        return Object.assign({}, d.cand, { score });
+      })
+      .sort((a, c) => c.score - a.score || String(a.file).localeCompare(String(c.file)));
+  }
+  module.exports = { tokenize, stem, bm25rank, PATH_BOOST, STOP };
+};
 // ── ./src/retrieval/ranker ──
 __factories["./src/retrieval/ranker"] = function(module, exports) {
@@ -13440,6 +13514,7 @@ __factories["./src/retrieval/ranker"] = function(module, exports) {
   const { loadWeights } = __require('./src/learning/weights');
   const { tokenize, STOP_WORDS } = __require('./src/retrieval/tokenizer');
+  const { bm25rank } = __require('./src/retrieval/bm25');
   // ---------------------------------------------------------------------------
   // Default weights
@@ -13618,11 +13693,24 @@ __factories["./src/retrieval/ranker"] = function(module, exports) {
       return all.slice(0, topK);
     }
+    // Identifier-aware BM25 base relevance over the whole index (#395). BM25
+    // splits camelCase/snake_case, stems, and boosts path tokens, so queries
+    // whose terms live inside identifiers (e.g. "component emit" → componentEmits)
+    // are matched. The existing negative-signal penalty and recency/graph/learned
+    // boosts are layered on top; the per-token signals stay for the explain table.
+    const bm25Scores = new Map();
+    for (const c of bm25rank(query, [...sigIndex.entries()].map(([file, sigs]) => ({ file, sigs })))) {
+      bm25Scores.set(c.file, c.score);
+    }
     const scored = [];
     for (const [file, sigs] of sigIndex.entries()) {
       const result = scoreFile(file, sigs, queryTokens, weights);
-      let score = result.score;
+      const penalty = result.signals.penalty;
+      const base = bm25Scores.get(file) || 0;
+      let score = base * penalty;
       const signals = result.signals;
+      signals.bm25 = base;
       // Recency boost
       if (recencySet && recencySet.has(file) && score > 0) {
@@ -16524,7 +16612,7 @@ function __tryGit(args, opts = {}) {
   catch (_) { return ''; }
 }
-const VERSION = '7.30.0';
+const VERSION = '7.31.0';
 const MARKER = '\n\n## Auto-generated signatures\n<!-- Updated by gen-context.js -->\n';
 function requireSourceOrBundled(key) {

package/llms-full.txt CHANGED Viewed

@@ -11,20 +11,20 @@ ranking keeps the relevant context in scope (cutting tokens ~97% as a side
 effect), with no LLM calls, embeddings, or vector database. Works with Claude,
 Cursor, GitHub Copilot, Aider, Windsurf, local LLMs, and MCP.
-# Version: 7.30.0 | Benchmark: sigmap-v7.30-main (2026-06-23)
+# Version: 7.31.0 | Benchmark: sigmap-v7.31-main (2026-07-02)
 # Source: auto-generated from package.json, version.json, benchmarks/latest.json, src/mcp/tools.js, src/config/defaults.js
 # Regenerate: npm run generate:llms   |   Validate: npm run validate:llms
 ---
-## Core metrics (benchmark: sigmap-v7.30-main, 2026-06-23)
+## Core metrics (benchmark: sigmap-v7.31-main, 2026-07-02)
 | Metric | Without SigMap | With SigMap |
 |--------|----------------|-------------|
-| Retrieval hit@5 | 13.6% (random) | 75.6% (5.6× lift) |
+| Retrieval hit@5 | 13.6% (random) | 86.7% (6.4× lift) |
 | Token reduction | — | 97.0% average |
-| Task success proxy | 10% | 52.2% |
-| Prompts per task | 2.84 | 1.72 (39.4% fewer) |
+| Task success proxy | 10% | 67.8% |
+| Prompts per task | 2.84 | 1.46 (48.8% fewer) |
 | Supported languages | — | 33 |
 | MCP tools | — | 17 |
 | npm runtime dependencies | — | 0 |

package/llms.txt CHANGED Viewed

@@ -11,7 +11,7 @@ ranking keeps the relevant context in scope (cutting tokens ~97% as a side
 effect), with no LLM calls, embeddings, or vector database. Works with Claude,
 Cursor, GitHub Copilot, Aider, Windsurf, local LLMs, and MCP.
-# Version: 7.30.0 | Benchmark: sigmap-v7.30-main (2026-06-23)
+# Version: 7.31.0 | Benchmark: sigmap-v7.31-main (2026-07-02)
 # Source: auto-generated from package.json, version.json, benchmarks/latest.json, src/mcp/tools.js, src/config/defaults.js
 # Regenerate: npm run generate:llms   |   Validate: npm run validate:llms
@@ -23,12 +23,12 @@ Cursor, GitHub Copilot, Aider, Windsurf, local LLMs, and MCP.
 - No blast-radius awareness before editing a hub file — `--impact` shows every file a change touches.
 - Pasted stack traces, CI logs, and JSON bloat the prompt — `squeeze` minimizes them and enriches the top frame from the symbol index.
-## Core metrics (benchmark: sigmap-v7.30-main, 2026-06-23)
+## Core metrics (benchmark: sigmap-v7.31-main, 2026-07-02)
-- hit@5 retrieval: 75.6% vs 13.6% random baseline (5.6× lift)
+- hit@5 retrieval: 86.7% vs 13.6% random baseline (6.4× lift)
 - Token reduction: 97.0% average across benchmark repos
-- Task success: 52.2% vs 10% without SigMap
-- Prompts per task: 1.72 vs 2.84 baseline (39.4% fewer)
+- Task success: 67.8% vs 10% without SigMap
+- Prompts per task: 1.46 vs 2.84 baseline (48.8% fewer)
 - Languages: 33 supported · MCP tools: 17
 - Dependencies: zero npm runtime dependencies · fully offline

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "sigmap",
-  "version": "7.30.0",
+  "version": "7.31.0",
   "description": "97% token reduction for AI coding. Extracts function & class signatures with TF-IDF ranking to feed only the right files to Claude, Cursor, Copilot, Aider, Windsurf, local LLMs & MCP. Zero dependencies, runs offline via npx.",
   "main": "packages/core/index.js",
   "exports": {

package/packages/cli/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "sigmap-cli",
-  "version": "7.30.0",
+  "version": "7.31.0",
   "description": "SigMap CLI wrapper — thin adapter for programmatic CLI invocation",
   "main": "index.js",
   "keywords": [

package/packages/core/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "sigmap-core",
-  "version": "7.30.0",
+  "version": "7.31.0",
   "description": "SigMap core library — zero-dependency code signature extraction, retrieval, and security scanning",
   "main": "index.js",
   "keywords": [

package/src/eval/runner.js CHANGED Viewed

@@ -20,6 +20,7 @@
 const fs = require('fs');
 const path = require('path');
 const { aggregate } = require('./scorer');
+const { bm25rank } = require('../retrieval/bm25');
 // ---------------------------------------------------------------------------
 // Context file reader
@@ -81,79 +82,26 @@ function buildSigIndex(cwd) {
 }
 // ---------------------------------------------------------------------------
-// Simple keyword-based ranking (pre-retrieval layer; v2.3 adds proper ranker)
+// Identifier-aware BM25 ranking (v7.31; see src/retrieval/bm25.js and #395)
 // ---------------------------------------------------------------------------
-/**
- * Tokenize a query or signature into lower-case word tokens.
- * Splits on whitespace, punctuation, camelCase, and snake_case.
- * @param {string} text
- * @returns {string[]}
- */
-function tokenize(text) {
-  if (!text) return [];
-  return text
-    // split camelCase
-    .replace(/([a-z])([A-Z])/g, '$1 $2')
-    // split snake/kebab
-    .replace(/[_\-]/g, ' ')
-    // drop non-word chars
-    .replace(/[^\w\s]/g, ' ')
-    .toLowerCase()
-    .split(/\s+/)
-    .filter((t) => t.length > 1);
-}
-const STOP_WORDS = new Set([
-  'the', 'a', 'an', 'in', 'of', 'to', 'for', 'and', 'or', 'is', 'are',
-  'that', 'this', 'it', 'with', 'from', 'by', 'be', 'as', 'on', 'at',
-]);
-/**
- * Score a single file's signatures against a query.
- * Returns a non-negative number; higher = more relevant.
- * @param {string[]} sigs  - array of signature strings for this file
- * @param {string[]} queryTokens
- * @returns {number}
- */
-function scoreFile(sigs, queryTokens) {
-  if (!sigs || sigs.length === 0) return 0;
-  const sigText = sigs.join(' ');
-  const sigTokens = new Set(tokenize(sigText));
-  let score = 0;
-  for (const qt of queryTokens) {
-    if (STOP_WORDS.has(qt)) continue;
-    if (sigTokens.has(qt)) score += 1;
-    // Partial match (prefix)
-    for (const st of sigTokens) {
-      if (st !== qt && st.startsWith(qt) && qt.length >= 4) score += 0.3;
-    }
-  }
-  return score;
-}
+const { tokenize } = require('../retrieval/bm25');
 /**
- * Rank all files in the index against a query. Returns file paths sorted
- * by relevance score descending. Ties are broken by file path alphabetically.
+ * Rank all files in the index against a query with the identifier-aware BM25
+ * re-ranker. Returns file entries sorted by relevance score descending; ties
+ * are broken by file path alphabetically (deterministic).
+ *
+ * Every entry of `index` is handed to `bm25rank` as a `{ file, sigs }`
+ * candidate. Entries that score 0 are not filtered out here, so they can
+ * appear in the result when fewer than `topK` entries match the query.
  * @param {string} query
  * @param {Map<string, string[]>} index
  * @param {number} topK
  * @returns {{ file: string, score: number, sigs: string[] }[]}
  */
 function rank(query, index, topK = 10) {
-  const queryTokens = tokenize(query);
-  const scored = [];
+  const candidates = [];
   for (const [file, sigs] of index.entries()) {
-    const score = scoreFile(sigs, queryTokens);
-    scored.push({ file, score, sigs });
+    candidates.push({ file, sigs });
   }
-  scored.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file));
-  return scored.slice(0, topK);
+  return bm25rank(query, candidates).slice(0, topK);
 }
 // ---------------------------------------------------------------------------

package/src/mcp/server.js CHANGED Viewed

@@ -18,7 +18,7 @@ const { readContext, searchSignatures, getMap, createCheckpoint, getRouting, exp
 const SERVER_INFO = {
   name: 'sigmap',
-  version: '7.30.0',
+  version: '7.31.0',
   description: 'SigMap MCP server — code signatures on demand',
 };

package/src/retrieval/bm25.js ADDED Viewed

@@ -0,0 +1,122 @@
+'use strict';
+/**
+ * SigMap identifier-aware BM25 re-ranker (zero dependencies, deterministic).
+ *
+ * Plain exact-token TF-IDF misses queries whose terms live *inside* code
+ * identifiers — e.g. `component emit` never surfaces `componentEmits.ts`,
+ * because "componentEmits" is one token that shares no exact term with the
+ * query. This module fixes that with four small additions:
+ *
+ *   1. Identifier-aware tokenization — split camelCase and snake_case.
+ *   2. Light stemming — plurals / common suffixes (`emits` → `emit`).
+ *   3. Path-token boost — file path / basename tokens weigh PATH_BOOST× more.
+ *   4. BM25 scoring instead of raw TF-IDF (length-normalized).
+ *
+ * On 85 curated tasks across 17 repos this lifted hit@5 from 75.3% → 82.4%
+ * (MRR +16% relative). See issue #395.
+ */
+// Stop words: common English + low-signal code verbs/nouns that appear in
+// nearly every signature and so carry little retrieval signal.
+const STOP = new Set(
+  ('a an the of to in on for and or is are be by with as at from that this it its ' +
+   'into get set add new return value test')
+    .split(' ')
+);
+/**
+ * Light suffix stemmer — conservative, tuned for code identifiers rather than
+ * prose. Words of 3 chars or fewer pass through unchanged; a result shorter
+ * than 3 chars reverts to the original token.
+ *
+ * The rewrite rules run in a fixed order, and each rule sees the output of
+ * the one before it, so a single word can lose more than one suffix:
+ *
+ *   1. A trailing `ies` becomes `y` (`queries` → `query`).
+ *   2. A trailing `sses`, `shes`, `ches`, `xes` or `zes` loses its final `es`
+ *      (`classes` → `class`).
+ *   3. A trailing `s` that is not preceded by another `s` is dropped
+ *      (`emits` → `emit`); words ending in `ss` are left alone.
+ *   4. A trailing `ization` becomes `ize`.
+ *   5. One trailing suffix from the list `ing`, `edly`, `ed`, `er`, `ation`,
+ *      `ment`, `ness`, `ity`, `ive`, `able`, `ible`, `ize`, `ise`, `al` is
+ *      removed. Because `ize` is in this list, the `ize` produced by rule 4
+ *      is removed as well (`tokenizations` → `tokenization` → `tokenize` →
+ *      `token`).
+ *
+ * No lowercasing happens here; the patterns are lowercase-only, and
+ * `tokenize` lowercases its input before calling this function.
+ *
+ * @param {string} w - A single token.
+ * @returns {string} The stemmed token, or `w` unchanged when `w` has 3 or
+ *   fewer characters or when stemming would leave fewer than 3 characters.
+ */
+function stem(w) {
+  if (w.length <= 3) return w;
+  let s = w;
+  s = s.replace(/ies$/, 'y');
+  s = s.replace(/(sses|shes|ches|xes|zes)$/, (m) => m.slice(0, -2));
+  s = s.replace(/([^s])s$/, '$1');
+  s = s.replace(/(ization|izations)$/, 'ize');
+  s = s.replace(/(ing|edly|ed|er|ers|ation|ations|ment|ness|ity|ive|able|ible|ize|ise|al)$/, '');
+  return s.length >= 3 ? s : w;
+}
+/**
+ * Split on non-alphanumeric characters AND camelCase / snake_case boundaries,
+ * lowercase, drop stop words and single characters, then stem.
+ *
+ * Steps, in order:
+ *   1. Every run of characters outside A-Z, a-z and 0-9 is replaced by one
+ *      space. This also separates snake_case parts, kebab-case parts and
+ *      path segments.
+ *   2. A space is inserted between a lowercase letter or digit and a
+ *      following uppercase letter (`componentEmits` → `component Emits`).
+ *   3. A space is inserted between a run of capitals and a capital that
+ *      starts a capitalized word (`HTTPServer` → `HTTP Server`).
+ *   4. The text is lowercased and split on whitespace.
+ *   5. Tokens of length 1 (and empty strings) and tokens found in `STOP` are
+ *      dropped.
+ *   6. Each remaining token is passed through `stem`.
+ *
+ * Stop words are matched before stemming, so an inflected form such as
+ * `gets` is not filtered and comes out as the stem `get`.
+ *
+ * @param {string} text - Text to tokenize; `bm25rank` passes the query, the
+ *   joined signatures, and the file path through this function.
+ * @returns {string[]} Stemmed tokens in input order with duplicates kept;
+ *   an empty array when `text` is empty or not a string.
+ */
+function tokenize(text) {
+  if (!text || typeof text !== 'string') return [];
+  return text
+    .replace(/[^A-Za-z0-9]+/g, ' ')
+    .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
+    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
+    .toLowerCase()
+    .split(/\s+/)
+    .filter((t) => t.length > 1 && !STOP.has(t))
+    .map(stem)
+    .filter(Boolean);
+}
+// The file path / basename is highly indicative of relevance, so its tokens
+// are counted PATH_BOOST times when building the document term-frequency map.
+const PATH_BOOST = 3;
+/**
+ * BM25 re-rank of candidates against a query. Each candidate is
+ * `{ file, sigs }`; the returned objects preserve all original candidate
+ * fields and add a numeric `score` (higher = more relevant), sorted best-first
+ * with a deterministic path tie-break. A `score` of 0 means no query token
+ * matched — callers typically drop those.
+ *
+ * How each candidate becomes a document:
+ *   - Its token list is the tokens of all `sigs` joined by spaces, followed
+ *     by the tokens of `file` appended `PATH_BOOST` times. A missing `file`
+ *     is treated as an empty string and missing `sigs` as an empty list.
+ *   - `len` is the length of that combined list, so the repeated path tokens
+ *     count toward the document length as well as the term frequencies.
+ *
+ * Scoring:
+ *   - `N` is the number of candidates, `avgdl` the mean `len` (falling back
+ *     to 1 when every document is empty), and `df` the number of documents
+ *     whose combined token list contains a term.
+ *   - The query is tokenized with `tokenize` and de-duplicated, so a repeated
+ *     query word contributes once.
+ *   - For each query token with frequency `f` in the document the score
+ *     grows by `idf * f * (k1 + 1) / (f + k1 * (1 - b + b * len / avgdl))`
+ *     with `idf = ln(1 + (N - df + 0.5) / (df + 0.5))`, `k1 = 1.5` and
+ *     `b = 0.75`. Since `df` never exceeds `N`, `idf` is always positive.
+ *
+ * Result:
+ *   - One entry per candidate, including those with score 0; nothing is
+ *     filtered or truncated here.
+ *   - Each entry is a shallow copy made with `Object.assign`, so the input
+ *     candidate objects are not modified.
+ *   - Sorted by `score` descending; equal scores are ordered by
+ *     `String(file).localeCompare`.
+ *
+ * @param {string} query
+ * @param {{ file: string, sigs: string[] }[]} candidates
+ * @returns {Array<object & { score: number }>} Empty array when `candidates`
+ *   is not an array or has no elements.
+ */
+function bm25rank(query, candidates) {
+  if (!Array.isArray(candidates) || candidates.length === 0) return [];
+  const k1 = 1.5;
+  const b = 0.75;
+  const docs = candidates.map((c) => {
+    const pathToks = tokenize(c.file || '');
+    const toks = tokenize((c.sigs || []).join(' '));
+    for (let i = 0; i < PATH_BOOST; i++) toks.push(...pathToks);
+    const tf = new Map();
+    for (const t of toks) tf.set(t, (tf.get(t) || 0) + 1);
+    return { cand: c, tf, len: toks.length };
+  });
+  const N = docs.length || 1;
+  const avgdl = docs.reduce((s, d) => s + d.len, 0) / N || 1;
+  const df = new Map();
+  for (const d of docs) {
+    for (const t of d.tf.keys()) df.set(t, (df.get(t) || 0) + 1);
+  }
+  const qToks = [...new Set(tokenize(query))];
+  return docs
+    .map((d) => {
+      let score = 0;
+      for (const t of qToks) {
+        const f = d.tf.get(t);
+        if (!f) continue;
+        const dfT = df.get(t);
+        const idf = Math.log(1 + (N - dfT + 0.5) / (dfT + 0.5));
+        score += (idf * (f * (k1 + 1))) / (f + k1 * (1 - b + (b * d.len) / avgdl));
+      }
+      return Object.assign({}, d.cand, { score });
+    })
+    .sort((a, c) => c.score - a.score || String(a.file).localeCompare(String(c.file)));
+}
+module.exports = { tokenize, stem, bm25rank, PATH_BOOST, STOP };

package/src/retrieval/ranker.js CHANGED Viewed

@@ -19,6 +19,7 @@
 const { loadWeights } = require('../learning/weights');
 const { tokenize, STOP_WORDS } = require('./tokenizer');
+const { bm25rank } = require('./bm25');
 // ---------------------------------------------------------------------------
 // Default weights
@@ -197,11 +198,24 @@ function rank(query, sigIndex, opts) {
     return all.slice(0, topK);
   }
+  // Identifier-aware BM25 base relevance over the whole index (#395). BM25
+  // splits camelCase/snake_case, stems, and boosts path tokens, so queries
+  // whose terms live inside identifiers (e.g. "component emit" → componentEmits)
+  // are matched. The existing negative-signal penalty and recency/graph/learned
+  // boosts are layered on top; the per-token signals stay for the explain table.
+  const bm25Scores = new Map();
+  for (const c of bm25rank(query, [...sigIndex.entries()].map(([file, sigs]) => ({ file, sigs })))) {
+    bm25Scores.set(c.file, c.score);
+  }
   const scored = [];
   for (const [file, sigs] of sigIndex.entries()) {
     const result = scoreFile(file, sigs, queryTokens, weights);
-    let score = result.score;
+    const penalty = result.signals.penalty;
+    const base = bm25Scores.get(file) || 0;
+    let score = base * penalty;
     const signals = result.signals;
+    signals.bm25 = base;
     // Recency boost
     if (recencySet && recencySet.has(file) && score > 0) {