npm - sigmap - Versions diffs - 2.2.0 → 2.3.0 - Mend

sigmap 2.2.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/CHANGELOG.md +24 -0
package/gen-context.js +193 -3
package/package.json +1 -1
package/src/config/defaults.js +8 -0
package/src/mcp/handlers.js +28 -1
package/src/mcp/server.js +3 -2
package/src/mcp/tools.js +24 -0
package/src/retrieval/ranker.js +242 -0
package/src/retrieval/tokenizer.js +54 -0

package/CHANGELOG.md CHANGED Viewed

@@ -6,6 +6,30 @@ Format: [Semantic Versioning](https://semver.org/)
 ---
+## [2.3.0] — 2026-04-07
+### Added
+- **Query-aware retrieval** — `src/retrieval/tokenizer.js` and `src/retrieval/ranker.js`: zero-dependency relevance ranker that scores every file against a free-text query by exact token, symbol, prefix, path, and recency signals.
+- **`--query "<text>"` CLI flag** — ranks all context files by relevance and prints a scored table (Rank | File | Score | Sigs | Tokens) plus the top-3 signature blocks; `--query "<text>" --json` for machine-readable output; `--query "<text>" --top <n>` to limit result set.
+- **`query_context` MCP tool** — 8th MCP tool; accepts `{ query: string, topK?: number }` and returns the same ranked table as the `--query` CLI flag; live within any running MCP session.
+- **Retrieval config** — `config.retrieval.topK` (default 10) and `config.retrieval.recencyBoost` (default 1.5×) added to `src/config/defaults.js`.
+- **`test/integration/retrieval.test.js`** — 23 integration tests covering tokenizer unit tests, ranker sorting/scoring/topK/empty-query, `formatRankTable`, `formatRankJSON`, CLI `--query` flags, and MCP `query_context`.
+### Changed
+- `src/mcp/server.js` version bumped to `2.3.0`.
+- `test/integration/mcp-server.test.js` and `mcp-v14.test.js` updated to assert 8 tools.
+- `test/integration/analyze.test.js` version assertion updated to `2.3.0`.
+### Validation gate
+- 21/21 extractor unit tests passed
+- 20/20 integration suites passed (0 failures)
+- `node gen-context.js --version` → `2.3.0`
+- `node gen-context.js --query "python extractor"` → `src/extractors/python.js` in top-3
+- `node gen-context.js --query "fix secret scanning" --json` → valid JSON
+- MCP `tools/list` → 8 tools including `query_context`
+---
 ## [2.2.0] — 2026-04-06
 ### Added

package/gen-context.js CHANGED Viewed

@@ -2879,7 +2879,23 @@ __factories["./src/mcp/handlers"] = function(module, exports) {
     ].join('\n');
   }
-  module.exports = { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules };
+  function queryContext(args, cwd) {
+    if (!args || !args.query) return 'Missing required argument: query';
+    const contextPath = path.join(cwd, CONTEXT_FILE);
+    if (!fs.existsSync(contextPath)) return 'No context file found. Run: node gen-context.js';
+    try {
+      const { rank, buildSigIndex, formatRankTable } = __require('./src/retrieval/ranker');
+      const index = buildSigIndex(cwd);
+      if (index.size === 0) return 'No signatures indexed. Run: node gen-context.js';
+      const topK = Math.min(Math.max(1, parseInt(args.topK, 10) || 10), 25);
+      const results = rank(args.query, index, { topK });
+      return formatRankTable(results, args.query);
+    } catch (err) {
+      return `_query_context failed: ${err.message}_`;
+    }
+  }
+  module.exports = { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules, queryContext };
 };
 // ── ./src/mcp/server ──
@@ -2899,7 +2915,7 @@ __factories["./src/mcp/server"] = function(module, exports) {
   const readline = require('readline');
   const { TOOLS } = __require('./src/mcp/tools');
-  const { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules } = __require('./src/mcp/handlers');
+  const { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules, queryContext } = __require('./src/mcp/handlers');
   const SERVER_INFO = {
     name: 'sigmap',
@@ -2958,6 +2974,7 @@ __factories["./src/mcp/server"] = function(module, exports) {
         else if (name === 'get_routing') text = getRouting(args, cwd);
         else if (name === 'explain_file') text = explainFile(args, cwd);
         else if (name === 'list_modules') text = listModules(args, cwd);
+        else if (name === 'query_context') text = queryContext(args, cwd);
         else {
           respondError(id, -32601, `Unknown tool: ${name}`);
           return;
@@ -3137,6 +3154,30 @@ __factories["./src/mcp/tools"] = function(module, exports) {
         required: [],
       },
     },
+    {
+      name: 'query_context',
+      description:
+        'Rank and return the most relevant files for a specific task or question. ' +
+        'Uses keyword + symbol + path scoring to surface only the top-K files relevant ' +
+        'to the query — much cheaper than reading all context. ' +
+        'Returns ranked file list with signatures and relevance scores.',
+      inputSchema: {
+        type: 'object',
+        properties: {
+          query: {
+            type: 'string',
+            description:
+              'Natural language task description or keyword(s) to rank files against. ' +
+              'E.g. "add a new language extractor", "fix secret scanning", "auth module".',
+          },
+          topK: {
+            type: 'number',
+            description: 'Maximum number of files to return (default: 10, max: 25).',
+          },
+        },
+        required: ['query'],
+      },
+    },
   ];
   module.exports = { TOOLS };
@@ -3570,6 +3611,120 @@ __factories["./src/tracking/logger"] = function(module, exports) {
 };
+// ── ./src/retrieval/tokenizer ──
+__factories["./src/retrieval/tokenizer"] = function(module, exports) {
+  'use strict';
+  const STOP_WORDS = new Set([
+    'the', 'a', 'an', 'in', 'of', 'to', 'for', 'and', 'or', 'is', 'are',
+    'that', 'this', 'it', 'with', 'from', 'by', 'be', 'as', 'on', 'at',
+    'do', 'not', 'use', 'get', 'set', 'up', 'if', 'no', 'so', 'we',
+  ]);
+  function tokenize(text, opts) {
+    if (!text || typeof text !== 'string') return [];
+    const removeStop = opts && opts.removeStopWords === false ? false : true;
+    const minLen = (opts && opts.minLength) || 2;
+    const tokens = text
+      .replace(/\.\w{1,6}(?=\s|\/|$)/g, ' ')
+      .replace(/([a-z])([A-Z])/g, '$1 $2')
+      .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
+      .replace(/[_\-\.\/]/g, ' ')
+      .replace(/[^\w\s]/g, ' ')
+      .toLowerCase()
+      .split(/\s+/)
+      .filter((t) => t.length >= minLen);
+    if (!removeStop) return [...new Set(tokens)];
+    return [...new Set(tokens.filter((t) => !STOP_WORDS.has(t)))];
+  }
+  module.exports = { tokenize, STOP_WORDS };
+};
+// ── ./src/retrieval/ranker ──
+__factories["./src/retrieval/ranker"] = function(module, exports) {
+  'use strict';
+  const { tokenize, STOP_WORDS } = __require('./src/retrieval/tokenizer');
+  const DEFAULT_WEIGHTS = {
+    exactToken: 1.0, symbolMatch: 0.5, prefixMatch: 0.3, pathMatch: 0.8, recencyBoost: 1.5,
+  };
+  function scoreFile(filePath, sigs, queryTokens, weights) {
+    if (!sigs || sigs.length === 0) return 0;
+    const w = weights || DEFAULT_WEIGHTS;
+    const sigTokenSet = new Set(tokenize(sigs.join(' ')));
+    const pathTokenSet = new Set(tokenize(filePath));
+    let score = 0;
+    for (const qt of queryTokens) {
+      if (STOP_WORDS.has(qt)) continue;
+      if (sigTokenSet.has(qt)) {
+        score += w.exactToken;
+        if (sigs.some((sig) => tokenize(sig.replace(/[^a-zA-Z0-9_\s]/g, ' ')).includes(qt))) score += w.symbolMatch;
+      }
+      if (qt.length >= 4) {
+        for (const st of sigTokenSet) {
+          if (st !== qt && st.startsWith(qt)) { score += w.prefixMatch; break; }
+        }
+      }
+      if (pathTokenSet.has(qt)) score += w.pathMatch;
+    }
+    return score;
+  }
+  function rank(query, sigIndex, opts) {
+    if (!query || typeof query !== 'string') return [];
+    if (!sigIndex || !(sigIndex instanceof Map) || sigIndex.size === 0) return [];
+    const topK = (opts && opts.topK) || 10;
+    const recencyMultiplier = (opts && opts.recencyBoost) || DEFAULT_WEIGHTS.recencyBoost;
+    const recencySet = (opts && opts.recencySet) || null;
+    const weights = (opts && opts.weights) ? Object.assign({}, DEFAULT_WEIGHTS, opts.weights) : DEFAULT_WEIGHTS;
+    const queryTokens = tokenize(query);
+    if (queryTokens.length === 0) {
+      const all = [];
+      for (const [file, sigs] of sigIndex.entries()) all.push({ file, score: sigs.length, sigs, tokens: Math.ceil(sigs.join('\n').length / 4) });
+      all.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file));
+      return all.slice(0, topK);
+    }
+    const scored = [];
+    for (const [file, sigs] of sigIndex.entries()) {
+      let score = scoreFile(file, sigs, queryTokens, weights);
+      if (recencySet && recencySet.has(file) && score > 0) score *= recencyMultiplier;
+      scored.push({ file, score, sigs, tokens: Math.ceil(sigs.join('\n').length / 4) });
+    }
+    scored.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file));
+    return scored.slice(0, topK);
+  }
+  function buildSigIndex(cwd) {
+    const fs = require('fs'); const path = require('path');
+    const contextPath = path.join(cwd, '.github', 'copilot-instructions.md');
+    const index = new Map();
+    if (!fs.existsSync(contextPath)) return index;
+    const content = fs.readFileSync(contextPath, 'utf8');
+    const lines = content.split('\n');
+    let currentFile = null; let inBlock = false; let sigs = [];
+    for (const line of lines) {
+      const hm = line.match(/^###\s+(\S+)\s*$/);
+      if (hm) { if (currentFile !== null) index.set(currentFile, sigs); currentFile = hm[1]; sigs = []; inBlock = false; continue; }
+      if (line.startsWith('```')) { inBlock = !inBlock; continue; }
+      if (inBlock && currentFile && line.trim()) sigs.push(line.trim());
+    }
+    if (currentFile !== null) index.set(currentFile, sigs);
+    return index;
+  }
+  function formatRankTable(results, query) {
+    if (!results || results.length === 0) return `No matching files found for query: "${query}"\n`;
+    const lines = [`## Query: ${query}`, '', '| Rank | File | Score | Sigs | Tokens |', '|------|------|-------|------|--------|',
+      ...results.map((r, i) => `| ${i + 1} | ${r.file} | ${r.score.toFixed(2)} | ${r.sigs.length} | ${r.tokens} |`), ''];
+    for (const r of results.slice(0, 3)) {
+      if (r.sigs.length > 0) {
+        lines.push(`### ${r.file}`, '```', ...r.sigs.slice(0, 10));
+        if (r.sigs.length > 10) lines.push(`... (${r.sigs.length - 10} more)`);
+        lines.push('```', '');
+      }
+    }
+    return lines.join('\n');
+  }
+  function formatRankJSON(results, query) {
+    return { query, results: (results || []).map((r, i) => ({ rank: i + 1, file: r.file, score: r.score, sigs: r.sigs, tokens: r.tokens })), totalResults: (results || []).length };
+  }
+  module.exports = { rank, buildSigIndex, scoreFile, formatRankTable, formatRankJSON, DEFAULT_WEIGHTS };
+};
 // ── ./src/eval/scorer ──
 __factories["./src/eval/scorer"] = function(module, exports) {
   'use strict';
@@ -3936,7 +4091,7 @@ const path = require('path');
 const os = require('os');
 const { execSync } = require('child_process');
-const VERSION = '2.2.0';
+const VERSION = '2.3.0';
 const MARKER = '\n\n## Auto-generated signatures\n<!-- Updated by gen-context.js -->\n';
 function requireSourceOrBundled(key) {
@@ -5149,6 +5304,9 @@ Usage:
   node gen-context.js --analyze --json                  Breakdown as JSON
   node gen-context.js --analyze --slow                  Re-time each extractor; flag files >50ms
   node gen-context.js --diagnose-extractors             Run all 21 extractors vs fixtures; show pass/fail + diff
+  node gen-context.js --query "<text>"                  Rank files by relevance to a query
+  node gen-context.js --query "<text>" --json           Ranked results as JSON
+  node gen-context.js --query "<text>" --top <n>        Limit results to top N files (default 10)
   node gen-context.js --init                            Write example config + .contextignore scaffold
   node gen-context.js --help                            Show this message
   node gen-context.js --version                         Show version
@@ -5435,6 +5593,38 @@ function main() {
     }
   }
+  if (args.includes('--query')) {
+    try {
+      const qIdx = args.indexOf('--query');
+      const query = (args[qIdx + 1] || '').trim();
+      if (!query || query.startsWith('--')) {
+        console.error('[sigmap] --query requires a search string');
+        console.error('  Example: node gen-context.js --query "add a new language extractor"');
+        process.exit(1);
+      }
+      const { rank, buildSigIndex, formatRankTable, formatRankJSON } = requireSourceOrBundled('./src/retrieval/ranker');
+      const index = buildSigIndex(cwd);
+      if (index.size === 0) {
+        console.error('[sigmap] no context file found. Run: node gen-context.js');
+        process.exit(1);
+      }
+      const topIdx = args.indexOf('--top');
+      const topK = topIdx >= 0 ? Math.min(Math.max(1, parseInt(args[topIdx + 1], 10) || 10), 25)
+                               : ((config && config.retrieval && config.retrieval.topK) || 10);
+      const recencyBoost = (config && config.retrieval && config.retrieval.recencyBoost) || 1.5;
+      const results = rank(query, index, { topK, recencyBoost });
+      if (args.includes('--json')) {
+        process.stdout.write(JSON.stringify(formatRankJSON(results, query)) + '\n');
+      } else {
+        process.stdout.write(formatRankTable(results, query));
+      }
+    } catch (err) {
+      console.error(`[sigmap] query error: ${err.message}`);
+      process.exit(1);
+    }
+    process.exit(0);
+  }
   if (args.includes('--report')) {
     if (args.includes('--history')) {
       try {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "sigmap",
-  "version": "2.2.0",
+  "version": "2.3.0",
   "description": "Zero-dependency AI context engine — 97% token reduction. No npm install. Runs on Node 18+.",
   "main": "gen-context.js",
   "bin": {

package/src/config/defaults.js CHANGED Viewed

@@ -92,6 +92,14 @@ const DEFAULTS = {
   // Add reverse dependency usage hints on file headings (opt-in)
   impactRadius: false,
+  // Query-aware retrieval settings (v2.3)
+  retrieval: {
+    // Maximum number of files to return for --query
+    topK: 10,
+    // Multiplier applied to recently-changed files (>1 boosts them up)
+    recencyBoost: 1.5,
+  },
 };
 module.exports = { DEFAULTS };

package/src/mcp/handlers.js CHANGED Viewed

@@ -430,4 +430,31 @@ function listModules(args, cwd) {
   ].join('\n');
 }
-module.exports = { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules };
+/**
+ * query_context({ query, topK? }) → string
+ *
+ * Ranks context-file entries by relevance to the query and returns the
+ * top-K most relevant files with their signatures and scores.
+ */
+function queryContext(args, cwd) {
+  if (!args || !args.query) return 'Missing required argument: query';
+  const contextPath = path.join(cwd, CONTEXT_FILE);
+  if (!fs.existsSync(contextPath)) {
+    return 'No context file found. Run: node gen-context.js';
+  }
+  try {
+    const { rank, buildSigIndex, formatRankTable } = require('../retrieval/ranker');
+    const index = buildSigIndex(cwd);
+    if (index.size === 0) return 'No signatures indexed. Run: node gen-context.js';
+    const topK = Math.min(Math.max(1, parseInt(args.topK, 10) || 10), 25);
+    const results = rank(args.query, index, { topK });
+    return formatRankTable(results, args.query);
+  } catch (err) {
+    return `_query_context failed: ${err.message}_`;
+  }
+}
+module.exports = { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules, queryContext };

package/src/mcp/server.js CHANGED Viewed

@@ -14,11 +14,11 @@
 const readline = require('readline');
 const { TOOLS } = require('./tools');
-const { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules } = require('./handlers');
+const { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules, queryContext } = require('./handlers');
 const SERVER_INFO = {
   name: 'sigmap',
-  version: '2.2.0',
+  version: '2.3.0',
   description: 'SigMap MCP server — code signatures on demand',
 };
@@ -73,6 +73,7 @@ function dispatch(msg, cwd) {
       else if (name === 'get_routing') text = getRouting(args, cwd);
       else if (name === 'explain_file') text = explainFile(args, cwd);
       else if (name === 'list_modules') text = listModules(args, cwd);
+      else if (name === 'query_context') text = queryContext(args, cwd);
       else {
         respondError(id, -32601, `Unknown tool: ${name}`);
         return;

package/src/mcp/tools.js CHANGED Viewed

@@ -120,6 +120,30 @@ const TOOLS = [
       required: [],
     },
   },
+  {
+    name: 'query_context',
+    description:
+      'Rank and return the most relevant files for a specific task or question. ' +
+      'Uses keyword + symbol + path scoring to surface only the top-K files relevant ' +
+      'to the query — much cheaper than reading all context. ' +
+      'Returns ranked file list with signatures and relevance scores.',
+    inputSchema: {
+      type: 'object',
+      properties: {
+        query: {
+          type: 'string',
+          description:
+            'Natural language task description or keyword(s) to rank files against. ' +
+            'E.g. "add a new language extractor", "fix secret scanning", "auth module".',
+        },
+        topK: {
+          type: 'number',
+          description: 'Maximum number of files to return (default: 10, max: 25).',
+        },
+      },
+      required: ['query'],
+    },
+  },
 ];
 module.exports = { TOOLS };

package/src/retrieval/ranker.js ADDED Viewed

@@ -0,0 +1,242 @@
+'use strict';
+/**
+ * SigMap zero-dependency relevance ranker.
+ *
+ * Ranks all files in a signature index against a natural-language query.
+ * Scoring weights:
+ *   - keyword overlap (exact token match against sigs)
+ *   - symbol match (token appears in a top-level identifier / function name)
+ *   - partial prefix match (token is prefix of a sig token, length ≥ 4)
+ *   - path relevance (query token appears in the file path)
+ *   - recency boost (multiplier applied inside rank() to scoring files listed in opts.recencySet)
+ *
+ * Usage:
+ *   const { rank } = require('./src/retrieval/ranker');
+ *   const results = rank(query, sigIndex, { topK: 10 });
+ *   // results: [{ file, score, sigs, tokens }]
+ */
+const { tokenize, STOP_WORDS } = require('./tokenizer');
+// ---------------------------------------------------------------------------
+// Default weights
+// ---------------------------------------------------------------------------
+const DEFAULT_WEIGHTS = {
+  exactToken: 1.0,       // query token exactly in sig tokens
+  symbolMatch: 0.5,      // bonus if token appears in a function/class name line
+  prefixMatch: 0.3,      // partial prefix hit (query token ≥ 4 chars)
+  pathMatch: 0.8,        // query token appears in the file path
+  recencyBoost: 1.5,     // multiplier applied when file is in recencySet
+};
+/**
+ * Score a single file against a query.
+ *
+ * @param {string}   filePath   - relative file path (e.g. 'src/extractors/python.js')
+ * @param {string[]} sigs       - signature strings for this file
+ * @param {string[]} queryTokens - pre-tokenized query
+ * @param {object}   weights
+ * @returns {number}
+ */
+function scoreFile(filePath, sigs, queryTokens, weights) {
+  if (!sigs || sigs.length === 0) return 0;
+  const w = weights || DEFAULT_WEIGHTS;
+  // Build token set from all signatures
+  const sigText = sigs.join(' ');
+  const sigTokenSet = new Set(tokenize(sigText));
+  // Build token set from the file path
+  const pathTokenSet = new Set(tokenize(filePath));
+  let score = 0;
+  for (const qt of queryTokens) {
+    if (STOP_WORDS.has(qt)) continue;
+    // Exact token match in sigs
+    if (sigTokenSet.has(qt)) {
+      score += w.exactToken;
+      // Bonus: appears directly in a function/class/method name line
+      const nameLineMatch = sigs.some((sig) => {
+        const nt = tokenize(sig.replace(/[^a-zA-Z0-9_\s]/g, ' '));
+        return nt.includes(qt);
+      });
+      if (nameLineMatch) score += w.symbolMatch;
+    }
+    // Prefix match (e.g. query "python" matches "pythonDeps")
+    if (qt.length >= 4) {
+      for (const st of sigTokenSet) {
+        if (st !== qt && st.startsWith(qt)) {
+          score += w.prefixMatch;
+          break; // one bonus per query token
+        }
+      }
+    }
+    // Path token match
+    if (pathTokenSet.has(qt)) {
+      score += w.pathMatch;
+    }
+  }
+  return score;
+}
+/**
+ * Rank all files in a signature index against a query.
+ *
+ * @param {string}              query     - natural language query
+ * @param {Map<string,string[]>} sigIndex - Map<file, sigs[]>
+ * @param {object}  [opts]
+ * @param {number}  [opts.topK=10]               - max results to return
+ * @param {number}  [opts.recencyBoost=1.5]       - multiplier for recent files
+ * @param {Set<string>} [opts.recencySet]         - set of recently-changed file paths
+ * @param {object}  [opts.weights]               - override scoring weights
+ * @returns {{ file: string, score: number, sigs: string[], tokens: number }[]}
+ */
+function rank(query, sigIndex, opts) {
+  if (!query || typeof query !== 'string') return [];
+  if (!sigIndex || !(sigIndex instanceof Map) || sigIndex.size === 0) return [];
+  const topK = (opts && opts.topK) || 10;
+  const recencyMultiplier = (opts && opts.recencyBoost) || DEFAULT_WEIGHTS.recencyBoost;
+  const recencySet = (opts && opts.recencySet) || null;
+  const weights = (opts && opts.weights) ? Object.assign({}, DEFAULT_WEIGHTS, opts.weights) : DEFAULT_WEIGHTS;
+  const queryTokens = tokenize(query);
+  if (queryTokens.length === 0) {
+    // No usable query tokens (e.g. only stop words): return top-K by signature count (most signatures = most useful)
+    const all = [];
+    for (const [file, sigs] of sigIndex.entries()) {
+      all.push({ file, score: sigs.length, sigs, tokens: Math.ceil(sigs.join('\n').length / 4) });
+    }
+    all.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file));
+    return all.slice(0, topK);
+  }
+  const scored = [];
+  for (const [file, sigs] of sigIndex.entries()) {
+    let score = scoreFile(file, sigs, queryTokens, weights);
+    // Recency boost
+    if (recencySet && recencySet.has(file) && score > 0) {
+      score *= recencyMultiplier;
+    }
+    scored.push({
+      file,
+      score,
+      sigs,
+      tokens: Math.ceil(sigs.join('\n').length / 4),
+    });
+  }
+  scored.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file));
+  return scored.slice(0, topK);
+}
+/**
+ * Build a signature index from the generated context file.
+ * Returns Map<filePath, string[]> where filePath is the relative path
+ * as it appears in the ### headers of copilot-instructions.md.
+ *
+ * @param {string} cwd
+ * @returns {Map<string, string[]>}
+ */
+function buildSigIndex(cwd) {
+  const fs   = require('fs');
+  const path = require('path');
+  const contextPath = path.join(cwd, '.github', 'copilot-instructions.md');
+  const index = new Map();
+  if (!fs.existsSync(contextPath)) return index;
+  const content = fs.readFileSync(contextPath, 'utf8');
+  const lines = content.split('\n');
+  let currentFile = null;
+  let inBlock = false;
+  let sigs = [];
+  for (const line of lines) {
+    const headerMatch = line.match(/^###\s+(\S+)\s*$/);
+    if (headerMatch) {
+      if (currentFile !== null) index.set(currentFile, sigs);
+      currentFile = headerMatch[1];
+      sigs = [];
+      inBlock = false;
+      continue;
+    }
+    if (line.startsWith('```')) { inBlock = !inBlock; continue; }
+    if (inBlock && currentFile && line.trim()) sigs.push(line.trim());
+  }
+  if (currentFile !== null) index.set(currentFile, sigs);
+  return index;
+}
+/**
+ * Format ranked results as a markdown table string.
+ *
+ * @param {{ file: string, score: number, sigs: string[], tokens: number }[]} results
+ * @param {string} query
+ * @returns {string}
+ */
+function formatRankTable(results, query) {
+  if (!results || results.length === 0) {
+    return `No matching files found for query: "${query}"\n`;
+  }
+  const lines = [
+    `## Query: ${query}`,
+    '',
+    '| Rank | File | Score | Sigs | Tokens |',
+    '|------|------|-------|------|--------|',
+    ...results.map((r, i) =>
+      `| ${i + 1} | ${r.file} | ${r.score.toFixed(2)} | ${r.sigs.length} | ${r.tokens} |`
+    ),
+    '',
+  ];
+  // Add signature details for top results
+  for (const r of results.slice(0, 3)) {
+    if (r.sigs.length > 0) {
+      lines.push(`### ${r.file}`);
+      lines.push('```');
+      lines.push(...r.sigs.slice(0, 10));
+      if (r.sigs.length > 10) lines.push(`... (${r.sigs.length - 10} more)`);
+      lines.push('```');
+      lines.push('');
+    }
+  }
+  return lines.join('\n');
+}
+/**
+ * Format ranked results as a structured JSON-serialisable object.
+ *
+ * @param {{ file: string, score: number, sigs: string[], tokens: number }[]} results
+ * @param {string} query
+ * @returns {object}
+ */
+function formatRankJSON(results, query) {
+  return {
+    query,
+    results: (results || []).map((r, i) => ({
+      rank: i + 1,
+      file: r.file,
+      score: r.score,
+      sigs: r.sigs,
+      tokens: r.tokens,
+    })),
+    totalResults: (results || []).length,
+  };
+}
+module.exports = { rank, buildSigIndex, scoreFile, formatRankTable, formatRankJSON, DEFAULT_WEIGHTS };

package/src/retrieval/tokenizer.js ADDED Viewed

@@ -0,0 +1,54 @@
+'use strict';
+/**
+ * SigMap zero-dependency tokenizer.
+ * Splits code identifiers: camelCase, snake_case, kebab-case, PascalCase,
+ * removes stop words, and returns lower-case tokens.
+ */
+const STOP_WORDS = new Set([
+  'the', 'a', 'an', 'in', 'of', 'to', 'for', 'and', 'or', 'is', 'are',
+  'that', 'this', 'it', 'with', 'from', 'by', 'be', 'as', 'on', 'at',
+  'do', 'not', 'use', 'get', 'set', 'up', 'if', 'no', 'so', 'we',
+]);
+/**
+ * Tokenize any text (query or code signature) into unique lower-case tokens.
+ * Handles:
+ *   - camelCase  → ['camel', 'case']
+ *   - PascalCase → ['pascal', 'case']
+ *   - snake_case → ['snake', 'case']
+ *   - kebab-case → ['kebab', 'case']
+ *   - dot.notation → ['dot', 'notation']
+ *   - File paths  → individual path components (no extension)
+ *
+ * @param {string} text
+ * @param {object} [opts]
+ * @param {boolean} [opts.removeStopWords=true]
+ * @param {number}  [opts.minLength=2]
+ * @returns {string[]}
+ */
+function tokenize(text, opts) {
+  if (!text || typeof text !== 'string') return [];
+  const removeStop = opts && opts.removeStopWords === false ? false : true;
+  const minLen = (opts && opts.minLength) || 2;
+  const tokens = text
+    // strip file extension (e.g. .js, .ts, .py)
+    .replace(/\.\w{1,6}(?=\s|\/|$)/g, ' ')
+    // camelCase / PascalCase split
+    .replace(/([a-z])([A-Z])/g, '$1 $2')
+    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
+    // snake_case / kebab-case / dot.notation
+    .replace(/[_\-\.\/]/g, ' ')
+    // drop remaining non-word characters
+    .replace(/[^\w\s]/g, ' ')
+    .toLowerCase()
+    .split(/\s+/)
+    .filter((t) => t.length >= minLen);
+  if (!removeStop) return [...new Set(tokens)];
+  return [...new Set(tokens.filter((t) => !STOP_WORDS.has(t)))];
+}
+module.exports = { tokenize, STOP_WORDS };