sigmap 2.2.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -6,6 +6,30 @@ Format: [Semantic Versioning](https://semver.org/)
6
6
 
7
7
  ---
8
8
 
9
+ ## [2.3.0] — 2026-04-07
10
+
11
+ ### Added
12
+ - **Query-aware retrieval** — `src/retrieval/tokenizer.js` and `src/retrieval/ranker.js`: zero-dependency relevance ranker that scores every file against a free-text query by exact token, symbol, prefix, path, and recency signals.
13
+ - **`--query "<text>"` CLI flag** — ranks all context files by relevance and prints a scored table (Rank | File | Score | Sigs | Tokens) plus the top-3 signature blocks; `--query "<text>" --json` for machine-readable output; `--query "<text>" --top <n>` to limit result set.
14
+ - **`query_context` MCP tool** — 8th MCP tool; accepts `{ query: string, topK?: number }` and returns the same ranked table as the `--query` CLI flag; available in any running MCP session.
15
+ - **Retrieval config** — `config.retrieval.topK` (default 10) and `config.retrieval.recencyBoost` (default 1.5×) added to `src/config/defaults.js`.
16
+ - **`test/integration/retrieval.test.js`** — 23 integration tests covering tokenizer unit tests, ranker sorting/scoring/topK/empty-query, `formatRankTable`, `formatRankJSON`, CLI `--query` flags, and MCP `query_context`.
17
+
18
+ ### Changed
19
+ - `src/mcp/server.js` version bumped to `2.3.0`.
20
+ - `test/integration/mcp-server.test.js` and `mcp-v14.test.js` updated to assert 8 tools.
21
+ - `test/integration/analyze.test.js` version assertion updated to `2.3.0`.
22
+
23
+ ### Validation gate
24
+ - 21/21 extractor unit tests passed
25
+ - 20/20 integration suites passed (0 failures)
26
+ - `node gen-context.js --version` → `2.3.0`
27
+ - `node gen-context.js --query "python extractor"` → `src/extractors/python.js` in top-3
28
+ - `node gen-context.js --query "fix secret scanning" --json` → valid JSON
29
+ - MCP `tools/list` → 8 tools including `query_context`
30
+
31
+ ---
32
+
9
33
  ## [2.2.0] — 2026-04-06
10
34
 
11
35
  ### Added
package/gen-context.js CHANGED
@@ -2879,7 +2879,23 @@ __factories["./src/mcp/handlers"] = function(module, exports) {
2879
2879
  ].join('\n');
2880
2880
  }
2881
2881
 
2882
- module.exports = { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules };
2882
+ function queryContext(args, cwd) {
2883
+ if (!args || !args.query) return 'Missing required argument: query';
2884
+ const contextPath = path.join(cwd, CONTEXT_FILE);
2885
+ if (!fs.existsSync(contextPath)) return 'No context file found. Run: node gen-context.js';
2886
+ try {
2887
+ const { rank, buildSigIndex, formatRankTable } = __require('./src/retrieval/ranker');
2888
+ const index = buildSigIndex(cwd);
2889
+ if (index.size === 0) return 'No signatures indexed. Run: node gen-context.js';
2890
+ const topK = Math.min(Math.max(1, parseInt(args.topK, 10) || 10), 25);
2891
+ const results = rank(args.query, index, { topK });
2892
+ return formatRankTable(results, args.query);
2893
+ } catch (err) {
2894
+ return `_query_context failed: ${err.message}_`;
2895
+ }
2896
+ }
2897
+
2898
+ module.exports = { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules, queryContext };
2883
2899
  };
2884
2900
 
2885
2901
  // ── ./src/mcp/server ──
@@ -2899,7 +2915,7 @@ __factories["./src/mcp/server"] = function(module, exports) {
2899
2915
 
2900
2916
  const readline = require('readline');
2901
2917
  const { TOOLS } = __require('./src/mcp/tools');
2902
- const { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules } = __require('./src/mcp/handlers');
2918
+ const { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules, queryContext } = __require('./src/mcp/handlers');
2903
2919
 
2904
2920
  const SERVER_INFO = {
2905
2921
  name: 'sigmap',
@@ -2958,6 +2974,7 @@ __factories["./src/mcp/server"] = function(module, exports) {
2958
2974
  else if (name === 'get_routing') text = getRouting(args, cwd);
2959
2975
  else if (name === 'explain_file') text = explainFile(args, cwd);
2960
2976
  else if (name === 'list_modules') text = listModules(args, cwd);
2977
+ else if (name === 'query_context') text = queryContext(args, cwd);
2961
2978
  else {
2962
2979
  respondError(id, -32601, `Unknown tool: ${name}`);
2963
2980
  return;
@@ -3137,6 +3154,30 @@ __factories["./src/mcp/tools"] = function(module, exports) {
3137
3154
  required: [],
3138
3155
  },
3139
3156
  },
3157
+ {
3158
+ name: 'query_context',
3159
+ description:
3160
+ 'Rank and return the most relevant files for a specific task or question. ' +
3161
+ 'Uses keyword + symbol + path scoring to surface only the top-K files relevant ' +
3162
+ 'to the query — much cheaper than reading all context. ' +
3163
+ 'Returns ranked file list with signatures and relevance scores.',
3164
+ inputSchema: {
3165
+ type: 'object',
3166
+ properties: {
3167
+ query: {
3168
+ type: 'string',
3169
+ description:
3170
+ 'Natural language task description or keyword(s) to rank files against. ' +
3171
+ 'E.g. "add a new language extractor", "fix secret scanning", "auth module".',
3172
+ },
3173
+ topK: {
3174
+ type: 'number',
3175
+ description: 'Maximum number of files to return (default: 10, max: 25).',
3176
+ },
3177
+ },
3178
+ required: ['query'],
3179
+ },
3180
+ },
3140
3181
  ];
3141
3182
 
3142
3183
  module.exports = { TOOLS };
@@ -3570,6 +3611,120 @@ __factories["./src/tracking/logger"] = function(module, exports) {
3570
3611
 
3571
3612
  };
3572
3613
 
3614
+ // ── ./src/retrieval/tokenizer ──
3615
+ __factories["./src/retrieval/tokenizer"] = function(module, exports) {
3616
+ 'use strict';
3617
+ const STOP_WORDS = new Set([
3618
+ 'the', 'a', 'an', 'in', 'of', 'to', 'for', 'and', 'or', 'is', 'are',
3619
+ 'that', 'this', 'it', 'with', 'from', 'by', 'be', 'as', 'on', 'at',
3620
+ 'do', 'not', 'use', 'get', 'set', 'up', 'if', 'no', 'so', 'we',
3621
+ ]);
3622
+ function tokenize(text, opts) {
3623
+ if (!text || typeof text !== 'string') return [];
3624
+ const removeStop = opts && opts.removeStopWords === false ? false : true;
3625
+ const minLen = (opts && opts.minLength) || 2;
3626
+ const tokens = text
3627
+ .replace(/\.\w{1,6}(?=\s|\/|$)/g, ' ')
3628
+ .replace(/([a-z])([A-Z])/g, '$1 $2')
3629
+ .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
3630
+ .replace(/[_\-\.\/]/g, ' ')
3631
+ .replace(/[^\w\s]/g, ' ')
3632
+ .toLowerCase()
3633
+ .split(/\s+/)
3634
+ .filter((t) => t.length >= minLen);
3635
+ if (!removeStop) return [...new Set(tokens)];
3636
+ return [...new Set(tokens.filter((t) => !STOP_WORDS.has(t)))];
3637
+ }
3638
+ module.exports = { tokenize, STOP_WORDS };
3639
+ };
3640
+
3641
+ // ── ./src/retrieval/ranker ──
3642
+ __factories["./src/retrieval/ranker"] = function(module, exports) {
3643
+ 'use strict';
3644
+ const { tokenize, STOP_WORDS } = __require('./src/retrieval/tokenizer');
3645
+ const DEFAULT_WEIGHTS = {
3646
+ exactToken: 1.0, symbolMatch: 0.5, prefixMatch: 0.3, pathMatch: 0.8, recencyBoost: 1.5,
3647
+ };
3648
+ function scoreFile(filePath, sigs, queryTokens, weights) {
3649
+ if (!sigs || sigs.length === 0) return 0;
3650
+ const w = weights || DEFAULT_WEIGHTS;
3651
+ const sigTokenSet = new Set(tokenize(sigs.join(' ')));
3652
+ const pathTokenSet = new Set(tokenize(filePath));
3653
+ let score = 0;
3654
+ for (const qt of queryTokens) {
3655
+ if (STOP_WORDS.has(qt)) continue;
3656
+ if (sigTokenSet.has(qt)) {
3657
+ score += w.exactToken;
3658
+ if (sigs.some((sig) => tokenize(sig.replace(/[^a-zA-Z0-9_\s]/g, ' ')).includes(qt))) score += w.symbolMatch;
3659
+ }
3660
+ if (qt.length >= 4) {
3661
+ for (const st of sigTokenSet) {
3662
+ if (st !== qt && st.startsWith(qt)) { score += w.prefixMatch; break; }
3663
+ }
3664
+ }
3665
+ if (pathTokenSet.has(qt)) score += w.pathMatch;
3666
+ }
3667
+ return score;
3668
+ }
3669
+ function rank(query, sigIndex, opts) {
3670
+ if (!query || typeof query !== 'string') return [];
3671
+ if (!sigIndex || !(sigIndex instanceof Map) || sigIndex.size === 0) return [];
3672
+ const topK = (opts && opts.topK) || 10;
3673
+ const recencyMultiplier = (opts && opts.recencyBoost) || DEFAULT_WEIGHTS.recencyBoost;
3674
+ const recencySet = (opts && opts.recencySet) || null;
3675
+ const weights = (opts && opts.weights) ? Object.assign({}, DEFAULT_WEIGHTS, opts.weights) : DEFAULT_WEIGHTS;
3676
+ const queryTokens = tokenize(query);
3677
+ if (queryTokens.length === 0) {
3678
+ const all = [];
3679
+ for (const [file, sigs] of sigIndex.entries()) all.push({ file, score: sigs.length, sigs, tokens: Math.ceil(sigs.join('\n').length / 4) });
3680
+ all.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file));
3681
+ return all.slice(0, topK);
3682
+ }
3683
+ const scored = [];
3684
+ for (const [file, sigs] of sigIndex.entries()) {
3685
+ let score = scoreFile(file, sigs, queryTokens, weights);
3686
+ if (recencySet && recencySet.has(file) && score > 0) score *= recencyMultiplier;
3687
+ scored.push({ file, score, sigs, tokens: Math.ceil(sigs.join('\n').length / 4) });
3688
+ }
3689
+ scored.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file));
3690
+ return scored.slice(0, topK);
3691
+ }
3692
+ function buildSigIndex(cwd) {
3693
+ const fs = require('fs'); const path = require('path');
3694
+ const contextPath = path.join(cwd, '.github', 'copilot-instructions.md');
3695
+ const index = new Map();
3696
+ if (!fs.existsSync(contextPath)) return index;
3697
+ const content = fs.readFileSync(contextPath, 'utf8');
3698
+ const lines = content.split('\n');
3699
+ let currentFile = null; let inBlock = false; let sigs = [];
3700
+ for (const line of lines) {
3701
+ const hm = line.match(/^###\s+(\S+)\s*$/);
3702
+ if (hm) { if (currentFile !== null) index.set(currentFile, sigs); currentFile = hm[1]; sigs = []; inBlock = false; continue; }
3703
+ if (line.startsWith('```')) { inBlock = !inBlock; continue; }
3704
+ if (inBlock && currentFile && line.trim()) sigs.push(line.trim());
3705
+ }
3706
+ if (currentFile !== null) index.set(currentFile, sigs);
3707
+ return index;
3708
+ }
3709
+ function formatRankTable(results, query) {
3710
+ if (!results || results.length === 0) return `No matching files found for query: "${query}"\n`;
3711
+ const lines = [`## Query: ${query}`, '', '| Rank | File | Score | Sigs | Tokens |', '|------|------|-------|------|--------|',
3712
+ ...results.map((r, i) => `| ${i + 1} | ${r.file} | ${r.score.toFixed(2)} | ${r.sigs.length} | ${r.tokens} |`), ''];
3713
+ for (const r of results.slice(0, 3)) {
3714
+ if (r.sigs.length > 0) {
3715
+ lines.push(`### ${r.file}`, '```', ...r.sigs.slice(0, 10));
3716
+ if (r.sigs.length > 10) lines.push(`... (${r.sigs.length - 10} more)`);
3717
+ lines.push('```', '');
3718
+ }
3719
+ }
3720
+ return lines.join('\n');
3721
+ }
3722
+ function formatRankJSON(results, query) {
3723
+ return { query, results: (results || []).map((r, i) => ({ rank: i + 1, file: r.file, score: r.score, sigs: r.sigs, tokens: r.tokens })), totalResults: (results || []).length };
3724
+ }
3725
+ module.exports = { rank, buildSigIndex, scoreFile, formatRankTable, formatRankJSON, DEFAULT_WEIGHTS };
3726
+ };
3727
+
3573
3728
  // ── ./src/eval/scorer ──
3574
3729
  __factories["./src/eval/scorer"] = function(module, exports) {
3575
3730
  'use strict';
@@ -3936,7 +4091,7 @@ const path = require('path');
3936
4091
  const os = require('os');
3937
4092
  const { execSync } = require('child_process');
3938
4093
 
3939
- const VERSION = '2.2.0';
4094
+ const VERSION = '2.3.0';
3940
4095
  const MARKER = '\n\n## Auto-generated signatures\n<!-- Updated by gen-context.js -->\n';
3941
4096
 
3942
4097
  function requireSourceOrBundled(key) {
@@ -5149,6 +5304,9 @@ Usage:
5149
5304
  node gen-context.js --analyze --json Breakdown as JSON
5150
5305
  node gen-context.js --analyze --slow Re-time each extractor; flag files >50ms
5151
5306
  node gen-context.js --diagnose-extractors Run all 21 extractors vs fixtures; show pass/fail + diff
5307
+ node gen-context.js --query "<text>" Rank files by relevance to a query
5308
+ node gen-context.js --query "<text>" --json Ranked results as JSON
5309
+ node gen-context.js --query "<text>" --top <n> Limit results to top N files (default 10)
5152
5310
  node gen-context.js --init Write example config + .contextignore scaffold
5153
5311
  node gen-context.js --help Show this message
5154
5312
  node gen-context.js --version Show version
@@ -5435,6 +5593,38 @@ function main() {
5435
5593
  }
5436
5594
  }
5437
5595
 
5596
+ if (args.includes('--query')) {
5597
+ try {
5598
+ const qIdx = args.indexOf('--query');
5599
+ const query = (args[qIdx + 1] || '').trim();
5600
+ if (!query || query.startsWith('--')) {
5601
+ console.error('[sigmap] --query requires a search string');
5602
+ console.error(' Example: node gen-context.js --query "add a new language extractor"');
5603
+ process.exit(1);
5604
+ }
5605
+ const { rank, buildSigIndex, formatRankTable, formatRankJSON } = requireSourceOrBundled('./src/retrieval/ranker');
5606
+ const index = buildSigIndex(cwd);
5607
+ if (index.size === 0) {
5608
+ console.error('[sigmap] no context file found. Run: node gen-context.js');
5609
+ process.exit(1);
5610
+ }
5611
+ const topIdx = args.indexOf('--top');
5612
+ const topK = topIdx >= 0 ? Math.min(Math.max(1, parseInt(args[topIdx + 1], 10) || 10), 25)
5613
+ : ((config && config.retrieval && config.retrieval.topK) || 10);
5614
+ const recencyBoost = (config && config.retrieval && config.retrieval.recencyBoost) || 1.5;
5615
+ const results = rank(query, index, { topK, recencyBoost });
5616
+ if (args.includes('--json')) {
5617
+ process.stdout.write(JSON.stringify(formatRankJSON(results, query)) + '\n');
5618
+ } else {
5619
+ process.stdout.write(formatRankTable(results, query));
5620
+ }
5621
+ } catch (err) {
5622
+ console.error(`[sigmap] query error: ${err.message}`);
5623
+ process.exit(1);
5624
+ }
5625
+ process.exit(0);
5626
+ }
5627
+
5438
5628
  if (args.includes('--report')) {
5439
5629
  if (args.includes('--history')) {
5440
5630
  try {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "sigmap",
3
- "version": "2.2.0",
3
+ "version": "2.3.0",
4
4
  "description": "Zero-dependency AI context engine — 97% token reduction. No npm install. Runs on Node 18+.",
5
5
  "main": "gen-context.js",
6
6
  "bin": {
@@ -92,6 +92,14 @@ const DEFAULTS = {
92
92
 
93
93
  // Add reverse dependency usage hints on file headings (opt-in)
94
94
  impactRadius: false,
95
+
96
+ // Query-aware retrieval settings (v2.3)
97
+ retrieval: {
98
+ // Maximum number of files to return for --query
99
+ topK: 10,
100
+ // Multiplier applied to recently-changed files (>1 boosts them up)
101
+ recencyBoost: 1.5,
102
+ },
95
103
  };
96
104
 
97
105
  module.exports = { DEFAULTS };
@@ -430,4 +430,31 @@ function listModules(args, cwd) {
430
430
  ].join('\n');
431
431
  }
432
432
 
433
- module.exports = { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules };
433
+ /**
434
+ * query_context({ query, topK? }) → string
435
+ *
436
+ * Ranks context-file entries by relevance to the query and returns the
437
+ * top-K most relevant files with their signatures and scores.
438
+ */
439
+ function queryContext(args, cwd) {
440
+ if (!args || !args.query) return 'Missing required argument: query';
441
+
442
+ const contextPath = path.join(cwd, CONTEXT_FILE);
443
+ if (!fs.existsSync(contextPath)) {
444
+ return 'No context file found. Run: node gen-context.js';
445
+ }
446
+
447
+ try {
448
+ const { rank, buildSigIndex, formatRankTable } = require('../retrieval/ranker');
449
+ const index = buildSigIndex(cwd);
450
+ if (index.size === 0) return 'No signatures indexed. Run: node gen-context.js';
451
+
452
+ const topK = Math.min(Math.max(1, parseInt(args.topK, 10) || 10), 25);
453
+ const results = rank(args.query, index, { topK });
454
+ return formatRankTable(results, args.query);
455
+ } catch (err) {
456
+ return `_query_context failed: ${err.message}_`;
457
+ }
458
+ }
459
+
460
+ module.exports = { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules, queryContext };
package/src/mcp/server.js CHANGED
@@ -14,11 +14,11 @@
14
14
 
15
15
  const readline = require('readline');
16
16
  const { TOOLS } = require('./tools');
17
- const { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules } = require('./handlers');
17
+ const { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules, queryContext } = require('./handlers');
18
18
 
19
19
  const SERVER_INFO = {
20
20
  name: 'sigmap',
21
- version: '2.2.0',
21
+ version: '2.3.0',
22
22
  description: 'SigMap MCP server — code signatures on demand',
23
23
  };
24
24
 
@@ -73,6 +73,7 @@ function dispatch(msg, cwd) {
73
73
  else if (name === 'get_routing') text = getRouting(args, cwd);
74
74
  else if (name === 'explain_file') text = explainFile(args, cwd);
75
75
  else if (name === 'list_modules') text = listModules(args, cwd);
76
+ else if (name === 'query_context') text = queryContext(args, cwd);
76
77
  else {
77
78
  respondError(id, -32601, `Unknown tool: ${name}`);
78
79
  return;
package/src/mcp/tools.js CHANGED
@@ -120,6 +120,30 @@ const TOOLS = [
120
120
  required: [],
121
121
  },
122
122
  },
123
+ {
124
+ name: 'query_context',
125
+ description:
126
+ 'Rank and return the most relevant files for a specific task or question. ' +
127
+ 'Uses keyword + symbol + path scoring to surface only the top-K files relevant ' +
128
+ 'to the query — much cheaper than reading all context. ' +
129
+ 'Returns ranked file list with signatures and relevance scores.',
130
+ inputSchema: {
131
+ type: 'object',
132
+ properties: {
133
+ query: {
134
+ type: 'string',
135
+ description:
136
+ 'Natural language task description or keyword(s) to rank files against. ' +
137
+ 'E.g. "add a new language extractor", "fix secret scanning", "auth module".',
138
+ },
139
+ topK: {
140
+ type: 'number',
141
+ description: 'Maximum number of files to return (default: 10, max: 25).',
142
+ },
143
+ },
144
+ required: ['query'],
145
+ },
146
+ },
123
147
  ];
124
148
 
125
149
  module.exports = { TOOLS };
@@ -0,0 +1,242 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * SigMap zero-dependency relevance ranker.
5
+ *
6
+ * Ranks all files in a signature index against a natural-language query.
7
+ * Scoring weights:
8
+ * - keyword overlap (exact token match against sigs)
9
+ * - symbol match (token appears in a top-level identifier / function name)
10
+ * - partial prefix match (token is prefix of a sig token, length ≥ 4)
11
+ * - path relevance (query token appears in the file path)
12
+ * - recency boost (multiplier applied by rank() to files in opts.recencySet)
13
+ *
14
+ * Usage:
15
+ * const { rank } = require('./src/retrieval/ranker');
16
+ * const results = rank(query, sigIndex, { topK: 10 });
17
+ * // results: [{ file, score, sigs, tokens }]
18
+ */
19
+
20
+ const { tokenize, STOP_WORDS } = require('./tokenizer');
21
+
22
+ // ---------------------------------------------------------------------------
23
+ // Default weights
24
+ // ---------------------------------------------------------------------------
25
+ const DEFAULT_WEIGHTS = {
26
+ exactToken: 1.0, // query token exactly in sig tokens
27
+ symbolMatch: 0.5, // bonus if token appears in a function/class name line
28
+ prefixMatch: 0.3, // partial prefix hit (query token ≥ 4 chars)
29
+ pathMatch: 0.8, // query token appears in the file path
30
+ recencyBoost: 1.5, // multiplier applied when file is in recencySet
31
+ };
32
+
33
+ /**
34
+ * Score a single file against a query.
35
+ *
36
+ * @param {string} filePath - relative file path (e.g. 'src/extractors/python.js')
37
+ * @param {string[]} sigs - signature strings for this file
38
+ * @param {string[]} queryTokens - pre-tokenized query
39
+ * @param {object} weights
40
+ * @returns {number}
41
+ */
42
+ function scoreFile(filePath, sigs, queryTokens, weights) {
43
+ if (!sigs || sigs.length === 0) return 0;
44
+
45
+ const w = weights || DEFAULT_WEIGHTS;
46
+
47
+ // Build token set from all signatures
48
+ const sigText = sigs.join(' ');
49
+ const sigTokenSet = new Set(tokenize(sigText));
50
+
51
+ // Build token set from the file path
52
+ const pathTokenSet = new Set(tokenize(filePath));
53
+
54
+ let score = 0;
55
+
56
+ for (const qt of queryTokens) {
57
+ if (STOP_WORDS.has(qt)) continue;
58
+
59
+ // Exact token match in sigs
60
+ if (sigTokenSet.has(qt)) {
61
+ score += w.exactToken;
62
+
63
+ // Bonus: appears directly in a function/class/method name line
64
+ const nameLineMatch = sigs.some((sig) => {
65
+ const nt = tokenize(sig.replace(/[^a-zA-Z0-9_\s]/g, ' '));
66
+ return nt.includes(qt);
67
+ });
68
+ if (nameLineMatch) score += w.symbolMatch;
69
+ }
70
+
71
+ // Prefix match (e.g. query token "extract" matches sig token "extractor")
72
+ if (qt.length >= 4) {
73
+ for (const st of sigTokenSet) {
74
+ if (st !== qt && st.startsWith(qt)) {
75
+ score += w.prefixMatch;
76
+ break; // one bonus per query token
77
+ }
78
+ }
79
+ }
80
+
81
+ // Path token match
82
+ if (pathTokenSet.has(qt)) {
83
+ score += w.pathMatch;
84
+ }
85
+ }
86
+
87
+ return score;
88
+ }
89
+
90
+ /**
91
+ * Rank all files in a signature index against a query.
92
+ *
93
+ * @param {string} query - natural language query
94
+ * @param {Map<string,string[]>} sigIndex - Map<file, sigs[]>
95
+ * @param {object} [opts]
96
+ * @param {number} [opts.topK=10] - max results to return
97
+ * @param {number} [opts.recencyBoost=1.5] - multiplier for recent files
98
+ * @param {Set<string>} [opts.recencySet] - set of recently-changed file paths
99
+ * @param {object} [opts.weights] - override scoring weights
100
+ * @returns {{ file: string, score: number, sigs: string[], tokens: number }[]}
101
+ */
102
+ function rank(query, sigIndex, opts) {
103
+ if (!query || typeof query !== 'string') return [];
104
+ if (!sigIndex || !(sigIndex instanceof Map) || sigIndex.size === 0) return [];
105
+
106
+ const topK = (opts && opts.topK) || 10;
107
+ const recencyMultiplier = (opts && opts.recencyBoost) || DEFAULT_WEIGHTS.recencyBoost;
108
+ const recencySet = (opts && opts.recencySet) || null;
109
+ const weights = (opts && opts.weights) ? Object.assign({}, DEFAULT_WEIGHTS, opts.weights) : DEFAULT_WEIGHTS;
110
+
111
+ const queryTokens = tokenize(query);
112
+ if (queryTokens.length === 0) {
113
+ // Empty query: return top-K by file count (most signatures = most useful)
114
+ const all = [];
115
+ for (const [file, sigs] of sigIndex.entries()) {
116
+ all.push({ file, score: sigs.length, sigs, tokens: Math.ceil(sigs.join('\n').length / 4) });
117
+ }
118
+ all.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file));
119
+ return all.slice(0, topK);
120
+ }
121
+
122
+ const scored = [];
123
+ for (const [file, sigs] of sigIndex.entries()) {
124
+ let score = scoreFile(file, sigs, queryTokens, weights);
125
+
126
+ // Recency boost
127
+ if (recencySet && recencySet.has(file) && score > 0) {
128
+ score *= recencyMultiplier;
129
+ }
130
+
131
+ scored.push({
132
+ file,
133
+ score,
134
+ sigs,
135
+ tokens: Math.ceil(sigs.join('\n').length / 4),
136
+ });
137
+ }
138
+
139
+ scored.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file));
140
+ return scored.slice(0, topK);
141
+ }
142
+
143
+ /**
144
+ * Build a signature index from the generated context file.
145
+ * Returns Map<filePath, string[]> where filePath is the relative path
146
+ * as it appears in the ### headers of copilot-instructions.md.
147
+ *
148
+ * @param {string} cwd
149
+ * @returns {Map<string, string[]>}
150
+ */
151
+ function buildSigIndex(cwd) {
152
+ const fs = require('fs');
153
+ const path = require('path');
154
+ const contextPath = path.join(cwd, '.github', 'copilot-instructions.md');
155
+ const index = new Map();
156
+
157
+ if (!fs.existsSync(contextPath)) return index;
158
+
159
+ const content = fs.readFileSync(contextPath, 'utf8');
160
+ const lines = content.split('\n');
161
+
162
+ let currentFile = null;
163
+ let inBlock = false;
164
+ let sigs = [];
165
+
166
+ for (const line of lines) {
167
+ const headerMatch = line.match(/^###\s+(\S+)\s*$/);
168
+ if (headerMatch) {
169
+ if (currentFile !== null) index.set(currentFile, sigs);
170
+ currentFile = headerMatch[1];
171
+ sigs = [];
172
+ inBlock = false;
173
+ continue;
174
+ }
175
+ if (line.startsWith('```')) { inBlock = !inBlock; continue; }
176
+ if (inBlock && currentFile && line.trim()) sigs.push(line.trim());
177
+ }
178
+ if (currentFile !== null) index.set(currentFile, sigs);
179
+
180
+ return index;
181
+ }
182
+
183
+ /**
184
+ * Format ranked results as a markdown table string.
185
+ *
186
+ * @param {{ file: string, score: number, sigs: string[], tokens: number }[]} results
187
+ * @param {string} query
188
+ * @returns {string}
189
+ */
190
+ function formatRankTable(results, query) {
191
+ if (!results || results.length === 0) {
192
+ return `No matching files found for query: "${query}"\n`;
193
+ }
194
+
195
+ const lines = [
196
+ `## Query: ${query}`,
197
+ '',
198
+ '| Rank | File | Score | Sigs | Tokens |',
199
+ '|------|------|-------|------|--------|',
200
+ ...results.map((r, i) =>
201
+ `| ${i + 1} | ${r.file} | ${r.score.toFixed(2)} | ${r.sigs.length} | ${r.tokens} |`
202
+ ),
203
+ '',
204
+ ];
205
+
206
+ // Add signature details for top results
207
+ for (const r of results.slice(0, 3)) {
208
+ if (r.sigs.length > 0) {
209
+ lines.push(`### ${r.file}`);
210
+ lines.push('```');
211
+ lines.push(...r.sigs.slice(0, 10));
212
+ if (r.sigs.length > 10) lines.push(`... (${r.sigs.length - 10} more)`);
213
+ lines.push('```');
214
+ lines.push('');
215
+ }
216
+ }
217
+
218
+ return lines.join('\n');
219
+ }
220
+
221
+ /**
222
+ * Format ranked results as a structured JSON-serialisable object.
223
+ *
224
+ * @param {{ file: string, score: number, sigs: string[], tokens: number }[]} results
225
+ * @param {string} query
226
+ * @returns {object}
227
+ */
228
+ function formatRankJSON(results, query) {
229
+ return {
230
+ query,
231
+ results: (results || []).map((r, i) => ({
232
+ rank: i + 1,
233
+ file: r.file,
234
+ score: r.score,
235
+ sigs: r.sigs,
236
+ tokens: r.tokens,
237
+ })),
238
+ totalResults: (results || []).length,
239
+ };
240
+ }
241
+
242
+ module.exports = { rank, buildSigIndex, scoreFile, formatRankTable, formatRankJSON, DEFAULT_WEIGHTS };
@@ -0,0 +1,54 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * SigMap zero-dependency tokenizer.
5
+ * Splits code identifiers: camelCase, snake_case, kebab-case, PascalCase,
6
+ * removes stop words, and returns lower-case tokens.
7
+ */
8
+
9
+ const STOP_WORDS = new Set([
10
+ 'the', 'a', 'an', 'in', 'of', 'to', 'for', 'and', 'or', 'is', 'are',
11
+ 'that', 'this', 'it', 'with', 'from', 'by', 'be', 'as', 'on', 'at',
12
+ 'do', 'not', 'use', 'get', 'set', 'up', 'if', 'no', 'so', 'we',
13
+ ]);
14
+
15
+ /**
16
+ * Tokenize any text (query or code signature) into unique lower-case tokens.
17
+ * Handles:
18
+ * - camelCase → ['camel', 'case']
19
+ * - PascalCase → ['pascal', 'case']
20
+ * - snake_case → ['snake', 'case']
21
+ * - kebab-case → ['kebab', 'case']
22
+ * - dot.notation → ['dot', 'notation']
23
+ * - File paths → individual path components (no extension)
24
+ *
25
+ * @param {string} text
26
+ * @param {object} [opts]
27
+ * @param {boolean} [opts.removeStopWords=true]
28
+ * @param {number} [opts.minLength=2]
29
+ * @returns {string[]}
30
+ */
31
+ function tokenize(text, opts) {
32
+ if (!text || typeof text !== 'string') return [];
33
+ const removeStop = opts && opts.removeStopWords === false ? false : true;
34
+ const minLen = (opts && opts.minLength) || 2;
35
+
36
+ const tokens = text
37
+ // strip file extension (e.g. .js, .ts, .py)
38
+ .replace(/\.\w{1,6}(?=\s|\/|$)/g, ' ')
39
+ // camelCase / PascalCase split
40
+ .replace(/([a-z])([A-Z])/g, '$1 $2')
41
+ .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
42
+ // snake_case / kebab-case / dot.notation
43
+ .replace(/[_\-\.\/]/g, ' ')
44
+ // drop remaining non-word characters
45
+ .replace(/[^\w\s]/g, ' ')
46
+ .toLowerCase()
47
+ .split(/\s+/)
48
+ .filter((t) => t.length >= minLen);
49
+
50
+ if (!removeStop) return [...new Set(tokens)];
51
+ return [...new Set(tokens.filter((t) => !STOP_WORDS.has(t)))];
52
+ }
53
+
54
+ module.exports = { tokenize, STOP_WORDS };