sigmap 2.1.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +44 -0
- package/gen-context.js +459 -3
- package/package.json +1 -1
- package/src/config/defaults.js +8 -0
- package/src/eval/analyzer.js +221 -0
- package/src/mcp/handlers.js +28 -1
- package/src/mcp/server.js +3 -2
- package/src/mcp/tools.js +24 -0
- package/src/retrieval/ranker.js +242 -0
- package/src/retrieval/tokenizer.js +54 -0
package/CHANGELOG.md
CHANGED
|
@@ -6,6 +6,50 @@ Format: [Semantic Versioning](https://semver.org/)
|
|
|
6
6
|
|
|
7
7
|
---
|
|
8
8
|
|
|
9
|
+
## [2.3.0] — 2026-04-07
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
- **Query-aware retrieval** — `src/retrieval/tokenizer.js` and `src/retrieval/ranker.js`: zero-dependency relevance ranker that scores every file against a free-text query by exact token, symbol, prefix, path, and recency signals.
|
|
13
|
+
- **`--query "<text>"` CLI flag** — ranks all context files by relevance and prints a scored table (Rank | File | Score | Sigs | Tokens) plus the top-3 signature blocks; `--query "<text>" --json` for machine-readable output; `--query "<text>" --top <n>` to limit result set.
|
|
14
|
+
- **`query_context` MCP tool** — 8th MCP tool; accepts `{ query: string, topK?: number }` and returns the same ranked table as the `--query` CLI flag; live within any running MCP session.
|
|
15
|
+
- **Retrieval config** — `config.retrieval.topK` (default 10) and `config.retrieval.recencyBoost` (default 1.5×) added to `src/config/defaults.js`.
|
|
16
|
+
- **`test/integration/retrieval.test.js`** — 23 integration tests covering tokenizer unit tests, ranker sorting/scoring/topK/empty-query, `formatRankTable`, `formatRankJSON`, CLI `--query` flags, and MCP `query_context`.
|
|
17
|
+
|
|
18
|
+
### Changed
|
|
19
|
+
- `src/mcp/server.js` version bumped to `2.3.0`.
|
|
20
|
+
- `test/integration/mcp-server.test.js` and `mcp-v14.test.js` updated to assert 8 tools.
|
|
21
|
+
- `test/integration/analyze.test.js` version assertion updated to `2.3.0`.
|
|
22
|
+
|
|
23
|
+
### Validation gate
|
|
24
|
+
- 21/21 extractor unit tests passed
|
|
25
|
+
- 20/20 integration suites passed (0 failures)
|
|
26
|
+
- `node gen-context.js --version` → `2.3.0`
|
|
27
|
+
- `node gen-context.js --query "python extractor"` → `src/extractors/python.js` in top-3
|
|
28
|
+
- `node gen-context.js --query "fix secret scanning" --json` → valid JSON
|
|
29
|
+
- MCP `tools/list` → 8 tools including `query_context`
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## [2.2.0] — 2026-04-06
|
|
34
|
+
|
|
35
|
+
### Added
|
|
36
|
+
- **Diagnostics & analyze command** — `src/eval/analyzer.js`: per-file breakdown of signature count, token cost, extractor used, and test coverage status.
|
|
37
|
+
- **`--analyze` CLI flag** — prints a per-file table (File | Extractor | Sigs | Tokens | Covered) across all srcDirs; respects `exclude` config.
|
|
38
|
+
- **`--analyze --json` flag** — outputs the same breakdown as structured JSON (`{ files, totalSigs, totalTokens, slowFiles, fileCount }`).
|
|
39
|
+
- **`--analyze --slow` flag** — re-times each extractor and flags any file whose extraction takes >50ms in the table.
|
|
40
|
+
- **`--diagnose-extractors` CLI flag** — runs all 21 language extractors against `test/fixtures/` and compares output to `test/expected/`; exits non-zero if any extractor diverges, shows first diff line per failure.
|
|
41
|
+
- **`test/integration/analyze.test.js`** — 14 integration tests covering `analyzeFiles`, `formatAnalysisTable`, `formatAnalysisJSON`, and all four CLI flags.
|
|
42
|
+
|
|
43
|
+
### Validation gate
|
|
44
|
+
- 21/21 extractor tests passed
|
|
45
|
+
- All integration suites passed (19 suites, 19 passed, 0 failed — includes 14 new analyze tests)
|
|
46
|
+
- `node gen-context.js --version` → `2.2.0`
|
|
47
|
+
- `node gen-context.js --analyze` runs without error on SigMap repo
|
|
48
|
+
- `node gen-context.js --analyze --json` → valid JSON with required keys
|
|
49
|
+
- `node gen-context.js --diagnose-extractors` → exits 0 on SigMap repo
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
9
53
|
## [2.1.0] — 2026-04-05
|
|
10
54
|
|
|
11
55
|
### Added
|
package/gen-context.js
CHANGED
|
@@ -2879,7 +2879,23 @@ __factories["./src/mcp/handlers"] = function(module, exports) {
|
|
|
2879
2879
|
].join('\n');
|
|
2880
2880
|
}
|
|
2881
2881
|
|
|
2882
|
-
|
|
2882
|
+
function queryContext(args, cwd) {
|
|
2883
|
+
if (!args || !args.query) return 'Missing required argument: query';
|
|
2884
|
+
const contextPath = path.join(cwd, CONTEXT_FILE);
|
|
2885
|
+
if (!fs.existsSync(contextPath)) return 'No context file found. Run: node gen-context.js';
|
|
2886
|
+
try {
|
|
2887
|
+
const { rank, buildSigIndex, formatRankTable } = __require('./src/retrieval/ranker');
|
|
2888
|
+
const index = buildSigIndex(cwd);
|
|
2889
|
+
if (index.size === 0) return 'No signatures indexed. Run: node gen-context.js';
|
|
2890
|
+
const topK = Math.min(Math.max(1, parseInt(args.topK, 10) || 10), 25);
|
|
2891
|
+
const results = rank(args.query, index, { topK });
|
|
2892
|
+
return formatRankTable(results, args.query);
|
|
2893
|
+
} catch (err) {
|
|
2894
|
+
return `_query_context failed: ${err.message}_`;
|
|
2895
|
+
}
|
|
2896
|
+
}
|
|
2897
|
+
|
|
2898
|
+
module.exports = { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules, queryContext };
|
|
2883
2899
|
};
|
|
2884
2900
|
|
|
2885
2901
|
// ── ./src/mcp/server ──
|
|
@@ -2899,7 +2915,7 @@ __factories["./src/mcp/server"] = function(module, exports) {
|
|
|
2899
2915
|
|
|
2900
2916
|
const readline = require('readline');
|
|
2901
2917
|
const { TOOLS } = __require('./src/mcp/tools');
|
|
2902
|
-
const { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules } = __require('./src/mcp/handlers');
|
|
2918
|
+
const { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules, queryContext } = __require('./src/mcp/handlers');
|
|
2903
2919
|
|
|
2904
2920
|
const SERVER_INFO = {
|
|
2905
2921
|
name: 'sigmap',
|
|
@@ -2958,6 +2974,7 @@ __factories["./src/mcp/server"] = function(module, exports) {
|
|
|
2958
2974
|
else if (name === 'get_routing') text = getRouting(args, cwd);
|
|
2959
2975
|
else if (name === 'explain_file') text = explainFile(args, cwd);
|
|
2960
2976
|
else if (name === 'list_modules') text = listModules(args, cwd);
|
|
2977
|
+
else if (name === 'query_context') text = queryContext(args, cwd);
|
|
2961
2978
|
else {
|
|
2962
2979
|
respondError(id, -32601, `Unknown tool: ${name}`);
|
|
2963
2980
|
return;
|
|
@@ -3137,6 +3154,30 @@ __factories["./src/mcp/tools"] = function(module, exports) {
|
|
|
3137
3154
|
required: [],
|
|
3138
3155
|
},
|
|
3139
3156
|
},
|
|
3157
|
+
{
|
|
3158
|
+
name: 'query_context',
|
|
3159
|
+
description:
|
|
3160
|
+
'Rank and return the most relevant files for a specific task or question. ' +
|
|
3161
|
+
'Uses keyword + symbol + path scoring to surface only the top-K files relevant ' +
|
|
3162
|
+
'to the query — much cheaper than reading all context. ' +
|
|
3163
|
+
'Returns ranked file list with signatures and relevance scores.',
|
|
3164
|
+
inputSchema: {
|
|
3165
|
+
type: 'object',
|
|
3166
|
+
properties: {
|
|
3167
|
+
query: {
|
|
3168
|
+
type: 'string',
|
|
3169
|
+
description:
|
|
3170
|
+
'Natural language task description or keyword(s) to rank files against. ' +
|
|
3171
|
+
'E.g. "add a new language extractor", "fix secret scanning", "auth module".',
|
|
3172
|
+
},
|
|
3173
|
+
topK: {
|
|
3174
|
+
type: 'number',
|
|
3175
|
+
description: 'Maximum number of files to return (default: 10, max: 25).',
|
|
3176
|
+
},
|
|
3177
|
+
},
|
|
3178
|
+
required: ['query'],
|
|
3179
|
+
},
|
|
3180
|
+
},
|
|
3140
3181
|
];
|
|
3141
3182
|
|
|
3142
3183
|
module.exports = { TOOLS };
|
|
@@ -3570,6 +3611,120 @@ __factories["./src/tracking/logger"] = function(module, exports) {
|
|
|
3570
3611
|
|
|
3571
3612
|
};
|
|
3572
3613
|
|
|
3614
|
+
// ── ./src/retrieval/tokenizer ──
|
|
3615
|
+
__factories["./src/retrieval/tokenizer"] = function(module, exports) {
|
|
3616
|
+
'use strict';
|
|
3617
|
+
const STOP_WORDS = new Set([
|
|
3618
|
+
'the', 'a', 'an', 'in', 'of', 'to', 'for', 'and', 'or', 'is', 'are',
|
|
3619
|
+
'that', 'this', 'it', 'with', 'from', 'by', 'be', 'as', 'on', 'at',
|
|
3620
|
+
'do', 'not', 'use', 'get', 'set', 'up', 'if', 'no', 'so', 'we',
|
|
3621
|
+
]);
|
|
3622
|
+
function tokenize(text, opts) {
|
|
3623
|
+
if (!text || typeof text !== 'string') return [];
|
|
3624
|
+
const removeStop = opts && opts.removeStopWords === false ? false : true;
|
|
3625
|
+
const minLen = (opts && opts.minLength) || 2;
|
|
3626
|
+
const tokens = text
|
|
3627
|
+
.replace(/\.\w{1,6}(?=\s|\/|$)/g, ' ')
|
|
3628
|
+
.replace(/([a-z])([A-Z])/g, '$1 $2')
|
|
3629
|
+
.replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
|
|
3630
|
+
.replace(/[_\-\.\/]/g, ' ')
|
|
3631
|
+
.replace(/[^\w\s]/g, ' ')
|
|
3632
|
+
.toLowerCase()
|
|
3633
|
+
.split(/\s+/)
|
|
3634
|
+
.filter((t) => t.length >= minLen);
|
|
3635
|
+
if (!removeStop) return [...new Set(tokens)];
|
|
3636
|
+
return [...new Set(tokens.filter((t) => !STOP_WORDS.has(t)))];
|
|
3637
|
+
}
|
|
3638
|
+
module.exports = { tokenize, STOP_WORDS };
|
|
3639
|
+
};
|
|
3640
|
+
|
|
3641
|
+
// ── ./src/retrieval/ranker ──
|
|
3642
|
+
__factories["./src/retrieval/ranker"] = function(module, exports) {
|
|
3643
|
+
'use strict';
|
|
3644
|
+
const { tokenize, STOP_WORDS } = __require('./src/retrieval/tokenizer');
|
|
3645
|
+
const DEFAULT_WEIGHTS = {
|
|
3646
|
+
exactToken: 1.0, symbolMatch: 0.5, prefixMatch: 0.3, pathMatch: 0.8, recencyBoost: 1.5,
|
|
3647
|
+
};
|
|
3648
|
+
function scoreFile(filePath, sigs, queryTokens, weights) {
|
|
3649
|
+
if (!sigs || sigs.length === 0) return 0;
|
|
3650
|
+
const w = weights || DEFAULT_WEIGHTS;
|
|
3651
|
+
const sigTokenSet = new Set(tokenize(sigs.join(' ')));
|
|
3652
|
+
const pathTokenSet = new Set(tokenize(filePath));
|
|
3653
|
+
let score = 0;
|
|
3654
|
+
for (const qt of queryTokens) {
|
|
3655
|
+
if (STOP_WORDS.has(qt)) continue;
|
|
3656
|
+
if (sigTokenSet.has(qt)) {
|
|
3657
|
+
score += w.exactToken;
|
|
3658
|
+
if (sigs.some((sig) => tokenize(sig.replace(/[^a-zA-Z0-9_\s]/g, ' ')).includes(qt))) score += w.symbolMatch;
|
|
3659
|
+
}
|
|
3660
|
+
if (qt.length >= 4) {
|
|
3661
|
+
for (const st of sigTokenSet) {
|
|
3662
|
+
if (st !== qt && st.startsWith(qt)) { score += w.prefixMatch; break; }
|
|
3663
|
+
}
|
|
3664
|
+
}
|
|
3665
|
+
if (pathTokenSet.has(qt)) score += w.pathMatch;
|
|
3666
|
+
}
|
|
3667
|
+
return score;
|
|
3668
|
+
}
|
|
3669
|
+
function rank(query, sigIndex, opts) {
|
|
3670
|
+
if (!query || typeof query !== 'string') return [];
|
|
3671
|
+
if (!sigIndex || !(sigIndex instanceof Map) || sigIndex.size === 0) return [];
|
|
3672
|
+
const topK = (opts && opts.topK) || 10;
|
|
3673
|
+
const recencyMultiplier = (opts && opts.recencyBoost) || DEFAULT_WEIGHTS.recencyBoost;
|
|
3674
|
+
const recencySet = (opts && opts.recencySet) || null;
|
|
3675
|
+
const weights = (opts && opts.weights) ? Object.assign({}, DEFAULT_WEIGHTS, opts.weights) : DEFAULT_WEIGHTS;
|
|
3676
|
+
const queryTokens = tokenize(query);
|
|
3677
|
+
if (queryTokens.length === 0) {
|
|
3678
|
+
const all = [];
|
|
3679
|
+
for (const [file, sigs] of sigIndex.entries()) all.push({ file, score: sigs.length, sigs, tokens: Math.ceil(sigs.join('\n').length / 4) });
|
|
3680
|
+
all.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file));
|
|
3681
|
+
return all.slice(0, topK);
|
|
3682
|
+
}
|
|
3683
|
+
const scored = [];
|
|
3684
|
+
for (const [file, sigs] of sigIndex.entries()) {
|
|
3685
|
+
let score = scoreFile(file, sigs, queryTokens, weights);
|
|
3686
|
+
if (recencySet && recencySet.has(file) && score > 0) score *= recencyMultiplier;
|
|
3687
|
+
scored.push({ file, score, sigs, tokens: Math.ceil(sigs.join('\n').length / 4) });
|
|
3688
|
+
}
|
|
3689
|
+
scored.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file));
|
|
3690
|
+
return scored.slice(0, topK);
|
|
3691
|
+
}
|
|
3692
|
+
function buildSigIndex(cwd) {
|
|
3693
|
+
const fs = require('fs'); const path = require('path');
|
|
3694
|
+
const contextPath = path.join(cwd, '.github', 'copilot-instructions.md');
|
|
3695
|
+
const index = new Map();
|
|
3696
|
+
if (!fs.existsSync(contextPath)) return index;
|
|
3697
|
+
const content = fs.readFileSync(contextPath, 'utf8');
|
|
3698
|
+
const lines = content.split('\n');
|
|
3699
|
+
let currentFile = null; let inBlock = false; let sigs = [];
|
|
3700
|
+
for (const line of lines) {
|
|
3701
|
+
const hm = line.match(/^###\s+(\S+)\s*$/);
|
|
3702
|
+
if (hm) { if (currentFile !== null) index.set(currentFile, sigs); currentFile = hm[1]; sigs = []; inBlock = false; continue; }
|
|
3703
|
+
if (line.startsWith('```')) { inBlock = !inBlock; continue; }
|
|
3704
|
+
if (inBlock && currentFile && line.trim()) sigs.push(line.trim());
|
|
3705
|
+
}
|
|
3706
|
+
if (currentFile !== null) index.set(currentFile, sigs);
|
|
3707
|
+
return index;
|
|
3708
|
+
}
|
|
3709
|
+
function formatRankTable(results, query) {
|
|
3710
|
+
if (!results || results.length === 0) return `No matching files found for query: "${query}"\n`;
|
|
3711
|
+
const lines = [`## Query: ${query}`, '', '| Rank | File | Score | Sigs | Tokens |', '|------|------|-------|------|--------|',
|
|
3712
|
+
...results.map((r, i) => `| ${i + 1} | ${r.file} | ${r.score.toFixed(2)} | ${r.sigs.length} | ${r.tokens} |`), ''];
|
|
3713
|
+
for (const r of results.slice(0, 3)) {
|
|
3714
|
+
if (r.sigs.length > 0) {
|
|
3715
|
+
lines.push(`### ${r.file}`, '```', ...r.sigs.slice(0, 10));
|
|
3716
|
+
if (r.sigs.length > 10) lines.push(`... (${r.sigs.length - 10} more)`);
|
|
3717
|
+
lines.push('```', '');
|
|
3718
|
+
}
|
|
3719
|
+
}
|
|
3720
|
+
return lines.join('\n');
|
|
3721
|
+
}
|
|
3722
|
+
function formatRankJSON(results, query) {
|
|
3723
|
+
return { query, results: (results || []).map((r, i) => ({ rank: i + 1, file: r.file, score: r.score, sigs: r.sigs, tokens: r.tokens })), totalResults: (results || []).length };
|
|
3724
|
+
}
|
|
3725
|
+
module.exports = { rank, buildSigIndex, scoreFile, formatRankTable, formatRankJSON, DEFAULT_WEIGHTS };
|
|
3726
|
+
};
|
|
3727
|
+
|
|
3573
3728
|
// ── ./src/eval/scorer ──
|
|
3574
3729
|
__factories["./src/eval/scorer"] = function(module, exports) {
|
|
3575
3730
|
'use strict';
|
|
@@ -3630,6 +3785,143 @@ __factories["./src/eval/scorer"] = function(module, exports) {
|
|
|
3630
3785
|
module.exports = { hitAtK, reciprocalRank, precisionAtK, aggregate, firstRank };
|
|
3631
3786
|
};
|
|
3632
3787
|
|
|
3788
|
+
// ── ./src/eval/analyzer ──
|
|
3789
|
+
__factories["./src/eval/analyzer"] = function(module, exports) {
|
|
3790
|
+
'use strict';
|
|
3791
|
+
|
|
3792
|
+
const fs = require('fs');
|
|
3793
|
+
const path = require('path');
|
|
3794
|
+
|
|
3795
|
+
const EXT_MAP = {
|
|
3796
|
+
'.ts': 'typescript', '.tsx': 'typescript',
|
|
3797
|
+
'.js': 'javascript', '.jsx': 'javascript', '.mjs': 'javascript', '.cjs': 'javascript',
|
|
3798
|
+
'.py': 'python', '.pyw': 'python',
|
|
3799
|
+
'.java': 'java',
|
|
3800
|
+
'.kt': 'kotlin', '.kts': 'kotlin',
|
|
3801
|
+
'.go': 'go',
|
|
3802
|
+
'.rs': 'rust',
|
|
3803
|
+
'.cs': 'csharp',
|
|
3804
|
+
'.cpp': 'cpp', '.c': 'cpp', '.h': 'cpp', '.hpp': 'cpp', '.cc': 'cpp',
|
|
3805
|
+
'.rb': 'ruby', '.rake': 'ruby',
|
|
3806
|
+
'.php': 'php',
|
|
3807
|
+
'.swift': 'swift',
|
|
3808
|
+
'.dart': 'dart',
|
|
3809
|
+
'.scala': 'scala', '.sc': 'scala',
|
|
3810
|
+
'.vue': 'vue',
|
|
3811
|
+
'.svelte': 'svelte',
|
|
3812
|
+
'.html': 'html', '.htm': 'html',
|
|
3813
|
+
'.css': 'css', '.scss': 'css', '.sass': 'css', '.less': 'css',
|
|
3814
|
+
'.yml': 'yaml', '.yaml': 'yaml',
|
|
3815
|
+
'.sh': 'shell', '.bash': 'shell', '.zsh': 'shell', '.fish': 'shell',
|
|
3816
|
+
};
|
|
3817
|
+
|
|
3818
|
+
function isDockerfile(name) { return name === 'Dockerfile' || name.startsWith('Dockerfile.'); }
|
|
3819
|
+
|
|
3820
|
+
function getExtractorName(filePath) {
|
|
3821
|
+
const base = path.basename(filePath);
|
|
3822
|
+
const ext = path.extname(base).toLowerCase();
|
|
3823
|
+
if (EXT_MAP[ext]) return EXT_MAP[ext];
|
|
3824
|
+
if (isDockerfile(base)) return 'dockerfile';
|
|
3825
|
+
return null;
|
|
3826
|
+
}
|
|
3827
|
+
|
|
3828
|
+
function tokenCount(sigs) {
|
|
3829
|
+
return Math.ceil(sigs.reduce((sum, s) => sum + s.length, 0) / 4);
|
|
3830
|
+
}
|
|
3831
|
+
|
|
3832
|
+
function hasCoverage(filePath, cwd) {
|
|
3833
|
+
const base = path.basename(filePath, path.extname(filePath));
|
|
3834
|
+
const testDirs = ['test', 'tests', '__tests__', 'spec'];
|
|
3835
|
+
for (const d of testDirs) {
|
|
3836
|
+
const abs = path.join(cwd, d);
|
|
3837
|
+
if (!fs.existsSync(abs)) continue;
|
|
3838
|
+
let entries;
|
|
3839
|
+
try { entries = fs.readdirSync(abs, { withFileTypes: true }); } catch (_) { continue; }
|
|
3840
|
+
for (const e of entries) { if (e.name.includes(base)) return true; }
|
|
3841
|
+
}
|
|
3842
|
+
return false;
|
|
3843
|
+
}
|
|
3844
|
+
|
|
3845
|
+
function analyzeFiles(files, cwd, opts) {
|
|
3846
|
+
const slow = (opts && opts.slow) || false;
|
|
3847
|
+
const slowMs = (opts && opts.slowMs) || 50;
|
|
3848
|
+
const maxSigs = (opts && opts.maxSigs) || 25;
|
|
3849
|
+
const stats = [];
|
|
3850
|
+
const cache = {};
|
|
3851
|
+
|
|
3852
|
+
for (const filePath of files) {
|
|
3853
|
+
const extractorName = getExtractorName(filePath);
|
|
3854
|
+
if (!extractorName) continue;
|
|
3855
|
+
if (!cache[extractorName]) {
|
|
3856
|
+
try { cache[extractorName] = __require(`./src/extractors/${extractorName}`); } catch (_) { cache[extractorName] = null; }
|
|
3857
|
+
}
|
|
3858
|
+
const extractor = cache[extractorName];
|
|
3859
|
+
if (!extractor || typeof extractor.extract !== 'function') continue;
|
|
3860
|
+
|
|
3861
|
+
let content;
|
|
3862
|
+
try { content = fs.readFileSync(filePath, 'utf8'); } catch (_) { continue; }
|
|
3863
|
+
|
|
3864
|
+
let sigs; let elapsedMs = 0;
|
|
3865
|
+
if (slow) {
|
|
3866
|
+
const t0 = Date.now();
|
|
3867
|
+
try { sigs = extractor.extract(content); } catch (_) { sigs = []; }
|
|
3868
|
+
elapsedMs = Date.now() - t0;
|
|
3869
|
+
} else {
|
|
3870
|
+
try { sigs = extractor.extract(content); } catch (_) { sigs = []; }
|
|
3871
|
+
}
|
|
3872
|
+
sigs = (Array.isArray(sigs) ? sigs : []).slice(0, maxSigs);
|
|
3873
|
+
|
|
3874
|
+
stats.push({
|
|
3875
|
+
file: path.relative(cwd, filePath),
|
|
3876
|
+
extractor: extractorName,
|
|
3877
|
+
sigs: sigs.length,
|
|
3878
|
+
tokens: tokenCount(sigs),
|
|
3879
|
+
covered: hasCoverage(filePath, cwd),
|
|
3880
|
+
elapsedMs: slow ? elapsedMs : undefined,
|
|
3881
|
+
slow: slow ? (elapsedMs > slowMs) : undefined,
|
|
3882
|
+
});
|
|
3883
|
+
}
|
|
3884
|
+
return stats;
|
|
3885
|
+
}
|
|
3886
|
+
|
|
3887
|
+
function formatAnalysisTable(stats, showSlow) {
|
|
3888
|
+
if (!stats || stats.length === 0) return '_(no files analyzed)_\n';
|
|
3889
|
+
const maxFile = Math.max(4, ...stats.map((s) => s.file.length));
|
|
3890
|
+
const header = showSlow
|
|
3891
|
+
? `| ${'File'.padEnd(maxFile)} | Sigs | Tokens | Extractor | Coverage | Elapsed |`
|
|
3892
|
+
: `| ${'File'.padEnd(maxFile)} | Sigs | Tokens | Extractor | Coverage |`;
|
|
3893
|
+
const sep = showSlow
|
|
3894
|
+
? `|${'-'.repeat(maxFile + 2)}|------|--------|-------------|------------|----------|`
|
|
3895
|
+
: `|${'-'.repeat(maxFile + 2)}|------|--------|-------------|------------|`;
|
|
3896
|
+
const rows = stats.map((s) => {
|
|
3897
|
+
const cov = s.covered ? '✓ tested ' : '✗ untested';
|
|
3898
|
+
const file = s.file.padEnd(maxFile);
|
|
3899
|
+
const ext = (s.extractor || '').padEnd(11);
|
|
3900
|
+
const base = `| ${file} | ${String(s.sigs).padStart(4)} | ${String(s.tokens).padStart(6)} | ${ext} | ${cov} |`;
|
|
3901
|
+
if (showSlow) {
|
|
3902
|
+
const ms = s.elapsedMs !== undefined ? `${s.elapsedMs}ms` : '';
|
|
3903
|
+
return `${base} ${ms.padStart(6)}${s.slow ? ' ⚠️' : ''} |`;
|
|
3904
|
+
}
|
|
3905
|
+
return base;
|
|
3906
|
+
});
|
|
3907
|
+
const totalSigs = stats.reduce((n, s) => n + s.sigs, 0);
|
|
3908
|
+
const totalTokens = stats.reduce((n, s) => n + s.tokens, 0);
|
|
3909
|
+
const slotFile = ''.padEnd(maxFile);
|
|
3910
|
+
const baseFoot = `| ${slotFile} | ${String(totalSigs).padStart(4)} | ${String(totalTokens).padStart(6)} | **Total** | |`;
|
|
3911
|
+
const footer = showSlow ? `${baseFoot} ${' '.padStart(8)} |` : baseFoot;
|
|
3912
|
+
return [header, sep, ...rows, sep, footer].join('\n') + '\n';
|
|
3913
|
+
}
|
|
3914
|
+
|
|
3915
|
+
function formatAnalysisJSON(stats) {
|
|
3916
|
+
const totalSigs = stats.reduce((n, s) => n + s.sigs, 0);
|
|
3917
|
+
const totalTokens = stats.reduce((n, s) => n + s.tokens, 0);
|
|
3918
|
+
const slowFiles = stats.filter((s) => s.slow).map((s) => ({ file: s.file, elapsedMs: s.elapsedMs }));
|
|
3919
|
+
return { files: stats, totalSigs, totalTokens, slowFiles, fileCount: stats.length };
|
|
3920
|
+
}
|
|
3921
|
+
|
|
3922
|
+
module.exports = { analyzeFiles, formatAnalysisTable, formatAnalysisJSON };
|
|
3923
|
+
};
|
|
3924
|
+
|
|
3633
3925
|
// ── ./src/eval/runner ──
|
|
3634
3926
|
__factories["./src/eval/runner"] = function(module, exports) {
|
|
3635
3927
|
'use strict';
|
|
@@ -3799,7 +4091,7 @@ const path = require('path');
|
|
|
3799
4091
|
const os = require('os');
|
|
3800
4092
|
const { execSync } = require('child_process');
|
|
3801
4093
|
|
|
3802
|
-
const VERSION = '2.1.0';
|
|
4094
|
+
const VERSION = '2.3.0';
|
|
3803
4095
|
const MARKER = '\n\n## Auto-generated signatures\n<!-- Updated by gen-context.js -->\n';
|
|
3804
4096
|
|
|
3805
4097
|
function requireSourceOrBundled(key) {
|
|
@@ -5008,6 +5300,13 @@ Usage:
|
|
|
5008
5300
|
node gen-context.js --benchmark Run retrieval benchmark (benchmarks/tasks/retrieval.jsonl)
|
|
5009
5301
|
node gen-context.js --benchmark --json Benchmark results as JSON
|
|
5010
5302
|
node gen-context.js --eval Alias for --benchmark
|
|
5303
|
+
node gen-context.js --analyze Per-file breakdown: sigs, tokens, extractor, coverage
|
|
5304
|
+
node gen-context.js --analyze --json Breakdown as JSON
|
|
5305
|
+
node gen-context.js --analyze --slow Re-time each extractor; flag files >50ms
|
|
5306
|
+
node gen-context.js --diagnose-extractors Run all 21 extractors vs fixtures; show pass/fail + diff
|
|
5307
|
+
node gen-context.js --query "<text>" Rank files by relevance to a query
|
|
5308
|
+
node gen-context.js --query "<text>" --json Ranked results as JSON
|
|
5309
|
+
node gen-context.js --query "<text>" --top <n> Limit results to top N files (default 10)
|
|
5011
5310
|
node gen-context.js --init Write example config + .contextignore scaffold
|
|
5012
5311
|
node gen-context.js --help Show this message
|
|
5013
5312
|
node gen-context.js --version Show version
|
|
@@ -5169,6 +5468,163 @@ function main() {
|
|
|
5169
5468
|
process.exit(0);
|
|
5170
5469
|
}
|
|
5171
5470
|
|
|
5471
|
+
if (args.includes('--analyze')) {
|
|
5472
|
+
try {
|
|
5473
|
+
const { analyzeFiles, formatAnalysisTable, formatAnalysisJSON } = requireSourceOrBundled('./src/eval/analyzer');
|
|
5474
|
+
const cfg = config || {};
|
|
5475
|
+
const srcDirs = cfg.srcDirs || DEFAULTS.srcDirs;
|
|
5476
|
+
const exclude = cfg.exclude || DEFAULTS.exclude;
|
|
5477
|
+
const slow = args.includes('--slow');
|
|
5478
|
+
|
|
5479
|
+
// Collect files (reuse existing file-walker if accessible, else inline)
|
|
5480
|
+
const allFiles = [];
|
|
5481
|
+
function walkForAnalyze(dir, depth) {
|
|
5482
|
+
if (depth > (cfg.maxDepth || 6)) return;
|
|
5483
|
+
let entries;
|
|
5484
|
+
try { entries = fs.readdirSync(dir, { withFileTypes: true }); } catch (_) { return; }
|
|
5485
|
+
for (const e of entries) {
|
|
5486
|
+
if (exclude.some((x) => e.name === x || e.name.startsWith(x))) continue;
|
|
5487
|
+
const full = path.join(dir, e.name);
|
|
5488
|
+
if (e.isDirectory()) walkForAnalyze(full, depth + 1);
|
|
5489
|
+
else if (e.isFile()) allFiles.push(full);
|
|
5490
|
+
}
|
|
5491
|
+
}
|
|
5492
|
+
for (const sd of srcDirs) {
|
|
5493
|
+
const abs = path.join(cwd, sd);
|
|
5494
|
+
if (fs.existsSync(abs)) walkForAnalyze(abs, 0);
|
|
5495
|
+
}
|
|
5496
|
+
|
|
5497
|
+
const stats = analyzeFiles(allFiles, cwd, { slow, maxSigs: cfg.maxSigsPerFile || 25 });
|
|
5498
|
+
|
|
5499
|
+
if (args.includes('--json')) {
|
|
5500
|
+
process.stdout.write(JSON.stringify(formatAnalysisJSON(stats)) + '\n');
|
|
5501
|
+
} else {
|
|
5502
|
+
const table = formatAnalysisTable(stats, slow);
|
|
5503
|
+
process.stdout.write(table);
|
|
5504
|
+
}
|
|
5505
|
+
} catch (err) {
|
|
5506
|
+
console.error(`[sigmap] analyze error: ${err.message}`);
|
|
5507
|
+
process.exit(1);
|
|
5508
|
+
}
|
|
5509
|
+
process.exit(0);
|
|
5510
|
+
}
|
|
5511
|
+
|
|
5512
|
+
if (args.includes('--diagnose-extractors')) {
|
|
5513
|
+
try {
|
|
5514
|
+
const fixturesDir = path.join(cwd, 'test', 'fixtures');
|
|
5515
|
+
const expectedDir = path.join(cwd, 'test', 'expected');
|
|
5516
|
+
if (!fs.existsSync(fixturesDir) || !fs.existsSync(expectedDir)) {
|
|
5517
|
+
console.error('[sigmap] test/fixtures or test/expected not found — run from the SigMap repo root');
|
|
5518
|
+
process.exit(1);
|
|
5519
|
+
}
|
|
5520
|
+
|
|
5521
|
+
const EXT_TO_LANG = {
|
|
5522
|
+
'.ts': 'typescript', '.js': 'javascript', '.py': 'python',
|
|
5523
|
+
'.java': 'java', '.kt': 'kotlin', '.go': 'go', '.rs': 'rust',
|
|
5524
|
+
'.cs': 'csharp', '.cpp': 'cpp', '.rb': 'ruby', '.php': 'php',
|
|
5525
|
+
'.swift': 'swift', '.dart': 'dart', '.scala': 'scala',
|
|
5526
|
+
'.vue': 'vue', '.svelte': 'svelte', '.html': 'html',
|
|
5527
|
+
'.css': 'css', '.yml': 'yaml', '.sh': 'shell',
|
|
5528
|
+
};
|
|
5529
|
+
const SPECIAL = { 'Dockerfile': 'dockerfile' };
|
|
5530
|
+
|
|
5531
|
+
let passed = 0; let failed = 0;
|
|
5532
|
+
const entries = fs.readdirSync(fixturesDir).sort();
|
|
5533
|
+
|
|
5534
|
+
for (const filename of entries) {
|
|
5535
|
+
const ext = path.extname(filename).toLowerCase();
|
|
5536
|
+
const lang = EXT_TO_LANG[ext] || SPECIAL[filename];
|
|
5537
|
+
if (!lang) continue;
|
|
5538
|
+
|
|
5539
|
+
const fixturePath = path.join(fixturesDir, filename);
|
|
5540
|
+
const expectedPath = path.join(expectedDir, `${lang}.txt`);
|
|
5541
|
+
if (!fs.existsSync(expectedPath)) {
|
|
5542
|
+
console.log(` SKIP ${lang.padEnd(12)} (no expected file)`);
|
|
5543
|
+
continue;
|
|
5544
|
+
}
|
|
5545
|
+
|
|
5546
|
+
const src = fs.readFileSync(fixturePath, 'utf8');
|
|
5547
|
+
const expected = fs.readFileSync(expectedPath, 'utf8').trim();
|
|
5548
|
+
|
|
5549
|
+
let mod;
|
|
5550
|
+
try {
|
|
5551
|
+
mod = requireSourceOrBundled(`./src/extractors/${lang}`);
|
|
5552
|
+
} catch (e) {
|
|
5553
|
+
console.log(` ERROR ${lang.padEnd(12)} loader failed: ${e.message}`);
|
|
5554
|
+
failed++;
|
|
5555
|
+
continue;
|
|
5556
|
+
}
|
|
5557
|
+
|
|
5558
|
+
let actual;
|
|
5559
|
+
try {
|
|
5560
|
+
const sigs = mod.extract(src);
|
|
5561
|
+
actual = sigs.join('\n').trim();
|
|
5562
|
+
} catch (e) {
|
|
5563
|
+
console.log(` ERROR ${lang.padEnd(12)} extract() threw: ${e.message}`);
|
|
5564
|
+
failed++;
|
|
5565
|
+
continue;
|
|
5566
|
+
}
|
|
5567
|
+
|
|
5568
|
+
if (actual === expected) {
|
|
5569
|
+
console.log(` PASS ${lang}`);
|
|
5570
|
+
passed++;
|
|
5571
|
+
} else {
|
|
5572
|
+
console.log(` FAIL ${lang}`);
|
|
5573
|
+
// Show first diff line
|
|
5574
|
+
const aLines = actual.split('\n');
|
|
5575
|
+
const eLines = expected.split('\n');
|
|
5576
|
+
const maxLen = Math.max(aLines.length, eLines.length);
|
|
5577
|
+
for (let i = 0; i < maxLen; i++) {
|
|
5578
|
+
if (aLines[i] !== eLines[i]) {
|
|
5579
|
+
console.log(` expected: ${(eLines[i] || '(missing)').slice(0, 100)}`);
|
|
5580
|
+
console.log(` actual : ${(aLines[i] || '(missing)').slice(0, 100)}`);
|
|
5581
|
+
break;
|
|
5582
|
+
}
|
|
5583
|
+
}
|
|
5584
|
+
failed++;
|
|
5585
|
+
}
|
|
5586
|
+
}
|
|
5587
|
+
|
|
5588
|
+
console.log(`\n${passed} passed, ${failed} failed`);
|
|
5589
|
+
process.exit(failed > 0 ? 1 : 0);
|
|
5590
|
+
} catch (err) {
|
|
5591
|
+
console.error(`[sigmap] diagnose error: ${err.message}`);
|
|
5592
|
+
process.exit(1);
|
|
5593
|
+
}
|
|
5594
|
+
}
|
|
5595
|
+
|
|
5596
|
+
if (args.includes('--query')) {
|
|
5597
|
+
try {
|
|
5598
|
+
const qIdx = args.indexOf('--query');
|
|
5599
|
+
const query = (args[qIdx + 1] || '').trim();
|
|
5600
|
+
if (!query || query.startsWith('--')) {
|
|
5601
|
+
console.error('[sigmap] --query requires a search string');
|
|
5602
|
+
console.error(' Example: node gen-context.js --query "add a new language extractor"');
|
|
5603
|
+
process.exit(1);
|
|
5604
|
+
}
|
|
5605
|
+
const { rank, buildSigIndex, formatRankTable, formatRankJSON } = requireSourceOrBundled('./src/retrieval/ranker');
|
|
5606
|
+
const index = buildSigIndex(cwd);
|
|
5607
|
+
if (index.size === 0) {
|
|
5608
|
+
console.error('[sigmap] no context file found. Run: node gen-context.js');
|
|
5609
|
+
process.exit(1);
|
|
5610
|
+
}
|
|
5611
|
+
const topIdx = args.indexOf('--top');
|
|
5612
|
+
const topK = topIdx >= 0 ? Math.min(Math.max(1, parseInt(args[topIdx + 1], 10) || 10), 25)
|
|
5613
|
+
: ((config && config.retrieval && config.retrieval.topK) || 10);
|
|
5614
|
+
const recencyBoost = (config && config.retrieval && config.retrieval.recencyBoost) || 1.5;
|
|
5615
|
+
const results = rank(query, index, { topK, recencyBoost });
|
|
5616
|
+
if (args.includes('--json')) {
|
|
5617
|
+
process.stdout.write(JSON.stringify(formatRankJSON(results, query)) + '\n');
|
|
5618
|
+
} else {
|
|
5619
|
+
process.stdout.write(formatRankTable(results, query));
|
|
5620
|
+
}
|
|
5621
|
+
} catch (err) {
|
|
5622
|
+
console.error(`[sigmap] query error: ${err.message}`);
|
|
5623
|
+
process.exit(1);
|
|
5624
|
+
}
|
|
5625
|
+
process.exit(0);
|
|
5626
|
+
}
|
|
5627
|
+
|
|
5172
5628
|
if (args.includes('--report')) {
|
|
5173
5629
|
if (args.includes('--history')) {
|
|
5174
5630
|
try {
|
package/package.json
CHANGED
package/src/config/defaults.js
CHANGED
|
@@ -92,6 +92,14 @@ const DEFAULTS = {
|
|
|
92
92
|
|
|
93
93
|
// Add reverse dependency usage hints on file headings (opt-in)
|
|
94
94
|
impactRadius: false,
|
|
95
|
+
|
|
96
|
+
// Query-aware retrieval settings (v2.3)
|
|
97
|
+
retrieval: {
|
|
98
|
+
// Maximum number of files to return for --query
|
|
99
|
+
topK: 10,
|
|
100
|
+
// Multiplier applied to recently-changed files (>1 boosts them up)
|
|
101
|
+
recencyBoost: 1.5,
|
|
102
|
+
},
|
|
95
103
|
};
|
|
96
104
|
|
|
97
105
|
module.exports = { DEFAULTS };
|
package/src/eval/analyzer.js
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* SigMap file analyzer — per-file diagnostic statistics.
|
|
5
|
+
* Zero npm dependencies.
|
|
6
|
+
*
|
|
7
|
+
* Exports:
|
|
8
|
+
* analyzeFiles(files, cwd, opts) → stats[]
|
|
9
|
+
* formatAnalysisTable(stats) → markdown table string
|
|
10
|
+
* formatAnalysisJSON(stats) → plain object suitable for JSON.stringify
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
const fs = require('fs');
|
|
14
|
+
const path = require('path');
|
|
15
|
+
|
|
16
|
+
// Lookup table: lower-cased file extension (leading dot included) → name of
// the extractor module that handles it. Mirrors EXT_MAP in gen-context.js.
const EXT_MAP = {
  // TypeScript / JavaScript
  '.ts': 'typescript',
  '.tsx': 'typescript',
  '.js': 'javascript',
  '.jsx': 'javascript',
  '.mjs': 'javascript',
  '.cjs': 'javascript',
  // Python
  '.py': 'python',
  '.pyw': 'python',
  // JVM
  '.java': 'java',
  '.kt': 'kotlin',
  '.kts': 'kotlin',
  // Systems languages
  '.go': 'go',
  '.rs': 'rust',
  '.cs': 'csharp',
  '.cpp': 'cpp',
  '.c': 'cpp',
  '.h': 'cpp',
  '.hpp': 'cpp',
  '.cc': 'cpp',
  // Scripting / mobile
  '.rb': 'ruby',
  '.rake': 'ruby',
  '.php': 'php',
  '.swift': 'swift',
  '.dart': 'dart',
  '.scala': 'scala',
  '.sc': 'scala',
  // Front-end
  '.vue': 'vue',
  '.svelte': 'svelte',
  '.html': 'html',
  '.htm': 'html',
  '.css': 'css',
  '.scss': 'css',
  '.sass': 'css',
  '.less': 'css',
  // Config / shell
  '.yml': 'yaml',
  '.yaml': 'yaml',
  '.sh': 'shell',
  '.bash': 'shell',
  '.zsh': 'shell',
  '.fish': 'shell',
};
|
|
39
|
+
|
|
40
|
+
/**
 * Tell whether a bare file name denotes a Dockerfile.
 * Matches the exact name `Dockerfile` and dotted variants such as
 * `Dockerfile.dev`; the comparison is case-sensitive.
 *
 * @param {string} name - file name without directory components
 * @returns {boolean}
 */
function isDockerfile(name) {
  if (name === 'Dockerfile') return true;
  return name.startsWith('Dockerfile.');
}
|
|
43
|
+
|
|
44
|
+
/**
 * Resolve the extractor name for a file.
 * The lower-cased extension is looked up in EXT_MAP first; files without a
 * mapped extension fall back to the Dockerfile name check.
 *
 * @param {string} filePath - absolute or relative path
 * @returns {string|null} extractor name, or null when the file is unsupported
 */
function getExtractorName(filePath) {
  const fileName = path.basename(filePath);
  const mapped = EXT_MAP[path.extname(fileName).toLowerCase()];
  if (mapped) return mapped;
  return isDockerfile(fileName) ? 'dockerfile' : null;
}
|
|
51
|
+
|
|
52
|
+
/**
 * Rough token estimate for a list of signatures: total characters divided
 * by four, rounded up.
 *
 * @param {string[]} sigs
 * @returns {number}
 */
function tokenCount(sigs) {
  let totalChars = 0;
  for (const sig of sigs) {
    totalChars += sig.length;
  }
  return Math.ceil(totalChars / 4);
}
|
|
56
|
+
|
|
57
|
+
/**
 * Heuristic test-coverage check for a source file.
 *
 * Looks at the direct children (one level only, for speed) of the
 * conventional test directories under `cwd` — test, tests, __tests__, spec —
 * and reports true as soon as any entry name contains the source file's
 * base name (extension removed) as a substring. Directories that are missing
 * or unreadable are ignored.
 *
 * @param {string} filePath - path of the source file
 * @param {string} cwd - project root
 * @returns {boolean}
 */
function hasCoverage(filePath, cwd) {
  const relPath = path.relative(cwd, filePath);
  const stem = path.basename(relPath, path.extname(relPath)); // e.g. "python"

  return ['test', 'tests', '__tests__', 'spec'].some((dirName) => {
    const dirPath = path.join(cwd, dirName);
    if (!fs.existsSync(dirPath)) return false;

    let entryNames;
    try {
      entryNames = fs.readdirSync(dirPath);
    } catch (_) {
      return false;
    }
    return entryNames.some((entryName) => entryName.includes(stem));
  });
}
|
|
77
|
+
|
|
78
|
+
/**
 * Load the extractor module called `name`.
 *
 * Resolution order:
 *   1. `<cwd>/src/extractors/<name>.js` — for projects that embed sigmap.
 *      NOTE: this executes code from the analyzed project directory.
 *   2. `../extractors/<name>.js` relative to this module.
 * Load failures are deliberately swallowed so one broken extractor cannot
 * abort the whole analysis.
 *
 * @param {string} name - extractor name, e.g. 'python'
 * @param {string} cwd - project root
 * @returns {object|null} the loaded module, or null when neither location loads
 */
function loadExtractor(name, cwd) {
  const moduleFile = `${name}.js`;

  const repoLocal = path.join(cwd, 'src', 'extractors', moduleFile);
  if (fs.existsSync(repoLocal)) {
    try {
      return require(repoLocal);
    } catch (_) {
      // Broken local copy: fall through to the bundled extractor.
    }
  }

  try {
    const bundled = path.join(__dirname, '..', 'extractors', moduleFile);
    return require(bundled);
  } catch (_) {
    return null;
  }
}
|
|
92
|
+
|
|
93
|
+
/**
 * Analyze a list of absolute file paths and collect per-file statistics.
 *
 * Files are skipped silently when no extractor is mapped to them, when the
 * extractor cannot be loaded or has no `extract` function, or when the file
 * cannot be read. An extractor that throws yields zero signatures for that
 * file rather than aborting the run.
 *
 * @param {string[]} files - absolute paths to analyze
 * @param {string} cwd - project root
 * @param {object} [opts]
 * @param {boolean} [opts.slow=false] - if true, measure extraction time per file
 * @param {number} [opts.slowMs=50] - threshold (ms) before a file is "slow"
 * @param {number} [opts.maxSigs=25] - max sigs per file
 * @returns {object[]} array of per-file stat objects
 *   `{ file, extractor, sigs, tokens, covered, elapsedMs, slow }`;
 *   `elapsedMs` and `slow` are undefined unless `opts.slow` is set
 */
function analyzeFiles(files, cwd, opts) {
  // A falsy option (including 0) falls back to its default.
  const slow = (opts && opts.slow) || false;
  const slowMs = (opts && opts.slowMs) || 50;
  const maxSigs = (opts && opts.maxSigs) || 25;

  const stats = [];
  // Keyed by extractor name. Presence is tested with has(), so a failed load
  // (null) is remembered too and not retried for every file of that type.
  const extractorCache = new Map();

  for (const filePath of files) {
    const extractorName = getExtractorName(filePath);
    if (!extractorName) continue;

    if (!extractorCache.has(extractorName)) {
      extractorCache.set(extractorName, loadExtractor(extractorName, cwd));
    }
    const extractor = extractorCache.get(extractorName);
    if (!extractor || typeof extractor.extract !== 'function') continue;

    let content;
    try {
      content = fs.readFileSync(filePath, 'utf8');
    } catch (_) {
      continue; // unreadable file: leave it out of the report
    }

    // The clock is only read when timing was requested.
    const startedAt = slow ? Date.now() : 0;
    let sigs;
    try {
      sigs = extractor.extract(content);
    } catch (_) {
      sigs = [];
    }
    const elapsedMs = slow ? Date.now() - startedAt : 0;

    sigs = (Array.isArray(sigs) ? sigs : []).slice(0, maxSigs);

    stats.push({
      file: path.relative(cwd, filePath),
      extractor: extractorName,
      sigs: sigs.length,
      tokens: tokenCount(sigs),
      covered: hasCoverage(filePath, cwd),
      elapsedMs: slow ? elapsedMs : undefined,
      slow: slow ? elapsedMs > slowMs : undefined,
    });
  }

  return stats;
}
|
|
157
|
+
|
|
158
|
+
/**
 * Format stats as a markdown table.
 *
 * Layout: header row, separator, one row per file, separator, totals row.
 * The File column is padded to the longest file name (minimum 4, the width
 * of the word "File"). When `showSlow` is truthy an extra Elapsed column is
 * appended, with a warning marker on rows whose `slow` flag is set.
 *
 * @param {object[]} stats - output of analyzeFiles()
 * @param {boolean} showSlow - whether to include the Elapsed column
 * @returns {string} the table, newline-terminated; a placeholder line when
 *   `stats` is empty or missing
 */
function formatAnalysisTable(stats, showSlow) {
  if (!stats || stats.length === 0) return '_(no files analyzed)_\n';

  // Column widths
  const maxFile = Math.max(4, ...stats.map((s) => s.file.length));

  const header = showSlow
    ? `| ${'File'.padEnd(maxFile)} | Sigs | Tokens | Extractor | Coverage | Elapsed |`
    : `| ${'File'.padEnd(maxFile)} | Sigs | Tokens | Extractor | Coverage |`;

  // maxFile + 2 accounts for the single space on each side of the File cell.
  const sep = showSlow
    ? `|${'-'.repeat(maxFile + 2)}|------|--------|-------------|------------|----------|`
    : `|${'-'.repeat(maxFile + 2)}|------|--------|-------------|------------|`;

  const rows = stats.map((s) => {
    const cov = s.covered ? '✓ tested ' : '✗ untested';
    const file = s.file.padEnd(maxFile);
    const ext = (s.extractor || '').padEnd(11);
    const base = `| ${file} | ${String(s.sigs).padStart(4)} | ${String(s.tokens).padStart(6)} | ${ext} | ${cov} |`;
    if (showSlow) {
      // elapsedMs is undefined when timing was not collected; render blank.
      const ms = s.elapsedMs !== undefined ? `${s.elapsedMs}ms` : '';
      const flag = s.slow ? ' ⚠️' : '';
      return `${base} ${ms.padStart(6)}${flag} |`;
    }
    return base;
  });

  // Totals row: the File cell is left blank.
  const totalSigs = stats.reduce((n, s) => n + s.sigs, 0);
  const totalTokens = stats.reduce((n, s) => n + s.tokens, 0);
  const slotFile = ''.padEnd(maxFile);
  const baseFoot = `| ${slotFile} | ${String(totalSigs).padStart(4)} | ${String(totalTokens).padStart(6)} | **Total** | |`;
  const footer = showSlow ? `${baseFoot} ${' '.padStart(8)} |` : baseFoot;

  return [header, sep, ...rows, sep, footer].join('\n') + '\n';
}
|
|
200
|
+
|
|
201
|
+
/**
 * Format stats as a plain object suitable for JSON.stringify.
 *
 * A missing or non-array `stats` is treated as an empty list, matching how
 * formatAnalysisTable tolerates missing input, so callers always get a
 * well-formed summary instead of a TypeError.
 *
 * @param {object[]} stats - output of analyzeFiles()
 * @returns {{files: object[], totalSigs: number, totalTokens: number,
 *   slowFiles: {file: string, elapsedMs: number}[], fileCount: number}}
 */
function formatAnalysisJSON(stats) {
  const files = Array.isArray(stats) ? stats : [];

  let totalSigs = 0;
  let totalTokens = 0;
  const slowFiles = [];
  for (const entry of files) {
    totalSigs += entry.sigs;
    totalTokens += entry.tokens;
    // `slow` is only truthy when timing was requested and the threshold exceeded.
    if (entry.slow) slowFiles.push({ file: entry.file, elapsedMs: entry.elapsedMs });
  }

  return {
    files,
    totalSigs,
    totalTokens,
    slowFiles,
    fileCount: files.length,
  };
}
|
|
220
|
+
|
|
221
|
+
module.exports = { analyzeFiles, formatAnalysisTable, formatAnalysisJSON };
|
package/src/mcp/handlers.js
CHANGED
|
@@ -430,4 +430,31 @@ function listModules(args, cwd) {
|
|
|
430
430
|
].join('\n');
|
|
431
431
|
}
|
|
432
432
|
|
|
433
|
-
|
|
433
|
+
/**
 * MCP tool handler: query_context({ query, topK? }) → string
 *
 * Scores the files recorded in the generated context file against `query`
 * and returns the ranked markdown table produced by the retrieval ranker.
 * `topK` defaults to 10 and is clamped to the range 1..25. Every failure
 * mode is reported as a human-readable string; the handler does not throw.
 *
 * @param {{query: string, topK?: number}} args
 * @param {string} cwd - project root
 * @returns {string}
 */
function queryContext(args, cwd) {
  if (!(args && args.query)) return 'Missing required argument: query';

  if (!fs.existsSync(path.join(cwd, CONTEXT_FILE))) {
    return 'No context file found. Run: node gen-context.js';
  }

  try {
    // Loaded on demand so the ranker is only required when this tool is called.
    const { rank, buildSigIndex, formatRankTable } = require('../retrieval/ranker');

    const sigIndex = buildSigIndex(cwd);
    if (sigIndex.size === 0) return 'No signatures indexed. Run: node gen-context.js';

    const requested = parseInt(args.topK, 10) || 10;
    const topK = Math.min(Math.max(1, requested), 25);

    const ranked = rank(args.query, sigIndex, { topK });
    return formatRankTable(ranked, args.query);
  } catch (err) {
    return `_query_context failed: ${err.message}_`;
  }
}
|
|
459
|
+
|
|
460
|
+
module.exports = { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules, queryContext };
|
package/src/mcp/server.js
CHANGED
|
@@ -14,11 +14,11 @@
|
|
|
14
14
|
|
|
15
15
|
const readline = require('readline');
|
|
16
16
|
const { TOOLS } = require('./tools');
|
|
17
|
-
const { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules } = require('./handlers');
|
|
17
|
+
const { readContext, searchSignatures, getMap, createCheckpoint, getRouting, explainFile, listModules, queryContext } = require('./handlers');
|
|
18
18
|
|
|
19
19
|
const SERVER_INFO = {
|
|
20
20
|
name: 'sigmap',
|
|
21
|
-
version: '2.
|
|
21
|
+
version: '2.3.0',
|
|
22
22
|
description: 'SigMap MCP server — code signatures on demand',
|
|
23
23
|
};
|
|
24
24
|
|
|
@@ -73,6 +73,7 @@ function dispatch(msg, cwd) {
|
|
|
73
73
|
else if (name === 'get_routing') text = getRouting(args, cwd);
|
|
74
74
|
else if (name === 'explain_file') text = explainFile(args, cwd);
|
|
75
75
|
else if (name === 'list_modules') text = listModules(args, cwd);
|
|
76
|
+
else if (name === 'query_context') text = queryContext(args, cwd);
|
|
76
77
|
else {
|
|
77
78
|
respondError(id, -32601, `Unknown tool: ${name}`);
|
|
78
79
|
return;
|
package/src/mcp/tools.js
CHANGED
|
@@ -120,6 +120,30 @@ const TOOLS = [
|
|
|
120
120
|
required: [],
|
|
121
121
|
},
|
|
122
122
|
},
|
|
123
|
+
{
|
|
124
|
+
name: 'query_context',
|
|
125
|
+
description:
|
|
126
|
+
'Rank and return the most relevant files for a specific task or question. ' +
|
|
127
|
+
'Uses keyword + symbol + path scoring to surface only the top-K files relevant ' +
|
|
128
|
+
'to the query — much cheaper than reading all context. ' +
|
|
129
|
+
'Returns ranked file list with signatures and relevance scores.',
|
|
130
|
+
inputSchema: {
|
|
131
|
+
type: 'object',
|
|
132
|
+
properties: {
|
|
133
|
+
query: {
|
|
134
|
+
type: 'string',
|
|
135
|
+
description:
|
|
136
|
+
'Natural language task description or keyword(s) to rank files against. ' +
|
|
137
|
+
'E.g. "add a new language extractor", "fix secret scanning", "auth module".',
|
|
138
|
+
},
|
|
139
|
+
topK: {
|
|
140
|
+
type: 'number',
|
|
141
|
+
description: 'Maximum number of files to return (default: 10, max: 25).',
|
|
142
|
+
},
|
|
143
|
+
},
|
|
144
|
+
required: ['query'],
|
|
145
|
+
},
|
|
146
|
+
},
|
|
123
147
|
];
|
|
124
148
|
|
|
125
149
|
module.exports = { TOOLS };
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* SigMap zero-dependency relevance ranker.
|
|
5
|
+
*
|
|
6
|
+
* Ranks all files in a signature index against a natural-language query.
|
|
7
|
+
* Scoring weights:
|
|
8
|
+
* - keyword overlap (exact token match against sigs)
|
|
9
|
+
* - symbol match (token appears in a top-level identifier / function name)
|
|
10
|
+
* - partial prefix match (token is prefix of a sig token, length ≥ 4)
|
|
11
|
+
* - path relevance (query token appears in the file path)
|
|
12
|
+
* - recency boost (multiplier applied by rank() to matching files listed in opts.recencySet)
|
|
13
|
+
*
|
|
14
|
+
* Usage:
|
|
15
|
+
* const { rank } = require('./src/retrieval/ranker');
|
|
16
|
+
* const results = rank(query, sigIndex, { topK: 10 });
|
|
17
|
+
* // results: [{ file, score, sigs, tokens }]
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
const { tokenize, STOP_WORDS } = require('./tokenizer');
|
|
21
|
+
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
// Default weights
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
const DEFAULT_WEIGHTS = {
  // Added when a query token is exactly one of the file's signature tokens.
  exactToken: 1.0,
  // Extra bonus when the token appears in a function/class name line.
  symbolMatch: 0.5,
  // Added when a query token of at least 4 chars is a prefix of a signature token.
  prefixMatch: 0.3,
  // Added when a query token appears in the file path.
  pathMatch: 0.8,
  // Multiplier applied when the file is in recencySet.
  recencyBoost: 1.5,
};
|
|
32
|
+
|
|
33
|
+
/**
 * Score a single file against a query.
 *
 * For each query token that is not a stop word the score accumulates:
 *   - `exactToken`  when the token is one of the file's signature tokens,
 *   - `symbolMatch` additionally, when the token is found in at least one
 *     individual signature line after punctuation is stripped,
 *   - `prefixMatch` once, when the token (≥ 4 chars) is a strict prefix of
 *     some signature token,
 *   - `pathMatch`   when the token is one of the file-path tokens.
 *
 * @param {string} filePath - relative file path (e.g. 'src/extractors/python.js')
 * @param {string[]} sigs - signature strings for this file
 * @param {string[]} queryTokens - pre-tokenized query
 * @param {object} [weights] - full or partial override of DEFAULT_WEIGHTS;
 *   missing keys fall back to the defaults
 * @returns {number} 0 when there are no signatures or no query tokens
 */
function scoreFile(filePath, sigs, queryTokens, weights) {
  if (!sigs || sigs.length === 0) return 0;
  if (!queryTokens) return 0;

  // Merge so a partial override cannot add `undefined` and produce NaN.
  const w = weights ? Object.assign({}, DEFAULT_WEIGHTS, weights) : DEFAULT_WEIGHTS;

  // Token set over all signatures joined together.
  const sigTokenSet = new Set(tokenize(sigs.join(' ')));
  // Token set from the file path.
  const pathTokenSet = new Set(tokenize(filePath));

  // Union of the tokens of each individual signature line. Built lazily on
  // the first exact hit and reused, instead of re-tokenizing every signature
  // for every matching query token.
  let perLineTokens = null;
  const appearsInSomeLine = (token) => {
    if (perLineTokens === null) {
      perLineTokens = new Set();
      for (const sig of sigs) {
        for (const t of tokenize(sig.replace(/[^a-zA-Z0-9_\s]/g, ' '))) {
          perLineTokens.add(t);
        }
      }
    }
    return perLineTokens.has(token);
  };

  let score = 0;

  for (const qt of queryTokens) {
    if (STOP_WORDS.has(qt)) continue;

    // Exact token match in sigs
    if (sigTokenSet.has(qt)) {
      score += w.exactToken;
      // Bonus: appears directly in a function/class/method name line
      if (appearsInSomeLine(qt)) score += w.symbolMatch;
    }

    // Prefix match (e.g. query "python" matches "pythonDeps")
    if (qt.length >= 4) {
      for (const st of sigTokenSet) {
        if (st !== qt && st.startsWith(qt)) {
          score += w.prefixMatch;
          break; // one bonus per query token
        }
      }
    }

    // Path token match
    if (pathTokenSet.has(qt)) {
      score += w.pathMatch;
    }
  }

  return score;
}
|
|
89
|
+
|
|
90
|
+
/**
 * Rank every file of a signature index against a free-text query.
 *
 * When the query tokenizes to nothing (e.g. only stop words or punctuation),
 * files are ordered by signature count instead. Otherwise each file gets its
 * scoreFile() result, multiplied by the recency boost when the file is in
 * `opts.recencySet` and scored above zero. Ties are broken by file name.
 * Files scoring zero are kept in the ordering; only `topK` trims the list.
 *
 * @param {string} query - natural language query
 * @param {Map<string,string[]>} sigIndex - Map<file, sigs[]>
 * @param {object} [opts]
 * @param {number} [opts.topK=10] - max results to return
 * @param {number} [opts.recencyBoost=1.5] - multiplier for recent files
 * @param {Set<string>} [opts.recencySet] - set of recently-changed file paths
 * @param {object} [opts.weights] - override scoring weights
 * @returns {{ file: string, score: number, sigs: string[], tokens: number }[]}
 */
function rank(query, sigIndex, opts) {
  if (typeof query !== 'string' || query === '') return [];
  if (!(sigIndex instanceof Map) || sigIndex.size === 0) return [];

  const limit = (opts && opts.topK) || 10;
  const boost = (opts && opts.recencyBoost) || DEFAULT_WEIGHTS.recencyBoost;
  const recentFiles = (opts && opts.recencySet) || null;
  const weights = (opts && opts.weights)
    ? Object.assign({}, DEFAULT_WEIGHTS, opts.weights)
    : DEFAULT_WEIGHTS;

  const queryTokens = tokenize(query);
  const hasTerms = queryTokens.length > 0;

  const entries = [];
  sigIndex.forEach((sigs, file) => {
    let score;
    if (hasTerms) {
      score = scoreFile(file, sigs, queryTokens, weights);
      if (recentFiles && recentFiles.has(file) && score > 0) {
        score *= boost;
      }
    } else {
      // No usable terms: most signatures = most useful.
      score = sigs.length;
    }
    entries.push({ file, score, sigs, tokens: Math.ceil(sigs.join('\n').length / 4) });
  });

  // Highest score first; alphabetical file name breaks ties.
  entries.sort((left, right) => right.score - left.score || left.file.localeCompare(right.file));
  return entries.slice(0, limit);
}
|
|
142
|
+
|
|
143
|
+
/**
 * Build a signature index by parsing the generated context file
 * `<cwd>/.github/copilot-instructions.md`.
 *
 * A line of the form `### <path>` (a single whitespace-free token) starts a
 * new file entry. Non-blank lines inside the fenced code blocks that follow
 * are collected, trimmed, as that file's signatures. Text outside fences is
 * ignored. A path that appears under several headers keeps only the
 * signatures of its last occurrence.
 *
 * @param {string} cwd - project root
 * @returns {Map<string, string[]>} Map<filePath, sigs[]>; empty when the
 *   context file does not exist
 */
function buildSigIndex(cwd) {
  const fs = require('fs');
  const path = require('path');

  const index = new Map();
  const contextPath = path.join(cwd, '.github', 'copilot-instructions.md');
  if (!fs.existsSync(contextPath)) return index;

  const headerPattern = /^###\s+(\S+)\s*$/;
  let currentFile = null;
  let collected = [];
  let insideFence = false;

  // Store the entry gathered so far, if a header has been seen.
  const commit = () => {
    if (currentFile !== null) index.set(currentFile, collected);
  };

  fs.readFileSync(contextPath, 'utf8').split('\n').forEach((rawLine) => {
    const header = headerPattern.exec(rawLine);
    if (header) {
      commit();
      currentFile = header[1];
      collected = [];
      insideFence = false; // a new header always closes any open fence
      return;
    }
    if (rawLine.startsWith('```')) {
      insideFence = !insideFence;
      return;
    }
    if (!insideFence || !currentFile) return;
    const text = rawLine.trim();
    if (text) collected.push(text);
  });
  commit();

  return index;
}
|
|
182
|
+
|
|
183
|
+
/**
 * Render ranked results as markdown.
 *
 * Output is a `## Query:` heading, a Rank | File | Score | Sigs | Tokens
 * table with one row per result (score to two decimals), then a fenced
 * signature listing for each of the first three results that has
 * signatures. Each listing shows at most ten signatures followed by a
 * "... (N more)" line when more exist.
 *
 * @param {{ file: string, score: number, sigs: string[], tokens: number }[]} results
 * @param {string} query
 * @returns {string} markdown, or a one-line "no match" message when
 *   `results` is empty or missing
 */
function formatRankTable(results, query) {
  if (!results || results.length === 0) {
    return `No matching files found for query: "${query}"\n`;
  }

  const out = [
    `## Query: ${query}`,
    '',
    '| Rank | File | Score | Sigs | Tokens |',
    '|------|------|-------|------|--------|',
  ];

  results.forEach((entry, position) => {
    out.push(`| ${position + 1} | ${entry.file} | ${entry.score.toFixed(2)} | ${entry.sigs.length} | ${entry.tokens} |`);
  });
  out.push('');

  // Signature details for the top results only.
  results
    .slice(0, 3)
    .filter((entry) => entry.sigs.length > 0)
    .forEach((entry) => {
      const hidden = entry.sigs.length - 10;
      out.push(`### ${entry.file}`, '```');
      out.push(...entry.sigs.slice(0, 10));
      if (hidden > 0) out.push(`... (${hidden} more)`);
      out.push('```', '');
    });

  return out.join('\n');
}
|
|
220
|
+
|
|
221
|
+
/**
 * Convert ranked results into a JSON-serialisable summary object.
 * Each result is copied with a 1-based `rank`; a missing `results` value is
 * treated as an empty list.
 *
 * @param {{ file: string, score: number, sigs: string[], tokens: number }[]} results
 * @param {string} query
 * @returns {{query: string, results: object[], totalResults: number}}
 */
function formatRankJSON(results, query) {
  const entries = results || [];
  const ranked = entries.map((entry, position) => {
    return {
      rank: position + 1,
      file: entry.file,
      score: entry.score,
      sigs: entry.sigs,
      tokens: entry.tokens,
    };
  });
  return { query, results: ranked, totalResults: entries.length };
}
|
|
241
|
+
|
|
242
|
+
module.exports = { rank, buildSigIndex, scoreFile, formatRankTable, formatRankJSON, DEFAULT_WEIGHTS };
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* SigMap zero-dependency tokenizer.
|
|
5
|
+
* Splits code identifiers: camelCase, snake_case, kebab-case, PascalCase,
|
|
6
|
+
* removes stop words, and returns lower-case tokens.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
// Words too common to carry ranking signal. Note that the generic verbs
// "use", "get" and "set" are filtered as well.
const STOP_WORDS = new Set(
  [
    'the a an in of to for and or is are',
    'that this it with from by be as on at',
    'do not use get set up if no so we',
  ].join(' ').split(' ')
);
|
|
14
|
+
|
|
15
|
+
/**
 * Break arbitrary text (a query or a code signature) into unique lower-case
 * tokens, in order of first appearance.
 *
 * Normalisation steps, applied in this order:
 *   1. a trailing `.ext` of 1–6 word characters (before whitespace, `/` or
 *      end of text) is removed, so `python.js` yields only `python`;
 *   2. camelCase / PascalCase boundaries are split (`HTTPServer` → http, server);
 *   3. `_`, `-`, `.` and `/` become separators;
 *   4. any remaining non-word character becomes a separator.
 * Tokens shorter than `minLength` are dropped, as are stop words unless
 * `removeStopWords` is explicitly false.
 *
 * @param {string} text
 * @param {object} [opts]
 * @param {boolean} [opts.removeStopWords=true]
 * @param {number} [opts.minLength=2]
 * @returns {string[]} empty array for empty or non-string input
 */
function tokenize(text, opts) {
  if (typeof text !== 'string' || text === '') return [];

  const keepStopWords = Boolean(opts) && opts.removeStopWords === false;
  const minLen = (opts && opts.minLength) || 2;

  // [pattern, replacement] pairs; the order matters.
  const rewrites = [
    [/\.\w{1,6}(?=\s|\/|$)/g, ' '], // strip file extension (e.g. .js, .ts, .py)
    [/([a-z])([A-Z])/g, '$1 $2'], // camelCase / PascalCase split
    [/([A-Z]+)([A-Z][a-z])/g, '$1 $2'], // acronym followed by a word
    [/[_\-\.\/]/g, ' '], // snake_case / kebab-case / dot.notation
    [/[^\w\s]/g, ' '], // drop remaining non-word characters
  ];

  let normalized = text;
  for (const [pattern, replacement] of rewrites) {
    normalized = normalized.replace(pattern, replacement);
  }

  const unique = new Set();
  for (const word of normalized.toLowerCase().split(/\s+/)) {
    if (word.length < minLen) continue;
    if (!keepStopWords && STOP_WORDS.has(word)) continue;
    unique.add(word);
  }
  return [...unique];
}
|
|
53
|
+
|
|
54
|
+
module.exports = { tokenize, STOP_WORDS };
|