sigmap 2.2.0 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +48 -0
- package/README.md +60 -16
- package/gen-context.js +193 -3
- package/package.json +7 -1
- package/packages/cli/index.js +63 -0
- package/packages/cli/package.json +26 -0
- package/packages/core/README.md +133 -0
- package/packages/core/index.js +215 -0
- package/packages/core/package.json +28 -0
- package/src/config/defaults.js +8 -0
- package/src/mcp/handlers.js +28 -1
- package/src/mcp/server.js +3 -2
- package/src/mcp/tools.js +24 -0
- package/src/retrieval/ranker.js +242 -0
- package/src/retrieval/tokenizer.js +54 -0
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* SigMap zero-dependency relevance ranker.
|
|
5
|
+
*
|
|
6
|
+
* Ranks all files in a signature index against a natural-language query.
|
|
7
|
+
* Scoring weights:
|
|
8
|
+
* - keyword overlap (exact token match against sigs)
|
|
9
|
+
* - symbol match (token appears in a top-level identifier / function name)
|
|
10
|
+
* - partial prefix match (token is prefix of a sig token, length ≥ 4)
|
|
11
|
+
* - path relevance (query token appears in the file path)
|
|
12
|
+
* - recency boost (applied externally via recency map)
|
|
13
|
+
*
|
|
14
|
+
* Usage:
|
|
15
|
+
* const { rank } = require('./src/retrieval/ranker');
|
|
16
|
+
* const results = rank(query, sigIndex, { topK: 10 });
|
|
17
|
+
* // results: [{ file, score, sigs, tokens }]
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
const { tokenize, STOP_WORDS } = require('./tokenizer');
|
|
21
|
+
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
// Default weights
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
const DEFAULT_WEIGHTS = {
  // Awarded when a query token is exactly one of the signature tokens.
  exactToken: 1.0,

  // Extra bonus when the token also appears in a function/class name line.
  symbolMatch: 0.5,

  // Awarded for a partial prefix hit (query token of at least 4 characters).
  prefixMatch: 0.3,

  // Awarded when the query token appears in the file path.
  pathMatch: 0.8,

  // Multiplier applied to a file's score when the file is in recencySet.
  recencyBoost: 1.5,
};
|
|
32
|
+
|
|
33
|
+
/**
 * Score a single file against a pre-tokenized query.
 *
 * Every query token that is not a stop word can earn, independently:
 *   - `exactToken`  when it is one of the tokens extracted from the signatures;
 *   - `symbolMatch` on top of an exact match, when it is also a token of at
 *     least one individual signature line;
 *   - `prefixMatch` (at most once per query token) when it is at least 4
 *     characters long and is a proper prefix of some signature token;
 *   - `pathMatch`   when it is one of the tokens extracted from the file path.
 *
 * @param {string}   filePath    - relative file path (e.g. 'src/extractors/python.js')
 * @param {string[]} sigs        - signature strings for this file
 * @param {string[]} queryTokens - pre-tokenized query
 * @param {object}   [weights]   - scoring weights; may be partial, missing keys
 *                                 fall back to DEFAULT_WEIGHTS
 * @returns {number} 0 when there are no signatures or no query tokens
 */
function scoreFile(filePath, sigs, queryTokens, weights) {
  if (!sigs || sigs.length === 0) return 0;
  if (!queryTokens || queryTokens.length === 0) return 0;

  // Merge instead of `weights || DEFAULT_WEIGHTS`: a partial override would
  // otherwise add `undefined` to the score and turn it into NaN.
  const w = Object.assign({}, DEFAULT_WEIGHTS, weights);

  // Tokens from all signatures together, and from the file path.
  const sigTokenSet = new Set(tokenize(sigs.join(' ')));
  const pathTokenSet = new Set(tokenize(filePath));

  // Tokens of each signature line after punctuation is blanked out. Built
  // lazily and only once, rather than re-tokenizing every signature for every
  // matching query token.
  let symbolTokenSet = null;
  const isSymbolToken = (token) => {
    if (symbolTokenSet === null) {
      symbolTokenSet = new Set();
      for (const sig of sigs) {
        if (typeof sig !== 'string') continue;
        const lineTokens = tokenize(sig.replace(/[^a-zA-Z0-9_\s]/g, ' '));
        for (const lineToken of lineTokens) symbolTokenSet.add(lineToken);
      }
    }
    return symbolTokenSet.has(token);
  };

  let score = 0;

  for (const qt of queryTokens) {
    if (STOP_WORDS.has(qt)) continue;

    // Exact token match in sigs, plus the name-line bonus.
    // NOTE(review): both token sets derive from the same signatures, so this
    // bonus seems to accompany almost every exact match — confirm whether a
    // narrower definition of "symbol" was intended.
    if (sigTokenSet.has(qt)) {
      score += w.exactToken;
      if (isSymbolToken(qt)) score += w.symbolMatch;
    }

    // Prefix match: the query token starts a longer signature token.
    if (qt.length >= 4) {
      for (const st of sigTokenSet) {
        if (st !== qt && st.startsWith(qt)) {
          score += w.prefixMatch;
          break; // one bonus per query token
        }
      }
    }

    // Path token match
    if (pathTokenSet.has(qt)) {
      score += w.pathMatch;
    }
  }

  return score;
}
|
|
89
|
+
|
|
90
|
+
/**
 * Rank all files in a signature index against a query.
 *
 * Every file in the index gets a result entry; entries are sorted by score
 * (highest first, ties broken by file path) and the first `topK` are
 * returned. Files that score 0 are therefore still returned, after all
 * positive scores, when fewer than `topK` files match.
 *
 * When the query tokenizes to nothing, files are ordered by their number of
 * signatures instead.
 *
 * @param {string} query - natural language query
 * @param {Map<string,string[]>} sigIndex - Map<file, sigs[]>
 * @param {object} [opts]
 * @param {number} [opts.topK=10] - max results to return; missing, zero,
 *   negative or non-numeric values fall back to 10
 * @param {number} [opts.recencyBoost] - multiplier for recent files; when
 *   omitted, `opts.weights.recencyBoost` is used, then the default of 1.5
 * @param {Set<string>|string[]} [opts.recencySet] - recently-changed file paths
 * @param {object} [opts.weights] - override scoring weights (merged over DEFAULT_WEIGHTS)
 * @returns {{ file: string, score: number, sigs: string[], tokens: number }[]}
 */
function rank(query, sigIndex, opts) {
  if (!query || typeof query !== 'string') return [];
  if (!sigIndex || !(sigIndex instanceof Map) || sigIndex.size === 0) return [];

  const options = opts || {};

  // A negative topK would make slice() drop results from the tail instead of
  // limiting them, so anything that is not a positive number means "default".
  const requestedTopK = Number(options.topK);
  const topK = requestedTopK > 0 ? requestedTopK : 10;

  const weights = options.weights
    ? Object.assign({}, DEFAULT_WEIGHTS, options.weights)
    : DEFAULT_WEIGHTS;

  // The explicit option wins; otherwise honour a recencyBoost supplied through
  // the weights override, which was previously ignored.
  const recencyMultiplier =
    options.recencyBoost || weights.recencyBoost || DEFAULT_WEIGHTS.recencyBoost;

  let recencySet = options.recencySet || null;
  if (Array.isArray(recencySet)) recencySet = new Set(recencySet);

  // Result entry shared by both ranking modes. `tokens` is the signature text
  // length divided by 4, rounded up.
  const toEntry = (file, sigs, score) => ({
    file,
    score,
    sigs,
    tokens: Math.ceil(sigs.join('\n').length / 4),
  });
  const byScoreThenPath = (a, b) => b.score - a.score || a.file.localeCompare(b.file);

  const queryTokens = tokenize(query);
  const entries = [];

  if (queryTokens.length === 0) {
    // Nothing to match on: order by signature count (most signatures first).
    for (const [file, sigs] of sigIndex.entries()) {
      entries.push(toEntry(file, sigs, sigs.length));
    }
  } else {
    for (const [file, sigs] of sigIndex.entries()) {
      let score = scoreFile(file, sigs, queryTokens, weights);

      // Recency boost only amplifies files that already matched.
      if (recencySet && recencySet.has(file) && score > 0) {
        score *= recencyMultiplier;
      }

      entries.push(toEntry(file, sigs, score));
    }
  }

  entries.sort(byScoreThenPath);
  return entries.slice(0, topK);
}
|
|
142
|
+
|
|
143
|
+
/**
 * Build a signature index from the generated context file
 * `<cwd>/.github/copilot-instructions.md`.
 *
 * The file is scanned line by line:
 *   - a `### <path>` header (path without whitespace) starts a new file entry;
 *   - a line starting with ``` toggles "inside code fence";
 *   - every non-blank line inside a fence is trimmed and recorded as a
 *     signature of the current file.
 *
 * @param {string} cwd - project root containing the `.github` directory
 * @returns {Map<string, string[]>} Map<filePath, sigs[]>, keyed by the path as
 *   written in the header; empty when the context file does not exist
 * @throws {Error} any read error other than "file not found"
 */
function buildSigIndex(cwd) {
  const fs = require('fs');
  const path = require('path');
  const contextPath = path.join(cwd, '.github', 'copilot-instructions.md');
  const index = new Map();

  // Read directly instead of existsSync() + readFileSync(): the file could
  // disappear between the two calls. A missing file (or a missing / non-
  // directory parent) simply means there is nothing to index.
  let content;
  try {
    content = fs.readFileSync(contextPath, 'utf8');
  } catch (err) {
    if (err && (err.code === 'ENOENT' || err.code === 'ENOTDIR')) return index;
    throw err;
  }

  // A leading byte-order mark would stop the first header from matching.
  const lines = content.replace(/^\uFEFF/, '').split('\n');

  let currentFile = null;
  let inBlock = false;
  let sigs = [];

  for (const line of lines) {
    const headerMatch = line.match(/^###\s+(\S+)\s*$/);
    if (headerMatch) {
      // Flush the previous file before starting the next one.
      if (currentFile !== null) index.set(currentFile, sigs);
      currentFile = headerMatch[1];
      sigs = [];
      inBlock = false; // a new header also ends any fence left open
      continue;
    }
    if (line.startsWith('```')) {
      inBlock = !inBlock;
      continue;
    }
    if (inBlock && currentFile && line.trim()) sigs.push(line.trim());
  }
  if (currentFile !== null) index.set(currentFile, sigs);

  return index;
}
|
|
182
|
+
|
|
183
|
+
/**
 * Format ranked results as a markdown string.
 *
 * The output is a `## Query:` heading, a table with one row per result
 * (rank, file, score to two decimals, signature count, token estimate), and,
 * for the first three results that have signatures, a `### <file>` section
 * with up to ten signatures in a code fence.
 *
 * @param {{ file: string, score: number, sigs: string[], tokens: number }[]} results
 * @param {string} query
 * @returns {string} the markdown, or a one-line "no matching files" message
 *   when `results` is empty or missing
 */
function formatRankTable(results, query) {
  if (!results || results.length === 0) {
    return `No matching files found for query: "${query}"\n`;
  }

  // A literal pipe inside a cell would be read as a column separator and
  // break the row, so it is backslash-escaped.
  const escapeCell = (value) => String(value).replace(/\|/g, '\\|');

  const lines = [
    `## Query: ${query}`,
    '',
    '| Rank | File | Score | Sigs | Tokens |',
    '|------|------|-------|------|--------|',
    ...results.map((r, i) =>
      `| ${i + 1} | ${escapeCell(r.file)} | ${r.score.toFixed(2)} | ${r.sigs.length} | ${r.tokens} |`
    ),
    '',
  ];

  // Add signature details for the top three results only.
  for (const r of results.slice(0, 3)) {
    if (r.sigs.length > 0) {
      lines.push(`### ${r.file}`);
      lines.push('```');
      lines.push(...r.sigs.slice(0, 10));
      // Summarise whatever did not fit in the ten-line preview.
      if (r.sigs.length > 10) lines.push(`... (${r.sigs.length - 10} more)`);
      lines.push('```');
      lines.push('');
    }
  }

  return lines.join('\n');
}
|
|
220
|
+
|
|
221
|
+
/**
 * Wrap ranked results in a plain object that can be passed to JSON.stringify.
 *
 * Each result is copied into a new object carrying its 1-based rank; a
 * missing `results` value is treated as an empty list.
 *
 * @param {{ file: string, score: number, sigs: string[], tokens: number }[]} results
 * @param {string} query
 * @returns {object} `{ query, results, totalResults }`
 */
function formatRankJSON(results, query) {
  const entries = results || [];

  const ranked = entries.map(({ file, score, sigs, tokens }, position) => {
    return { rank: position + 1, file, score, sigs, tokens };
  });

  return { query, results: ranked, totalResults: entries.length };
}
|
|
241
|
+
|
|
242
|
+
module.exports = { rank, buildSigIndex, scoreFile, formatRankTable, formatRankJSON, DEFAULT_WEIGHTS };
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* SigMap zero-dependency tokenizer.
|
|
5
|
+
* Splits code identifiers: camelCase, snake_case, kebab-case, PascalCase,
|
|
6
|
+
* removes stop words, and returns lower-case tokens.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
// Lower-case words that carry no ranking signal. tokenize() drops them unless
// it is called with `removeStopWords: false`.
const STOP_WORDS = new Set(
  [
    'the a an in of to for and or is are',
    'that this it with from by be as on at',
    'do not use get set up if no so we',
  ]
    .join(' ')
    .split(' ')
);
|
|
14
|
+
|
|
15
|
+
// Extensions recognised as "file extension" and dropped from the token
// stream. Any other trailing `.word` is kept, so a member access such as
// `user.name` still yields both parts.
const KNOWN_FILE_EXTENSIONS = new Set([
  'js', 'jsx', 'mjs', 'cjs', 'ts', 'tsx', 'mts', 'cts',
  'py', 'pyi', 'rb', 'go', 'rs', 'java', 'kt', 'kts', 'swift', 'scala', 'dart', 'lua',
  'c', 'h', 'cc', 'cpp', 'cxx', 'hpp', 'hh', 'cs', 'php',
  'sh', 'bash', 'zsh', 'ps1', 'sql',
  'vue', 'svelte', 'html', 'htm', 'css', 'scss', 'sass', 'less',
  'json', 'yml', 'yaml', 'toml', 'xml', 'md', 'txt',
]);

/**
 * Tokenize any text (query or code signature) into unique lower-case tokens.
 * Handles:
 *   - camelCase      → ['camel', 'case']
 *   - PascalCase     → ['pascal', 'case']
 *   - snake_case     → ['snake', 'case']
 *   - kebab-case     → ['kebab', 'case']
 *   - dot.notation   → ['dot', 'notation']
 *   - File paths     → individual path components; a known source-file
 *                      extension at the end of a component is dropped
 *
 * Tokens keep their first-seen order. Characters outside `[A-Za-z0-9_]` act
 * as separators.
 *
 * @param {string}  text
 * @param {object}  [opts]
 * @param {boolean} [opts.removeStopWords=true] - drop words listed in STOP_WORDS
 * @param {number}  [opts.minLength=2] - shortest token to keep (0 means default)
 * @returns {string[]} empty for empty or non-string input
 */
function tokenize(text, opts) {
  if (!text || typeof text !== 'string') return [];
  const removeStop = !(opts && opts.removeStopWords === false);
  const minLen = (opts && opts.minLength) || 2;

  const tokens = text
    // Strip a file extension (e.g. .js, .ts, .py) that ends a word or path
    // segment. Only known extensions are removed: blindly removing every short
    // `.suffix` also deleted the member in `user.name` or `config.load`.
    .replace(/\.(\w{1,6})(?=\s|\/|$)/g, (match, ext) =>
      KNOWN_FILE_EXTENSIONS.has(ext.toLowerCase()) ? ' ' : match
    )
    // camelCase / PascalCase split
    .replace(/([a-z])([A-Z])/g, '$1 $2')
    // acronym followed by a capitalised word: "HTMLParser" → "HTML Parser"
    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
    // snake_case / kebab-case / dot.notation / path separators
    .replace(/[_\-\.\/]/g, ' ')
    // drop remaining non-word characters
    .replace(/[^\w\s]/g, ' ')
    .toLowerCase()
    .split(/\s+/)
    .filter((t) => t.length >= minLen);

  if (!removeStop) return [...new Set(tokens)];
  return [...new Set(tokens.filter((t) => !STOP_WORDS.has(t)))];
}
|
|
53
|
+
|
|
54
|
+
module.exports = { tokenize, STOP_WORDS };
|