sigmap 2.2.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,242 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * SigMap zero-dependency relevance ranker.
5
+ *
6
+ * Ranks all files in a signature index against a natural-language query.
7
+ * Scoring weights:
8
+ * - keyword overlap (exact token match against sigs)
9
+ * - symbol match (token appears in a top-level identifier / function name)
10
+ * - partial prefix match (token is prefix of a sig token, length ≥ 4)
11
+ * - path relevance (query token appears in the file path)
12
+ * - recency boost (applied externally via recency map)
13
+ *
14
+ * Usage:
15
+ * const { rank } = require('./src/retrieval/ranker');
16
+ * const results = rank(query, sigIndex, { topK: 10 });
17
+ * // results: [{ file, score, sigs, tokens }]
18
+ */
19
+
20
+ const { tokenize, STOP_WORDS } = require('./tokenizer');
21
+
22
+ // ---------------------------------------------------------------------------
23
+ // Default weights
24
+ // ---------------------------------------------------------------------------
25
/**
 * Scoring weights applied when the caller supplies no overrides.
 * The first four are additive per query token; `recencyBoost` is a multiplier.
 */
const DEFAULT_WEIGHTS = {
  // Query token is present verbatim among the file's signature tokens.
  exactToken: 1.0,
  // Extra credit on top of an exact hit when the token is found in an individual signature line.
  symbolMatch: 0.5,
  // Query token (4+ characters) is a strict prefix of some signature token.
  prefixMatch: 0.3,
  // Query token is one of the tokens of the file path.
  pathMatch: 0.8,
  // Factor applied to a positive score when the file is in the recency set.
  recencyBoost: 1.5,
};
32
+
33
/**
 * Score a single file against a pre-tokenized query.
 *
 * For every query token that is not a stop word the score accumulates:
 *   - `exactToken`  when the token is one of the file's signature tokens,
 *   - `symbolMatch` additionally, when the token is found after tokenizing an
 *                   individual signature line stripped of punctuation,
 *   - `prefixMatch` once, when the token (4+ chars) is a strict prefix of some
 *                   signature token,
 *   - `pathMatch`   when the token is one of the file path's tokens.
 *
 * @param {string}   filePath    - relative file path (e.g. 'src/extractors/python.js')
 * @param {string[]} sigs        - signature strings for this file
 * @param {string[]} queryTokens - pre-tokenized query
 * @param {object}   [weights]   - scoring weights; falls back to DEFAULT_WEIGHTS
 * @returns {number} 0 when there are no signatures or no query tokens
 */
function scoreFile(filePath, sigs, queryTokens, weights) {
  if (!sigs || sigs.length === 0) return 0;
  if (!queryTokens || queryTokens.length === 0) return 0;

  const w = weights || DEFAULT_WEIGHTS;

  // Tokens of all signatures taken together, and of the path.
  const sigTokenSet = new Set(tokenize(sigs.join(' ')));
  const pathTokenSet = new Set(tokenize(filePath));

  // Tokens of each signature line after punctuation is blanked out. This is
  // independent of the query token, so it is built once (and only if an exact
  // hit actually needs it) instead of re-tokenizing every line per query token.
  let lineTokenSet = null;
  const appearsInSignatureLine = (token) => {
    if (lineTokenSet === null) {
      lineTokenSet = new Set();
      for (const sig of sigs) {
        for (const t of tokenize(sig.replace(/[^a-zA-Z0-9_\s]/g, ' '))) {
          lineTokenSet.add(t);
        }
      }
    }
    return lineTokenSet.has(token);
  };

  // True when some signature token strictly extends the query token.
  const isPrefixOfSigToken = (token) => {
    for (const st of sigTokenSet) {
      if (st !== token && st.startsWith(token)) return true;
    }
    return false;
  };

  let score = 0;

  for (const qt of queryTokens) {
    if (STOP_WORDS.has(qt)) continue;

    // Exact token match in sigs, plus the per-line bonus.
    if (sigTokenSet.has(qt)) {
      score += w.exactToken;
      if (appearsInSignatureLine(qt)) score += w.symbolMatch;
    }

    // Prefix match (e.g. query "python" matches "pythonDeps"); one bonus per query token.
    if (qt.length >= 4 && isPrefixOfSigToken(qt)) {
      score += w.prefixMatch;
    }

    // Path token match
    if (pathTokenSet.has(qt)) {
      score += w.pathMatch;
    }
  }

  return score;
}
89
+
90
/**
 * Rank all files in a signature index against a query.
 *
 * Every file in the index is scored (files scoring 0 are kept), ordered by
 * descending score with ties broken by file path, and the first `topK` are
 * returned. When the query tokenizes to nothing (e.g. only stop words or
 * punctuation) files are ordered by their number of signatures instead.
 *
 * The recency multiplier is resolved in this order: `opts.recencyBoost`,
 * then `opts.weights.recencyBoost`, then `DEFAULT_WEIGHTS.recencyBoost`.
 *
 * @param {string} query - natural language query
 * @param {Map<string,string[]>} sigIndex - Map<file, sigs[]>
 * @param {object} [opts]
 * @param {number} [opts.topK=10] - max results to return
 * @param {number} [opts.recencyBoost=1.5] - multiplier for recent files
 * @param {Set<string>} [opts.recencySet] - set of recently-changed file paths
 * @param {object} [opts.weights] - override scoring weights
 * @returns {{ file: string, score: number, sigs: string[], tokens: number }[]}
 *   `tokens` is a rough size estimate: signature text length / 4, rounded up.
 */
function rank(query, sigIndex, opts) {
  if (!query || typeof query !== 'string') return [];
  if (!sigIndex || !(sigIndex instanceof Map) || sigIndex.size === 0) return [];

  const topK = (opts && opts.topK) || 10;
  const recencySet = (opts && opts.recencySet) || null;
  const weights = (opts && opts.weights)
    ? Object.assign({}, DEFAULT_WEIGHTS, opts.weights)
    : DEFAULT_WEIGHTS;
  // Honour a recencyBoost supplied through opts.weights instead of silently
  // falling back to the default when opts.recencyBoost is not given.
  const recencyMultiplier =
    (opts && opts.recencyBoost) || weights.recencyBoost || DEFAULT_WEIGHTS.recencyBoost;

  const queryTokens = tokenize(query);
  const hasQueryTokens = queryTokens.length > 0;

  const candidates = [];
  for (const [file, rawSigs] of sigIndex.entries()) {
    // A malformed entry (non-array value) is treated as a file without signatures.
    const sigs = Array.isArray(rawSigs) ? rawSigs : [];

    let score;
    if (hasQueryTokens) {
      score = scoreFile(file, sigs, queryTokens, weights);
      // Recency boost only amplifies files that already matched something.
      if (recencySet && recencySet.has(file) && score > 0) {
        score *= recencyMultiplier;
      }
    } else {
      // Nothing to match on: most signatures first.
      score = sigs.length;
    }

    candidates.push({ file, score, sigs });
  }

  candidates.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file));

  // The token estimate is only needed for the entries actually returned.
  return candidates.slice(0, topK).map(({ file, score, sigs }) => ({
    file,
    score,
    sigs,
    tokens: Math.ceil(sigs.join('\n').length / 4),
  }));
}
142
+
143
/**
 * Build a signature index from the generated context file
 * `<cwd>/.github/copilot-instructions.md`.
 *
 * A line of the form `### <path>` (a single whitespace-free word) starts a new
 * file entry; every non-blank line inside a ``` fence that follows is recorded,
 * trimmed, as one of that file's signatures. Fenced lines that appear before
 * the first header are ignored.
 *
 * @param {string} cwd - project root containing the `.github` directory
 * @returns {Map<string, string[]>} Map<filePath, signatures>; empty when the
 *   context file does not exist
 * @throws {Error} any read error other than a missing file/directory
 */
function buildSigIndex(cwd) {
  const fs = require('fs');
  const path = require('path');
  const contextPath = path.join(cwd, '.github', 'copilot-instructions.md');
  const index = new Map();

  // Read directly rather than checking existence first: a separate exists
  // check can race with the file being removed before the read.
  let content;
  try {
    content = fs.readFileSync(contextPath, 'utf8');
  } catch (err) {
    if (err && (err.code === 'ENOENT' || err.code === 'ENOTDIR')) return index;
    throw err;
  }

  const headerPattern = /^###\s+(\S+)\s*$/;

  let currentFile = null;
  let inBlock = false;
  let sigs = [];

  for (const line of content.split('\n')) {
    const headerMatch = line.match(headerPattern);
    if (headerMatch) {
      // Flush the previous file before starting the next one.
      if (currentFile !== null) index.set(currentFile, sigs);
      currentFile = headerMatch[1];
      sigs = [];
      // A header always resets fence state, so an unclosed fence cannot leak
      // into the next file.
      inBlock = false;
      continue;
    }

    if (line.startsWith('```')) {
      inBlock = !inBlock;
      continue;
    }

    const trimmed = line.trim();
    if (inBlock && currentFile && trimmed) sigs.push(trimmed);
  }
  if (currentFile !== null) index.set(currentFile, sigs);

  return index;
}
182
+
183
/**
 * Format ranked results as a markdown string: a heading with the query, a
 * table with one row per result, then a fenced listing of up to 10 signatures
 * for each of the first three results that have any.
 *
 * @param {{ file: string, score: number, sigs: string[], tokens: number }[]} results
 * @param {string} query
 * @returns {string} markdown text, or a one-line "no matching files" message
 *   when `results` is empty or missing
 */
function formatRankTable(results, query) {
  if (!results || results.length === 0) {
    return `No matching files found for query: "${query}"\n`;
  }

  const DETAILED_RESULTS = 3;
  const SIGS_PER_RESULT = 10;

  // A literal "|" inside a cell would be read as a column separator.
  const escapeCell = (value) => String(value).replace(/\|/g, '\\|');
  // Tolerate a result object that carries no signature array.
  const sigsOf = (r) => (Array.isArray(r.sigs) ? r.sigs : []);

  const lines = [
    `## Query: ${query}`,
    '',
    '| Rank | File | Score | Sigs | Tokens |',
    '|------|------|-------|------|--------|',
    ...results.map((r, i) =>
      `| ${i + 1} | ${escapeCell(r.file)} | ${r.score.toFixed(2)} | ${sigsOf(r).length} | ${r.tokens} |`
    ),
    '',
  ];

  // Add signature details for top results
  for (const r of results.slice(0, DETAILED_RESULTS)) {
    const sigs = sigsOf(r);
    if (sigs.length === 0) continue;

    lines.push(`### ${r.file}`);
    lines.push('```');
    lines.push(...sigs.slice(0, SIGS_PER_RESULT));
    if (sigs.length > SIGS_PER_RESULT) {
      lines.push(`... (${sigs.length - SIGS_PER_RESULT} more)`);
    }
    lines.push('```');
    lines.push('');
  }

  return lines.join('\n');
}
220
+
221
/**
 * Wrap ranked results in a plain object suitable for JSON serialisation.
 * Each entry gains a 1-based `rank`; a missing `results` is treated as empty.
 *
 * @param {{ file: string, score: number, sigs: string[], tokens: number }[]} results
 * @param {string} query
 * @returns {object} `{ query, results, totalResults }`
 */
function formatRankJSON(results, query) {
  const entries = results || [];

  const ranked = entries.map(({ file, score, sigs, tokens }, position) => ({
    rank: position + 1,
    file,
    score,
    sigs,
    tokens,
  }));

  return { query, results: ranked, totalResults: entries.length };
}
241
+
242
+ module.exports = { rank, buildSigIndex, scoreFile, formatRankTable, formatRankJSON, DEFAULT_WEIGHTS };
@@ -0,0 +1,54 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * SigMap zero-dependency tokenizer.
5
+ * Splits code identifiers: camelCase, snake_case, kebab-case, PascalCase,
6
+ * removes stop words, and returns lower-case tokens.
7
+ */
8
+
9
/** Common words that carry no ranking signal; compared against lower-case tokens. */
const STOP_WORDS = new Set([
  'the', 'a', 'an', 'in', 'of', 'to', 'for', 'and', 'or', 'is', 'are',
  'that', 'this', 'it', 'with', 'from', 'by', 'be', 'as', 'on', 'at',
  'do', 'not', 'use', 'get', 'set', 'up', 'if', 'no', 'so', 'we',
]);

/**
 * Tokenize any text (query or code signature) into unique lower-case tokens,
 * in order of first appearance.
 * Handles:
 *   - camelCase    → ['camel', 'case']
 *   - PascalCase   → ['pascal', 'case']
 *   - sha256Hash   → ['sha256', 'hash']   (digits stay with the word they end)
 *   - HTTP2Server  → ['http2', 'server']
 *   - snake_case   → ['snake', 'case']
 *   - kebab-case   → ['kebab', 'case']
 *   - dot.notation → ['dot', 'notation']
 *   - File paths   → individual path components (no extension)
 *
 * Known limitation: the extension stripper removes any `.xxx` suffix of 1–6
 * word characters that is followed by whitespace, `/`, or the end of the text,
 * so a short trailing member such as `path.join` yields only ['path'].
 *
 * @param {string} text
 * @param {object} [opts]
 * @param {boolean} [opts.removeStopWords=true]
 * @param {number}  [opts.minLength=2] - shortest token kept (a falsy value means 2)
 * @returns {string[]} empty for empty or non-string input
 */
function tokenize(text, opts) {
  if (!text || typeof text !== 'string') return [];

  const keepStopWords = Boolean(opts) && opts.removeStopWords === false;
  const minLen = (opts && opts.minLength) || 2;

  const words = text
    // strip file extension (e.g. .js, .ts, .py)
    .replace(/\.\w{1,6}(?=\s|\/|$)/g, ' ')
    // camelCase / PascalCase split; trailing digits stay attached to the
    // preceding word so "sha256Hash" breaks before "Hash"
    .replace(/([a-z][0-9]*)([A-Z])/g, '$1 $2')
    // acronym followed by a capitalised word: "HTTPServer", "HTTP2Server"
    .replace(/([A-Z]+[0-9]*)([A-Z][a-z])/g, '$1 $2')
    // snake_case / kebab-case / dot.notation / path separators
    .replace(/[_\-./]/g, ' ')
    // drop remaining non-word characters
    .replace(/[^\w\s]/g, ' ')
    .toLowerCase()
    .split(/\s+/);

  const kept = words.filter(
    (t) => t.length >= minLen && (keepStopWords || !STOP_WORDS.has(t))
  );

  // A Set preserves insertion order, so this de-duplicates without reordering.
  return [...new Set(kept)];
}
53
+
54
+ module.exports = { tokenize, STOP_WORDS };