sigmap 7.30.0 → 8.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -0
- package/README.md +9 -9
- package/gen-context.js +581 -73
- package/gen-project-map.js +14 -6
- package/llms-full.txt +5 -5
- package/llms.txt +5 -5
- package/package.json +2 -1
- package/packages/cli/package.json +1 -1
- package/packages/core/package.json +1 -1
- package/src/eval/runner.js +9 -61
- package/src/evidence/pack.js +42 -8
- package/src/map/build-ci.js +91 -0
- package/src/map/config-manifest.js +101 -0
- package/src/map/env-schema.js +90 -0
- package/src/map/migrations.js +84 -0
- package/src/mcp/handlers.js +5 -1
- package/src/mcp/server.js +1 -1
- package/src/retrieval/bm25.js +122 -0
- package/src/retrieval/ranker.js +15 -1
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Database-migration extractor (v8.5 C1).
|
|
5
|
+
*
|
|
6
|
+
* Detects schema-migration files across the common frameworks — Rails
|
|
7
|
+
* (db/migrate), Django/Alembic, Prisma, Flyway (`V1__name.sql`), knex/Sequelize,
|
|
8
|
+
* and timestamped SQL — and surfaces them with a parsed version + name. Pure,
|
|
9
|
+
* zero-dependency, deterministic.
|
|
10
|
+
*
|
|
11
|
+
* @param {string[]} files — absolute file paths (unused; the tree is walked)
|
|
12
|
+
* @param {string} cwd — project root
|
|
13
|
+
* @returns {string} formatted markdown table (empty string if none found)
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
const fs = require('fs');
|
|
17
|
+
const path = require('path');
|
|
18
|
+
|
|
19
|
+
// Traversal limits: how many directory levels below the project root are
// visited, and how many table rows are emitted before the rest is summarised.
const MAX_DEPTH = 6;
const MAX_ROWS = 200;

// Directory names that are never descended into.
const SKIP_DIR = new Set([
  '.git',
  'node_modules',
  'vendor',
  'dist',
  'build',
  'target',
  '.venv',
  'venv',
  '__pycache__',
]);

// Lower-cased file extensions that may hold a migration.
const MIG_EXT = new Set([
  '.sql',
  '.rb',
  '.py',
  '.js',
  '.ts',
]);

// Matched against a directory's project-relative, forward-slash path: files
// directly inside a matching directory are treated as migrations.
const MIG_DIR_RE = /(^|\/)(db\/migrate|migrations?|alembic\/versions|prisma\/migrations)$/i;

// Matched against a bare file name: these identify a migration no matter
// which directory the file lives in.
//   FLYWAY_RE    — `V<version>__<name>.sql|java`; group 1 is the name.
//   TIMESTAMP_RE — `<8+ digits>[_-]<name>.<ext>`; groups 1/2 are version/name.
//   NAMED_RE     — "migration"/"migrations" delimited by `.`, `_` or `-`.
const FLYWAY_RE = /^V\d+(?:[._]\d+)*__(.+)\.(sql|java)$/;
const TIMESTAMP_RE = /^(\d{8,})[_-](.+)\.(sql|rb|py|js|ts)$/;
const NAMED_RE = /[._-]migrations?[._-]/i;
|
|
30
|
+
|
|
31
|
+
/**
 * Recursively scan `dir` and append every detected migration file to `out`.
 *
 * A file is reported when, in priority order:
 *   1. its name matches FLYWAY_RE (`V1__name.sql` / `V1__name.java`),
 *   2. its name matches TIMESTAMP_RE (`20240101_name.sql`),
 *   3. it sits directly inside a directory matching MIG_DIR_RE,
 *   4. its name matches NAMED_RE, or
 *   5. it is a Prisma-style `migration.sql` whose *grandparent* directory
 *      matches MIG_DIR_RE (`prisma/migrations/<timestamp>_<name>/migration.sql`);
 *      version and name are then taken from the containing folder's name.
 *
 * Entries are visited in code-unit name order so the result is deterministic.
 *
 * @param {string} dir — absolute directory to scan
 * @param {string} cwd — project root; reported paths are relative to it
 * @param {number} depth — current recursion depth (0 at the root)
 * @param {{version: string, name: string, file: string}[]} out — accumulator, mutated in place
 * @returns {void}
 */
function walk(dir, cwd, depth, out) {
  if (depth > MAX_DEPTH) return;

  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch (_) {
    // Best-effort scan: an unreadable or vanished directory is simply skipped.
    return;
  }
  entries.sort((a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0));

  const relDir = path.relative(cwd, dir).replace(/\\/g, '/');
  const dirIsMigration = MIG_DIR_RE.test(relDir);

  // Prisma keeps each migration in its own folder, so the folder that matches
  // MIG_DIR_RE is the parent of `dir`, not `dir` itself.
  const slash = relDir.lastIndexOf('/');
  const parentIsMigration = slash > 0 && MIG_DIR_RE.test(relDir.slice(0, slash));
  const dirName = relDir.slice(slash + 1);

  for (const e of entries) {
    if (e.isDirectory()) {
      if (SKIP_DIR.has(e.name)) continue;
      walk(path.join(dir, e.name), cwd, depth + 1, out);
      continue;
    }

    const rawExt = path.extname(e.name);
    const ext = rawExt.toLowerCase();

    // FLYWAY_RE accepts `.java`, which MIG_EXT does not list; test it before
    // the extension filter so Java-based Flyway migrations are not dropped.
    const flyway = e.name.match(FLYWAY_RE);
    if (!flyway && !MIG_EXT.has(ext)) continue;

    const rel = path.relative(cwd, path.join(dir, e.name)).replace(/\\/g, '/');
    // Strip the real trailing extension (any case) rather than the first
    // occurrence of the lower-cased one.
    const baseName = e.name.slice(0, e.name.length - rawExt.length);

    let version = null;
    let name = null;
    let m;

    if (flyway) {
      version = e.name.split('__')[0];
      name = flyway[1].replace(/_/g, ' ');
    } else if ((m = e.name.match(TIMESTAMP_RE))) {
      version = m[1];
      name = m[2].replace(/[_-]/g, ' ');
    } else if (dirIsMigration) {
      version = '—';
      name = baseName;
    } else if (NAMED_RE.test(e.name)) {
      version = '—';
      name = baseName;
    } else if (parentIsMigration && e.name.toLowerCase() === 'migration.sql') {
      // Reuse TIMESTAMP_RE on the folder name by giving it a dummy extension.
      const dm = `${dirName}.sql`.match(TIMESTAMP_RE);
      version = dm ? dm[1] : '—';
      name = dm ? dm[2].replace(/[_-]/g, ' ') : dirName;
    } else {
      continue;
    }

    out.push({ version, name, file: rel });
  }
}
|
|
63
|
+
|
|
64
|
+
/**
 * Build the migrations markdown table for the project rooted at `cwd`.
 *
 * Rows are sorted by project-relative file path (code-unit order). At most
 * MAX_ROWS rows are listed; any remainder is summarised in a final
 * "+N more" row. Cell text is escaped so that a `|` or a line break inside a
 * file name cannot break the table layout.
 *
 * @param {string[]} files — unused; the tree under `cwd` is walked instead
 * @param {string} cwd — project root
 * @returns {string} markdown table, or an empty string when nothing is found
 */
function analyze(files, cwd) {
  const found = [];
  walk(cwd, cwd, 0, found);
  if (found.length === 0) return '';

  found.sort((a, b) => (a.file < b.file ? -1 : a.file > b.file ? 1 : 0));

  // A raw pipe would start a new column and a newline would end the row.
  const cell = (value) => String(value).replace(/\|/g, '\\|').replace(/[\r\n]+/g, ' ');

  const lines = [
    '| Version | Migration | File |',
    '|---------|-----------|------|',
  ];
  for (const r of found.slice(0, MAX_ROWS)) {
    lines.push(`| ${cell(r.version)} | ${cell(r.name)} | ${cell(r.file)} |`);
  }
  if (found.length > MAX_ROWS) {
    lines.push(`| … | +${found.length - MAX_ROWS} more | |`);
  }
  return lines.join('\n');
}
|
|
83
|
+
|
|
84
|
+
module.exports = { analyze };
|
package/src/mcp/handlers.js
CHANGED
|
@@ -21,6 +21,10 @@ const MAP_SECTIONS = {
|
|
|
21
21
|
imports: '### Import graph',
|
|
22
22
|
classes: '### Class hierarchy',
|
|
23
23
|
routes: '### Route table',
|
|
24
|
+
env: '### Environment variables',
|
|
25
|
+
buildci: '### Build & CI',
|
|
26
|
+
manifests: '### Config & manifests',
|
|
27
|
+
migrations: '### Database migrations',
|
|
24
28
|
};
|
|
25
29
|
|
|
26
30
|
/**
|
|
@@ -106,7 +110,7 @@ function getMap(args, cwd) {
|
|
|
106
110
|
|
|
107
111
|
const header = MAP_SECTIONS[args.type];
|
|
108
112
|
if (!header) {
|
|
109
|
-
return `Unknown map type: "${args.type}". Use:
|
|
113
|
+
return `Unknown map type: "${args.type}". Use: ${Object.keys(MAP_SECTIONS).join(', ')}`;
|
|
110
114
|
}
|
|
111
115
|
|
|
112
116
|
const mapPath = path.join(cwd, 'PROJECT_MAP.md');
|
package/src/mcp/server.js
CHANGED
package/src/retrieval/bm25.js
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* SigMap identifier-aware BM25 re-ranker (zero dependencies, deterministic).
|
|
5
|
+
*
|
|
6
|
+
* Plain exact-token TF-IDF misses queries whose terms live *inside* code
|
|
7
|
+
* identifiers — e.g. `component emit` never surfaces `componentEmits.ts`,
|
|
8
|
+
* because "componentEmits" is one token that shares no exact term with the
|
|
9
|
+
* query. This module fixes that with four small additions:
|
|
10
|
+
*
|
|
11
|
+
* 1. Identifier-aware tokenization — split camelCase and snake_case.
|
|
12
|
+
* 2. Light stemming — plurals / common suffixes (`emits` → `emit`).
|
|
13
|
+
* 3. Path-token boost — file path / basename tokens weigh PATH_BOOST× more.
|
|
14
|
+
* 4. BM25 scoring instead of raw TF-IDF (length-normalized).
|
|
15
|
+
*
|
|
16
|
+
* On 85 curated tasks across 17 repos this lifted hit@5 from 75.3% → 82.4%
|
|
17
|
+
* (MRR +16% relative). See issue #395.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
// Tokens discarded during tokenization: everyday English function words,
// followed by a few code verbs/nouns that show up in almost every signature
// and therefore contribute next to nothing to retrieval.
const STOP = new Set([
  'a', 'an', 'the', 'of', 'to', 'in', 'on', 'for', 'and', 'or', 'is',
  'are', 'be', 'by', 'with', 'as', 'at', 'from', 'that', 'this', 'it', 'its',
  'into', 'get', 'set', 'add', 'new', 'return', 'value', 'test',
]);
|
|
27
|
+
|
|
28
|
+
/**
 * Conservative suffix stripper aimed at code identifiers, not prose.
 *
 * Tokens of three characters or fewer are returned untouched. Otherwise a
 * fixed sequence of suffix rewrites is applied once each, in order; if what
 * remains is shorter than three characters the original token is returned.
 *
 * @param {string} w
 * @returns {string}
 */
function stem(w) {
  if (w.length <= 3) return w;

  // Order matters: plural handling runs before the derivational suffixes.
  const rules = [
    [/ies$/, 'y'],
    [/(sses|shes|ches|xes|zes)$/, (hit) => hit.slice(0, -2)],
    [/([^s])s$/, '$1'],
    [/(ization|izations)$/, 'ize'],
    [/(ing|edly|ed|er|ers|ation|ations|ment|ness|ity|ive|able|ible|ize|ise|al)$/, ''],
  ];

  let reduced = w;
  for (const [pattern, replacement] of rules) {
    reduced = reduced.replace(pattern, replacement);
  }

  return reduced.length < 3 ? w : reduced;
}
|
|
46
|
+
|
|
47
|
+
/**
 * Turn free text or code into a list of normalized search tokens.
 *
 * Non-alphanumeric runs (including `_`) and camelCase boundaries become word
 * breaks; words are lower-cased, single characters and stop words are
 * dropped, and each survivor is stemmed.
 *
 * @param {string} text
 * @returns {string[]} tokens in order of appearance (empty for non-string or empty input)
 */
function tokenize(text) {
  if (!text || typeof text !== 'string') return [];

  const spaced = text
    .replace(/[^A-Za-z0-9]+/g, ' ')
    // fooBar / foo1Bar -> "foo Bar"
    .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
    // HTTPServer -> "HTTP Server"
    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2');

  const tokens = [];
  for (const word of spaced.toLowerCase().split(/\s+/)) {
    if (word.length <= 1 || STOP.has(word)) continue;
    const root = stem(word);
    if (root) tokens.push(root);
  }
  return tokens;
}
|
|
66
|
+
|
|
67
|
+
// The file path / basename is highly indicative of relevance, so its tokens
|
|
68
|
+
// are counted PATH_BOOST times when building the document term-frequency map.
|
|
69
|
+
const PATH_BOOST = 3;
|
|
70
|
+
|
|
71
|
+
/**
 * BM25 re-rank of candidates against a query.
 *
 * Each candidate is `{ file, sigs }`. Its document is the tokenized
 * signatures plus the tokenized file path repeated PATH_BOOST times. The
 * returned objects are shallow copies of the candidates with a numeric
 * `score` added (higher = more relevant), sorted best-first. Ties are broken
 * by comparing the `file` strings by UTF-16 code unit, so the order does not
 * depend on the host's locale. A `score` of 0 means no query token matched —
 * callers typically drop those.
 *
 * @param {string} query
 * @param {{ file: string, sigs: string[] }[]} candidates — a string `sigs` is
 *   accepted and treated as a single signature
 * @returns {Array<object & { score: number }>} empty array when `candidates`
 *   is not a non-empty array
 */
function bm25rank(query, candidates) {
  if (!Array.isArray(candidates) || candidates.length === 0) return [];

  const k1 = 1.5; // term-frequency saturation
  const b = 0.75; // strength of document-length normalization

  const docs = candidates.map((c) => {
    const pathToks = tokenize(c.file || '');
    let sigs = [];
    if (Array.isArray(c.sigs)) sigs = c.sigs;
    else if (typeof c.sigs === 'string') sigs = [c.sigs];
    const toks = tokenize(sigs.join(' '));
    for (let i = 0; i < PATH_BOOST; i++) toks.push(...pathToks);
    const tf = new Map();
    for (const t of toks) tf.set(t, (tf.get(t) || 0) + 1);
    return { cand: c, tf, len: toks.length };
  });

  const N = docs.length || 1;
  // `|| 1` keeps the length ratio finite when every document is empty.
  const avgdl = docs.reduce((s, d) => s + d.len, 0) / N || 1;

  // Document frequency: in how many documents each token occurs.
  const df = new Map();
  for (const d of docs) {
    for (const t of d.tf.keys()) df.set(t, (df.get(t) || 0) + 1);
  }

  // Each distinct query token contributes at most once.
  const qToks = [...new Set(tokenize(query))];

  const byCodeUnit = (x, y) => (x < y ? -1 : x > y ? 1 : 0);

  return docs
    .map((d) => {
      let score = 0;
      for (const t of qToks) {
        const f = d.tf.get(t);
        if (!f) continue;
        const dfT = df.get(t);
        // The `1 +` inside the log keeps idf positive even for very common terms.
        const idf = Math.log(1 + (N - dfT + 0.5) / (dfT + 0.5));
        score += (idf * (f * (k1 + 1))) / (f + k1 * (1 - b + (b * d.len) / avgdl));
      }
      return Object.assign({}, d.cand, { score });
    })
    .sort((a, c) => c.score - a.score || byCodeUnit(String(a.file), String(c.file)));
}
|
|
121
|
+
|
|
122
|
+
module.exports = { tokenize, stem, bm25rank, PATH_BOOST, STOP };
|
package/src/retrieval/ranker.js
CHANGED
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
|
|
20
20
|
const { loadWeights } = require('../learning/weights');
|
|
21
21
|
const { tokenize, STOP_WORDS } = require('./tokenizer');
|
|
22
|
+
const { bm25rank } = require('./bm25');
|
|
22
23
|
|
|
23
24
|
// ---------------------------------------------------------------------------
|
|
24
25
|
// Default weights
|
|
@@ -197,11 +198,24 @@ function rank(query, sigIndex, opts) {
|
|
|
197
198
|
return all.slice(0, topK);
|
|
198
199
|
}
|
|
199
200
|
|
|
201
|
+
// Identifier-aware BM25 base relevance over the whole index (#395). BM25
|
|
202
|
+
// splits camelCase/snake_case, stems, and boosts path tokens, so queries
|
|
203
|
+
// whose terms live inside identifiers (e.g. "component emit" → componentEmits)
|
|
204
|
+
// are matched. The existing negative-signal penalty and recency/graph/learned
|
|
205
|
+
// boosts are layered on top; the per-token signals stay for the explain table.
|
|
206
|
+
const bm25Scores = new Map();
|
|
207
|
+
for (const c of bm25rank(query, [...sigIndex.entries()].map(([file, sigs]) => ({ file, sigs })))) {
|
|
208
|
+
bm25Scores.set(c.file, c.score);
|
|
209
|
+
}
|
|
210
|
+
|
|
200
211
|
const scored = [];
|
|
201
212
|
for (const [file, sigs] of sigIndex.entries()) {
|
|
202
213
|
const result = scoreFile(file, sigs, queryTokens, weights);
|
|
203
|
-
|
|
214
|
+
const penalty = result.signals.penalty;
|
|
215
|
+
const base = bm25Scores.get(file) || 0;
|
|
216
|
+
let score = base * penalty;
|
|
204
217
|
const signals = result.signals;
|
|
218
|
+
signals.bm25 = base;
|
|
205
219
|
|
|
206
220
|
// Recency boost
|
|
207
221
|
if (recencySet && recencySet.has(file) && score > 0) {
|