sigmap 7.30.0 → 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,84 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Database-migration extractor (v8.5 C1).
5
+ *
6
+ * Detects schema-migration files across the common frameworks — Rails
7
+ * (db/migrate), Django/Alembic, Prisma, Flyway (`V1__name.sql`), knex/Sequelize,
8
+ * and timestamped SQL — and surfaces them with a parsed version + name. Pure,
9
+ * zero-dependency, deterministic.
10
+ *
11
+ * @param {string[]} files — absolute file paths (unused; the tree is walked)
12
+ * @param {string} cwd — project root
13
+ * @returns {string} formatted markdown table (empty string if none found)
14
+ */
15
+
16
+ const fs = require('fs');
17
+ const path = require('path');
18
+
19
+ const MAX_DEPTH = 6;
20
+ const MAX_ROWS = 200;
21
+ const SKIP_DIR = new Set(['.git', 'node_modules', 'vendor', 'dist', 'build', 'target', '.venv', 'venv', '__pycache__']);
22
+ const MIG_EXT = new Set(['.sql', '.rb', '.py', '.js', '.ts']);
23
+
24
+ // A directory whose path marks its children as migrations.
25
+ const MIG_DIR_RE = /(^|\/)(db\/migrate|migrations?|alembic\/versions|prisma\/migrations)$/i;
26
+ // A filename that is itself a migration regardless of directory.
27
+ const FLYWAY_RE = /^V\d+(?:[._]\d+)*__(.+)\.(sql|java)$/;
28
+ const TIMESTAMP_RE = /^(\d{8,})[_-](.+)\.(sql|rb|py|js|ts)$/;
29
+ const NAMED_RE = /[._-]migrations?[._-]/i;
30
+
31
/**
 * Recursively collect migration files below `dir` into `out`.
 *
 * Entries are visited in name order so the result does not depend on the
 * order the file system returns them. A file is reported when it is a Flyway
 * versioned file, a timestamp-prefixed file, lives in a migration directory,
 * or has "migration" in its name.
 *
 * @param {string} dir   directory to scan
 * @param {string} cwd   project root; reported paths are relative to it
 * @param {number} depth recursion depth of `dir` (0 for the root)
 * @param {{ version: string, name: string, file: string }[]} out accumulator, appended to in place
 * @returns {void}
 */
function walk(dir, cwd, depth, out) {
  if (depth > MAX_DEPTH) return;

  let entries;
  // Best effort: an unreadable directory is skipped instead of aborting the scan.
  try { entries = fs.readdirSync(dir, { withFileTypes: true }); } catch (_) { return; }
  entries.sort((a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0));

  const relDir = path.relative(cwd, dir).replace(/\\/g, '/');
  const dirIsMigration = MIG_DIR_RE.test(relDir);

  for (const e of entries) {
    if (e.isDirectory()) {
      if (SKIP_DIR.has(e.name)) continue;
      walk(path.join(dir, e.name), cwd, depth + 1, out);
      continue;
    }

    const rawExt = path.extname(e.name);
    const flyway = e.name.match(FLYWAY_RE);
    // FLYWAY_RE also accepts `.java`, which is not in MIG_EXT, so a Flyway
    // match has to bypass the extension filter or that branch is unreachable.
    if (!flyway && !MIG_EXT.has(rawExt.toLowerCase())) continue;

    const rel = path.relative(cwd, path.join(dir, e.name)).replace(/\\/g, '/');
    let version;
    let name;

    if (flyway) {
      version = e.name.split('__')[0];
      name = flyway[1].replace(/_/g, ' ');
    } else {
      const stamped = e.name.match(TIMESTAMP_RE);
      if (stamped) {
        version = stamped[1];
        name = stamped[2].replace(/[_-]/g, ' ');
      } else if (dirIsMigration || NAMED_RE.test(e.name)) {
        version = '—';
        // Strip the real trailing extension. Replacing the lower-cased
        // extension text would hit the first occurrence inside the name
        // ("config.json_seed.js") and miss upper-case extensions (".SQL").
        name = e.name.slice(0, -rawExt.length);
      } else {
        continue;
      }
    }

    out.push({ version, name, file: rel });
  }
}
63
+
64
/**
 * Build the "Database migrations" markdown table for a project.
 *
 * @param {string[]} files absolute file paths (unused; the tree under `cwd` is walked instead)
 * @param {string} cwd project root
 * @returns {string} markdown table sorted by file path, capped at MAX_ROWS
 *   rows plus a "+N more" row; empty string when no migration is found
 */
function analyze(files, cwd) {
  const found = [];
  walk(cwd, cwd, 0, found);
  if (found.length === 0) return '';

  found.sort((a, b) => (a.file < b.file ? -1 : a.file > b.file ? 1 : 0));

  // A literal "|" inside a cell would be read as a column separator and
  // corrupt the row, so it is backslash-escaped.
  const cell = (value) => String(value).replace(/\|/g, '\\|');

  const lines = [
    '| Version | Migration | File |',
    '|---------|-----------|------|',
  ];
  for (const row of found.slice(0, MAX_ROWS)) {
    lines.push(`| ${cell(row.version)} | ${cell(row.name)} | ${cell(row.file)} |`);
  }
  if (found.length > MAX_ROWS) {
    lines.push(`| … | +${found.length - MAX_ROWS} more | |`);
  }
  return lines.join('\n');
}
83
+
84
+ module.exports = { analyze };
@@ -21,6 +21,10 @@ const MAP_SECTIONS = {
21
21
  imports: '### Import graph',
22
22
  classes: '### Class hierarchy',
23
23
  routes: '### Route table',
24
+ env: '### Environment variables',
25
+ buildci: '### Build & CI',
26
+ manifests: '### Config & manifests',
27
+ migrations: '### Database migrations',
24
28
  };
25
29
 
26
30
  /**
@@ -106,7 +110,7 @@ function getMap(args, cwd) {
106
110
 
107
111
  const header = MAP_SECTIONS[args.type];
108
112
  if (!header) {
109
- return `Unknown map type: "${args.type}". Use: imports, classes, routes`;
113
+ return `Unknown map type: "${args.type}". Use: ${Object.keys(MAP_SECTIONS).join(', ')}`;
110
114
  }
111
115
 
112
116
  const mapPath = path.join(cwd, 'PROJECT_MAP.md');
package/src/mcp/server.js CHANGED
@@ -18,7 +18,7 @@ const { readContext, searchSignatures, getMap, createCheckpoint, getRouting, exp
18
18
 
19
19
  const SERVER_INFO = {
20
20
  name: 'sigmap',
21
- version: '7.30.0',
21
+ version: '8.0.0',
22
22
  description: 'SigMap MCP server — code signatures on demand',
23
23
  };
24
24
 
@@ -0,0 +1,122 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * SigMap identifier-aware BM25 re-ranker (zero dependencies, deterministic).
5
+ *
6
+ * Plain exact-token TF-IDF misses queries whose terms live *inside* code
7
+ * identifiers — e.g. `component emit` never surfaces `componentEmits.ts`,
8
+ * because "componentEmits" is one token that shares no exact term with the
9
+ * query. This module fixes that with four small additions:
10
+ *
11
+ * 1. Identifier-aware tokenization — split camelCase and snake_case.
12
+ * 2. Light stemming — plurals / common suffixes (`emits` → `emit`).
13
+ * 3. Path-token boost — file path / basename tokens weigh PATH_BOOST× more.
14
+ * 4. BM25 scoring instead of raw TF-IDF (length-normalized).
15
+ *
16
+ * On 85 curated tasks across 17 repos this lifted hit@5 from 75.3% → 82.4%
17
+ * (MRR +16% relative). See issue #395.
18
+ */
19
+
20
+ // Stop words: common English + low-signal code verbs/nouns that appear in
21
+ // nearly every signature and so carry little retrieval signal.
22
+ const STOP = new Set(
23
+ ('a an the of to in on for and or is are be by with as at from that this it its ' +
24
+ 'into get set add new return value test')
25
+ .split(' ')
26
+ );
27
+
28
/**
 * Light suffix stemmer — conservative, tuned for code identifiers rather than
 * prose. Words of 3 chars or fewer pass through unchanged.
 *
 * Each rule is applied only if its result keeps at least 3 characters;
 * otherwise that single rule is skipped and the stem reached so far is kept.
 * Falling back to the raw token instead would make a plural and its singular
 * diverge whenever a later rule overshoots (`users` → `user` → `us` would
 * revert to `users`, while `user` stays `user`).
 *
 * @param {string} w lower-cased token
 * @returns {string} stemmed token, never shorter than 3 chars for inputs longer than 3
 */
function stem(w) {
  if (w.length <= 3) return w;

  const MIN_LEN = 3;
  // Apply one rewrite rule, rejecting it when the result would be too short.
  const apply = (token, pattern, replacement) => {
    const next = token.replace(pattern, replacement);
    return next.length >= MIN_LEN ? next : token;
  };

  let s = w;
  s = apply(s, /ies$/, 'y');
  s = apply(s, /(sses|shes|ches|xes|zes)$/, (m) => m.slice(0, -2));
  s = apply(s, /([^s])s$/, '$1');
  s = apply(s, /(ization|izations)$/, 'ize');
  s = apply(s, /(ing|edly|ed|er|ers|ation|ations|ment|ness|ity|ive|able|ible|ize|ise|al)$/, '');
  return s;
}
46
+
47
/**
 * Split on non-alphanumeric characters AND camelCase / snake_case boundaries,
 * lowercase, drop stop words and single characters, then stem.
 *
 * Stop words are checked both before and after stemming, so inflected forms
 * (`tests`, `values`, `returns`) are dropped just like their base form rather
 * than slipping through as `test`, `value`, `return`.
 *
 * @param {string} text
 * @returns {string[]} stemmed tokens in source order (duplicates kept);
 *   empty array for empty or non-string input
 */
function tokenize(text) {
  if (!text || typeof text !== 'string') return [];

  const words = text
    .replace(/[^A-Za-z0-9]+/g, ' ')
    // fooBar -> foo Bar
    .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
    // HTMLParser -> HTML Parser
    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
    .toLowerCase()
    .split(/\s+/);

  const tokens = [];
  for (const word of words) {
    if (word.length <= 1 || STOP.has(word)) continue;
    const stemmed = stem(word);
    if (!stemmed || STOP.has(stemmed)) continue;
    tokens.push(stemmed);
  }
  return tokens;
}
66
+
67
+ // The file path / basename is highly indicative of relevance, so its tokens
68
+ // are counted PATH_BOOST times when building the document term-frequency map.
69
+ const PATH_BOOST = 3;
70
+
71
/**
 * BM25 re-rank of candidates against a query. Each candidate is
 * `{ file, sigs }`; the returned objects preserve all original candidate
 * fields and add a numeric `score` (higher = more relevant), sorted best-first.
 * Ties are broken by plain code-unit comparison of `file`, so the order does
 * not depend on the host's default locale. A `score` of 0 means no query token
 * matched — callers typically drop those.
 *
 * Candidates are not mutated; each result is a shallow copy.
 *
 * @param {string} query
 * @param {{ file: string, sigs: string[] }[]} candidates
 * @returns {Array<object & { score: number }>}
 */
function bm25rank(query, candidates) {
  if (!Array.isArray(candidates) || candidates.length === 0) return [];

  // BM25 parameters: k1 is the term-frequency saturation, b the length normalization.
  const k1 = 1.5;
  const b = 0.75;

  const docs = candidates.map((cand) => {
    const tf = new Map();
    let len = 0;
    // Count each token `weight` times; same totals as repeating the token list.
    const count = (tokens, weight) => {
      for (const t of tokens) tf.set(t, (tf.get(t) || 0) + weight);
      len += tokens.length * weight;
    };
    count(tokenize((cand.sigs || []).join(' ')), 1);
    count(tokenize(cand.file || ''), PATH_BOOST);
    return { cand, tf, len };
  });

  const N = docs.length;
  // `|| 1` guards the division below when every document is empty.
  const avgdl = docs.reduce((sum, d) => sum + d.len, 0) / N || 1;

  // Document frequency: number of documents containing each token.
  const df = new Map();
  for (const d of docs) {
    for (const t of d.tf.keys()) df.set(t, (df.get(t) || 0) + 1);
  }

  // Each distinct query token contributes once.
  const qToks = [...new Set(tokenize(query))];

  const byFile = (x, y) => {
    const fx = String(x.file);
    const fy = String(y.file);
    return fx < fy ? -1 : fx > fy ? 1 : 0;
  };

  return docs
    .map((d) => {
      let score = 0;
      for (const t of qToks) {
        const f = d.tf.get(t);
        if (!f) continue;
        const dfT = df.get(t);
        const idf = Math.log(1 + (N - dfT + 0.5) / (dfT + 0.5));
        score += (idf * (f * (k1 + 1))) / (f + k1 * (1 - b + (b * d.len) / avgdl));
      }
      return Object.assign({}, d.cand, { score });
    })
    .sort((x, y) => y.score - x.score || byFile(x, y));
}
121
+
122
+ module.exports = { tokenize, stem, bm25rank, PATH_BOOST, STOP };
@@ -19,6 +19,7 @@
19
19
 
20
20
  const { loadWeights } = require('../learning/weights');
21
21
  const { tokenize, STOP_WORDS } = require('./tokenizer');
22
+ const { bm25rank } = require('./bm25');
22
23
 
23
24
  // ---------------------------------------------------------------------------
24
25
  // Default weights
@@ -197,11 +198,24 @@ function rank(query, sigIndex, opts) {
197
198
  return all.slice(0, topK);
198
199
  }
199
200
 
201
+ // Identifier-aware BM25 base relevance over the whole index (#395). BM25
202
+ // splits camelCase/snake_case, stems, and boosts path tokens, so queries
203
+ // whose terms live inside identifiers (e.g. "component emit" → componentEmits)
204
+ // are matched. The existing negative-signal penalty and recency/graph/learned
205
+ // boosts are layered on top; the per-token signals stay for the explain table.
206
+ const bm25Scores = new Map();
207
+ for (const c of bm25rank(query, [...sigIndex.entries()].map(([file, sigs]) => ({ file, sigs })))) {
208
+ bm25Scores.set(c.file, c.score);
209
+ }
210
+
200
211
  const scored = [];
201
212
  for (const [file, sigs] of sigIndex.entries()) {
202
213
  const result = scoreFile(file, sigs, queryTokens, weights);
203
- let score = result.score;
214
+ const penalty = result.signals.penalty;
215
+ const base = bm25Scores.get(file) || 0;
216
+ let score = base * penalty;
204
217
  const signals = result.signals;
218
+ signals.bm25 = base;
205
219
 
206
220
  // Recency boost
207
221
  if (recencySet && recencySet.has(file) && score > 0) {