sigmap 7.30.0 → 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/gen-context.js CHANGED
@@ -4136,6 +4136,7 @@ __factories["./src/eval/runner"] = function(module, exports) {
4136
4136
  const fs = require('fs');
4137
4137
  const path = require('path');
4138
4138
  const { aggregate } = __require('./src/eval/scorer');
4139
+ const { bm25rank } = __require('./src/retrieval/bm25');
4139
4140
 
4140
4141
  // ---------------------------------------------------------------------------
4141
4142
  // Context file reader
@@ -4197,79 +4198,26 @@ __factories["./src/eval/runner"] = function(module, exports) {
4197
4198
  }
4198
4199
 
4199
4200
  // ---------------------------------------------------------------------------
4200
- // Simple keyword-based ranking (pre-retrieval layer; v2.3 adds proper ranker)
4201
+ // Identifier-aware BM25 ranking (v7.31; see src/retrieval/bm25.js and #395)
4201
4202
  // ---------------------------------------------------------------------------
4202
4203
 
4203
- /**
4204
- * Tokenize a query or signature into lower-case word tokens.
4205
- * Splits on whitespace, punctuation, camelCase, and snake_case.
4206
- * @param {string} text
4207
- * @returns {string[]}
4208
- */
4209
- function tokenize(text) {
4210
- if (!text) return [];
4211
- return text
4212
- // split camelCase
4213
- .replace(/([a-z])([A-Z])/g, '$1 $2')
4214
- // split snake/kebab
4215
- .replace(/[_\-]/g, ' ')
4216
- // drop non-word chars
4217
- .replace(/[^\w\s]/g, ' ')
4218
- .toLowerCase()
4219
- .split(/\s+/)
4220
- .filter((t) => t.length > 1);
4221
- }
4222
-
4223
- const STOP_WORDS = new Set([
4224
- 'the', 'a', 'an', 'in', 'of', 'to', 'for', 'and', 'or', 'is', 'are',
4225
- 'that', 'this', 'it', 'with', 'from', 'by', 'be', 'as', 'on', 'at',
4226
- ]);
4204
+ const { tokenize } = __require('./src/retrieval/bm25');
4227
4205
 
4228
4206
  /**
4229
- * Score a single file's signatures against a query.
4230
- * Returns a non-negative number; higher = more relevant.
4231
- * @param {string[]} sigs - array of signature strings for this file
4232
- * @param {string[]} queryTokens
4233
- * @returns {number}
4234
- */
4235
- function scoreFile(sigs, queryTokens) {
4236
- if (!sigs || sigs.length === 0) return 0;
4237
-
4238
- const sigText = sigs.join(' ');
4239
- const sigTokens = new Set(tokenize(sigText));
4240
-
4241
- let score = 0;
4242
- for (const qt of queryTokens) {
4243
- if (STOP_WORDS.has(qt)) continue;
4244
- if (sigTokens.has(qt)) score += 1;
4245
- // Partial match (prefix)
4246
- for (const st of sigTokens) {
4247
- if (st !== qt && st.startsWith(qt) && qt.length >= 4) score += 0.3;
4248
- }
4249
- }
4250
-
4251
- return score;
4252
- }
4253
-
4254
- /**
4255
- * Rank all files in the index against a query. Returns file paths sorted
4256
- * by relevance score descending. Ties are broken by file path alphabetically.
4207
+ * Rank all files in the index against a query with the identifier-aware BM25
4208
+ * re-ranker. Returns file entries sorted by relevance score descending; ties
4209
+ * are broken by file path alphabetically (deterministic).
4257
4210
  * @param {string} query
4258
4211
  * @param {Map<string, string[]>} index
4259
4212
  * @param {number} topK
4260
4213
  * @returns {{ file: string, score: number, sigs: string[] }[]}
4261
4214
  */
4262
4215
  function rank(query, index, topK = 10) {
4263
- const queryTokens = tokenize(query);
4264
- const scored = [];
4265
-
4216
+ const candidates = [];
4266
4217
  for (const [file, sigs] of index.entries()) {
4267
- const score = scoreFile(sigs, queryTokens);
4268
- scored.push({ file, score, sigs });
4218
+ candidates.push({ file, sigs });
4269
4219
  }
4270
-
4271
- scored.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file));
4272
- return scored.slice(0, topK);
4220
+ return bm25rank(query, candidates).slice(0, topK);
4273
4221
  }
4274
4222
 
4275
4223
  // ---------------------------------------------------------------------------
@@ -4661,7 +4609,14 @@ __factories["./src/evidence/pack"] = function(module, exports) {
4661
4609
  const GENERATED_RE = /(^|\/)(dist|build|out|vendor|node_modules)\/|\.(generated|min|bundle)\.|\.(pb|_pb)\.|\.pb\.go$|_pb2\.py$/;
4662
4610
  const TEST_RE = /(^|\/)(tests?|__tests__|spec|specs)\/|\.(test|spec)\.[a-z]+$|(^|\/)test_[^/]+\.py$|_test\.(go|py|rb)$/;
4663
4611
  const CONFIG_RE = /\.(json|ya?ml|toml|ini|conf|config|properties|env)$|(^|\/)(\.?[a-z]+rc)$|\.config\.[a-z]+$/i;
4664
- const SECURITY_RE = /(^|\/|[._-])(auth|authn|authz|login|password|passwd|secret|credential|token|session|crypto|cipher|payment|billing|checkout|oauth|jwt|permission|acl|rbac)([._-]|\/|$)/i;
4612
+ // DB migrations: framework dirs (Rails/Alembic/Prisma), Flyway `V1__x.sql`,
4613
+ // timestamped migration files, and `*_migration.*` naming.
4614
+ const MIGRATION_RE = /(^|\/)(migrations?|alembic\/versions|prisma\/migrations)(\/|$)|(^|\/)db\/migrate\/|(^|\/)V\d+(_\d+)*__[^/]+\.(sql|java)$|(^|\/)\d{8,}[_-][^/]+\.(sql|rb|py|js|ts)$|[._-]migration[s]?[._-]/i;
4615
+ const PAYMENT_RE = /(^|\/|[._-])(payment|payments|billing|checkout|invoice|invoicing|subscription|stripe|paypal|braintree|charge|refund|payout)([._-]|\/|$)/i;
4616
+ const AUTH_RE = /(^|\/|[._-])(auth|authn|authz|login|logout|signin|signup|password|passwd|session|oauth|jwt|permission|permissions|acl|rbac|credential|credentials)([._-]|\/|$)/i;
4617
+ const SECURITY_RE = /(^|\/|[._-])(secret|secrets|crypto|cipher|encrypt|decrypt|token|signing|keystore|vault)([._-]|\/|$)/i;
4618
+ // Public API surface: `api/` dirs, `public-api`, and module barrel entrypoints.
4619
+ const PUBLIC_API_RE = /(^|\/)api(\/|$)|(^|\/)public[-_]?api(\/|$)|(^|\/)index\.(js|ts|mjs|cjs)$/i;
4665
4620
 
4666
4621
  /**
4667
4622
  * Split a signature's ` :start-end` line anchor from its symbol text.
@@ -4679,17 +4634,25 @@ __factories["./src/evidence/pack"] = function(module, exports) {
4679
4634
  }
4680
4635
 
4681
4636
  /**
4682
- * Classify a file into a coarse risk label. Path-based heuristic (v1) — the
4683
- * richer label set (C3) lands in v8.5.
4637
+ * Classify a file into a risk label (C3, v8.5). Path-based, deterministic.
4638
+ * Precedence is strict, most-specific-risk first: a migration touching payments
4639
+ * is labeled `migration` (a schema change is the dominant risk), payment/auth
4640
+ * outrank the generic `security` bucket, and `config`/`public-api` resolve
4641
+ * before the `source` fallback. `test`/`generated` semantics are preserved so
4642
+ * existing consumers (findRelatedTests, verifier) keep working.
4684
4643
  * @param {string} relPath
4685
- * @returns {'generated'|'test'|'config'|'security'|'source'}
4644
+ * @returns {'generated'|'test'|'migration'|'payment'|'auth'|'security'|'config'|'public-api'|'source'}
4686
4645
  */
4687
4646
  function riskLabelFor(relPath) {
4688
4647
  const p = relPath.replace(/\\/g, '/');
4689
4648
  if (GENERATED_RE.test(p)) return 'generated';
4690
4649
  if (TEST_RE.test(p)) return 'test';
4650
+ if (MIGRATION_RE.test(p)) return 'migration';
4651
+ if (PAYMENT_RE.test(p)) return 'payment';
4652
+ if (AUTH_RE.test(p)) return 'auth';
4691
4653
  if (SECURITY_RE.test(p)) return 'security';
4692
4654
  if (CONFIG_RE.test(p)) return 'config';
4655
+ if (PUBLIC_API_RE.test(p)) return 'public-api';
4693
4656
  return 'source';
4694
4657
  }
4695
4658
 
@@ -4700,9 +4663,28 @@ __factories["./src/evidence/pack"] = function(module, exports) {
4700
4663
  }
4701
4664
 
4702
4665
  /**
4703
- * Best-effort impl→test discovery (v1). Matches test files whose stem equals
4704
- * the implementation file's stem, by common convention. Deterministic. The
4705
- * accuracy-measured discovery (C2) lands in v8.5.
4666
+ * Infer the implementation stem a test file targets, by stripping the
4667
+ * conventional test affixes across languages (measured in the C2 benchmark):
4668
+ * foo.test.js / foo.spec.ts → foo (JS/TS)
4669
+ * test_foo.py → foo (Python / pytest)
4670
+ * foo_test.go / foo_test.py → foo (Go, unittest)
4671
+ * FooTest.java / BarSpec.scala → Foo (JVM, PascalCase)
4672
+ * @param {string} relPath
4673
+ * @returns {string}
4674
+ */
4675
function testTargetStem(relPath) {
  // Applied in this order to whatever stemOf() returns (the original inline
  // note says stemOf already drops the extension and a trailing .test/.spec):
  //   1. leading  test_ / test-   (Python: test_foo)
  //   2. trailing _test / -test   (Go / unittest: foo_test)
  //   3. trailing Test(s)/Spec(s) (JVM PascalCase: FooTest, BarSpec) — case-sensitive
  const affixes = [/^test[_-]/i, /[_-]test$/i, /(Tests?|Specs?)$/];
  return affixes.reduce((current, affix) => current.replace(affix, ''), stemOf(relPath));
}
4682
+
4683
+ /**
4684
+ * Impl→test discovery (C2, v8.5). Matches test files back to their
4685
+ * implementation by normalizing conventional test affixes, so JS/TS, Python,
4686
+ * Go, and JVM naming conventions all resolve. Deterministic; accuracy is
4687
+ * measured by `scripts/run-test-discovery-benchmark.mjs`.
4706
4688
  * @param {string} relPath
4707
4689
  * @param {string[]} allFiles - universe of indexed files (relative paths)
4708
4690
  * @returns {string[]}
@@ -4715,7 +4697,7 @@ __factories["./src/evidence/pack"] = function(module, exports) {
4715
4697
  for (const f of allFiles) {
4716
4698
  if (f === relPath) continue;
4717
4699
  if (riskLabelFor(f) !== 'test') continue;
4718
- if (stemOf(f).toLowerCase() === stem) out.push(f);
4700
+ if (testTargetStem(f).toLowerCase() === stem) out.push(f);
4719
4701
  }
4720
4702
  return out.sort();
4721
4703
  }
@@ -11231,6 +11213,101 @@ __factories["./src/learning/weights"] = function(module, exports) {
11231
11213
 
11232
11214
  };
11233
11215
 
11216
+ // ── ./src/map/build-ci ──
11217
+ __factories["./src/map/build-ci"] = function(module, exports) {
11218
+
11219
+ /**
11220
+ * Build & CI extractor (v8.5 C1).
11221
+ *
11222
+ * Surfaces how the project is built and validated: npm/pnpm/yarn scripts
11223
+ * (package.json), GitHub Actions workflows (.github/workflows/*.yml), and
11224
+ * Makefile targets. Pure, zero-dependency, deterministic.
11225
+ *
11226
+ * @param {string[]} files — absolute file paths (unused; roots are read directly)
11227
+ * @param {string} cwd — project root
11228
+ * @returns {string} formatted markdown table (empty string if none found)
11229
+ */
11230
+
11231
+ const fs = require('fs');
11232
+ const path = require('path');
11233
+
11234
+ const MAX_ROWS = 120;
11235
+
11236
/**
 * Read a file and parse it as JSON.
 * Best-effort: a missing/unreadable file or malformed JSON yields `null`
 * instead of throwing.
 * @param {string} p - path of the file to read
 * @returns {*} the parsed value, or null on any failure
 */
function readJson(p) {
  let parsed = null;
  try {
    const raw = fs.readFileSync(p, 'utf8');
    parsed = JSON.parse(raw);
  } catch (_) {
    parsed = null;
  }
  return parsed;
}
11239
+
11240
/**
 * Append one `script` row per entry of package.json's `scripts` map, in
 * sorted key order. Does nothing when package.json cannot be read/parsed or
 * has no object-valued `scripts` field.
 * @param {string} cwd - project root
 * @param {{ kind: string, name: string, detail: string }[]} rows - mutated in place
 */
function npmScripts(cwd, rows) {
  const manifest = readJson(path.join(cwd, 'package.json'));
  const scripts = manifest ? manifest.scripts : null;
  if (!scripts || typeof scripts !== 'object') return;

  Object.keys(scripts)
    .sort()
    .forEach((name) => {
      rows.push({ kind: 'script', name, detail: 'npm run ' + name });
    });
}
11247
+
11248
/**
 * Append one `ci` row per GitHub Actions workflow found in
 * `<cwd>/.github/workflows` (files ending in .yml/.yaml, sorted by name).
 *
 * The row name is the workflow's top-level `name:` (quotes stripped) or the
 * file name when there is none. The detail is the file name followed by its
 * trigger events. Triggers are read from the top-level `on:` key (also
 * accepted quoted, as `"on":` / `'on':`):
 *   - inline form (`on: push`, `on: [push, pull_request]`) — brackets and
 *     quotes are removed;
 *   - block form — only the direct children of `on:` are collected (mapping
 *     keys or `- item` list entries), deduplicated and capped at 6.
 *
 * This is a line-based scan, not a YAML parser. Best-effort: a missing
 * directory or unreadable file is skipped silently.
 *
 * @param {string} cwd - project root
 * @param {{ kind: string, name: string, detail: string }[]} rows - mutated in place
 */
function ciWorkflows(cwd, rows) {
  const dir = path.join(cwd, '.github', 'workflows');
  let entries;
  try { entries = fs.readdirSync(dir); } catch (_) { return; }

  for (const file of entries.sort()) {
    if (!/\.ya?ml$/i.test(file)) continue;
    let content;
    try { content = fs.readFileSync(path.join(dir, file), 'utf8'); } catch (_) { continue; }

    // `[ \t]*` (not `\s*`) keeps the match on the `name:` line itself, so an
    // empty `name:` cannot swallow the following line as its value.
    const nameMatch = content.match(/^name:[ \t]*(\S.*)$/m);
    const name = nameMatch ? nameMatch[1].trim().replace(/^['"]|['"]$/g, '') : file;

    let triggers = '';
    const onMatch = content.match(/^(?:on|'on'|"on"):[ \t]*(.*)$/m);
    if (onMatch) {
      // Drop a trailing `# comment` so `on: # triggers` counts as block form.
      const inline = onMatch[1].replace(/(^|\s)#.*$/, '').trim();
      if (inline) {
        triggers = inline.replace(/[[\]{}'"]/g, '').trim();
      } else {
        const events = [];
        let eventIndent = -1;
        const following = content.slice(onMatch.index + onMatch[0].length).split(/\r?\n/);
        for (const line of following) {
          if (!line.trim() || /^\s*#/.test(line)) continue;
          const indent = line.length - line.trimStart().length;
          if (indent === 0) break; // next top-level key: the `on:` block is over
          if (eventIndent === -1) eventIndent = indent; // first child fixes the event level
          if (indent !== eventIndent) continue; // nested settings (branches, paths, …)
          const ev = line.match(/^[ \t]+(?:-[ \t]*)?([a-z_]+)[ \t]*(?::|$)/);
          if (ev) events.push(ev[1]);
        }
        triggers = [...new Set(events)].slice(0, 6).join(', ');
      }
    }

    rows.push({ kind: 'ci', name, detail: `${file}${triggers ? ' — ' + triggers : ''}` });
  }
}
11273
+
11274
/**
 * Append one `make` row per explicit target declared in `<cwd>/Makefile`,
 * deduplicated and sorted.
 *
 * A target is a line starting with a name made of letters, digits, `_`, `.`
 * or `-` (first character not `.` or `-`), followed by a colon. Variable
 * assignments are excluded in all their colon forms (`:=`, `::=`, `:::=`);
 * double-colon rules (`clean::`) are still reported. Names starting with a
 * dot (e.g. `.PHONY`) and pattern rules (`%.o:`) never match the pattern.
 *
 * Best-effort: does nothing when the Makefile cannot be read.
 *
 * @param {string} cwd - project root
 * @param {{ kind: string, name: string, detail: string }[]} rows - mutated in place
 */
function makeTargets(cwd, rows) {
  let content;
  try { content = fs.readFileSync(path.join(cwd, 'Makefile'), 'utf8'); } catch (_) { return; }

  const targets = new Set();
  for (const line of content.split(/\r?\n/)) {
    // The lookahead rejects `=`, `:=` and `::=` right after the first colon,
    // i.e. the assignment operators `:=`, `::=` and `:::=`.
    const m = line.match(/^([a-zA-Z0-9_][a-zA-Z0-9_.-]*)\s*:(?!:{0,2}=)/);
    if (m) targets.add(m[1]);
  }

  for (const t of [...targets].sort()) {
    rows.push({ kind: 'make', name: t, detail: 'make ' + t });
  }
}
11286
+
11287
/**
 * Build the "Build & CI" markdown table: npm scripts first, then CI
 * workflows, then Makefile targets (the order in which the collectors run).
 * At most MAX_ROWS rows are rendered; any overflow is summarized in a final
 * "+N more" row.
 *
 * Name and detail cells come from project files (script names, workflow
 * `name:` values), so a literal `|` in them is escaped to keep the table's
 * column structure intact.
 *
 * @param {string[]} files - not read by this function
 * @param {string} cwd - project root
 * @returns {string} markdown table, or '' when nothing was found
 */
function analyze(files, cwd) {
  const rows = [];
  npmScripts(cwd, rows);
  ciWorkflows(cwd, rows);
  makeTargets(cwd, rows);
  if (rows.length === 0) return '';

  // An unescaped pipe inside a cell would start a new column.
  const cell = (text) => String(text).replace(/\|/g, '\\|');

  const lines = [
    '| Kind | Name | Detail |',
    '|------|------|--------|',
  ];
  for (const r of rows.slice(0, MAX_ROWS)) {
    lines.push(`| ${r.kind} | ${cell(r.name)} | ${cell(r.detail)} |`);
  }
  if (rows.length > MAX_ROWS) {
    lines.push(`| … | | +${rows.length - MAX_ROWS} more |`);
  }
  return lines.join('\n');
}
11306
+
11307
+ module.exports = { analyze };
11308
+
11309
+ };
11310
+
11234
11311
  // ── ./src/map/class-hierarchy ──
11235
11312
  __factories["./src/map/class-hierarchy"] = function(module, exports) {
11236
11313
 
@@ -11352,6 +11429,205 @@ __factories["./src/map/class-hierarchy"] = function(module, exports) {
11352
11429
 
11353
11430
  };
11354
11431
 
11432
+ // ── ./src/map/config-manifest ──
11433
+ __factories["./src/map/config-manifest"] = function(module, exports) {
11434
+
11435
+ /**
11436
+ * Config & package-manifest extractor (v8.5 C1).
11437
+ *
11438
+ * Surfaces the project's package manifests (name / version / dependency counts)
11439
+ * across ecosystems and the notable root config files present. Pure,
11440
+ * zero-dependency, deterministic.
11441
+ *
11442
+ * @param {string[]} files — absolute file paths (unused; roots are read directly)
11443
+ * @param {string} cwd — project root
11444
+ * @returns {string} formatted markdown table (empty string if none found)
11445
+ */
11446
+
11447
+ const fs = require('fs');
11448
+ const path = require('path');
11449
+
11450
+ const CONFIG_FILES = [
11451
+ 'tsconfig.json', 'jsconfig.json', '.eslintrc', '.eslintrc.json', '.eslintrc.js',
11452
+ '.prettierrc', 'babel.config.js', 'jest.config.js', 'vitest.config.ts',
11453
+ 'webpack.config.js', 'vite.config.ts', 'rollup.config.js', 'tailwind.config.js',
11454
+ 'docker-compose.yml', 'docker-compose.yaml', 'Dockerfile', '.editorconfig',
11455
+ ];
11456
+
11457
/**
 * Read a file as UTF-8 text; any read failure yields null.
 * @param {string} p
 * @returns {string|null}
 */
function readText(p) {
  let text = null;
  try {
    text = fs.readFileSync(p, 'utf8');
  } catch (_) {
    text = null;
  }
  return text;
}

/**
 * Read and JSON-parse a file; a read or parse failure yields null.
 * @param {string} p
 * @returns {*}
 */
function readJson(p) {
  let parsed = null;
  try {
    const raw = fs.readFileSync(p, 'utf8');
    parsed = JSON.parse(raw);
  } catch (_) {
    parsed = null;
  }
  return parsed;
}

/**
 * Number of own enumerable keys of an object (arrays count their indices);
 * 0 for anything falsy or non-object.
 * @param {*} obj
 * @returns {number}
 */
function count(obj) {
  if (!obj || typeof obj !== 'object') return 0;
  return Object.keys(obj).length;
}
11460
+
11461
/**
 * Append one row per package manifest found directly in `cwd`, in a fixed
 * order: package.json, pyproject.toml (else setup.py), requirements.txt,
 * Cargo.toml, go.mod, pom.xml, build.gradle(.kts), Gemfile, composer.json.
 *
 * Presence is decided by the truthiness of the file's text / parsed JSON, so
 * an empty file is treated the same as a missing one. TOML name/version are
 * taken from the first `name = "…"` / `version = "…"` line in the file.
 *
 * @param {string} cwd - project root
 * @param {{ manifest: string, detail: string }[]} rows - mutated in place
 */
function manifests(cwd, rows) {
  const at = (file) => path.join(cwd, file);
  const record = (manifest, detail) => { rows.push({ manifest, detail }); };

  // "name@version" from the first matching TOML lines, or 'present'.
  const tomlIdentity = (toml) => {
    const nameHit = toml.match(/^\s*name\s*=\s*["']([^"']+)["']/m);
    const versionHit = toml.match(/^\s*version\s*=\s*["']([^"']+)["']/m);
    const parts = [nameHit ? nameHit[1] : undefined, versionHit ? versionHit[1] : undefined];
    return parts.filter(Boolean).join('@') || 'present';
  };

  // npm
  const pkg = readJson(at('package.json'));
  if (pkg) {
    const id = [pkg.name, pkg.version].filter(Boolean).join('@') || 'package.json';
    const deps = count(pkg.dependencies);
    const dev = count(pkg.devDependencies);
    record('package.json (npm)', `${id} · ${deps} deps, ${dev} devDeps`);
  }

  // python — setup.py is only reported when pyproject.toml is absent/empty
  const pyproject = readText(at('pyproject.toml'));
  if (pyproject) {
    record('pyproject.toml (python)', tomlIdentity(pyproject));
  } else if (readText(at('setup.py'))) {
    record('setup.py (python)', 'present');
  }
  if (readText(at('requirements.txt'))) {
    record('requirements.txt (python)', 'present');
  }

  // rust
  const cargo = readText(at('Cargo.toml'));
  if (cargo) {
    record('Cargo.toml (rust)', tomlIdentity(cargo));
  }

  // go
  const gomod = readText(at('go.mod'));
  if (gomod) {
    const modulePath = (gomod.match(/^module\s+(\S+)/m) || [])[1];
    const goVersion = (gomod.match(/^go\s+(\S+)/m) || [])[1];
    const parts = [];
    if (modulePath) parts.push(modulePath);
    if (goVersion) parts.push('go ' + goVersion);
    record('go.mod (go)', parts.join(' · ') || 'present');
  }

  // jvm
  if (readText(at('pom.xml'))) {
    record('pom.xml (maven)', 'present');
  }
  if (readText(at('build.gradle')) || readText(at('build.gradle.kts'))) {
    record('build.gradle (gradle)', 'present');
  }

  // ruby
  if (readText(at('Gemfile'))) {
    record('Gemfile (ruby)', 'present');
  }

  // php
  const composer = readJson(at('composer.json'));
  if (composer) {
    record('composer.json (php)', `${composer.name || 'present'} · ${count(composer.require)} deps`);
  }
}
11506
+
11507
/**
 * List which of the well-known CONFIG_FILES exist directly under `cwd`,
 * preserving the order of CONFIG_FILES.
 * @param {string} cwd - project root
 * @returns {string[]}
 */
function configFiles(cwd) {
  return CONFIG_FILES.filter((candidate) => fs.existsSync(path.join(cwd, candidate)));
}
11514
+
11515
/**
 * Render the "Config & manifests" section: a manifest table (when any
 * manifest was found) followed, after a blank line, by a one-line list of the
 * config files present. Either part may appear alone.
 *
 * @param {string[]} files - not read by this function
 * @param {string} cwd - project root
 * @returns {string} markdown, or '' when neither part has content
 */
function analyze(files, cwd) {
  const rows = [];
  manifests(cwd, rows);
  const configs = configFiles(cwd);

  const sections = [];

  if (rows.length > 0) {
    const table = [
      '| Manifest | Detail |',
      '|----------|--------|',
      ...rows.map((r) => `| ${r.manifest} | ${r.detail} |`),
    ];
    sections.push(table.join('\n'));
  }

  if (configs.length > 0) {
    const listed = configs.map((c) => '`' + c + '`').join(', ');
    sections.push(`**Config files:** ${listed}`);
  }

  // Two sections are separated by exactly one empty line.
  return sections.join('\n\n');
}
11532
+
11533
+ module.exports = { analyze };
11534
+
11535
+ };
11536
+
11537
+ // ── ./src/map/env-schema ──
11538
+ __factories["./src/map/env-schema"] = function(module, exports) {
11539
+
11540
+ /**
11541
+ * Environment-variable schema extractor (v8.5 C1).
11542
+ *
11543
+ * Surfaces the environment the project actually reads — from source across
11544
+ * JS/TS, Python, Ruby, and Go, plus keys declared in a committed `.env.example`
11545
+ * / `.env.sample` / `.env.template`. Pure, zero-dependency, deterministic.
11546
+ *
11547
+ * @param {string[]} files — absolute file paths to analyze (srcDirs-scoped)
11548
+ * @param {string} cwd — project root
11549
+ * @returns {string} formatted markdown table (empty string if none found)
11550
+ */
11551
+
11552
+ const fs = require('fs');
11553
+ const path = require('path');
11554
+
11555
// Source extensions that are scanned for environment reads.
const SCAN_EXTS = new Set(['.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs', '.py', '.rb', '.go']);
// Committed example env files whose keys are also reported.
const EXAMPLE_FILES = ['.env.example', '.env.sample', '.env.template', '.env.dist'];

// Every pattern captures an UPPER_SNAKE variable name in group 1, 2 or 3;
// lower-case names are intentionally not matched.

// process.env.X / process.env['X'] / import.meta.env.X / Deno.env.get('X')
const JS_RE = /(?:process\.env|import\.meta\.env)(?:\.([A-Z_][A-Z0-9_]*)|\[\s*['"]([A-Z_][A-Z0-9_]*)['"]\s*\])|Deno\.env\.get\(\s*['"]([A-Z_][A-Z0-9_]*)['"]/g;
// os.environ['X'] / os.environ.get('X') / os.getenv('X') / getenv('X')
// `environ` must be followed by `[` or `.get(` — the previous form allowed
// `.get` but not its opening parenthesis, so `os.environ.get('X')` was missed.
const PY_RE = /(?:os\.)?(?:environ(?:\.get\(|\[)\s*['"]([A-Z_][A-Z0-9_]*)['"]|getenv\(\s*['"]([A-Z_][A-Z0-9_]*)['"])/g;
// ENV['X'] / ENV.fetch('X')
const RB_RE = /ENV(?:\[|\.fetch\()\s*['"]([A-Z_][A-Z0-9_]*)['"]/g;
// os.Getenv("X") / os.LookupEnv("X")
const GO_RE = /os\.(?:Getenv|LookupEnv)\(\s*["`']([A-Z_][A-Z0-9_]*)["`']/g;

// Cap on rendered table rows.
const MAX_ROWS = 200;
11566
+
11567
/**
 * Run a global regex over `content` and add the first non-empty capture
 * (group 1, 2 or 3) of every match to the `into` set.
 *
 * The regex is stateful (`lastIndex`), so it is rewound before scanning; the
 * scan ends when `exec` returns null.
 *
 * @param {RegExp} re - expected to carry the `g` flag
 * @param {string} content
 * @param {Set<string>} into - mutated in place
 */
function collectMatches(re, content, into) {
  re.lastIndex = 0;
  for (let hit = re.exec(content); hit !== null; hit = re.exec(content)) {
    const captured = hit[1] || hit[2] || hit[3];
    if (captured) into.add(captured);
  }
}
11575
+
11576
/**
 * Collect the variable names declared in the example env files listed in
 * EXAMPLE_FILES (looked up directly under `cwd`).
 *
 * Blank lines and lines starting with `#` are ignored. A declaration is an
 * UPPER_SNAKE name, optionally prefixed with `export`, followed by `=`.
 * Files that cannot be read are skipped.
 *
 * @param {string} cwd - project root
 * @returns {Set<string>}
 */
function readExampleKeys(cwd) {
  const declared = new Set();

  EXAMPLE_FILES.forEach((fileName) => {
    let text;
    try {
      text = fs.readFileSync(path.join(cwd, fileName), 'utf8');
    } catch (_) {
      return; // file absent or unreadable — try the next candidate
    }

    text
      .split('\n')
      .map((rawLine) => rawLine.trim())
      .filter((entry) => entry !== '' && !entry.startsWith('#'))
      .forEach((entry) => {
        const hit = /^(?:export\s+)?([A-Z_][A-Z0-9_]*)\s*=/.exec(entry);
        if (hit) declared.add(hit[1]);
      });
  });

  return declared;
}
11590
+
11591
/**
 * Render the environment-variable table.
 *
 * Each file whose extension is in SCAN_EXTS is read and scanned with the
 * pattern for its language (.py → PY_RE, .rb → RB_RE, .go → GO_RE, everything
 * else → JS_RE). The names found are merged with the keys from the example
 * env files, sorted, and rendered with a Source column saying where each name
 * was seen. Output is capped at MAX_ROWS rows plus a "+N more" line.
 *
 * @param {string[]} files - absolute paths of candidate source files
 * @param {string} cwd - project root
 * @returns {string} markdown table, or '' when no variable was found
 */
function analyze(files, cwd) {
  const patternByExt = { '.py': PY_RE, '.rb': RB_RE, '.go': GO_RE };
  const fromCode = new Set();

  files.forEach((filePath) => {
    const ext = path.extname(filePath).toLowerCase();
    if (!SCAN_EXTS.has(ext)) return;

    let content;
    try {
      content = fs.readFileSync(filePath, 'utf8');
    } catch (_) {
      return; // unreadable file: skip it
    }

    collectMatches(patternByExt[ext] || JS_RE, content, fromCode);
  });

  const fromExample = readExampleKeys(cwd);
  const names = [...new Set([...fromCode, ...fromExample])].sort();
  if (names.length === 0) return '';

  const body = names.slice(0, MAX_ROWS).map((name) => {
    const src = [];
    if (fromCode.has(name)) src.push('code');
    if (fromExample.has(name)) src.push('.env.example');
    return `| ${name} | ${src.join(', ')} |`;
  });

  const lines = ['| Variable | Source |', '|----------|--------|', ...body];
  const hidden = names.length - MAX_ROWS;
  if (hidden > 0) {
    lines.push(`| … | +${hidden} more |`);
  }
  return lines.join('\n');
}
11626
+
11627
+ module.exports = { analyze };
11628
+
11629
+ };
11630
+
11355
11631
  // ── ./src/map/import-graph ──
11356
11632
  __factories["./src/map/import-graph"] = function(module, exports) {
11357
11633
 
@@ -11541,6 +11817,94 @@ __factories["./src/map/import-graph"] = function(module, exports) {
11541
11817
 
11542
11818
  };
11543
11819
 
11820
+ // ── ./src/map/migrations ──
11821
+ __factories["./src/map/migrations"] = function(module, exports) {
11822
+
11823
+ /**
11824
+ * Database-migration extractor (v8.5 C1).
11825
+ *
11826
+ * Detects schema-migration files across the common frameworks — Rails
11827
+ * (db/migrate), Django/Alembic, Prisma, Flyway (`V1__name.sql`), knex/Sequelize,
11828
+ * and timestamped SQL — and surfaces them with a parsed version + name. Pure,
11829
+ * zero-dependency, deterministic.
11830
+ *
11831
+ * @param {string[]} files — absolute file paths (unused; the tree is walked)
11832
+ * @param {string} cwd — project root
11833
+ * @returns {string} formatted markdown table (empty string if none found)
11834
+ */
11835
+
11836
+ const fs = require('fs');
11837
+ const path = require('path');
11838
+
11839
// Directory recursion limit (the project root is depth 0).
const MAX_DEPTH = 6;
// Cap on rendered table rows.
const MAX_ROWS = 200;
// Directory names that are never descended into.
const SKIP_DIR = new Set(['.git', 'node_modules', 'vendor', 'dist', 'build', 'target', '.venv', 'venv', '__pycache__']);
// Extensions eligible as migrations (Flyway file names are checked separately).
const MIG_EXT = new Set(['.sql', '.rb', '.py', '.js', '.ts']);

// A directory path that is, or lies inside, a migrations directory. The
// trailing `(\/|$)` also accepts descendants, which is what Prisma needs:
// prisma/migrations/<timestamp>_<name>/migration.sql.
const MIG_DIR_RE = /(^|\/)(db\/migrate|migrations?|alembic\/versions|prisma\/migrations)(\/|$)/i;
// A per-migration directory named `<timestamp>_<name>`.
const STAMPED_DIR_RE = /^(\d{8,})[_-](.+)$/;
// A filename that is itself a migration regardless of directory.
const FLYWAY_RE = /^V\d+(?:[._]\d+)*__(.+)\.(sql|java)$/;
const TIMESTAMP_RE = /^(\d{8,})[_-](.+)\.(sql|rb|py|js|ts)$/;
const NAMED_RE = /[._-]migrations?[._-]/i;

/**
 * Recursively collect migration files below `dir` into `out`.
 *
 * Entries are visited in name order; directories in SKIP_DIR are not entered
 * and recursion stops below MAX_DEPTH. A file is reported when, in order of
 * precedence:
 *   1. its name is a Flyway version (`V1__name.sql` / `.java`);
 *   2. its name is `<timestamp>_<name>.<ext>`;
 *   3. it sits in a `<timestamp>_<name>` directory inside a migrations
 *      directory (version and name are taken from that directory);
 *   4. it sits anywhere inside a migrations directory, or its name contains a
 *      delimited `migration(s)` — reported with version '—'.
 * `__init__.py` is never reported.
 *
 * @param {string} dir - directory to scan
 * @param {string} cwd - project root, used to build relative paths
 * @param {number} depth - current recursion depth
 * @param {{ version: string, name: string, file: string }[]} out - mutated in place
 */
function walk(dir, cwd, depth, out) {
  if (depth > MAX_DEPTH) return;
  let entries;
  try { entries = fs.readdirSync(dir, { withFileTypes: true }); } catch (_) { return; }
  entries.sort((a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0));

  const relDir = path.relative(cwd, dir).replace(/\\/g, '/');
  const dirIsMigration = MIG_DIR_RE.test(relDir);
  const dirStamp = dirIsMigration ? path.basename(dir).match(STAMPED_DIR_RE) : null;

  for (const e of entries) {
    if (e.isDirectory()) {
      if (SKIP_DIR.has(e.name)) continue;
      walk(path.join(dir, e.name), cwd, depth + 1, out);
      continue;
    }

    const ext = path.extname(e.name).toLowerCase();
    // Flyway is tested before the extension filter so `.java` migrations,
    // which FLYWAY_RE accepts but MIG_EXT does not list, are not dropped.
    const flyway = e.name.match(FLYWAY_RE);
    if (!flyway && !MIG_EXT.has(ext)) continue;
    if (e.name === '__init__.py') continue; // package marker, not a migration

    const rel = path.relative(cwd, path.join(dir, e.name)).replace(/\\/g, '/');
    // Strip the extension by length: `replace(ext, '')` would remove the
    // first occurrence and miss an upper-case extension altogether.
    const bare = e.name.slice(0, e.name.length - ext.length);
    const stamped = flyway ? null : e.name.match(TIMESTAMP_RE);

    let version;
    let name;
    if (flyway) {
      version = e.name.split('__')[0];
      name = flyway[1].replace(/_/g, ' ');
    } else if (stamped) {
      version = stamped[1];
      name = stamped[2].replace(/[_-]/g, ' ');
    } else if (dirStamp) {
      version = dirStamp[1];
      name = dirStamp[2].replace(/[_-]/g, ' ');
    } else if (dirIsMigration || NAMED_RE.test(e.name)) {
      version = '—';
      name = bare;
    } else {
      continue;
    }

    out.push({ version, name, file: rel });
  }
}
11883
+
11884
/**
 * Render the database-migration table for the tree rooted at `cwd`.
 * Rows are ordered by relative file path and capped at MAX_ROWS, with a
 * trailing "+N more" row when the cap is exceeded.
 *
 * @param {string[]} files - not read by this function; the tree is walked instead
 * @param {string} cwd - project root
 * @returns {string} markdown table, or '' when no migration was found
 */
function analyze(files, cwd) {
  const found = [];
  walk(cwd, cwd, 0, found);
  if (!found.length) return '';

  found.sort((a, b) => {
    if (a.file < b.file) return -1;
    if (a.file > b.file) return 1;
    return 0;
  });

  const shown = found
    .slice(0, MAX_ROWS)
    .map((r) => `| ${r.version} | ${r.name} | ${r.file} |`);

  const table = [
    '| Version | Migration | File |',
    '|---------|-----------|------|',
    ...shown,
  ];

  const hidden = found.length - MAX_ROWS;
  if (hidden > 0) {
    table.push(`| … | +${hidden} more | |`);
  }
  return table.join('\n');
}
11903
+
11904
+ module.exports = { analyze };
11905
+
11906
+ };
11907
+
11544
11908
  // ── ./src/map/route-table ──
11545
11909
  __factories["./src/map/route-table"] = function(module, exports) {
11546
11910
 
@@ -11696,6 +12060,10 @@ __factories["./src/mcp/handlers"] = function(module, exports) {
11696
12060
  imports: '### Import graph',
11697
12061
  classes: '### Class hierarchy',
11698
12062
  routes: '### Route table',
12063
+ env: '### Environment variables',
12064
+ buildci: '### Build & CI',
12065
+ manifests: '### Config & manifests',
12066
+ migrations: '### Database migrations',
11699
12067
  };
11700
12068
 
11701
12069
  /**
@@ -11781,7 +12149,7 @@ __factories["./src/mcp/handlers"] = function(module, exports) {
11781
12149
 
11782
12150
  const header = MAP_SECTIONS[args.type];
11783
12151
  if (!header) {
11784
- return `Unknown map type: "${args.type}". Use: imports, classes, routes`;
12152
+ return `Unknown map type: "${args.type}". Use: ${Object.keys(MAP_SECTIONS).join(', ')}`;
11785
12153
  }
11786
12154
 
11787
12155
  const mapPath = path.join(cwd, 'PROJECT_MAP.md');
@@ -12695,7 +13063,7 @@ __factories["./src/mcp/server"] = function(module, exports) {
12695
13063
 
12696
13064
  const SERVER_INFO = {
12697
13065
  name: 'sigmap',
12698
- version: '7.30.0',
13066
+ version: '8.0.0',
12699
13067
  description: 'SigMap MCP server — code signatures on demand',
12700
13068
  };
12701
13069
 
@@ -13418,6 +13786,132 @@ __factories["./src/plan/verify-plan"] = function(module, exports) {
13418
13786
 
13419
13787
  };
13420
13788
 
13789
+ // ── ./src/retrieval/bm25 ──
13790
+ __factories["./src/retrieval/bm25"] = function(module, exports) {
13791
+
13792
+ /**
13793
+ * SigMap identifier-aware BM25 re-ranker (zero dependencies, deterministic).
13794
+ *
13795
+ * Plain exact-token TF-IDF misses queries whose terms live *inside* code
13796
+ * identifiers — e.g. `component emit` never surfaces `componentEmits.ts`,
13797
+ * because "componentEmits" is one token that shares no exact term with the
13798
+ * query. This module fixes that with four small additions:
13799
+ *
13800
+ * 1. Identifier-aware tokenization — split camelCase and snake_case.
13801
+ * 2. Light stemming — plurals / common suffixes (`emits` → `emit`).
13802
+ * 3. Path-token boost — file path / basename tokens weigh PATH_BOOST× more.
13803
+ * 4. BM25 scoring instead of raw TF-IDF (length-normalized).
13804
+ *
13805
+ * On 85 curated tasks across 17 repos this lifted hit@5 from 75.3% → 82.4%
13806
+ * (MRR +16% relative). See issue #395.
13807
+ */
13808
+
13809
// Tokens dropped before ranking: common English function words, followed by
// code verbs/nouns frequent enough in signatures to carry little signal.
const STOP = new Set([
  // English
  'a', 'an', 'the', 'of', 'to', 'in', 'on', 'for', 'and', 'or', 'is', 'are',
  'be', 'by', 'with', 'as', 'at', 'from', 'that', 'this', 'it', 'its',
  // code
  'into', 'get', 'set', 'add', 'new', 'return', 'value', 'test',
]);
13816
+
13817
/**
 * Light suffix stemmer — conservative, tuned for code identifiers rather than
 * prose. Works in two stages:
 *
 *   1. plural folding   (`queries` → `query`, `classes` → `class`, `emits` → `emit`)
 *   2. suffix stripping (`-ization` → `-ize`, then one of -ing/-ed/-er/-ation/…)
 *
 * Words of 3 chars or fewer pass through unchanged. A stage whose result
 * would be shorter than 3 chars is skipped on its own, so the result of the
 * previous stage is kept. Falling back all the way to the input instead made
 * a plural and its singular stem differently (`users` stayed `users` while
 * `user` stayed `user`), so they could never match each other.
 *
 * @param {string} w - lower-case token
 * @returns {string}
 */
function stem(w) {
  if (w.length <= 3) return w;

  // Stage 1: plural folding.
  const folded = w
    .replace(/ies$/, 'y')
    .replace(/(sses|shes|ches|xes|zes)$/, (m) => m.slice(0, -2))
    .replace(/([^s])s$/, '$1');
  const base = folded.length >= 3 ? folded : w;

  // Stage 2: derivational suffixes.
  const stripped = base
    .replace(/(ization|izations)$/, 'ize')
    .replace(/(ing|edly|ed|er|ers|ation|ations|ment|ness|ity|ive|able|ible|ize|ise|al)$/, '');
  return stripped.length >= 3 ? stripped : base;
}
13835
+
13836
/**
 * Turn free text or code into ranking tokens.
 *
 * Every run of non-alphanumeric characters becomes a space, camelCase and
 * acronym boundaries are split (`fooBar` → `foo Bar`, `XMLParser` →
 * `XML Parser`), and the text is lower-cased. Single-character words and stop
 * words are then dropped, and the remaining words are stemmed.
 *
 * @param {string} text
 * @returns {string[]} tokens in source order; [] for empty or non-string input
 */
function tokenize(text) {
  if (!text || typeof text !== 'string') return [];

  const spaced = text
    .replace(/[^A-Za-z0-9]+/g, ' ')
    .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2');

  const tokens = [];
  for (const word of spaced.toLowerCase().split(/\s+/)) {
    // The stop-word check runs on the raw word, before stemming.
    if (word.length <= 1 || STOP.has(word)) continue;
    const root = stem(word);
    if (root) tokens.push(root);
  }
  return tokens;
}
13855
+
13856
// How many times a candidate's path tokens are added to its token bag, so
// that a match on the file path weighs more than a match in a signature.
const PATH_BOOST = 3;

/**
 * Score candidates against a query with BM25 and return them best-first.
 *
 * Each candidate's "document" is the tokens of its signatures plus the tokens
 * of its file path repeated PATH_BOOST times. Every returned object is a
 * shallow copy of the candidate with a numeric `score` added; a score of 0
 * means no query token occurred in that document. Equal scores are ordered
 * by `file` using `localeCompare`.
 *
 * @param {string} query
 * @param {{ file: string, sigs: string[] }[]} candidates
 * @returns {Array<object & { score: number }>} [] when `candidates` is empty or not an array
 */
function bm25rank(query, candidates) {
  if (!Array.isArray(candidates) || candidates.length === 0) return [];

  // BM25 parameters: k1 = term-frequency saturation, b = length normalization.
  const k1 = 1.5;
  const b = 0.75;

  // Build the per-document term-frequency tables.
  const docs = [];
  let totalLen = 0;
  for (const cand of candidates) {
    const pathToks = tokenize(cand.file || '');
    const toks = tokenize((cand.sigs || []).join(' '));
    for (let rep = 0; rep < PATH_BOOST; rep++) toks.push(...pathToks);

    const tf = new Map();
    for (const t of toks) tf.set(t, (tf.get(t) || 0) + 1);

    docs.push({ cand, tf, len: toks.length });
    totalLen += toks.length;
  }

  const N = docs.length || 1;
  const avgdl = totalLen / N || 1;

  // Document frequency: in how many documents each term occurs.
  const df = new Map();
  for (const d of docs) {
    for (const t of d.tf.keys()) df.set(t, (df.get(t) || 0) + 1);
  }

  // Each distinct query term contributes once.
  const qToks = [...new Set(tokenize(query))];

  const ranked = [];
  for (const d of docs) {
    let score = 0;
    for (const t of qToks) {
      const f = d.tf.get(t);
      if (!f) continue;
      const dfT = df.get(t);
      const idf = Math.log(1 + (N - dfT + 0.5) / (dfT + 0.5));
      score += (idf * (f * (k1 + 1))) / (f + k1 * (1 - b + (b * d.len) / avgdl));
    }
    ranked.push(Object.assign({}, d.cand, { score }));
  }

  ranked.sort((x, y) => y.score - x.score || String(x.file).localeCompare(String(y.file)));
  return ranked;
}
13910
+
13911
+ module.exports = { tokenize, stem, bm25rank, PATH_BOOST, STOP };
13912
+
13913
+ };
13914
+
13421
13915
  // ── ./src/retrieval/ranker ──
13422
13916
  __factories["./src/retrieval/ranker"] = function(module, exports) {
13423
13917
 
@@ -13440,6 +13934,7 @@ __factories["./src/retrieval/ranker"] = function(module, exports) {
13440
13934
 
13441
13935
  const { loadWeights } = __require('./src/learning/weights');
13442
13936
  const { tokenize, STOP_WORDS } = __require('./src/retrieval/tokenizer');
13937
+ const { bm25rank } = __require('./src/retrieval/bm25');
13443
13938
 
13444
13939
  // ---------------------------------------------------------------------------
13445
13940
  // Default weights
@@ -13618,11 +14113,24 @@ __factories["./src/retrieval/ranker"] = function(module, exports) {
13618
14113
  return all.slice(0, topK);
13619
14114
  }
13620
14115
 
14116
+ // Identifier-aware BM25 base relevance over the whole index (#395). BM25
14117
+ // splits camelCase/snake_case, stems, and boosts path tokens, so queries
14118
+ // whose terms live inside identifiers (e.g. "component emit" → componentEmits)
14119
+ // are matched. The existing negative-signal penalty and recency/graph/learned
14120
+ // boosts are layered on top; the per-token signals stay for the explain table.
14121
+ const bm25Scores = new Map();
14122
+ for (const c of bm25rank(query, [...sigIndex.entries()].map(([file, sigs]) => ({ file, sigs })))) {
14123
+ bm25Scores.set(c.file, c.score);
14124
+ }
14125
+
13621
14126
  const scored = [];
13622
14127
  for (const [file, sigs] of sigIndex.entries()) {
13623
14128
  const result = scoreFile(file, sigs, queryTokens, weights);
13624
- let score = result.score;
14129
+ const penalty = result.signals.penalty;
14130
+ const base = bm25Scores.get(file) || 0;
14131
+ let score = base * penalty;
13625
14132
  const signals = result.signals;
14133
+ signals.bm25 = base;
13626
14134
 
13627
14135
  // Recency boost
13628
14136
  if (recencySet && recencySet.has(file) && score > 0) {
@@ -16524,7 +17032,7 @@ function __tryGit(args, opts = {}) {
16524
17032
  catch (_) { return ''; }
16525
17033
  }
16526
17034
 
16527
- const VERSION = '7.30.0';
17035
+ const VERSION = '8.0.0';
16528
17036
  const MARKER = '\n\n## Auto-generated signatures\n<!-- Updated by gen-context.js -->\n';
16529
17037
 
16530
17038
  function requireSourceOrBundled(key) {