npm - sigmap - Versions diffs - 7.22.2 → 7.24.0 - Mend

sigmap 7.22.2 → 7.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/CHANGELOG.md +18 -0
package/gen-context.js +173 -2
package/llms-full.txt +1 -1
package/llms.txt +1 -1
package/package.json +1 -1
package/packages/cli/package.json +1 -1
package/packages/core/package.json +1 -1
package/src/eval/llm-ablation.js +30 -1
package/src/mcp/server.js +1 -1

package/CHANGELOG.md CHANGED Viewed

@@ -10,6 +10,24 @@ Format: [Semantic Versioning](https://semver.org/)
 ---
+## [7.24.0] — 2026-06-19
+Minor release — redesign the §9 ablation corpus so it measures grounding, not guard precision.
+### Changed
+- **§9 ablation corpus → checkable repo-fact questions (#356):** the previous "write a minimal example that requires X" tasks inherently elicited placeholder scaffolding (`src/main.js`, `minimal.js`, real modules referenced by basename), which a string-based guard cannot distinguish from claimed repo files — so the metric measured guard precision, not grounding. A 100-task run confirmed grounding drives *genuine* invented-file hallucinations to ~0 while with-arm scaffolding noise masked it (9 → 7). `scripts/gen-ablation-corpus.mjs` now generates fact questions — *"which file defines `<name>`, and what are its parameters?"* — where a wrong file path is an unambiguous, checkable hallucination and the prompt forbids example code. The grounded arm (given exact signatures grouped by file) answers correctly; the ungrounded arm must guess. Task ids `call-` → `fact-`; 100 real-symbol tasks; a regression test pins the methodology. Run: `npm run benchmark:llm-ablation -- --runs 5 --save`.
+---
+## [7.23.0] — 2026-06-19
+Minor release — make the §9 LLM ablation produce a statistically stable number.
+### Added
+- **§9 ablation: `--runs N` averaging + 100-task corpus (#353):** the cleaned-guard §9 result is directionally clear (grounding cuts flagged codebase-fact errors ~13 → 3 per 100) but at N=40 with single-digit raw counts a single pass is noisy. `scripts/run-llm-ablation.mjs` gains `--runs N` (default 1) that runs the full task set N times with **fresh model calls per pass** and prints a mean ± [min–max] summary; `src/eval/llm-ablation.js` adds a pure, unit-tested `aggregateRuns(aggregates[])` (mean/min/max of without/with per-100 and delta). The committed corpus (`benchmarks/llm-ablation-tasks.json`) expands from 40 to **100** real-symbol tasks (`gen-ablation-corpus.mjs` default 40 → 100) for a tighter single-run estimate. The network touch stays confined to `scripts/`; the offline harness is unchanged. Run the robust headline with `npm run benchmark:llm-ablation -- --runs 5 --save`.
+---
 ## [7.22.2] — 2026-06-19
 Patch release — clears the two remaining `verify-ai-output` false-positive classes surfaced by the §9 ablation.

package/gen-context.js CHANGED Viewed

@@ -32,6 +32,177 @@ function __require(key) {
 // ── ./src/conventions/report ──
 // ── ./src/conventions/ci ──
 // ── ./src/eval/llm-ablation ──
+__factories["./src/eval/llm-ablation"] = function(module, exports) {
+  /**
+   * LLM A/B hallucination ablation (IMPL.md §9) — the honest measurement.
+   *
+   * Runs a model twice per task — (A) no SigMap context, (B) with SigMap
+   * grounding — pipes both outputs through the hallucination guard, and reports
+   * the measured delta in flagged codebase-fact errors. The model call is
+   * INJECTED (`complete(prompt) → text`), so the harness itself is pure and
+   * offline-testable; the live model adapter lives in `scripts/run-llm-ablation.mjs`.
+   * Zero-dependency, bundle-safe (no network here).
+   */
+  const { verify } = __require('./src/verify/hallucination-guard');
+  const path = require('path');
+  /** Coerce `sig` to a string, drop a trailing line anchor (` :12` or ` :12-20`) with its surrounding whitespace, then trim — keeps prompts clean. */
+  function _cleanSig(sig) {
+    return String(sig).replace(/\s*:\d+(?:-\d+)?\s*$/, '').trim();
+  }
+  /**
+   * Build the SigMap grounding block for a repo — what we prepend to a task
+   * prompt in arm B. Conventions (the house style) + **exact signatures** grouped
+   * by file (what `get_callee_signatures` returns), so the model references the
+   * real surface instead of guessing — the actual product behavior, not a flat
+   * name dump. Both sections are best-effort: a failing step is skipped, not thrown.
+   * @param {string} cwd repo root; indexed file paths are rendered relative to it
+   * @param {object} [opts]
+   * @param {number} [opts.maxSignatures=150] cap on signature lines (bounds prompt size)
+   * @returns {string} the built sections joined by a blank line; '' when none could be built
+   */
+  function buildGrounding(cwd, opts = {}) {
+    const maxSignatures = opts.maxSignatures != null ? opts.maxSignatures : 150;
+    const parts = [];
+    let index = null;
+    try {
+      const { buildSigIndex } = __require('./src/retrieval/ranker');
+      index = buildSigIndex(cwd);
+    } catch (_) {} // best-effort: `index` stays null if the ranker is unavailable or throws
+    try {
+      const { extractConventions } = __require('./src/conventions/extract');
+      const { renderConventionsBlock } = __require('./src/conventions/inject');
+      const files = index ? [...index.keys()] : []; // no index → conventions are extracted from an empty file list
+      parts.push(renderConventionsBlock(extractConventions(cwd, files)));
+    } catch (_) {} // best-effort: the conventions section is simply omitted on failure
+    if (index) {
+      const lines = ['## Exact signatures (use these — do not invent symbols or paths)'];
+      let count = 0;
+      for (const [file, sigs] of index) {
+        if (count >= maxSignatures) break; // cap reached: stop before emitting another file heading
+        const rel = path.relative(cwd, file).replace(/\\/g, '/');
+        const clean = (sigs || []).map(_cleanSig).filter(Boolean);
+        if (!clean.length) continue; // no usable signatures → no empty heading for this file
+        lines.push(`### ${rel}`);
+        for (const s of clean) {
+          if (count >= maxSignatures) break;
+          lines.push(s);
+          count++; // only signature lines count toward the cap, headings do not
+        }
+      }
+      if (count > 0) parts.push(lines.join('\n')); // omit the whole section when no signature was emitted
+    }
+    return parts.join('\n\n');
+  }
+  /**
+   * Score an answer: flagged codebase-fact errors + the issue list (the §9 metric).
+   * @param {string} answerText model output; a falsy value is scored as the empty string
+   * @param {string} cwd passed through unchanged to the hallucination guard's `verify`
+   * @returns {{ total: number, issues: object[] }} zero / empty list when `verify` throws
+   */
+  function scoreAnswerDetail(answerText, cwd) {
+    try {
+      const { issues, summary } = verify(String(answerText || ''), cwd);
+      return { total: summary.total || 0, issues: issues || [] };
+    } catch (_) { // a guard failure is scored as zero flags — indistinguishable from a clean answer
+      return { total: 0, issues: [] };
+    }
+  }
+  /** Count flagged codebase-fact errors in an answer (the §9 metric) — the `total` field of `scoreAnswerDetail`, without the issue list. */
+  function scoreAnswer(answerText, cwd) {
+    return scoreAnswerDetail(answerText, cwd).total;
+  }
+  /**
+   * Run the A/B ablation over a task corpus: each task is completed twice (bare prompt, then grounding-prefixed prompt) and both answers are scored.
+   * @param {Array<{id:string, prompt:string}>} tasks a null/undefined corpus is treated as empty
+   * @param {string} cwd repo root handed to the scorer and, when no grounding is supplied, to `buildGrounding`
+   * @param {(prompt:string, meta:object)=>string} complete injected model call; invoked synchronously and its result coerced with `String`, so it must return text (not a Promise)
+   * @param {object} [opts]
+   * @param {string} [opts.grounding] precomputed grounding (else built from cwd)
+   * @param {boolean} [opts.collectIssues] attach `aIssues`/`bIssues` per task
+   * @returns {{ tasks: object[], aggregate: object }} per-task rows plus totals, `delta` (without − with) and per-100 rates
+   */
+  function runAblation(tasks, cwd, complete, opts = {}) {
+    const grounding = opts.grounding != null ? opts.grounding : buildGrounding(cwd); // an explicit '' is kept as-is and does not trigger a rebuild
+    const rows = [];
+    let sumA = 0;
+    let sumB = 0;
+    for (const task of tasks || []) {
+      const basePrompt = task.prompt || '';
+      const groundedPrompt = grounding ? `${grounding}\n\n---\n\n${basePrompt}` : basePrompt; // empty grounding → both arms receive the identical prompt
+      const outA = String(complete(basePrompt, { id: task.id, grounded: false }) || '');
+      const outB = String(complete(groundedPrompt, { id: task.id, grounded: true }) || '');
+      const a = scoreAnswerDetail(outA, cwd);
+      const b = scoreAnswerDetail(outB, cwd);
+      sumA += a.total;
+      sumB += b.total;
+      const row = { id: task.id, aFlagged: a.total, bFlagged: b.total };
+      if (opts.collectIssues) { row.aIssues = a.issues; row.bIssues = b.issues; }
+      rows.push(row);
+    }
+    const n = rows.length;
+    const per100 = (sum) => (n > 0 ? (sum / n) * 100 : 0); // flags per 100 tasks; 0 for an empty corpus (avoids 0/0)
+    return {
+      tasks: rows,
+      aggregate: {
+        n,
+        withoutFlagged: sumA,
+        withFlagged: sumB,
+        delta: sumA - sumB, // positive when the grounded arm was flagged less often
+        withoutPer100: per100(sumA),
+        withPer100: per100(sumB),
+      },
+    };
+  }
+  /** Arithmetic mean, minimum and maximum of a number list; an empty list yields all zeros. */
+  function _stats(nums) {
+    if (!nums.length) return { mean: 0, min: 0, max: 0 }; // guard: Math.min()/Math.max() with no arguments give ±Infinity and the mean would be 0/0
+    const sum = nums.reduce((a, b) => a + b, 0);
+    return { mean: sum / nums.length, min: Math.min(...nums), max: Math.max(...nums) };
+  }
+  /**
+   * Aggregate several `runAblation` passes into a stable estimate — mean ± range
+   * of the without/with per-100 flag rates and their delta. At N=40 with tiny raw
+   * counts a single pass is noisy; averaging repeated passes gives a publishable
+   * number with an honest spread. Falsy entries are ignored; no passes → all-zero stats.
+   * @param {object[]} aggregates the `.aggregate` object from each runAblation pass
+   * @returns {{ runs:number, n:number, withoutPer100:object, withPer100:object, deltaPer100:object }} each `*Per100` is `{ mean, min, max }`
+   */
+  function aggregateRuns(aggregates) {
+    const runs = (aggregates || []).filter(Boolean);
+    const without = runs.map((a) => a.withoutPer100);
+    const withG = runs.map((a) => a.withPer100);
+    const delta = runs.map((a) => a.withoutPer100 - a.withPer100); // delta is taken per pass, so its min/max are per-pass extremes
+    return {
+      runs: runs.length,
+      n: runs.length ? runs[0].n : 0, // task count is read from the first pass only — presumably every pass uses the same corpus
+      withoutPer100: _stats(without),
+      withPer100: _stats(withG),
+      deltaPer100: _stats(delta),
+    };
+  }
+  module.exports = { buildGrounding, scoreAnswer, scoreAnswerDetail, runAblation, aggregateRuns };
+};
 // ── ./src/conventions/fix ──
 // ── ./src/conventions/update ──
 // ── ./src/scaffold/persist ──
@@ -7931,7 +8102,7 @@ __factories["./src/mcp/server"] = function(module, exports) {
   const SERVER_INFO = {
     name: 'sigmap',
-    version: '7.22.2',
+    version: '7.24.0',
     description: 'SigMap MCP server — code signatures on demand',
   };
@@ -13634,7 +13805,7 @@ function __tryGit(args, opts = {}) {
   catch (_) { return ''; }
 }
-const VERSION = '7.22.2';
+const VERSION = '7.24.0';
 const MARKER = '\n\n## Auto-generated signatures\n<!-- Updated by gen-context.js -->\n';
 function requireSourceOrBundled(key) {

package/llms-full.txt CHANGED Viewed

@@ -9,7 +9,7 @@ the files relevant to the task — cutting tokens ~97% while keeping answers
 grounded. Deterministic, offline, no embeddings or vector database. Works with
 Claude, Cursor, GitHub Copilot, Aider, Windsurf, local LLMs, and MCP.
-# Version: 7.22.2 | Benchmark: sigmap-v7.0-main (2026-06-19)
+# Version: 7.24.0 | Benchmark: sigmap-v7.0-main (2026-06-19)
 # Source: auto-generated from package.json, version.json, src/mcp/tools.js, src/config/defaults.js
 # Regenerate: npm run generate:llms   |   Validate: npm run validate:llms

package/llms.txt CHANGED Viewed

@@ -9,7 +9,7 @@ the files relevant to the task — cutting tokens ~97% while keeping answers
 grounded. Deterministic, offline, no embeddings or vector database. Works with
 Claude, Cursor, GitHub Copilot, Aider, Windsurf, local LLMs, and MCP.
-# Version: 7.22.2 | Benchmark: sigmap-v7.0-main (2026-06-19)
+# Version: 7.24.0 | Benchmark: sigmap-v7.0-main (2026-06-19)
 # Source: auto-generated from package.json, version.json, src/mcp/tools.js, src/config/defaults.js
 # Regenerate: npm run generate:llms   |   Validate: npm run validate:llms

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "sigmap",
-  "version": "7.22.2",
+  "version": "7.24.0",
   "description": "97% token reduction for AI coding. Extracts function & class signatures with TF-IDF ranking to feed only the right files to Claude, Cursor, Copilot, Aider, Windsurf, local LLMs & MCP. Zero dependencies, runs offline via npx.",
   "main": "packages/core/index.js",
   "exports": {

package/packages/cli/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "sigmap-cli",
-  "version": "7.22.2",
+  "version": "7.24.0",
   "description": "SigMap CLI wrapper — thin adapter for programmatic CLI invocation",
   "main": "index.js",
   "keywords": [

package/packages/core/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "sigmap-core",
-  "version": "7.22.2",
+  "version": "7.24.0",
   "description": "SigMap core library — zero-dependency code signature extraction, retrieval, and security scanning",
   "main": "index.js",
   "keywords": [

package/src/eval/llm-ablation.js CHANGED Viewed

@@ -136,4 +136,33 @@ function runAblation(tasks, cwd, complete, opts = {}) {
   };
 }
-module.exports = { buildGrounding, scoreAnswer, scoreAnswerDetail, runAblation };
+/** Arithmetic mean, minimum and maximum of a number list; an empty list yields all zeros. */
+function _stats(nums) {
+  if (!nums.length) return { mean: 0, min: 0, max: 0 }; // guard: Math.min()/Math.max() with no arguments give ±Infinity and the mean would be 0/0
+  const sum = nums.reduce((a, b) => a + b, 0);
+  return { mean: sum / nums.length, min: Math.min(...nums), max: Math.max(...nums) };
+}
+/**
+ * Aggregate several `runAblation` passes into a stable estimate — mean ± range
+ * of the without/with per-100 flag rates and their delta. At N=40 with tiny raw
+ * counts a single pass is noisy; averaging repeated passes gives a publishable
+ * number with an honest spread. Falsy entries are ignored; no passes → all-zero stats.
+ * @param {object[]} aggregates the `.aggregate` object from each runAblation pass
+ * @returns {{ runs:number, n:number, withoutPer100:object, withPer100:object, deltaPer100:object }} each `*Per100` is `{ mean, min, max }`
+ */
+function aggregateRuns(aggregates) {
+  const runs = (aggregates || []).filter(Boolean);
+  const without = runs.map((a) => a.withoutPer100);
+  const withG = runs.map((a) => a.withPer100);
+  const delta = runs.map((a) => a.withoutPer100 - a.withPer100); // delta is taken per pass, so its min/max are per-pass extremes
+  return {
+    runs: runs.length,
+    n: runs.length ? runs[0].n : 0, // task count is read from the first pass only — presumably every pass uses the same corpus
+    withoutPer100: _stats(without),
+    withPer100: _stats(withG),
+    deltaPer100: _stats(delta),
+  };
+}
+module.exports = { buildGrounding, scoreAnswer, scoreAnswerDetail, runAblation, aggregateRuns };

package/src/mcp/server.js CHANGED Viewed

@@ -18,7 +18,7 @@ const { readContext, searchSignatures, getMap, createCheckpoint, getRouting, exp
 const SERVER_INFO = {
   name: 'sigmap',
-  version: '7.22.2',
+  version: '7.24.0',
   description: 'SigMap MCP server — code signatures on demand',
 };