sigmap 7.22.2 → 7.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/gen-context.js +173 -2
- package/llms-full.txt +1 -1
- package/llms.txt +1 -1
- package/package.json +1 -1
- package/packages/cli/package.json +1 -1
- package/packages/core/package.json +1 -1
- package/src/eval/llm-ablation.js +30 -1
- package/src/mcp/server.js +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -10,6 +10,24 @@ Format: [Semantic Versioning](https://semver.org/)
|
|
|
10
10
|
|
|
11
11
|
---
|
|
12
12
|
|
|
13
|
+
## [7.24.0] — 2026-06-19
|
|
14
|
+
|
|
15
|
+
Minor release — redesign the §9 ablation corpus so it measures grounding, not guard precision.
|
|
16
|
+
|
|
17
|
+
### Changed
|
|
18
|
+
- **§9 ablation corpus → checkable repo-fact questions (#356):** the previous "write a minimal example that requires X" tasks inherently elicited placeholder scaffolding (`src/main.js`, `minimal.js`, real modules referenced by basename), which a string-based guard cannot distinguish from claimed repo files — so the metric measured guard precision, not grounding. A 100-task run confirmed grounding drives *genuine* invented-file hallucinations to ~0 while with-arm scaffolding noise masked it (9 → 7). `scripts/gen-ablation-corpus.mjs` now generates fact questions — *"which file defines `<name>`, and what are its parameters?"* — where a wrong file path is an unambiguous, checkable hallucination and the prompt forbids example code. The grounded arm (given exact signatures grouped by file) answers correctly; the ungrounded arm must guess. Task ids `call-` → `fact-`; 100 real-symbol tasks; a regression test pins the methodology. Run: `npm run benchmark:llm-ablation -- --runs 5 --save`.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## [7.23.0] — 2026-06-19
|
|
23
|
+
|
|
24
|
+
Minor release — make the §9 LLM ablation produce a statistically stable number.
|
|
25
|
+
|
|
26
|
+
### Added
|
|
27
|
+
- **§9 ablation: `--runs N` averaging + 100-task corpus (#353):** the cleaned-guard §9 result is directionally clear (grounding cuts flagged codebase-fact errors ~13 → 3 per 100) but at N=40 with single-digit raw counts a single pass is noisy. `scripts/run-llm-ablation.mjs` gains `--runs N` (default 1) that runs the full task set N times with **fresh model calls per pass** and prints a mean ± [min–max] summary; `src/eval/llm-ablation.js` adds a pure, unit-tested `aggregateRuns(aggregates[])` (mean/min/max of without/with per-100 and delta). The committed corpus (`benchmarks/llm-ablation-tasks.json`) expands from 40 to **100** real-symbol tasks (`gen-ablation-corpus.mjs` default 40 → 100) for a tighter single-run estimate. The network touch stays confined to `scripts/`; the offline harness is unchanged. Run the robust headline with `npm run benchmark:llm-ablation -- --runs 5 --save`.
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
13
31
|
## [7.22.2] — 2026-06-19
|
|
14
32
|
|
|
15
33
|
Patch release — clears the two remaining `verify-ai-output` false-positive classes surfaced by the §9 ablation.
|
package/gen-context.js
CHANGED
|
@@ -32,6 +32,177 @@ function __require(key) {
|
|
|
32
32
|
// ── ./src/conventions/report ──
|
|
33
33
|
// ── ./src/conventions/ci ──
|
|
34
34
|
// ── ./src/eval/llm-ablation ──
|
|
35
|
+
__factories["./src/eval/llm-ablation"] = function(module, exports) {
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* LLM A/B hallucination ablation (IMPL.md §9) — the honest measurement.
|
|
39
|
+
*
|
|
40
|
+
* Runs a model twice per task — (A) no SigMap context, (B) with SigMap
|
|
41
|
+
* grounding — pipes both outputs through the hallucination guard, and reports
|
|
42
|
+
* the measured delta in flagged codebase-fact errors. The model call is
|
|
43
|
+
* INJECTED (`complete(prompt) → text`), so the harness itself is pure and
|
|
44
|
+
* offline-testable; the live model adapter lives in `scripts/run-llm-ablation.mjs`.
|
|
45
|
+
* Zero-dependency, bundle-safe (no network here).
|
|
46
|
+
*/
|
|
47
|
+
|
|
48
|
+
const { verify } = __require('./src/verify/hallucination-guard');
|
|
49
|
+
|
|
50
|
+
const path = require('path');
|
|
51
|
+
|
|
52
|
+
/** Strip a signature's trailing line anchor (` :12-20`) for prompt cleanliness. */
|
|
53
|
+
function _cleanSig(sig) {
|
|
54
|
+
return String(sig).replace(/\s*:\d+(?:-\d+)?\s*$/, '').trim();
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
/**
|
|
58
|
+
* Build the SigMap grounding block for a repo — what we prepend to a task
|
|
59
|
+
* prompt in arm B. Conventions (the house style) + **exact signatures** grouped
|
|
60
|
+
* by file (what `get_callee_signatures` returns), so the model references the
|
|
61
|
+
* real surface instead of guessing — the actual product behavior, not a flat
|
|
62
|
+
* name dump.
|
|
63
|
+
* @param {string} cwd
|
|
64
|
+
* @param {object} [opts]
|
|
65
|
+
* @param {number} [opts.maxSignatures=150] cap on signature lines (bounds prompt size)
|
|
66
|
+
* @returns {string}
|
|
67
|
+
*/
|
|
68
|
+
function buildGrounding(cwd, opts = {}) {
|
|
69
|
+
const maxSignatures = opts.maxSignatures != null ? opts.maxSignatures : 150;
|
|
70
|
+
const parts = [];
|
|
71
|
+
|
|
72
|
+
let index = null;
|
|
73
|
+
try {
|
|
74
|
+
const { buildSigIndex } = __require('./src/retrieval/ranker');
|
|
75
|
+
index = buildSigIndex(cwd);
|
|
76
|
+
} catch (_) {}
|
|
77
|
+
|
|
78
|
+
try {
|
|
79
|
+
const { extractConventions } = __require('./src/conventions/extract');
|
|
80
|
+
const { renderConventionsBlock } = __require('./src/conventions/inject');
|
|
81
|
+
const files = index ? [...index.keys()] : [];
|
|
82
|
+
parts.push(renderConventionsBlock(extractConventions(cwd, files)));
|
|
83
|
+
} catch (_) {}
|
|
84
|
+
|
|
85
|
+
if (index) {
|
|
86
|
+
const lines = ['## Exact signatures (use these — do not invent symbols or paths)'];
|
|
87
|
+
let count = 0;
|
|
88
|
+
for (const [file, sigs] of index) {
|
|
89
|
+
if (count >= maxSignatures) break;
|
|
90
|
+
const rel = path.relative(cwd, file).replace(/\\/g, '/');
|
|
91
|
+
const clean = (sigs || []).map(_cleanSig).filter(Boolean);
|
|
92
|
+
if (!clean.length) continue;
|
|
93
|
+
lines.push(`### ${rel}`);
|
|
94
|
+
for (const s of clean) {
|
|
95
|
+
if (count >= maxSignatures) break;
|
|
96
|
+
lines.push(s);
|
|
97
|
+
count++;
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
if (count > 0) parts.push(lines.join('\n'));
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
return parts.join('\n\n');
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
/**
|
|
107
|
+
* Score an answer: flagged codebase-fact errors + the issue list (the §9 metric).
|
|
108
|
+
* @param {string} answerText
|
|
109
|
+
* @param {string} cwd
|
|
110
|
+
* @returns {{ total: number, issues: object[] }}
|
|
111
|
+
*/
|
|
112
|
+
function scoreAnswerDetail(answerText, cwd) {
|
|
113
|
+
try {
|
|
114
|
+
const { issues, summary } = verify(String(answerText || ''), cwd);
|
|
115
|
+
return { total: summary.total || 0, issues: issues || [] };
|
|
116
|
+
} catch (_) {
|
|
117
|
+
return { total: 0, issues: [] };
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
/** Count flagged codebase-fact errors in an answer (the §9 metric). */
|
|
122
|
+
function scoreAnswer(answerText, cwd) {
|
|
123
|
+
return scoreAnswerDetail(answerText, cwd).total;
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
/**
|
|
127
|
+
* Run the A/B ablation over a task corpus.
|
|
128
|
+
* @param {Array<{id:string, prompt:string}>} tasks
|
|
129
|
+
* @param {string} cwd
|
|
130
|
+
* @param {(prompt:string, meta:object)=>string} complete injected model call
|
|
131
|
+
* @param {object} [opts]
|
|
132
|
+
* @param {string} [opts.grounding] precomputed grounding (else built from cwd)
|
|
133
|
+
* @param {boolean} [opts.collectIssues] attach `aIssues`/`bIssues` per task
|
|
134
|
+
* @returns {{ tasks: object[], aggregate: object }}
|
|
135
|
+
*/
|
|
136
|
+
function runAblation(tasks, cwd, complete, opts = {}) {
|
|
137
|
+
const grounding = opts.grounding != null ? opts.grounding : buildGrounding(cwd);
|
|
138
|
+
const rows = [];
|
|
139
|
+
let sumA = 0;
|
|
140
|
+
let sumB = 0;
|
|
141
|
+
|
|
142
|
+
for (const task of tasks || []) {
|
|
143
|
+
const basePrompt = task.prompt || '';
|
|
144
|
+
const groundedPrompt = grounding ? `${grounding}\n\n---\n\n${basePrompt}` : basePrompt;
|
|
145
|
+
|
|
146
|
+
const outA = String(complete(basePrompt, { id: task.id, grounded: false }) || '');
|
|
147
|
+
const outB = String(complete(groundedPrompt, { id: task.id, grounded: true }) || '');
|
|
148
|
+
|
|
149
|
+
const a = scoreAnswerDetail(outA, cwd);
|
|
150
|
+
const b = scoreAnswerDetail(outB, cwd);
|
|
151
|
+
sumA += a.total;
|
|
152
|
+
sumB += b.total;
|
|
153
|
+
const row = { id: task.id, aFlagged: a.total, bFlagged: b.total };
|
|
154
|
+
if (opts.collectIssues) { row.aIssues = a.issues; row.bIssues = b.issues; }
|
|
155
|
+
rows.push(row);
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
const n = rows.length;
|
|
159
|
+
const per100 = (sum) => (n > 0 ? (sum / n) * 100 : 0);
|
|
160
|
+
return {
|
|
161
|
+
tasks: rows,
|
|
162
|
+
aggregate: {
|
|
163
|
+
n,
|
|
164
|
+
withoutFlagged: sumA,
|
|
165
|
+
withFlagged: sumB,
|
|
166
|
+
delta: sumA - sumB,
|
|
167
|
+
withoutPer100: per100(sumA),
|
|
168
|
+
withPer100: per100(sumB),
|
|
169
|
+
},
|
|
170
|
+
};
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
/** mean/min/max of a number list (0s for an empty list). */
|
|
174
|
+
function _stats(nums) {
|
|
175
|
+
if (!nums.length) return { mean: 0, min: 0, max: 0 };
|
|
176
|
+
const sum = nums.reduce((a, b) => a + b, 0);
|
|
177
|
+
return { mean: sum / nums.length, min: Math.min(...nums), max: Math.max(...nums) };
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
/**
|
|
181
|
+
* Aggregate several `runAblation` passes into a stable estimate — mean ± range
|
|
182
|
+
* of the without/with per-100 flag rates and their delta. At N=40 with tiny raw
|
|
183
|
+
* counts a single pass is noisy; averaging repeated passes gives a publishable
|
|
184
|
+
* number with an honest spread.
|
|
185
|
+
* @param {object[]} aggregates the `.aggregate` object from each runAblation pass
|
|
186
|
+
* @returns {{ runs:number, n:number, withoutPer100:object, withPer100:object, deltaPer100:object }}
|
|
187
|
+
*/
|
|
188
|
+
function aggregateRuns(aggregates) {
|
|
189
|
+
const runs = (aggregates || []).filter(Boolean);
|
|
190
|
+
const without = runs.map((a) => a.withoutPer100);
|
|
191
|
+
const withG = runs.map((a) => a.withPer100);
|
|
192
|
+
const delta = runs.map((a) => a.withoutPer100 - a.withPer100);
|
|
193
|
+
return {
|
|
194
|
+
runs: runs.length,
|
|
195
|
+
n: runs.length ? runs[0].n : 0,
|
|
196
|
+
withoutPer100: _stats(without),
|
|
197
|
+
withPer100: _stats(withG),
|
|
198
|
+
deltaPer100: _stats(delta),
|
|
199
|
+
};
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
module.exports = { buildGrounding, scoreAnswer, scoreAnswerDetail, runAblation, aggregateRuns };
|
|
203
|
+
|
|
204
|
+
};
|
|
205
|
+
|
|
35
206
|
// ── ./src/conventions/fix ──
|
|
36
207
|
// ── ./src/conventions/update ──
|
|
37
208
|
// ── ./src/scaffold/persist ──
|
|
@@ -7931,7 +8102,7 @@ __factories["./src/mcp/server"] = function(module, exports) {
|
|
|
7931
8102
|
|
|
7932
8103
|
const SERVER_INFO = {
|
|
7933
8104
|
name: 'sigmap',
|
|
7934
|
-
version: '7.22.2',
|
|
8105
|
+
version: '7.24.0',
|
|
7935
8106
|
description: 'SigMap MCP server — code signatures on demand',
|
|
7936
8107
|
};
|
|
7937
8108
|
|
|
@@ -13634,7 +13805,7 @@ function __tryGit(args, opts = {}) {
|
|
|
13634
13805
|
catch (_) { return ''; }
|
|
13635
13806
|
}
|
|
13636
13807
|
|
|
13637
|
-
const VERSION = '7.22.2';
|
|
13808
|
+
const VERSION = '7.24.0';
|
|
13638
13809
|
const MARKER = '\n\n## Auto-generated signatures\n<!-- Updated by gen-context.js -->\n';
|
|
13639
13810
|
|
|
13640
13811
|
function requireSourceOrBundled(key) {
|
package/llms-full.txt
CHANGED
|
@@ -9,7 +9,7 @@ the files relevant to the task — cutting tokens ~97% while keeping answers
|
|
|
9
9
|
grounded. Deterministic, offline, no embeddings or vector database. Works with
|
|
10
10
|
Claude, Cursor, GitHub Copilot, Aider, Windsurf, local LLMs, and MCP.
|
|
11
11
|
|
|
12
|
-
# Version: 7.22.2 | Benchmark: sigmap-v7.0-main (2026-06-19)
|
|
12
|
+
# Version: 7.24.0 | Benchmark: sigmap-v7.0-main (2026-06-19)
|
|
13
13
|
# Source: auto-generated from package.json, version.json, src/mcp/tools.js, src/config/defaults.js
|
|
14
14
|
# Regenerate: npm run generate:llms | Validate: npm run validate:llms
|
|
15
15
|
|
package/llms.txt
CHANGED
|
@@ -9,7 +9,7 @@ the files relevant to the task — cutting tokens ~97% while keeping answers
|
|
|
9
9
|
grounded. Deterministic, offline, no embeddings or vector database. Works with
|
|
10
10
|
Claude, Cursor, GitHub Copilot, Aider, Windsurf, local LLMs, and MCP.
|
|
11
11
|
|
|
12
|
-
# Version: 7.22.2 | Benchmark: sigmap-v7.0-main (2026-06-19)
|
|
12
|
+
# Version: 7.24.0 | Benchmark: sigmap-v7.0-main (2026-06-19)
|
|
13
13
|
# Source: auto-generated from package.json, version.json, src/mcp/tools.js, src/config/defaults.js
|
|
14
14
|
# Regenerate: npm run generate:llms | Validate: npm run validate:llms
|
|
15
15
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "sigmap",
|
|
3
|
-
"version": "7.22.2",
|
|
3
|
+
"version": "7.24.0",
|
|
4
4
|
"description": "97% token reduction for AI coding. Extracts function & class signatures with TF-IDF ranking to feed only the right files to Claude, Cursor, Copilot, Aider, Windsurf, local LLMs & MCP. Zero dependencies, runs offline via npx.",
|
|
5
5
|
"main": "packages/core/index.js",
|
|
6
6
|
"exports": {
|
package/src/eval/llm-ablation.js
CHANGED
|
@@ -136,4 +136,33 @@ function runAblation(tasks, cwd, complete, opts = {}) {
|
|
|
136
136
|
};
|
|
137
137
|
}
|
|
138
138
|
|
|
139
|
-
|
|
139
|
+
/** mean/min/max of a number list (0s for an empty list). */
|
|
140
|
+
function _stats(nums) {
|
|
141
|
+
if (!nums.length) return { mean: 0, min: 0, max: 0 };
|
|
142
|
+
const sum = nums.reduce((a, b) => a + b, 0);
|
|
143
|
+
return { mean: sum / nums.length, min: Math.min(...nums), max: Math.max(...nums) };
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
/**
|
|
147
|
+
* Aggregate several `runAblation` passes into a stable estimate — mean ± range
|
|
148
|
+
* of the without/with per-100 flag rates and their delta. At N=40 with tiny raw
|
|
149
|
+
* counts a single pass is noisy; averaging repeated passes gives a publishable
|
|
150
|
+
* number with an honest spread.
|
|
151
|
+
* @param {object[]} aggregates the `.aggregate` object from each runAblation pass
|
|
152
|
+
* @returns {{ runs:number, n:number, withoutPer100:object, withPer100:object, deltaPer100:object }}
|
|
153
|
+
*/
|
|
154
|
+
function aggregateRuns(aggregates) {
|
|
155
|
+
const runs = (aggregates || []).filter(Boolean);
|
|
156
|
+
const without = runs.map((a) => a.withoutPer100);
|
|
157
|
+
const withG = runs.map((a) => a.withPer100);
|
|
158
|
+
const delta = runs.map((a) => a.withoutPer100 - a.withPer100);
|
|
159
|
+
return {
|
|
160
|
+
runs: runs.length,
|
|
161
|
+
n: runs.length ? runs[0].n : 0,
|
|
162
|
+
withoutPer100: _stats(without),
|
|
163
|
+
withPer100: _stats(withG),
|
|
164
|
+
deltaPer100: _stats(delta),
|
|
165
|
+
};
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
module.exports = { buildGrounding, scoreAnswer, scoreAnswerDetail, runAblation, aggregateRuns };
|
package/src/mcp/server.js
CHANGED