@toolbaux/guardian 0.1.22 → 0.1.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/adapters/runner.js +72 -3
- package/dist/adapters/typescript-adapter.js +24 -10
- package/dist/benchmarking/metrics/context-coverage.js +82 -0
- package/dist/benchmarking/metrics/drift-score.js +104 -0
- package/dist/benchmarking/metrics/search-recall.js +207 -0
- package/dist/benchmarking/metrics/token-efficiency.js +79 -0
- package/dist/benchmarking/report.js +131 -0
- package/dist/benchmarking/runner.js +175 -0
- package/dist/benchmarking/types.js +13 -0
- package/dist/cli.js +53 -10
- package/dist/commands/benchmark.js +62 -0
- package/dist/commands/discrepancy.js +1 -1
- package/dist/commands/doc-generate.js +1 -1
- package/dist/commands/doc-html.js +1 -1
- package/dist/commands/extract.js +1 -1
- package/dist/commands/feature-context.js +1 -1
- package/dist/commands/init.js +1 -0
- package/dist/commands/intel.js +47 -1
- package/dist/commands/mcp-serve.js +48 -321
- package/dist/commands/search.js +602 -14
- package/dist/db/file-specs-store.js +174 -0
- package/dist/db/fts-builder.js +305 -0
- package/dist/db/index.js +55 -0
- package/dist/db/specs-store.js +13 -0
- package/dist/db/sqlite-specs-store.js +441 -0
- package/dist/extract/codebase-intel.js +31 -2
- package/dist/extract/compress.js +70 -3
- package/dist/extract/context-block.js +11 -2
- package/dist/extract/function-intel.js +5 -2
- package/dist/extract/index.js +1 -23
- package/dist/extract/writer.js +6 -0
- package/package.json +3 -1
|
@@ -0,0 +1,441 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SqliteSpecsStore — SQLite implementation of SpecsStore.
|
|
3
|
+
*
|
|
4
|
+
* Stores everything that was previously scattered across .specs/machine/*.json
|
|
5
|
+
* and .specs/human/*.md into a single guardian.db file.
|
|
6
|
+
*
|
|
7
|
+
* Schema:
|
|
8
|
+
* specs — blob storage for all machine intelligence files
|
|
9
|
+
* docs — human-readable doc sections (markdown)
|
|
10
|
+
* metrics_log — append-only event log (replaces mcp-metrics.jsonl)
|
|
11
|
+
* search_fts — FTS5 virtual table built from specs content (extra index)
|
|
12
|
+
*
|
|
13
|
+
* Tier gating is stored per-row; the caller passes a tier filter when reading.
|
|
14
|
+
* This is the foundation for the pro/enterprise access control layer.
|
|
15
|
+
*/
|
|
16
|
+
import Database from "better-sqlite3";
|
|
17
|
+
import path from "node:path";
|
|
18
|
+
/**
 * Reduce a file path to the canonical repo-relative form used as the key for
 * paths stored in guardian.db (FTS builder, dep-graph builder, search query).
 *
 * Three heuristics are tried in order:
 *   1. "<repo>/src/..."            → "src/..."
 *   2. "<name>/<name>/..."         → "<name>/..."   (returns immediately)
 *   3. "<repo>/<known-dir>/..."    → "<known-dir>/..."
 *
 * Examples:
 *   "flask-full/src/flask/sessions.py"       → "src/flask/sessions.py"
 *   "django/django/contrib/auth.py"          → "django/contrib/auth.py"
 *   "sqlalchemy/lib/sqlalchemy/sql/base.py"  → "lib/sqlalchemy/sql/base.py"
 *
 * @param {string} p Path using "/" separators.
 * @returns {string} The normalised path; unchanged when no heuristic applies.
 */
export function normPath(p) {
    // Heuristic 1: collapse a leading "<repo>/src/" into "src/".
    const rebased = p.replace(/^[^/]+\/src\//, "src/");
    // Heuristic 2: the clone directory and the package namespace share a name.
    const doubled = rebased.match(/^([^/]+)\/\1\//);
    if (doubled) {
        return rebased.slice(doubled[1].length + 1);
    }
    // Heuristic 3: drop the first segment when a well-known source directory follows it.
    const hasRepoPrefix = /^[^/]+\/(?:lib|examples|pkg|packages|apps|internal|cmd|src)\//i.test(rebased);
    return hasRepoPrefix ? rebased.slice(rebased.indexOf("/") + 1) : rebased;
}
|
|
40
|
+
/**
 * Break snake_case and camelCase identifiers into space-separated lowercase
 * words so that the FTS tokenizer sees each word on its own.
 *
 *   getUserById → "get user by id"
 *   auth_service → "auth service"
 *   HTTPServer   → "http server"
 *
 * @param {string} s Text that may contain identifiers.
 * @returns {string} Lowercased text with word boundaries made explicit.
 */
function splitIdentifiers(s) {
    const withoutUnderscores = s.replace(/_/g, " ");
    // lower→Upper transition: "getUser" → "get User"
    const camelSplit = withoutUnderscores.replace(/([a-z])([A-Z])/g, "$1 $2");
    // Acronym followed by a capitalised word: "HTTPServer" → "HTTP Server"
    const acronymSplit = camelSplit.replace(/([A-Z]+)([A-Z][a-z])/g, "$1 $2");
    return acronymSplit.toLowerCase();
}
|
|
53
|
+
/** File name of the SQLite database that SqliteSpecsStore opens inside its store directory. */
export const DB_FILENAME = "guardian.db";
|
|
54
|
+
/**
 * SQLite implementation of SpecsStore backed by a single database file
 * (`DB_FILENAME`) inside `storeDir`.
 *
 * Tables created by `_migrate()`:
 *   specs        — named spec blobs (name, format, content, tier)
 *   docs         — human-readable doc sections
 *   metrics_log  — append-only event log
 *   file_deps    — import edges (file → imports) for graph-aware reranking
 *   search_fts   — FTS5 index over extracted codebase content
 *
 * `init()` must be called before any other method; all reads and writes go
 * through the synchronous better-sqlite3 handle stored in `this.db`.
 */
export class SqliteSpecsStore {
    storeDir;
    db;
    /**
     * @param {string} storeDir Directory in which the database file lives.
     */
    constructor(storeDir) {
        this.storeDir = storeDir;
    }
    /** Open the database, enable WAL journaling, and create/upgrade the schema. */
    async init() {
        const dbPath = path.join(this.storeDir, DB_FILENAME);
        this.db = new Database(dbPath);
        this.db.pragma("journal_mode = WAL");
        this.db.pragma("synchronous = NORMAL");
        this._migrate();
    }
    /** Close the database handle if one was opened. */
    async close() {
        this.db?.close();
    }
    // ── Spec blobs ─────────────────────────────────────────────────────────────
    /**
     * @param {string} name Spec name (primary key).
     * @returns {Promise<object|null>} The full `specs` row, or null when absent.
     */
    async readSpec(name) {
        const row = this.db
            .prepare("SELECT * FROM specs WHERE name = ?")
            .get(name);
        return row ?? null;
    }
    /**
     * Insert or overwrite a spec blob; `updated_at` is set to the current time.
     *
     * @param {string} name
     * @param {string} content
     * @param {string} format
     * @param {string} [tier="free"]
     */
    async writeSpec(name, content, format, tier = "free") {
        this.db
            .prepare(`
      INSERT INTO specs (name, format, content, tier, updated_at)
      VALUES (?, ?, ?, ?, ?)
      ON CONFLICT(name) DO UPDATE SET
        content    = excluded.content,
        format     = excluded.format,
        tier       = excluded.tier,
        updated_at = excluded.updated_at
    `)
            .run(name, format, content, tier, Date.now());
    }
    /** @returns {Promise<string[]>} All spec names, sorted by name. */
    async listSpecs() {
        const rows = this.db
            .prepare("SELECT name FROM specs ORDER BY name")
            .all();
        return rows.map((r) => r.name);
    }
    /** @returns {Promise<boolean>} Whether a spec with this name exists. */
    async hasSpec(name) {
        const row = this.db
            .prepare("SELECT 1 FROM specs WHERE name = ?")
            .get(name);
        return !!row;
    }
    // ── Human docs ─────────────────────────────────────────────────────────────
    /** @returns {Promise<object|null>} The full `docs` row for `id`, or null. */
    async readDoc(id) {
        const row = this.db
            .prepare("SELECT * FROM docs WHERE id = ?")
            .get(id);
        return row ?? null;
    }
    /**
     * Insert or overwrite a doc section; `updated_at` is set to the current time.
     *
     * @param {{id: string, section: string, title: string, body: string, tier: string}} entry
     */
    async writeDoc(entry) {
        this.db
            .prepare(`
      INSERT INTO docs (id, section, title, body, tier, updated_at)
      VALUES (?, ?, ?, ?, ?, ?)
      ON CONFLICT(id) DO UPDATE SET
        section    = excluded.section,
        title      = excluded.title,
        body       = excluded.body,
        tier       = excluded.tier,
        updated_at = excluded.updated_at
    `)
            .run(entry.id, entry.section, entry.title, entry.body, entry.tier, Date.now());
    }
    /**
     * @param {string} [section] When given, only rows of that section are returned.
     * @returns {Promise<object[]>} Doc rows ordered by id (or by section, id).
     */
    async listDocs(section) {
        if (section) {
            return this.db
                .prepare("SELECT * FROM docs WHERE section = ? ORDER BY id")
                .all(section);
        }
        return this.db
            .prepare("SELECT * FROM docs ORDER BY section, id")
            .all();
    }
    // ── Metrics log ────────────────────────────────────────────────────────────
    /** Append one event; `payload` is stored as its JSON serialisation. */
    async appendMetric(event, payload) {
        this.db
            .prepare("INSERT INTO metrics_log (ts, event, payload) VALUES (?, ?, ?)")
            .run(Date.now(), event, JSON.stringify(payload));
    }
    /** @returns {Promise<object[]>} The newest `limit` metric rows, newest first. */
    async readMetrics(limit = 1000) {
        return this.db
            .prepare("SELECT * FROM metrics_log ORDER BY id DESC LIMIT ?")
            .all(limit);
    }
    // ── FTS search (extra index, no equivalent in FileSpecsStore) ─────────────
    /**
     * Replace the FTS5 search index with `rows`.
     *
     * Symbol names and endpoints are expanded with splitIdentifiers() so that
     * "getUserById" is indexed as "get user by id".
     *
     * The DELETE and all INSERTs run in one transaction, so a failure part-way
     * through leaves the previous index intact instead of an empty one.
     * Missing `symbol_name`, `endpoint`, `body` or `module` are indexed as "".
     *
     * bm25() column weights used by the search methods, in column order:
     *   file_path 1.0, symbol_name 0.5, endpoint 0.7, body 1.0, module 0.6
     * NOTE(review): per the FTS5 docs a larger weight makes matches in that
     * column count for more — confirm these values express the intended priority.
     *
     * @param {Array<{file_path: string, symbol_name?: string, endpoint?: string, body?: string, module?: string}>} rows
     */
    rebuildSearchIndex(rows) {
        const clear = this.db.prepare("DELETE FROM search_fts");
        const insert = this.db.prepare("INSERT INTO search_fts (file_path, symbol_name, endpoint, body, module) VALUES (?, ?, ?, ?, ?)");
        const replaceAll = this.db.transaction((items) => {
            clear.run();
            for (const r of items) {
                insert.run(r.file_path, splitIdentifiers(r.symbol_name ?? ""), splitIdentifiers(r.endpoint ?? ""), r.body ?? "", r.module ?? "");
            }
        });
        replaceAll(rows);
    }
    /**
     * BM25-ranked full-text search over the indexed content.
     *
     * Query tokens are OR-ed together. Any database error is treated as
     * "no results" (best-effort search).
     *
     * @param {string} query Natural-language or identifier query.
     * @param {number} [limit=20]
     * @returns {Array<{file_path: string, symbol_name: string, rank: number}>}
     */
    searchFTS(query, limit = 20) {
        const tokens = this._buildTokens(query);
        if (tokens.length === 0)
            return [];
        const ftsQuery = tokens.join(" OR ");
        try {
            return this.db
                .prepare(`
          SELECT file_path, symbol_name,
                 bm25(search_fts, 1.0, 0.5, 0.7, 1.0, 0.6) AS rank
          FROM search_fts
          WHERE search_fts MATCH ?
          ORDER BY rank
          LIMIT ?
        `)
                .all(ftsQuery, limit);
        }
        catch {
            return [];
        }
    }
    /**
     * Score how well a query maps to indexed codebase content.
     *
     * Three signals, each in [0, 1], combined as a weighted sum:
     *   token_coverage  0.35 — fraction of non-stop-word tokens that hit a source file
     *   joint_strength  0.45 — BM25 strength of the best file matching ALL tokens
     *                          (falls back to the best ANY-token match, capped at 0.5)
     *   clustering      0.20 — 1 when the top source hits share a directory, 0 when
     *                          every hit is in a different one
     *
     * @param {string} query
     * @returns {{score: number, confidence: "high"|"medium"|"low", reason: string}}
     *   `score` is rounded to two decimals; confidence is "high" at >= 0.55 and
     *   "medium" at >= 0.25.
     */
    querySignal(query) {
        const tokens = this._buildTokens(query);
        if (tokens.length === 0) {
            return { score: 0, confidence: "low", reason: "query produced no searchable tokens" };
        }
        // Generic English / commit-message words that carry no code signal.
        const STOP = new Set([
            "the", "and", "for", "are", "but", "not", "you", "all", "can", "had", "her", "was", "one", "our",
            "out", "day", "get", "has", "him", "his", "how", "its", "let", "may", "new", "now", "old", "see",
            "two", "use", "way", "who", "did", "man", "say", "she", "than", "then", "them", "these", "they",
            "this", "will", "with", "have", "from", "that", "been", "each", "into", "like", "make", "more",
            "other", "over", "same", "such", "take", "when", "your", "also", "back", "came", "come", "does",
            "even", "find", "give", "good", "here", "just", "keep", "kind", "last", "left", "life", "long",
            "much", "must", "name", "need", "next", "only", "open", "own", "part", "plan", "play", "put",
            "read", "real", "said", "show", "side", "some", "tell", "time", "very", "well", "went", "what",
            "work", "year", "change", "update", "remove", "add", "fix", "file", "files",
        ]);
        // Tokens carry a trailing "*" (prefix match); strip it before the stop-word lookup.
        const domainTokens = tokens.filter((t) => !STOP.has(t.replace(/\*$/, "")));
        // Only hits in real source files count — config/build files are ignored.
        const SOURCE_EXT_RE = /\.(py|ts|tsx|js|jsx|go|java|cs|rb|rs|cpp|c|php|swift|kt)$/;
        const isSource = (row) => SOURCE_EXT_RE.test(row.file_path);
        // ── Signal 1: domain token coverage ──────────────────────────────────
        let domainHits = 0;
        try {
            // Prepared once and reused for every token.
            const probe = this.db.prepare("SELECT file_path FROM search_fts WHERE search_fts MATCH ? LIMIT 5");
            for (const tok of domainTokens) {
                try {
                    if (probe.all(tok).some(isSource))
                        domainHits++;
                }
                catch { /* token rejected by FTS5 — counts as a miss */ }
            }
        }
        catch { /* index unavailable — coverage stays 0 */ }
        const tokenCoverage = domainTokens.length > 0 ? domainHits / domainTokens.length : 0;
        // ── Signal 2: joint match strength ───────────────────────────────────
        // AND finds a file containing ALL domain tokens; co-occurrence in one
        // file means the query is specific rather than coincidental.
        let jointStrength = 0;
        if (domainTokens.length > 0) {
            const topMatch = (ftsQuery) => {
                try {
                    return this.db.prepare(`
            SELECT bm25(search_fts, 1.0, 0.5, 0.7, 1.0, 0.6) AS rank, file_path
            FROM search_fts WHERE search_fts MATCH ? ORDER BY rank LIMIT 1
          `).get(ftsQuery);
                }
                catch {
                    return undefined;
                }
            };
            const joint = topMatch(domainTokens.join(" AND "));
            if (joint) {
                if (isSource(joint)) {
                    // bm25 is negative (more negative = better): -rank/8 clamped to
                    // [0, 1], so any rank <= -8 saturates at 1.
                    jointStrength = Math.min(1, Math.max(0, -joint.rank / 8));
                }
            }
            else {
                // No joint match. An AND query that matches nothing returns no row
                // rather than throwing, so the fallback must be triggered here.
                const loose = topMatch(domainTokens.join(" OR "));
                if (loose && isSource(loose)) {
                    // OR match is a weaker signal — half the scale, capped at 0.5.
                    jointStrength = Math.min(0.5, Math.max(0, -loose.rank / 16));
                }
            }
        }
        // ── Signal 3: result clustering ───────────────────────────────────────
        let clustering = 0;
        try {
            const orQuery = domainTokens.length > 0 ? domainTokens.join(" OR ") : tokens.join(" OR ");
            const rows = this.db.prepare(`
        SELECT file_path FROM search_fts WHERE search_fts MATCH ? ORDER BY bm25(search_fts) LIMIT 5
      `).all(orQuery);
            const srcRows = rows.filter(isSource);
            if (srcRows.length > 1) {
                const dirs = srcRows.map((r) => r.file_path.split("/").slice(0, -1).join("/"));
                const unique = new Set(dirs).size;
                // 1 when every hit shares a directory, 0 when all directories differ.
                clustering = 1 - (unique - 1) / Math.max(srcRows.length - 1, 1);
            }
            else if (srcRows.length === 1) {
                clustering = 1;
            }
        }
        catch { /* skip */ }
        const score = tokenCoverage * 0.35 + jointStrength * 0.45 + clustering * 0.2;
        const confidence = score >= 0.55 ? "high" : score >= 0.25 ? "medium" : "low";
        const noCodeTokens = domainTokens.length === 0;
        const reason = noCodeTokens
            ? "query contains only generic English words, no code-domain terms"
            : tokenCoverage < 0.3
                ? `only ${Math.round(tokenCoverage * 100)}% of domain tokens match indexed source files`
                : jointStrength < 0.15
                    ? "tokens don't co-occur in any single source file — query is too generic"
                    : clustering < 0.3
                        ? "matching files scatter across unrelated modules — query is ambiguous"
                        : `${Math.round(tokenCoverage * 100)}% domain coverage, strong co-occurrence match`;
        return { score: Math.round(score * 100) / 100, confidence, reason };
    }
    // ── Private ────────────────────────────────────────────────────────────────
    /**
     * Build the FTS5 prefix-token list ("word*") for a natural-language query.
     *
     * Identifiers are split into words, every character outside [a-z0-9] is
     * stripped, and tokens shorter than two characters after stripping are
     * dropped. Stripping happens BEFORE the length check so that punctuation-only
     * input such as "--" cannot produce a bare "*", which FTS5 rejects and which
     * would make the whole MATCH query fail.
     *
     * @param {string} query
     * @returns {string[]}
     */
    _buildTokens(query) {
        return splitIdentifiers(query)
            .split(/\s+/)
            .map((t) => t.replace(/[^a-z0-9]/g, ""))
            .filter((t) => t.length > 1)
            .map((t) => `${t}*`);
    }
    /** Create all tables and indexes if missing; recreate search_fts when its schema is outdated. */
    _migrate() {
        this.db.exec(`
      CREATE TABLE IF NOT EXISTS specs (
        name       TEXT PRIMARY KEY,
        format     TEXT NOT NULL,
        content    TEXT NOT NULL,
        tier       TEXT NOT NULL DEFAULT 'free',
        updated_at INTEGER NOT NULL
      );

      CREATE TABLE IF NOT EXISTS docs (
        id         TEXT PRIMARY KEY,
        section    TEXT NOT NULL,
        title      TEXT NOT NULL,
        body       TEXT NOT NULL,
        tier       TEXT NOT NULL DEFAULT 'free',
        updated_at INTEGER NOT NULL
      );

      CREATE INDEX IF NOT EXISTS docs_section ON docs(section);

      CREATE TABLE IF NOT EXISTS metrics_log (
        id      INTEGER PRIMARY KEY AUTOINCREMENT,
        ts      INTEGER NOT NULL,
        event   TEXT NOT NULL,
        payload TEXT NOT NULL
      );

      CREATE INDEX IF NOT EXISTS metrics_log_ts ON metrics_log(ts);

      CREATE TABLE IF NOT EXISTS file_deps (
        file    TEXT NOT NULL,
        imports TEXT NOT NULL,
        PRIMARY KEY (file, imports)
      );

      CREATE INDEX IF NOT EXISTS file_deps_reverse ON file_deps(imports);
    `);
        // FTS5 table — recreate if the module column is missing (virtual tables
        // cannot be ALTERed). search_fts is always rebuilt on extract, so
        // drop + recreate loses nothing that cannot be regenerated.
        const existing = this.db
            .prepare("SELECT sql FROM sqlite_master WHERE type='table' AND name='search_fts'")
            .get();
        if (!existing?.sql?.includes("module")) {
            this.db.exec(`
        DROP TABLE IF EXISTS search_fts;
        CREATE VIRTUAL TABLE search_fts USING fts5(
          file_path,
          symbol_name,
          endpoint,
          body,
          module,
          tokenize='porter unicode61'
        );
      `);
        }
    }
    // ── Dependency graph ────────────────────────────────────────────────────────
    /**
     * Replace all import edges atomically (delete + inserts in one transaction).
     * Duplicate edges are ignored.
     *
     * @param {Iterable<{file: string, imports: string}>} edges
     */
    rebuildDeps(edges) {
        const del = this.db.prepare("DELETE FROM file_deps");
        const ins = this.db.prepare("INSERT OR IGNORE INTO file_deps (file, imports) VALUES (?, ?)");
        this.db.transaction(() => {
            del.run();
            for (const e of edges)
                ins.run(e.file, e.imports);
        })();
    }
    /**
     * BM25 search followed by dependency-graph quality reranking.
     *
     * For each candidate:
     *   authority = used_by / (used_by + imports)      ∈ [0, 1]
     *   quality   = 0.7 + 0.3 * authority              ∈ [0.7, 1.0]
     *   combined  = bm25_rank * quality
     *
     * bm25 is negative (more negative = better), so multiplying by quality < 1
     * moves a file's score toward 0 and ranks it lower. Files that import a lot
     * but are imported by nothing (typically tests/examples) are demoted by at
     * most 30%; files with no dependency data keep quality 1.0, i.e. their
     * plain BM25 rank.
     *
     * NOTE: imports/used_by are transported as comma-joined strings, so a file
     * path that itself contains "," would be split into several entries.
     *
     * @param {string} query
     * @param {number} [limit=5]
     * @returns {Array<{file_path: string, symbol_name: string, rank: number, imports: string[], used_by: string[]}>}
     */
    searchWithGraph(query, limit = 5) {
        const tokens = this._buildTokens(query);
        if (tokens.length === 0)
            return [];
        const ftsQuery = tokens.join(" OR ");
        // Fetch a wider candidate pool so reranking has enough material.
        const candidateLimit = Math.max(limit * 4, 60);
        let rows;
        try {
            rows = this.db.prepare(`
        WITH candidates AS (
          SELECT file_path, symbol_name,
                 bm25(search_fts, 1.0, 0.5, 0.7, 1.0, 0.6) AS rank
          FROM search_fts
          WHERE search_fts MATCH ?
          ORDER BY rank
          LIMIT ?
        )
        SELECT
          c.file_path,
          c.symbol_name,
          c.rank,
          GROUP_CONCAT(DISTINCT d.imports) AS imports_,
          GROUP_CONCAT(DISTINCT r.file)    AS used_by_
        FROM candidates c
        LEFT JOIN file_deps d ON d.file    = c.file_path
        LEFT JOIN file_deps r ON r.imports = c.file_path
        GROUP BY c.file_path, c.symbol_name, c.rank
        ORDER BY c.rank
      `).all(ftsQuery, candidateLimit);
        }
        catch {
            return [];
        }
        const reranked = rows.map((r) => {
            const imports = r.imports_ ? r.imports_.split(",").filter(Boolean) : [];
            const used_by = r.used_by_ ? r.used_by_.split(",").filter(Boolean) : [];
            const usedByN = used_by.length;
            const importsN = imports.length;
            let quality;
            if (usedByN === 0 && importsN === 0) {
                // No dependency data — preserve BM25 rank entirely.
                quality = 1.0;
            }
            else {
                const authority = usedByN / (usedByN + importsN);
                // Gentle nudge: BM25 relevance still dominates; this is a tiebreaker.
                quality = 0.7 + 0.3 * authority;
            }
            const combined = r.rank * quality;
            return { file_path: r.file_path, symbol_name: r.symbol_name, rank: combined, imports, used_by };
        });
        reranked.sort((a, b) => a.rank - b.rank);
        return reranked.slice(0, limit);
    }
}
|
|
@@ -171,8 +171,10 @@ function buildEndpointPatternMap(architecture) {
|
|
|
171
171
|
}
|
|
172
172
|
return result;
|
|
173
173
|
}
|
|
174
|
+
// ── File-based IO (original implementation — unchanged) ────────────────────
|
|
174
175
|
/**
|
|
175
|
-
* Load snapshots and
|
|
176
|
+
* Load snapshots and write codebase-intelligence.json to disk.
|
|
177
|
+
* This is the original file-based implementation, kept intact.
|
|
176
178
|
*/
|
|
177
179
|
export async function writeCodebaseIntelligence(specsDir, outputPath) {
|
|
178
180
|
const machineDir = await resolveMachineInputDir(specsDir);
|
|
@@ -187,9 +189,36 @@ export async function writeCodebaseIntelligence(specsDir, outputPath) {
|
|
|
187
189
|
await fs.writeFile(outputPath, JSON.stringify(intel, null, 2), "utf8");
|
|
188
190
|
}
|
|
189
191
|
/**
|
|
190
|
-
* Load an existing codebase-intelligence.json from
|
|
192
|
+
* Load an existing codebase-intelligence.json from a file path.
|
|
193
|
+
* Original file-based implementation, kept intact.
|
|
191
194
|
*/
|
|
192
195
|
export async function loadCodebaseIntelligence(intelPath) {
    // Read the UTF-8 JSON file at intelPath and return the parsed value.
    return JSON.parse(await fs.readFile(intelPath, "utf8"));
}
|
|
199
|
+
// ── Store-based IO (new — works with both FileSpecsStore and SqliteSpecsStore) ─
|
|
200
|
+
/**
 * Build CodebaseIntelligence from the snapshots held in a SpecsStore and write
 * the result back to the same store as the "codebase-intelligence" spec (JSON).
 *
 * Reads the "architecture.snapshot" and "ux.snapshot" specs, parses both as
 * YAML and feeds them to buildCodebaseIntelligence().
 *
 * @param {object} store Any store exposing readSpec(name) and writeSpec(name, content, format).
 * @throws {Error} When either snapshot is missing from the store.
 */
export async function writeCodebaseIntelligenceViaStore(store) {
    const archSnapshot = await store.readSpec("architecture.snapshot");
    const uxSnapshot = await store.readSpec("ux.snapshot");
    const bothPresent = Boolean(archSnapshot) && Boolean(uxSnapshot);
    if (!bothPresent) {
        throw new Error("architecture.snapshot or ux.snapshot not found in store. Run `guardian extract` first.");
    }
    const intel = buildCodebaseIntelligence(yaml.load(archSnapshot.content), yaml.load(uxSnapshot.content));
    const serialized = JSON.stringify(intel, null, 2);
    await store.writeSpec("codebase-intelligence", serialized, "json");
}
|
|
215
|
+
/**
 * Read the "codebase-intelligence" spec from a SpecsStore and parse it as JSON.
 *
 * @param {object} store Any store exposing readSpec(name).
 * @returns {Promise<object|null>} The parsed intelligence, or null when the
 *   spec has not been written yet.
 */
export async function loadCodebaseIntelligenceViaStore(store) {
    const stored = await store.readSpec("codebase-intelligence");
    return stored ? JSON.parse(stored.content) : null;
}
|
package/dist/extract/compress.js
CHANGED
|
@@ -319,6 +319,7 @@ function buildHeatmapFromGraph(level, nodes, edges, nodeLayers) {
|
|
|
319
319
|
}
|
|
320
320
|
}
|
|
321
321
|
const cycleNodes = findCycleNodes(nodes, adjacency, reverse);
|
|
322
|
+
const pageRank = computePageRank(nodes, adjacency, reverse);
|
|
322
323
|
const degreeValues = nodes.map((node) => (outbound.get(node) ?? 0) + (inbound.get(node) ?? 0));
|
|
323
324
|
const maxDegree = Math.max(1, ...degreeValues);
|
|
324
325
|
const maxCrossRatio = Math.max(1, ...nodes.map((node) => {
|
|
@@ -332,15 +333,22 @@ function buildHeatmapFromGraph(level, nodes, edges, nodeLayers) {
|
|
|
332
333
|
const out = outbound.get(node) ?? 0;
|
|
333
334
|
const crossRatio = out === 0 ? 0 : crossOut / out;
|
|
334
335
|
const cycleFlag = cycleNodes.has(node) ? 1 : 0;
|
|
335
|
-
const
|
|
336
|
-
|
|
337
|
-
|
|
336
|
+
const pr = pageRank.get(node) ?? 0;
|
|
337
|
+
// PageRank (40%) — importance by what depends on this node
|
|
338
|
+
// Degree (30%) — raw connectivity (fallback signal)
|
|
339
|
+
// Cross-layer (20%) — architectural violation risk
|
|
340
|
+
// Cycle (10%) — circular dependency penalty
|
|
341
|
+
const score = 0.4 * pr +
|
|
342
|
+
0.3 * (degree / maxDegree) +
|
|
343
|
+
0.2 * (crossRatio / maxCrossRatio) +
|
|
344
|
+
0.1 * cycleFlag;
|
|
338
345
|
return {
|
|
339
346
|
id: node,
|
|
340
347
|
layer: nodeLayers.get(node) ?? "unknown",
|
|
341
348
|
score: round(score, 4),
|
|
342
349
|
components: {
|
|
343
350
|
degree,
|
|
351
|
+
pagerank: round(pr, 4),
|
|
344
352
|
cross_layer_ratio: round(crossRatio, 4),
|
|
345
353
|
cycle: cycleFlag
|
|
346
354
|
}
|
|
@@ -368,6 +376,65 @@ function resolveDomainForModule(moduleId, domainMap) {
|
|
|
368
376
|
}
|
|
369
377
|
return null;
|
|
370
378
|
}
|
|
379
|
+
/**
 * Iterative PageRank over a directed dependency graph.
 *
 * Edges follow dependency arrows (A imports B → edge A→B) and rank flows along
 * them: B gains rank because A depends on it, so nodes that many other nodes
 * rely on end up with the highest scores.
 *
 * Uses damping 0.85 and a fixed 30 iterations. Nodes with no outgoing edges
 * ("dangling" nodes) spread their rank uniformly over all nodes each round.
 *
 * Importers that are not in `nodes`, or that have no recorded forward edges,
 * contribute nothing through `reverse` (the latter are already handled as
 * dangling nodes); this keeps the result finite even when `adjacency` and
 * `reverse` disagree.
 *
 * @param {Array} nodes Node ids.
 * @param {Map} adjacency Forward edges: importer → array of imported nodes.
 * @param {Map} reverse Backward edges: imported → array of importers.
 * @returns {Map} node → score normalised so that the top-ranked node is 1.
 *   Empty when `nodes` is empty.
 */
function computePageRank(nodes, adjacency, reverse) {
    const N = nodes.length;
    if (N === 0)
        return new Map();
    const DAMPING = 0.85;
    const ITERATIONS = 30;
    const BASE = (1 - DAMPING) / N;
    // Start from a uniform distribution.
    const rank = new Map();
    for (const node of nodes)
        rank.set(node, 1 / N);
    // Out-degree = how many nodes each node imports.
    const outDeg = new Map();
    for (const node of nodes)
        outDeg.set(node, (adjacency.get(node) ?? []).length);
    for (let iter = 0; iter < ITERATIONS; iter++) {
        // Rank held by sink nodes is redistributed evenly across all nodes.
        let danglingMass = 0;
        for (const node of nodes) {
            if ((outDeg.get(node) ?? 0) === 0) {
                danglingMass += (rank.get(node) ?? 0);
            }
        }
        const danglingContrib = DAMPING * danglingMass / N;
        const next = new Map();
        for (const node of nodes) {
            let incoming = 0;
            for (const importer of (reverse.get(node) ?? [])) {
                const share = rank.get(importer);
                const degree = outDeg.get(importer);
                // Unknown importer, or one with zero out-degree: skip it rather
                // than divide by zero and poison every score with Infinity/NaN.
                if (share === undefined || !degree)
                    continue;
                incoming += share / degree;
            }
            next.set(node, BASE + danglingContrib + DAMPING * incoming);
        }
        for (const node of nodes)
            rank.set(node, next.get(node) ?? 0);
    }
    // Normalise relative to the maximum. A plain loop is used instead of
    // Math.max(...values) because spreading a very large array as call
    // arguments can throw a RangeError.
    let max = 1e-10;
    for (const r of rank.values()) {
        if (r > max)
            max = r;
    }
    const normalized = new Map();
    for (const [node, r] of rank.entries())
        normalized.set(node, r / max);
    return normalized;
}
|
|
371
438
|
function findCycleNodes(nodes, adjacency, reverse) {
|
|
372
439
|
const visited = new Set();
|
|
373
440
|
const order = [];
|
|
@@ -29,8 +29,17 @@ export function renderContextBlock(architecture, ux, options) {
|
|
|
29
29
|
}
|
|
30
30
|
lines.push("");
|
|
31
31
|
}
|
|
32
|
-
// Cross-module dependencies
|
|
33
|
-
const
|
|
32
|
+
// Cross-module dependencies (deduplicated)
|
|
33
|
+
const seenEdges = new Set();
|
|
34
|
+
const crossEdges = architecture.dependencies.module_graph.filter(e => {
|
|
35
|
+
if (e.from === e.to)
|
|
36
|
+
return false;
|
|
37
|
+
const key = `${e.from}→${e.to}`;
|
|
38
|
+
if (seenEdges.has(key))
|
|
39
|
+
return false;
|
|
40
|
+
seenEdges.add(key);
|
|
41
|
+
return true;
|
|
42
|
+
});
|
|
34
43
|
if (crossEdges.length > 0) {
|
|
35
44
|
lines.push("### Module Dependencies");
|
|
36
45
|
for (const edge of crossEdges.slice(0, 10)) {
|
|
@@ -160,8 +160,10 @@ async function listSourceFiles(dir, config, results = []) {
|
|
|
160
160
|
* Scan one or more project roots, run adapters on every source file, and
|
|
161
161
|
* return the aggregated FunctionIntelligence index.
|
|
162
162
|
*/
|
|
163
|
-
export async function buildFunctionIntelligenceFromRoots(roots, config) {
|
|
163
|
+
export async function buildFunctionIntelligenceFromRoots(roots, config, projectRoot) {
|
|
164
164
|
const allFunctions = [];
|
|
165
|
+
// Relativize against project root if provided; otherwise fall back to the scan root
|
|
166
|
+
const baseDir = projectRoot ?? roots[0];
|
|
165
167
|
for (const root of roots) {
|
|
166
168
|
const files = await listSourceFiles(root, config);
|
|
167
169
|
await Promise.all(files.map(async (filePath) => {
|
|
@@ -177,7 +179,8 @@ export async function buildFunctionIntelligenceFromRoots(roots, config) {
|
|
|
177
179
|
}
|
|
178
180
|
try {
|
|
179
181
|
const result = runAdapter(adapter, filePath, source);
|
|
180
|
-
|
|
182
|
+
const relPath = path.relative(baseDir, filePath);
|
|
183
|
+
allFunctions.push(...result.functions.map(fn => ({ ...fn, file: relPath })));
|
|
181
184
|
}
|
|
182
185
|
catch {
|
|
183
186
|
// Skip files that fail to parse (malformed source, encoding issues)
|
package/dist/extract/index.js
CHANGED
|
@@ -191,8 +191,7 @@ export async function extractProject(options) {
|
|
|
191
191
|
// Generate Function Intelligence — call graph, literal index across all languages.
|
|
192
192
|
// Runs as an additive second pass; never modifies the architecture snapshot.
|
|
193
193
|
try {
|
|
194
|
-
const
|
|
195
|
-
const funcIntel = await buildFunctionIntelligenceFromRoots(allRoots, config);
|
|
194
|
+
const funcIntel = await buildFunctionIntelligenceFromRoots([projectRoot], config, projectRoot);
|
|
196
195
|
await writeFunctionIntelligence(layout.machineDir, funcIntel);
|
|
197
196
|
}
|
|
198
197
|
catch (err) {
|
|
@@ -421,27 +420,6 @@ function mergeFrontendAnalyses(results, _roots, _workspaceRoot) {
|
|
|
421
420
|
tests: results.flatMap(r => r.tests)
|
|
422
421
|
};
|
|
423
422
|
}
|
|
424
|
-
function findCommonRoot(paths) {
|
|
425
|
-
if (paths.length === 0) {
|
|
426
|
-
return process.cwd();
|
|
427
|
-
}
|
|
428
|
-
const splitPaths = paths.map((entry) => path.resolve(entry).split(path.sep));
|
|
429
|
-
const minLength = Math.min(...splitPaths.map((parts) => parts.length));
|
|
430
|
-
const shared = [];
|
|
431
|
-
for (let i = 0; i < minLength; i += 1) {
|
|
432
|
-
const segment = splitPaths[0][i];
|
|
433
|
-
if (splitPaths.every((parts) => parts[i] === segment)) {
|
|
434
|
-
shared.push(segment);
|
|
435
|
-
}
|
|
436
|
-
else {
|
|
437
|
-
break;
|
|
438
|
-
}
|
|
439
|
-
}
|
|
440
|
-
if (shared.length === 0) {
|
|
441
|
-
return path.parse(paths[0]).root;
|
|
442
|
-
}
|
|
443
|
-
return shared.join(path.sep);
|
|
444
|
-
}
|
|
445
423
|
async function loadPreviousSnapshots(machineDir, rootDir) {
|
|
446
424
|
const result = {};
|
|
447
425
|
const candidates = [
|