@toolbaux/guardian 0.1.21 → 0.1.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -230,8 +230,8 @@ npm install && npm run build && npm link
230
230
  ```bash
231
231
  guardian init # config, .specs dir, pre-commit hook, CLAUDE.md
232
232
  guardian extract # full architecture + UX snapshots + docs
233
+ guardian extract --backend sqlite # same + builds guardian.db with FTS index
233
234
  guardian generate --ai-context # compact ~3K token AI context only
234
- guardian intel # build codebase-intelligence.json
235
235
  ```
236
236
 
237
237
  ### Search & Context
@@ -15,14 +15,26 @@ export function runAdapter(adapter, file, source) {
15
15
  }
16
16
  return { endpoints: [], models: [], components: [], tests: [], functions: [] };
17
17
  }
18
- // tree-sitter native binding throws "Invalid argument" for very large files.
19
- // Skip files over 1 MB to avoid silent crashes; they are rare in practice.
18
+ // tree-sitter's native binding throws "Invalid argument" for files with high AST
19
+ // complexity; this can happen well below 1 MB for deeply-nested source files.
20
+ // Parse defensively: try the whole file first, then fall back to chunked parsing
21
+ // if tree-sitter throws. Chunks are split at top-level definition boundaries so
22
+ // each piece is syntactically self-contained.
20
23
  if (source.length > 1_000_000) {
21
24
  return { endpoints: [], models: [], components: [], tests: [], functions: [] };
22
25
  }
23
26
  const parser = new Parser();
24
27
  parser.setLanguage(adapter.language);
25
- const tree = parser.parse(source);
28
+ let tree;
29
+ try {
30
+ tree = parser.parse(source);
31
+ }
32
+ catch {
33
+ // File is too complex for a single parse — split at top-level definitions and
34
+ // merge results. Each chunk is a run of lines from one top-level def/class to
35
+ // the next, so it is syntactically valid on its own.
36
+ return runAdapterChunked(adapter, file, source, parser);
37
+ }
26
38
  if (adapter.extract) {
27
39
  const result = adapter.extract(file, source, tree.rootNode);
28
40
  return {
@@ -94,3 +106,60 @@ export function runAdapter(adapter, file, source) {
94
106
  }
95
107
  return { endpoints, models, components, tests, functions: [] };
96
108
  }
109
/**
 * Fallback for files that tree-sitter can't parse as a whole.
 *
 * Splits `source` at lines that begin a top-level definition: zero or more
 * modifiers ("export", "default", "async", "public", "private", "protected",
 * "static", "abstract", "pub", "unsafe"), each followed by whitespace, and then
 * one of "def ", "class ", "fn ", "func ", "function ", "interface ",
 * "struct ", "enum ", "impl " or "type ". Consecutive segments are packed into
 * chunks of roughly CHUNK_BYTES characters, each chunk is parsed and handed to
 * `adapter.extract`, and the per-chunk results are concatenated.
 *
 * NOTE(review): every chunk is parsed on its own, so positions the adapter
 * derives from the tree are presumably relative to the chunk, not the file.
 *
 * @param {object} adapter - language adapter; only `adapter.extract` is used here
 * @param {string} file - file path, passed through to `adapter.extract`
 * @param {string} source - full text of the file
 * @param {object} parser - object whose `parse(text)` returns `{ rootNode }`
 * @returns {{endpoints: Array, models: Array, components: Array, tests: Array, functions: Array}}
 *   merged results; empty arrays when the adapter has no `extract`
 */
function runAdapterChunked(adapter, file, source, parser) {
    const merged = {
        endpoints: [], models: [], components: [], tests: [], functions: [],
    };
    if (!adapter.extract)
        return merged;
    // Every modifier must be followed by whitespace; without it, lines such as
    // "public class Foo" or "async def bar" are never recognised as boundaries.
    const TOP_DEF = /^(?:(?:pub|unsafe|private|protected|public|static|async|abstract|export|default)\s+)*(?:def |class |fn |func |function |interface |struct |enum |impl |type )/;
    const lines = source.split("\n");
    // Line indices where a segment starts; the final entry is the end sentinel.
    const splitPoints = [0];
    for (let i = 1; i < lines.length; i++) {
        if (TOP_DEF.test(lines[i]))
            splitPoints.push(i);
    }
    splitPoints.push(lines.length);
    // Group segments into chunks of up to ~25 KB to stay within parser limits.
    const CHUNK_BYTES = 25_000;
    let chunkBytes = 0;
    let chunkLines = [];
    const flushChunk = () => {
        if (chunkLines.length === 0)
            return;
        const chunk = chunkLines.join("\n");
        chunkLines = [];
        chunkBytes = 0;
        try {
            const tree = parser.parse(chunk);
            const result = adapter.extract(file, chunk, tree.rootNode);
            // Tolerate adapters that omit a category instead of dropping the whole chunk.
            merged.endpoints.push(...(result.endpoints ?? []));
            merged.models.push(...(result.models ?? []));
            merged.components.push(...(result.components ?? []));
            merged.tests.push(...(result.tests ?? []));
            merged.functions.push(...(result.functions ?? []));
        }
        catch {
            // Best effort: a chunk that still cannot be parsed/extracted is skipped.
        }
    };
    for (let s = 0; s < splitPoints.length - 1; s++) {
        const segLines = lines.slice(splitPoints[s], splitPoints[s + 1]);
        const segLength = segLines.join("\n").length;
        // Start a new chunk before this segment would push the current one over
        // the limit; a single oversized segment is still emitted as one chunk.
        if (chunkBytes + segLength > CHUNK_BYTES && chunkLines.length > 0) {
            flushChunk();
        }
        chunkLines.push(...segLines);
        chunkBytes += segLength;
    }
    flushChunk();
    return merged;
}
@@ -1,16 +1,6 @@
1
1
  import TypeScript from "tree-sitter-typescript";
2
2
  import Parser from "tree-sitter";
3
3
  import path from "node:path";
4
- // Utility to recursively find children of a certain type
5
- function findChildren(node, type) {
6
- const results = [];
7
- if (node.type === type)
8
- results.push(node);
9
- for (const child of node.namedChildren) {
10
- results.push(...findChildren(child, type));
11
- }
12
- return results;
13
- }
14
4
  // ── Function-level intelligence helpers ──────────────────────────────────
15
5
  /** Walk all descendants depth-first. */
16
6
  function* walkAll(node) {
@@ -98,6 +88,30 @@ function extractTsFunctions(file, source, node) {
98
88
  isAsync = valN.children.some((c) => c.type === "async");
99
89
  }
100
90
  }
91
+ else if (n.type === "interface_declaration" ||
92
+ n.type === "type_alias_declaration" ||
93
+ n.type === "class_declaration" ||
94
+ n.type === "abstract_class_declaration" ||
95
+ n.type === "enum_declaration") {
96
+ // Type-level declarations: interfaces, types, classes, enums.
97
+ // These are the primary symbols in .d.ts files and typed source files.
98
+ const nameN = n.childForFieldName("name");
99
+ if (nameN) {
100
+ const name = getText(nameN);
101
+ records.push({
102
+ id: `${file}#${name}:${n.startPosition.row + 1}`,
103
+ name,
104
+ file,
105
+ lines: [n.startPosition.row + 1, n.endPosition.row + 1],
106
+ calls: [],
107
+ stringLiterals: [],
108
+ regexPatterns: [],
109
+ isAsync: false,
110
+ language: "typescript",
111
+ });
112
+ }
113
+ // Still recurse to catch methods inside classes
114
+ }
101
115
  if (funcName && bodyNode) {
102
116
  const intel = collectBodyIntel(bodyNode, getText);
103
117
  records.push({
@@ -0,0 +1,82 @@
1
+ /**
2
+ * Context Coverage Metric
3
+ *
4
+ * Measures how well guardian_context covers the modules and files
5
+ * relevant to a benchmark task.
6
+ *
7
+ * Method:
8
+ * 1. Read architecture-context.md from the specs dir
9
+ * 2. For each ground-truth file, check if its basename or containing module
10
+ * is mentioned anywhere in the context block
11
+ * 3. For modules: check if the module ID appears (e.g. "src/auth", "auth")
12
+ *
13
+ * A coverage of 1.0 means every ground-truth file/module appears in the context.
14
+ */
15
+ import path from "node:path";
16
+ import fs from "node:fs/promises";
17
/**
 * Measure how much of a task's ground truth is mentioned in
 * `<specsDir>/machine/architecture-context.md`.
 *
 * Only the text between the `guardian:context` markers is inspected when the
 * markers are present; otherwise the whole file is used. Matching is
 * case-insensitive substring matching.
 *
 * @param {{specsDir: string, groundTruthFiles: string[]}} params
 * @returns {Promise<{coverage: number, modules_mentioned: string[],
 *   modules_missing: string[], files_mentioned: number, files_total: number}>}
 *   `coverage` is mentioned modules / distinct modules, rounded to 3 decimals
 *   (0 when there are no modules or the context file cannot be read).
 */
export async function measureContextCoverage(params) {
    const { specsDir, groundTruthFiles } = params;
    // Distinct module IDs (parent directories) of the ground-truth files.
    const allModuleIds = [...new Set(groundTruthFiles.map(moduleIdFor))];
    const contextPath = path.join(specsDir, "machine", "architecture-context.md");
    let contextText = "";
    try {
        const raw = await fs.readFile(contextPath, "utf8");
        // Extract the guardian:context block for fair comparison
        const match = raw.match(/<!-- guardian:context[^>]*-->([\s\S]*?)<!-- \/guardian:context -->/);
        contextText = (match ? match[1] : raw).toLowerCase();
    }
    catch {
        // No readable context file — zero coverage, every module is missing.
        return {
            coverage: 0,
            modules_mentioned: [],
            modules_missing: allModuleIds,
            files_mentioned: 0,
            files_total: groundTruthFiles.length,
        };
    }
    // ── Check file coverage ──────────────────────────────────────────────────
    let filesMentioned = 0;
    for (const f of groundTruthFiles) {
        const basename = path.basename(f.replace(/\\/g, "/")).toLowerCase();
        // path.parse keeps dotfiles intact (".env" → ".env"); stripping the
        // extension by regex would leave "" which every string "includes".
        const stem = path.parse(basename).name;
        if (contextText.includes(basename) || (stem !== "" && contextText.includes(stem))) {
            filesMentioned++;
        }
    }
    // ── Check module coverage ────────────────────────────────────────────────
    const modulesMentioned = [];
    const modulesMissing = [];
    for (const modId of allModuleIds) {
        // A module counts as mentioned when its full ID ("src/auth") or its
        // final segment ("auth") appears. Leading segments such as "src" are
        // shared by most modules and would make nearly everything match.
        // NOTE(review): files at the repo root yield the ID ".", which matches
        // any context containing a dot — confirm whether that is intended.
        const segments = modId.split("/").filter(Boolean);
        const leaf = segments.length > 0 ? segments[segments.length - 1].toLowerCase() : "";
        const mentioned = contextText.includes(modId.toLowerCase()) ||
            (leaf !== "" && contextText.includes(leaf));
        if (mentioned) {
            modulesMentioned.push(modId);
        }
        else {
            modulesMissing.push(modId);
        }
    }
    const coverage = allModuleIds.length > 0
        ? round(modulesMentioned.length / allModuleIds.length)
        : 0;
    return {
        coverage,
        modules_mentioned: modulesMentioned,
        modules_missing: modulesMissing,
        files_mentioned: filesMentioned,
        files_total: groundTruthFiles.length,
    };
}
/** Derive a module-level ID from a file path (its parent directory, "/"-separated). */
function moduleIdFor(filePath) {
    const normalized = filePath.replace(/\\/g, "/").replace(/^\.\//, "");
    return path.dirname(normalized);
}
/** Round to three decimal places. */
function round(n) {
    return Math.round(n * 1000) / 1000;
}
@@ -0,0 +1,104 @@
1
+ /**
2
+ * Drift Score Metric
3
+ *
4
+ * Measures how much architectural drift a proposed patch introduces.
5
+ *
6
+ * Method:
7
+ * baseline → read pre-computed drift from architecture.diff.summary.json
8
+ * post-patch → if a patch is provided, count changed files and estimate delta
9
+ * by counting new/modified module edges in the diff
10
+ *
11
+ * For publication: lower drift_increase means the patch respected architecture.
12
+ * A delta of 0 means the patch introduced no new coupling.
13
+ */
14
+ import path from "node:path";
15
+ import fs from "node:fs/promises";
16
/**
 * Estimate the architectural drift of a proposed patch.
 *
 * The baseline delta comes from `counts_delta` in
 * `<specsDir>/machine/architecture.diff.summary.json` (edge changes plus
 * module changes weighted x2). When a patch is supplied, the post-patch delta
 * adds the net number of new import lines and one point per three touched files.
 *
 * @param {{specsDir: string, patch?: string}} params
 * @returns {Promise<{baseline_delta: number|null, post_patch_delta: number|null,
 *   drift_increase: number|null, baseline_status: string,
 *   post_patch_status: string, patch_applied: boolean}>}
 */
export async function measureDriftScore(params) {
    const { specsDir, patch } = params;
    const summaryFile = path.join(specsDir, "machine", "architecture.diff.summary.json");
    // ── Baseline ─────────────────────────────────────────────────────────────
    let baselineDelta = null;
    let baselineStatus = "unknown";
    try {
        const summary = JSON.parse(await fs.readFile(summaryFile, "utf8"));
        const counts = summary.counts_delta ?? {};
        const edgeChurn = Math.abs(counts.module_edges ?? 0) + Math.abs(counts.file_edges ?? 0);
        const moduleChurn = Math.abs(counts.modules ?? 0) * 2; // new modules weigh more
        baselineDelta = edgeChurn + moduleChurn;
        baselineStatus = summary.structural_change ? "drift" : "stable";
    }
    catch {
        // Missing or unreadable summary: there is no baseline to compare against.
        baselineStatus = "no-baseline";
    }
    // ── Post-patch estimate ──────────────────────────────────────────────────
    const patchApplied = Boolean(patch);
    let postPatchDelta = null;
    let postPatchStatus = "not-computed";
    if (patchApplied) {
        const floor = baselineDelta ?? 0;
        const touchedFiles = countPatchFiles(patch);
        const added = countNewImports(patch);
        const removed = countRemovedImports(patch);
        // Only imports not offset by a removal count as new coupling.
        const netNewImports = Math.max(0, added - removed);
        postPatchDelta = floor + netNewImports + Math.floor(touchedFiles / 3);
        if (postPatchDelta > floor + 2) {
            postPatchStatus = "drift";
        }
        else if (postPatchDelta > 0) {
            postPatchStatus = "warning";
        }
        else {
            postPatchStatus = "stable";
        }
    }
    // The increase is only defined when both sides were computed.
    let driftIncrease = null;
    if (postPatchDelta !== null && baselineDelta !== null) {
        driftIncrease = Math.max(0, postPatchDelta - baselineDelta);
    }
    return {
        baseline_delta: baselineDelta,
        post_patch_delta: postPatchDelta,
        drift_increase: driftIncrease,
        baseline_status: baselineStatus,
        post_patch_status: postPatchStatus,
        patch_applied: patchApplied,
    };
}
68
+ // ── Patch helpers ────────────────────────────────────────────────────────────
69
/**
 * Count distinct files touched by a unified diff.
 *
 * Reads the `--- ` / `+++ ` header lines, ignores `/dev/null`, drops any
 * tab-separated timestamp, and strips the git-style `a/` (old side) and `b/`
 * (new side) prefixes so a modified file is counted once rather than twice.
 *
 * @param {string} patch - unified diff text
 * @returns {number} number of distinct file paths
 */
function countPatchFiles(patch) {
    const files = new Set();
    for (const line of patch.split("\n")) {
        const isOld = line.startsWith("--- ");
        const isNew = line.startsWith("+++ ");
        if (!isOld && !isNew)
            continue;
        const raw = line.slice(4).replace(/\t.*/, "").trim();
        if (raw === "" || raw === "/dev/null")
            continue;
        // "--- a/x" and "+++ b/x" name the same file.
        const prefix = isOld ? "a/" : "b/";
        files.add(raw.startsWith(prefix) ? raw.slice(prefix.length) : raw);
    }
    return files.size;
}
81
/**
 * Matches a line that introduces a dependency: a statement starting with
 * `import`, `from` or `require`, or a `require("…")` call anywhere on the line
 * (e.g. `const x = require("y")`).
 */
const IMPORT_LINE = /^(?:import|from|require)\b|\brequire\s*\(\s*["'`]/;
/**
 * Count diff lines with the given marker ("+" or "-") whose content is an
 * import statement. File header lines ("+++" / "---") are skipped.
 */
function countImportLines(patch, marker) {
    const header = marker.repeat(3);
    let count = 0;
    for (const line of patch.split("\n")) {
        if (!line.startsWith(marker) || line.startsWith(header))
            continue;
        if (IMPORT_LINE.test(line.slice(1).trim()))
            count++;
    }
    return count;
}
/** Count added import lines (import/from/require) in the patch */
function countNewImports(patch) {
    return countImportLines(patch, "+");
}
/** Count removed import lines (import/from/require) in the patch */
function countRemovedImports(patch) {
    return countImportLines(patch, "-");
}
@@ -0,0 +1,207 @@
1
+ /**
2
+ * Search Recall Metric
3
+ *
4
+ * Measures how well guardian_search surfaces the files and symbols
5
+ * that the correct solution actually touches (ground truth).
6
+ *
7
+ * Uses the codebase-intelligence.json search logic (same as MCP guardian_search)
8
+ * plus the richer architecture.snapshot.yaml for file-level recall.
9
+ *
10
+ * Paper metric: precision@k, recall@k, F1@k (default k=5)
11
+ */
12
+ import path from "node:path";
13
+ import fs from "node:fs/promises";
14
+ const DEFAULT_K = 5;
15
/**
 * Run search against codebase-intelligence.json + function-intelligence.json
 * and score the top-k results against the ground-truth files and symbols from
 * a benchmark task.
 *
 * - precision@k = top-k result files that match some ground-truth file / top-k size
 * - recall@k    = ground-truth files matched by some top-k result / ground-truth size
 * - F1@k        = harmonic mean of the two
 *
 * @param {{specsDir: string, query: string, groundTruthFiles: string[],
 *   groundTruthSymbols?: string[], k?: number}} params
 * @returns {Promise<object>} metrics plus the found/missed files and symbols;
 *   an all-zero result when codebase-intelligence.json cannot be loaded
 */
export async function measureSearchRecall(params) {
    const { specsDir, query, groundTruthFiles, groundTruthSymbols = [], k = DEFAULT_K } = params;
    const intelPath = path.join(specsDir, "machine", "codebase-intelligence.json");
    let intel;
    try {
        const raw = await fs.readFile(intelPath, "utf8");
        intel = JSON.parse(raw);
    }
    catch {
        return emptyResult(k, groundTruthFiles, groundTruthSymbols);
    }
    // A file containing e.g. `null` parses fine but holds nothing to search.
    if (intel === null || typeof intel !== "object") {
        return emptyResult(k, groundTruthFiles, groundTruthSymbols);
    }
    // Also load function-intelligence.json if available (same as guardian_search MCP tool)
    let funcIntel = null;
    try {
        const funcRaw = await fs.readFile(path.join(specsDir, "machine", "function-intelligence.json"), "utf8");
        funcIntel = JSON.parse(funcRaw);
    }
    catch { /* optional */ }
    const { resultFiles, resultSymbols } = searchIntel(intel, funcIntel, query, k * 4);
    // Normalize ground truth for comparison (basename + full path both accepted)
    const gtFilesNorm = groundTruthFiles.map(normalizeFilePath);
    const gtSymbolsNorm = groundTruthSymbols.map((s) => s.toLowerCase());
    const topKFiles = resultFiles.slice(0, k);
    const topKSymbols = resultSymbols.slice(0, k);
    const isRetrieved = (gt) => topKFiles.some((r) => filePathMatches(r, gt));
    const filesFound = gtFilesNorm.filter(isRetrieved);
    const filesMissed = gtFilesNorm.filter((gt) => !isRetrieved(gt));
    const topKSymbolSet = new Set(topKSymbols
        .filter((s) => typeof s === "string")
        .map((s) => s.toLowerCase()));
    const symbolsFound = gtSymbolsNorm.filter((gt) => topKSymbolSet.has(gt));
    const symbolsMissed = gtSymbolsNorm.filter((gt) => !topKSymbolSet.has(gt));
    // Precision is counted over results, recall over ground truth. Using the
    // ground-truth count for both lets precision exceed 1 when several
    // ground-truth files match the same result (e.g. shared basenames).
    const relevantResults = topKFiles.filter((r) => gtFilesNorm.some((gt) => filePathMatches(r, gt))).length;
    const precision = topKFiles.length > 0 ? relevantResults / topKFiles.length : 0;
    const recall = gtFilesNorm.length > 0 ? filesFound.length / gtFilesNorm.length : 0;
    const f1 = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
    return {
        precision_at_k: round(precision),
        recall_at_k: round(recall),
        f1_at_k: round(f1),
        k,
        files_found: filesFound,
        files_missed: filesMissed,
        symbols_found: symbolsFound,
        symbols_missed: symbolsMissed,
        result_files: topKFiles,
        result_symbols: topKSymbols,
    };
}
64
+ // ── Internal search (mirrors mcp-serve.ts search() but returns structured data) ──
65
/**
 * Score every entity in the intelligence data against `query` and return the
 * best-scoring files and symbols, highest score first.
 *
 * @param {object} intel - parsed codebase-intelligence.json; reads
 *   `api_registry`, `model_registry`, `service_map`, `enum_registry`,
 *   `background_tasks` and `frontend_pages`, each optional
 * @param {object|null} funcIntel - parsed function-intelligence.json, or null
 * @param {string} query - free-text query
 * @param {number} limit - maximum number of files and of symbols to return
 * @returns {{resultFiles: string[], resultSymbols: string[]}}
 */
function searchIntel(intel, funcIntel, query, limit) {
    const q = query.toLowerCase();
    const fileHits = new Map(); // file → score
    const symbolHits = new Map(); // symbol → score
    // Endpoints — path/handler weighted higher than service_calls, generic calls filtered
    for (const ep of Object.values(intel.api_registry || {})) {
        const pathScore = scoreField(q, ep.path ?? "", 1.0);
        const handlerScore = scoreField(q, ep.handler ?? "", 0.9);
        const callScore = Math.max(0, ...(ep.service_calls ?? [])
            .filter((s) => !isGenericCall(s))
            .map((s) => scoreField(q, s, 0.5)));
        const score = Math.max(pathScore, handlerScore, callScore);
        if (score > 0 && ep.file)
            addHit(fileHits, ep.file, score);
        if (score > 0 && ep.handler)
            addHit(symbolHits, ep.handler, score);
    }
    // Models
    for (const m of Object.values(intel.model_registry || {})) {
        const nameScore = scoreField(q, m.name ?? "", 1.0);
        const fieldScore = Math.max(0, ...(m.fields ?? []).map((f) => scoreField(q, f, 0.6)));
        const score = Math.max(nameScore, fieldScore);
        if (score > 0 && m.file)
            addHit(fileHits, m.file, score);
        if (score > 0 && m.name)
            addHit(symbolHits, m.name, score);
    }
    // Modules: id, exports, files
    for (const mod of intel.service_map || []) {
        const modScore = scoreField(q, mod.id ?? "", 0.8);
        // Exports — symbol names are high specificity
        for (const sym of mod.exports || []) {
            const symScore = scoreField(q, sym, 1.0);
            if (symScore > 0)
                addHit(symbolHits, sym, symScore);
        }
        // Files — basename weighted higher than full path
        for (const f of mod.files || []) {
            const fileScore = Math.max(modScore, scoreField(q, path.basename(f), 1.0), // filename is most specific
            scoreField(q, f, 0.5));
            if (fileScore > 0)
                addHit(fileHits, f, fileScore);
        }
    }
    // Enums
    for (const en of Object.values(intel.enum_registry || {})) {
        const score = scoreItem(q, [en.name, ...(en.values || [])]);
        if (score > 0 && en.file)
            addHit(fileHits, en.file, score);
        if (score > 0 && en.name)
            addHit(symbolHits, en.name, score);
    }
    // Background tasks
    for (const t of intel.background_tasks || []) {
        const score = scoreItem(q, [t.name, t.kind]);
        if (score > 0 && t.file)
            addHit(fileHits, t.file, score);
        if (score > 0 && t.name)
            addHit(symbolHits, t.name, score);
    }
    // Frontend pages — only the component symbol is recorded here
    for (const p of intel.frontend_pages || []) {
        const score = scoreItem(q, [p.path, p.component, ...(p.api_calls || [])]);
        if (score > 0 && p.component)
            addHit(symbolHits, p.component, score);
    }
    // Functions (from function-intelligence.json — same as guardian_search MCP)
    for (const fn of funcIntel?.functions || []) {
        const score = scoreItem(q, [fn.name, ...(fn.calls || []), ...(fn.stringLiterals || [])]);
        if (score > 0 && fn.file)
            addHit(fileHits, fn.file, score * 0.8); // slightly lower weight than structural
        // A record can match through its calls/literals while having no name;
        // like every other loop above, only named symbols are recorded.
        if (score > 0 && fn.name)
            addHit(symbolHits, fn.name, score * 0.8);
    }
    /** Keys of a hit map ordered by descending score, capped at `limit`. */
    const ranked = (hits) => [...hits.entries()]
        .sort((a, b) => b[1] - a[1])
        .slice(0, limit)
        .map(([key]) => key);
    return { resultFiles: ranked(fileHits), resultSymbols: ranked(symbolHits) };
}
149
/** Prefixes of generic service_call patterns that pollute search results. */
const GENERIC_CALL_PREFIXES = Object.freeze([
    "service.", "self.", "db.", "session.", "response.", "request.", "app.",
    "router.", "logger.", "config.", "os.", "json.", "re.", "datetime.", "uuid.",
]);
/**
 * Tell whether a service call is a generic pattern (service.*, db.*, self.*, etc.).
 *
 * @param {string} s - call expression such as "db.query"
 * @returns {boolean} true when `s` starts (case-insensitively) with a generic
 *   prefix; false for anything that is not a string
 */
function isGenericCall(s) {
    if (typeof s !== "string")
        return false;
    const call = s.toLowerCase();
    return GENERIC_CALL_PREFIXES.some((prefix) => call.startsWith(prefix));
}
154
/**
 * Score a query (possibly multi-word) against a field with a specificity weight.
 * weight=1.0 for filenames/symbol names, weight=0.5 for service_calls, etc.
 *
 * Tiers, all case-insensitive and multiplied by `weight`:
 *   1.0  field equals the query
 *   0.8  field contains the whole query
 *   0.6  query has 2+ tokens (length >= 3) and the field contains all of them
 *   0.45 field contains at least two tokens
 *   0.3  field contains exactly one token
 *
 * @param {string} query - search text; surrounding whitespace is ignored
 * @param {string} field - text to match against
 * @param {number} weight - specificity multiplier
 * @returns {number} score; 0 when nothing matches or either input is empty
 *   or not a string
 */
function scoreField(query, field, weight) {
    if (typeof query !== "string" || typeof field !== "string")
        return 0;
    const q = query.trim().toLowerCase();
    const low = field.toLowerCase();
    // An empty query is a substring of everything; it must not score as a match.
    if (q === "" || low === "")
        return 0;
    if (low === q)
        return weight * 1.0;
    if (low.includes(q))
        return weight * 0.8;
    const tokens = q.split(/\s+/).filter((t) => t.length >= 3);
    const matched = tokens.filter((t) => low.includes(t)).length;
    if (tokens.length > 1 && matched === tokens.length)
        return weight * 0.6;
    // Partial token overlap: a fixed bonus for 2+ tokens, so long queries
    // are not penalised for the tokens that did not match.
    if (matched > 0)
        return weight * (matched >= 2 ? 0.45 : 0.3);
    return 0;
}
175
/**
 * Best score of `query` against any of `fields`, every field at weight 1.0
 * (legacy behaviour).
 *
 * @param {string} query - search text
 * @param {Array} fields - candidate texts; empty and non-string entries
 *   (undefined names, numeric enum values, …) are skipped
 * @returns {number} highest field score, 0 when nothing matches
 */
function scoreItem(query, fields) {
    let best = 0;
    for (const f of fields) {
        // scoreField lower-cases its input, so only real text can be scored.
        if (typeof f !== "string" || f === "")
            continue;
        best = Math.max(best, scoreField(query, f, 1.0));
    }
    return best;
}
185
/** Record `score` for `key`, keeping the highest score seen so far for that key. */
function addHit(map, key, score) {
    const previous = map.get(key) ?? 0;
    const best = Math.max(previous, score);
    map.set(key, best);
}
188
/** Convert backslashes to forward slashes and drop a single leading "./". */
function normalizeFilePath(p) {
    const forward = p.split("\\").join("/");
    return forward.startsWith("./") ? forward.slice(2) : forward;
}
191
/**
 * Loose path comparison between a search result and a ground-truth file:
 * true when the normalized paths are equal, when either is a "/"-delimited
 * suffix of the other, or when their basenames are the same.
 */
function filePathMatches(result, groundTruth) {
    const resultPath = normalizeFilePath(result);
    const truthPath = normalizeFilePath(groundTruth);
    if (resultPath === truthPath) {
        return true;
    }
    if (resultPath.endsWith("/" + truthPath) || truthPath.endsWith("/" + resultPath)) {
        return true;
    }
    return path.basename(resultPath) === path.basename(truthPath);
}
197
/**
 * Build the all-zero result used when no intelligence data could be loaded.
 * Every ground-truth file and symbol is reported as missed, in the same form
 * the scored path uses (normalized paths, lower-cased symbols), and as fresh
 * arrays so the caller's inputs are not aliased.
 *
 * @param {number} k - cut-off echoed back in the result
 * @param {string[]} gtFiles - ground-truth file paths
 * @param {string[]} gtSymbols - ground-truth symbol names
 */
function emptyResult(k, gtFiles, gtSymbols) {
    return {
        precision_at_k: 0, recall_at_k: 0, f1_at_k: 0, k,
        files_found: [], files_missed: gtFiles.map(normalizeFilePath),
        symbols_found: [], symbols_missed: gtSymbols.map((s) => s.toLowerCase()),
        result_files: [], result_symbols: [],
    };
}
205
/** Round a number to three decimal places. */
function round(n) {
    const scale = 1000;
    const scaled = Math.round(n * scale);
    return scaled / scale;
}
@@ -0,0 +1,79 @@
1
+ /**
2
+ * Token Efficiency Metric
3
+ *
4
+ * Measures how many tokens an agent needs to orient itself using Guardian MCP
5
+ * vs reading the ground-truth files directly.
6
+ *
7
+ * Method:
8
+ * MCP path → read architecture-context.md (orient) + codebase-intelligence.json (search)
9
+ * Raw path → read each ground-truth file byte count
10
+ * Ratio → MCP bytes / raw bytes (lower = more efficient)
11
+ *
12
+ * Token estimate: chars / 3.5 (industry-standard rough approximation)
13
+ */
14
+ import path from "node:path";
15
+ import fs from "node:fs/promises";
16
+ const CHARS_PER_TOKEN = 3.5;
17
/**
 * Estimate the tokens an agent spends orienting itself through Guardian MCP
 * versus reading the ground-truth files directly.
 *
 * MCP side: 40% of the guardian:context block of architecture-context.md
 * (or 5% of codebase-intelligence.json when that file is unreadable) plus 8%
 * of codebase-intelligence.json for the search response.
 * Raw side: summed sizes of the ground-truth files, looked up under `repoDir`
 * first and then as given. Only regular files are counted.
 *
 * @param {{specsDir: string, groundTruthFiles: string[], repoDir?: string}} params
 * @returns {Promise<{mcp_tokens: number, raw_file_tokens: number,
 *   efficiency_ratio: number, tokens_saved: number, raw_file_bytes: number,
 *   mcp_response_bytes: number}>} `efficiency_ratio` is MCP tokens / raw
 *   tokens (lower = more efficient), 0 when no raw file could be measured
 */
export async function measureTokenEfficiency(params) {
    const { specsDir, groundTruthFiles, repoDir } = params;
    const machineDir = path.join(specsDir, "machine");
    // Both estimates below scale with the intel file; stat it once.
    const intelBytes = (await regularFileSize(path.join(machineDir, "codebase-intelligence.json"))) ?? 0;
    // ── MCP response size ────────────────────────────────────────────────────
    // An agent using Guardian issues two calls: guardian_orient + guardian_search
    // We estimate their response sizes from the files they serve.
    let mcpBytes = 0;
    // orient: architecture-context.md (the guardian:context block only)
    try {
        const raw = await fs.readFile(path.join(machineDir, "architecture-context.md"), "utf8");
        const match = raw.match(/<!-- guardian:context[^>]*-->([\s\S]*?)<!-- \/guardian:context -->/);
        const block = match ? match[1] : raw;
        // MCP compacts this into JSON — roughly 40% of markdown size
        mcpBytes += Math.round(Buffer.byteLength(block, "utf8") * 0.4);
    }
    catch {
        // Fallback: orient only emits a compact summary of the intel file
        mcpBytes += Math.round(intelBytes * 0.05);
    }
    // search: a compact JSON of matched items, ~8% of intel on average
    mcpBytes += Math.round(intelBytes * 0.08);
    // ── Raw file size ────────────────────────────────────────────────────────
    let rawBytes = 0;
    for (const relPath of groundTruthFiles) {
        const candidates = repoDir
            ? [path.join(repoDir, relPath), relPath]
            : [relPath];
        for (const candidate of candidates) {
            const size = await regularFileSize(candidate);
            if (size !== null) {
                rawBytes += size;
                break;
            }
        }
    }
    // ── Compute metrics ──────────────────────────────────────────────────────
    const mcpTokens = Math.ceil(mcpBytes / CHARS_PER_TOKEN);
    const rawFileTokens = Math.ceil(rawBytes / CHARS_PER_TOKEN);
    const efficiencyRatio = rawFileTokens > 0 ? round(mcpTokens / rawFileTokens) : 0;
    const tokensSaved = Math.max(0, rawFileTokens - mcpTokens);
    return {
        mcp_tokens: mcpTokens,
        raw_file_tokens: rawFileTokens,
        efficiency_ratio: efficiencyRatio,
        tokens_saved: tokensSaved,
        raw_file_bytes: rawBytes,
        mcp_response_bytes: mcpBytes,
    };
}
/**
 * Size in bytes of a regular file, or null when the path is missing,
 * unreadable, or not a regular file (a directory's stat size is not content).
 */
async function regularFileSize(filePath) {
    try {
        const stat = await fs.stat(filePath);
        return stat.isFile() ? stat.size : null;
    }
    catch {
        return null;
    }
}
/** Round to three decimal places. */
function round(n) {
    return Math.round(n * 1000) / 1000;
}