npm - @toolbaux/guardian - Versions diffs - 0.1.21 → 0.1.23 - Mend

@toolbaux/guardian 0.1.21 → 0.1.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

package/README.md +1 -1
package/dist/adapters/runner.js +72 -3
package/dist/adapters/typescript-adapter.js +24 -10
package/dist/benchmarking/metrics/context-coverage.js +82 -0
package/dist/benchmarking/metrics/drift-score.js +104 -0
package/dist/benchmarking/metrics/search-recall.js +207 -0
package/dist/benchmarking/metrics/token-efficiency.js +79 -0
package/dist/benchmarking/report.js +131 -0
package/dist/benchmarking/runner.js +175 -0
package/dist/benchmarking/types.js +13 -0
package/dist/cli.js +53 -10
package/dist/commands/benchmark.js +62 -0
package/dist/commands/discrepancy.js +1 -1
package/dist/commands/doc-generate.js +1 -1
package/dist/commands/doc-html.js +1 -1
package/dist/commands/extract.js +1 -1
package/dist/commands/feature-context.js +1 -1
package/dist/commands/init.js +1 -0
package/dist/commands/intel.js +47 -1
package/dist/commands/mcp-serve.js +48 -288
package/dist/commands/search.js +602 -14
package/dist/db/file-specs-store.js +174 -0
package/dist/db/fts-builder.js +305 -0
package/dist/db/index.js +55 -0
package/dist/db/specs-store.js +13 -0
package/dist/db/sqlite-specs-store.js +441 -0
package/dist/extract/codebase-intel.js +31 -2
package/dist/extract/compress.js +70 -3
package/dist/extract/context-block.js +11 -2
package/dist/extract/function-intel.js +5 -2
package/dist/extract/index.js +1 -23
package/dist/extract/writer.js +6 -0
package/package.json +3 -1

package/README.md CHANGED Viewed

@@ -230,8 +230,8 @@ npm install && npm run build && npm link
 ```bash
 guardian init                          # config, .specs dir, pre-commit hook, CLAUDE.md
 guardian extract                       # full architecture + UX snapshots + docs
+guardian extract --backend sqlite      # same + builds guardian.db with FTS index
 guardian generate --ai-context         # compact ~3K token AI context only
-guardian intel                         # build codebase-intelligence.json
 ```
 ### Search & Context

package/dist/adapters/runner.js CHANGED Viewed

@@ -15,14 +15,26 @@ export function runAdapter(adapter, file, source) {
         }
         return { endpoints: [], models: [], components: [], tests: [], functions: [] };
     }
-    // tree-sitter native binding throws "Invalid argument" for very large files.
-    // Skip files over 1 MB to avoid silent crashes; they are rare in practice.
+    // tree-sitter's native binding throws "Invalid argument" for files with high AST
+    // complexity — this can happen well below 1 MB for deeply-nested source files.
+    // Parse defensively: try the whole file first, then fall back to chunked parsing
+    // if tree-sitter throws.  Chunks are split at top-level definition boundaries so
+    // each piece is syntactically self-contained.
     if (source.length > 1_000_000) {
         return { endpoints: [], models: [], components: [], tests: [], functions: [] };
     }
     const parser = new Parser();
     parser.setLanguage(adapter.language);
-    const tree = parser.parse(source);
+    let tree;
+    try {
+        tree = parser.parse(source);
+    }
+    catch {
+        // File is too complex for a single parse — split at top-level definitions and
+        // merge results.  Each chunk is a run of lines from one top-level def/class to
+        // the next, so it is syntactically valid on its own.
+        return runAdapterChunked(adapter, file, source, parser);
+    }
     if (adapter.extract) {
         const result = adapter.extract(file, source, tree.rootNode);
         return {
@@ -94,3 +106,60 @@ export function runAdapter(adapter, file, source) {
     }
     return { endpoints, models, components, tests, functions: [] };
 }
+/**
+ * Fallback for files that tree-sitter can't parse as a whole.
+ * Splits source at top-level definition boundaries (lines starting with
+ * "def ", "class ", "async def ", "fn ", "func ", "public class ", etc.),
+ * parses each chunk independently with the same adapter, and merges results.
+ */
+function runAdapterChunked(adapter, file, source, parser) {
+    const merged = {
+        endpoints: [], models: [], components: [], tests: [], functions: [],
+    };
+    if (!adapter.extract)
+        return merged;
+    // Split at lines that start a new top-level definition.
+    // Pattern covers Python, Go, Rust, JS/TS, Java, C#.
+    const TOP_DEF = /^(?:(?:pub(?:\s+(?:unsafe\s+)?)?|private|protected|public|static|async|export\s+(?:default\s+)?|abstract\s+)*(?:def |class |fn |func |function |interface |struct |enum |impl |type ))/;
+    const lines = source.split("\n");
+    const splitPoints = [0];
+    for (let i = 1; i < lines.length; i++) {
+        if (TOP_DEF.test(lines[i]))
+            splitPoints.push(i);
+    }
+    splitPoints.push(lines.length);
+    // Group split points into chunks of up to ~25 KB to stay within parser limits.
+    const CHUNK_BYTES = 25_000;
+    let chunkBytes = 0;
+    let chunkLines = [];
+    function flushChunk() {
+        if (chunkLines.length === 0)
+            return;
+        const chunk = chunkLines.join("\n");
+        try {
+            const tree = parser.parse(chunk);
+            const result = adapter.extract(file, chunk, tree.rootNode);
+            merged.endpoints.push(...result.endpoints);
+            merged.models.push(...result.models);
+            merged.components.push(...result.components);
+            merged.tests.push(...result.tests);
+            merged.functions.push(...(result.functions ?? []));
+        }
+        catch {
+            // skip unparseable chunk
+        }
+        chunkLines = [];
+        chunkBytes = 0;
+    }
+    for (let s = 0; s < splitPoints.length - 1; s++) {
+        const segLines = lines.slice(splitPoints[s], splitPoints[s + 1]);
+        const segText = segLines.join("\n");
+        if (chunkBytes + segText.length > CHUNK_BYTES && chunkLines.length > 0) {
+            flushChunk();
+        }
+        chunkLines.push(...segLines);
+        chunkBytes += segText.length;
+    }
+    flushChunk();
+    return merged;
+}

package/dist/adapters/typescript-adapter.js CHANGED Viewed

@@ -1,16 +1,6 @@
 import TypeScript from "tree-sitter-typescript";
 import Parser from "tree-sitter";
 import path from "node:path";
-// Utility to recursively find children of a certain type
-function findChildren(node, type) {
-    const results = [];
-    if (node.type === type)
-        results.push(node);
-    for (const child of node.namedChildren) {
-        results.push(...findChildren(child, type));
-    }
-    return results;
-}
 // ── Function-level intelligence helpers ──────────────────────────────────
 /** Walk all descendants depth-first. */
 function* walkAll(node) {
@@ -98,6 +88,30 @@ function extractTsFunctions(file, source, node) {
                 isAsync = valN.children.some((c) => c.type === "async");
             }
         }
+        else if (n.type === "interface_declaration" ||
+            n.type === "type_alias_declaration" ||
+            n.type === "class_declaration" ||
+            n.type === "abstract_class_declaration" ||
+            n.type === "enum_declaration") {
+            // Type-level declarations: interfaces, types, classes, enums.
+            // These are the primary symbols in .d.ts files and typed source files.
+            const nameN = n.childForFieldName("name");
+            if (nameN) {
+                const name = getText(nameN);
+                records.push({
+                    id: `${file}#${name}:${n.startPosition.row + 1}`,
+                    name,
+                    file,
+                    lines: [n.startPosition.row + 1, n.endPosition.row + 1],
+                    calls: [],
+                    stringLiterals: [],
+                    regexPatterns: [],
+                    isAsync: false,
+                    language: "typescript",
+                });
+            }
+            // Still recurse to catch methods inside classes
+        }
         if (funcName && bodyNode) {
             const intel = collectBodyIntel(bodyNode, getText);
             records.push({

package/dist/benchmarking/metrics/context-coverage.js ADDED Viewed

@@ -0,0 +1,82 @@
+/**
+ * Context Coverage Metric
+ *
+ * Measures how well guardian_context covers the modules and files
+ * relevant to a benchmark task.
+ *
+ * Method:
+ *   1. Read architecture-context.md from the specs dir
+ *   2. For each ground-truth file, check if its basename or containing module
+ *      is mentioned anywhere in the context block
+ *   3. For modules: check if the module ID appears (e.g. "src/auth", "auth")
+ *
+ * A coverage of 1.0 means every ground-truth module appears in the context;
+ * file mentions are counted separately (files_mentioned / files_total).
+ */
+import path from "node:path";
+import fs from "node:fs/promises";
+export async function measureContextCoverage(params) {
+    const { specsDir, groundTruthFiles } = params;
+    // Read architecture-context.md
+    const contextPath = path.join(specsDir, "machine", "architecture-context.md");
+    let contextText = "";
+    try {
+        const raw = await fs.readFile(contextPath, "utf8");
+        // Extract the guardian:context block for fair comparison
+        const match = raw.match(/<!-- guardian:context[^>]*-->([\s\S]*?)<!-- \/guardian:context -->/);
+        contextText = (match ? match[1] : raw).toLowerCase();
+    }
+    catch {
+        // No context file — zero coverage
+        return {
+            coverage: 0,
+            modules_mentioned: [],
+            modules_missing: groundTruthFiles.map(moduleIdFor),
+            files_mentioned: 0,
+            files_total: groundTruthFiles.length,
+        };
+    }
+    // ── Check file coverage ──────────────────────────────────────────────────
+    let filesMentioned = 0;
+    for (const f of groundTruthFiles) {
+        const basename = path.basename(f).toLowerCase();
+        const noExt = basename.replace(/\.[^.]+$/, "");
+        if (contextText.includes(basename) || contextText.includes(noExt)) {
+            filesMentioned++;
+        }
+    }
+    // ── Check module coverage ────────────────────────────────────────────────
+    // Derive module IDs from ground-truth file paths (e.g. "src/auth/service.ts" → "src/auth")
+    const allModuleIds = [...new Set(groundTruthFiles.map(moduleIdFor))];
+    const modulesMentioned = [];
+    const modulesMissing = [];
+    for (const modId of allModuleIds) {
+        // Check if the module ID (or any segment) appears in context
+        const segments = modId.split("/").filter(Boolean);
+        const mentioned = segments.some(seg => contextText.includes(seg.toLowerCase())) ||
+            contextText.includes(modId.toLowerCase());
+        if (mentioned) {
+            modulesMentioned.push(modId);
+        }
+        else {
+            modulesMissing.push(modId);
+        }
+    }
+    const coverage = allModuleIds.length > 0
+        ? round(modulesMentioned.length / allModuleIds.length)
+        : 0;
+    return {
+        coverage,
+        modules_mentioned: modulesMentioned,
+        modules_missing: modulesMissing,
+        files_mentioned: filesMentioned,
+        files_total: groundTruthFiles.length,
+    };
+}
+/**
+ * Derive a module-level ID from a file path: the parent directory of the path
+ * after converting backslashes to "/" and stripping one leading "./".
+ * Example: "src/auth/service.ts" → "src/auth". A path without a directory
+ * part yields "." (path.dirname behaviour).
+ */
+function moduleIdFor(filePath) {
+    const normalized = filePath.replace(/\\/g, "/").replace(/^\.\//, "");
+    return path.dirname(normalized);
+}
+function round(n) { // round n to three decimal places
+    return Math.round(n * 1000) / 1000;
+}

package/dist/benchmarking/metrics/drift-score.js ADDED Viewed

@@ -0,0 +1,104 @@
+/**
+ * Drift Score Metric
+ *
+ * Measures how much architectural drift a proposed patch introduces.
+ *
+ * Method:
+ *   baseline  → read pre-computed drift from architecture.diff.summary.json
+ *   post-patch → if a patch is provided, count changed files and estimate delta
+ *                from the net number of added import lines in the diff
+ *
+ * For publication: lower drift_increase means the patch respected architecture.
+ * A delta of 0 means the patch introduced no new coupling.
+ */
+import path from "node:path";
+import fs from "node:fs/promises";
+export async function measureDriftScore(params) { // params: { specsDir, patch } — patch is optional unified-diff text
+    const { specsDir, patch } = params;
+    const machineDir = path.join(specsDir, "machine");
+    // ── Read baseline drift summary ──────────────────────────────────────────
+    const diffPath = path.join(machineDir, "architecture.diff.summary.json");
+    let diff = null;
+    let baselineStatus = "unknown";
+    let baselineDelta = null;
+    try {
+        const raw = await fs.readFile(diffPath, "utf8");
+        diff = JSON.parse(raw);
+        // Baseline delta = |module_edges| + |file_edges| + 2 * |modules| from counts_delta (missing counts are 0)
+        const cd = diff.counts_delta ?? {};
+        const edgeDelta = Math.abs(cd.module_edges ?? 0) + Math.abs(cd.file_edges ?? 0);
+        const structDelta = Math.abs(cd.modules ?? 0) * 2; // module-count changes are weighted double
+        baselineDelta = edgeDelta + structDelta;
+        baselineStatus = diff.structural_change ? "drift" : "stable";
+    }
+    catch {
+        baselineStatus = "no-baseline"; // any read or JSON parse failure leaves baselineDelta null
+    }
+    // ── Estimate post-patch drift ────────────────────────────────────────────
+    let postPatchDelta = null;
+    let postPatchStatus = "unknown";
+    let patchApplied = false;
+    if (patch) {
+        patchApplied = true; // a patch was supplied and analysed as text; nothing is applied to disk here
+        // Parse the unified diff to count touched files and added/removed import lines
+        const changedFiles = countPatchFiles(patch);
+        const newImports = countNewImports(patch);
+        const removedImports = countRemovedImports(patch);
+        // Heuristic delta: net added imports (never negative) plus one point per three changed files
+        const netNewImports = Math.max(0, newImports - removedImports);
+        postPatchDelta = (baselineDelta ?? 0) + netNewImports + Math.floor(changedFiles / 3);
+        postPatchStatus = postPatchDelta > (baselineDelta ?? 0) + 2 // more than 2 above baseline → "drift"
+            ? "drift"
+            : postPatchDelta > 0 // any non-zero total (including the baseline part) → "warning"
+                ? "warning"
+                : "stable";
+    }
+    const driftIncrease = postPatchDelta !== null && baselineDelta !== null // null unless both baseline and patch exist
+        ? postPatchDelta - baselineDelta
+        : null;
+    return {
+        baseline_delta: baselineDelta,
+        post_patch_delta: postPatchDelta,
+        drift_increase: driftIncrease !== null ? Math.max(0, driftIncrease) : null, // clamped at 0
+        baseline_status: baselineStatus,
+        post_patch_status: patchApplied ? postPatchStatus : "not-computed",
+        patch_applied: patchApplied,
+    };
+}
+// ── Patch helpers ────────────────────────────────────────────────────────────
+/** Count distinct files touched by a unified diff */
+function countPatchFiles(patch) {
+    const files = new Set();
+    for (const line of patch.split("\n")) {
+        if (line.startsWith("--- ") || line.startsWith("+++ ")) {
+            const f = line.slice(4).replace(/\t.*/, "").trim();
+            if (f !== "/dev/null")
+                files.add(f);
+        }
+    }
+    return files.size;
+}
+/**
+ * Count added import lines in the patch: lines starting with "+" (but not the
+ * "+++" file header) whose trimmed content begins with the word import, from
+ * or require. Lines where require appears later, e.g. `const x = require(...)`,
+ * are not counted because the pattern is anchored at the start of the line.
+ */
+function countNewImports(patch) {
+    let count = 0;
+    for (const line of patch.split("\n")) {
+        if (line.startsWith("+") && !line.startsWith("+++")) {
+            const l = line.slice(1).trim();
+            if (/^(import|from|require)\b/.test(l))
+                count++;
+        }
+    }
+    return count;
+}
+/**
+ * Count removed import lines in the patch: lines starting with "-" (but not the
+ * "---" file header) whose trimmed content begins with the word import, from
+ * or require. Uses the same start-anchored pattern as countNewImports.
+ */
+function countRemovedImports(patch) {
+    let count = 0;
+    for (const line of patch.split("\n")) {
+        if (line.startsWith("-") && !line.startsWith("---")) {
+            const l = line.slice(1).trim();
+            if (/^(import|from|require)\b/.test(l))
+                count++;
+        }
+    }
+    return count;
+}

package/dist/benchmarking/metrics/search-recall.js ADDED Viewed

@@ -0,0 +1,207 @@
+/**
+ * Search Recall Metric
+ *
+ * Measures how well guardian_search surfaces the files and symbols
+ * that the correct solution actually touches (ground truth).
+ *
+ * Uses the codebase-intelligence.json search logic (same as MCP guardian_search)
+ * plus function-intelligence.json, when present, for file- and symbol-level recall.
+ *
+ * Paper metric: precision@k, recall@k, F1@k (default k=5)
+ */
+import path from "node:path";
+import fs from "node:fs/promises";
+const DEFAULT_K = 5;
+/**
+ * Run search against codebase-intelligence.json + function-intelligence.json
+ * and score recall against the ground-truth files and symbols from a benchmark task.
+ */
+export async function measureSearchRecall(params) {
+    const { specsDir, query, groundTruthFiles, groundTruthSymbols = [], k = DEFAULT_K } = params;
+    const intelPath = path.join(specsDir, "machine", "codebase-intelligence.json");
+    let intel;
+    try {
+        const raw = await fs.readFile(intelPath, "utf8");
+        intel = JSON.parse(raw);
+    }
+    catch {
+        return emptyResult(k, groundTruthFiles, groundTruthSymbols);
+    }
+    // Also load function-intelligence.json if available (same as guardian_search MCP tool)
+    let funcIntel = null;
+    try {
+        const funcRaw = await fs.readFile(path.join(specsDir, "machine", "function-intelligence.json"), "utf8");
+        funcIntel = JSON.parse(funcRaw);
+    }
+    catch { /* optional */ }
+    const { resultFiles, resultSymbols } = searchIntel(intel, funcIntel, query, k * 4);
+    // Normalize ground truth for comparison (basename + full path both accepted)
+    const gtFilesNorm = groundTruthFiles.map(normalizeFilePath);
+    const gtSymbolsNorm = groundTruthSymbols.map((s) => s.toLowerCase());
+    const topKFiles = resultFiles.slice(0, k);
+    const topKSymbols = resultSymbols.slice(0, k);
+    const filesFound = gtFilesNorm.filter((gt) => topKFiles.some((r) => filePathMatches(r, gt)));
+    const filesMissed = gtFilesNorm.filter((gt) => !topKFiles.some((r) => filePathMatches(r, gt)));
+    const symbolsFound = gtSymbolsNorm.filter((gt) => topKSymbols.some((r) => r.toLowerCase() === gt));
+    const symbolsMissed = gtSymbolsNorm.filter((gt) => !topKSymbols.some((r) => r.toLowerCase() === gt));
+    const truePositives = filesFound.length;
+    const precision = topKFiles.length > 0 ? truePositives / Math.min(k, topKFiles.length) : 0;
+    const recall = gtFilesNorm.length > 0 ? truePositives / gtFilesNorm.length : 0;
+    const f1 = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
+    return {
+        precision_at_k: round(precision),
+        recall_at_k: round(recall),
+        f1_at_k: round(f1),
+        k,
+        files_found: filesFound,
+        files_missed: filesMissed,
+        symbols_found: symbolsFound,
+        symbols_missed: symbolsMissed,
+        result_files: topKFiles,
+        result_symbols: topKSymbols,
+    };
+}
+// ── Internal search (mirrors mcp-serve.ts search() but returns structured data) ──
+function searchIntel(intel, funcIntel, query, limit) { // returns { resultFiles, resultSymbols }, best score first, at most `limit` each
+    const q = query.toLowerCase();
+    const fileHits = new Map(); // file → best score seen
+    const symbolHits = new Map(); // symbol → best score seen
+    // Endpoints — path/handler weighted higher than service_calls, generic calls filtered
+    for (const ep of Object.values(intel.api_registry || {})) {
+        const pathScore = scoreField(q, ep.path ?? "", 1.0);
+        const handlerScore = scoreField(q, ep.handler ?? "", 0.9);
+        const callScore = Math.max(0, ...(ep.service_calls ?? [])
+            .filter((s) => !isGenericCall(s))
+            .map((s) => scoreField(q, s, 0.5)));
+        const score = Math.max(pathScore, handlerScore, callScore);
+        if (score > 0 && ep.file)
+            addHit(fileHits, ep.file, score);
+        if (score > 0 && ep.handler)
+            addHit(symbolHits, ep.handler, score);
+    }
+    // Models — name at weight 1.0, field names at weight 0.6
+    for (const m of Object.values(intel.model_registry || {})) {
+        const nameScore = scoreField(q, m.name ?? "", 1.0);
+        const fieldScore = Math.max(0, ...(m.fields ?? []).map((f) => scoreField(q, f, 0.6)));
+        const score = Math.max(nameScore, fieldScore);
+        if (score > 0 && m.file)
+            addHit(fileHits, m.file, score);
+        if (score > 0 && m.name)
+            addHit(symbolHits, m.name, score);
+    }
+    // Modules: id (weight 0.8), exports and files are scored here
+    for (const mod of intel.service_map || []) {
+        const modScore = scoreField(q, mod.id ?? "", 0.8);
+        // Exports — symbol names are high specificity
+        for (const sym of mod.exports || []) {
+            const symScore = scoreField(q, sym, 1.0);
+            if (symScore > 0)
+                addHit(symbolHits, sym, symScore);
+        }
+        // Files — basename weighted higher than full path; a module-id match applies to every file in the module
+        for (const f of mod.files || []) {
+            const fileScore = Math.max(modScore, scoreField(q, path.basename(f), 1.0), // filename is most specific
+            scoreField(q, f, 0.5));
+            if (fileScore > 0)
+                addHit(fileHits, f, fileScore);
+        }
+    }
+    // Enums — name and values all at weight 1.0
+    for (const en of Object.values(intel.enum_registry || {})) {
+        const score = scoreItem(q, [en.name, ...(en.values || [])]);
+        if (score > 0 && en.file)
+            addHit(fileHits, en.file, score);
+        if (score > 0 && en.name)
+            addHit(symbolHits, en.name, score);
+    }
+    // Background tasks — name and kind
+    for (const t of intel.background_tasks || []) {
+        const score = scoreItem(q, [t.name, t.kind]);
+        if (score > 0 && t.file)
+            addHit(fileHits, t.file, score);
+        if (score > 0 && t.name)
+            addHit(symbolHits, t.name, score);
+    }
+    // Frontend pages — contribute symbol hits (component name) only, no file hits
+    for (const p of intel.frontend_pages || []) {
+        const score = scoreItem(q, [p.path, p.component, ...(p.api_calls || [])]);
+        if (score > 0 && p.component)
+            addHit(symbolHits, p.component, score);
+    }
+    // Functions (from function-intelligence.json — same as guardian_search MCP); skipped when funcIntel is null
+    for (const fn of funcIntel?.functions || []) {
+        const score = scoreItem(q, [fn.name, ...(fn.calls || []), ...(fn.stringLiterals || [])]);
+        if (score > 0 && fn.file)
+            addHit(fileHits, fn.file, score * 0.8); // slightly lower weight than structural
+        if (score > 0)
+            addHit(symbolHits, fn.name, score * 0.8);
+    }
+    const sortedFiles = [...fileHits.entries()]
+        .sort((a, b) => b[1] - a[1])
+        .slice(0, limit)
+        .map(([f]) => f);
+    const sortedSymbols = [...symbolHits.entries()]
+        .sort((a, b) => b[1] - a[1])
+        .slice(0, limit)
+        .map(([s]) => s);
+    return { resultFiles: sortedFiles, resultSymbols: sortedSymbols };
+}
+/**
+ * Generic service_call patterns that pollute search (service.*, db.*, self.*, etc.).
+ * Returns true when the call string starts, case-insensitively, with one of the
+ * listed object prefixes.
+ */
+function isGenericCall(s) {
+    const genericPrefixes = ["service.", "self.", "db.", "session.", "response.", "request.", "app.", "router.", "logger.", "config.", "os.", "json.", "re.", "datetime.", "uuid."];
+    return genericPrefixes.some(p => s.toLowerCase().startsWith(p));
+}
+/**
+ * Score a query (possibly multi-word) against a field with a specificity weight.
+ * weight=1.0 for filenames/symbol names, weight=0.5 for service_calls, etc.
+ */
+function scoreField(query, field, weight) {
+    const q = query.toLowerCase();
+    const low = field.toLowerCase();
+    const tokens = q.split(/\s+/).filter(t => t.length >= 3);
+    if (low === q)
+        return weight * 1.0;
+    if (low.includes(q))
+        return weight * 0.8;
+    if (tokens.length > 1 && tokens.every(t => low.includes(t)))
+        return weight * 0.6;
+    // Scale by fraction of tokens matched — more specific matches rank higher
+    // 1-token match = 0.3, 2+ tokens = 0.45 (bonus for specificity without penalising long queries)
+    const matched = tokens.filter(t => low.includes(t)).length;
+    if (matched > 0)
+        return weight * (matched >= 2 ? 0.45 : 0.3);
+    return 0;
+}
+function scoreItem(query, fields) {
+    // Legacy scorer: best scoreField() result over all fields, each at weight 1.0; falsy fields are skipped
+    let best = 0;
+    for (const f of fields) {
+        if (!f)
+            continue;
+        best = Math.max(best, scoreField(query, f, 1.0));
+    }
+    return best;
+}
+function addHit(map, key, score) { // keep the highest score recorded for key
+    map.set(key, Math.max(map.get(key) ?? 0, score));
+}
+function normalizeFilePath(p) { // backslashes → "/", and drop one leading "./"
+    return p.replace(/\\/g, "/").replace(/^\.\//, "");
+}
+function filePathMatches(result, groundTruth) { // loose comparison of a result path with a ground-truth path
+    const r = normalizeFilePath(result);
+    const g = normalizeFilePath(groundTruth);
+    return r === g || r.endsWith("/" + g) || g.endsWith("/" + r) || // equal, or one is a path suffix of the other
+        path.basename(r) === path.basename(g); // or same basename, whatever the directory
+}
+function emptyResult(k, gtFiles, gtSymbols) { // all-zero result: nothing returned, all ground truth reported as missed
+    return {
+        precision_at_k: 0, recall_at_k: 0, f1_at_k: 0, k,
+        files_found: [], files_missed: gtFiles,
+        symbols_found: [], symbols_missed: gtSymbols,
+        result_files: [], result_symbols: [],
+    };
+}
+function round(n) { // round n to three decimal places
+    return Math.round(n * 1000) / 1000;
+}

package/dist/benchmarking/metrics/token-efficiency.js ADDED Viewed

@@ -0,0 +1,79 @@
+/**
+ * Token Efficiency Metric
+ *
+ * Measures how many tokens an agent needs to orient itself using Guardian MCP
+ * vs reading the ground-truth files directly.
+ *
+ * Method:
+ *   MCP path    → read architecture-context.md (orient) + codebase-intelligence.json (search)
+ *   Raw path    → read each ground-truth file byte count
+ *   Ratio       → MCP tokens / raw tokens  (lower = more efficient)
+ *
+ * Token estimate: bytes / 3.5  (rough approximation)
+ */
+import path from "node:path";
+import fs from "node:fs/promises";
+const CHARS_PER_TOKEN = 3.5;
+export async function measureTokenEfficiency(params) { // params: { specsDir, groundTruthFiles, repoDir } — repoDir is optional
+    const { specsDir, groundTruthFiles, repoDir } = params;
+    const machineDir = path.join(specsDir, "machine");
+    // ── MCP response size ────────────────────────────────────────────────────
+    // An agent using Guardian issues two calls: guardian_orient + guardian_search.
+    // Their response sizes are estimated from the files they serve; no MCP call is made here.
+    let mcpBytes = 0;
+    // orient: architecture-context.md (the guardian:context block only, or the whole file if no block is found)
+    const contextPath = path.join(machineDir, "architecture-context.md");
+    try {
+        const raw = await fs.readFile(contextPath, "utf8");
+        const match = raw.match(/<!-- guardian:context[^>]*-->([\s\S]*?)<!-- \/guardian:context -->/);
+        const block = match ? match[1] : raw;
+        // MCP compacts this into JSON — estimated here as 40% of the markdown byte size
+        mcpBytes += Math.round(Buffer.byteLength(block, "utf8") * 0.4);
+    }
+    catch {
+        // Fallback when the context file is unreadable: 5% of codebase-intelligence.json
+        try {
+            const stat = await fs.stat(path.join(machineDir, "codebase-intelligence.json"));
+            mcpBytes += Math.round(stat.size * 0.05); // orient only emits a compact summary
+        }
+        catch { /* ignore */ }
+    }
+    // search: the guardian_search response is a compact JSON of matched items.
+    // It is estimated as a fixed fraction of the full intel file.
+    try {
+        const stat = await fs.stat(path.join(machineDir, "codebase-intelligence.json"));
+        mcpBytes += Math.round(stat.size * 0.08); // assumed average: search returns ~8% of intel
+    }
+    catch { /* ignore */ }
+    // ── Raw file size ────────────────────────────────────────────────────────
+    let rawBytes = 0;
+    for (const relPath of groundTruthFiles) {
+        const candidates = repoDir // try the path under repoDir first, then as given
+            ? [path.join(repoDir, relPath), relPath]
+            : [relPath];
+        for (const candidate of candidates) {
+            try {
+                const stat = await fs.stat(candidate);
+                rawBytes += stat.size;
+                break; // first candidate that exists wins
+            }
+            catch { /* try next */ }
+        }
+    }
+    // ── Compute metrics ──────────────────────────────────────────────────────
+    const mcpTokens = Math.ceil(mcpBytes / CHARS_PER_TOKEN);
+    const rawFileTokens = Math.ceil(rawBytes / CHARS_PER_TOKEN);
+    const efficiencyRatio = rawFileTokens > 0 ? round(mcpTokens / rawFileTokens) : 0; // 0 also when no raw file could be sized
+    const tokensSaved = Math.max(0, rawFileTokens - mcpTokens); // never negative
+    return {
+        mcp_tokens: mcpTokens,
+        raw_file_tokens: rawFileTokens,
+        efficiency_ratio: efficiencyRatio,
+        tokens_saved: tokensSaved,
+        raw_file_bytes: rawBytes,
+        mcp_response_bytes: mcpBytes,
+    };
+}
+function round(n) { // round n to three decimal places
+    return Math.round(n * 1000) / 1000;
+}