npm - @toolbaux/guardian - Versions diffs - 0.1.23 → 0.2.0 - Mend

@toolbaux/guardian 0.1.23 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/README.md +6 -4
package/dist/cli.js +1 -1
package/dist/commands/context.js +87 -29
package/dist/commands/extract.js +4 -1
package/dist/commands/generate.js +83 -10
package/dist/commands/init.js +88 -56
package/dist/commands/intel.js +23 -0
package/dist/commands/mcp-serve.js +112 -0
package/dist/commands/search.js +43 -3
package/dist/config.js +1 -0
package/dist/db/embeddings.js +113 -0
package/dist/db/fts-builder.js +85 -0
package/dist/db/sqlite-specs-store.js +496 -3
package/package.json +2 -1

package/dist/db/sqlite-specs-store.js CHANGED Viewed

@@ -50,6 +50,58 @@ function splitIdentifiers(s) {
         .replace(/([A-Z]+)([A-Z][a-z])/g, "$1 $2")
         .toLowerCase();
 }
+/**
+ * Normalise a callee name by stripping receiver/object/package prefix.
+ * "engine.handleHTTPRequest" → "handleHTTPRequest"
+ * "self.add_to_class"       → "add_to_class"
+ * "apps.get_model"          → "get_model"
+ * "fmt.Println"             → "Println"
+ * "bare_name"               → "bare_name"  (unchanged)
+ */
+function normalizeCallee(name) {
+    const lastDot = name.lastIndexOf(".");
+    if (lastDot >= 0) {
+        const bare = name.slice(lastDot + 1);
+        if (bare && /^[A-Za-z_]\w*$/.test(bare))
+            return bare;
+    }
+    return name;
+}
+/**
+ * Extract meaningful directory-segment tokens from a file path.
+ * "fastapi/dependencies/utils.py" → "fastapi dependencies"
+ * "lib/router/layer.js"           → "router layer"
+ *
+ * Skips generic segments that add noise but no recall value.
+ */
+const PATH_NOISE = new Set([
+    "src", "lib", "app", "pkg", "internal", "cmd", "api", "dist", "build",
+    "test", "tests", "spec", "specs", "docs", "doc", "examples", "example",
+    "scripts", "utils", "helpers", "common", "shared", "core", "main",
+]);
+function filePathTokens(fp) {
+    return fp
+        .split("/")
+        .slice(0, -1) // exclude the filename itself
+        .filter(s => s && !PATH_NOISE.has(s.toLowerCase()))
+        .map(splitIdentifiers)
+        .join(" ");
+}
+/** L2 norm of a Float32Array. */
+function vecNorm(v) {
+    let sum = 0;
+    for (let i = 0; i < v.length; i++)
+        sum += v[i] * v[i];
+    return Math.sqrt(sum);
+}
+/** Cosine similarity between two unit-normalised Float32Arrays. */
+function cosineSim(a, b) {
+    let dot = 0;
+    const len = Math.min(a.length, b.length);
+    for (let i = 0; i < len; i++)
+        dot += a[i] * b[i];
+    return dot;
+}
 export const DB_FILENAME = "guardian.db";
 export class SqliteSpecsStore {
     storeDir;
@@ -351,6 +403,431 @@ export class SqliteSpecsStore {
         );
       `);
         }
+        // Per-function FTS table — one row per function/class/symbol with line number.
+        // file_path and line are UNINDEXED (stored but not tokenised); name + body are searched.
+        this.db.exec(`
+      CREATE VIRTUAL TABLE IF NOT EXISTS functions_fts USING fts5(
+        file_path UNINDEXED,
+        line      UNINDEXED,
+        name,
+        body,
+        tokenize='porter unicode61'
+      );
+    `);
+        // Call-graph edges — caller → callee name mapping.
+        // caller_file stored so test callers can be excluded from in-degree authority ranking.
+        this.db.exec(`
+      CREATE TABLE IF NOT EXISTS function_calls (
+        caller_name  TEXT NOT NULL,
+        callee_name  TEXT NOT NULL,
+        caller_file  TEXT NOT NULL DEFAULT '',
+        PRIMARY KEY (caller_name, callee_name)
+      );
+      CREATE INDEX IF NOT EXISTS function_calls_callee ON function_calls(callee_name);
+    `);
+        // Migration: add caller_file column to existing DBs that predate this schema.
+        try {
+            this.db.exec("ALTER TABLE function_calls ADD COLUMN caller_file TEXT NOT NULL DEFAULT ''");
+        }
+        catch { /* column already exists — fine */ }
+        // Migration: normalise dotted callee names from older extractions.
+        // "engine.handleHTTPRequest" → "handleHTTPRequest", "self.method" → "method", etc.
+        // Uses UPDATE OR IGNORE to skip rows that would violate the (caller_name, callee_name) PK.
+        // Filters exclude Go parenthetical expressions: "(**time.Time)", "(*t).Equal", etc.
+        try {
+            this.db.exec(`
+        UPDATE OR IGNORE function_calls
+        SET callee_name = SUBSTR(callee_name, INSTR(callee_name, '.') + 1)
+        WHERE INSTR(callee_name, '.') > 0
+          AND INSTR(callee_name, '(') = 0
+          AND INSTR(callee_name, ' ') = 0
+          AND INSTR(SUBSTR(callee_name, INSTR(callee_name, '.') + 1), '.') = 0
+          AND INSTR(SUBSTR(callee_name, INSTR(callee_name, '.') + 1), ')') = 0
+      `);
+        }
+        catch { /* non-critical */ }
+        // Vector embeddings for semantic (non-keyword) search.
+        // vec is a Float32Array stored as BLOB (dim=256, model=text-embedding-3-small).
+        // Optional — only populated when OPENAI_API_KEY is present during extract.
+        this.db.exec(`
+      CREATE TABLE IF NOT EXISTS function_embeddings (
+        file_path  TEXT NOT NULL,
+        name       TEXT NOT NULL,
+        line       INTEGER NOT NULL,
+        vec        BLOB NOT NULL,
+        PRIMARY KEY (file_path, name, line)
+      );
+    `);
+        // ── Normalised fact tables (DB-first backend, Phase 1) ─────────────────────
+        //
+        // These tables store raw extracted facts with no rendering or formatting.
+        // Human docs and machine docs remain derived views generated from these facts.
+        // Future: generate.ts and context.ts read from these tables instead of files.
+        this.db.exec(`
+      -- Full FunctionRecord — one row per extracted function/method/symbol.
+      -- calls, string_lits, regex_pats are JSON arrays (compact, no pretty-print).
+      CREATE TABLE IF NOT EXISTS functions_raw (
+        file_path    TEXT NOT NULL,
+        name         TEXT NOT NULL,
+        line_start   INTEGER NOT NULL,
+        line_end     INTEGER NOT NULL,
+        language     TEXT NOT NULL DEFAULT '',
+        is_async     INTEGER NOT NULL DEFAULT 0,
+        docstring    TEXT NOT NULL DEFAULT '',
+        calls        TEXT NOT NULL DEFAULT '[]',
+        string_lits  TEXT NOT NULL DEFAULT '[]',
+        regex_pats   TEXT NOT NULL DEFAULT '[]',
+        PRIMARY KEY (file_path, name, line_start)
+      );
+      -- API endpoint registry — one row per route/handler pair.
+      -- service_calls is a JSON array.
+      CREATE TABLE IF NOT EXISTS endpoints_raw (
+        method          TEXT NOT NULL DEFAULT '',
+        path            TEXT NOT NULL,
+        handler         TEXT NOT NULL DEFAULT '',
+        file_path       TEXT NOT NULL DEFAULT '',
+        module          TEXT NOT NULL DEFAULT '',
+        service_calls   TEXT NOT NULL DEFAULT '[]',
+        request_schema  TEXT NOT NULL DEFAULT '',
+        response_schema TEXT NOT NULL DEFAULT '',
+        PRIMARY KEY (method, path)
+      );
+      -- ORM/data-model registry — one row per model/schema.
+      -- fields and relationships are JSON arrays.
+      CREATE TABLE IF NOT EXISTS models_raw (
+        name           TEXT PRIMARY KEY,
+        file_path      TEXT NOT NULL DEFAULT '',
+        module         TEXT NOT NULL DEFAULT '',
+        fields         TEXT NOT NULL DEFAULT '[]',
+        relationships  TEXT NOT NULL DEFAULT '[]'
+      );
+      -- Structural intelligence per module — one row per SI report.
+      -- Populated by rebuildModuleMetrics() called from guardian intel --backend sqlite.
+      CREATE TABLE IF NOT EXISTS module_metrics (
+        module           TEXT PRIMARY KEY,
+        depth_level      TEXT NOT NULL DEFAULT '',
+        propagation      TEXT NOT NULL DEFAULT '',
+        compressible     TEXT NOT NULL DEFAULT '',
+        pattern          TEXT NOT NULL DEFAULT '',
+        confidence       REAL NOT NULL DEFAULT 0,
+        confidence_level TEXT NOT NULL DEFAULT '',
+        nodes            INTEGER NOT NULL DEFAULT 0,
+        edges            INTEGER NOT NULL DEFAULT 0
+      );
+    `);
+    }
+    // ── Per-function index ──────────────────────────────────────────────────────
+    /**
+     * Populate functions_fts and function_calls from FunctionRecord data.
+     * One row per function/class/symbol — enables line-level search + call-graph authority.
+     */
+    rebuildFunctionIndex(functions) {
+        this.db.prepare("DELETE FROM functions_fts").run();
+        this.db.prepare("DELETE FROM function_calls").run();
+        this.db.prepare("DELETE FROM functions_raw").run();
+        const insFts = this.db.prepare("INSERT INTO functions_fts (file_path, line, name, body) VALUES (?, ?, ?, ?)");
+        const insCall = this.db.prepare("INSERT OR IGNORE INTO function_calls (caller_name, callee_name, caller_file) VALUES (?, ?, ?)");
+        const insRaw = this.db.prepare(`
+      INSERT OR REPLACE INTO functions_raw
+        (file_path, name, line_start, line_end, language, is_async, docstring, calls, string_lits, regex_pats)
+      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+    `);
+        this.db.transaction(() => {
+            for (const fn of functions) {
+                const fp = normPath(fn.file);
+                const line = String(fn.lines[0]);
+                // Store the original name for display and symbolMatch comparison.
+                // FTS5 porter/unicode61 tokenizer splits on '_' naturally;
+                // camelCase tokens are added to body so porter stemming applies to them too.
+                const pathToks = filePathTokens(fp);
+                const bodyParts = [
+                    splitIdentifiers(fn.name), // camelCase expansion for FTS recall
+                    pathToks, // dir segments: "fastapi dependencies" etc.
+                    ...(fn.calls ?? []).map(c => splitIdentifiers(normalizeCallee(c))),
+                    ...(fn.stringLiterals ?? []),
+                    fn.docstring ?? "",
+                ].join(" ");
+                insFts.run(fp, line, fn.name, bodyParts);
+                // Store call edges — normalise callee names to bare identifiers so the JOIN
+                // in searchSymbols matches function names (strips "engine.", "self.", etc.).
+                for (const callee of fn.calls ?? []) {
+                    const bare = normalizeCallee(callee);
+                    if (bare && bare !== fn.name)
+                        insCall.run(fn.name, bare, fp);
+                }
+                // Normalised fact row — all fields stored losslessly, no rendering.
+                insRaw.run(fp, fn.name, fn.lines[0], fn.lines[1], fn.language ?? "", fn.isAsync ? 1 : 0, fn.docstring ?? "", JSON.stringify(fn.calls ?? []), JSON.stringify(fn.stringLiterals ?? []), JSON.stringify(fn.regexPatterns ?? []));
+            }
+        })();
+    }
+    /**
+     * Store structural-intelligence reports per module.
+     * Called from `guardian intel --backend sqlite` after reading structural-intelligence.json.
+     * Idempotent: replaces all rows on each call.
+     */
+    rebuildModuleMetrics(reports) {
+        this.db.prepare("DELETE FROM module_metrics").run();
+        const ins = this.db.prepare(`
+      INSERT OR REPLACE INTO module_metrics
+        (module, depth_level, propagation, compressible, pattern, confidence, confidence_level, nodes, edges)
+      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+    `);
+        this.db.transaction(() => {
+            for (const r of reports) {
+                ins.run(r.feature, r.classification.depth_level, r.classification.propagation, r.classification.compressible, r.recommendation.primary.pattern, r.confidence.value, r.confidence.level, r.structure.nodes, r.structure.edges);
+            }
+        })();
+    }
+    /**
+     * Read all module_metrics rows — used by generate/context to load SI reports from DB.
+     */
+    readModuleMetrics() {
+        try {
+            return this.db.prepare("SELECT * FROM module_metrics ORDER BY module").all();
+        }
+        catch {
+            return [];
+        }
+    }
+    /**
+     * Store API endpoint facts.
+     * Called from populateFTSIndex() after reading intel/arch objects.
+     * Idempotent: replaces all rows on each call.
+     */
+    rebuildEndpointsRaw(endpoints) {
+        this.db.prepare("DELETE FROM endpoints_raw").run();
+        const ins = this.db.prepare(`
+      INSERT OR REPLACE INTO endpoints_raw
+        (method, path, handler, file_path, module, service_calls, request_schema, response_schema)
+      VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+    `);
+        this.db.transaction(() => {
+            for (const ep of endpoints) {
+                ins.run(ep.method ?? "", ep.path, ep.handler ?? "", normPath(ep.file_path ?? ""), ep.module ?? "", JSON.stringify(ep.service_calls ?? []), ep.request_schema ?? "", ep.response_schema ?? "");
+            }
+        })();
+    }
+    /**
+     * Store ORM/data-model facts.
+     * Called from populateFTSIndex() after reading intel/arch objects.
+     * Idempotent: replaces all rows on each call.
+     */
+    rebuildModelsRaw(models) {
+        this.db.prepare("DELETE FROM models_raw").run();
+        const ins = this.db.prepare(`
+      INSERT OR REPLACE INTO models_raw
+        (name, file_path, module, fields, relationships)
+      VALUES (?, ?, ?, ?, ?)
+    `);
+        this.db.transaction(() => {
+            for (const m of models) {
+                ins.run(m.name, normPath(m.file_path ?? ""), m.module ?? "", JSON.stringify(m.fields ?? []), JSON.stringify(m.relationships ?? []));
+            }
+        })();
+    }
+    /**
+     * Store vector embeddings for semantic search.
+     * vec is a Float32Array serialised as Buffer (dim=256, text-embedding-3-small).
+     */
+    rebuildEmbeddings(rows) {
+        this.db.prepare("DELETE FROM function_embeddings").run();
+        const ins = this.db.prepare("INSERT OR REPLACE INTO function_embeddings (file_path, name, line, vec) VALUES (?, ?, ?, ?)");
+        this.db.transaction(() => {
+            for (const r of rows) {
+                ins.run(r.file_path, r.name, r.line, Buffer.from(r.vec.buffer));
+            }
+        })();
+    }
+    /**
+     * Vector similarity search — returns top-k functions closest to the query embedding.
+     * Cosine similarity computed in JS over all stored embeddings (fast for <100k functions).
+     */
+    searchByVector(queryVec, limit = 20) {
+        let all;
+        try {
+            all = this.db.prepare("SELECT file_path, name, line, vec FROM function_embeddings").all();
+        }
+        catch {
+            return [];
+        }
+        if (all.length === 0)
+            return [];
+        // Normalise query vector once.
+        const qNorm = vecNorm(queryVec);
+        if (qNorm === 0)
+            return [];
+        const qUnit = queryVec.map(v => v / qNorm);
+        const scored = all.map(row => {
+            const vec = new Float32Array(row.vec.buffer, row.vec.byteOffset, row.vec.byteLength / 4);
+            return { file_path: row.file_path, name: row.name, line: row.line, score: cosineSim(qUnit, vec) };
+        });
+        scored.sort((a, b) => b.score - a.score);
+        return scored.slice(0, limit);
+    }
+    /**
+     * Hybrid symbol search: BM25 + call-graph authority + callee traversal + optional vector.
+     *
+     * Two-tier candidate pool:
+     *   1. BM25 candidates  — direct FTS matches, scored by bm25_norm + auth_norm + vec_sim,
+     *                         plus a fixed bonus for weak BM25 hits in source files that
+     *                         are also callees of other candidates.
+     *   2. Callee expansion — 1-hop outbound callees of the first 30 BM25 candidates
+     *                         (callers in test-like files excluded), scored by
+     *                         callee_hits_norm + vec_sim.
+     *                         Surfaces functions called BY what matches the query (e.g.
+     *                         "resolve dependency injection" → surfaces solve_dependencies
+     *                         called by the route handler BM25 match).
+     *
+     * Ranking formula:
+     *   BM25 tier:   W_BM25 * bm25_norm   + W_AUTH * auth_norm + W_VEC * vec_sim + callee_bonus
+     *   Callee tier: W_CALLEE * hits_norm  + W_CA_VEC * vec_sim   (no authority term)
+     *
+     * Non-source penalty: 0.5× applied to any result whose filename matches
+     * test/spec/bench/mock/fixture or whose path contains an example(s)/demo(s)/sample(s)
+     * directory segment.
+     *
+     * NOTE(review): bm25() weights are positional over the table's columns, and
+     * functions_fts is declared as (file_path, line, name, body). The weights
+     * 0.2 / 1.0 / 0.5 therefore appear to land on file_path / line / name, with body
+     * taking the default — confirm this is the intended weighting.
+     *
+     * @param query     Free-text query; tokenised by this._buildTokens and OR-joined.
+     * @param limit     Maximum number of results (default 10).
+     * @param queryVec  Optional query embedding; when it yields at least one score for
+     *                  a candidate, the vector-aware weight table is used.
+     * @returns Array of `{ file_path, name, line, score }` sorted by descending score.
+     *          Empty when the query yields no tokens, the FTS query throws, or nothing matches.
+     */
+    searchSymbols(query, limit = 10, queryVec) {
+        const tokens = this._buildTokens(query);
+        if (tokens.length === 0)
+            return [];
+        const ftsQuery = tokens.join(" OR ");
+        // Pull a wider candidate pool so reranking has enough material.
+        const candidateLimit = Math.max(limit * 5, 60);
+        let rows;
+        try {
+            rows = this.db.prepare(`
+        WITH candidates AS (
+          SELECT file_path, line, name,
+                 bm25(functions_fts, 0.2, 1.0, 0.5) AS bm25
+          FROM functions_fts
+          WHERE functions_fts MATCH ?
+          ORDER BY bm25
+          LIMIT ?
+        )
+        SELECT c.file_path, c.line, c.name, c.bm25,
+               COUNT(CASE
+                 WHEN fc.caller_file NOT LIKE '%test%'
+                  AND fc.caller_file NOT LIKE '%spec%'
+                  AND fc.caller_file NOT LIKE '%mock%'
+                  AND fc.caller_file NOT LIKE '%fixture%'
+                  AND fc.caller_file NOT LIKE '%example%'
+                  AND fc.caller_file NOT LIKE '%demo%'
+                  AND fc.caller_file NOT LIKE '%sample%'
+                 THEN 1 END) AS indegree
+        FROM candidates c
+        LEFT JOIN function_calls fc ON fc.callee_name = c.name
+        GROUP BY c.file_path, c.line, c.name, c.bm25
+        ORDER BY c.bm25
+      `).all(ftsQuery, candidateLimit);
+        }
+        catch {
+            return [];
+        }
+        if (rows.length === 0)
+            return [];
+        const bm25Names = rows.map(r => r.name);
+        const bm25NameSet = new Set(bm25Names);
+        let calleeRows = [];
+        if (bm25Names.length > 0) {
+            try {
+                // Limit IN clause to avoid excess query plan cost on large candidate pools.
+                const callerNames = bm25Names.slice(0, 30);
+                const phs = callerNames.map(() => "?").join(",");
+                calleeRows = this.db.prepare(`
+          SELECT f.file_path, f.line, f.name,
+                 COUNT(*) AS callee_hits
+          FROM function_calls fc
+          JOIN functions_fts f ON f.name = fc.callee_name
+          WHERE fc.caller_name IN (${phs})
+            AND fc.caller_file NOT LIKE '%test%'
+            AND fc.caller_file NOT LIKE '%spec%'
+            AND fc.caller_file NOT LIKE '%mock%'
+            AND fc.caller_file NOT LIKE '%fixture%'
+            AND fc.caller_file NOT LIKE '%example%'
+            AND fc.caller_file NOT LIKE '%demo%'
+            AND fc.caller_file NOT LIKE '%sample%'
+          GROUP BY f.file_path, f.line, f.name
+          ORDER BY callee_hits DESC
+          LIMIT ?
+        `).all(...callerNames, 40);
+            }
+            catch { /* graceful — callee expansion is additive only */ }
+        }
+        // Build the callee membership set BEFORE removing BM25 overlap.
+        // This is used to apply a score bonus to BM25-tier functions that are also
+        // call-graph targets (e.g. handleHTTPRequest: low BM25 rank, but called by ServeHTTP).
+        const calleeNameSet = new Set(calleeRows.map(r => r.name));
+        // Remove BM25 names from the separate callee tier to avoid double-counting.
+        calleeRows = calleeRows.filter(r => !bm25NameSet.has(r.name));
+        // ── Normalisation scalars ─────────────────────────────────────────────────
+        // BM25: negative (more negative = better), invert then normalise.
+        const bm25Scores = rows.map(r => -r.bm25);
+        const bm25Max = Math.max(...bm25Scores);
+        const bm25Min = Math.min(...bm25Scores);
+        const bm25Range = bm25Max - bm25Min || 1;
+        // In-degree: BM25 tier only (callees don't carry their own in-degree here).
+        const indegreeMax = Math.max(...rows.map(r => r.indegree)) || 1;
+        // Callee hits: normalise by pool size so a single edge (1 of N candidates) = tiny score.
+        // This prevents callee expansion from flooding results when the BM25 signal is weak.
+        const maxCalleeHits = Math.max(bm25Names.slice(0, 30).length, 1);
+        // ── Vector scores (optional) ──────────────────────────────────────────────
+        const vecScores = new Map();
+        if (queryVec) {
+            const allNames = new Set([...bm25Names, ...calleeRows.map(r => r.name)]);
+            const vecResults = this.searchByVector(queryVec, candidateLimit * 2);
+            for (const v of vecResults) {
+                if (allNames.has(v.name))
+                    vecScores.set(`${v.file_path}::${v.name}::${v.line}`, v.score);
+            }
+        }
+        const hasVec = vecScores.size > 0;
+        // ── Weight tables ─────────────────────────────────────────────────────────
+        const W_BM25 = hasVec ? 0.50 : 0.70;
+        const W_AUTH = hasVec ? 0.20 : 0.30;
+        const W_VEC = hasVec ? 0.30 : 0.00;
+        // Callee tier: scored on hit count + vector (no separate in-degree).
+        const W_CALLEE = hasVec ? 0.35 : 0.45;
+        const W_CA_VEC = hasVec ? 0.30 : 0.00;
+        // Callee bonus: applied only to BM25-tier functions with WEAK BM25 signal (bm25Norm < 0.20)
+        // that also appear in the callee expansion. This targets long-tail pool members like
+        // handleHTTPRequest (rank ~90/150) without boosting already-competitive functions
+        // (e.g. render_template at rank ~40) which could displace authority-ranked results.
+        const W_CALLEE_BONUS = 0.28;
+        const CALLEE_BM25_THRESHOLD = 0.20; // only boost if bm25Norm below this
+        // ── Test/example-file penalty ─────────────────────────────────────────────
+        // Applied 0.5× to test files and example/demo/sample directories.
+        // Checks both the filename AND all directory segments so that files like
+        // "examples/static-files/index.js" are caught even if their basename is generic.
+        const TEST_PENALTY = 0.50;
+        const isNonSourceFile = (fp) => {
+            const parts = fp.split("/");
+            const filename = parts[parts.length - 1] ?? fp;
+            return /test|spec|bench|mock|fixture/i.test(filename) ||
+                parts.some(p => /^examples?$|^demos?$|^samples?$/i.test(p));
+        };
+        // ── Score BM25 tier ───────────────────────────────────────────────────────
+        const scored = rows.map(r => {
+            const bm25Norm = ((-r.bm25) - bm25Min) / bm25Range;
+            const authNorm = r.indegree / indegreeMax;
+            const key = `${r.file_path}::${r.name}::${r.line}`;
+            const vecSim = vecScores.get(key) ?? 0;
+            // Callee bonus only applies to source-file functions (test files are already penalised).
+            const calleeBonus = (!isNonSourceFile(r.file_path) && calleeNameSet.has(r.name)
+                && bm25Norm < CALLEE_BM25_THRESHOLD)
+                ? W_CALLEE_BONUS : 0;
+            const raw = W_BM25 * bm25Norm + W_AUTH * authNorm + W_VEC * vecSim + calleeBonus;
+            return { file_path: r.file_path, name: r.name, line: parseInt(r.line, 10),
+                score: isNonSourceFile(r.file_path) ? raw * TEST_PENALTY : raw };
+        });
+        // ── Score callee tier (functions NOT in BM25 pool) and merge ─────────────
+        for (const r of calleeRows) {
+            const hitsNorm = r.callee_hits / maxCalleeHits;
+            const key = `${r.file_path}::${r.name}::${r.line}`;
+            const vecSim = vecScores.get(key) ?? 0;
+            const raw = W_CALLEE * hitsNorm + W_CA_VEC * vecSim;
+            scored.push({ file_path: r.file_path, name: r.name, line: parseInt(r.line, 10),
+                score: isNonSourceFile(r.file_path) ? raw * TEST_PENALTY : raw });
+        }
+        scored.sort((a, b) => b.score - a.score);
+        return scored.slice(0, limit);
     }
     // ── Dependency graph ────────────────────────────────────────────────────────
     /** Replace all import edges (run once per guardian extract --backend sqlite). */
@@ -411,6 +888,9 @@ export class SqliteSpecsStore {
         catch {
             return [];
         }
+        // Build a set of bare query stems for matching_symbols computation.
+        // Strip the trailing '*' added by _buildTokens so we can do prefix matching.
+        const queryStems = tokens.map(t => t.replace(/\*$/, ""));
         // Apply quality reranking using dependency-graph authority score.
         const reranked = rows.map(r => {
             const imports = r.imports_ ? r.imports_.split(",").filter(Boolean) : [];
@@ -426,14 +906,27 @@ export class SqliteSpecsStore {
                 // authority_ratio ∈ [0, 1]: 1.0 = pure authority (many things import this file)
                 //                            0.0 = pure hub (imports many, nothing imports it)
                 const authority = usedByN / (usedByN + importsN);
-                // Gentle nudge: [0.7, 1.0] — hubs are demoted by at most 30%.
-                // BM25 relevance still dominates; this is a tiebreaker, not a hard filter.
+                // Range [0.7, 1.0]: hub files that import many things but aren't imported
+                // get a slight penalty vs authority files. Explicit path penalty handles examples.
                 quality = 0.7 + 0.3 * authority;
             }
+            // Path-based hard penalty for example/demo/sample directories — belt-and-suspenders
+            // on top of the authority demotion, for repos where dep graph may be sparse.
+            const pathParts = r.file_path.split("/");
+            if (pathParts.some(p => /^examples?$|^demos?$|^samples?$/i.test(p))) {
+                quality *= 0.5;
+            }
             // bm25 is negative (more negative = better). Multiplying by quality < 1
             // moves the score toward 0 — making low-quality files rank worse.
             const combined = r.rank * quality;
-            return { file_path: r.file_path, symbol_name: r.symbol_name, rank: combined, imports, used_by };
+            // Snippet equivalent: which named symbols in this file match query stems?
+            // symbol_name is a space-separated list of all symbols extracted from the file.
+            const fileSymbols = r.symbol_name ? r.symbol_name.split(/\s+/).filter(Boolean) : [];
+            const matching_symbols = fileSymbols.filter(sym => {
+                const symLower = splitIdentifiers(sym); // "isPublished" → "is published"
+                return queryStems.some(stem => symLower.includes(stem) || sym.toLowerCase().includes(stem));
+            }).slice(0, 6); // cap at 6 per file
+            return { file_path: r.file_path, symbol_name: r.symbol_name, rank: combined, imports, used_by, matching_symbols };
         });
         reranked.sort((a, b) => a.rank - b.rank);
         return reranked.slice(0, limit);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@toolbaux/guardian",
-  "version": "0.1.23",
+  "version": "0.2.0",
   "type": "module",
   "description": "Architectural intelligence for codebases. Verify that AI-generated code matches your architectural intent.",
   "keywords": [
@@ -53,6 +53,7 @@
     "benchmark:llm": "tsx scripts/benchmark-llm-context/index.ts"
   },
   "dependencies": {
+    "@xenova/transformers": "^2.17.2",
     "better-sqlite3": "^12.8.0",
     "commander": "^12.1.0",
     "dotenv": "^17.3.1",