kiri-mcp-server 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -1
- package/config/scoring-profiles.yml +82 -35
- package/dist/config/scoring-profiles.yml +82 -35
- package/dist/package.json +9 -1
- package/dist/src/indexer/cli.d.ts.map +1 -1
- package/dist/src/indexer/cli.js +712 -98
- package/dist/src/indexer/cli.js.map +1 -1
- package/dist/src/indexer/git.d.ts.map +1 -1
- package/dist/src/indexer/git.js +41 -3
- package/dist/src/indexer/git.js.map +1 -1
- package/dist/src/indexer/migrations/repo-merger.d.ts +33 -0
- package/dist/src/indexer/migrations/repo-merger.d.ts.map +1 -0
- package/dist/src/indexer/migrations/repo-merger.js +67 -0
- package/dist/src/indexer/migrations/repo-merger.js.map +1 -0
- package/dist/src/indexer/schema.d.ts +66 -0
- package/dist/src/indexer/schema.d.ts.map +1 -1
- package/dist/src/indexer/schema.js +337 -0
- package/dist/src/indexer/schema.js.map +1 -1
- package/dist/src/server/boost-profiles.d.ts +1 -1
- package/dist/src/server/boost-profiles.d.ts.map +1 -1
- package/dist/src/server/boost-profiles.js +116 -0
- package/dist/src/server/boost-profiles.js.map +1 -1
- package/dist/src/server/config.d.ts +45 -0
- package/dist/src/server/config.d.ts.map +1 -0
- package/dist/src/server/config.js +146 -0
- package/dist/src/server/config.js.map +1 -0
- package/dist/src/server/context.d.ts +29 -0
- package/dist/src/server/context.d.ts.map +1 -1
- package/dist/src/server/context.js +26 -1
- package/dist/src/server/context.js.map +1 -1
- package/dist/src/server/handlers/snippets-get.d.ts +36 -0
- package/dist/src/server/handlers/snippets-get.d.ts.map +1 -0
- package/dist/src/server/handlers/snippets-get.js +120 -0
- package/dist/src/server/handlers/snippets-get.js.map +1 -0
- package/dist/src/server/handlers.d.ts +32 -20
- package/dist/src/server/handlers.d.ts.map +1 -1
- package/dist/src/server/handlers.js +1554 -338
- package/dist/src/server/handlers.js.map +1 -1
- package/dist/src/server/indexBootstrap.d.ts.map +1 -1
- package/dist/src/server/indexBootstrap.js +49 -2
- package/dist/src/server/indexBootstrap.js.map +1 -1
- package/dist/src/server/main.d.ts.map +1 -1
- package/dist/src/server/main.js +7 -0
- package/dist/src/server/main.js.map +1 -1
- package/dist/src/server/profile-selector.d.ts +33 -0
- package/dist/src/server/profile-selector.d.ts.map +1 -0
- package/dist/src/server/profile-selector.js +291 -0
- package/dist/src/server/profile-selector.js.map +1 -0
- package/dist/src/server/rpc.d.ts.map +1 -1
- package/dist/src/server/rpc.js +36 -6
- package/dist/src/server/rpc.js.map +1 -1
- package/dist/src/server/runtime.d.ts.map +1 -1
- package/dist/src/server/runtime.js +14 -4
- package/dist/src/server/runtime.js.map +1 -1
- package/dist/src/server/scoring.d.ts +7 -1
- package/dist/src/server/scoring.d.ts.map +1 -1
- package/dist/src/server/scoring.js +121 -21
- package/dist/src/server/scoring.js.map +1 -1
- package/dist/src/server/services/index.d.ts +24 -0
- package/dist/src/server/services/index.d.ts.map +1 -0
- package/dist/src/server/services/index.js +20 -0
- package/dist/src/server/services/index.js.map +1 -0
- package/dist/src/server/services/repo-repository.d.ts +61 -0
- package/dist/src/server/services/repo-repository.d.ts.map +1 -0
- package/dist/src/server/services/repo-repository.js +93 -0
- package/dist/src/server/services/repo-repository.js.map +1 -0
- package/dist/src/server/services/repo-resolver.d.ts +28 -0
- package/dist/src/server/services/repo-resolver.d.ts.map +1 -0
- package/dist/src/server/services/repo-resolver.js +62 -0
- package/dist/src/server/services/repo-resolver.js.map +1 -0
- package/dist/src/shared/duckdb.d.ts.map +1 -1
- package/dist/src/shared/duckdb.js +21 -1
- package/dist/src/shared/duckdb.js.map +1 -1
- package/dist/src/shared/fs/safePath.d.ts +7 -0
- package/dist/src/shared/fs/safePath.d.ts.map +1 -0
- package/dist/src/shared/fs/safePath.js +23 -0
- package/dist/src/shared/fs/safePath.js.map +1 -0
- package/dist/src/shared/utils/glob.d.ts +5 -0
- package/dist/src/shared/utils/glob.d.ts.map +1 -0
- package/dist/src/shared/utils/glob.js +22 -0
- package/dist/src/shared/utils/glob.js.map +1 -0
- package/dist/src/shared/utils/retry.d.ts +8 -0
- package/dist/src/shared/utils/retry.d.ts.map +1 -0
- package/dist/src/shared/utils/retry.js +20 -0
- package/dist/src/shared/utils/retry.js.map +1 -0
- package/package.json +28 -22
package/dist/src/indexer/cli.js
CHANGED
@@ -1,21 +1,110 @@
 import { createHash } from "node:crypto";
 import { existsSync } from "node:fs";
-import { readFile, stat } from "node:fs/promises";
-import { join, resolve, extname } from "node:path";
+import { readFile, readdir, stat } from "node:fs/promises";
+import { join, resolve, extname, posix as pathPosix } from "node:path";
 import { pathToFileURL } from "node:url";
+import { parse as parseYAML } from "yaml";
 import { DuckDBClient } from "../shared/duckdb.js";
 import { generateEmbedding } from "../shared/embedding.js";
 import { acquireLock, releaseLock, LockfileError, getLockOwner } from "../shared/utils/lockfile.js";
-import { normalizeDbPath, ensureDbParentDir, getRepoPathCandidates } from "../shared/utils/path.js";
+import { normalizeDbPath, normalizeRepoPath, ensureDbParentDir, getRepoPathCandidates, } from "../shared/utils/path.js";
 import { analyzeSource, buildFallbackSnippet } from "./codeintel.js";
 import { getDefaultBranch, getHeadCommit, gitLsFiles, gitDiffNameOnly } from "./git.js";
 import { detectLanguage } from "./language.js";
+import { mergeRepoRecords } from "./migrations/repo-merger.js";
 import { getIndexerQueue } from "./queue.js";
-import { ensureBaseSchema, ensureRepoMetaColumns, rebuildFTSIfNeeded } from "./schema.js";
+import { ensureBaseSchema, ensureDocumentMetadataTables, ensureNormalizedRootColumn, ensureRepoMetaColumns, rebuildFTSIfNeeded, } from "./schema.js";
 import { IndexWatcher } from "./watch.js";
+function normalizePathForIndex(value) {
+    return value.replace(/\\/g, "/");
+}
+function ensurePairState(stateMap, path) {
+    const existing = stateMap.get(path);
+    if (existing) {
+        return existing;
+    }
+    const created = { count: 0, seen: new Set() };
+    stateMap.set(path, created);
+    return created;
+}
 const MAX_SAMPLE_BYTES = 32_768;
 const MAX_FILE_BYTES = 32 * 1024 * 1024; // 32MB limit to prevent memory exhaustion
 const SCAN_BATCH_SIZE = 100; // Process files in batches to limit memory usage
+const MARKDOWN_EXTENSIONS = new Set([".md", ".mdx", ".markdown"]);
+const DOCMETA_SNAPSHOT_DIR = "docmeta/";
+const DOCMETA_SNAPSHOT_TARGET_FIELD = "target_path";
+const DOCMETA_SNAPSHOT_DATA_FIELD = "front_matter";
+/**
+ * Metadata processing limits to prevent DoS attacks and memory exhaustion.
+ *
+ * These values balance security, performance, and real-world usage patterns.
+ * Adjust based on:
+ * - Performance testing with 10000+ file repositories
+ * - Memory profiling (Node.js heap size impact)
+ * - Analysis of 99th percentile values in production data
+ */
+/**
+ * Maximum length of a single metadata value (characters).
+ *
+ * Rationale: Typical YAML front matter fields (title, description) are 200-300 chars.
+ * Setting to 512 provides headroom while preventing abuse.
+ *
+ * Example use cases:
+ * - Document titles: ~100 chars
+ * - Descriptions: ~300 chars
+ * - Tags (as comma-separated string): ~200 chars
+ */
+const MAX_METADATA_VALUE_LENGTH = 512;
+/**
+ * Maximum nesting depth for metadata tree structures.
+ *
+ * Rationale: Normal YAML/JSON documents nest 3-5 levels deep.
+ * Setting to 8 accommodates complex configurations while preventing stack overflow.
+ *
+ * Defense: Prevents malicious deeply-nested documents from causing:
+ * - Stack overflow (recursive function calls)
+ * - Exponential memory growth
+ * - CPU exhaustion during traversal
+ */
+const MAX_METADATA_DEPTH = 8;
+/**
+ * Maximum number of elements in a metadata array.
+ *
+ * Rationale: Common use case is tags/categories arrays with ~10 items.
+ * Setting to 64 provides generous headroom for edge cases.
+ *
+ * Example arrays:
+ * - Tags: ["frontend", "react", "typescript"] (~3-10 items)
+ * - Authors: ["John Doe", "Jane Smith"] (~1-5 items)
+ * - Categories: ["guide", "tutorial", "api"] (~2-8 items)
+ */
+const MAX_METADATA_ARRAY_LENGTH = 64;
+/**
+ * Maximum number of key-value pairs extracted per file.
+ *
+ * Rationale: Memory footprint calculation:
+ * - 256 pairs × ~40 bytes/pair ≈ 10KB per file
+ * - For 10000 files: 10KB × 10000 = 100MB (acceptable overhead)
+ *
+ * Prevents DoS from files with thousands of metadata fields.
+ * Normal documents have 5-20 metadata fields.
+ */
+const MAX_METADATA_PAIRS_PER_FILE = 256;
+/**
+ * Maximum number of object keys processed in a metadata tree node.
+ *
+ * Rationale: Prevents memory exhaustion from maliciously crafted objects with excessive keys.
+ * Normal metadata objects have 5-20 keys. Setting to 256 provides generous headroom.
+ *
+ * Memory impact: Each key entry requires ~50 bytes (key name + value reference).
+ * 256 keys × 50 bytes ≈ 12.8KB per object, which is acceptable.
+ */
+const MAX_METADATA_OBJECT_KEYS = 256;
+/**
+ * Key name used for root-level scalar values in metadata trees.
+ * Internal use only - not exposed in search results.
+ */
+const ROOT_METADATA_KEY = "__root";
 /**
  * Maximum number of SQL placeholders per INSERT statement.
  *
@@ -72,43 +161,17 @@ function isBinaryBuffer(buffer) {
  * @param defaultBranch - Default branch name (e.g., "main", "master"), or null if unknown
  * @returns The repository ID (auto-generated on first insert, reused thereafter)
  */
-async function mergeLegacyRepoRows(db, canonicalRepoId, legacyRepoIds) {
-    if (legacyRepoIds.length === 0) {
-        return;
-    }
-    const referencingTables = await db.all(`SELECT DISTINCT c.table_name
-        FROM duckdb_columns() AS c
-        JOIN duckdb_tables() AS t
-            ON c.database_name = t.database_name
-            AND c.schema_name = t.schema_name
-            AND c.table_name = t.table_name
-        WHERE c.column_name = 'repo_id'
-            AND c.table_name <> 'repo'
-            AND t.table_type = 'BASE TABLE'`);
-    const safeTables = referencingTables
-        .map((row) => row.table_name)
-        .filter((name) => /^[A-Za-z0-9_]+$/.test(name));
-    await db.transaction(async () => {
-        for (const legacyRepoId of legacyRepoIds) {
-            for (const tableName of safeTables) {
-                await db.run(`UPDATE ${tableName} SET repo_id = ? WHERE repo_id = ?`, [
-                    canonicalRepoId,
-                    legacyRepoId,
-                ]);
-            }
-            await db.run("DELETE FROM repo WHERE id = ?", [legacyRepoId]);
-        }
-    });
-}
 async function ensureRepo(db, repoRoot, defaultBranch, candidateRoots) {
     const searchRoots = Array.from(new Set([repoRoot, ...(candidateRoots ?? [])]));
     const placeholders = searchRoots.map(() => "?").join(", ");
     let rows = await db.all(`SELECT id, root FROM repo WHERE root IN (${placeholders})`, searchRoots);
     if (rows.length === 0) {
-
-
+        const normalized = normalizeRepoPath(repoRoot);
+        await db.run(`INSERT INTO repo (root, normalized_root, default_branch, indexed_at)
+            VALUES (?, ?, ?, CURRENT_TIMESTAMP)
             ON CONFLICT(root) DO UPDATE SET
-
+                normalized_root = excluded.normalized_root,
+                default_branch = COALESCE(excluded.default_branch, repo.default_branch)`, [repoRoot, normalized, defaultBranch]);
         rows = await db.all(`SELECT id, root FROM repo WHERE root IN (${placeholders})`, searchRoots);
     }
     if (rows.length === 0) {
@@ -123,7 +186,7 @@ async function ensureRepo(db, repoRoot, defaultBranch, candidateRoots) {
         canonicalRow = { ...canonicalRow, root: repoRoot };
     }
     const legacyIds = rows.filter((row) => row.id !== canonicalRow.id).map((row) => row.id);
-    await
+    await mergeRepoRecords(db, canonicalRow.id, legacyIds);
     return canonicalRow.id;
 }
 /**
@@ -302,6 +365,491 @@ async function persistEmbeddings(db, repoId, records) {
         ]),
     }));
 }
+async function persistDocumentMetadata(db, repoId, records) {
+    if (records.length === 0)
+        return;
+    const BATCH_SIZE = calculateBatchSize(4);
+    await persistInBatches(db, records, BATCH_SIZE, (batch) => ({
+        sql: `INSERT OR REPLACE INTO document_metadata (repo_id, path, source, data) VALUES ${batch.map(() => "(?, ?, ?, ?)").join(", ")}`,
+        params: batch.flatMap((record) => [
+            repoId,
+            record.path,
+            record.source,
+            JSON.stringify(record.data),
+        ]),
+    }));
+}
+async function persistMetadataPairs(db, repoId, records) {
+    if (records.length === 0)
+        return;
+    const BATCH_SIZE = calculateBatchSize(5);
+    await persistInBatches(db, records, BATCH_SIZE, (batch) => ({
+        sql: `INSERT OR REPLACE INTO document_metadata_kv (repo_id, path, source, key, value) VALUES ${batch.map(() => "(?, ?, ?, ?, ?)").join(", ")}`,
+        params: batch.flatMap((record) => [
+            repoId,
+            record.path,
+            record.source,
+            record.key,
+            record.value,
+        ]),
+    }));
+}
+async function persistMarkdownLinks(db, repoId, records) {
+    if (records.length === 0)
+        return;
+    const BATCH_SIZE = calculateBatchSize(6);
+    await persistInBatches(db, records, BATCH_SIZE, (batch) => ({
+        sql: `INSERT OR REPLACE INTO markdown_link (repo_id, src_path, target, resolved_path, anchor_text, kind) VALUES ${batch.map(() => "(?, ?, ?, ?, ?, ?)").join(", ")}`,
+        params: batch.flatMap((record) => [
+            repoId,
+            record.srcPath,
+            record.target,
+            record.resolvedPath,
+            record.anchorText,
+            record.kind,
+        ]),
+    }));
+}
+function sanitizeMetadataTree(value, depth = 0) {
+    // Depth check at the beginning to prevent stack overflow
+    if (depth > MAX_METADATA_DEPTH) {
+        console.warn(`Metadata depth limit (${MAX_METADATA_DEPTH}) exceeded, truncating nested value`);
+        return null;
+    }
+    if (value === null || value === undefined) {
+        return null;
+    }
+    if (value instanceof Date) {
+        return value.toISOString();
+    }
+    if (typeof value === "string") {
+        const trimmed = value.trim();
+        if (trimmed.length === 0) {
+            return null;
+        }
+        return trimmed.length > MAX_METADATA_VALUE_LENGTH
+            ? trimmed.slice(0, MAX_METADATA_VALUE_LENGTH)
+            : trimmed;
+    }
+    if (typeof value === "number") {
+        if (!Number.isFinite(value)) {
+            return null;
+        }
+        return value;
+    }
+    if (typeof value === "boolean") {
+        return value;
+    }
+    if (Array.isArray(value)) {
+        if (value.length === 0) {
+            return null;
+        }
+        // Warn if array is too large
+        if (value.length > MAX_METADATA_ARRAY_LENGTH) {
+            console.warn(`Metadata array has ${value.length} elements, limiting to ${MAX_METADATA_ARRAY_LENGTH}`);
+        }
+        const sanitized = [];
+        for (const item of value.slice(0, MAX_METADATA_ARRAY_LENGTH)) {
+            const child = sanitizeMetadataTree(item, depth + 1);
+            if (child !== null) {
+                sanitized.push(child);
+            }
+        }
+        return sanitized.length > 0 ? sanitized : null;
+    }
+    if (typeof value === "object") {
+        const result = {};
+        const entries = Object.entries(value);
+        // Limit number of object keys to prevent memory exhaustion
+        if (entries.length > MAX_METADATA_OBJECT_KEYS) {
+            console.warn(`Object has ${entries.length} keys, limiting to ${MAX_METADATA_OBJECT_KEYS} to prevent memory exhaustion`);
+        }
+        for (const [key, child] of entries.slice(0, MAX_METADATA_OBJECT_KEYS)) {
+            if (!key)
+                continue;
+            const sanitizedChild = sanitizeMetadataTree(child, depth + 1);
+            if (sanitizedChild !== null) {
+                result[key] = sanitizedChild;
+            }
+        }
+        return Object.keys(result).length > 0 ? result : null;
+    }
+    return null;
+}
+function metadataValueToString(value) {
+    if (typeof value === "string") {
+        return value;
+    }
+    if (typeof value === "number") {
+        return Number.isFinite(value) ? value.toString() : "";
+    }
+    return value ? "true" : "false";
+}
+function collectMetadataPairsFromValue(value, path, source, pairs, state, keyPrefix = "") {
+    if (state.count >= MAX_METADATA_PAIRS_PER_FILE) {
+        return;
+    }
+    if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
+        const key = keyPrefix.length > 0 ? keyPrefix : ROOT_METADATA_KEY;
+        let normalized = metadataValueToString(value).trim();
+        if (normalized.length === 0) {
+            return;
+        }
+        if (normalized.length > MAX_METADATA_VALUE_LENGTH) {
+            normalized = normalized.slice(0, MAX_METADATA_VALUE_LENGTH);
+        }
+        const dedupeKey = `${source}:${key}:${normalized.toLowerCase()}`;
+        if (state.seen.has(dedupeKey)) {
+            return;
+        }
+        state.seen.add(dedupeKey);
+        pairs.push({ path, source, key, value: normalized });
+        state.count += 1;
+        return;
+    }
+    if (Array.isArray(value)) {
+        for (const item of value) {
+            collectMetadataPairsFromValue(item, path, source, pairs, state, keyPrefix);
+            if (state.count >= MAX_METADATA_PAIRS_PER_FILE) {
+                break;
+            }
+        }
+        return;
+    }
+    if (typeof value === "object" && value !== null) {
+        for (const [childKey, childValue] of Object.entries(value)) {
+            const normalizedKey = childKey.toLowerCase();
+            const nextPrefix = keyPrefix.length > 0 ? `${keyPrefix}.${normalizedKey}` : normalizedKey;
+            collectMetadataPairsFromValue(childValue, path, source, pairs, state, nextPrefix);
+            if (state.count >= MAX_METADATA_PAIRS_PER_FILE) {
+                break;
+            }
+        }
+    }
+}
+function parseFrontMatterBlock(content, path) {
+    const leading = content.startsWith("\uFEFF") ? content.slice(1) : content;
+    if (!leading.startsWith("---")) {
+        return null;
+    }
+    const match = leading.match(/^---\s*\r?\n([\s\S]*?)\r?\n---\s*(?:\r?\n|$)/);
+    if (!match) {
+        return null;
+    }
+    const rawBlock = match[1] ?? "";
+    const body = leading.slice(match[0].length);
+    try {
+        const data = parseYAML(rawBlock);
+        return { data: data ?? null, body };
+    }
+    catch (error) {
+        // Structured error logging for better debugging
+        const errorMessage = error instanceof Error ? error.message : String(error);
+        console.warn(JSON.stringify({
+            level: "warn",
+            message: "Failed to parse Markdown front matter",
+            file: path,
+            error: errorMessage,
+            context: "Front matter YAML parsing failed, metadata will be skipped for this file",
+        }));
+        return { data: null, body };
+    }
+}
+function stripLinkTitle(target) {
+    const trimmed = target.trim();
+    if (trimmed.length === 0) {
+        return trimmed;
+    }
+    const angleWrapped = trimmed.startsWith("<") && trimmed.endsWith(">");
+    const unwrapped = angleWrapped ? trimmed.slice(1, -1) : trimmed;
+    return unwrapped.replace(/\s+("[^"]*"|'[^']*')\s*$/, "").trim();
+}
+function extractMarkdownLinks(content, srcPath, repoFileSet) {
+    const links = [];
+    const pattern = /\[(?<text>[^\]]+)\]\((?<target>[^)]+)\)/g;
+    let match;
+    while ((match = pattern.exec(content)) !== null) {
+        if (match.index > 0 && content[match.index - 1] === "!") {
+            continue; // Skip images
+        }
+        const text = match.groups?.text?.trim() ?? "";
+        let target = match.groups?.target?.trim() ?? "";
+        if (!text || !target) {
+            continue;
+        }
+        target = stripLinkTitle(target);
+        if (!target) {
+            continue;
+        }
+        const kind = classifyMarkdownTarget(target);
+        const resolvedPath = resolveMarkdownLink(kind, target, srcPath, repoFileSet);
+        if (kind === "anchor" && resolvedPath === null) {
+            continue;
+        }
+        links.push({
+            srcPath,
+            target,
+            resolvedPath,
+            anchorText: text.slice(0, 160),
+            kind,
+        });
+    }
+    return links;
+}
+function classifyMarkdownTarget(target) {
+    const trimmed = target.trim();
+    if (!trimmed) {
+        return "external";
+    }
+    if (trimmed.startsWith("#")) {
+        return "anchor";
+    }
+    if (/^[a-z][a-z0-9+.-]*:/i.test(trimmed) || trimmed.startsWith("//")) {
+        return "external";
+    }
+    if (trimmed.startsWith("/")) {
+        return "absolute";
+    }
+    return "relative";
+}
+function resolveMarkdownLink(kind, target, srcPath, repoFileSet) {
+    if (kind === "external" || kind === "anchor") {
+        return null;
+    }
+    let cleanTarget = target.split("?")[0] ?? "";
+    const hashIndex = cleanTarget.indexOf("#");
+    if (hashIndex >= 0) {
+        cleanTarget = cleanTarget.slice(0, hashIndex);
+    }
+    cleanTarget = cleanTarget.trim().replace(/\\/g, "/");
+    if (!cleanTarget) {
+        return null;
+    }
+    let candidate;
+    if (kind === "absolute") {
+        candidate = cleanTarget.replace(/^\/+/, "");
+    }
+    else {
+        const dir = pathPosix.dirname(srcPath);
+        candidate = pathPosix.join(dir, cleanTarget);
+    }
+    candidate = pathPosix.normalize(candidate);
+    if (!candidate || candidate.startsWith("..")) {
+        return null;
+    }
+    // Security: Prevent directory traversal by checking for ".." segments
+    // Even after normalization, check that no path segment contains ".." or "."
+    const segments = candidate.split("/");
+    if (segments.some((seg) => seg === ".." || seg === ".")) {
+        return null;
+    }
+    // Additional security: reject absolute paths that may have bypassed earlier checks
+    if (candidate.startsWith("/")) {
+        return null;
+    }
+    const candidates = buildLinkCandidatePaths(candidate);
+    for (const pathCandidate of candidates) {
+        if (repoFileSet.has(pathCandidate)) {
+            return pathCandidate;
+        }
+    }
+    return null;
+}
+function buildLinkCandidatePaths(basePath) {
+    const candidates = new Set();
+    candidates.add(basePath);
+    if (!pathPosix.extname(basePath)) {
+        candidates.add(`${basePath}.md`);
+        candidates.add(`${basePath}.mdx`);
+        candidates.add(`${basePath}/README.md`);
+        candidates.add(`${basePath}/readme.md`);
+        candidates.add(`${basePath}/index.md`);
+        candidates.add(`${basePath}/INDEX.md`);
+    }
+    return Array.from(candidates);
+}
+function parseJsonValue(content, path) {
+    try {
+        return JSON.parse(content);
+    }
+    catch (error) {
+        // Structured error logging for better debugging
+        const errorMessage = error instanceof Error ? error.message : String(error);
+        console.warn(JSON.stringify({
+            level: "warn",
+            message: "Failed to parse JSON metadata",
+            file: path,
+            error: errorMessage,
+            context: "JSON parsing failed, metadata will be skipped for this file",
+        }));
+        return null;
+    }
+}
+function parseYamlValue(content, path) {
+    try {
+        return parseYAML(content);
+    }
+    catch (error) {
+        // Structured error logging for better debugging
+        const errorMessage = error instanceof Error ? error.message : String(error);
+        console.warn(JSON.stringify({
+            level: "warn",
+            message: "Failed to parse YAML metadata",
+            file: path,
+            error: errorMessage,
+            context: "YAML parsing failed, metadata will be skipped for this file",
+        }));
+        return null;
+    }
+}
+function parseDocmetaSnapshot(content, path) {
+    const parsed = parseJsonValue(content, path);
+    if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
+        return null;
+    }
+    const candidate = parsed;
+    const targetPath = candidate[DOCMETA_SNAPSHOT_TARGET_FIELD];
+    const frontMatter = candidate[DOCMETA_SNAPSHOT_DATA_FIELD];
+    if (typeof targetPath !== "string") {
+        return null;
+    }
+    const sanitized = sanitizeMetadataTree(frontMatter);
+    if (!sanitized) {
+        return null;
+    }
+    return {
+        targetPath: normalizePathForIndex(targetPath),
+        data: sanitized,
+    };
+}
+async function collectPlainDocsPaths(repoRoot) {
+    const results = [];
+    async function walkRelative(relativeDir) {
+        const absDir = join(repoRoot, relativeDir);
+        let entries;
+        try {
+            entries = await readdir(absDir, { withFileTypes: true });
+        }
+        catch {
+            return;
+        }
+        for (const entry of entries) {
+            const relPath = pathPosix.join(relativeDir, entry.name);
+            if (entry.isDirectory()) {
+                await walkRelative(relPath);
+            }
+            else {
+                results.push(relPath);
+            }
+        }
+    }
+    await walkRelative("docs").catch(() => { });
+    await walkRelative("docmeta").catch(() => { });
+    return results;
+}
+function extractStructuredData(files, blobs, repoFileSet) {
+    const map = new Map();
+    const pairStates = new Map();
+    for (const file of files) {
+        if (file.isBinary)
+            continue;
+        const blob = blobs.get(file.blobHash);
+        if (!blob || blob.content === null) {
+            continue;
+        }
+        const ext = (file.ext ?? "").toLowerCase();
+        const normalizedPath = normalizePathForIndex(file.path);
+        if (normalizedPath.startsWith(DOCMETA_SNAPSHOT_DIR)) {
+            const snapshot = parseDocmetaSnapshot(blob.content, file.path);
+            if (snapshot) {
+                const existing = map.get(snapshot.targetPath);
+                const structured = existing ?? {
+                    metadataRecords: [],
+                    metadataPairs: [],
+                    links: [],
+                };
+                structured.metadataRecords.push({
+                    path: snapshot.targetPath,
+                    source: "front_matter",
+                    data: snapshot.data,
+                });
+                const pairState = ensurePairState(pairStates, snapshot.targetPath);
+                collectMetadataPairsFromValue(snapshot.data, snapshot.targetPath, "front_matter", structured.metadataPairs, pairState);
+                map.set(snapshot.targetPath, structured);
+            }
+            continue;
+        }
+        const existingEntry = map.get(file.path);
+        const structured = existingEntry ?? {
+            metadataRecords: [],
+            metadataPairs: [],
+            links: [],
+        };
+        let mutated = false;
+        if (ext === ".json") {
+            const parsed = parseJsonValue(blob.content, file.path);
+            const sanitized = sanitizeMetadataTree(parsed);
+            if (sanitized) {
+                structured.metadataRecords.push({ path: file.path, source: "json", data: sanitized });
+                const pairState = ensurePairState(pairStates, file.path);
+                collectMetadataPairsFromValue(sanitized, file.path, "json", structured.metadataPairs, pairState);
+                mutated = true;
+            }
+        }
+        else if (ext === ".yaml" || ext === ".yml") {
+            const parsed = parseYamlValue(blob.content, file.path);
+            const sanitized = sanitizeMetadataTree(parsed);
+            if (sanitized) {
+                structured.metadataRecords.push({ path: file.path, source: "yaml", data: sanitized });
+                const pairState = ensurePairState(pairStates, file.path);
+                collectMetadataPairsFromValue(sanitized, file.path, "yaml", structured.metadataPairs, pairState);
+                mutated = true;
+            }
+        }
+        if (MARKDOWN_EXTENSIONS.has(ext)) {
+            const frontMatter = parseFrontMatterBlock(blob.content, file.path);
+            let markdownBody = blob.content;
+            if (frontMatter) {
+                if (frontMatter.data) {
+                    const sanitized = sanitizeMetadataTree(frontMatter.data);
+                    if (sanitized) {
+                        structured.metadataRecords.push({
+                            path: file.path,
+                            source: "front_matter",
+                            data: sanitized,
+                        });
+                        const pairState = ensurePairState(pairStates, file.path);
+                        collectMetadataPairsFromValue(sanitized, file.path, "front_matter", structured.metadataPairs, pairState);
+                        mutated = true;
+                    }
+                }
+                markdownBody = frontMatter.body;
+            }
+            const links = extractMarkdownLinks(markdownBody, file.path, repoFileSet);
+            if (links.length > 0) {
+                structured.links.push(...links);
+                mutated = true;
+            }
+        }
+        if (mutated || existingEntry) {
+            map.set(file.path, structured);
+        }
+    }
+    return map;
+}
+function aggregateStructuredData(map) {
+    const aggregated = {
+        metadataRecords: [],
+        metadataPairs: [],
+        links: [],
+    };
+    for (const entry of map.values()) {
+        aggregated.metadataRecords.push(...entry.metadataRecords);
+        aggregated.metadataPairs.push(...entry.metadataPairs);
+        aggregated.links.push(...entry.links);
+    }
+    return aggregated;
+}
 async function buildCodeIntel(files, blobs, workspaceRoot) {
     const fileSet = new Set(files.map((file) => file.path));
     const symbols = [];
@@ -498,16 +1046,20 @@ async function reconcileDeletedFiles(db, repoId, repoRoot) {
         }
     }
     // Delete all records for removed files in a single transaction
+    // Batched DELETE operations to avoid N+1 query problem
     if (deletedPaths.length > 0) {
         await db.transaction(async () => {
-
-
-
-
-
-
-
-            }
+            const placeholders = deletedPaths.map(() => "?").join(", ");
+            const params = [repoId, ...deletedPaths];
+            await db.run(`DELETE FROM symbol WHERE repo_id = ? AND path IN (${placeholders})`, params);
+            await db.run(`DELETE FROM snippet WHERE repo_id = ? AND path IN (${placeholders})`, params);
+            await db.run(`DELETE FROM dependency WHERE repo_id = ? AND src_path IN (${placeholders})`, params);
+            await db.run(`DELETE FROM file_embedding WHERE repo_id = ? AND path IN (${placeholders})`, params);
+            await db.run(`DELETE FROM document_metadata WHERE repo_id = ? AND path IN (${placeholders})`, params);
+            await db.run(`DELETE FROM document_metadata_kv WHERE repo_id = ? AND path IN (${placeholders})`, params);
+            await db.run(`DELETE FROM markdown_link WHERE repo_id = ? AND src_path IN (${placeholders})`, params);
+            await db.run(`DELETE FROM tree WHERE repo_id = ? AND path IN (${placeholders})`, params);
+            await db.run(`DELETE FROM file WHERE repo_id = ? AND path IN (${placeholders})`, params);
         });
     }
     return deletedPaths;
@@ -526,6 +1078,9 @@ async function deleteFileRecords(db, repoId, headCommit, path) {
     await db.run("DELETE FROM snippet WHERE repo_id = ? AND path = ?", [repoId, path]);
    await db.run("DELETE FROM dependency WHERE repo_id = ? AND src_path = ?", [repoId, path]);
     await db.run("DELETE FROM file_embedding WHERE repo_id = ? AND path = ?", [repoId, path]);
+    await db.run("DELETE FROM document_metadata WHERE repo_id = ? AND path = ?", [repoId, path]);
+    await db.run("DELETE FROM document_metadata_kv WHERE repo_id = ? AND path = ?", [repoId, path]);
+    await db.run("DELETE FROM markdown_link WHERE repo_id = ? AND src_path = ?", [repoId, path]);
     await db.run("DELETE FROM tree WHERE repo_id = ? AND commit_hash = ? AND path = ?", [
         repoId,
         headCommit,
@@ -533,6 +1088,25 @@ async function deleteFileRecords(db, repoId, headCommit, path) {
     ]);
     await db.run("DELETE FROM file WHERE repo_id = ? AND path = ?", [repoId, path]);
 }
+/**
+ * Remove blob records that are no longer referenced by any file.
+ * This garbage collection should be run after full re-indexing or periodically as maintenance.
+ *
+ * @param db - Database client
+ */
+async function garbageCollectBlobs(db) {
+    console.info("Running garbage collection on blob table...");
+    try {
+        await db.run(`
+            DELETE FROM blob
+            WHERE hash NOT IN (SELECT DISTINCT blob_hash FROM file)
+        `);
+        console.info("Blob garbage collection complete.");
+    }
+    catch (error) {
+        console.warn("Failed to garbage collect blobs:", error instanceof Error ? error.message : String(error));
+    }
+}
 export async function runIndexer(options) {
     const repoPathCandidates = getRepoPathCandidates(options.repoRoot);
     const repoRoot = repoPathCandidates[0];
@@ -571,6 +1145,10 @@ export async function runIndexer(options) {
         const dbClient = await DuckDBClient.connect({ databasePath, ensureDirectory: true });
         db = dbClient;
         await ensureBaseSchema(dbClient);
+        // Migration: Ensure document_metadata tables exist for existing DBs
+        await ensureDocumentMetadataTables(dbClient);
+        // Phase 1: Ensure normalized_root column exists (Critical #1)
+        await ensureNormalizedRootColumn(dbClient);
         // Phase 3: Ensure FTS metadata columns exist for existing DBs (migration)
         await ensureRepoMetaColumns(dbClient);
         const [headCommit, defaultBranch] = await Promise.all([
@@ -626,6 +1204,12 @@ export async function runIndexer(options) {
             }
             return;
         }
+        const existingFileRows = await dbClient.all("SELECT path FROM file WHERE repo_id = ?", [repoId]);
+        const repoFileSet = new Set(existingFileRows.map((row) => row.path));
+        for (const file of files) {
+            repoFileSet.add(file.path);
+        }
+        const structuredByFile = extractStructuredData(changedFiles, changedBlobs, repoFileSet);
         // Process all changed files in a single transaction for atomicity
         const fileSet = new Set(files.map((f) => f.path));
         const embeddingMap = new Map();
@@ -648,67 +1232,79 @@ export async function runIndexer(options) {
             const blob = changedBlobs.get(file.blobHash);
             if (!blob)
                 continue;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            try {
+                // Build code intelligence for this file
+                const fileSymbols = [];
+                const fileSnippets = [];
+                const fileDependencies = [];
+                if (!file.isBinary && blob.content) {
+                    const analysis = await analyzeSource(file.path, file.lang, blob.content, fileSet, repoRoot);
+                    for (const symbol of analysis.symbols) {
+                        fileSymbols.push({
+                            path: file.path,
+                            symbolId: symbol.symbolId,
+                            name: symbol.name,
+                            kind: symbol.kind,
+                            rangeStartLine: symbol.rangeStartLine,
+                            rangeEndLine: symbol.rangeEndLine,
+                            signature: symbol.signature,
+                            doc: symbol.doc,
+                        });
+                    }
+                    for (const snippet of analysis.snippets) {
+                        fileSnippets.push({
+                            path: file.path,
+                            snippetId: snippet.startLine,
+                            startLine: snippet.startLine,
+                            endLine: snippet.endLine,
+                            symbolId: snippet.symbolId,
+                        });
+                    }
+                    for (const dep of analysis.dependencies) {
+                        fileDependencies.push({
+                            srcPath: file.path,
+                            dstKind: dep.dstKind,
+                            dst: dep.dst,
+                            rel: dep.rel,
+                        });
+                    }
                 }
-
+                else {
+                    // Binary or no content: add fallback snippet
+                    const fallback = buildFallbackSnippet(blob.lineCount ?? 1);
                     fileSnippets.push({
                         path: file.path,
-                        snippetId:
-                        startLine:
-                        endLine:
-                        symbolId:
+                        snippetId: fallback.startLine,
+                        startLine: fallback.startLine,
+                        endLine: fallback.endLine,
+                        symbolId: fallback.symbolId,
                     });
                 }
-
-
-
-
-
-
-
+                const fileEmbedding = embeddingMap.get(file.path) ?? null;
+                // Delete old records for this file (within main transaction)
+                await deleteFileRecords(dbClient, repoId, headCommit, file.path);
+                // Insert new records (within main transaction)
+                await persistBlobs(dbClient, new Map([[blob.hash, blob]]));
+                await persistTrees(dbClient, repoId, headCommit, [file]);
+                await persistFiles(dbClient, repoId, [file]);
+                await persistSymbols(dbClient, repoId, fileSymbols);
+                await persistSnippets(dbClient, repoId, fileSnippets);
+                await persistDependencies(dbClient, repoId, fileDependencies);
+                const structured = structuredByFile.get(file.path);
+                if (structured) {
+                    await persistDocumentMetadata(dbClient, repoId, structured.metadataRecords);
+                    await persistMetadataPairs(dbClient, repoId, structured.metadataPairs);
+                    await persistMarkdownLinks(dbClient, repoId, structured.links);
                 }
+                if (fileEmbedding) {
+                    await persistEmbeddings(dbClient, repoId, [fileEmbedding]);
+                }
+                processedCount++;
             }
-
-
-
-            fileSnippets.push({
-                path: file.path,
-                snippetId: fallback.startLine,
-                startLine: fallback.startLine,
-                endLine: fallback.endLine,
-                symbolId: fallback.symbolId,
-            });
-            }
-            const fileEmbedding = embeddingMap.get(file.path) ?? null;
-            // Delete old records for this file (within main transaction)
-            await deleteFileRecords(dbClient, repoId, headCommit, file.path);
-            // Insert new records (within main transaction)
-            await persistBlobs(dbClient, new Map([[blob.hash, blob]]));
-            await persistTrees(dbClient, repoId, headCommit, [file]);
-            await persistFiles(dbClient, repoId, [file]);
-            await persistSymbols(dbClient, repoId, fileSymbols);
-            await persistSnippets(dbClient, repoId, fileSnippets);
-            await persistDependencies(dbClient, repoId, fileDependencies);
-            if (fileEmbedding) {
-                await persistEmbeddings(dbClient, repoId, [fileEmbedding]);
+            catch (error) {
+                console.error(`Failed to process file ${file.path}, transaction will rollback:`, error instanceof Error ? error.message : String(error));
+                throw error; // Re-throw to rollback the transaction
             }
-            processedCount++;
         }
         // Update timestamp and mark FTS dirty inside transaction for atomicity
         // Fix: Increment fts_generation to prevent lost updates during concurrent rebuilds
@@ -725,7 +1321,14 @@ export async function runIndexer(options) {
             return;
         }
         // Full mode: reindex entire repository
-
+        let paths = await gitLsFiles(repoRoot);
+        if (paths.length === 0) {
+            const fallbackPaths = await collectPlainDocsPaths(repoRoot);
+            if (fallbackPaths.length > 0) {
+                console.warn(`git ls-files returned 0 paths for ${repoRoot}. Falling back to filesystem scan (${fallbackPaths.length} files).`);
+                paths = fallbackPaths;
+            }
+        }
         const { blobs, files, embeddings, missingPaths } = await scanFilesInBatches(repoRoot, paths);
         // In full mode, missingPaths should be rare (git ls-files returns existing files)
         // But log them if they occur (race condition: file deleted between ls-files and scan)
@@ -733,6 +1336,9 @@ export async function runIndexer(options) {
            console.warn(`${missingPaths.length} file(s) disappeared during full reindex (race condition)`);
         }
         const codeIntel = await buildCodeIntel(files, blobs, repoRoot);
+        const repoFileSetFull = new Set(files.map((file) => file.path));
+        const structuredMap = extractStructuredData(files, blobs, repoFileSetFull);
+        const aggregatedStructured = aggregateStructuredData(structuredMap);
         await dbClient.transaction(async () => {
             await dbClient.run("DELETE FROM tree WHERE repo_id = ?", [repoId]);
             await dbClient.run("DELETE FROM file WHERE repo_id = ?", [repoId]);
@@ -740,6 +1346,9 @@ export async function runIndexer(options) {
             await dbClient.run("DELETE FROM snippet WHERE repo_id = ?", [repoId]);
             await dbClient.run("DELETE FROM dependency WHERE repo_id = ?", [repoId]);
             await dbClient.run("DELETE FROM file_embedding WHERE repo_id = ?", [repoId]);
+            await dbClient.run("DELETE FROM document_metadata WHERE repo_id = ?", [repoId]);
+            await dbClient.run("DELETE FROM document_metadata_kv WHERE repo_id = ?", [repoId]);
+            await dbClient.run("DELETE FROM markdown_link WHERE repo_id = ?", [repoId]);
             await persistBlobs(dbClient, blobs);
             await persistTrees(dbClient, repoId, headCommit, files);
             await persistFiles(dbClient, repoId, files);
@@ -747,6 +1356,9 @@ export async function runIndexer(options) {
             await persistSnippets(dbClient, repoId, codeIntel.snippets);
             await persistDependencies(dbClient, repoId, codeIntel.dependencies);
             await persistEmbeddings(dbClient, repoId, embeddings);
+            await persistDocumentMetadata(dbClient, repoId, aggregatedStructured.metadataRecords);
+            await persistMetadataPairs(dbClient, repoId, aggregatedStructured.metadataPairs);
+            await persistMarkdownLinks(dbClient, repoId, aggregatedStructured.links);
             // Update timestamp and mark FTS dirty inside transaction to ensure atomicity
             // Fix: Increment fts_generation to prevent lost updates during concurrent rebuilds
             if (defaultBranch) {
@@ -759,6 +1371,8 @@ export async function runIndexer(options) {
         console.info(`Indexed ${files.length} files for repo ${repoRoot} at ${databasePath} (commit ${headCommit.slice(0, 12)})`);
         // Phase 2+3: Force rebuild FTS index after full reindex
         await rebuildFTSIfNeeded(dbClient, repoId, true);
+        // Garbage collect orphaned blobs after full reindex
+        await garbageCollectBlobs(dbClient);
     }
     finally {
         // Fix #2: Ensure lock is released even if DB connection fails