kiri-mcp-server 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. package/README.md +52 -10
  2. package/config/kiri.yml +25 -0
  3. package/config/scoring-profiles.yml +82 -35
  4. package/dist/config/kiri.yml +25 -0
  5. package/dist/config/scoring-profiles.yml +82 -35
  6. package/dist/package.json +9 -1
  7. package/dist/src/indexer/cli.d.ts.map +1 -1
  8. package/dist/src/indexer/cli.js +712 -98
  9. package/dist/src/indexer/cli.js.map +1 -1
  10. package/dist/src/indexer/git.d.ts.map +1 -1
  11. package/dist/src/indexer/git.js +41 -3
  12. package/dist/src/indexer/git.js.map +1 -1
  13. package/dist/src/indexer/migrations/repo-merger.d.ts +33 -0
  14. package/dist/src/indexer/migrations/repo-merger.d.ts.map +1 -0
  15. package/dist/src/indexer/migrations/repo-merger.js +67 -0
  16. package/dist/src/indexer/migrations/repo-merger.js.map +1 -0
  17. package/dist/src/indexer/schema.d.ts +66 -0
  18. package/dist/src/indexer/schema.d.ts.map +1 -1
  19. package/dist/src/indexer/schema.js +337 -0
  20. package/dist/src/indexer/schema.js.map +1 -1
  21. package/dist/src/server/boost-profiles.d.ts +6 -5
  22. package/dist/src/server/boost-profiles.d.ts.map +1 -1
  23. package/dist/src/server/boost-profiles.js +138 -0
  24. package/dist/src/server/boost-profiles.js.map +1 -1
  25. package/dist/src/server/config-loader.d.ts +9 -0
  26. package/dist/src/server/config-loader.d.ts.map +1 -0
  27. package/dist/src/server/config-loader.js +121 -0
  28. package/dist/src/server/config-loader.js.map +1 -0
  29. package/dist/src/server/config.d.ts +47 -0
  30. package/dist/src/server/config.d.ts.map +1 -0
  31. package/dist/src/server/config.js +157 -0
  32. package/dist/src/server/config.js.map +1 -0
  33. package/dist/src/server/context.d.ts +29 -0
  34. package/dist/src/server/context.d.ts.map +1 -1
  35. package/dist/src/server/context.js +26 -1
  36. package/dist/src/server/context.js.map +1 -1
  37. package/dist/src/server/handlers/snippets-get.d.ts +36 -0
  38. package/dist/src/server/handlers/snippets-get.d.ts.map +1 -0
  39. package/dist/src/server/handlers/snippets-get.js +120 -0
  40. package/dist/src/server/handlers/snippets-get.js.map +1 -0
  41. package/dist/src/server/handlers.d.ts +33 -20
  42. package/dist/src/server/handlers.d.ts.map +1 -1
  43. package/dist/src/server/handlers.js +1805 -370
  44. package/dist/src/server/handlers.js.map +1 -1
  45. package/dist/src/server/indexBootstrap.d.ts.map +1 -1
  46. package/dist/src/server/indexBootstrap.js +49 -2
  47. package/dist/src/server/indexBootstrap.js.map +1 -1
  48. package/dist/src/server/main.d.ts.map +1 -1
  49. package/dist/src/server/main.js +7 -0
  50. package/dist/src/server/main.js.map +1 -1
  51. package/dist/src/server/profile-selector.d.ts +33 -0
  52. package/dist/src/server/profile-selector.d.ts.map +1 -0
  53. package/dist/src/server/profile-selector.js +291 -0
  54. package/dist/src/server/profile-selector.js.map +1 -0
  55. package/dist/src/server/rpc.d.ts.map +1 -1
  56. package/dist/src/server/rpc.js +60 -10
  57. package/dist/src/server/rpc.js.map +1 -1
  58. package/dist/src/server/runtime.d.ts.map +1 -1
  59. package/dist/src/server/runtime.js +14 -4
  60. package/dist/src/server/runtime.js.map +1 -1
  61. package/dist/src/server/scoring.d.ts +7 -1
  62. package/dist/src/server/scoring.d.ts.map +1 -1
  63. package/dist/src/server/scoring.js +121 -21
  64. package/dist/src/server/scoring.js.map +1 -1
  65. package/dist/src/server/services/index.d.ts +24 -0
  66. package/dist/src/server/services/index.d.ts.map +1 -0
  67. package/dist/src/server/services/index.js +20 -0
  68. package/dist/src/server/services/index.js.map +1 -0
  69. package/dist/src/server/services/repo-repository.d.ts +61 -0
  70. package/dist/src/server/services/repo-repository.d.ts.map +1 -0
  71. package/dist/src/server/services/repo-repository.js +93 -0
  72. package/dist/src/server/services/repo-repository.js.map +1 -0
  73. package/dist/src/server/services/repo-resolver.d.ts +28 -0
  74. package/dist/src/server/services/repo-resolver.d.ts.map +1 -0
  75. package/dist/src/server/services/repo-resolver.js +62 -0
  76. package/dist/src/server/services/repo-resolver.js.map +1 -0
  77. package/dist/src/shared/duckdb.d.ts.map +1 -1
  78. package/dist/src/shared/duckdb.js +21 -1
  79. package/dist/src/shared/duckdb.js.map +1 -1
  80. package/dist/src/shared/fs/safePath.d.ts +7 -0
  81. package/dist/src/shared/fs/safePath.d.ts.map +1 -0
  82. package/dist/src/shared/fs/safePath.js +23 -0
  83. package/dist/src/shared/fs/safePath.js.map +1 -0
  84. package/dist/src/shared/tokenizer.d.ts +1 -1
  85. package/dist/src/shared/tokenizer.d.ts.map +1 -1
  86. package/dist/src/shared/tokenizer.js +97 -15
  87. package/dist/src/shared/tokenizer.js.map +1 -1
  88. package/dist/src/shared/utils/glob.d.ts +5 -0
  89. package/dist/src/shared/utils/glob.d.ts.map +1 -0
  90. package/dist/src/shared/utils/glob.js +22 -0
  91. package/dist/src/shared/utils/glob.js.map +1 -0
  92. package/dist/src/shared/utils/retry.d.ts +8 -0
  93. package/dist/src/shared/utils/retry.d.ts.map +1 -0
  94. package/dist/src/shared/utils/retry.js +20 -0
  95. package/dist/src/shared/utils/retry.js.map +1 -0
  96. package/package.json +9 -1
@@ -1,21 +1,110 @@
  import { createHash } from "node:crypto";
  import { existsSync } from "node:fs";
- import { readFile, stat } from "node:fs/promises";
- import { join, resolve, extname } from "node:path";
+ import { readFile, readdir, stat } from "node:fs/promises";
+ import { join, resolve, extname, posix as pathPosix } from "node:path";
  import { pathToFileURL } from "node:url";
+ import { parse as parseYAML } from "yaml";
  import { DuckDBClient } from "../shared/duckdb.js";
  import { generateEmbedding } from "../shared/embedding.js";
  import { acquireLock, releaseLock, LockfileError, getLockOwner } from "../shared/utils/lockfile.js";
- import { normalizeDbPath, ensureDbParentDir, getRepoPathCandidates } from "../shared/utils/path.js";
+ import { normalizeDbPath, normalizeRepoPath, ensureDbParentDir, getRepoPathCandidates, } from "../shared/utils/path.js";
  import { analyzeSource, buildFallbackSnippet } from "./codeintel.js";
  import { getDefaultBranch, getHeadCommit, gitLsFiles, gitDiffNameOnly } from "./git.js";
  import { detectLanguage } from "./language.js";
+ import { mergeRepoRecords } from "./migrations/repo-merger.js";
  import { getIndexerQueue } from "./queue.js";
- import { ensureBaseSchema, ensureRepoMetaColumns, rebuildFTSIfNeeded } from "./schema.js";
+ import { ensureBaseSchema, ensureDocumentMetadataTables, ensureNormalizedRootColumn, ensureRepoMetaColumns, rebuildFTSIfNeeded, } from "./schema.js";
  import { IndexWatcher } from "./watch.js";
+ function normalizePathForIndex(value) {
+ return value.replace(/\\/g, "/");
+ }
+ function ensurePairState(stateMap, path) {
+ const existing = stateMap.get(path);
+ if (existing) {
+ return existing;
+ }
+ const created = { count: 0, seen: new Set() };
+ stateMap.set(path, created);
+ return created;
+ }
  const MAX_SAMPLE_BYTES = 32_768;
  const MAX_FILE_BYTES = 32 * 1024 * 1024; // 32MB limit to prevent memory exhaustion
  const SCAN_BATCH_SIZE = 100; // Process files in batches to limit memory usage
+ const MARKDOWN_EXTENSIONS = new Set([".md", ".mdx", ".markdown"]);
+ const DOCMETA_SNAPSHOT_DIR = "docmeta/";
+ const DOCMETA_SNAPSHOT_TARGET_FIELD = "target_path";
+ const DOCMETA_SNAPSHOT_DATA_FIELD = "front_matter";
+ /**
+  * Metadata processing limits to prevent DoS attacks and memory exhaustion.
+  *
+  * These values balance security, performance, and real-world usage patterns.
+  * Adjust based on:
+  * - Performance testing with 10000+ file repositories
+  * - Memory profiling (Node.js heap size impact)
+  * - Analysis of 99th percentile values in production data
+  */
+ /**
+  * Maximum length of a single metadata value (characters).
+  *
+  * Rationale: Typical YAML front matter fields (title, description) are 200-300 chars.
+  * Setting to 512 provides headroom while preventing abuse.
+  *
+  * Example use cases:
+  * - Document titles: ~100 chars
+  * - Descriptions: ~300 chars
+  * - Tags (as comma-separated string): ~200 chars
+  */
+ const MAX_METADATA_VALUE_LENGTH = 512;
+ /**
+  * Maximum nesting depth for metadata tree structures.
+  *
+  * Rationale: Normal YAML/JSON documents nest 3-5 levels deep.
+  * Setting to 8 accommodates complex configurations while preventing stack overflow.
+  *
+  * Defense: Prevents malicious deeply-nested documents from causing:
+  * - Stack overflow (recursive function calls)
+  * - Exponential memory growth
+  * - CPU exhaustion during traversal
+  */
+ const MAX_METADATA_DEPTH = 8;
+ /**
+  * Maximum number of elements in a metadata array.
+  *
+  * Rationale: Common use case is tags/categories arrays with ~10 items.
+  * Setting to 64 provides generous headroom for edge cases.
+  *
+  * Example arrays:
+  * - Tags: ["frontend", "react", "typescript"] (~3-10 items)
+  * - Authors: ["John Doe", "Jane Smith"] (~1-5 items)
+  * - Categories: ["guide", "tutorial", "api"] (~2-8 items)
+  */
+ const MAX_METADATA_ARRAY_LENGTH = 64;
+ /**
+  * Maximum number of key-value pairs extracted per file.
+  *
+  * Rationale: Memory footprint calculation:
+  * - 256 pairs × ~40 bytes/pair ≈ 10KB per file
+  * - For 10000 files: 10KB × 10000 = 100MB (acceptable overhead)
+  *
+  * Prevents DoS from files with thousands of metadata fields.
+  * Normal documents have 5-20 metadata fields.
+  */
+ const MAX_METADATA_PAIRS_PER_FILE = 256;
+ /**
+  * Maximum number of object keys processed in a metadata tree node.
+  *
+  * Rationale: Prevents memory exhaustion from maliciously crafted objects with excessive keys.
+  * Normal metadata objects have 5-20 keys. Setting to 256 provides generous headroom.
+  *
+  * Memory impact: Each key entry requires ~50 bytes (key name + value reference).
+  * 256 keys × 50 bytes ≈ 12.8KB per object, which is acceptable.
+  */
+ const MAX_METADATA_OBJECT_KEYS = 256;
+ /**
+  * Key name used for root-level scalar values in metadata trees.
+  * Internal use only - not exposed in search results.
+  */
+ const ROOT_METADATA_KEY = "__root";
  /**
   * Maximum number of SQL placeholders per INSERT statement.
   *
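Note (reviewer illustration): the limit constants above are easiest to read as a contract on what survives sanitization. A minimal sketch of that contract, not code from the package, assuming a sanitizer that honors these constants (the sanitizeMetadataTree function added later in this diff is one such):

    // Illustrative front matter, annotated with the expected effect of the limits above.
    const frontMatter = {
        title: "Indexer architecture notes",   // kept: well under MAX_METADATA_VALUE_LENGTH (512)
        description: "x".repeat(2000),         // truncated to the first 512 characters
        tags: ["guide", "indexer", "duckdb"],  // kept: far below MAX_METADATA_ARRAY_LENGTH (64)
        deep: { a: { b: { c: { d: { e: { f: { g: { h: { i: 1 } } } } } } } } },
        // levels beyond MAX_METADATA_DEPTH (8) are pruned, and the emptied parents collapse away
    };
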
@@ -72,43 +161,17 @@ function isBinaryBuffer(buffer) {
  * @param defaultBranch - Default branch name (e.g., "main", "master"), or null if unknown
  * @returns The repository ID (auto-generated on first insert, reused thereafter)
  */
- async function mergeLegacyRepoRows(db, canonicalRepoId, legacyRepoIds) {
- if (legacyRepoIds.length === 0) {
- return;
- }
- const referencingTables = await db.all(`SELECT DISTINCT c.table_name
- FROM duckdb_columns() AS c
- JOIN duckdb_tables() AS t
- ON c.database_name = t.database_name
- AND c.schema_name = t.schema_name
- AND c.table_name = t.table_name
- WHERE c.column_name = 'repo_id'
- AND c.table_name <> 'repo'
- AND t.table_type = 'BASE TABLE'`);
- const safeTables = referencingTables
- .map((row) => row.table_name)
- .filter((name) => /^[A-Za-z0-9_]+$/.test(name));
- await db.transaction(async () => {
- for (const legacyRepoId of legacyRepoIds) {
- for (const tableName of safeTables) {
- await db.run(`UPDATE ${tableName} SET repo_id = ? WHERE repo_id = ?`, [
- canonicalRepoId,
- legacyRepoId,
- ]);
- }
- await db.run("DELETE FROM repo WHERE id = ?", [legacyRepoId]);
- }
- });
- }
  async function ensureRepo(db, repoRoot, defaultBranch, candidateRoots) {
  const searchRoots = Array.from(new Set([repoRoot, ...(candidateRoots ?? [])]));
  const placeholders = searchRoots.map(() => "?").join(", ");
  let rows = await db.all(`SELECT id, root FROM repo WHERE root IN (${placeholders})`, searchRoots);
  if (rows.length === 0) {
- await db.run(`INSERT INTO repo (root, default_branch, indexed_at)
- VALUES (?, ?, CURRENT_TIMESTAMP)
+ const normalized = normalizeRepoPath(repoRoot);
+ await db.run(`INSERT INTO repo (root, normalized_root, default_branch, indexed_at)
+ VALUES (?, ?, ?, CURRENT_TIMESTAMP)
  ON CONFLICT(root) DO UPDATE SET
- default_branch = COALESCE(excluded.default_branch, repo.default_branch)`, [repoRoot, defaultBranch]);
+ normalized_root = excluded.normalized_root,
+ default_branch = COALESCE(excluded.default_branch, repo.default_branch)`, [repoRoot, normalized, defaultBranch]);
  rows = await db.all(`SELECT id, root FROM repo WHERE root IN (${placeholders})`, searchRoots);
  }
  if (rows.length === 0) {
@@ -123,7 +186,7 @@ async function ensureRepo(db, repoRoot, defaultBranch, candidateRoots) {
  canonicalRow = { ...canonicalRow, root: repoRoot };
  }
  const legacyIds = rows.filter((row) => row.id !== canonicalRow.id).map((row) => row.id);
- await mergeLegacyRepoRows(db, canonicalRow.id, legacyIds);
+ await mergeRepoRecords(db, canonicalRow.id, legacyIds);
  return canonicalRow.id;
  }
  /**
@@ -302,6 +365,491 @@ async function persistEmbeddings(db, repoId, records) {
  ]),
  }));
  }
+ async function persistDocumentMetadata(db, repoId, records) {
+ if (records.length === 0)
+ return;
+ const BATCH_SIZE = calculateBatchSize(4);
+ await persistInBatches(db, records, BATCH_SIZE, (batch) => ({
+ sql: `INSERT OR REPLACE INTO document_metadata (repo_id, path, source, data) VALUES ${batch.map(() => "(?, ?, ?, ?)").join(", ")}`,
+ params: batch.flatMap((record) => [
+ repoId,
+ record.path,
+ record.source,
+ JSON.stringify(record.data),
+ ]),
+ }));
+ }
+ async function persistMetadataPairs(db, repoId, records) {
+ if (records.length === 0)
+ return;
+ const BATCH_SIZE = calculateBatchSize(5);
+ await persistInBatches(db, records, BATCH_SIZE, (batch) => ({
+ sql: `INSERT OR REPLACE INTO document_metadata_kv (repo_id, path, source, key, value) VALUES ${batch.map(() => "(?, ?, ?, ?, ?)").join(", ")}`,
+ params: batch.flatMap((record) => [
+ repoId,
+ record.path,
+ record.source,
+ record.key,
+ record.value,
+ ]),
+ }));
+ }
+ async function persistMarkdownLinks(db, repoId, records) {
+ if (records.length === 0)
+ return;
+ const BATCH_SIZE = calculateBatchSize(6);
+ await persistInBatches(db, records, BATCH_SIZE, (batch) => ({
+ sql: `INSERT OR REPLACE INTO markdown_link (repo_id, src_path, target, resolved_path, anchor_text, kind) VALUES ${batch.map(() => "(?, ?, ?, ?, ?, ?)").join(", ")}`,
+ params: batch.flatMap((record) => [
+ repoId,
+ record.srcPath,
+ record.target,
+ record.resolvedPath,
+ record.anchorText,
+ record.kind,
+ ]),
+ }));
+ }
+ function sanitizeMetadataTree(value, depth = 0) {
+ // Depth check at the beginning to prevent stack overflow
+ if (depth > MAX_METADATA_DEPTH) {
+ console.warn(`Metadata depth limit (${MAX_METADATA_DEPTH}) exceeded, truncating nested value`);
+ return null;
+ }
+ if (value === null || value === undefined) {
+ return null;
+ }
+ if (value instanceof Date) {
+ return value.toISOString();
+ }
+ if (typeof value === "string") {
+ const trimmed = value.trim();
+ if (trimmed.length === 0) {
+ return null;
+ }
+ return trimmed.length > MAX_METADATA_VALUE_LENGTH
+ ? trimmed.slice(0, MAX_METADATA_VALUE_LENGTH)
+ : trimmed;
+ }
+ if (typeof value === "number") {
+ if (!Number.isFinite(value)) {
+ return null;
+ }
+ return value;
+ }
+ if (typeof value === "boolean") {
+ return value;
+ }
+ if (Array.isArray(value)) {
+ if (value.length === 0) {
+ return null;
+ }
+ // Warn if array is too large
+ if (value.length > MAX_METADATA_ARRAY_LENGTH) {
+ console.warn(`Metadata array has ${value.length} elements, limiting to ${MAX_METADATA_ARRAY_LENGTH}`);
+ }
+ const sanitized = [];
+ for (const item of value.slice(0, MAX_METADATA_ARRAY_LENGTH)) {
+ const child = sanitizeMetadataTree(item, depth + 1);
+ if (child !== null) {
+ sanitized.push(child);
+ }
+ }
+ return sanitized.length > 0 ? sanitized : null;
+ }
+ if (typeof value === "object") {
+ const result = {};
+ const entries = Object.entries(value);
+ // Limit number of object keys to prevent memory exhaustion
+ if (entries.length > MAX_METADATA_OBJECT_KEYS) {
+ console.warn(`Object has ${entries.length} keys, limiting to ${MAX_METADATA_OBJECT_KEYS} to prevent memory exhaustion`);
+ }
+ for (const [key, child] of entries.slice(0, MAX_METADATA_OBJECT_KEYS)) {
+ if (!key)
+ continue;
+ const sanitizedChild = sanitizeMetadataTree(child, depth + 1);
+ if (sanitizedChild !== null) {
+ result[key] = sanitizedChild;
+ }
+ }
+ return Object.keys(result).length > 0 ? result : null;
+ }
+ return null;
+ }
+ function metadataValueToString(value) {
+ if (typeof value === "string") {
+ return value;
+ }
+ if (typeof value === "number") {
+ return Number.isFinite(value) ? value.toString() : "";
+ }
+ return value ? "true" : "false";
+ }
+ function collectMetadataPairsFromValue(value, path, source, pairs, state, keyPrefix = "") {
+ if (state.count >= MAX_METADATA_PAIRS_PER_FILE) {
+ return;
+ }
+ if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
+ const key = keyPrefix.length > 0 ? keyPrefix : ROOT_METADATA_KEY;
+ let normalized = metadataValueToString(value).trim();
+ if (normalized.length === 0) {
+ return;
+ }
+ if (normalized.length > MAX_METADATA_VALUE_LENGTH) {
+ normalized = normalized.slice(0, MAX_METADATA_VALUE_LENGTH);
+ }
+ const dedupeKey = `${source}:${key}:${normalized.toLowerCase()}`;
+ if (state.seen.has(dedupeKey)) {
+ return;
+ }
+ state.seen.add(dedupeKey);
+ pairs.push({ path, source, key, value: normalized });
+ state.count += 1;
+ return;
+ }
+ if (Array.isArray(value)) {
+ for (const item of value) {
+ collectMetadataPairsFromValue(item, path, source, pairs, state, keyPrefix);
+ if (state.count >= MAX_METADATA_PAIRS_PER_FILE) {
+ break;
+ }
+ }
+ return;
+ }
+ if (typeof value === "object" && value !== null) {
+ for (const [childKey, childValue] of Object.entries(value)) {
+ const normalizedKey = childKey.toLowerCase();
+ const nextPrefix = keyPrefix.length > 0 ? `${keyPrefix}.${normalizedKey}` : normalizedKey;
+ collectMetadataPairsFromValue(childValue, path, source, pairs, state, nextPrefix);
+ if (state.count >= MAX_METADATA_PAIRS_PER_FILE) {
+ break;
+ }
+ }
+ }
+ }
+ function parseFrontMatterBlock(content, path) {
+ const leading = content.startsWith("\uFEFF") ? content.slice(1) : content;
+ if (!leading.startsWith("---")) {
+ return null;
+ }
+ const match = leading.match(/^---\s*\r?\n([\s\S]*?)\r?\n---\s*(?:\r?\n|$)/);
+ if (!match) {
+ return null;
+ }
+ const rawBlock = match[1] ?? "";
+ const body = leading.slice(match[0].length);
+ try {
+ const data = parseYAML(rawBlock);
+ return { data: data ?? null, body };
+ }
+ catch (error) {
+ // Structured error logging for better debugging
+ const errorMessage = error instanceof Error ? error.message : String(error);
+ console.warn(JSON.stringify({
+ level: "warn",
+ message: "Failed to parse Markdown front matter",
+ file: path,
+ error: errorMessage,
+ context: "Front matter YAML parsing failed, metadata will be skipped for this file",
+ }));
+ return { data: null, body };
+ }
+ }
+ function stripLinkTitle(target) {
+ const trimmed = target.trim();
+ if (trimmed.length === 0) {
+ return trimmed;
+ }
+ const angleWrapped = trimmed.startsWith("<") && trimmed.endsWith(">");
+ const unwrapped = angleWrapped ? trimmed.slice(1, -1) : trimmed;
+ return unwrapped.replace(/\s+("[^"]*"|'[^']*')\s*$/, "").trim();
+ }
+ function extractMarkdownLinks(content, srcPath, repoFileSet) {
+ const links = [];
+ const pattern = /\[(?<text>[^\]]+)\]\((?<target>[^)]+)\)/g;
+ let match;
+ while ((match = pattern.exec(content)) !== null) {
+ if (match.index > 0 && content[match.index - 1] === "!") {
+ continue; // Skip images
+ }
+ const text = match.groups?.text?.trim() ?? "";
+ let target = match.groups?.target?.trim() ?? "";
+ if (!text || !target) {
+ continue;
+ }
+ target = stripLinkTitle(target);
+ if (!target) {
+ continue;
+ }
+ const kind = classifyMarkdownTarget(target);
+ const resolvedPath = resolveMarkdownLink(kind, target, srcPath, repoFileSet);
+ if (kind === "anchor" && resolvedPath === null) {
+ continue;
+ }
+ links.push({
+ srcPath,
+ target,
+ resolvedPath,
+ anchorText: text.slice(0, 160),
+ kind,
+ });
+ }
+ return links;
+ }
+ function classifyMarkdownTarget(target) {
+ const trimmed = target.trim();
+ if (!trimmed) {
+ return "external";
+ }
+ if (trimmed.startsWith("#")) {
+ return "anchor";
+ }
+ if (/^[a-z][a-z0-9+.-]*:/i.test(trimmed) || trimmed.startsWith("//")) {
+ return "external";
+ }
+ if (trimmed.startsWith("/")) {
+ return "absolute";
+ }
+ return "relative";
+ }
+ function resolveMarkdownLink(kind, target, srcPath, repoFileSet) {
+ if (kind === "external" || kind === "anchor") {
+ return null;
+ }
+ let cleanTarget = target.split("?")[0] ?? "";
+ const hashIndex = cleanTarget.indexOf("#");
+ if (hashIndex >= 0) {
+ cleanTarget = cleanTarget.slice(0, hashIndex);
+ }
+ cleanTarget = cleanTarget.trim().replace(/\\/g, "/");
+ if (!cleanTarget) {
+ return null;
+ }
+ let candidate;
+ if (kind === "absolute") {
+ candidate = cleanTarget.replace(/^\/+/, "");
+ }
+ else {
+ const dir = pathPosix.dirname(srcPath);
+ candidate = pathPosix.join(dir, cleanTarget);
+ }
+ candidate = pathPosix.normalize(candidate);
+ if (!candidate || candidate.startsWith("..")) {
+ return null;
+ }
+ // Security: Prevent directory traversal by checking for ".." segments
+ // Even after normalization, check that no path segment contains ".." or "."
+ const segments = candidate.split("/");
+ if (segments.some((seg) => seg === ".." || seg === ".")) {
+ return null;
+ }
+ // Additional security: reject absolute paths that may have bypassed earlier checks
+ if (candidate.startsWith("/")) {
+ return null;
+ }
+ const candidates = buildLinkCandidatePaths(candidate);
+ for (const pathCandidate of candidates) {
+ if (repoFileSet.has(pathCandidate)) {
+ return pathCandidate;
+ }
+ }
+ return null;
+ }
+ function buildLinkCandidatePaths(basePath) {
+ const candidates = new Set();
+ candidates.add(basePath);
+ if (!pathPosix.extname(basePath)) {
+ candidates.add(`${basePath}.md`);
+ candidates.add(`${basePath}.mdx`);
+ candidates.add(`${basePath}/README.md`);
+ candidates.add(`${basePath}/readme.md`);
+ candidates.add(`${basePath}/index.md`);
+ candidates.add(`${basePath}/INDEX.md`);
+ }
+ return Array.from(candidates);
+ }
+ function parseJsonValue(content, path) {
+ try {
+ return JSON.parse(content);
+ }
+ catch (error) {
+ // Structured error logging for better debugging
+ const errorMessage = error instanceof Error ? error.message : String(error);
+ console.warn(JSON.stringify({
+ level: "warn",
+ message: "Failed to parse JSON metadata",
+ file: path,
+ error: errorMessage,
+ context: "JSON parsing failed, metadata will be skipped for this file",
+ }));
+ return null;
+ }
+ }
+ function parseYamlValue(content, path) {
+ try {
+ return parseYAML(content);
+ }
+ catch (error) {
+ // Structured error logging for better debugging
+ const errorMessage = error instanceof Error ? error.message : String(error);
+ console.warn(JSON.stringify({
+ level: "warn",
+ message: "Failed to parse YAML metadata",
+ file: path,
+ error: errorMessage,
+ context: "YAML parsing failed, metadata will be skipped for this file",
+ }));
+ return null;
+ }
+ }
+ function parseDocmetaSnapshot(content, path) {
+ const parsed = parseJsonValue(content, path);
+ if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
+ return null;
+ }
+ const candidate = parsed;
+ const targetPath = candidate[DOCMETA_SNAPSHOT_TARGET_FIELD];
+ const frontMatter = candidate[DOCMETA_SNAPSHOT_DATA_FIELD];
+ if (typeof targetPath !== "string") {
+ return null;
+ }
+ const sanitized = sanitizeMetadataTree(frontMatter);
+ if (!sanitized) {
+ return null;
+ }
+ return {
+ targetPath: normalizePathForIndex(targetPath),
+ data: sanitized,
+ };
+ }
+ async function collectPlainDocsPaths(repoRoot) {
+ const results = [];
+ async function walkRelative(relativeDir) {
+ const absDir = join(repoRoot, relativeDir);
+ let entries;
+ try {
+ entries = await readdir(absDir, { withFileTypes: true });
+ }
+ catch {
+ return;
+ }
+ for (const entry of entries) {
+ const relPath = pathPosix.join(relativeDir, entry.name);
+ if (entry.isDirectory()) {
+ await walkRelative(relPath);
+ }
+ else {
+ results.push(relPath);
+ }
+ }
+ }
+ await walkRelative("docs").catch(() => { });
+ await walkRelative("docmeta").catch(() => { });
+ return results;
+ }
+ function extractStructuredData(files, blobs, repoFileSet) {
+ const map = new Map();
+ const pairStates = new Map();
+ for (const file of files) {
+ if (file.isBinary)
+ continue;
+ const blob = blobs.get(file.blobHash);
+ if (!blob || blob.content === null) {
+ continue;
+ }
+ const ext = (file.ext ?? "").toLowerCase();
+ const normalizedPath = normalizePathForIndex(file.path);
+ if (normalizedPath.startsWith(DOCMETA_SNAPSHOT_DIR)) {
+ const snapshot = parseDocmetaSnapshot(blob.content, file.path);
+ if (snapshot) {
+ const existing = map.get(snapshot.targetPath);
+ const structured = existing ?? {
+ metadataRecords: [],
+ metadataPairs: [],
+ links: [],
+ };
+ structured.metadataRecords.push({
+ path: snapshot.targetPath,
+ source: "front_matter",
+ data: snapshot.data,
+ });
+ const pairState = ensurePairState(pairStates, snapshot.targetPath);
+ collectMetadataPairsFromValue(snapshot.data, snapshot.targetPath, "front_matter", structured.metadataPairs, pairState);
+ map.set(snapshot.targetPath, structured);
+ }
+ continue;
+ }
+ const existingEntry = map.get(file.path);
+ const structured = existingEntry ?? {
+ metadataRecords: [],
+ metadataPairs: [],
+ links: [],
+ };
+ let mutated = false;
+ if (ext === ".json") {
+ const parsed = parseJsonValue(blob.content, file.path);
+ const sanitized = sanitizeMetadataTree(parsed);
+ if (sanitized) {
+ structured.metadataRecords.push({ path: file.path, source: "json", data: sanitized });
+ const pairState = ensurePairState(pairStates, file.path);
+ collectMetadataPairsFromValue(sanitized, file.path, "json", structured.metadataPairs, pairState);
+ mutated = true;
+ }
+ }
+ else if (ext === ".yaml" || ext === ".yml") {
+ const parsed = parseYamlValue(blob.content, file.path);
+ const sanitized = sanitizeMetadataTree(parsed);
+ if (sanitized) {
+ structured.metadataRecords.push({ path: file.path, source: "yaml", data: sanitized });
+ const pairState = ensurePairState(pairStates, file.path);
+ collectMetadataPairsFromValue(sanitized, file.path, "yaml", structured.metadataPairs, pairState);
+ mutated = true;
+ }
+ }
+ if (MARKDOWN_EXTENSIONS.has(ext)) {
+ const frontMatter = parseFrontMatterBlock(blob.content, file.path);
+ let markdownBody = blob.content;
+ if (frontMatter) {
+ if (frontMatter.data) {
+ const sanitized = sanitizeMetadataTree(frontMatter.data);
+ if (sanitized) {
+ structured.metadataRecords.push({
+ path: file.path,
+ source: "front_matter",
+ data: sanitized,
+ });
+ const pairState = ensurePairState(pairStates, file.path);
+ collectMetadataPairsFromValue(sanitized, file.path, "front_matter", structured.metadataPairs, pairState);
+ mutated = true;
+ }
+ }
+ markdownBody = frontMatter.body;
+ }
+ const links = extractMarkdownLinks(markdownBody, file.path, repoFileSet);
+ if (links.length > 0) {
+ structured.links.push(...links);
+ mutated = true;
+ }
+ }
+ if (mutated || existingEntry) {
+ map.set(file.path, structured);
+ }
+ }
+ return map;
+ }
+ function aggregateStructuredData(map) {
+ const aggregated = {
+ metadataRecords: [],
+ metadataPairs: [],
+ links: [],
+ };
+ for (const entry of map.values()) {
+ aggregated.metadataRecords.push(...entry.metadataRecords);
+ aggregated.metadataPairs.push(...entry.metadataPairs);
+ aggregated.links.push(...entry.links);
+ }
+ return aggregated;
+ }
  async function buildCodeIntel(files, blobs, workspaceRoot) {
  const fileSet = new Set(files.map((file) => file.path));
  const symbols = [];
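Note (reviewer illustration): to make the extraction pipeline above concrete, here is a hand-written sketch, not output produced by the package, of what it would yield for a hypothetical file docs/getting-started.md whose front matter contains title: Getting Started and tags: [guide, setup]. One document_metadata record carries the sanitized tree; the flattened key/value pairs lowercase keys and repeat the parent key for array items.

    // Hypothetical flattened output for the example file described above.
    const metadataRecord = {
        path: "docs/getting-started.md",
        source: "front_matter",
        data: { title: "Getting Started", tags: ["guide", "setup"] },
    };
    const metadataPairs = [
        { path: "docs/getting-started.md", source: "front_matter", key: "title", value: "Getting Started" },
        { path: "docs/getting-started.md", source: "front_matter", key: "tags", value: "guide" },
        { path: "docs/getting-started.md", source: "front_matter", key: "tags", value: "setup" },
    ];
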
@@ -498,16 +1046,20 @@ async function reconcileDeletedFiles(db, repoId, repoRoot) {
  }
  }
  // Delete all records for removed files in a single transaction
+ // Batched DELETE operations to avoid N+1 query problem
  if (deletedPaths.length > 0) {
  await db.transaction(async () => {
- for (const path of deletedPaths) {
- await db.run("DELETE FROM symbol WHERE repo_id = ? AND path = ?", [repoId, path]);
- await db.run("DELETE FROM snippet WHERE repo_id = ? AND path = ?", [repoId, path]);
- await db.run("DELETE FROM dependency WHERE repo_id = ? AND src_path = ?", [repoId, path]);
- await db.run("DELETE FROM file_embedding WHERE repo_id = ? AND path = ?", [repoId, path]);
- await db.run("DELETE FROM tree WHERE repo_id = ? AND path = ?", [repoId, path]);
- await db.run("DELETE FROM file WHERE repo_id = ? AND path = ?", [repoId, path]);
- }
+ const placeholders = deletedPaths.map(() => "?").join(", ");
+ const params = [repoId, ...deletedPaths];
+ await db.run(`DELETE FROM symbol WHERE repo_id = ? AND path IN (${placeholders})`, params);
+ await db.run(`DELETE FROM snippet WHERE repo_id = ? AND path IN (${placeholders})`, params);
+ await db.run(`DELETE FROM dependency WHERE repo_id = ? AND src_path IN (${placeholders})`, params);
+ await db.run(`DELETE FROM file_embedding WHERE repo_id = ? AND path IN (${placeholders})`, params);
+ await db.run(`DELETE FROM document_metadata WHERE repo_id = ? AND path IN (${placeholders})`, params);
+ await db.run(`DELETE FROM document_metadata_kv WHERE repo_id = ? AND path IN (${placeholders})`, params);
+ await db.run(`DELETE FROM markdown_link WHERE repo_id = ? AND src_path IN (${placeholders})`, params);
+ await db.run(`DELETE FROM tree WHERE repo_id = ? AND path IN (${placeholders})`, params);
+ await db.run(`DELETE FROM file WHERE repo_id = ? AND path IN (${placeholders})`, params);
  });
  }
  return deletedPaths;
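Note (reviewer illustration): one thing to watch with the IN (...) rewrite above is that the placeholder list grows with the number of deleted paths, while the file elsewhere documents a cap on SQL placeholders per statement. The diff does not show chunking here; if it were ever needed, a chunked variant might look roughly like this (a sketch only; deleteInChunks and chunkSize are hypothetical names, not part of the package):

    // Hypothetical chunking sketch for very large delete sets.
    // table and column must be trusted identifiers, as in the package's own interpolated SQL.
    async function deleteInChunks(db, repoId, table, column, paths, chunkSize = 500) {
        for (let i = 0; i < paths.length; i += chunkSize) {
            const chunk = paths.slice(i, i + chunkSize);
            const placeholders = chunk.map(() => "?").join(", ");
            await db.run(`DELETE FROM ${table} WHERE repo_id = ? AND ${column} IN (${placeholders})`, [repoId, ...chunk]);
        }
    }
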
@@ -526,6 +1078,9 @@ async function deleteFileRecords(db, repoId, headCommit, path) {
  await db.run("DELETE FROM snippet WHERE repo_id = ? AND path = ?", [repoId, path]);
  await db.run("DELETE FROM dependency WHERE repo_id = ? AND src_path = ?", [repoId, path]);
  await db.run("DELETE FROM file_embedding WHERE repo_id = ? AND path = ?", [repoId, path]);
+ await db.run("DELETE FROM document_metadata WHERE repo_id = ? AND path = ?", [repoId, path]);
+ await db.run("DELETE FROM document_metadata_kv WHERE repo_id = ? AND path = ?", [repoId, path]);
+ await db.run("DELETE FROM markdown_link WHERE repo_id = ? AND src_path = ?", [repoId, path]);
  await db.run("DELETE FROM tree WHERE repo_id = ? AND commit_hash = ? AND path = ?", [
  repoId,
  headCommit,
@@ -533,6 +1088,25 @@ async function deleteFileRecords(db, repoId, headCommit, path) {
  ]);
  await db.run("DELETE FROM file WHERE repo_id = ? AND path = ?", [repoId, path]);
  }
+ /**
+  * Remove blob records that are no longer referenced by any file.
+  * This garbage collection should be run after full re-indexing or periodically as maintenance.
+  *
+  * @param db - Database client
+  */
+ async function garbageCollectBlobs(db) {
+ console.info("Running garbage collection on blob table...");
+ try {
+ await db.run(`
+ DELETE FROM blob
+ WHERE hash NOT IN (SELECT DISTINCT blob_hash FROM file)
+ `);
+ console.info("Blob garbage collection complete.");
+ }
+ catch (error) {
+ console.warn("Failed to garbage collect blobs:", error instanceof Error ? error.message : String(error));
+ }
+ }
  export async function runIndexer(options) {
  const repoPathCandidates = getRepoPathCandidates(options.repoRoot);
  const repoRoot = repoPathCandidates[0];
@@ -571,6 +1145,10 @@ export async function runIndexer(options) {
  const dbClient = await DuckDBClient.connect({ databasePath, ensureDirectory: true });
  db = dbClient;
  await ensureBaseSchema(dbClient);
+ // Migration: Ensure document_metadata tables exist for existing DBs
+ await ensureDocumentMetadataTables(dbClient);
+ // Phase 1: Ensure normalized_root column exists (Critical #1)
+ await ensureNormalizedRootColumn(dbClient);
  // Phase 3: Ensure FTS metadata columns exist for existing DBs (migration)
  await ensureRepoMetaColumns(dbClient);
  const [headCommit, defaultBranch] = await Promise.all([
@@ -626,6 +1204,12 @@ export async function runIndexer(options) {
  }
  return;
  }
+ const existingFileRows = await dbClient.all("SELECT path FROM file WHERE repo_id = ?", [repoId]);
+ const repoFileSet = new Set(existingFileRows.map((row) => row.path));
+ for (const file of files) {
+ repoFileSet.add(file.path);
+ }
+ const structuredByFile = extractStructuredData(changedFiles, changedBlobs, repoFileSet);
  // Process all changed files in a single transaction for atomicity
  const fileSet = new Set(files.map((f) => f.path));
  const embeddingMap = new Map();
@@ -648,67 +1232,79 @@ export async function runIndexer(options) {
  const blob = changedBlobs.get(file.blobHash);
  if (!blob)
  continue;
- // Build code intelligence for this file
- const fileSymbols = [];
- const fileSnippets = [];
- const fileDependencies = [];
- if (!file.isBinary && blob.content) {
- const analysis = await analyzeSource(file.path, file.lang, blob.content, fileSet, repoRoot);
- for (const symbol of analysis.symbols) {
- fileSymbols.push({
- path: file.path,
- symbolId: symbol.symbolId,
- name: symbol.name,
- kind: symbol.kind,
- rangeStartLine: symbol.rangeStartLine,
- rangeEndLine: symbol.rangeEndLine,
- signature: symbol.signature,
- doc: symbol.doc,
- });
+ try {
+ // Build code intelligence for this file
+ const fileSymbols = [];
+ const fileSnippets = [];
+ const fileDependencies = [];
+ if (!file.isBinary && blob.content) {
+ const analysis = await analyzeSource(file.path, file.lang, blob.content, fileSet, repoRoot);
+ for (const symbol of analysis.symbols) {
+ fileSymbols.push({
+ path: file.path,
+ symbolId: symbol.symbolId,
+ name: symbol.name,
+ kind: symbol.kind,
+ rangeStartLine: symbol.rangeStartLine,
+ rangeEndLine: symbol.rangeEndLine,
+ signature: symbol.signature,
+ doc: symbol.doc,
+ });
+ }
+ for (const snippet of analysis.snippets) {
+ fileSnippets.push({
+ path: file.path,
+ snippetId: snippet.startLine,
+ startLine: snippet.startLine,
+ endLine: snippet.endLine,
+ symbolId: snippet.symbolId,
+ });
+ }
+ for (const dep of analysis.dependencies) {
+ fileDependencies.push({
+ srcPath: file.path,
+ dstKind: dep.dstKind,
+ dst: dep.dst,
+ rel: dep.rel,
+ });
+ }
  }
- for (const snippet of analysis.snippets) {
+ else {
+ // Binary or no content: add fallback snippet
+ const fallback = buildFallbackSnippet(blob.lineCount ?? 1);
  fileSnippets.push({
  path: file.path,
- snippetId: snippet.startLine,
- startLine: snippet.startLine,
- endLine: snippet.endLine,
- symbolId: snippet.symbolId,
+ snippetId: fallback.startLine,
+ startLine: fallback.startLine,
+ endLine: fallback.endLine,
+ symbolId: fallback.symbolId,
  });
  }
- for (const dep of analysis.dependencies) {
- fileDependencies.push({
- srcPath: file.path,
- dstKind: dep.dstKind,
- dst: dep.dst,
- rel: dep.rel,
- });
+ const fileEmbedding = embeddingMap.get(file.path) ?? null;
+ // Delete old records for this file (within main transaction)
+ await deleteFileRecords(dbClient, repoId, headCommit, file.path);
+ // Insert new records (within main transaction)
+ await persistBlobs(dbClient, new Map([[blob.hash, blob]]));
+ await persistTrees(dbClient, repoId, headCommit, [file]);
+ await persistFiles(dbClient, repoId, [file]);
+ await persistSymbols(dbClient, repoId, fileSymbols);
+ await persistSnippets(dbClient, repoId, fileSnippets);
+ await persistDependencies(dbClient, repoId, fileDependencies);
+ const structured = structuredByFile.get(file.path);
+ if (structured) {
+ await persistDocumentMetadata(dbClient, repoId, structured.metadataRecords);
+ await persistMetadataPairs(dbClient, repoId, structured.metadataPairs);
+ await persistMarkdownLinks(dbClient, repoId, structured.links);
  }
+ if (fileEmbedding) {
+ await persistEmbeddings(dbClient, repoId, [fileEmbedding]);
+ }
+ processedCount++;
  }
- else {
- // Binary or no content: add fallback snippet
- const fallback = buildFallbackSnippet(blob.lineCount ?? 1);
- fileSnippets.push({
- path: file.path,
- snippetId: fallback.startLine,
- startLine: fallback.startLine,
- endLine: fallback.endLine,
- symbolId: fallback.symbolId,
- });
- }
- const fileEmbedding = embeddingMap.get(file.path) ?? null;
- // Delete old records for this file (within main transaction)
- await deleteFileRecords(dbClient, repoId, headCommit, file.path);
- // Insert new records (within main transaction)
- await persistBlobs(dbClient, new Map([[blob.hash, blob]]));
- await persistTrees(dbClient, repoId, headCommit, [file]);
- await persistFiles(dbClient, repoId, [file]);
- await persistSymbols(dbClient, repoId, fileSymbols);
- await persistSnippets(dbClient, repoId, fileSnippets);
- await persistDependencies(dbClient, repoId, fileDependencies);
- if (fileEmbedding) {
- await persistEmbeddings(dbClient, repoId, [fileEmbedding]);
+ catch (error) {
+ console.error(`Failed to process file ${file.path}, transaction will rollback:`, error instanceof Error ? error.message : String(error));
+ throw error; // Re-throw to rollback the transaction
  }
- processedCount++;
  }
  // Update timestamp and mark FTS dirty inside transaction for atomicity
  // Fix: Increment fts_generation to prevent lost updates during concurrent rebuilds
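Note (reviewer illustration): the re-thrown error above is what ties per-file failures back to the enclosing transaction, so one bad file aborts the whole incremental batch rather than leaving a partially indexed commit. In outline (a sketch of the control flow only; indexSingleFile is a hypothetical stand-in for the inlined work in the loop above):

    // Sketch: one transaction for the whole batch; any per-file failure rolls everything back.
    await dbClient.transaction(async () => {
        for (const file of changedFiles) {
            try {
                await indexSingleFile(file); // hypothetical helper standing in for the per-file work
            }
            catch (error) {
                console.error(`Failed to process file ${file.path}, transaction will rollback:`, error);
                throw error; // propagate so the transaction wrapper rolls back
            }
        }
    });
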
@@ -725,7 +1321,14 @@ export async function runIndexer(options) {
  return;
  }
  // Full mode: reindex entire repository
- const paths = await gitLsFiles(repoRoot);
+ let paths = await gitLsFiles(repoRoot);
+ if (paths.length === 0) {
+ const fallbackPaths = await collectPlainDocsPaths(repoRoot);
+ if (fallbackPaths.length > 0) {
+ console.warn(`git ls-files returned 0 paths for ${repoRoot}. Falling back to filesystem scan (${fallbackPaths.length} files).`);
+ paths = fallbackPaths;
+ }
+ }
  const { blobs, files, embeddings, missingPaths } = await scanFilesInBatches(repoRoot, paths);
  // In full mode, missingPaths should be rare (git ls-files returns existing files)
  // But log them if they occur (race condition: file deleted between ls-files and scan)
@@ -733,6 +1336,9 @@ export async function runIndexer(options) {
  console.warn(`${missingPaths.length} file(s) disappeared during full reindex (race condition)`);
  }
  const codeIntel = await buildCodeIntel(files, blobs, repoRoot);
+ const repoFileSetFull = new Set(files.map((file) => file.path));
+ const structuredMap = extractStructuredData(files, blobs, repoFileSetFull);
+ const aggregatedStructured = aggregateStructuredData(structuredMap);
  await dbClient.transaction(async () => {
  await dbClient.run("DELETE FROM tree WHERE repo_id = ?", [repoId]);
  await dbClient.run("DELETE FROM file WHERE repo_id = ?", [repoId]);
@@ -740,6 +1346,9 @@ export async function runIndexer(options) {
  await dbClient.run("DELETE FROM snippet WHERE repo_id = ?", [repoId]);
  await dbClient.run("DELETE FROM dependency WHERE repo_id = ?", [repoId]);
  await dbClient.run("DELETE FROM file_embedding WHERE repo_id = ?", [repoId]);
+ await dbClient.run("DELETE FROM document_metadata WHERE repo_id = ?", [repoId]);
+ await dbClient.run("DELETE FROM document_metadata_kv WHERE repo_id = ?", [repoId]);
+ await dbClient.run("DELETE FROM markdown_link WHERE repo_id = ?", [repoId]);
  await persistBlobs(dbClient, blobs);
  await persistTrees(dbClient, repoId, headCommit, files);
  await persistFiles(dbClient, repoId, files);
@@ -747,6 +1356,9 @@ export async function runIndexer(options) {
  await persistSnippets(dbClient, repoId, codeIntel.snippets);
  await persistDependencies(dbClient, repoId, codeIntel.dependencies);
  await persistEmbeddings(dbClient, repoId, embeddings);
+ await persistDocumentMetadata(dbClient, repoId, aggregatedStructured.metadataRecords);
+ await persistMetadataPairs(dbClient, repoId, aggregatedStructured.metadataPairs);
+ await persistMarkdownLinks(dbClient, repoId, aggregatedStructured.links);
  // Update timestamp and mark FTS dirty inside transaction to ensure atomicity
  // Fix: Increment fts_generation to prevent lost updates during concurrent rebuilds
  if (defaultBranch) {
@@ -759,6 +1371,8 @@ export async function runIndexer(options) {
  console.info(`Indexed ${files.length} files for repo ${repoRoot} at ${databasePath} (commit ${headCommit.slice(0, 12)})`);
  // Phase 2+3: Force rebuild FTS index after full reindex
  await rebuildFTSIfNeeded(dbClient, repoId, true);
+ // Garbage collect orphaned blobs after full reindex
+ await garbageCollectBlobs(dbClient);
  }
  finally {
  // Fix #2: Ensure lock is released even if DB connection fails