kiri-mcp-server 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -1
- package/config/scoring-profiles.yml +82 -35
- package/dist/config/scoring-profiles.yml +82 -35
- package/dist/package.json +9 -1
- package/dist/src/indexer/cli.d.ts.map +1 -1
- package/dist/src/indexer/cli.js +712 -98
- package/dist/src/indexer/cli.js.map +1 -1
- package/dist/src/indexer/git.d.ts.map +1 -1
- package/dist/src/indexer/git.js +41 -3
- package/dist/src/indexer/git.js.map +1 -1
- package/dist/src/indexer/migrations/repo-merger.d.ts +33 -0
- package/dist/src/indexer/migrations/repo-merger.d.ts.map +1 -0
- package/dist/src/indexer/migrations/repo-merger.js +67 -0
- package/dist/src/indexer/migrations/repo-merger.js.map +1 -0
- package/dist/src/indexer/schema.d.ts +66 -0
- package/dist/src/indexer/schema.d.ts.map +1 -1
- package/dist/src/indexer/schema.js +337 -0
- package/dist/src/indexer/schema.js.map +1 -1
- package/dist/src/server/boost-profiles.d.ts +1 -1
- package/dist/src/server/boost-profiles.d.ts.map +1 -1
- package/dist/src/server/boost-profiles.js +116 -0
- package/dist/src/server/boost-profiles.js.map +1 -1
- package/dist/src/server/config.d.ts +45 -0
- package/dist/src/server/config.d.ts.map +1 -0
- package/dist/src/server/config.js +146 -0
- package/dist/src/server/config.js.map +1 -0
- package/dist/src/server/context.d.ts +29 -0
- package/dist/src/server/context.d.ts.map +1 -1
- package/dist/src/server/context.js +26 -1
- package/dist/src/server/context.js.map +1 -1
- package/dist/src/server/handlers/snippets-get.d.ts +36 -0
- package/dist/src/server/handlers/snippets-get.d.ts.map +1 -0
- package/dist/src/server/handlers/snippets-get.js +120 -0
- package/dist/src/server/handlers/snippets-get.js.map +1 -0
- package/dist/src/server/handlers.d.ts +32 -20
- package/dist/src/server/handlers.d.ts.map +1 -1
- package/dist/src/server/handlers.js +1554 -338
- package/dist/src/server/handlers.js.map +1 -1
- package/dist/src/server/indexBootstrap.d.ts.map +1 -1
- package/dist/src/server/indexBootstrap.js +49 -2
- package/dist/src/server/indexBootstrap.js.map +1 -1
- package/dist/src/server/main.d.ts.map +1 -1
- package/dist/src/server/main.js +7 -0
- package/dist/src/server/main.js.map +1 -1
- package/dist/src/server/profile-selector.d.ts +33 -0
- package/dist/src/server/profile-selector.d.ts.map +1 -0
- package/dist/src/server/profile-selector.js +291 -0
- package/dist/src/server/profile-selector.js.map +1 -0
- package/dist/src/server/rpc.d.ts.map +1 -1
- package/dist/src/server/rpc.js +36 -6
- package/dist/src/server/rpc.js.map +1 -1
- package/dist/src/server/runtime.d.ts.map +1 -1
- package/dist/src/server/runtime.js +14 -4
- package/dist/src/server/runtime.js.map +1 -1
- package/dist/src/server/scoring.d.ts +7 -1
- package/dist/src/server/scoring.d.ts.map +1 -1
- package/dist/src/server/scoring.js +121 -21
- package/dist/src/server/scoring.js.map +1 -1
- package/dist/src/server/services/index.d.ts +24 -0
- package/dist/src/server/services/index.d.ts.map +1 -0
- package/dist/src/server/services/index.js +20 -0
- package/dist/src/server/services/index.js.map +1 -0
- package/dist/src/server/services/repo-repository.d.ts +61 -0
- package/dist/src/server/services/repo-repository.d.ts.map +1 -0
- package/dist/src/server/services/repo-repository.js +93 -0
- package/dist/src/server/services/repo-repository.js.map +1 -0
- package/dist/src/server/services/repo-resolver.d.ts +28 -0
- package/dist/src/server/services/repo-resolver.d.ts.map +1 -0
- package/dist/src/server/services/repo-resolver.js +62 -0
- package/dist/src/server/services/repo-resolver.js.map +1 -0
- package/dist/src/shared/duckdb.d.ts.map +1 -1
- package/dist/src/shared/duckdb.js +21 -1
- package/dist/src/shared/duckdb.js.map +1 -1
- package/dist/src/shared/fs/safePath.d.ts +7 -0
- package/dist/src/shared/fs/safePath.d.ts.map +1 -0
- package/dist/src/shared/fs/safePath.js +23 -0
- package/dist/src/shared/fs/safePath.js.map +1 -0
- package/dist/src/shared/utils/glob.d.ts +5 -0
- package/dist/src/shared/utils/glob.d.ts.map +1 -0
- package/dist/src/shared/utils/glob.js +22 -0
- package/dist/src/shared/utils/glob.js.map +1 -0
- package/dist/src/shared/utils/retry.d.ts +8 -0
- package/dist/src/shared/utils/retry.d.ts.map +1 -0
- package/dist/src/shared/utils/retry.js +20 -0
- package/dist/src/shared/utils/retry.js.map +1 -0
- package/package.json +28 -22
package/dist/src/indexer/cli.js
CHANGED
@@ -1,21 +1,110 @@
 import { createHash } from "node:crypto";
 import { existsSync } from "node:fs";
-import { readFile, stat } from "node:fs/promises";
-import { join, resolve, extname } from "node:path";
+import { readFile, readdir, stat } from "node:fs/promises";
+import { join, resolve, extname, posix as pathPosix } from "node:path";
 import { pathToFileURL } from "node:url";
+import { parse as parseYAML } from "yaml";
 import { DuckDBClient } from "../shared/duckdb.js";
 import { generateEmbedding } from "../shared/embedding.js";
 import { acquireLock, releaseLock, LockfileError, getLockOwner } from "../shared/utils/lockfile.js";
-import { normalizeDbPath, ensureDbParentDir, getRepoPathCandidates } from "../shared/utils/path.js";
+import { normalizeDbPath, normalizeRepoPath, ensureDbParentDir, getRepoPathCandidates, } from "../shared/utils/path.js";
 import { analyzeSource, buildFallbackSnippet } from "./codeintel.js";
 import { getDefaultBranch, getHeadCommit, gitLsFiles, gitDiffNameOnly } from "./git.js";
 import { detectLanguage } from "./language.js";
+import { mergeRepoRecords } from "./migrations/repo-merger.js";
 import { getIndexerQueue } from "./queue.js";
-import { ensureBaseSchema, ensureRepoMetaColumns, rebuildFTSIfNeeded } from "./schema.js";
+import { ensureBaseSchema, ensureDocumentMetadataTables, ensureNormalizedRootColumn, ensureRepoMetaColumns, rebuildFTSIfNeeded, } from "./schema.js";
 import { IndexWatcher } from "./watch.js";
+function normalizePathForIndex(value) {
+    return value.replace(/\\/g, "/");
+}
+function ensurePairState(stateMap, path) {
+    const existing = stateMap.get(path);
+    if (existing) {
+        return existing;
+    }
+    const created = { count: 0, seen: new Set() };
+    stateMap.set(path, created);
+    return created;
+}
 const MAX_SAMPLE_BYTES = 32_768;
 const MAX_FILE_BYTES = 32 * 1024 * 1024; // 32MB limit to prevent memory exhaustion
 const SCAN_BATCH_SIZE = 100; // Process files in batches to limit memory usage
+const MARKDOWN_EXTENSIONS = new Set([".md", ".mdx", ".markdown"]);
+const DOCMETA_SNAPSHOT_DIR = "docmeta/";
+const DOCMETA_SNAPSHOT_TARGET_FIELD = "target_path";
+const DOCMETA_SNAPSHOT_DATA_FIELD = "front_matter";
+/**
+ * Metadata processing limits to prevent DoS attacks and memory exhaustion.
+ *
+ * These values balance security, performance, and real-world usage patterns.
+ * Adjust based on:
+ * - Performance testing with 10000+ file repositories
+ * - Memory profiling (Node.js heap size impact)
+ * - Analysis of 99th percentile values in production data
+ */
+/**
+ * Maximum length of a single metadata value (characters).
+ *
+ * Rationale: Typical YAML front matter fields (title, description) are 200-300 chars.
+ * Setting to 512 provides headroom while preventing abuse.
+ *
+ * Example use cases:
+ * - Document titles: ~100 chars
+ * - Descriptions: ~300 chars
+ * - Tags (as comma-separated string): ~200 chars
+ */
+const MAX_METADATA_VALUE_LENGTH = 512;
+/**
+ * Maximum nesting depth for metadata tree structures.
+ *
+ * Rationale: Normal YAML/JSON documents nest 3-5 levels deep.
+ * Setting to 8 accommodates complex configurations while preventing stack overflow.
+ *
+ * Defense: Prevents malicious deeply-nested documents from causing:
+ * - Stack overflow (recursive function calls)
+ * - Exponential memory growth
+ * - CPU exhaustion during traversal
+ */
+const MAX_METADATA_DEPTH = 8;
+/**
+ * Maximum number of elements in a metadata array.
+ *
+ * Rationale: Common use case is tags/categories arrays with ~10 items.
+ * Setting to 64 provides generous headroom for edge cases.
+ *
+ * Example arrays:
+ * - Tags: ["frontend", "react", "typescript"] (~3-10 items)
+ * - Authors: ["John Doe", "Jane Smith"] (~1-5 items)
+ * - Categories: ["guide", "tutorial", "api"] (~2-8 items)
+ */
+const MAX_METADATA_ARRAY_LENGTH = 64;
+/**
+ * Maximum number of key-value pairs extracted per file.
+ *
+ * Rationale: Memory footprint calculation:
+ * - 256 pairs × ~40 bytes/pair ≈ 10KB per file
+ * - For 10000 files: 10KB × 10000 = 100MB (acceptable overhead)
+ *
+ * Prevents DoS from files with thousands of metadata fields.
+ * Normal documents have 5-20 metadata fields.
+ */
+const MAX_METADATA_PAIRS_PER_FILE = 256;
+/**
+ * Maximum number of object keys processed in a metadata tree node.
+ *
+ * Rationale: Prevents memory exhaustion from maliciously crafted objects with excessive keys.
+ * Normal metadata objects have 5-20 keys. Setting to 256 provides generous headroom.
+ *
+ * Memory impact: Each key entry requires ~50 bytes (key name + value reference).
+ * 256 keys × 50 bytes ≈ 12.8KB per object, which is acceptable.
+ */
+const MAX_METADATA_OBJECT_KEYS = 256;
+/**
+ * Key name used for root-level scalar values in metadata trees.
+ * Internal use only - not exposed in search results.
+ */
+const ROOT_METADATA_KEY = "__root";
 /**
  * Maximum number of SQL placeholders per INSERT statement.
  *
@@ -72,43 +161,17 @@ function isBinaryBuffer(buffer) {
  * @param defaultBranch - Default branch name (e.g., "main", "master"), or null if unknown
  * @returns The repository ID (auto-generated on first insert, reused thereafter)
  */
-async function mergeLegacyRepoRows(db, canonicalRepoId, legacyRepoIds) {
-    if (legacyRepoIds.length === 0) {
-        return;
-    }
-    const referencingTables = await db.all(`SELECT DISTINCT c.table_name
-        FROM duckdb_columns() AS c
-        JOIN duckdb_tables() AS t
-            ON c.database_name = t.database_name
-            AND c.schema_name = t.schema_name
-            AND c.table_name = t.table_name
-        WHERE c.column_name = 'repo_id'
-            AND c.table_name <> 'repo'
-            AND t.table_type = 'BASE TABLE'`);
-    const safeTables = referencingTables
-        .map((row) => row.table_name)
-        .filter((name) => /^[A-Za-z0-9_]+$/.test(name));
-    await db.transaction(async () => {
-        for (const legacyRepoId of legacyRepoIds) {
-            for (const tableName of safeTables) {
-                await db.run(`UPDATE ${tableName} SET repo_id = ? WHERE repo_id = ?`, [
-                    canonicalRepoId,
-                    legacyRepoId,
-                ]);
-            }
-            await db.run("DELETE FROM repo WHERE id = ?", [legacyRepoId]);
-        }
-    });
-}
 async function ensureRepo(db, repoRoot, defaultBranch, candidateRoots) {
     const searchRoots = Array.from(new Set([repoRoot, ...(candidateRoots ?? [])]));
     const placeholders = searchRoots.map(() => "?").join(", ");
     let rows = await db.all(`SELECT id, root FROM repo WHERE root IN (${placeholders})`, searchRoots);
     if (rows.length === 0) {
-
-
+        const normalized = normalizeRepoPath(repoRoot);
+        await db.run(`INSERT INTO repo (root, normalized_root, default_branch, indexed_at)
+            VALUES (?, ?, ?, CURRENT_TIMESTAMP)
             ON CONFLICT(root) DO UPDATE SET
-
+                normalized_root = excluded.normalized_root,
+                default_branch = COALESCE(excluded.default_branch, repo.default_branch)`, [repoRoot, normalized, defaultBranch]);
         rows = await db.all(`SELECT id, root FROM repo WHERE root IN (${placeholders})`, searchRoots);
     }
     if (rows.length === 0) {
@@ -123,7 +186,7 @@ async function ensureRepo(db, repoRoot, defaultBranch, candidateRoots) {
         canonicalRow = { ...canonicalRow, root: repoRoot };
     }
     const legacyIds = rows.filter((row) => row.id !== canonicalRow.id).map((row) => row.id);
-    await
+    await mergeRepoRecords(db, canonicalRow.id, legacyIds);
     return canonicalRow.id;
 }
 /**
@@ -302,6 +365,491 @@ async function persistEmbeddings(db, repoId, records) {
         ]),
     }));
 }
+async function persistDocumentMetadata(db, repoId, records) {
+    if (records.length === 0)
+        return;
+    const BATCH_SIZE = calculateBatchSize(4);
+    await persistInBatches(db, records, BATCH_SIZE, (batch) => ({
+        sql: `INSERT OR REPLACE INTO document_metadata (repo_id, path, source, data) VALUES ${batch.map(() => "(?, ?, ?, ?)").join(", ")}`,
+        params: batch.flatMap((record) => [
+            repoId,
+            record.path,
+            record.source,
+            JSON.stringify(record.data),
+        ]),
+    }));
+}
+async function persistMetadataPairs(db, repoId, records) {
+    if (records.length === 0)
+        return;
+    const BATCH_SIZE = calculateBatchSize(5);
+    await persistInBatches(db, records, BATCH_SIZE, (batch) => ({
+        sql: `INSERT OR REPLACE INTO document_metadata_kv (repo_id, path, source, key, value) VALUES ${batch.map(() => "(?, ?, ?, ?, ?)").join(", ")}`,
+        params: batch.flatMap((record) => [
+            repoId,
+            record.path,
+            record.source,
+            record.key,
+            record.value,
+        ]),
+    }));
+}
+async function persistMarkdownLinks(db, repoId, records) {
+    if (records.length === 0)
+        return;
+    const BATCH_SIZE = calculateBatchSize(6);
+    await persistInBatches(db, records, BATCH_SIZE, (batch) => ({
+        sql: `INSERT OR REPLACE INTO markdown_link (repo_id, src_path, target, resolved_path, anchor_text, kind) VALUES ${batch.map(() => "(?, ?, ?, ?, ?, ?)").join(", ")}`,
+        params: batch.flatMap((record) => [
+            repoId,
+            record.srcPath,
+            record.target,
+            record.resolvedPath,
+            record.anchorText,
+            record.kind,
+        ]),
+    }));
+}
+function sanitizeMetadataTree(value, depth = 0) {
+    // Depth check at the beginning to prevent stack overflow
+    if (depth > MAX_METADATA_DEPTH) {
+        console.warn(`Metadata depth limit (${MAX_METADATA_DEPTH}) exceeded, truncating nested value`);
+        return null;
+    }
+    if (value === null || value === undefined) {
+        return null;
+    }
+    if (value instanceof Date) {
+        return value.toISOString();
+    }
+    if (typeof value === "string") {
+        const trimmed = value.trim();
+        if (trimmed.length === 0) {
+            return null;
+        }
+        return trimmed.length > MAX_METADATA_VALUE_LENGTH
+            ? trimmed.slice(0, MAX_METADATA_VALUE_LENGTH)
+            : trimmed;
+    }
+    if (typeof value === "number") {
+        if (!Number.isFinite(value)) {
+            return null;
+        }
+        return value;
+    }
+    if (typeof value === "boolean") {
+        return value;
+    }
+    if (Array.isArray(value)) {
+        if (value.length === 0) {
+            return null;
+        }
+        // Warn if array is too large
+        if (value.length > MAX_METADATA_ARRAY_LENGTH) {
+            console.warn(`Metadata array has ${value.length} elements, limiting to ${MAX_METADATA_ARRAY_LENGTH}`);
+        }
+        const sanitized = [];
+        for (const item of value.slice(0, MAX_METADATA_ARRAY_LENGTH)) {
+            const child = sanitizeMetadataTree(item, depth + 1);
+            if (child !== null) {
+                sanitized.push(child);
+            }
+        }
+        return sanitized.length > 0 ? sanitized : null;
+    }
+    if (typeof value === "object") {
+        const result = {};
+        const entries = Object.entries(value);
+        // Limit number of object keys to prevent memory exhaustion
+        if (entries.length > MAX_METADATA_OBJECT_KEYS) {
+            console.warn(`Object has ${entries.length} keys, limiting to ${MAX_METADATA_OBJECT_KEYS} to prevent memory exhaustion`);
+        }
+        for (const [key, child] of entries.slice(0, MAX_METADATA_OBJECT_KEYS)) {
+            if (!key)
+                continue;
+            const sanitizedChild = sanitizeMetadataTree(child, depth + 1);
+            if (sanitizedChild !== null) {
+                result[key] = sanitizedChild;
+            }
+        }
+        return Object.keys(result).length > 0 ? result : null;
+    }
+    return null;
+}
+function metadataValueToString(value) {
+    if (typeof value === "string") {
+        return value;
+    }
+    if (typeof value === "number") {
+        return Number.isFinite(value) ? value.toString() : "";
+    }
+    return value ? "true" : "false";
+}
+function collectMetadataPairsFromValue(value, path, source, pairs, state, keyPrefix = "") {
+    if (state.count >= MAX_METADATA_PAIRS_PER_FILE) {
+        return;
+    }
+    if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
+        const key = keyPrefix.length > 0 ? keyPrefix : ROOT_METADATA_KEY;
+        let normalized = metadataValueToString(value).trim();
+        if (normalized.length === 0) {
+            return;
+        }
+        if (normalized.length > MAX_METADATA_VALUE_LENGTH) {
+            normalized = normalized.slice(0, MAX_METADATA_VALUE_LENGTH);
+        }
+        const dedupeKey = `${source}:${key}:${normalized.toLowerCase()}`;
+        if (state.seen.has(dedupeKey)) {
+            return;
+        }
+        state.seen.add(dedupeKey);
+        pairs.push({ path, source, key, value: normalized });
+        state.count += 1;
+        return;
+    }
+    if (Array.isArray(value)) {
+        for (const item of value) {
+            collectMetadataPairsFromValue(item, path, source, pairs, state, keyPrefix);
+            if (state.count >= MAX_METADATA_PAIRS_PER_FILE) {
+                break;
+            }
+        }
+        return;
+    }
+    if (typeof value === "object" && value !== null) {
+        for (const [childKey, childValue] of Object.entries(value)) {
+            const normalizedKey = childKey.toLowerCase();
+            const nextPrefix = keyPrefix.length > 0 ? `${keyPrefix}.${normalizedKey}` : normalizedKey;
+            collectMetadataPairsFromValue(childValue, path, source, pairs, state, nextPrefix);
+            if (state.count >= MAX_METADATA_PAIRS_PER_FILE) {
+                break;
+            }
+        }
+    }
+}
+function parseFrontMatterBlock(content, path) {
+    const leading = content.startsWith("\uFEFF") ? content.slice(1) : content;
+    if (!leading.startsWith("---")) {
+        return null;
+    }
+    const match = leading.match(/^---\s*\r?\n([\s\S]*?)\r?\n---\s*(?:\r?\n|$)/);
+    if (!match) {
+        return null;
+    }
+    const rawBlock = match[1] ?? "";
+    const body = leading.slice(match[0].length);
+    try {
+        const data = parseYAML(rawBlock);
+        return { data: data ?? null, body };
+    }
+    catch (error) {
+        // Structured error logging for better debugging
+        const errorMessage = error instanceof Error ? error.message : String(error);
+        console.warn(JSON.stringify({
+            level: "warn",
+            message: "Failed to parse Markdown front matter",
+            file: path,
+            error: errorMessage,
+            context: "Front matter YAML parsing failed, metadata will be skipped for this file",
+        }));
+        return { data: null, body };
+    }
+}
+function stripLinkTitle(target) {
+    const trimmed = target.trim();
+    if (trimmed.length === 0) {
+        return trimmed;
+    }
+    const angleWrapped = trimmed.startsWith("<") && trimmed.endsWith(">");
+    const unwrapped = angleWrapped ? trimmed.slice(1, -1) : trimmed;
+    return unwrapped.replace(/\s+("[^"]*"|'[^']*')\s*$/, "").trim();
+}
+function extractMarkdownLinks(content, srcPath, repoFileSet) {
+    const links = [];
+    const pattern = /\[(?<text>[^\]]+)\]\((?<target>[^)]+)\)/g;
+    let match;
+    while ((match = pattern.exec(content)) !== null) {
+        if (match.index > 0 && content[match.index - 1] === "!") {
+            continue; // Skip images
+        }
+        const text = match.groups?.text?.trim() ?? "";
+        let target = match.groups?.target?.trim() ?? "";
+        if (!text || !target) {
+            continue;
+        }
+        target = stripLinkTitle(target);
+        if (!target) {
+            continue;
+        }
+        const kind = classifyMarkdownTarget(target);
+        const resolvedPath = resolveMarkdownLink(kind, target, srcPath, repoFileSet);
+        if (kind === "anchor" && resolvedPath === null) {
+            continue;
+        }
+        links.push({
+            srcPath,
+            target,
+            resolvedPath,
+            anchorText: text.slice(0, 160),
+            kind,
+        });
+    }
+    return links;
+}
+function classifyMarkdownTarget(target) {
+    const trimmed = target.trim();
+    if (!trimmed) {
+        return "external";
+    }
+    if (trimmed.startsWith("#")) {
+        return "anchor";
+    }
+    if (/^[a-z][a-z0-9+.-]*:/i.test(trimmed) || trimmed.startsWith("//")) {
+        return "external";
+    }
+    if (trimmed.startsWith("/")) {
+        return "absolute";
+    }
+    return "relative";
+}
+function resolveMarkdownLink(kind, target, srcPath, repoFileSet) {
+    if (kind === "external" || kind === "anchor") {
+        return null;
+    }
+    let cleanTarget = target.split("?")[0] ?? "";
+    const hashIndex = cleanTarget.indexOf("#");
+    if (hashIndex >= 0) {
+        cleanTarget = cleanTarget.slice(0, hashIndex);
+    }
+    cleanTarget = cleanTarget.trim().replace(/\\/g, "/");
+    if (!cleanTarget) {
+        return null;
+    }
+    let candidate;
+    if (kind === "absolute") {
+        candidate = cleanTarget.replace(/^\/+/, "");
+    }
+    else {
+        const dir = pathPosix.dirname(srcPath);
+        candidate = pathPosix.join(dir, cleanTarget);
+    }
+    candidate = pathPosix.normalize(candidate);
+    if (!candidate || candidate.startsWith("..")) {
+        return null;
+    }
+    // Security: Prevent directory traversal by checking for ".." segments
+    // Even after normalization, check that no path segment contains ".." or "."
+    const segments = candidate.split("/");
+    if (segments.some((seg) => seg === ".." || seg === ".")) {
+        return null;
+    }
+    // Additional security: reject absolute paths that may have bypassed earlier checks
+    if (candidate.startsWith("/")) {
+        return null;
+    }
+    const candidates = buildLinkCandidatePaths(candidate);
+    for (const pathCandidate of candidates) {
+        if (repoFileSet.has(pathCandidate)) {
+            return pathCandidate;
+        }
+    }
+    return null;
+}
+function buildLinkCandidatePaths(basePath) {
+    const candidates = new Set();
+    candidates.add(basePath);
+    if (!pathPosix.extname(basePath)) {
+        candidates.add(`${basePath}.md`);
+        candidates.add(`${basePath}.mdx`);
+        candidates.add(`${basePath}/README.md`);
+        candidates.add(`${basePath}/readme.md`);
+        candidates.add(`${basePath}/index.md`);
+        candidates.add(`${basePath}/INDEX.md`);
+    }
+    return Array.from(candidates);
+}
+function parseJsonValue(content, path) {
+    try {
+        return JSON.parse(content);
+    }
+    catch (error) {
+        // Structured error logging for better debugging
+        const errorMessage = error instanceof Error ? error.message : String(error);
+        console.warn(JSON.stringify({
+            level: "warn",
+            message: "Failed to parse JSON metadata",
+            file: path,
+            error: errorMessage,
+            context: "JSON parsing failed, metadata will be skipped for this file",
+        }));
+        return null;
+    }
+}
+function parseYamlValue(content, path) {
+    try {
+        return parseYAML(content);
+    }
+    catch (error) {
+        // Structured error logging for better debugging
+        const errorMessage = error instanceof Error ? error.message : String(error);
+        console.warn(JSON.stringify({
+            level: "warn",
+            message: "Failed to parse YAML metadata",
+            file: path,
+            error: errorMessage,
+            context: "YAML parsing failed, metadata will be skipped for this file",
+        }));
+        return null;
+    }
+}
+function parseDocmetaSnapshot(content, path) {
+    const parsed = parseJsonValue(content, path);
+    if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
+        return null;
+    }
+    const candidate = parsed;
+    const targetPath = candidate[DOCMETA_SNAPSHOT_TARGET_FIELD];
+    const frontMatter = candidate[DOCMETA_SNAPSHOT_DATA_FIELD];
+    if (typeof targetPath !== "string") {
+        return null;
+    }
+    const sanitized = sanitizeMetadataTree(frontMatter);
+    if (!sanitized) {
+        return null;
+    }
+    return {
+        targetPath: normalizePathForIndex(targetPath),
+        data: sanitized,
+    };
+}
+async function collectPlainDocsPaths(repoRoot) {
+    const results = [];
+    async function walkRelative(relativeDir) {
+        const absDir = join(repoRoot, relativeDir);
+        let entries;
+        try {
+            entries = await readdir(absDir, { withFileTypes: true });
+        }
+        catch {
+            return;
+        }
+        for (const entry of entries) {
+            const relPath = pathPosix.join(relativeDir, entry.name);
+            if (entry.isDirectory()) {
+                await walkRelative(relPath);
+            }
+            else {
+                results.push(relPath);
+            }
+        }
+    }
+    await walkRelative("docs").catch(() => { });
+    await walkRelative("docmeta").catch(() => { });
+    return results;
+}
+function extractStructuredData(files, blobs, repoFileSet) {
+    const map = new Map();
+    const pairStates = new Map();
+    for (const file of files) {
+        if (file.isBinary)
+            continue;
+        const blob = blobs.get(file.blobHash);
+        if (!blob || blob.content === null) {
+            continue;
+        }
+        const ext = (file.ext ?? "").toLowerCase();
+        const normalizedPath = normalizePathForIndex(file.path);
+        if (normalizedPath.startsWith(DOCMETA_SNAPSHOT_DIR)) {
+            const snapshot = parseDocmetaSnapshot(blob.content, file.path);
+            if (snapshot) {
+                const existing = map.get(snapshot.targetPath);
+                const structured = existing ?? {
+                    metadataRecords: [],
+                    metadataPairs: [],
+                    links: [],
+                };
+                structured.metadataRecords.push({
+                    path: snapshot.targetPath,
+                    source: "front_matter",
+                    data: snapshot.data,
+                });
+                const pairState = ensurePairState(pairStates, snapshot.targetPath);
+                collectMetadataPairsFromValue(snapshot.data, snapshot.targetPath, "front_matter", structured.metadataPairs, pairState);
+                map.set(snapshot.targetPath, structured);
+            }
+            continue;
+        }
+        const existingEntry = map.get(file.path);
+        const structured = existingEntry ?? {
+            metadataRecords: [],
+            metadataPairs: [],
+            links: [],
+        };
+        let mutated = false;
+        if (ext === ".json") {
+            const parsed = parseJsonValue(blob.content, file.path);
+            const sanitized = sanitizeMetadataTree(parsed);
+            if (sanitized) {
+                structured.metadataRecords.push({ path: file.path, source: "json", data: sanitized });
+                const pairState = ensurePairState(pairStates, file.path);
+                collectMetadataPairsFromValue(sanitized, file.path, "json", structured.metadataPairs, pairState);
+                mutated = true;
+            }
+        }
+        else if (ext === ".yaml" || ext === ".yml") {
+            const parsed = parseYamlValue(blob.content, file.path);
+            const sanitized = sanitizeMetadataTree(parsed);
+            if (sanitized) {
+                structured.metadataRecords.push({ path: file.path, source: "yaml", data: sanitized });
+                const pairState = ensurePairState(pairStates, file.path);
+                collectMetadataPairsFromValue(sanitized, file.path, "yaml", structured.metadataPairs, pairState);
+                mutated = true;
+            }
+        }
+        if (MARKDOWN_EXTENSIONS.has(ext)) {
+            const frontMatter = parseFrontMatterBlock(blob.content, file.path);
+            let markdownBody = blob.content;
+            if (frontMatter) {
+                if (frontMatter.data) {
+                    const sanitized = sanitizeMetadataTree(frontMatter.data);
+                    if (sanitized) {
+                        structured.metadataRecords.push({
+                            path: file.path,
+                            source: "front_matter",
+                            data: sanitized,
+                        });
+                        const pairState = ensurePairState(pairStates, file.path);
+                        collectMetadataPairsFromValue(sanitized, file.path, "front_matter", structured.metadataPairs, pairState);
+                        mutated = true;
+                    }
+                }
+                markdownBody = frontMatter.body;
+            }
+            const links = extractMarkdownLinks(markdownBody, file.path, repoFileSet);
+            if (links.length > 0) {
+                structured.links.push(...links);
+                mutated = true;
+            }
+        }
+        if (mutated || existingEntry) {
+            map.set(file.path, structured);
+        }
+    }
+    return map;
+}
+function aggregateStructuredData(map) {
+    const aggregated = {
+        metadataRecords: [],
+        metadataPairs: [],
+        links: [],
+    };
+    for (const entry of map.values()) {
+        aggregated.metadataRecords.push(...entry.metadataRecords);
+        aggregated.metadataPairs.push(...entry.metadataPairs);
+        aggregated.links.push(...entry.links);
+    }
+    return aggregated;
+}
 async function buildCodeIntel(files, blobs, workspaceRoot) {
     const fileSet = new Set(files.map((file) => file.path));
     const symbols = [];
@@ -498,16 +1046,20 @@ async function reconcileDeletedFiles(db, repoId, repoRoot) {
         }
     }
     // Delete all records for removed files in a single transaction
+    // Batched DELETE operations to avoid N+1 query problem
     if (deletedPaths.length > 0) {
         await db.transaction(async () => {
-
-
-
-
-
-
-
-            }
+            const placeholders = deletedPaths.map(() => "?").join(", ");
+            const params = [repoId, ...deletedPaths];
+            await db.run(`DELETE FROM symbol WHERE repo_id = ? AND path IN (${placeholders})`, params);
+            await db.run(`DELETE FROM snippet WHERE repo_id = ? AND path IN (${placeholders})`, params);
+            await db.run(`DELETE FROM dependency WHERE repo_id = ? AND src_path IN (${placeholders})`, params);
+            await db.run(`DELETE FROM file_embedding WHERE repo_id = ? AND path IN (${placeholders})`, params);
+            await db.run(`DELETE FROM document_metadata WHERE repo_id = ? AND path IN (${placeholders})`, params);
+            await db.run(`DELETE FROM document_metadata_kv WHERE repo_id = ? AND path IN (${placeholders})`, params);
+            await db.run(`DELETE FROM markdown_link WHERE repo_id = ? AND src_path IN (${placeholders})`, params);
+            await db.run(`DELETE FROM tree WHERE repo_id = ? AND path IN (${placeholders})`, params);
+            await db.run(`DELETE FROM file WHERE repo_id = ? AND path IN (${placeholders})`, params);
         });
     }
     return deletedPaths;
@@ -526,6 +1078,9 @@ async function deleteFileRecords(db, repoId, headCommit, path) {
     await db.run("DELETE FROM snippet WHERE repo_id = ? AND path = ?", [repoId, path]);
    await db.run("DELETE FROM dependency WHERE repo_id = ? AND src_path = ?", [repoId, path]);
     await db.run("DELETE FROM file_embedding WHERE repo_id = ? AND path = ?", [repoId, path]);
+    await db.run("DELETE FROM document_metadata WHERE repo_id = ? AND path = ?", [repoId, path]);
+    await db.run("DELETE FROM document_metadata_kv WHERE repo_id = ? AND path = ?", [repoId, path]);
+    await db.run("DELETE FROM markdown_link WHERE repo_id = ? AND src_path = ?", [repoId, path]);
     await db.run("DELETE FROM tree WHERE repo_id = ? AND commit_hash = ? AND path = ?", [
         repoId,
         headCommit,
@@ -533,6 +1088,25 @@ async function deleteFileRecords(db, repoId, headCommit, path) {
     ]);
     await db.run("DELETE FROM file WHERE repo_id = ? AND path = ?", [repoId, path]);
 }
+/**
+ * Remove blob records that are no longer referenced by any file.
+ * This garbage collection should be run after full re-indexing or periodically as maintenance.
+ *
+ * @param db - Database client
+ */
+async function garbageCollectBlobs(db) {
+    console.info("Running garbage collection on blob table...");
+    try {
+        await db.run(`
+            DELETE FROM blob
+            WHERE hash NOT IN (SELECT DISTINCT blob_hash FROM file)
+        `);
+        console.info("Blob garbage collection complete.");
+    }
+    catch (error) {
+        console.warn("Failed to garbage collect blobs:", error instanceof Error ? error.message : String(error));
+    }
+}
 export async function runIndexer(options) {
     const repoPathCandidates = getRepoPathCandidates(options.repoRoot);
     const repoRoot = repoPathCandidates[0];
@@ -571,6 +1145,10 @@ export async function runIndexer(options) {
         const dbClient = await DuckDBClient.connect({ databasePath, ensureDirectory: true });
         db = dbClient;
         await ensureBaseSchema(dbClient);
+        // Migration: Ensure document_metadata tables exist for existing DBs
+        await ensureDocumentMetadataTables(dbClient);
+        // Phase 1: Ensure normalized_root column exists (Critical #1)
+        await ensureNormalizedRootColumn(dbClient);
         // Phase 3: Ensure FTS metadata columns exist for existing DBs (migration)
         await ensureRepoMetaColumns(dbClient);
         const [headCommit, defaultBranch] = await Promise.all([
@@ -626,6 +1204,12 @@ export async function runIndexer(options) {
             }
             return;
         }
+        const existingFileRows = await dbClient.all("SELECT path FROM file WHERE repo_id = ?", [repoId]);
+        const repoFileSet = new Set(existingFileRows.map((row) => row.path));
+        for (const file of files) {
+            repoFileSet.add(file.path);
+        }
+        const structuredByFile = extractStructuredData(changedFiles, changedBlobs, repoFileSet);
         // Process all changed files in a single transaction for atomicity
         const fileSet = new Set(files.map((f) => f.path));
         const embeddingMap = new Map();
@@ -648,67 +1232,79 @@ export async function runIndexer(options) {
             const blob = changedBlobs.get(file.blobHash);
             if (!blob)
                 continue;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            try {
+                // Build code intelligence for this file
+                const fileSymbols = [];
+                const fileSnippets = [];
+                const fileDependencies = [];
+                if (!file.isBinary && blob.content) {
+                    const analysis = await analyzeSource(file.path, file.lang, blob.content, fileSet, repoRoot);
+                    for (const symbol of analysis.symbols) {
+                        fileSymbols.push({
+                            path: file.path,
+                            symbolId: symbol.symbolId,
+                            name: symbol.name,
+                            kind: symbol.kind,
+                            rangeStartLine: symbol.rangeStartLine,
+                            rangeEndLine: symbol.rangeEndLine,
+                            signature: symbol.signature,
+                            doc: symbol.doc,
+                        });
+                    }
+                    for (const snippet of analysis.snippets) {
+                        fileSnippets.push({
+                            path: file.path,
+                            snippetId: snippet.startLine,
+                            startLine: snippet.startLine,
+                            endLine: snippet.endLine,
+                            symbolId: snippet.symbolId,
+                        });
+                    }
+                    for (const dep of analysis.dependencies) {
+                        fileDependencies.push({
+                            srcPath: file.path,
+                            dstKind: dep.dstKind,
+                            dst: dep.dst,
+                            rel: dep.rel,
+                        });
+                    }
                 }
-
+                else {
+                    // Binary or no content: add fallback snippet
+                    const fallback = buildFallbackSnippet(blob.lineCount ?? 1);
                     fileSnippets.push({
                         path: file.path,
-                        snippetId:
-                        startLine:
-                        endLine:
-                        symbolId:
+                        snippetId: fallback.startLine,
+                        startLine: fallback.startLine,
+                        endLine: fallback.endLine,
+                        symbolId: fallback.symbolId,
                     });
                 }
-
-
-
-
-
-
-
+                const fileEmbedding = embeddingMap.get(file.path) ?? null;
+                // Delete old records for this file (within main transaction)
+                await deleteFileRecords(dbClient, repoId, headCommit, file.path);
+                // Insert new records (within main transaction)
+                await persistBlobs(dbClient, new Map([[blob.hash, blob]]));
+                await persistTrees(dbClient, repoId, headCommit, [file]);
+                await persistFiles(dbClient, repoId, [file]);
+                await persistSymbols(dbClient, repoId, fileSymbols);
+                await persistSnippets(dbClient, repoId, fileSnippets);
+                await persistDependencies(dbClient, repoId, fileDependencies);
+                const structured = structuredByFile.get(file.path);
+                if (structured) {
+                    await persistDocumentMetadata(dbClient, repoId, structured.metadataRecords);
+                    await persistMetadataPairs(dbClient, repoId, structured.metadataPairs);
+                    await persistMarkdownLinks(dbClient, repoId, structured.links);
                 }
+                if (fileEmbedding) {
+                    await persistEmbeddings(dbClient, repoId, [fileEmbedding]);
+                }
+                processedCount++;
             }
-
-
-
-            fileSnippets.push({
-                path: file.path,
-                snippetId: fallback.startLine,
-                startLine: fallback.startLine,
-                endLine: fallback.endLine,
-                symbolId: fallback.symbolId,
-            });
-            }
-            const fileEmbedding = embeddingMap.get(file.path) ?? null;
-            // Delete old records for this file (within main transaction)
-            await deleteFileRecords(dbClient, repoId, headCommit, file.path);
-            // Insert new records (within main transaction)
-            await persistBlobs(dbClient, new Map([[blob.hash, blob]]));
-            await persistTrees(dbClient, repoId, headCommit, [file]);
-            await persistFiles(dbClient, repoId, [file]);
-            await persistSymbols(dbClient, repoId, fileSymbols);
-            await persistSnippets(dbClient, repoId, fileSnippets);
-            await persistDependencies(dbClient, repoId, fileDependencies);
-            if (fileEmbedding) {
-                await persistEmbeddings(dbClient, repoId, [fileEmbedding]);
+            catch (error) {
+                console.error(`Failed to process file ${file.path}, transaction will rollback:`, error instanceof Error ? error.message : String(error));
+                throw error; // Re-throw to rollback the transaction
             }
-            processedCount++;
         }
         // Update timestamp and mark FTS dirty inside transaction for atomicity
         // Fix: Increment fts_generation to prevent lost updates during concurrent rebuilds
@@ -725,7 +1321,14 @@ export async function runIndexer(options) {
             return;
         }
         // Full mode: reindex entire repository
-
+        let paths = await gitLsFiles(repoRoot);
+        if (paths.length === 0) {
+            const fallbackPaths = await collectPlainDocsPaths(repoRoot);
+            if (fallbackPaths.length > 0) {
+                console.warn(`git ls-files returned 0 paths for ${repoRoot}. Falling back to filesystem scan (${fallbackPaths.length} files).`);
+                paths = fallbackPaths;
+            }
+        }
         const { blobs, files, embeddings, missingPaths } = await scanFilesInBatches(repoRoot, paths);
         // In full mode, missingPaths should be rare (git ls-files returns existing files)
         // But log them if they occur (race condition: file deleted between ls-files and scan)
@@ -733,6 +1336,9 @@ export async function runIndexer(options) {
            console.warn(`${missingPaths.length} file(s) disappeared during full reindex (race condition)`);
         }
         const codeIntel = await buildCodeIntel(files, blobs, repoRoot);
+        const repoFileSetFull = new Set(files.map((file) => file.path));
+        const structuredMap = extractStructuredData(files, blobs, repoFileSetFull);
+        const aggregatedStructured = aggregateStructuredData(structuredMap);
         await dbClient.transaction(async () => {
             await dbClient.run("DELETE FROM tree WHERE repo_id = ?", [repoId]);
             await dbClient.run("DELETE FROM file WHERE repo_id = ?", [repoId]);
@@ -740,6 +1346,9 @@ export async function runIndexer(options) {
             await dbClient.run("DELETE FROM snippet WHERE repo_id = ?", [repoId]);
             await dbClient.run("DELETE FROM dependency WHERE repo_id = ?", [repoId]);
             await dbClient.run("DELETE FROM file_embedding WHERE repo_id = ?", [repoId]);
+            await dbClient.run("DELETE FROM document_metadata WHERE repo_id = ?", [repoId]);
+            await dbClient.run("DELETE FROM document_metadata_kv WHERE repo_id = ?", [repoId]);
+            await dbClient.run("DELETE FROM markdown_link WHERE repo_id = ?", [repoId]);
             await persistBlobs(dbClient, blobs);
             await persistTrees(dbClient, repoId, headCommit, files);
             await persistFiles(dbClient, repoId, files);
@@ -747,6 +1356,9 @@ export async function runIndexer(options) {
             await persistSnippets(dbClient, repoId, codeIntel.snippets);
             await persistDependencies(dbClient, repoId, codeIntel.dependencies);
             await persistEmbeddings(dbClient, repoId, embeddings);
+            await persistDocumentMetadata(dbClient, repoId, aggregatedStructured.metadataRecords);
+            await persistMetadataPairs(dbClient, repoId, aggregatedStructured.metadataPairs);
+            await persistMarkdownLinks(dbClient, repoId, aggregatedStructured.links);
             // Update timestamp and mark FTS dirty inside transaction to ensure atomicity
             // Fix: Increment fts_generation to prevent lost updates during concurrent rebuilds
             if (defaultBranch) {
@@ -759,6 +1371,8 @@ export async function runIndexer(options) {
         console.info(`Indexed ${files.length} files for repo ${repoRoot} at ${databasePath} (commit ${headCommit.slice(0, 12)})`);
         // Phase 2+3: Force rebuild FTS index after full reindex
         await rebuildFTSIfNeeded(dbClient, repoId, true);
+        // Garbage collect orphaned blobs after full reindex
+        await garbageCollectBlobs(dbClient);
     }
     finally {
         // Fix #2: Ensure lock is released even if DB connection fails