@codragraph/cli 1.6.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. package/README.md +34 -0
  2. package/dist/cli/analyze.d.ts +22 -0
  3. package/dist/cli/analyze.js +107 -4
  4. package/dist/cli/compress-stats.d.ts +29 -0
  5. package/dist/cli/compress-stats.js +97 -0
  6. package/dist/cli/graphstore.d.ts +6 -2
  7. package/dist/cli/graphstore.js +24 -2
  8. package/dist/cli/index.js +16 -2
  9. package/dist/cli/profile-heap.d.ts +35 -0
  10. package/dist/cli/profile-heap.js +126 -0
  11. package/dist/cli/setup.d.ts +13 -0
  12. package/dist/cli/setup.js +22 -11
  13. package/dist/cli/skill-gen.d.ts +14 -2
  14. package/dist/cli/skill-gen.js +52 -19
  15. package/dist/cli/tool.js +4 -0
  16. package/dist/core/embeddings/embedding-pipeline.js +24 -7
  17. package/dist/core/group/bridge-db.js +111 -24
  18. package/dist/core/lbug/content-read.d.ts +46 -0
  19. package/dist/core/lbug/content-read.js +64 -0
  20. package/dist/core/lbug/csv-generator.d.ts +2 -6
  21. package/dist/core/lbug/csv-generator.js +45 -12
  22. package/dist/core/lbug/lbug-adapter.d.ts +4 -1
  23. package/dist/core/lbug/lbug-adapter.js +153 -21
  24. package/dist/core/lbug/schema.d.ts +7 -7
  25. package/dist/core/lbug/schema.js +18 -0
  26. package/dist/core/run-analyze.d.ts +13 -0
  27. package/dist/core/run-analyze.js +91 -4
  28. package/dist/core/search/bm25-index.js +67 -15
  29. package/dist/mcp/local/local-backend.js +22 -5
  30. package/dist/server/api.js +4 -3
  31. package/dist/storage/repo-manager.d.ts +39 -0
  32. package/dist/storage/repo-manager.js +19 -0
  33. package/hooks/claude/codragraph-hook.cjs +95 -2
  34. package/package.json +4 -4
  35. package/scripts/build-tree-sitter-proto.cjs +15 -3
  36. package/scripts/patch-tree-sitter-swift.cjs +17 -4
  37. package/skills/codragraph-api-surface.md +110 -0
  38. package/skills/codragraph-config-audit.md +146 -0
  39. package/skills/codragraph-cross-repo-impact.md +135 -0
  40. package/skills/codragraph-data-lineage.md +137 -0
  41. package/skills/codragraph-dead-code.md +119 -0
  42. package/skills/codragraph-gh-actions-debug.md +162 -0
  43. package/skills/codragraph-gh-issue-workflow.md +178 -0
  44. package/skills/codragraph-gh-pr-workflow.md +176 -0
  45. package/skills/codragraph-gh-release-workflow.md +187 -0
  46. package/skills/codragraph-git-bisect.md +176 -0
  47. package/skills/codragraph-git-force-push.md +147 -0
  48. package/skills/codragraph-git-history-rewrite.md +174 -0
  49. package/skills/codragraph-git-rebase-vs-merge.md +138 -0
  50. package/skills/codragraph-git-recovery.md +181 -0
  51. package/skills/codragraph-git-worktree.md +145 -0
  52. package/skills/codragraph-migration-tracking.md +130 -0
  53. package/skills/codragraph-notebook-context.md +136 -0
  54. package/skills/codragraph-observability-coverage.md +125 -0
  55. package/skills/codragraph-onboarding.md +129 -0
  56. package/skills/codragraph-perf-hotspots.md +132 -0
  57. package/skills/codragraph-project-switcher.md +116 -0
  58. package/skills/codragraph-security-audit.md +144 -0
  59. package/skills/codragraph-sql-tracing.md +122 -0
  60. package/skills/codragraph-supply-chain-audit.md +153 -0
  61. package/skills/codragraph-test-coverage.md +97 -0
@@ -10,9 +10,11 @@
10
10
  */
11
11
  import path from 'path';
12
12
  import fs from 'fs/promises';
13
+ import * as fsSync from 'node:fs';
14
+ import * as v8 from 'node:v8';
13
15
  import { runPipelineFromRepo } from './ingestion/pipeline.js';
14
16
  import { initLbug, loadGraphToLbug, getLbugStats, executeQuery, executeWithReusedStatement, closeLbug, loadCachedEmbeddings, } from './lbug/lbug-adapter.js';
15
- import { getStoragePaths, saveMeta, loadMeta, addToGitignore, registerRepo, cleanupOldKuzuFiles, } from '../storage/repo-manager.js';
17
+ import { getStoragePaths, saveMeta, loadMeta, addToGitignore, registerRepo, cleanupOldKuzuFiles, INDEX_SCHEMA_VERSION, } from '../storage/repo-manager.js';
16
18
  import { getCurrentCommit, getRemoteUrl, hasGitDir, getInferredRepoName } from '../storage/git.js';
17
19
  import { recordAnalysisSnapshot } from './graphstore/index.js';
18
20
  import { generateAIContextFiles } from '../cli/ai-context.js';
@@ -51,7 +53,70 @@ export const PHASE_LABELS = {
51
53
  */
52
54
  export async function runFullAnalysis(repoPath, options, callbacks) {
53
55
  const log = (msg) => callbacks.onLog?.(msg);
54
- const progress = (phase, percent, message) => callbacks.onProgress(phase, percent, message);
56
+ // RFC 0002 Phase 1 optional heap-profile instrumentation. Set
57
+ // CODRAGRAPH_HEAP_PROFILE=1 (or run `codragraph profile-heap`) to write a
58
+ // v8 heap snapshot at every phase boundary, plus a `profile-summary.jsonl`
59
+ // log of `process.memoryUsage()` at the same boundaries. Snapshots land in
60
+ // `<repo>/.codragraph/heap-profiles/`. Open snapshots in Chrome DevTools
61
+ // (Memory → Load) to find which constructors dominate the retained set; the
62
+ // JSONL is the cheap RSS / heapUsed timeline. Off by default — snapshot
63
+ // writes pause the event loop ~2-5s and consume ~100-500MB of disk each.
64
+ const heapProfileEnabled = process.env.CODRAGRAPH_HEAP_PROFILE === '1';
65
+ let heapProfileDir = '';
66
+ let heapProfileSummaryPath = '';
67
+ let lastProfilePhase = '';
68
+ if (heapProfileEnabled) {
69
+ heapProfileDir = path.join(repoPath, '.codragraph', 'heap-profiles');
70
+ heapProfileSummaryPath = path.join(heapProfileDir, 'profile-summary.jsonl');
71
+ try {
72
+ fsSync.mkdirSync(heapProfileDir, { recursive: true });
73
+ // Truncate any prior summary so a single run produces a clean log.
74
+ // We append crash-safely on each phase boundary below.
75
+ fsSync.writeFileSync(heapProfileSummaryPath, '');
76
+ }
77
+ catch {
78
+ /* permission issue — best-effort */
79
+ }
80
+ }
81
+ const progress = (phase, percent, message) => {
82
+ callbacks.onProgress(phase, percent, message);
83
+ // Only snapshot on phase transitions, not every tick. Phase strings come
84
+ // from runPipelineFromRepo / loadGraphToLbug and are stable.
85
+ if (heapProfileEnabled && phase && phase !== lastProfilePhase) {
86
+ lastProfilePhase = phase;
87
+ const ts = Date.now();
88
+ const safe = phase.replace(/[^a-zA-Z0-9]+/g, '_').slice(0, 60);
89
+ const file = path.join(heapProfileDir, `${ts}-${safe}.heapsnapshot`);
90
+ // Capture the cheap memoryUsage timeline FIRST — even if writeHeapSnapshot
91
+ // crashes (out of disk, permissions), we still have the RSS curve which
92
+ // is the more useful artifact for the heap-pressure RFC.
93
+ try {
94
+ const mu = process.memoryUsage();
95
+ const entry = JSON.stringify({
96
+ ts,
97
+ phase,
98
+ percent,
99
+ rss: mu.rss,
100
+ heapUsed: mu.heapUsed,
101
+ heapTotal: mu.heapTotal,
102
+ external: mu.external,
103
+ arrayBuffers: mu.arrayBuffers,
104
+ snapshotFile: path.basename(file),
105
+ });
106
+ fsSync.appendFileSync(heapProfileSummaryPath, entry + '\n');
107
+ }
108
+ catch (err) {
109
+ log(`heap-profile: summary append failed (${err.message})`);
110
+ }
111
+ try {
112
+ v8.writeHeapSnapshot(file);
113
+ log(`heap-profile: wrote ${file}`);
114
+ }
115
+ catch (err) {
116
+ log(`heap-profile: write failed (${err.message})`);
117
+ }
118
+ }
119
+ };
55
120
  const { storagePath, lbugPath } = getStoragePaths(repoPath);
56
121
  // Clean up stale KuzuDB files from before the LadybugDB migration.
57
122
  const kuzuResult = await cleanupOldKuzuFiles(storagePath);
@@ -62,7 +127,17 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
62
127
  const currentCommit = repoHasGit ? getCurrentCommit(repoPath) : '';
63
128
  const existingMeta = await loadMeta(storagePath);
64
129
  // ── Early-return: already up to date ──────────────────────────────
65
- if (existingMeta && !options.force && existingMeta.lastCommit === currentCommit) {
130
+ // Schema-version mismatch forces a full re-analyze regardless of commit
131
+ // equality: existing 1.7.x indexes have no `schemaVersion` field at all,
132
+ // and 1.8+ readers expect every node table to carry a `contentEncoding`
133
+ // column (RFC 0001 Phase 2). LadybugDB ALTER on existing tables is not
134
+ // validated end-to-end yet, so the supported migration path is
135
+ // re-analyze → fresh CREATE NODE TABLE.
136
+ const schemaUpToDate = !!existingMeta && (existingMeta.schemaVersion ?? 0) >= INDEX_SCHEMA_VERSION;
137
+ if (existingMeta &&
138
+ schemaUpToDate &&
139
+ !options.force &&
140
+ existingMeta.lastCommit === currentCommit) {
66
141
  // Non-git folders have currentCommit = '' — always rebuild since we can't detect changes
67
142
  if (currentCommit !== '') {
68
143
  return {
@@ -73,6 +148,11 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
73
148
  };
74
149
  }
75
150
  }
151
+ if (existingMeta && !schemaUpToDate) {
152
+ log(`Index schema version ${existingMeta.schemaVersion ?? '<missing>'} is older than ` +
153
+ `${INDEX_SCHEMA_VERSION} (RFC 0001 Phase 2 — adds contentEncoding column). ` +
154
+ `Re-analyzing.`);
155
+ }
76
156
  // ── Cache embeddings from existing index before rebuild ────────────
77
157
  let cachedEmbeddingNodeIds = new Set();
78
158
  let cachedEmbeddings = [];
@@ -122,7 +202,12 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
122
202
  lbugMsgCount++;
123
203
  const pct = Math.min(84, 60 + Math.round((lbugMsgCount / (lbugMsgCount + 10)) * 24));
124
204
  progress('lbug', pct, msg);
125
- });
205
+ },
206
+ // RFC 0001 Phase 2: when --compress is set, every content row goes
207
+ // through encodeContent before hitting the CSV. Default 'none' is
208
+ // a true passthrough, so the on-disk layout is byte-identical to
209
+ // pre-Phase-2 indexes when no compression flag is passed.
210
+ { compress: options.compress });
126
211
  // ── Phase 2.5: Versioned-graph snapshot (best-effort) ────────────
127
212
  // Phase 4 hook: snapshot the freshly-loaded graph into the
128
213
  // content-addressed `.codragraph/graphstore/`. Failures here do NOT
@@ -230,6 +315,8 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
230
315
  repoPath,
231
316
  lastCommit: currentCommit,
232
317
  indexedAt: new Date().toISOString(),
318
+ schemaVersion: INDEX_SCHEMA_VERSION,
319
+ compress: options.compress ?? 'none',
233
320
  // Captured here (not at registration) so it travels with the
234
321
  // on-disk meta.json — sibling-clone fingerprinting works for
235
322
  // out-of-tree consumers (group-status, future tooling) without
@@ -12,17 +12,56 @@
12
12
  */
13
13
  import { queryFTS, ensureFTSIndex, executeQuery as executeCoreQuery, } from '../lbug/lbug-adapter.js';
14
14
  /**
15
- * FTS schema served by `searchFTSFromLbug`. Centralised so that both the
16
- * CLI/pipeline path and the MCP pool path use identical (table, index,
17
- * properties) tuples and the lazy-create logic stays in one place.
15
+ * FTS table set served by `searchFTSFromLbug`. Centralised so that both
16
+ * the CLI/pipeline path and the MCP pool path stay in lockstep.
17
+ *
18
+ * The properties list is computed at FTS-create time via `ftsPropertiesFor`
19
+ * — for repos that were analysed with `--compress brotli|zstd`, the
20
+ * `content` column holds base64-of-encoded-bytes and would tokenise to
21
+ * useless tokens. Those repos get name-only FTS so search at least
22
+ * matches function/class names instead of returning random hits on
23
+ * base64 alphabet. Plain (compress='none' / unset) repos get the full
24
+ * `name + content` index for body-text matches. RFC 0001 Phase 2.5.
18
25
  */
19
- const FTS_INDEXES = [
20
- { table: 'File', indexName: 'file_fts', properties: ['name', 'content'] },
21
- { table: 'Function', indexName: 'function_fts', properties: ['name', 'content'] },
22
- { table: 'Class', indexName: 'class_fts', properties: ['name', 'content'] },
23
- { table: 'Method', indexName: 'method_fts', properties: ['name', 'content'] },
24
- { table: 'Interface', indexName: 'interface_fts', properties: ['name', 'content'] },
26
+ const FTS_TABLES = [
27
+ { table: 'File', indexName: 'file_fts' },
28
+ { table: 'Function', indexName: 'function_fts' },
29
+ { table: 'Class', indexName: 'class_fts' },
30
+ { table: 'Method', indexName: 'method_fts' },
31
+ { table: 'Interface', indexName: 'interface_fts' },
25
32
  ];
33
+ const ftsPropertiesFor = (compress) => !compress || compress === 'none' ? ['name', 'content'] : ['name'];
34
+ /**
35
+ * Look up `meta.compress` for a repo. The MCP path passes `repoId`
36
+ * (registry-derived); the CLI path passes nothing and we walk up from
37
+ * cwd. Returns `'none'` whenever the lookup fails so the safe default
38
+ * (full FTS index) is used — the failure mode is reduced search
39
+ * quality, never wrong results.
40
+ */
41
+ async function getCompressMode(repoId) {
42
+ try {
43
+ const repoMod = await import('../../storage/repo-manager.js');
44
+ if (repoId) {
45
+ // MCP path: registry name is the source of truth. The MCP
46
+ // backend's `repoId` is `entry.name.toLowerCase()` (or `${name}-${hash}`
47
+ // on collision); match conservatively against both forms.
48
+ const entries = await repoMod.listRegisteredRepos();
49
+ for (const entry of entries) {
50
+ const base = entry.name.toLowerCase();
51
+ if (base === repoId || repoId.startsWith(`${base}-`)) {
52
+ const meta = await repoMod.loadMeta(entry.storagePath);
53
+ return meta?.compress ?? 'none';
54
+ }
55
+ }
56
+ return 'none';
57
+ }
58
+ const repo = await repoMod.findRepo(process.cwd());
59
+ return repo?.meta?.compress ?? 'none';
60
+ }
61
+ catch {
62
+ return 'none';
63
+ }
64
+ }
26
65
  const FALLBACK_SCAN_LIMIT = 50_000;
27
66
  const BOOLEAN_QUERY_TOKENS = new Set(['and', 'or', 'not']);
28
67
  const FALLBACK_FIELD_WEIGHTS = {
@@ -184,9 +223,13 @@ async function queryFallbackViaExecutor(executor, tableName, properties, query,
184
223
  return [];
185
224
  }
186
225
  }
187
- async function fallbackSearchAllTables(executor, query, limit) {
226
+ async function fallbackSearchAllTables(executor, query, limit,
227
+ // Same compress-aware property selection as the FTS path. Default keeps
228
+ // pre-Phase-2 behaviour (`['name', 'content']`) for callers that don't
229
+ // pass a value.
230
+ properties = ['name', 'content']) {
188
231
  const results = [];
189
- for (const { table, properties } of FTS_INDEXES) {
232
+ for (const { table } of FTS_TABLES) {
190
233
  results.push(await queryFallbackViaExecutor(executor, table, properties, query, limit));
191
234
  }
192
235
  return results;
@@ -220,7 +263,12 @@ export const searchFTSFromLbug = async (query, limit = 20, repoId) => {
220
263
  const executor = (cypher) => executeQuery(repoId, cypher);
221
264
  // Lazy-create FTS indexes on first query for this repo (analyze no longer
222
265
  // creates them up-front, so we ensure them here). Cached per-process.
223
- for (const { table, indexName, properties } of FTS_INDEXES) {
266
+ // RFC 0001 Phase 2.5: drop `content` from FTS properties for repos
267
+ // analysed with --compress brotli|zstd — the column holds encoded
268
+ // bytes and would tokenise to garbage.
269
+ const compress = await getCompressMode(repoId);
270
+ const properties = ftsPropertiesFor(compress);
271
+ for (const { table, indexName } of FTS_TABLES) {
224
272
  await ensureFTSIndexViaExecutor(executor, repoId, table, indexName, properties);
225
273
  }
226
274
  fileResults = await queryFTSViaExecutor(executor, 'File', 'file_fts', query, limit);
@@ -235,13 +283,17 @@ export const searchFTSFromLbug = async (query, limit = 20, repoId) => {
235
283
  interfaceResults.length ===
236
284
  0) {
237
285
  [fileResults, functionResults, classResults, methodResults, interfaceResults] =
238
- await fallbackSearchAllTables(executor, query, limit);
286
+ await fallbackSearchAllTables(executor, query, limit, properties);
239
287
  }
240
288
  }
241
289
  else {
242
290
  // Use core lbug adapter (CLI / pipeline context) — also sequential for safety.
243
291
  // Lazy-create FTS indexes on first query (analyze no longer does it).
244
- for (const { table, indexName, properties } of FTS_INDEXES) {
292
+ // RFC 0001 Phase 2.5: same `compress`-aware property selection as the MCP
293
+ // path; the CLI walks up from cwd to find the repo's meta.json.
294
+ const compress = await getCompressMode();
295
+ const properties = ftsPropertiesFor(compress);
296
+ for (const { table, indexName } of FTS_TABLES) {
245
297
  await ensureFTSIndex(table, indexName, [...properties]).catch(() => { });
246
298
  }
247
299
  fileResults = await queryFTS('File', 'file_fts', query, limit, false).catch(() => []);
@@ -256,7 +308,7 @@ export const searchFTSFromLbug = async (query, limit = 20, repoId) => {
256
308
  interfaceResults.length ===
257
309
  0) {
258
310
  [fileResults, functionResults, classResults, methodResults, interfaceResults] =
259
- await fallbackSearchAllTables(executeCoreQuery, query, limit);
311
+ await fallbackSearchAllTables(executeCoreQuery, query, limit, properties);
260
312
  }
261
313
  }
262
314
  // Collect all node scores per filePath to track which nodes actually matched
@@ -19,6 +19,7 @@ import { GroupService } from '../../core/group/service.js';
19
19
  import { resolveAtGroupMemberRepoPath } from '../../core/group/resolve-at-member.js';
20
20
  import { collectBestChunks } from '../../core/embeddings/types.js';
21
21
  import { EMBEDDING_TABLE_NAME, EMBEDDING_INDEX_NAME } from '../../core/lbug/schema.js';
22
+ import { decodeContentField } from '../../core/lbug/content-read.js';
22
23
  import { PhaseTimer } from '../../core/search/phase-timer.js';
23
24
  import { checkStaleness, checkCwdMatch } from '../../core/git-staleness.js';
24
25
  // AI context generation is CLI-only (codragraph analyze)
@@ -835,10 +836,12 @@ export class LocalBackend {
835
836
  try {
836
837
  const contentRows = await executeParameterized(repo.id, `
837
838
  MATCH (n {id: $nodeId})
838
- RETURN n.content AS content
839
+ RETURN n.content AS content, n.contentEncoding AS contentEncoding
839
840
  `, { nodeId: sym.nodeId });
840
841
  if (contentRows.length > 0) {
841
- content = contentRows[0].content ?? contentRows[0][0];
842
+ const raw = contentRows[0].content ?? contentRows[0][0];
843
+ const enc = contentRows[0].contentEncoding ?? contentRows[0][1];
844
+ content = decodeContentField(raw, enc);
842
845
  }
843
846
  }
844
847
  catch (e) {
@@ -1330,7 +1333,13 @@ export class LocalBackend {
1330
1333
  */
1331
1334
  async resolveSymbolCandidates(repo, query, hints) {
1332
1335
  const { uid, name, include_content } = query;
1333
- const selectClause = `n.id AS id, n.name AS name, labels(n)[0] AS type, n.filePath AS filePath, n.startLine AS startLine, n.endLine AS endLine${include_content ? ', n.content AS content' : ''}`;
1336
+ // RFC 0001 Phase 2: when fetching content, also fetch the per-row
1337
+ // encoding tag so `decodeContentField` can pass it through unchanged
1338
+ // (default 'none') or run brotli/zstd decode. Adding contentEncoding
1339
+ // to the SELECT leaves the numeric-index fallback for content at r[6];
1340
+ // the encoding tag lands at r[7]. We still read by name first, which
1341
+ // is the documented preferred path on LadybugDB.
1342
+ const selectClause = `n.id AS id, n.name AS name, labels(n)[0] AS type, n.filePath AS filePath, n.startLine AS startLine, n.endLine AS endLine${include_content ? ', n.content AS content, n.contentEncoding AS contentEncoding' : ''}`;
1334
1343
  // Direct UID — zero-ambiguity path.
1335
1344
  if (uid) {
1336
1345
  const rows = await executeParameterized(repo.id, `MATCH (n {id: $uid}) RETURN ${selectClause} LIMIT 1`, { uid });
@@ -1344,7 +1353,11 @@ export class LocalBackend {
1344
1353
  filePath: (r.filePath ?? r[3]),
1345
1354
  startLine: (r.startLine ?? r[4]),
1346
1355
  endLine: (r.endLine ?? r[5]),
1347
- ...(include_content ? { content: (r.content ?? r[6]) } : {}),
1356
+ ...(include_content
1357
+ ? {
1358
+ content: decodeContentField(r.content ?? r[6], r.contentEncoding ?? r[7]),
1359
+ }
1360
+ : {}),
1348
1361
  };
1349
1362
  // Same LadybugDB label-enrichment as the name-based path: a UID
1350
1363
  // pointing at a Class must still surface `type: 'Class'` so impact's
@@ -1380,7 +1393,11 @@ export class LocalBackend {
1380
1393
  filePath: (r.filePath ?? r[3]),
1381
1394
  startLine: (r.startLine ?? r[4]),
1382
1395
  endLine: (r.endLine ?? r[5]),
1383
- ...(include_content ? { content: (r.content ?? r[6]) } : {}),
1396
+ ...(include_content
1397
+ ? {
1398
+ content: decodeContentField(r.content ?? r[6], r.contentEncoding ?? r[7]),
1399
+ }
1400
+ : {}),
1384
1401
  }));
1385
1402
  // Enrich labels for any candidates where `labels(n)[0]` came back empty.
1386
1403
  // LadybugDB returns an empty string for that projection on certain node
@@ -15,6 +15,7 @@ import { createRequire } from 'node:module';
15
15
  import { loadMeta, listRegisteredRepos, getStoragePath } from '../storage/repo-manager.js';
16
16
  import { executeQuery, executePrepared, executeWithReusedStatement, streamQuery, closeLbug, withLbugDb, } from '../core/lbug/lbug-adapter.js';
17
17
  import { isWriteQuery } from '../core/lbug/pool-adapter.js';
18
+ import { decodeContentField } from '../core/lbug/content-read.js';
18
19
  import { NODE_TABLES } from '../_shared/index.js';
19
20
  import { searchFTSFromLbug } from '../core/search/bm25-index.js';
20
21
  import { hybridSearch } from '../core/search/hybrid-search.js';
@@ -189,7 +190,7 @@ const getNodeQuery = (table, includeContent) => {
189
190
  const tableLabel = quoteNodeTable(table);
190
191
  if (table === 'File') {
191
192
  return includeContent
192
- ? `MATCH (n:${tableLabel}) RETURN n.id AS id, n.name AS name, n.filePath AS filePath, n.content AS content`
193
+ ? `MATCH (n:${tableLabel}) RETURN n.id AS id, n.name AS name, n.filePath AS filePath, n.content AS content, n.contentEncoding AS contentEncoding`
193
194
  : `MATCH (n:${tableLabel}) RETURN n.id AS id, n.name AS name, n.filePath AS filePath`;
194
195
  }
195
196
  if (table === 'Folder') {
@@ -208,7 +209,7 @@ const getNodeQuery = (table, includeContent) => {
208
209
  return `MATCH (n:${tableLabel}) RETURN n.id AS id, n.name AS name, n.filePath AS filePath, n.description AS description`;
209
210
  }
210
211
  return includeContent
211
- ? `MATCH (n:${tableLabel}) RETURN n.id AS id, n.name AS name, n.filePath AS filePath, n.startLine AS startLine, n.endLine AS endLine, n.content AS content`
212
+ ? `MATCH (n:${tableLabel}) RETURN n.id AS id, n.name AS name, n.filePath AS filePath, n.startLine AS startLine, n.endLine AS endLine, n.content AS content, n.contentEncoding AS contentEncoding`
212
213
  : `MATCH (n:${tableLabel}) RETURN n.id AS id, n.name AS name, n.filePath AS filePath, n.startLine AS startLine, n.endLine AS endLine`;
213
214
  };
214
215
  const mapGraphNodeRow = (table, row, includeContent) => ({
@@ -219,7 +220,7 @@ const mapGraphNodeRow = (table, row, includeContent) => ({
219
220
  filePath: row.filePath ?? row[2],
220
221
  startLine: row.startLine,
221
222
  endLine: row.endLine,
222
- content: includeContent ? row.content : undefined,
223
+ content: includeContent ? decodeContentField(row.content, row.contentEncoding) : undefined,
223
224
  responseKeys: row.responseKeys,
224
225
  errorKeys: row.errorKeys,
225
226
  middleware: row.middleware,
@@ -36,10 +36,49 @@
36
36
  * so the registry stabilises over analyze/re-analyze cycles.
37
37
  */
38
38
  export declare const canonicalizePath: (p: string) => string;
39
+ /**
40
+ * On-disk schema version for `.codragraph/lbug` and `.codragraph/meta.json`.
41
+ *
42
+ * 1 — pre-RFC-0001-Phase-2 layout. Node tables have `content STRING`
43
+ * but no `contentEncoding` column. Implicit/missing on existing
44
+ * 1.6.x and 1.7.x indexes (RepoMeta.schemaVersion was undefined).
45
+ * 2 — RFC 0001 Phase 2: every node table that has `content` also has
46
+ * a `contentEncoding STRING DEFAULT 'none'` column. Writers may
47
+ * opt into compression via `--compress brotli|zstd` (compression
48
+ * is OFF by default, so existing readers keep working). Readers
49
+ * decode based on the per-row encoding tag.
50
+ *
51
+ * Bumping this is the migration trigger: `runFullAnalysis` forces a
52
+ * full re-analyze when an existing index has a missing or older
53
+ * `schemaVersion` field, because adding a column to an existing
54
+ * LadybugDB table via ALTER is not validated end-to-end yet — fresh
55
+ * `CREATE NODE TABLE` is the supported path.
56
+ */
57
+ export declare const INDEX_SCHEMA_VERSION: 2;
39
58
  export interface RepoMeta {
40
59
  repoPath: string;
41
60
  lastCommit: string;
42
61
  indexedAt: string;
62
+ /**
63
+ * On-disk schema version (see {@link INDEX_SCHEMA_VERSION}). Absent on
64
+ * indexes written by 1.7.x or earlier; `runFullAnalysis` treats those
65
+ * as needing a full re-analyze when they're loaded by a 1.8+ CLI.
66
+ */
67
+ schemaVersion?: number;
68
+ /**
69
+ * RFC 0001 Phase 2 — the per-row content encoding chosen at the last
70
+ * `analyze --compress` invocation. `'none'` (or absent) means rows
71
+ * carry plain text; `'brotli'` / `'zstd'` means rows are compressed
72
+ * and consumers must decode. Persisted so query-time tooling can
73
+ * detect the compressed mode without sampling rows.
74
+ *
75
+ * Phase 2.5 hooks: `core/search/bm25-index.ts` reads this field at
76
+ * FTS-create time and drops `content` from the FTS property list
77
+ * when set to a non-`'none'` value (full-text search falls back to
78
+ * symbol-name matches). Embeddings and graph queries are unaffected
79
+ * — they decode at the read boundary.
80
+ */
81
+ compress?: 'none' | 'brotli' | 'zstd';
43
82
  /**
44
83
  * Canonical `origin` remote URL captured at index time. Used to
45
84
  * fingerprint the same logical repo across multiple on-disk clones
@@ -49,6 +49,25 @@ export const canonicalizePath = (p) => {
49
49
  return resolved;
50
50
  }
51
51
  };
52
+ /**
53
+ * On-disk schema version for `.codragraph/lbug` and `.codragraph/meta.json`.
54
+ *
55
+ * 1 — pre-RFC-0001-Phase-2 layout. Node tables have `content STRING`
56
+ * but no `contentEncoding` column. Implicit/missing on existing
57
+ * 1.6.x and 1.7.x indexes (RepoMeta.schemaVersion was undefined).
58
+ * 2 — RFC 0001 Phase 2: every node table that has `content` also has
59
+ * a `contentEncoding STRING DEFAULT 'none'` column. Writers may
60
+ * opt into compression via `--compress brotli|zstd` (compression
61
+ * is OFF by default, so existing readers keep working). Readers
62
+ * decode based on the per-row encoding tag.
63
+ *
64
+ * Bumping this is the migration trigger: `runFullAnalysis` forces a
65
+ * full re-analyze when an existing index has a missing or older
66
+ * `schemaVersion` field, because adding a column to an existing
67
+ * LadybugDB table via ALTER is not validated end-to-end yet — fresh
68
+ * `CREATE NODE TABLE` is the supported path.
69
+ */
70
+ export const INDEX_SCHEMA_VERSION = 2;
52
71
  const CODRAGRAPH_DIR = '.codragraph';
53
72
  // ─── Local Storage Helpers ─────────────────────────────────────────────
54
73
  /**
@@ -12,8 +12,27 @@
12
12
  */
13
13
 
14
14
  const fs = require('fs');
15
+ const os = require('os');
15
16
  const path = require('path');
16
- const { spawnSync } = require('child_process');
17
+ const { spawnSync, spawn } = require('child_process');
18
+
19
+ /**
20
+ * Decide whether background auto-reindex is opted in. Two equivalent signals:
21
+ * 1. CODRAGRAPH_AUTO_REINDEX=1 in env (good for shells, CI)
22
+ * 2. `{ "autoReindex": true }` in ~/.codragraph/config.json (good for GUI
23
+ * editor launches on Windows, where shell env doesn't propagate to
24
+ * hook child processes reliably)
25
+ */
26
+ function isAutoReindexEnabled() {
27
+ if (process.env.CODRAGRAPH_AUTO_REINDEX === '1') return true;
28
+ try {
29
+ const configPath = path.join(os.homedir(), '.codragraph', 'config.json');
30
+ const config = JSON.parse(fs.readFileSync(configPath, 'utf-8'));
31
+ return config && config.autoReindex === true;
32
+ } catch {
33
+ return false;
34
+ }
35
+ }
17
36
 
18
37
  /**
19
38
  * Read JSON input from stdin synchronously.
@@ -250,10 +269,84 @@ function handlePostToolUse(input) {
250
269
  if (currentHead && currentHead === lastCommit) return;
251
270
 
252
271
  const analyzeCmd = `npx @codragraph/cli analyze${hadEmbeddings ? ' --embeddings' : ''}`;
272
+
273
+ // Opt-in background auto-reindex.
274
+ // Default stays as notification-only because spawning analyze while an MCP
275
+ // server holds LadybugDB will fail with a database-busy error — the
276
+ // notification path lets the agent reindex at a quiet moment instead.
277
+ // Power users who run MCP outside Claude Code's lifecycle can opt in via
278
+ // CODRAGRAPH_AUTO_REINDEX=1 or `{ "autoReindex": true }` in
279
+ // ~/.codragraph/config.json.
280
+ if (isAutoReindexEnabled()) {
281
+ // The "coalesce" file is a single-process gate: it exists only while a
282
+ // reindex is in flight. The spawned analyze removes it on exit (success or
283
+ // failure) via CODRAGRAPH_REINDEX_LOCK_PATH; the 10-min mtime fallback
284
+ // catches the rare crash that bypasses analyze's exit handler.
285
+ const coalescePath = path.join(gitNexusDir, '.reindex.coalesce');
286
+ const crashSafetyTtlMs = 10 * 60 * 1000;
287
+ let inFlight = false;
288
+ try {
289
+ const stat = fs.statSync(coalescePath);
290
+ if (Date.now() - stat.mtimeMs < crashSafetyTtlMs) inFlight = true;
291
+ } catch {
292
+ /* no coalesce file — no reindex in flight */
293
+ }
294
+
295
+ if (!inFlight) {
296
+ try {
297
+ fs.writeFileSync(coalescePath, String(process.pid));
298
+ } catch {
299
+ /* best-effort — gate is for coalescing, not correctness */
300
+ }
301
+
302
+ const cliPath = resolveCliPath();
303
+ const reindexArgs = hadEmbeddings
304
+ ? ['analyze', '--embeddings', '--no-setup']
305
+ : ['analyze', '--no-setup'];
306
+ const spawnEnv = { ...process.env, CODRAGRAPH_REINDEX_LOCK_PATH: coalescePath };
307
+ const spawnOpts = {
308
+ cwd,
309
+ detached: true,
310
+ stdio: 'ignore',
311
+ windowsHide: true,
312
+ env: spawnEnv,
313
+ };
314
+ try {
315
+ let child;
316
+ if (cliPath) {
317
+ child = spawn(process.execPath, [cliPath, ...reindexArgs], spawnOpts);
318
+ } else if (process.platform === 'win32') {
319
+ child = spawn('cmd', ['/c', 'npx', '-y', '@codragraph/cli', ...reindexArgs], spawnOpts);
320
+ } else {
321
+ child = spawn('npx', ['-y', '@codragraph/cli', ...reindexArgs], spawnOpts);
322
+ }
323
+ child.unref();
324
+ } catch {
325
+ /* spawn threw synchronously — best-effort; the "started" notice below is still sent */
326
+ }
327
+
328
+ sendHookResponse(
329
+ 'PostToolUse',
330
+ `CodraGraph: auto-reindex started in background ` +
331
+ `(HEAD ${lastCommit ? lastCommit.slice(0, 7) : 'never'} → ${currentHead.slice(0, 7)}). ` +
332
+ `If an MCP server is currently holding the database, the reindex will fail silently — ` +
333
+ `run \`${analyzeCmd}\` manually after closing the agent session.`,
334
+ );
335
+ return;
336
+ }
337
+
338
+ sendHookResponse(
339
+ 'PostToolUse',
340
+ `CodraGraph: auto-reindex coalesced — another reindex is in flight (will pick up your latest commit when it finishes).`,
341
+ );
342
+ return;
343
+ }
344
+
253
345
  sendHookResponse(
254
346
  'PostToolUse',
255
347
  `CodraGraph index is stale (last indexed: ${lastCommit ? lastCommit.slice(0, 7) : 'never'}). ` +
256
- `Run \`${analyzeCmd}\` to update the knowledge graph.`,
348
+ `Run \`${analyzeCmd}\` to update the knowledge graph. ` +
349
+ `Set CODRAGRAPH_AUTO_REINDEX=1 (or autoReindex: true in ~/.codragraph/config.json) for background auto-reindex.`,
257
350
  );
258
351
  }
259
352
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@codragraph/cli",
3
- "version": "1.6.4",
3
+ "version": "2.0.0",
4
4
  "description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
5
5
  "author": {
6
6
  "name": "Anit Chaudhary",
@@ -56,10 +56,10 @@
56
56
  "prepack": "node scripts/build.js"
57
57
  },
58
58
  "dependencies": {
59
+ "@codragraph/graphstore": "^1.0.0",
59
60
  "@huggingface/transformers": "^4.1.0",
60
- "@ladybugdb/core": "^0.15.2",
61
+ "@ladybugdb/core": "^0.16.0",
61
62
  "@modelcontextprotocol/sdk": "^1.0.0",
62
- "@codragraph/graphstore": "^0.1.1",
63
63
  "@scarf/scarf": "^1.4.0",
64
64
  "cli-progress": "^3.12.0",
65
65
  "commander": "^14.0.3",
@@ -99,6 +99,7 @@
99
99
  "tree-sitter-swift": "^0.6.0"
100
100
  },
101
101
  "devDependencies": {
102
+ "@codragraph/shared": "file:../codragraph-shared",
102
103
  "@types/cli-progress": "^3.11.6",
103
104
  "@types/cors": "^2.8.17",
104
105
  "@types/express": "^4.17.21",
@@ -106,7 +107,6 @@
106
107
  "@types/node": "^25.6.0",
107
108
  "@types/uuid": "^11.0.0",
108
109
  "@vitest/coverage-v8": "^4.0.18",
109
- "@codragraph/shared": "file:../codragraph-shared",
110
110
  "tsx": "^4.0.0",
111
111
  "typescript": "^5.4.5",
112
112
  "vitest": "^4.0.18"
package/scripts/build-tree-sitter-proto.cjs CHANGED
@@ -34,14 +34,26 @@ const fs = require('fs');
34
34
  const path = require('path');
35
35
  const { execSync } = require('child_process');
36
36
 
37
- const protoDir = path.join(__dirname, '..', 'node_modules', 'tree-sitter-proto');
37
+ // Resolve tree-sitter-proto from BOTH the codragraph package itself AND any
38
+ // monorepo root that hoisted the dep. npm workspaces hoist optional deps to
39
+ // the workspace root, so the package-local path doesn't exist on a workspace
40
+ // install. Same trap as patch-tree-sitter-swift.cjs — see that file for the
41
+ // full failure mode.
42
+ const protoCandidates = [
43
+ path.join(__dirname, '..', 'node_modules', 'tree-sitter-proto'),
44
+ path.join(__dirname, '..', '..', 'node_modules', 'tree-sitter-proto'),
45
+ ];
46
+ const protoDir = protoCandidates.find((d) => fs.existsSync(path.join(d, 'binding.gyp')));
47
+ if (!protoDir) {
48
+ // tree-sitter-proto is an optionalDependency; absent when install
49
+ // skipped optional deps or the file: dep was not resolved.
50
+ process.exit(0);
51
+ }
38
52
  const bindingGyp = path.join(protoDir, 'binding.gyp');
39
53
  const bindingNode = path.join(protoDir, 'build', 'Release', 'tree_sitter_proto_binding.node');
40
54
 
41
55
  try {
42
56
  if (!fs.existsSync(bindingGyp)) {
43
- // tree-sitter-proto is an optionalDependency; absent when install
44
- // skipped optional deps or the file: dep was not resolved.
45
57
  process.exit(0);
46
58
  }
47
59
 
@@ -29,13 +29,26 @@ const fs = require('fs');
29
29
  const path = require('path');
30
30
  const { execSync } = require('child_process');
31
31
 
32
- const swiftDir = path.join(__dirname, '..', 'node_modules', 'tree-sitter-swift');
32
+ // Resolve tree-sitter-swift from BOTH the codragraph package itself AND any
33
+ // monorepo root that hoisted the dep. npm workspaces hoist optional deps to
34
+ // the workspace root, so `codragraph/node_modules/tree-sitter-swift` doesn't
35
+ // exist when this script runs as the codragraph postinstall — checking only
36
+ // that path silently no-ops, which is exactly the failure that left
37
+ // Windows Node 22.14 users without a Swift parser.
38
+ //
39
+ // Order matters: the package-local dir takes precedence (standalone install),
40
+ // then the parent monorepo root (workspace install).
41
+ const candidateDirs = [
42
+ path.join(__dirname, '..', 'node_modules', 'tree-sitter-swift'),
43
+ path.join(__dirname, '..', '..', 'node_modules', 'tree-sitter-swift'),
44
+ ];
45
+ const swiftDir = candidateDirs.find((d) => fs.existsSync(path.join(d, 'binding.gyp')));
46
+ if (!swiftDir) {
47
+ process.exit(0);
48
+ }
33
49
  const bindingPath = path.join(swiftDir, 'binding.gyp');
34
50
 
35
51
  try {
36
- if (!fs.existsSync(bindingPath)) {
37
- process.exit(0);
38
- }
39
52
 
40
53
  const content = fs.readFileSync(bindingPath, 'utf8');
41
54
  let needsRebuild = false;