@codragraph/cli 1.6.3 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/README.md +50 -16
  2. package/dist/cli/ai-context.js +2 -2
  3. package/dist/cli/analyze.d.ts +22 -0
  4. package/dist/cli/analyze.js +111 -8
  5. package/dist/cli/compress-stats.d.ts +29 -0
  6. package/dist/cli/compress-stats.js +97 -0
  7. package/dist/cli/graphstore.d.ts +6 -2
  8. package/dist/cli/graphstore.js +24 -2
  9. package/dist/cli/index.js +17 -6
  10. package/dist/cli/profile-heap.d.ts +35 -0
  11. package/dist/cli/profile-heap.js +126 -0
  12. package/dist/cli/setup.d.ts +13 -0
  13. package/dist/cli/setup.js +75 -29
  14. package/dist/cli/skill-gen.d.ts +14 -2
  15. package/dist/cli/skill-gen.js +53 -20
  16. package/dist/cli/tool.js +4 -0
  17. package/dist/config/ignore-service.js +1 -1
  18. package/dist/core/embeddings/embedding-pipeline.js +24 -7
  19. package/dist/core/group/bridge-db.js +111 -24
  20. package/dist/core/group/extractors/grpc-patterns/proto.js +1 -12
  21. package/dist/core/ingestion/call-processor.js +2 -2
  22. package/dist/core/ingestion/cobol/cobol-preprocessor.js +1 -1
  23. package/dist/core/ingestion/cobol/jcl-parser.d.ts +1 -1
  24. package/dist/core/ingestion/cobol/jcl-parser.js +1 -1
  25. package/dist/core/ingestion/cobol-processor.d.ts +1 -1
  26. package/dist/core/ingestion/cobol-processor.js +1 -1
  27. package/dist/core/ingestion/heritage-extractors/generic.js +1 -1
  28. package/dist/core/ingestion/heritage-processor.js +1 -1
  29. package/dist/core/ingestion/import-processor.js +1 -1
  30. package/dist/core/ingestion/mro-processor.js +1 -1
  31. package/dist/core/ingestion/parsing-processor.js +1 -1
  32. package/dist/core/ingestion/type-extractors/c-cpp.js +1 -1
  33. package/dist/core/ingestion/type-extractors/python.js +1 -1
  34. package/dist/core/ingestion/type-extractors/shared.js +0 -3
  35. package/dist/core/lbug/content-read.d.ts +46 -0
  36. package/dist/core/lbug/content-read.js +64 -0
  37. package/dist/core/lbug/csv-generator.d.ts +2 -6
  38. package/dist/core/lbug/csv-generator.js +45 -12
  39. package/dist/core/lbug/lbug-adapter.d.ts +4 -1
  40. package/dist/core/lbug/lbug-adapter.js +157 -25
  41. package/dist/core/lbug/pool-adapter.js +51 -44
  42. package/dist/core/lbug/schema.d.ts +7 -7
  43. package/dist/core/lbug/schema.js +18 -0
  44. package/dist/core/run-analyze.d.ts +13 -0
  45. package/dist/core/run-analyze.js +91 -4
  46. package/dist/core/search/bm25-index.js +153 -12
  47. package/dist/core/wiki/generator.js +4 -4
  48. package/dist/mcp/local/local-backend.js +22 -5
  49. package/dist/mcp/resources.js +2 -3
  50. package/dist/server/api.js +4 -3
  51. package/dist/storage/repo-manager.d.ts +39 -0
  52. package/dist/storage/repo-manager.js +19 -0
  53. package/hooks/claude/codragraph-hook.cjs +108 -5
  54. package/hooks/claude/pre-tool-use.sh +6 -1
  55. package/package.json +4 -4
  56. package/scripts/build-tree-sitter-proto.cjs +15 -3
  57. package/scripts/patch-tree-sitter-swift.cjs +17 -4
  58. package/skills/codragraph-api-surface.md +110 -0
  59. package/skills/codragraph-cli.md +5 -5
  60. package/skills/codragraph-config-audit.md +146 -0
  61. package/skills/codragraph-cross-repo-impact.md +135 -0
  62. package/skills/codragraph-data-lineage.md +137 -0
  63. package/skills/codragraph-dead-code.md +119 -0
  64. package/skills/codragraph-debugging.md +1 -1
  65. package/skills/codragraph-exploring.md +1 -1
  66. package/skills/codragraph-gh-actions-debug.md +162 -0
  67. package/skills/codragraph-gh-issue-workflow.md +178 -0
  68. package/skills/codragraph-gh-pr-workflow.md +176 -0
  69. package/skills/codragraph-gh-release-workflow.md +187 -0
  70. package/skills/codragraph-git-bisect.md +176 -0
  71. package/skills/codragraph-git-force-push.md +147 -0
  72. package/skills/codragraph-git-history-rewrite.md +174 -0
  73. package/skills/codragraph-git-rebase-vs-merge.md +138 -0
  74. package/skills/codragraph-git-recovery.md +181 -0
  75. package/skills/codragraph-git-worktree.md +145 -0
  76. package/skills/codragraph-guide.md +1 -1
  77. package/skills/codragraph-impact-analysis.md +1 -1
  78. package/skills/codragraph-migration-tracking.md +130 -0
  79. package/skills/codragraph-notebook-context.md +136 -0
  80. package/skills/codragraph-observability-coverage.md +125 -0
  81. package/skills/codragraph-onboarding.md +129 -0
  82. package/skills/codragraph-perf-hotspots.md +132 -0
  83. package/skills/codragraph-pr-review.md +1 -1
  84. package/skills/codragraph-project-switcher.md +116 -0
  85. package/skills/codragraph-refactoring.md +1 -1
  86. package/skills/codragraph-security-audit.md +144 -0
  87. package/skills/codragraph-sql-tracing.md +122 -0
  88. package/skills/codragraph-supply-chain-audit.md +153 -0
  89. package/skills/codragraph-test-coverage.md +97 -0
@@ -100,21 +100,93 @@ export async function ensureBridgeSchema(handle) {
100
100
  }
101
101
  }
102
102
  }
103
- export async function queryBridge(handle, cypher, params) {
103
+ /**
104
+ * Close every QueryResult / PreparedStatement before letting V8 GC them.
105
+ * Same close-order discipline as `core/lbug/lbug-adapter.ts:closeQueryResult`
106
+ * — leaking these handles past `conn.close()` corrupts LadybugDB's native
107
+ * file lock on Windows ("Error 33: The process cannot access the file
108
+ * because it is being used by another process") and segfaults on
109
+ * process exit elsewhere. Best-effort: wrap close calls in try/catch so
110
+ * a finalizer that already ran doesn't poison the queryBridge return.
111
+ */
112
+ async function closeBridgeHandle(h) {
113
+ if (!h)
114
+ return;
115
+ const candidates = Array.isArray(h) ? h : [h];
116
+ for (const r of candidates) {
117
+ try {
118
+ const close = r?.close;
119
+ if (typeof close === 'function')
120
+ await Promise.resolve(close.call(r));
121
+ }
122
+ catch {
123
+ /* best-effort */
124
+ }
125
+ }
126
+ }
127
+ /**
128
+ * True iff the error is a Windows-only transient file-lock surfaced by
129
+ * LadybugDB's native binding immediately after a writer process closes
130
+ * the same DB file. Symptom is `Error 33` on the read path even though
131
+ * `db.close()` returned cleanly at the JS layer — the kernel hasn't
132
+ * fully released the exclusive lock yet. Retrying with backoff is the
133
+ * documented workaround for this class of Windows-fs interactions.
134
+ */
135
+ function isTransientLbugLockError(err) {
136
+ const msg = err?.message ?? '';
137
+ return (msg.includes('Error 33') ||
138
+ msg.includes('locked a portion of the file') ||
139
+ msg.includes('cannot access the file because it is being used by another process'));
140
+ }
141
+ async function queryBridgeOnce(handle, cypher, params) {
104
142
  const conn = handle._conn;
105
143
  if (params && Object.keys(params).length > 0) {
106
144
  const stmt = await conn.prepare(cypher);
107
145
  if (!stmt.isSuccess()) {
108
146
  const errMsg = await stmt.getErrorMessage();
147
+ await closeBridgeHandle(stmt);
109
148
  throw new Error(`Bridge query prepare failed: ${errMsg}`);
110
149
  }
111
150
  const queryResult = await conn.execute(stmt, params);
112
151
  const result = unwrapQueryResult(queryResult);
113
- return (await result.getAll());
152
+ try {
153
+ return (await result.getAll());
154
+ }
155
+ finally {
156
+ await closeBridgeHandle(queryResult);
157
+ await closeBridgeHandle(stmt);
158
+ }
114
159
  }
115
160
  const queryResult = await conn.query(cypher);
116
161
  const result = unwrapQueryResult(queryResult);
117
- return (await result.getAll());
162
+ try {
163
+ return (await result.getAll());
164
+ }
165
+ finally {
166
+ await closeBridgeHandle(queryResult);
167
+ }
168
+ }
169
+ export async function queryBridge(handle, cypher, params) {
170
+ // Retry on Windows-transient file-lock errors. Reads issued through a
171
+ // freshly-opened readonly Database can race the writer's
172
+ // post-`db.close()` lock release on Windows + Node 22.14 (LadybugDB
173
+ // native binding holds the kernel lock briefly after the JS-level
174
+ // close returns). Backoff doubles per attempt up to ~3 s total — well
175
+ // below any user-visible CLI delay budget but enough to absorb a slow
176
+ // Windows kernel lock release.
177
+ const ATTEMPTS = 7;
178
+ for (let attempt = 0; attempt < ATTEMPTS; attempt++) {
179
+ try {
180
+ return await queryBridgeOnce(handle, cypher, params);
181
+ }
182
+ catch (err) {
183
+ if (!isTransientLbugLockError(err) || attempt === ATTEMPTS - 1)
184
+ throw err;
185
+ await new Promise((r) => setTimeout(r, 50 * Math.pow(2, attempt)));
186
+ }
187
+ }
188
+ // Unreachable: the loop either returns or throws on the last attempt.
189
+ throw new Error('queryBridge: retry loop exited unexpectedly');
118
190
  }
119
191
  /**
120
192
  * LadybugDB's `conn.query` / `conn.execute` can return either a single
@@ -421,32 +493,47 @@ export async function openBridgeDbReadOnly(groupDir) {
421
493
  // Open the native handle. If Connection construction throws AFTER
422
494
  // Database was successfully allocated, we'd leak the native Database
423
495
  // object. Wrap each step separately and tear down the partial handle.
424
- let db;
425
- let conn;
426
- try {
427
- db = new lbug.Database(dbPath, 0, false, true); // readOnly
428
- conn = new lbug.Connection(db);
429
- return { _db: db, _conn: conn, groupDir };
430
- }
431
- catch {
432
- if (conn) {
433
- try {
434
- await conn.close();
435
- }
436
- catch {
437
- /* ignore */
438
- }
496
+ //
497
+ // Retry on the Windows-transient lock error: the LadybugDB native
498
+ // binding holds the kernel file lock briefly past `db.close()` on
499
+ // Windows + Node 22.14, so a reader that races a recent writer can
500
+ // hit "Error 33: locked a portion of the file" on the constructor's
501
+ // first 4 KB header read. Backoff up to ~3 s lets the writer's lock
502
+ // age out — enough headroom for any normal write→read sequence
503
+ // without becoming a user-visible delay.
504
+ const ATTEMPTS = 7;
505
+ for (let attempt = 0; attempt < ATTEMPTS; attempt++) {
506
+ let db;
507
+ let conn;
508
+ try {
509
+ db = new lbug.Database(dbPath, 0, false, true); // readOnly
510
+ conn = new lbug.Connection(db);
511
+ return { _db: db, _conn: conn, groupDir };
439
512
  }
440
- if (db) {
441
- try {
442
- await db.close();
513
+ catch (err) {
514
+ if (conn) {
515
+ try {
516
+ await conn.close();
517
+ }
518
+ catch {
519
+ /* ignore */
520
+ }
443
521
  }
444
- catch {
445
- /* ignore */
522
+ if (db) {
523
+ try {
524
+ await db.close();
525
+ }
526
+ catch {
527
+ /* ignore */
528
+ }
446
529
  }
530
+ if (!isTransientLbugLockError(err) || attempt === ATTEMPTS - 1)
531
+ return null;
532
+ await new Promise((r) => setTimeout(r, 50 * Math.pow(2, attempt)));
533
+ continue;
447
534
  }
448
- return null;
449
535
  }
536
+ return null;
450
537
  }
451
538
  /* ------------------------------------------------------------------ */
452
539
  /* bridgeExists */
@@ -31,7 +31,6 @@ if (ProtoGrammar) {
31
31
  // test runners (vitest forks) when SyntaxNode isn't fully initialized
32
32
  // yet. Catching that here ensures `PROTO_GRPC_PLUGIN` stays null and
33
33
  // the orchestrator falls back to the manual parser.
34
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
35
34
  const _Parser = _require('tree-sitter');
36
35
  // Smoke-test: parse + setLanguage to verify the grammar is
37
36
  // end-to-end compatible with this tree-sitter runtime.
@@ -72,24 +71,14 @@ if (ProtoGrammar) {
72
71
  }
73
72
  }
74
73
  function buildPlugin() {
75
- if (!ProtoGrammar || !PACKAGE_PATTERNS || !SERVICE_PATTERNS)
74
+ if (!ProtoGrammar || !SERVICE_PATTERNS)
76
75
  return null;
77
- const pkgPatterns = PACKAGE_PATTERNS;
78
76
  const svcPatterns = SERVICE_PATTERNS;
79
77
  return {
80
78
  name: 'proto-grpc',
81
79
  language: ProtoGrammar,
82
80
  scan(tree) {
83
81
  const out = [];
84
- // Extract `package` declaration (first match wins).
85
- let pkg = '';
86
- for (const match of runCompiledPatterns(pkgPatterns, tree)) {
87
- const pkgNode = match.captures.pkg;
88
- if (pkgNode) {
89
- pkg = pkgNode.text;
90
- break;
91
- }
92
- }
93
82
  // Extract `service → rpc` pairs. The query returns one match per
94
83
  // (service, rpc) combination thanks to the nested structure.
95
84
  for (const match of runCompiledPatterns(svcPatterns, tree)) {
@@ -616,7 +616,7 @@ importedRawReturnTypesMap, heritageMap, bindingAccumulator) => {
616
616
  bufferSize: getTreeSitterBufferSize(file.content.length),
617
617
  });
618
618
  }
619
- catch (parseError) {
619
+ catch (_parseError) {
620
620
  continue;
621
621
  }
622
622
  astCache.set(file.path, tree);
@@ -704,7 +704,7 @@ importedRawReturnTypesMap, heritageMap, bindingAccumulator) => {
704
704
  // loop above, so verifyConstructorBindings sees all provider bindings
705
705
  // regardless of file processing order.
706
706
  for (let i = 0; i < prepared.length; i++) {
707
- const { file, language, provider, tree, matches, parentMap, typeEnv } = prepared[i];
707
+ const { file, language, provider, tree: _tree, matches, parentMap, typeEnv } = prepared[i];
708
708
  enclosingFnExtractCache.clear();
709
709
  onProgress?.(i + 1, files.length);
710
710
  if (i % 20 === 0)
@@ -1404,7 +1404,7 @@ export function extractCobolSymbolsWithRegex(content, _filePath) {
1404
1404
  if (anonRedefMatch) {
1405
1405
  // Check it's truly anonymous: the second capture is not a valid data name
1406
1406
  // followed by more clauses — it's the REDEFINES target directly after level
1407
- const level = parseInt(anonRedefMatch[1], 10);
1407
+ const _level = parseInt(anonRedefMatch[1], 10);
1408
1408
  // Only skip if this is genuinely "NN REDEFINES target" with no name between
1409
1409
  // We detect this by checking the full data item regex does NOT match
1410
1410
  // (because RE_DATA_ITEM expects a name before any clauses)
@@ -65,4 +65,4 @@ export interface JclParseResults {
65
65
  * @param filePath - Path for diagnostics (not used in extraction)
66
66
  * @returns Parsed JCL results
67
67
  */
68
- export declare function parseJcl(content: string, filePath: string): JclParseResults;
68
+ export declare function parseJcl(content: string, _filePath: string): JclParseResults;
@@ -73,7 +73,7 @@ function extractDisp(params) {
73
73
  * @param filePath - Path for diagnostics (not used in extraction)
74
74
  * @returns Parsed JCL results
75
75
  */
76
- export function parseJcl(content, filePath) {
76
+ export function parseJcl(content, _filePath) {
77
77
  const results = {
78
78
  jobs: [],
79
79
  steps: [],
@@ -50,5 +50,5 @@ export declare function isJclFile(filePath: string): boolean;
50
50
  * @param allPathSet - Set of all file paths in the repository
51
51
  * @returns Summary of what was extracted
52
52
  */
53
- export declare const processCobol: (graph: KnowledgeGraph, files: CobolFile[], allPathSet: ReadonlySet<string>) => CobolProcessResult;
53
+ export declare const processCobol: (graph: KnowledgeGraph, files: CobolFile[], _allPathSet: ReadonlySet<string>) => CobolProcessResult;
54
54
  export {};
@@ -47,7 +47,7 @@ function isCopybook(filePath) {
47
47
  * @param allPathSet - Set of all file paths in the repository
48
48
  * @returns Summary of what was extracted
49
49
  */
50
- export const processCobol = (graph, files, allPathSet) => {
50
+ export const processCobol = (graph, files, _allPathSet) => {
51
51
  const result = {
52
52
  programs: 0,
53
53
  paragraphs: 0,
@@ -12,7 +12,7 @@ export function createHeritageExtractor(config) {
12
12
  const callNameSet = actualConfig.callBasedHeritage?.callNames;
13
13
  return {
14
14
  language: actualConfig.language,
15
- extract(captureMap, context) {
15
+ extract(captureMap, _context) {
16
16
  const classNode = captureMap['heritage.class'];
17
17
  if (!classNode)
18
18
  return [];
@@ -151,7 +151,7 @@ export const processHeritage = async (graph, files, astCache, ctx, onProgress) =
151
151
  bufferSize: getTreeSitterBufferSize(file.content.length),
152
152
  });
153
153
  }
154
- catch (parseError) {
154
+ catch (_parseError) {
155
155
  // Skip files that can't be parsed
156
156
  continue;
157
157
  }
@@ -245,7 +245,7 @@ export const processImports = async (graph, files, astCache, ctx, onProgress, re
245
245
  bufferSize: getTreeSitterBufferSize(file.content.length),
246
246
  });
247
247
  }
248
- catch (parseError) {
248
+ catch (_parseError) {
249
249
  continue;
250
250
  }
251
251
  wasReparsed = true;
@@ -316,7 +316,7 @@ function parameterTypesMatch(a, b, aParamCount, bParamCount) {
316
316
  */
317
317
  function emitMethodImplementsEdges(graph, parentMap, methodMap, parentEdgeType, ancestorsMap, edgeTypesMap) {
318
318
  let edgeCount = 0;
319
- for (const [classId, parentIds] of parentMap) {
319
+ for (const [classId, _parentIds] of parentMap) {
320
320
  const classNode = graph.getNode(classId);
321
321
  if (!classNode)
322
322
  continue;
@@ -273,7 +273,7 @@ const processParsingSequential = async (graph, files, symbolTable, astCache, sco
273
273
  bufferSize: getTreeSitterBufferSize(parseContent.length),
274
274
  });
275
275
  }
276
- catch (parseError) {
276
+ catch (_parseError) {
277
277
  console.warn(`Skipping unparseable file: ${file.path}`);
278
278
  continue;
279
279
  }
@@ -479,7 +479,7 @@ const inferLiteralType = (node) => {
479
479
  };
480
480
  /** C++: detect constructor type from smart pointer factory calls (make_shared<Dog>()).
481
481
  * Extracts the template type argument as the constructor type for virtual dispatch. */
482
- const detectCppConstructorType = (node, classNames) => {
482
+ const detectCppConstructorType = (node, _classNames) => {
483
483
  // Navigate to the initializer value in the declaration
484
484
  const declarator = node.childForFieldName('declarator');
485
485
  const initDecl = declarator?.type === 'init_declarator' ? declarator : undefined;
@@ -149,7 +149,7 @@ const scanConstructorBinding = (node) => {
149
149
  };
150
150
  const FOR_LOOP_NODE_TYPES = new Set(['for_statement']);
151
151
  /** Python function/method node types that carry a parameters list. */
152
- const PY_FUNCTION_NODE_TYPES = new Set(['function_definition', 'decorated_definition']);
152
+ const _PY_FUNCTION_NODE_TYPES = new Set(['function_definition', 'decorated_definition']);
153
153
  /**
154
154
  * Extract element type from a Python type annotation AST node.
155
155
  * Handles:
@@ -564,16 +564,13 @@ export function extractElementTypeFromString(typeStr, pos = 'last') {
564
564
  const openAngle = typeStr.indexOf('<');
565
565
  const openSquare = typeStr.indexOf('[');
566
566
  let openIdx = -1;
567
- let openChar = '';
568
567
  let closeChar = '';
569
568
  if (openAngle >= 0 && (openSquare < 0 || openAngle < openSquare)) {
570
569
  openIdx = openAngle;
571
- openChar = '<';
572
570
  closeChar = '>';
573
571
  }
574
572
  else if (openSquare >= 0) {
575
573
  openIdx = openSquare;
576
- openChar = '[';
577
574
  closeChar = ']';
578
575
  }
579
576
  if (openIdx < 0)
@@ -0,0 +1,46 @@
1
+ /**
2
+ * Read-side decoder for `content` columns in lbug node rows.
3
+ *
4
+ * RFC 0001 Phase 2 introduces an optional `contentEncoding` column on
5
+ * every node table that has `content`. Default is `'none'` (passthrough)
6
+ * so existing reads keep working unchanged. When a writer opts into
7
+ * `--compress brotli|zstd`, the column carries the encoding tag and the
8
+ * `content` column carries base64-encoded compressed bytes — readers
9
+ * MUST run those bytes back through `decodeContent` before handing them
10
+ * to a consumer (MCP tool result, HTTP API response, embedding model,
11
+ * LLM input).
12
+ *
13
+ * Centralizing the decode in one helper has two benefits:
14
+ * 1. Shim sites are 2-line changes: add `, n.contentEncoding AS
15
+ * contentEncoding` to the Cypher RETURN, and pipe the row through
16
+ * `decodeContentField` (or `decodeContentRow`) at the boundary.
17
+ * 2. Anyone hunting for "where does the read path decode compressed
18
+ * bytes" greps for `decodeContentField` and gets every site in one
19
+ * shot — no per-table feature detection scattered across files.
20
+ */
21
+ /**
22
+ * Decode a single (content, contentEncoding) pair from a Cypher row.
23
+ *
24
+ * Returns the input content unchanged when:
25
+ * - the encoding is missing / empty / `'none'` (the common case for
26
+ * 1.6.x – 1.7.x indexes, plus any 1.8+ index written without
27
+ * `--compress`);
28
+ * - content is null/undefined (caller decides whether that's an error);
29
+ * - content is not a string (pre-Phase-2 indexes never wrote non-string
30
+ * content, but defensive: don't crash a read path on a malformed row).
31
+ *
32
+ * Throws (via `decodeContent`) only when the row claims an encoding this
33
+ * CLI build can't decode — that's a forward-compat error and the right
34
+ * behavior is to fail loudly rather than return wrong content.
35
+ */
36
+ export declare function decodeContentField(content: unknown, encoding: unknown): string | undefined;
37
+ /**
38
+ * Apply `decodeContentField` to a row that carries `content` and
39
+ * `contentEncoding` keys (or their numeric column-index aliases).
40
+ *
41
+ * The numeric-fallback shape (`r[N]`) mirrors LadybugDB's row format —
42
+ * driver versions vary on whether named keys are populated, so existing
43
+ * read sites do `r.content ?? r[N]`. This helper accepts the same
44
+ * pattern. Returns a NEW object (does not mutate input).
45
+ */
46
+ export declare function decodeContentRow<T extends Record<string, unknown>>(row: T, contentKey?: keyof T, encodingKey?: keyof T): T;
@@ -0,0 +1,64 @@
1
+ /**
2
+ * Read-side decoder for `content` columns in lbug node rows.
3
+ *
4
+ * RFC 0001 Phase 2 introduces an optional `contentEncoding` column on
5
+ * every node table that has `content`. Default is `'none'` (passthrough)
6
+ * so existing reads keep working unchanged. When a writer opts into
7
+ * `--compress brotli|zstd`, the column carries the encoding tag and the
8
+ * `content` column carries base64-encoded compressed bytes — readers
9
+ * MUST run those bytes back through `decodeContent` before handing them
10
+ * to a consumer (MCP tool result, HTTP API response, embedding model,
11
+ * LLM input).
12
+ *
13
+ * Centralizing the decode in one helper has two benefits:
14
+ * 1. Shim sites are 2-line changes: add `, n.contentEncoding AS
15
+ * contentEncoding` to the Cypher RETURN, and pipe the row through
16
+ * `decodeContentField` (or `decodeContentRow`) at the boundary.
17
+ * 2. Anyone hunting for "where does the read path decode compressed
18
+ * bytes" greps for `decodeContentField` and gets every site in one
19
+ * shot — no per-table feature detection scattered across files.
20
+ */
21
+ import { decodeContent } from '@codragraph/graphstore';
22
+ /**
23
+ * Decode a single (content, contentEncoding) pair from a Cypher row.
24
+ *
25
+ * Returns the input content unchanged when:
26
+ * - the encoding is missing / empty / `'none'` (the common case for
27
+ * 1.6.x – 1.7.x indexes, plus any 1.8+ index written without
28
+ * `--compress`);
29
+ * - content is null/undefined (caller decides whether that's an error);
30
+ * - content is not a string (pre-Phase-2 indexes never wrote non-string
31
+ * content, but defensive: don't crash a read path on a malformed row).
32
+ *
33
+ * Throws (via `decodeContent`) only when the row claims an encoding this
34
+ * CLI build can't decode — that's a forward-compat error and the right
35
+ * behavior is to fail loudly rather than return wrong content.
36
+ */
37
+ export function decodeContentField(content, encoding) {
38
+ if (content === undefined || content === null)
39
+ return undefined;
40
+ if (typeof content !== 'string')
41
+ return content;
42
+ if (typeof encoding !== 'string' || encoding === '' || encoding === 'none') {
43
+ return content;
44
+ }
45
+ return decodeContent(content, encoding);
46
+ }
47
+ /**
48
+ * Apply `decodeContentField` to a row that carries `content` and
49
+ * `contentEncoding` keys (or their numeric column-index aliases).
50
+ *
51
+ * The numeric-fallback shape (`r[N]`) mirrors LadybugDB's row format —
52
+ * driver versions vary on whether named keys are populated, so existing
53
+ * read sites do `r.content ?? r[N]`. This helper accepts the same
54
+ * pattern. Returns a NEW object (does not mutate input).
55
+ */
56
+ export function decodeContentRow(row, contentKey = 'content', encodingKey = 'contentEncoding') {
57
+ const content = row[contentKey];
58
+ if (content === undefined || content === null)
59
+ return row;
60
+ const encoding = row[encodingKey];
61
+ if (typeof encoding !== 'string' || encoding === '' || encoding === 'none')
62
+ return row;
63
+ return { ...row, [contentKey]: decodeContentField(content, encoding) };
64
+ }
@@ -13,6 +13,7 @@
13
13
  */
14
14
  import { KnowledgeGraph } from '../graph/types.js';
15
15
  import { NodeTableName } from './schema.js';
16
+ import { type ContentEncoding } from '@codragraph/graphstore';
16
17
  export declare const sanitizeUTF8: (str: string) => string;
17
18
  export declare const escapeCSVField: (value: string | number | undefined | null) => string;
18
19
  export declare const escapeCSVNumber: (value: number | undefined | null, defaultValue?: number) => string;
@@ -25,9 +26,4 @@ export interface StreamedCSVResult {
25
26
  relCsvPath: string;
26
27
  relRows: number;
27
28
  }
28
- /**
29
- * Stream all CSV data directly to disk files.
30
- * Iterates graph nodes exactly ONCE — routes each node to the right writer.
31
- * File contents are lazy-read from disk with a generous LRU cache.
32
- */
33
- export declare const streamAllCSVsToDisk: (graph: KnowledgeGraph, repoPath: string, csvDir: string) => Promise<StreamedCSVResult>;
29
+ export declare const streamAllCSVsToDisk: (graph: KnowledgeGraph, repoPath: string, csvDir: string, compress?: ContentEncoding) => Promise<StreamedCSVResult>;
@@ -14,6 +14,7 @@
14
14
  import fs from 'fs/promises';
15
15
  import { createWriteStream } from 'fs';
16
16
  import path from 'path';
17
+ import { encodeContent } from '@codragraph/graphstore';
17
18
  /** Flush buffered rows to disk every N rows */
18
19
  const FLUSH_EVERY = 500;
19
20
  // ============================================================================
@@ -184,7 +185,26 @@ class BufferedCSVWriter {
184
185
  * Iterates graph nodes exactly ONCE — routes each node to the right writer.
185
186
  * File contents are lazy-read from disk with a generous LRU cache.
186
187
  */
187
- export const streamAllCSVsToDisk = async (graph, repoPath, csvDir) => {
188
+ /**
189
+ * Apply RFC 0001 Phase 2 content encoding. Returns the on-the-wire string
190
+ * + the encoding tag to write into the per-row `contentEncoding` column.
191
+ *
192
+ * `compress: undefined | 'none'` is the default — content goes through
193
+ * unchanged and the tag is `'none'` (matches the schema DEFAULT, so older
194
+ * readers and the schema-default behavior stay in agreement).
195
+ *
196
+ * Always writing the tag column (even as 'none') keeps the CSV / COPY /
197
+ * schema layouts uniform regardless of compression mode. The wasted bytes
198
+ * are negligible — a few characters per row vs the kilobytes of content
199
+ * the column is alongside.
200
+ */
201
+ const applyEncoding = (content, compress) => {
202
+ if (!compress || compress === 'none') {
203
+ return { wireContent: content, tag: 'none' };
204
+ }
205
+ return { wireContent: encodeContent(content, compress), tag: compress };
206
+ };
207
+ export const streamAllCSVsToDisk = async (graph, repoPath, csvDir, compress) => {
188
208
  // Remove stale CSVs from previous crashed runs, then recreate
189
209
  try {
190
210
  await fs.rm(csvDir, { recursive: true, force: true });
@@ -196,26 +216,29 @@ export const streamAllCSVsToDisk = async (graph, repoPath, csvDir) => {
196
216
  const prevMax = process.getMaxListeners();
197
217
  process.setMaxListeners(prevMax + 40);
198
218
  const contentCache = new FileContentCache(repoPath);
199
- // Create writers for every node type up-front
200
- const fileWriter = new BufferedCSVWriter(path.join(csvDir, 'file.csv'), 'id,name,filePath,content');
219
+ // Create writers for every node type up-front. Content-bearing tables
220
+ // carry an extra `contentEncoding` column right after `content` to
221
+ // match the Phase 2 schema layout. Tables without `content` (Folder,
222
+ // Community, Process, Route, Tool) are unchanged.
223
+ const fileWriter = new BufferedCSVWriter(path.join(csvDir, 'file.csv'), 'id,name,filePath,content,contentEncoding');
201
224
  const folderWriter = new BufferedCSVWriter(path.join(csvDir, 'folder.csv'), 'id,name,filePath');
202
- const codeElementHeader = 'id,name,filePath,startLine,endLine,isExported,content,description';
225
+ const codeElementHeader = 'id,name,filePath,startLine,endLine,isExported,content,contentEncoding,description';
203
226
  const functionWriter = new BufferedCSVWriter(path.join(csvDir, 'function.csv'), codeElementHeader);
204
227
  const classWriter = new BufferedCSVWriter(path.join(csvDir, 'class.csv'), codeElementHeader);
205
228
  const interfaceWriter = new BufferedCSVWriter(path.join(csvDir, 'interface.csv'), codeElementHeader);
206
- const methodHeader = 'id,name,filePath,startLine,endLine,isExported,content,description,parameterCount,returnType';
229
+ const methodHeader = 'id,name,filePath,startLine,endLine,isExported,content,contentEncoding,description,parameterCount,returnType';
207
230
  const methodWriter = new BufferedCSVWriter(path.join(csvDir, 'method.csv'), methodHeader);
208
231
  const codeElemWriter = new BufferedCSVWriter(path.join(csvDir, 'codeelement.csv'), codeElementHeader);
209
232
  const communityWriter = new BufferedCSVWriter(path.join(csvDir, 'community.csv'), 'id,label,heuristicLabel,keywords,description,enrichedBy,cohesion,symbolCount');
210
233
  const processWriter = new BufferedCSVWriter(path.join(csvDir, 'process.csv'), 'id,label,heuristicLabel,processType,stepCount,communities,entryPointId,terminalId');
211
234
  // Section nodes have an extra 'level' column
212
- const sectionWriter = new BufferedCSVWriter(path.join(csvDir, 'section.csv'), 'id,name,filePath,startLine,endLine,level,content,description');
235
+ const sectionWriter = new BufferedCSVWriter(path.join(csvDir, 'section.csv'), 'id,name,filePath,startLine,endLine,level,content,contentEncoding,description');
213
236
  // Route nodes for API endpoint mapping
214
237
  const routeWriter = new BufferedCSVWriter(path.join(csvDir, 'route.csv'), 'id,name,filePath,responseKeys,errorKeys,middleware');
215
238
  // Tool nodes for MCP tool definitions
216
239
  const toolWriter = new BufferedCSVWriter(path.join(csvDir, 'tool.csv'), 'id,name,filePath,description');
217
240
  // Multi-language node types share the same CSV shape (no isExported column)
218
- const multiLangHeader = 'id,name,filePath,startLine,endLine,content,description';
241
+ const multiLangHeader = 'id,name,filePath,startLine,endLine,content,contentEncoding,description';
219
242
  const MULTI_LANG_TYPES = [
220
243
  'Struct',
221
244
  'Enum',
@@ -259,11 +282,13 @@ export const streamAllCSVsToDisk = async (graph, repoPath, csvDir) => {
259
282
  switch (node.label) {
260
283
  case 'File': {
261
284
  const content = await extractContent(node, contentCache);
285
+ const { wireContent, tag } = applyEncoding(content, compress);
262
286
  await fileWriter.addRow([
263
287
  escapeCSVField(node.id),
264
288
  escapeCSVField(node.properties.name || ''),
265
289
  escapeCSVField(node.properties.filePath || ''),
266
- escapeCSVField(content),
290
+ escapeCSVField(wireContent),
291
+ escapeCSVField(tag),
267
292
  ].join(','));
268
293
  break;
269
294
  }
@@ -306,6 +331,7 @@ export const streamAllCSVsToDisk = async (graph, repoPath, csvDir) => {
306
331
  }
307
332
  case 'Method': {
308
333
  const content = await extractContent(node, contentCache);
334
+ const { wireContent, tag } = applyEncoding(content, compress);
309
335
  await methodWriter.addRow([
310
336
  escapeCSVField(node.id),
311
337
  escapeCSVField(node.properties.name || ''),
@@ -313,7 +339,8 @@ export const streamAllCSVsToDisk = async (graph, repoPath, csvDir) => {
313
339
  escapeCSVNumber(node.properties.startLine, -1),
314
340
  escapeCSVNumber(node.properties.endLine, -1),
315
341
  node.properties.isExported ? 'true' : 'false',
316
- escapeCSVField(content),
342
+ escapeCSVField(wireContent),
343
+ escapeCSVField(tag),
317
344
  escapeCSVField(node.properties.description || ''),
318
345
  escapeCSVNumber(node.properties.parameterCount, 0),
319
346
  escapeCSVField(node.properties.returnType || ''),
@@ -322,6 +349,7 @@ export const streamAllCSVsToDisk = async (graph, repoPath, csvDir) => {
322
349
  }
323
350
  case 'Section': {
324
351
  const content = await extractContent(node, contentCache);
352
+ const { wireContent, tag } = applyEncoding(content, compress);
325
353
  await sectionWriter.addRow([
326
354
  escapeCSVField(node.id),
327
355
  escapeCSVField(node.properties.name || ''),
@@ -329,7 +357,8 @@ export const streamAllCSVsToDisk = async (graph, repoPath, csvDir) => {
329
357
  escapeCSVNumber(node.properties.startLine, -1),
330
358
  escapeCSVNumber(node.properties.endLine, -1),
331
359
  escapeCSVNumber(node.properties.level, 1),
332
- escapeCSVField(content),
360
+ escapeCSVField(wireContent),
361
+ escapeCSVField(tag),
333
362
  escapeCSVField(node.properties.description || ''),
334
363
  ].join(','));
335
364
  break;
@@ -366,6 +395,7 @@ export const streamAllCSVsToDisk = async (graph, repoPath, csvDir) => {
366
395
  const writer = codeWriterMap[node.label];
367
396
  if (writer) {
368
397
  const content = await extractContent(node, contentCache);
398
+ const { wireContent, tag } = applyEncoding(content, compress);
369
399
  await writer.addRow([
370
400
  escapeCSVField(node.id),
371
401
  escapeCSVField(node.properties.name || ''),
@@ -373,7 +403,8 @@ export const streamAllCSVsToDisk = async (graph, repoPath, csvDir) => {
373
403
  escapeCSVNumber(node.properties.startLine, -1),
374
404
  escapeCSVNumber(node.properties.endLine, -1),
375
405
  node.properties.isExported ? 'true' : 'false',
376
- escapeCSVField(content),
406
+ escapeCSVField(wireContent),
407
+ escapeCSVField(tag),
377
408
  escapeCSVField(node.properties.description || ''),
378
409
  ].join(','));
379
410
  }
@@ -382,13 +413,15 @@ export const streamAllCSVsToDisk = async (graph, repoPath, csvDir) => {
382
413
  const mlWriter = multiLangWriters.get(node.label);
383
414
  if (mlWriter) {
384
415
  const content = await extractContent(node, contentCache);
416
+ const { wireContent, tag } = applyEncoding(content, compress);
385
417
  await mlWriter.addRow([
386
418
  escapeCSVField(node.id),
387
419
  escapeCSVField(node.properties.name || ''),
388
420
  escapeCSVField(node.properties.filePath || ''),
389
421
  escapeCSVNumber(node.properties.startLine, -1),
390
422
  escapeCSVNumber(node.properties.endLine, -1),
391
- escapeCSVField(content),
423
+ escapeCSVField(wireContent),
424
+ escapeCSVField(tag),
392
425
  escapeCSVField(node.properties.description || ''),
393
426
  ].join(','));
394
427
  }