gitnexus 1.6.8-rc.46 → 1.6.8-rc.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -55,5 +55,22 @@ export interface StreamedCSVResult {
55
55
  * Stream all CSV data directly to disk files.
56
56
  * Iterates graph nodes exactly ONCE — routes each node to the right writer.
57
57
  * File contents are lazy-read from disk with a generous LRU cache.
58
+ *
59
+ * `onNodePhaseComplete` (optional, #2203 parallelism leg): fired exactly once,
60
+ * right after every node CSV is fully flushed to disk and BEFORE the
61
+ * relationship pass starts writing any `rel_*.csv`. It receives the finished
62
+ * node-file manifest so the caller can begin `COPY`-ing nodes while this
63
+ * function keeps generating relationship CSVs (the only single-writer-safe
64
+ * overlap — node `COPY` ‖ relationship emit). It is intentionally NOT awaited:
65
+ * the relationship pass proceeds concurrently with whatever the caller
66
+ * schedules. A synchronous throw from the callback is allowed and propagates out
67
+ * of this function (rejecting the returned promise) — it is raised before the
68
+ * relationship pass begins, so no `rel_*.csv` is written; `loadGraphToLbug` uses
69
+ * this to surface its PDG-manifest collision guard. The callback must NOT, however,
70
+ * schedule un-awaited async work that can reject unobserved. Absent ⇒ today's
71
+ * behavior, byte-for-byte.
58
72
  */
59
- export declare const streamAllCSVsToDisk: (graph: KnowledgeGraph, repoPath: string, csvDir: string) => Promise<StreamedCSVResult>;
73
+ export declare const streamAllCSVsToDisk: (graph: KnowledgeGraph, repoPath: string, csvDir: string, onNodePhaseComplete?: (nodeFiles: Map<NodeTableName, {
74
+ csvPath: string;
75
+ rows: number;
76
+ }>) => void) => Promise<StreamedCSVResult>;
@@ -33,6 +33,14 @@ const orderedNodes = (graph, sorted) => sorted ? [...graph.iterNodes()].sort(byG
33
33
  const orderedRelationships = (graph, sorted) => sorted ? [...graph.iterRelationships()].sort(byGraphId) : graph.iterRelationships();
34
34
  /** Flush buffered rows to disk every N rows */
35
35
  const FLUSH_EVERY = 500;
36
+ /**
37
+ * Yield the event loop every N relationship rows during the emit pass (#2226 F4)
38
+ * so a concurrent node COPY (the overlap in loadGraphToLbug) and write-stream
39
+ * drain callbacks get scheduling time during long synchronous emit stretches.
40
+ * Scheduling-only — never changes row content or order (byte-identical). Tuning
41
+ * constant, not load-bearing.
42
+ */
43
+ const REL_YIELD_EVERY = 5000;
36
44
  // ============================================================================
37
45
  // CSV ESCAPE UTILITIES
38
46
  // ============================================================================
@@ -239,8 +247,22 @@ export const buildBasicBlockRow = (node) => [
239
247
  * Stream all CSV data directly to disk files.
240
248
  * Iterates graph nodes exactly ONCE — routes each node to the right writer.
241
249
  * File contents are lazy-read from disk with a generous LRU cache.
250
+ *
251
+ * `onNodePhaseComplete` (optional, #2203 parallelism leg): fired exactly once,
252
+ * right after every node CSV is fully flushed to disk and BEFORE the
253
+ * relationship pass starts writing any `rel_*.csv`. It receives the finished
254
+ * node-file manifest so the caller can begin `COPY`-ing nodes while this
255
+ * function keeps generating relationship CSVs (the only single-writer-safe
256
+ * overlap — node `COPY` ‖ relationship emit). It is intentionally NOT awaited:
257
+ * the relationship pass proceeds concurrently with whatever the caller
258
+ * schedules. A synchronous throw from the callback is allowed and propagates out
259
+ * of this function (rejecting the returned promise) — it is raised before the
260
+ * relationship pass begins, so no `rel_*.csv` is written; `loadGraphToLbug` uses
261
+ * this to surface its PDG-manifest collision guard. The callback must NOT, however,
262
+ * schedule un-awaited async work that can reject unobserved. Absent ⇒ today's
263
+ * behavior, byte-for-byte.
242
264
  */
243
- export const streamAllCSVsToDisk = async (graph, repoPath, csvDir) => {
265
+ export const streamAllCSVsToDisk = async (graph, repoPath, csvDir, onNodePhaseComplete) => {
244
266
  // Deterministic (id-sorted) node/relationship row order when enabled;
245
267
  // default off = today's graph-insertion order (byte-identical).
246
268
  const sortOutput = parseTruthyEnv(process.env.GITNEXUS_SORT_GRAPH_OUTPUT);
@@ -502,30 +524,11 @@ export const streamAllCSVsToDisk = async (graph, repoPath, csvDir) => {
502
524
  ...multiLangWriters.values(),
503
525
  ];
504
526
  await Promise.all(allWriters.map((w) => w.finish()));
505
- // --- Stream relationships directly to per-FROM→TO-label-pair files ---
506
- // (#2203 U2) Route every edge to its pair file in this single pass. The old
507
- // monolithic relations.csv and its line-by-line re-read + per-edge regex
508
- // re-split in loadGraphToLbug are gone, so the ~1M-edge set is written and
509
- // read once instead of twice. The router applies the SAME label-derivation +
510
- // validTables filter as the legacy splitRelCsvByLabelPair, so the per-pair
511
- // files are byte-identical (asserted by the differential test).
512
- const relRouter = new RelPairRouter(csvDir, REL_CSV_HEADER, new Set(NODE_TABLES));
513
- try {
514
- for (const rel of orderedRelationships(graph, sortOutput)) {
515
- const pending = relRouter.route(rel.sourceId, rel.targetId, buildRelRow(rel));
516
- if (pending)
517
- await pending;
518
- }
519
- await relRouter.close();
520
- }
521
- catch (err) {
522
- relRouter.destroy();
523
- // Rethrow the real stream error (EMFILE / disk-full) rather than the generic
524
- // AbortError a pending drain-await rejects with — mirrors the retained
525
- // splitRelCsvByLabelPair's `throw streamError ?? err`.
526
- throw relRouter.lastError ?? err;
527
- }
528
- // Build result map — only include tables that have rows
527
+ // Build the node-file manifest now (all writers are flushed; `.rows` is
528
+ // final). Hoisted above the relationship pass so `onNodePhaseComplete` can
529
+ // hand the caller a complete node manifest to start COPY-ing while we keep
530
+ // generating relationship CSVs below (#2203 overlap). The same map is
531
+ // returned, so the result is unchanged when no callback is supplied.
529
532
  const nodeFiles = new Map();
530
533
  const tableMap = [
531
534
  ['File', fileWriter],
@@ -551,6 +554,38 @@ export const streamAllCSVsToDisk = async (graph, repoPath, csvDir) => {
551
554
  });
552
555
  }
553
556
  }
557
+ // Node CSVs are on disk; relationship CSVs have not been touched yet. Hand
558
+ // the manifest to the caller (not awaited — the rel pass runs concurrently).
559
+ onNodePhaseComplete?.(nodeFiles);
560
+ // --- Stream relationships directly to per-FROM→TO-label-pair files ---
561
+ // (#2203 U2) Route every edge to its pair file in this single pass. The old
562
+ // monolithic relations.csv — and its line-by-line re-read + per-edge regex
563
+ // re-split in loadGraphToLbug — are gone, so the ~1M-edge set is written and
564
+ // read once instead of twice. The router applies the SAME label-derivation +
565
+ // validTables filter as the legacy splitRelCsvByLabelPair, so the per-pair
566
+ // files are byte-identical (asserted by the differential test).
567
+ const relRouter = new RelPairRouter(csvDir, REL_CSV_HEADER, new Set(NODE_TABLES));
568
+ try {
569
+ let emitted = 0;
570
+ for (const rel of orderedRelationships(graph, sortOutput)) {
571
+ const pending = relRouter.route(rel.sourceId, rel.targetId, buildRelRow(rel));
572
+ if (pending)
573
+ await pending;
574
+ // Periodically hand the event loop back so the overlapped node COPY and
575
+ // write-stream drains run instead of starving behind this synchronous
576
+ // loop (#2226 F4). No effect on emitted bytes — pure scheduling.
577
+ if (++emitted % REL_YIELD_EVERY === 0)
578
+ await new Promise((r) => setImmediate(r));
579
+ }
580
+ await relRouter.close();
581
+ }
582
+ catch (err) {
583
+ relRouter.destroy();
584
+ // Rethrow the real stream error (EMFILE / disk-full) rather than the generic
585
+ // AbortError a pending drain-await rejects with — mirrors the retained
586
+ // splitRelCsvByLabelPair's `throw streamError ?? err`.
587
+ throw relRouter.lastError ?? err;
588
+ }
554
589
  return {
555
590
  nodeFiles,
556
591
  relsByPair: relRouter.byPair,
@@ -1,5 +1,6 @@
1
1
  import lbug from '@ladybugdb/core';
2
2
  import { KnowledgeGraph } from '../graph/types.js';
3
+ import { NodeTableName } from './schema.js';
3
4
  import type { PdgEmitManifest } from './pdg-emit-sink.js';
4
5
  import type { CachedEmbedding } from '../embeddings/types.js';
5
6
  import { type ExtensionEnsureOptions } from './extension-loader.js';
@@ -54,6 +55,18 @@ export declare const withLbugDb: <T>(dbPath: string, operation: () => Promise<T>
54
55
  readOnly?: boolean;
55
56
  }) => Promise<T>;
56
57
  export type LbugProgressCallback = (message: string) => void;
58
+ /**
59
+ * Persist a KnowledgeGraph: stream CSVs, then bulk-COPY nodes (overlapped with
60
+ * relationship emit — see the body) and relationships.
61
+ *
62
+ * NOT TRANSACTIONAL (#2226). Each `COPY` commits independently and there is no
63
+ * surrounding transaction, so a failure partway through — a node `COPY` that
64
+ * throws at the FK barrier, a relationship `COPY` failure, or a `pdgEmitManifest`
65
+ * collision raised after node rows have already committed in the overlap path —
66
+ * leaves a partially-loaded DB. The caller surfaces the error; recovery is a
67
+ * `--force` re-analyze (a full rebuild), not a partial retry. Callers must not
68
+ * assume the DB is either fully loaded or untouched after a rejection.
69
+ */
57
70
  export declare const loadGraphToLbug: (graph: KnowledgeGraph, repoPath: string, storagePath: string, onProgress?: LbugProgressCallback,
58
71
  /**
59
72
  * Streamed PDG-emit manifest (#2202). When present (streaming was on, full
@@ -69,6 +82,8 @@ pdgEmitManifest?: PdgEmitManifest) => Promise<{
69
82
  skippedRels: number;
70
83
  warnings: string[];
71
84
  }>;
85
+ export declare const COPY_CSV_OPTS = "(HEADER=true, ESCAPE='\"', DELIM=',', QUOTE='\"', PARALLEL=false, auto_detect=false)";
86
+ export declare const getCopyQuery: (table: NodeTableName, filePath: string) => string;
72
87
  /**
73
88
  * Insert a single node to LadybugDB
74
89
  * @param label - Node type (File, Function, Class, etc.)
@@ -740,6 +740,60 @@ const doInitLbug = async (dbPath, readOnly = false) => {
740
740
  currentDbPath = dbPath;
741
741
  return { db, conn };
742
742
  };
743
+ /**
744
+ * Run a COPY, retrying once with IGNORE_ERRORS=true (which skips row-level
745
+ * errors) on first failure. On a second failure, hand the RAW retry error to
746
+ * `onError` — each call site formats + slices its own message (#2226 F5: node
747
+ * COPY slices to 200 chars and throws; relationship COPY slices to 80 and warns,
748
+ * so the helper must not pre-format and lose that distinction). `onError` may
749
+ * throw to propagate the failure.
750
+ */
751
+ const copyCsvWithRetry = async (targetConn, copyQuery, onError) => {
752
+ try {
753
+ await queryAndDrain(targetConn, copyQuery);
754
+ }
755
+ catch {
756
+ try {
757
+ const retryQuery = copyQuery.replace('auto_detect=false)', 'auto_detect=false, IGNORE_ERRORS=true)');
758
+ await queryAndDrain(targetConn, retryQuery);
759
+ }
760
+ catch (retryErr) {
761
+ onError(retryErr);
762
+ }
763
+ }
764
+ };
765
+ /**
766
+ * Bulk-COPY every node CSV sequentially on the single writable connection
767
+ * (LadybugDB allows one write txn at a time). Extracted from loadGraphToLbug so
768
+ * it can run either at the node-phase boundary — overlapping the relationship
769
+ * emit pass (#2203) — or after emit in the serial escape-hatch path. Each COPY
770
+ * keeps the IGNORE_ERRORS=true retry; a hard failure throws (no node rows ⇒ the
771
+ * relationship COPY would dangle on missing endpoints).
772
+ */
773
+ const copyNodeCSVs = async (targetConn, nodeFileEntries, log, totalSteps) => {
774
+ let stepsDone = 0;
775
+ for (const [table, { csvPath, rows }] of nodeFileEntries) {
776
+ stepsDone++;
777
+ log(`Loading nodes ${stepsDone}/${totalSteps}: ${table} (${rows.toLocaleString()} rows)`);
778
+ const copyQuery = getCopyQuery(table, normalizeCopyPath(csvPath));
779
+ await copyCsvWithRetry(targetConn, copyQuery, (retryErr) => {
780
+ const retryMsg = retryErr instanceof Error ? retryErr.message : String(retryErr);
781
+ throw new Error(`COPY failed for ${table}: ${retryMsg.slice(0, 200)}`);
782
+ });
783
+ }
784
+ };
785
+ /**
786
+ * Persist a KnowledgeGraph: stream CSVs, then bulk-COPY nodes (overlapped with
787
+ * relationship emit — see the body) and relationships.
788
+ *
789
+ * NOT TRANSACTIONAL (#2226). Each `COPY` commits independently and there is no
790
+ * surrounding transaction, so a failure partway through — a node `COPY` that
791
+ * throws at the FK barrier, a relationship `COPY` failure, or a `pdgEmitManifest`
792
+ * collision raised after node rows have already committed in the overlap path —
793
+ * leaves a partially-loaded DB. The caller surfaces the error; recovery is a
794
+ * `--force` re-analyze (a full rebuild), not a partial retry. Callers must not
795
+ * assume the DB is either fully loaded or untouched after a rejection.
796
+ */
743
797
  export const loadGraphToLbug = async (graph, repoPath, storagePath, onProgress,
744
798
  /**
745
799
  * Streamed PDG-emit manifest (#2202). When present (streaming was on, full
@@ -761,31 +815,87 @@ pdgEmitManifest) => {
761
815
  // the gap that the DB-persistence path is un-timed today (the analyze
762
816
  // "emit" number is the scope-resolution emit bucket, not this COPY path).
763
817
  const PROF = process.env.PROF_LBUG_LOAD === '1';
818
+ // Escape hatch / differential oracle (#2203): force the legacy strictly-serial
819
+ // load order (emit everything, THEN COPY nodes, THEN COPY rels) instead of the
820
+ // default node-COPY ‖ rel-emit overlap. Lets an operator revert the behavior at
821
+ // runtime, and lets a test load the same graph both ways and assert identical
822
+ // persisted content.
823
+ const SERIAL = process.env.GITNEXUS_SERIAL_LBUG_LOAD === '1';
764
824
  const mark = () => (PROF ? process.hrtime.bigint() : 0n);
765
825
  const span = (a, b) => (Number(b - a) / 1e6).toFixed(1);
766
826
  const tStart = mark();
767
827
  const csvDir = resolveNativeSafeStorageDir(storagePath, 'csv');
768
- log('Streaming CSVs to disk...');
769
- const csvResult = await streamAllCSVsToDisk(graph, repoPath, csvDir);
770
- // Merge the streamed PDG-emit CSVs (#2202) into the COPY plan so the
771
- // BasicBlock node table + per-pair PDG edges (CFG / REACHING_DEF / CDG /
772
- // POST_DOMINATE / TAINTED / SANITIZES) load through the SAME node + per-pair
773
- // COPY loops as the structural CSVs. The graph held zero BasicBlocks when
774
- // streaming, so `streamAllCSVsToDisk` produced none of these — the manifest
775
- // is the sole source and there is no double-COPY. Absent ⇒ no-op.
776
- if (pdgEmitManifest) {
828
+ // The single writable connection (LadybugDB is single-writer). Captured as a
829
+ // const so the node-COPY closure has a non-null reference — TS cannot narrow
830
+ // the reassignable module-level `conn` across the callback boundary.
831
+ const writeConn = conn;
832
+ const validTables = new Set(NODE_TABLES);
833
+ // Merge the streamed PDG-emit node CSVs (#2202) into a node-file map. Collision
834
+ // guard: a BasicBlock in the in-memory graph during a streamed run is an
835
+ // invariant violation (streamAllCSVsToDisk would also emit basicblock.csv), so
836
+ // fail loudly rather than drop rows (#2202 review #3). Runs at the node-phase
837
+ // boundary so the manifest BasicBlock table COPYs with the structural CSVs.
838
+ const mergeManifestNodeFiles = (nodeFilesMap) => {
839
+ if (!pdgEmitManifest)
840
+ return;
777
841
  for (const [table, meta] of pdgEmitManifest.nodeFiles) {
778
- // A collision means a BasicBlock leaked into the in-memory graph during a
779
- // streamed run (streamAllCSVsToDisk then emitted a structural basicblock.csv).
780
- // That is a streaming-invariant violation — fail loudly rather than
781
- // silently overwrite one CSV with the other and drop its rows (#2202 review #3).
782
- if (csvResult.nodeFiles.has(table)) {
842
+ if (nodeFilesMap.has(table)) {
783
843
  throw new Error(`Streaming PDG manifest collides with a structural node CSV for "${table}" — ` +
784
844
  `the in-memory graph should hold zero ${table} nodes when streaming. ` +
785
845
  `A ${table} node leaked into the graph during a streamed emit.`);
786
846
  }
787
- csvResult.nodeFiles.set(table, meta);
847
+ nodeFilesMap.set(table, meta);
788
848
  }
849
+ };
850
+ // Node COPY is the only DB write that can overlap relationship CSV emit: the
851
+ // rel pass writes new rel_*.csv files and never touches `conn`, while node COPY
852
+ // uses `conn` and never touches the rel files. We start node COPY at the
853
+ // node-phase boundary and let the rel pass run concurrently — the only
854
+ // single-writer-safe parallelism (#2203). The rel COPY still waits for node
855
+ // COPY (FK precondition), so the DB load order is unchanged.
856
+ let nodeCopyPromise;
857
+ let nodeCopyError;
858
+ const beginNodeCopy = (nodeFilesMap) => {
859
+ mergeManifestNodeFiles(nodeFilesMap);
860
+ const entries = [...nodeFilesMap.entries()];
861
+ // copyNodeCSVs logs node progress as step/total; it processes only node
862
+ // tables (the rel COPY has its own "Loading edges" progress line), so the
863
+ // denominator is the node-table count — not +1 reserving a rel step.
864
+ // .catch captures the failure so an overlapped (mid-emit) rejection cannot
865
+ // surface as an unhandled rejection; it is rethrown at the FK barrier below.
866
+ nodeCopyPromise = copyNodeCSVs(writeConn, entries, log, entries.length).catch((e) => {
867
+ nodeCopyError = e;
868
+ });
869
+ };
870
+ log('Streaming CSVs to disk...');
871
+ let csvResult;
872
+ try {
873
+ csvResult = SERIAL
874
+ ? await streamAllCSVsToDisk(graph, repoPath, csvDir)
875
+ : await streamAllCSVsToDisk(graph, repoPath, csvDir, beginNodeCopy);
876
+ }
877
+ catch (emitErr) {
878
+ // Relationship emit failed. In overlap mode a node COPY may be in flight —
879
+ // settle it (the .catch above means this never rejects) before rethrowing so
880
+ // it cannot leak as an unhandled rejection.
881
+ if (nodeCopyPromise)
882
+ await nodeCopyPromise;
883
+ // If node COPY ALSO failed, emitErr wins the throw — log the swallowed node
884
+ // error so a half-loaded DB isn't misattributed to the emit failure alone.
885
+ if (nodeCopyError) {
886
+ logger.warn({ err: nodeCopyError }, '[lbug-load] node COPY also failed while relationship emit was failing');
887
+ }
888
+ throw emitErr;
889
+ }
890
+ const tCsv = mark();
891
+ // Merge the streamed PDG-emit per-pair rel CSVs (#2202) into the COPY plan —
892
+ // collision-guarded. Done BEFORE node COPY so the serial escape hatch detects a
893
+ // manifest/structural pair collision before committing any node rows (legacy
894
+ // parity with the pre-overlap path), and the overlap path detects it as early
895
+ // as csvResult is available. When a manifest is present, streaming was on and
896
+ // the in-memory graph held zero BasicBlocks, so a structural collision means a
897
+ // streaming-invariant violation — fail loudly rather than load corrupt data.
898
+ if (pdgEmitManifest) {
789
899
  for (const [pairKey, meta] of pdgEmitManifest.relsByPair) {
790
900
  if (csvResult.relsByPair.has(pairKey)) {
791
901
  throw new Error(`Streaming PDG manifest collides with a structural relationship CSV for pair ` +
@@ -795,30 +905,17 @@ pdgEmitManifest) => {
795
905
  csvResult.totalValidRels += meta.rows;
796
906
  }
797
907
  }
798
- const tCsv = mark();
799
- const validTables = new Set(NODE_TABLES);
800
- // Bulk COPY all node CSVs (sequential — LadybugDB allows only one write txn at a time)
801
- const nodeFiles = [...csvResult.nodeFiles.entries()];
802
- const totalSteps = nodeFiles.length + 1; // +1 for relationships
803
- let stepsDone = 0;
804
- for (const [table, { csvPath, rows }] of nodeFiles) {
805
- stepsDone++;
806
- log(`Loading nodes ${stepsDone}/${totalSteps}: ${table} (${rows.toLocaleString()} rows)`);
807
- const normalizedPath = normalizeCopyPath(csvPath);
808
- const copyQuery = getCopyQuery(table, normalizedPath);
809
- try {
810
- await queryAndDrain(conn, copyQuery);
811
- }
812
- catch (err) {
813
- try {
814
- const retryQuery = copyQuery.replace('auto_detect=false)', 'auto_detect=false, IGNORE_ERRORS=true)');
815
- await queryAndDrain(conn, retryQuery);
816
- }
817
- catch (retryErr) {
818
- const retryMsg = retryErr instanceof Error ? retryErr.message : String(retryErr);
819
- throw new Error(`COPY failed for ${table}: ${retryMsg.slice(0, 200)}`);
820
- }
821
- }
908
+ // Serial path: all CSVs are on disk and node COPY has not started — start it
909
+ // here so the barrier below blocks on it exactly as the legacy path did.
910
+ if (SERIAL)
911
+ beginNodeCopy(csvResult.nodeFiles);
912
+ // FK barrier: node rows must exist before the relationship COPY resolves their
913
+ // endpoints. In overlap mode most of node COPY was hidden behind rel emit, so
914
+ // this await is the *residual* node-COPY time (≈0 when fully overlapped).
915
+ if (nodeCopyPromise)
916
+ await nodeCopyPromise;
917
+ if (nodeCopyError) {
918
+ throw nodeCopyError instanceof Error ? nodeCopyError : new Error(String(nodeCopyError));
822
919
  }
823
920
  const tCopyNodes = mark();
824
921
  // Bulk COPY relationships. They were already routed to per-FROM→TO-label-pair
@@ -838,25 +935,17 @@ pdgEmitManifest) => {
838
935
  pairIdx++;
839
936
  const [fromLabel, toLabel] = pairKey.split('|');
840
937
  const normalizedPath = normalizeCopyPath(pairCsvPath);
938
+ // PARALLEL=false is load-bearing here too — see COPY_CSV_OPTS (#2203 / kuzudb/kuzu#5778).
841
939
  const copyQuery = `COPY ${REL_TABLE_NAME} FROM "${normalizedPath}" (from="${fromLabel}", to="${toLabel}", HEADER=true, ESCAPE='"', DELIM=',', QUOTE='"', PARALLEL=false, auto_detect=false)`;
842
940
  if (pairIdx % 5 === 0 || rows > 1000) {
843
941
  log(`Loading edges: ${pairIdx}/${relsByPair.size} types (${fromLabel} -> ${toLabel})`);
844
942
  }
845
- try {
846
- await queryAndDrain(conn, copyQuery);
847
- }
848
- catch (err) {
849
- try {
850
- const retryQuery = copyQuery.replace('auto_detect=false)', 'auto_detect=false, IGNORE_ERRORS=true)');
851
- await queryAndDrain(conn, retryQuery);
852
- }
853
- catch (retryErr) {
854
- const retryMsg = retryErr instanceof Error ? retryErr.message : String(retryErr);
855
- warnings.push(`${fromLabel}->${toLabel} (${rows} edges): ${retryMsg.slice(0, 80)}`);
856
- failedPairEdges += rows;
857
- failedPairCsvPaths.add(pairCsvPath);
858
- }
859
- }
943
+ await copyCsvWithRetry(conn, copyQuery, (retryErr) => {
944
+ const retryMsg = retryErr instanceof Error ? retryErr.message : String(retryErr);
945
+ warnings.push(`${fromLabel}->${toLabel} (${rows} edges): ${retryMsg.slice(0, 80)}`);
946
+ failedPairEdges += rows;
947
+ failedPairCsvPaths.add(pairCsvPath);
948
+ });
860
949
  // Only delete if not in failedPairCsvPaths (needed for fallback)
861
950
  if (!failedPairCsvPaths.has(pairCsvPath)) {
862
951
  try {
@@ -919,7 +1008,12 @@ pdgEmitManifest) => {
919
1008
  let totalNodeRows = 0;
920
1009
  for (const [, { rows }] of csvResult.nodeFiles)
921
1010
  totalNodeRows += rows;
922
- logger.warn(`[lbug-load prof] csv-emit=${span(tStart, tCsv)}ms ` +
1011
+ // `mode` records which load path ran. In overlap mode `csv-emit` is the wall
1012
+ // to streamAllCSVsToDisk's return (node COPY overlapped part of it) and
1013
+ // `copy-nodes` is the RESIDUAL node-COPY await after emit returned — it
1014
+ // trends to 0 as the overlap hides node COPY behind relationship emit. In
1015
+ // serial mode the buckets carry their legacy, disjoint meaning.
1016
+ logger.warn(`[lbug-load prof] mode=${SERIAL ? 'serial' : 'overlap'} csv-emit=${span(tStart, tCsv)}ms ` +
923
1017
  `copy-nodes=${span(tCsv, tCopyNodes)}ms copy-rels=${span(tCopyNodes, tCopyRels)}ms ` +
924
1018
  `fallback=${span(tCopyRels, tFallback)}ms total=${span(tStart, tEnd)}ms ` +
925
1019
  `(${totalNodeRows} nodes, ${insertedRels} rels)`);
@@ -930,7 +1024,18 @@ pdgEmitManifest) => {
930
1024
  // Source code content is full of backslashes which confuse the auto-detection.
931
1025
  // We MUST explicitly set ESCAPE='"' to use RFC 4180 escaping, and disable auto_detect to prevent
932
1026
  // LadybugDB from overriding our settings based on sample rows.
933
- const COPY_CSV_OPTS = `(HEADER=true, ESCAPE='"', DELIM=',', QUOTE='"', PARALLEL=false, auto_detect=false)`;
1027
+ //
1028
+ // PARALLEL=false IS LOAD-BEARING FOR CORRECTNESS — DO NOT FLIP IT (#2203).
1029
+ // LadybugDB's parallel CSV reader (Kuzu-derived; default PARALLEL=true) splits the
1030
+ // file into byte ranges parsed concurrently, and CANNOT determine line boundaries
1031
+ // when a quoted field contains an embedded newline — it errors with "Quoted newlines
1032
+ // are not supported in parallel CSV reader. Please specify PARALLEL=FALSE", or worse,
1033
+ // mis-parses silently (upstream kuzudb/kuzu#5778, still open). Our `content`/`text`
1034
+ // columns hold source code, so quoted multiline fields are guaranteed. PARALLEL=false
1035
+ // is therefore required, not conservative. The multiline-quoted round-trip in
1036
+ // test/integration/copy-parallel-invariant.test.ts fails loudly if this is ever flipped.
1037
+ // Exported so that test asserts the invariant statically as well.
1038
+ export const COPY_CSV_OPTS = `(HEADER=true, ESCAPE='"', DELIM=',', QUOTE='"', PARALLEL=false, auto_detect=false)`;
934
1039
  // Multi-language table names that were created with backticks in CODE_ELEMENT_BASE
935
1040
  // and must always be referenced with backticks in queries
936
1041
  const BACKTICK_TABLES = new Set([
@@ -996,7 +1101,7 @@ const TABLES_WITH_EXPORTED = new Set([
996
1101
  'Method',
997
1102
  'CodeElement',
998
1103
  ]);
999
- const getCopyQuery = (table, filePath) => {
1104
+ export const getCopyQuery = (table, filePath) => {
1000
1105
  const t = escapeTableName(table);
1001
1106
  if (table === 'File') {
1002
1107
  return `COPY ${t}(id, name, filePath, content) FROM "${filePath}" ${COPY_CSV_OPTS}`;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "gitnexus",
3
- "version": "1.6.8-rc.46",
3
+ "version": "1.6.8-rc.47",
4
4
  "description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
5
5
  "author": "Abhigyan Patwari",
6
6
  "license": "PolyForm-Noncommercial-1.0.0",