gitnexus 1.2.9 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,22 +1,16 @@
1
1
  export interface WorkerPool {
2
2
  /**
3
3
  * Dispatch items across workers. Items are split into chunks (one per worker),
4
- * each worker processes its chunk, and results are concatenated back in order.
5
- *
6
- * @param onProgress - Called with cumulative files processed across all workers
4
+ * each worker processes its chunk via sub-batches to limit peak memory,
5
+ * and results are concatenated back in order.
7
6
  */
8
7
  dispatch<TInput, TResult>(items: TInput[], onProgress?: (filesProcessed: number) => void): Promise<TResult[]>;
9
- /**
10
- * Terminate all workers. Must be called when done.
11
- */
8
+ /** Terminate all workers. Must be called when done. */
12
9
  terminate(): Promise<void>;
13
10
  /** Number of workers in the pool */
14
11
  readonly size: number;
15
12
  }
16
13
  /**
17
14
  * Create a pool of worker threads.
18
- *
19
- * @param workerUrl - URL to the worker script (use `new URL('./parse-worker.js', import.meta.url)`)
20
- * @param poolSize - Number of workers (defaults to cpus - 1, minimum 1)
21
15
  */
22
16
  export declare const createWorkerPool: (workerUrl: URL, poolSize?: number) => WorkerPool;
@@ -1,13 +1,18 @@
1
1
  import { Worker } from 'node:worker_threads';
2
2
  import os from 'node:os';
3
+ /**
4
+ * Max files to send to a worker in a single postMessage.
5
+ * Keeps structured-clone memory bounded per sub-batch.
6
+ */
7
+ const SUB_BATCH_SIZE = 1500;
8
+ /** Per sub-batch timeout. If a single sub-batch takes longer than this,
9
+ * likely a pathological file (e.g. minified 50MB JS). Fail fast. */
10
+ const SUB_BATCH_TIMEOUT_MS = 30_000;
3
11
  /**
4
12
  * Create a pool of worker threads.
5
- *
6
- * @param workerUrl - URL to the worker script (use `new URL('./parse-worker.js', import.meta.url)`)
7
- * @param poolSize - Number of workers (defaults to cpus - 1, minimum 1)
8
13
  */
9
14
  export const createWorkerPool = (workerUrl, poolSize) => {
10
- const size = poolSize ?? Math.max(1, os.cpus().length - 1);
15
+ const size = poolSize ?? Math.min(8, Math.max(1, os.cpus().length - 1));
11
16
  const workers = [];
12
17
  for (let i = 0; i < size; i++) {
13
18
  workers.push(new Worker(workerUrl));
@@ -15,32 +20,47 @@ export const createWorkerPool = (workerUrl, poolSize) => {
15
20
  const dispatch = (items, onProgress) => {
16
21
  if (items.length === 0)
17
22
  return Promise.resolve([]);
18
- // Split items into one chunk per worker
19
23
  const chunkSize = Math.ceil(items.length / size);
20
24
  const chunks = [];
21
25
  for (let i = 0; i < items.length; i += chunkSize) {
22
26
  chunks.push(items.slice(i, i + chunkSize));
23
27
  }
24
- // Track per-worker progress for cumulative reporting
25
28
  const workerProgress = new Array(chunks.length).fill(0);
26
- // Send one chunk to each worker, collect results
27
29
  const promises = chunks.map((chunk, i) => {
28
30
  const worker = workers[i];
29
31
  return new Promise((resolve, reject) => {
30
32
  let settled = false;
33
+ let subBatchTimer = null;
31
34
  const cleanup = () => {
32
- clearTimeout(timer);
35
+ if (subBatchTimer)
36
+ clearTimeout(subBatchTimer);
33
37
  worker.removeListener('message', handler);
34
38
  worker.removeListener('error', errorHandler);
35
39
  worker.removeListener('exit', exitHandler);
36
40
  };
37
- const timer = setTimeout(() => {
38
- if (!settled) {
39
- settled = true;
40
- cleanup();
41
- reject(new Error(`Worker ${i} timed out after 5 minutes (chunk: ${chunk.length} items). Worker may have crashed or is processing too much data.`));
41
+ const resetSubBatchTimer = () => {
42
+ if (subBatchTimer)
43
+ clearTimeout(subBatchTimer);
44
+ subBatchTimer = setTimeout(() => {
45
+ if (!settled) {
46
+ settled = true;
47
+ cleanup();
48
+ reject(new Error(`Worker ${i} sub-batch timed out after ${SUB_BATCH_TIMEOUT_MS / 1000}s (chunk: ${chunk.length} items).`));
49
+ }
50
+ }, SUB_BATCH_TIMEOUT_MS);
51
+ };
52
+ let subBatchIdx = 0;
53
+ const sendNextSubBatch = () => {
54
+ const start = subBatchIdx * SUB_BATCH_SIZE;
55
+ if (start >= chunk.length) {
56
+ worker.postMessage({ type: 'flush' });
57
+ return;
42
58
  }
43
- }, 5 * 60 * 1000);
59
+ const subBatch = chunk.slice(start, start + SUB_BATCH_SIZE);
60
+ subBatchIdx++;
61
+ resetSubBatchTimer();
62
+ worker.postMessage({ type: 'sub-batch', files: subBatch });
63
+ };
44
64
  const handler = (msg) => {
45
65
  if (settled)
46
66
  return;
@@ -51,8 +71,10 @@ export const createWorkerPool = (workerUrl, poolSize) => {
51
71
  onProgress(total);
52
72
  }
53
73
  }
74
+ else if (msg && msg.type === 'sub-batch-done') {
75
+ sendNextSubBatch();
76
+ }
54
77
  else if (msg && msg.type === 'error') {
55
- // Error reported by worker via postMessage
56
78
  settled = true;
57
79
  cleanup();
58
80
  reject(new Error(`Worker ${i} error: ${msg.error}`));
@@ -63,7 +85,6 @@ export const createWorkerPool = (workerUrl, poolSize) => {
63
85
  resolve(msg.data);
64
86
  }
65
87
  else {
66
- // Legacy: treat any non-typed message as result
67
88
  settled = true;
68
89
  cleanup();
69
90
  resolve(msg);
@@ -80,13 +101,13 @@ export const createWorkerPool = (workerUrl, poolSize) => {
80
101
  if (!settled) {
81
102
  settled = true;
82
103
  cleanup();
83
- reject(new Error(`Worker ${i} exited unexpectedly with code ${code}. This usually indicates an out-of-memory crash or native addon failure.`));
104
+ reject(new Error(`Worker ${i} exited with code ${code}. Likely OOM or native addon failure.`));
84
105
  }
85
106
  };
86
107
  worker.on('message', handler);
87
108
  worker.once('error', errorHandler);
88
109
  worker.once('exit', exitHandler);
89
- worker.postMessage(chunk);
110
+ sendNextSubBatch();
90
111
  });
91
112
  });
92
113
  return Promise.all(promises);
@@ -1,8 +1,10 @@
1
1
  /**
2
2
  * CSV Generator for KuzuDB Hybrid Schema
3
3
  *
4
- * Generates separate CSV files for each node table and one relation CSV.
5
- * This enables efficient bulk loading via COPY FROM for hybrid schema.
4
+ * Streams CSV rows directly to disk files in a single pass over graph nodes.
5
+ * File contents are lazy-read from disk per-node to avoid holding the entire
6
+ * repo in RAM. Rows are buffered (FLUSH_EVERY) before writing to minimize
7
+ * per-row Promise overhead.
6
8
  *
7
9
  * RFC 4180 Compliant:
8
10
  * - Fields containing commas, double quotes, or newlines are enclosed in double quotes
@@ -11,12 +13,17 @@
11
13
  */
12
14
  import { KnowledgeGraph } from '../graph/types.js';
13
15
  import { NodeTableName } from './schema.js';
14
- export interface CSVData {
15
- nodes: Map<NodeTableName, string>;
16
- relCSV: string;
16
+ export interface StreamedCSVResult {
17
+ nodeFiles: Map<NodeTableName, {
18
+ csvPath: string;
19
+ rows: number;
20
+ }>;
21
+ relCsvPath: string;
22
+ relRows: number;
17
23
  }
18
24
  /**
19
- * Generate all CSV data for hybrid schema bulk loading
20
- * Returns Maps of node table name -> CSV content, and single relation CSV
25
+ * Stream all CSV data directly to disk files.
26
  * Iterates graph nodes exactly ONCE, routing each node to the right writer.
27
+ * File contents are lazy-read from disk with a generous LRU cache.
21
28
  */
22
- export declare const generateAllCSVs: (graph: KnowledgeGraph, fileContents: Map<string, string>) => CSVData;
29
+ export declare const streamAllCSVsToDisk: (graph: KnowledgeGraph, repoPath: string, csvDir: string) => Promise<StreamedCSVResult>;