gitnexus 1.2.9 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/analyze.js +69 -28
- package/dist/cli/index.js +20 -0
- package/dist/core/graph/graph.js +5 -0
- package/dist/core/graph/types.d.ts +12 -1
- package/dist/core/ingestion/call-processor.js +52 -32
- package/dist/core/ingestion/community-processor.js +75 -40
- package/dist/core/ingestion/filesystem-walker.d.ts +23 -0
- package/dist/core/ingestion/filesystem-walker.js +38 -3
- package/dist/core/ingestion/import-processor.d.ts +11 -3
- package/dist/core/ingestion/import-processor.js +27 -11
- package/dist/core/ingestion/parsing-processor.js +2 -4
- package/dist/core/ingestion/pipeline.js +142 -135
- package/dist/core/ingestion/process-processor.js +12 -11
- package/dist/core/ingestion/workers/parse-worker.js +67 -6
- package/dist/core/ingestion/workers/worker-pool.d.ts +3 -9
- package/dist/core/ingestion/workers/worker-pool.js +39 -18
- package/dist/core/kuzu/csv-generator.d.ts +15 -8
- package/dist/core/kuzu/csv-generator.js +258 -196
- package/dist/core/kuzu/kuzu-adapter.d.ts +1 -4
- package/dist/core/kuzu/kuzu-adapter.js +75 -63
- package/dist/core/kuzu/schema.d.ts +1 -1
- package/dist/core/kuzu/schema.js +10 -0
- package/dist/types/pipeline.d.ts +6 -2
- package/dist/types/pipeline.js +6 -4
- package/package.json +1 -1
|
@@ -1,22 +1,16 @@
|
|
|
1
1
|
export interface WorkerPool {
|
|
2
2
|
/**
|
|
3
3
|
* Dispatch items across workers. Items are split into chunks (one per worker),
|
|
4
|
-
* each worker processes its chunk
|
|
5
|
-
*
|
|
6
|
-
* @param onProgress - Called with cumulative files processed across all workers
|
|
4
|
+
* each worker processes its chunk via sub-batches to limit peak memory,
|
|
5
|
+
* and results are concatenated back in order.
|
|
7
6
|
*/
|
|
8
7
|
dispatch<TInput, TResult>(items: TInput[], onProgress?: (filesProcessed: number) => void): Promise<TResult[]>;
|
|
9
|
-
/**
|
|
10
|
-
* Terminate all workers. Must be called when done.
|
|
11
|
-
*/
|
|
8
|
+
/** Terminate all workers. Must be called when done. */
|
|
12
9
|
terminate(): Promise<void>;
|
|
13
10
|
/** Number of workers in the pool */
|
|
14
11
|
readonly size: number;
|
|
15
12
|
}
|
|
16
13
|
/**
|
|
17
14
|
* Create a pool of worker threads.
|
|
18
|
-
*
|
|
19
|
-
* @param workerUrl - URL to the worker script (use `new URL('./parse-worker.js', import.meta.url)`)
|
|
20
|
-
* @param poolSize - Number of workers (defaults to cpus - 1, minimum 1)
|
|
21
15
|
*/
|
|
22
16
|
export declare const createWorkerPool: (workerUrl: URL, poolSize?: number) => WorkerPool;
|
|
@@ -1,13 +1,18 @@
|
|
|
1
1
|
import { Worker } from 'node:worker_threads';
|
|
2
2
|
import os from 'node:os';
|
|
3
|
+
/**
|
|
4
|
+
* Max files to send to a worker in a single postMessage.
|
|
5
|
+
* Keeps structured-clone memory bounded per sub-batch.
|
|
6
|
+
*/
|
|
7
|
+
const SUB_BATCH_SIZE = 1500;
|
|
8
|
+
/** Per sub-batch timeout. If a single sub-batch takes longer than this,
|
|
9
|
+
* likely a pathological file (e.g. minified 50MB JS). Fail fast. */
|
|
10
|
+
const SUB_BATCH_TIMEOUT_MS = 30_000;
|
|
3
11
|
/**
|
|
4
12
|
* Create a pool of worker threads.
|
|
5
|
-
*
|
|
6
|
-
* @param workerUrl - URL to the worker script (use `new URL('./parse-worker.js', import.meta.url)`)
|
|
7
|
-
* @param poolSize - Number of workers (defaults to cpus - 1, minimum 1)
|
|
8
13
|
*/
|
|
9
14
|
export const createWorkerPool = (workerUrl, poolSize) => {
|
|
10
|
-
const size = poolSize ?? Math.max(1, os.cpus().length - 1);
|
|
15
|
+
const size = poolSize ?? Math.min(8, Math.max(1, os.cpus().length - 1));
|
|
11
16
|
const workers = [];
|
|
12
17
|
for (let i = 0; i < size; i++) {
|
|
13
18
|
workers.push(new Worker(workerUrl));
|
|
@@ -15,32 +20,47 @@ export const createWorkerPool = (workerUrl, poolSize) => {
|
|
|
15
20
|
const dispatch = (items, onProgress) => {
|
|
16
21
|
if (items.length === 0)
|
|
17
22
|
return Promise.resolve([]);
|
|
18
|
-
// Split items into one chunk per worker
|
|
19
23
|
const chunkSize = Math.ceil(items.length / size);
|
|
20
24
|
const chunks = [];
|
|
21
25
|
for (let i = 0; i < items.length; i += chunkSize) {
|
|
22
26
|
chunks.push(items.slice(i, i + chunkSize));
|
|
23
27
|
}
|
|
24
|
-
// Track per-worker progress for cumulative reporting
|
|
25
28
|
const workerProgress = new Array(chunks.length).fill(0);
|
|
26
|
-
// Send one chunk to each worker, collect results
|
|
27
29
|
const promises = chunks.map((chunk, i) => {
|
|
28
30
|
const worker = workers[i];
|
|
29
31
|
return new Promise((resolve, reject) => {
|
|
30
32
|
let settled = false;
|
|
33
|
+
let subBatchTimer = null;
|
|
31
34
|
const cleanup = () => {
|
|
32
|
-
|
|
35
|
+
if (subBatchTimer)
|
|
36
|
+
clearTimeout(subBatchTimer);
|
|
33
37
|
worker.removeListener('message', handler);
|
|
34
38
|
worker.removeListener('error', errorHandler);
|
|
35
39
|
worker.removeListener('exit', exitHandler);
|
|
36
40
|
};
|
|
37
|
-
const
|
|
38
|
-
if (
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
41
|
+
const resetSubBatchTimer = () => {
|
|
42
|
+
if (subBatchTimer)
|
|
43
|
+
clearTimeout(subBatchTimer);
|
|
44
|
+
subBatchTimer = setTimeout(() => {
|
|
45
|
+
if (!settled) {
|
|
46
|
+
settled = true;
|
|
47
|
+
cleanup();
|
|
48
|
+
reject(new Error(`Worker ${i} sub-batch timed out after ${SUB_BATCH_TIMEOUT_MS / 1000}s (chunk: ${chunk.length} items).`));
|
|
49
|
+
}
|
|
50
|
+
}, SUB_BATCH_TIMEOUT_MS);
|
|
51
|
+
};
|
|
52
|
+
let subBatchIdx = 0;
|
|
53
|
+
const sendNextSubBatch = () => {
|
|
54
|
+
const start = subBatchIdx * SUB_BATCH_SIZE;
|
|
55
|
+
if (start >= chunk.length) {
|
|
56
|
+
worker.postMessage({ type: 'flush' });
|
|
57
|
+
return;
|
|
42
58
|
}
|
|
43
|
-
|
|
59
|
+
const subBatch = chunk.slice(start, start + SUB_BATCH_SIZE);
|
|
60
|
+
subBatchIdx++;
|
|
61
|
+
resetSubBatchTimer();
|
|
62
|
+
worker.postMessage({ type: 'sub-batch', files: subBatch });
|
|
63
|
+
};
|
|
44
64
|
const handler = (msg) => {
|
|
45
65
|
if (settled)
|
|
46
66
|
return;
|
|
@@ -51,8 +71,10 @@ export const createWorkerPool = (workerUrl, poolSize) => {
|
|
|
51
71
|
onProgress(total);
|
|
52
72
|
}
|
|
53
73
|
}
|
|
74
|
+
else if (msg && msg.type === 'sub-batch-done') {
|
|
75
|
+
sendNextSubBatch();
|
|
76
|
+
}
|
|
54
77
|
else if (msg && msg.type === 'error') {
|
|
55
|
-
// Error reported by worker via postMessage
|
|
56
78
|
settled = true;
|
|
57
79
|
cleanup();
|
|
58
80
|
reject(new Error(`Worker ${i} error: ${msg.error}`));
|
|
@@ -63,7 +85,6 @@ export const createWorkerPool = (workerUrl, poolSize) => {
|
|
|
63
85
|
resolve(msg.data);
|
|
64
86
|
}
|
|
65
87
|
else {
|
|
66
|
-
// Legacy: treat any non-typed message as result
|
|
67
88
|
settled = true;
|
|
68
89
|
cleanup();
|
|
69
90
|
resolve(msg);
|
|
@@ -80,13 +101,13 @@ export const createWorkerPool = (workerUrl, poolSize) => {
|
|
|
80
101
|
if (!settled) {
|
|
81
102
|
settled = true;
|
|
82
103
|
cleanup();
|
|
83
|
-
reject(new Error(`Worker ${i} exited
|
|
104
|
+
reject(new Error(`Worker ${i} exited with code ${code}. Likely OOM or native addon failure.`));
|
|
84
105
|
}
|
|
85
106
|
};
|
|
86
107
|
worker.on('message', handler);
|
|
87
108
|
worker.once('error', errorHandler);
|
|
88
109
|
worker.once('exit', exitHandler);
|
|
89
|
-
|
|
110
|
+
sendNextSubBatch();
|
|
90
111
|
});
|
|
91
112
|
});
|
|
92
113
|
return Promise.all(promises);
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* CSV Generator for KuzuDB Hybrid Schema
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
4
|
+
* Streams CSV rows directly to disk files in a single pass over graph nodes.
|
|
5
|
+
* File contents are lazy-read from disk per-node to avoid holding the entire
|
|
6
|
+
* repo in RAM. Rows are buffered (FLUSH_EVERY) before writing to minimize
|
|
7
|
+
* per-row Promise overhead.
|
|
6
8
|
*
|
|
7
9
|
* RFC 4180 Compliant:
|
|
8
10
|
* - Fields containing commas, double quotes, or newlines are enclosed in double quotes
|
|
@@ -11,12 +13,17 @@
|
|
|
11
13
|
*/
|
|
12
14
|
import { KnowledgeGraph } from '../graph/types.js';
|
|
13
15
|
import { NodeTableName } from './schema.js';
|
|
14
|
-
export interface
|
|
15
|
-
|
|
16
|
-
|
|
16
|
+
export interface StreamedCSVResult {
|
|
17
|
+
nodeFiles: Map<NodeTableName, {
|
|
18
|
+
csvPath: string;
|
|
19
|
+
rows: number;
|
|
20
|
+
}>;
|
|
21
|
+
relCsvPath: string;
|
|
22
|
+
relRows: number;
|
|
17
23
|
}
|
|
18
24
|
/**
|
|
19
|
-
*
|
|
20
|
-
*
|
|
25
|
+
* Stream all CSV data directly to disk files.
|
|
26
|
+
* Iterates graph nodes exactly ONCE — routes each node to the right writer.
|
|
27
|
+
* File contents are lazy-read from disk with a generous LRU cache.
|
|
21
28
|
*/
|
|
22
|
-
export declare const
|
|
29
|
+
export declare const streamAllCSVsToDisk: (graph: KnowledgeGraph, repoPath: string, csvDir: string) => Promise<StreamedCSVResult>;
|