gitnexus 1.6.2-rc.1 → 1.6.2-rc.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -67,8 +67,8 @@ const queryEmbeddableNodes = async (executeQuery) => {
|
|
|
67
67
|
* that occurs when UPDATEing nodes with large content fields
|
|
68
68
|
*/
|
|
69
69
|
const batchInsertEmbeddings = async (executeWithReusedStatement, updates) => {
|
|
70
|
-
//
|
|
71
|
-
const cypher = `
|
|
70
|
+
// MERGE instead of CREATE — idempotent, handles concurrent analyzes and partial prior runs
|
|
71
|
+
const cypher = `MERGE (e:CodeEmbedding {nodeId: $nodeId}) SET e.embedding = $embedding`;
|
|
72
72
|
const paramsList = updates.map((u) => ({ nodeId: u.id, embedding: u.embedding }));
|
|
73
73
|
await executeWithReusedStatement(cypher, paramsList);
|
|
74
74
|
};
|
|
@@ -246,14 +246,17 @@ export const streamAllCSVsToDisk = async (graph, repoPath, csvDir) => {
|
|
|
246
246
|
Interface: interfaceWriter,
|
|
247
247
|
CodeElement: codeElemWriter,
|
|
248
248
|
};
|
|
249
|
-
|
|
249
|
+
// Deduplicate all node types — the pipeline can produce duplicate IDs across
|
|
250
|
+
// all symbol types (Class, Method, Function, etc.), not just File nodes.
|
|
251
|
+
// A single Set covering every label prevents PK violations on COPY.
|
|
252
|
+
const seenNodeIds = new Set();
|
|
250
253
|
// --- SINGLE PASS over all nodes ---
|
|
251
254
|
for (const node of graph.iterNodes()) {
|
|
255
|
+
if (seenNodeIds.has(node.id))
|
|
256
|
+
continue;
|
|
257
|
+
seenNodeIds.add(node.id);
|
|
252
258
|
switch (node.label) {
|
|
253
259
|
case 'File': {
|
|
254
|
-
if (seenFileIds.has(node.id))
|
|
255
|
-
break;
|
|
256
|
-
seenFileIds.add(node.id);
|
|
257
260
|
const content = await extractContent(node, contentCache);
|
|
258
261
|
await fileWriter.addRow([
|
|
259
262
|
escapeCSVField(node.id),
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import fs from 'fs/promises';
|
|
2
2
|
import { createReadStream, createWriteStream } from 'fs';
|
|
3
3
|
import { createInterface } from 'readline';
|
|
4
|
+
import { once } from 'events';
|
|
5
|
+
import { finished } from 'stream/promises';
|
|
4
6
|
import path from 'path';
|
|
5
7
|
import lbug from '@ladybugdb/core';
|
|
6
8
|
import { NODE_TABLES, REL_TABLE_NAME, SCHEMA_QUERIES, EMBEDDING_TABLE_NAME, } from './schema.js';
|
|
@@ -25,100 +27,82 @@ export const splitRelCsvByLabelPair = async (csvPath, csvDir, validTables, getNo
|
|
|
25
27
|
const pairWriteStreams = new Map();
|
|
26
28
|
let skippedRels = 0;
|
|
27
29
|
let totalValidRels = 0;
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
}
|
|
47
|
-
catch { }
|
|
48
|
-
try {
|
|
49
|
-
inputStream.destroy();
|
|
50
|
-
}
|
|
51
|
-
catch { }
|
|
52
|
-
for (const ws of pairWriteStreams.values()) {
|
|
53
|
-
try {
|
|
54
|
-
ws.destroy();
|
|
55
|
-
}
|
|
56
|
-
catch { }
|
|
57
|
-
}
|
|
58
|
-
reject(err);
|
|
59
|
-
};
|
|
30
|
+
const inputStream = createReadStream(csvPath, 'utf-8');
|
|
31
|
+
const rl = createInterface({ input: inputStream, crlfDelay: Infinity });
|
|
32
|
+
// If any pair WriteStream errors (disk full, EMFILE, etc.) or the input
|
|
33
|
+
// stream fails, we need to abort the pending `once(ws, 'drain')` await.
|
|
34
|
+
// An AbortController gives us one signal to cancel all pending waits
|
|
35
|
+
// without a custom state machine.
|
|
36
|
+
const abortOnError = new AbortController();
|
|
37
|
+
let streamError = null;
|
|
38
|
+
const markStreamError = (err) => {
|
|
39
|
+
streamError ??= err;
|
|
40
|
+
abortOnError.abort(err);
|
|
41
|
+
};
|
|
42
|
+
try {
|
|
43
|
+
// `for await (const line of rl)` replaces the old manual
|
|
44
|
+
// on('line')/pause()/resume()/waitingForDrain state machine: readline's
|
|
45
|
+
// async iterator naturally serializes line delivery with our awaits, so
|
|
46
|
+
// at most one ws can be in backpressure at a time and we just await its
|
|
47
|
+
// 'drain' event.
|
|
60
48
|
let isFirst = true;
|
|
61
|
-
|
|
49
|
+
for await (const line of rl) {
|
|
50
|
+
if (streamError)
|
|
51
|
+
throw streamError;
|
|
62
52
|
if (isFirst) {
|
|
63
53
|
relHeader = line;
|
|
64
54
|
isFirst = false;
|
|
65
|
-
|
|
55
|
+
continue;
|
|
66
56
|
}
|
|
67
57
|
if (!line.trim())
|
|
68
|
-
|
|
58
|
+
continue;
|
|
69
59
|
const match = line.match(/"([^"]*)","([^"]*)"/);
|
|
70
60
|
if (!match) {
|
|
71
61
|
skippedRels++;
|
|
72
|
-
|
|
62
|
+
continue;
|
|
73
63
|
}
|
|
74
64
|
const fromLabel = getNodeLabel(match[1]);
|
|
75
65
|
const toLabel = getNodeLabel(match[2]);
|
|
76
66
|
if (!validTables.has(fromLabel) || !validTables.has(toLabel)) {
|
|
77
67
|
skippedRels++;
|
|
78
|
-
|
|
68
|
+
continue;
|
|
79
69
|
}
|
|
80
70
|
const pairKey = `${fromLabel}|${toLabel}`;
|
|
81
71
|
let ws = pairWriteStreams.get(pairKey);
|
|
82
72
|
if (!ws) {
|
|
83
73
|
const pairCsvPath = path.join(csvDir, `rel_${fromLabel}_${toLabel}.csv`);
|
|
84
74
|
ws = wsFactory(pairCsvPath);
|
|
85
|
-
|
|
86
|
-
// tear down everything and reject the Promise. Without this handler,
|
|
87
|
-
// a stream error while rl is paused waiting for drain would cause
|
|
88
|
-
// the drain callback to never fire and the Promise to hang forever.
|
|
89
|
-
ws.on('error', cleanup);
|
|
90
|
-
ws.write(relHeader + '\n');
|
|
75
|
+
ws.on('error', markStreamError);
|
|
91
76
|
pairWriteStreams.set(pairKey, ws);
|
|
92
77
|
relsByPairMeta.set(pairKey, { csvPath: pairCsvPath, rows: 0 });
|
|
78
|
+
if (!ws.write(relHeader + '\n')) {
|
|
79
|
+
await once(ws, 'drain', { signal: abortOnError.signal });
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
if (!ws.write(line + '\n')) {
|
|
83
|
+
await once(ws, 'drain', { signal: abortOnError.signal });
|
|
93
84
|
}
|
|
94
|
-
const ok = ws.write(line + '\n');
|
|
95
85
|
relsByPairMeta.get(pairKey).rows++;
|
|
96
86
|
totalValidRels++;
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
settled = true;
|
|
117
|
-
resolve();
|
|
118
|
-
}
|
|
119
|
-
});
|
|
120
|
-
rl.on('error', cleanup);
|
|
121
|
-
});
|
|
87
|
+
}
|
|
88
|
+
if (streamError)
|
|
89
|
+
throw streamError;
|
|
90
|
+
}
|
|
91
|
+
catch (err) {
|
|
92
|
+
// Tear down everything so no fd is left dangling. If the abort was caused
|
|
93
|
+
// by a stream error, rethrow that error (more actionable than AbortError).
|
|
94
|
+
for (const ws of pairWriteStreams.values())
|
|
95
|
+
ws.destroy();
|
|
96
|
+
inputStream.destroy();
|
|
97
|
+
throw streamError ?? err;
|
|
98
|
+
}
|
|
99
|
+
finally {
|
|
100
|
+
// Readline 'close' fires before the underlying fs.ReadStream releases its
|
|
101
|
+
// fd — on Windows that race caused ENOTEMPTY on the parent dir.
|
|
102
|
+
// stream/promises.finished is the stdlib "wait until this stream is fully
|
|
103
|
+
// closed" primitive and handles both success and error paths.
|
|
104
|
+
await finished(inputStream).catch(() => { });
|
|
105
|
+
}
|
|
122
106
|
return { relHeader, relsByPairMeta, pairWriteStreams, skippedRels, totalValidRels };
|
|
123
107
|
};
|
|
124
108
|
let db = null;
|
|
@@ -332,15 +316,13 @@ export const loadGraphToLbug = async (graph, repoPath, storagePath, onProgress)
|
|
|
332
316
|
}
|
|
333
317
|
// Bulk COPY relationships — split by FROM→TO label pair (LadybugDB requires it)
|
|
334
318
|
const { relHeader, relsByPairMeta, pairWriteStreams, skippedRels, totalValidRels } = await splitRelCsvByLabelPair(csvResult.relCsvPath, csvDir, validTables, getNodeLabel);
|
|
335
|
-
// Close all per-pair write streams before COPY
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
ws.end(
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
});
|
|
343
|
-
})));
|
|
319
|
+
// Close all per-pair write streams before COPY. `stream/promises.finished`
|
|
320
|
+
// resolves on the stream's 'finish' event and rejects on 'error' — replaces
|
|
321
|
+
// a hand-rolled promisification with the stdlib primitive.
|
|
322
|
+
await Promise.all(Array.from(pairWriteStreams.values()).map(async (ws) => {
|
|
323
|
+
ws.end();
|
|
324
|
+
await finished(ws);
|
|
325
|
+
}));
|
|
344
326
|
const insertedRels = totalValidRels;
|
|
345
327
|
const warnings = [];
|
|
346
328
|
if (insertedRels > 0) {
|
package/dist/core/run-analyze.js
CHANGED
|
@@ -149,7 +149,7 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
|
|
|
149
149
|
const batch = cachedEmbeddings.slice(i, i + EMBED_BATCH);
|
|
150
150
|
const paramsList = batch.map((e) => ({ nodeId: e.nodeId, embedding: e.embedding }));
|
|
151
151
|
try {
|
|
152
|
-
await executeWithReusedStatement(`
|
|
152
|
+
await executeWithReusedStatement(`MERGE (e:CodeEmbedding {nodeId: $nodeId}) SET e.embedding = $embedding`, paramsList);
|
|
153
153
|
}
|
|
154
154
|
catch {
|
|
155
155
|
/* some may fail if node was removed, that's fine */
|
package/dist/server/api.js
CHANGED
|
@@ -1277,6 +1277,26 @@ export const createServer = async (port, host = '127.0.0.1') => {
|
|
|
1277
1277
|
const lbugPath = path.join(entry.storagePath, 'lbug');
|
|
1278
1278
|
await withLbugDb(lbugPath, async () => {
|
|
1279
1279
|
const { runEmbeddingPipeline } = await import('../core/embeddings/embedding-pipeline.js');
|
|
1280
|
+
// Skip nodes that already have embeddings — Kuzu forbids SET on vector-indexed properties.
|
|
1281
|
+
let skipNodeIds;
|
|
1282
|
+
try {
|
|
1283
|
+
const rows = await executeQuery('MATCH (e:CodeEmbedding) RETURN e.nodeId AS nodeId');
|
|
1284
|
+
if (rows && rows.length > 0) {
|
|
1285
|
+
skipNodeIds = new Set(rows.map((r) => r.nodeId ?? r[0]).filter(Boolean));
|
|
1286
|
+
console.log(`[embed] ${skipNodeIds.size} nodes already embedded — skipping in incremental run`);
|
|
1287
|
+
}
|
|
1288
|
+
}
|
|
1289
|
+
catch (err) {
|
|
1290
|
+
// Swallow only "table does not exist" — let real connection errors propagate.
|
|
1291
|
+
// Log so ops can see this path fire if Kuzu ever changes error wording.
|
|
1292
|
+
const msg = err?.message ?? '';
|
|
1293
|
+
if (msg.includes('does not exist') || msg.includes('not found')) {
|
|
1294
|
+
console.log(`[embed] CodeEmbedding table not yet present — full embedding run (${msg})`);
|
|
1295
|
+
}
|
|
1296
|
+
else {
|
|
1297
|
+
throw err;
|
|
1298
|
+
}
|
|
1299
|
+
}
|
|
1280
1300
|
await runEmbeddingPipeline(executeQuery, executeWithReusedStatement, (p) => {
|
|
1281
1301
|
embedJobManager.updateJob(job.id, {
|
|
1282
1302
|
progress: {
|
|
@@ -1293,7 +1313,8 @@ export const createServer = async (port, host = '127.0.0.1') => {
|
|
|
1293
1313
|
: `${p.phase} (${p.percent}%)`,
|
|
1294
1314
|
},
|
|
1295
1315
|
});
|
|
1296
|
-
})
|
|
1316
|
+
}, {}, // config: use defaults (runEmbeddingPipeline signature: executeQuery, executeWithReusedStatement, onProgress, config, skipNodeIds)
|
|
1317
|
+
skipNodeIds);
|
|
1297
1318
|
});
|
|
1298
1319
|
clearTimeout(embedTimeout);
|
|
1299
1320
|
releaseRepoLock(repoLockPath);
|
package/package.json
CHANGED