gitnexus 1.6.1 → 1.6.2-rc.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +73 -0
- package/dist/cli/analyze.js +23 -1
- package/dist/core/embeddings/embedder.js +5 -0
- package/dist/core/embeddings/embedding-pipeline.d.ts +12 -3
- package/dist/core/embeddings/embedding-pipeline.js +79 -29
- package/dist/core/group/extractors/grpc-extractor.d.ts +1 -1
- package/dist/core/group/extractors/grpc-extractor.js +28 -13
- package/dist/core/group/extractors/http-route-extractor.js +35 -5
- package/dist/core/group/extractors/manifest-extractor.js +66 -9
- package/dist/core/group/sync.js +49 -1
- package/dist/core/ingestion/language-provider.d.ts +24 -5
- package/dist/core/ingestion/languages/c-cpp.js +2 -2
- package/dist/core/ingestion/languages/dart.d.ts +1 -1
- package/dist/core/ingestion/languages/dart.js +2 -2
- package/dist/core/ingestion/languages/go.d.ts +1 -1
- package/dist/core/ingestion/languages/go.js +2 -2
- package/dist/core/ingestion/languages/ruby.js +1 -1
- package/dist/core/ingestion/languages/swift.d.ts +1 -1
- package/dist/core/ingestion/languages/swift.js +2 -2
- package/dist/core/ingestion/pipeline-phases/wildcard-synthesis.d.ts +36 -1
- package/dist/core/ingestion/pipeline-phases/wildcard-synthesis.js +143 -5
- package/dist/core/lbug/csv-generator.js +7 -4
- package/dist/core/lbug/lbug-adapter.d.ts +38 -0
- package/dist/core/lbug/lbug-adapter.js +189 -65
- package/dist/core/lbug/schema.d.ts +7 -0
- package/dist/core/lbug/schema.js +9 -1
- package/dist/core/run-analyze.js +18 -4
- package/dist/mcp/core/embedder.js +5 -0
- package/dist/server/api.js +9 -1
- package/package.json +6 -4
- package/scripts/build-tree-sitter-proto.cjs +82 -0
- package/vendor/node_modules/node-addon-api/node_addon_api.Makefile +6 -0
- package/vendor/node_modules/node-addon-api/node_addon_api.target.mk +104 -0
- package/vendor/node_modules/node-addon-api/node_addon_api_except.target.mk +108 -0
- package/vendor/node_modules/node-addon-api/node_addon_api_except_all.target.mk +104 -0
- package/vendor/node_modules/node-addon-api/node_addon_api_maybe.target.mk +104 -0
- package/vendor/tree-sitter-proto/package.json +1 -7
|
@@ -1,15 +1,123 @@
|
|
|
1
1
|
import fs from 'fs/promises';
|
|
2
2
|
import { createReadStream, createWriteStream } from 'fs';
|
|
3
3
|
import { createInterface } from 'readline';
|
|
4
|
+
import { once } from 'events';
|
|
5
|
+
import { finished } from 'stream/promises';
|
|
4
6
|
import path from 'path';
|
|
5
7
|
import lbug from '@ladybugdb/core';
|
|
6
|
-
import { NODE_TABLES, REL_TABLE_NAME, SCHEMA_QUERIES, EMBEDDING_TABLE_NAME, } from './schema.js';
|
|
8
|
+
import { NODE_TABLES, REL_TABLE_NAME, SCHEMA_QUERIES, EMBEDDING_TABLE_NAME, STALE_HASH_SENTINEL, } from './schema.js';
|
|
7
9
|
import { streamAllCSVsToDisk } from './csv-generator.js';
|
|
10
|
+
/**
|
|
11
|
+
* Split a relationship CSV into per-label-pair files on disk.
|
|
12
|
+
*
|
|
13
|
+
* Streams the CSV line-by-line, routing each relationship to a file named
|
|
14
|
+
* `rel_{fromLabel}_{toLabel}.csv`. Handles backpressure correctly: lines are
|
|
15
|
+
* read with `for await`, so when a write returns false the loop awaits that
|
|
16
|
+
* stream's 'drain' event before the next line is processed.
|
|
17
|
+
*
|
|
18
|
+
* @param csvPath Path to the combined relationship CSV
|
|
19
|
+
* @param csvDir Directory to write per-pair CSV files
|
|
20
|
+
* @param validTables Set of valid node table names
|
|
21
|
+
* @param getNodeLabel Function to extract the label from a node ID
|
|
22
|
+
* @param wsFactory Optional WriteStream factory (defaults to fs.createWriteStream)
|
|
23
|
+
*/
|
|
24
|
+
export const splitRelCsvByLabelPair = async (csvPath, csvDir, validTables, getNodeLabel, wsFactory = (p) => createWriteStream(p, 'utf-8')) => {
|
|
25
|
+
let relHeader = '';
|
|
26
|
+
const relsByPairMeta = new Map();
|
|
27
|
+
const pairWriteStreams = new Map();
|
|
28
|
+
let skippedRels = 0;
|
|
29
|
+
let totalValidRels = 0;
|
|
30
|
+
const inputStream = createReadStream(csvPath, 'utf-8');
|
|
31
|
+
const rl = createInterface({ input: inputStream, crlfDelay: Infinity });
|
|
32
|
+
// If any pair WriteStream errors (disk full, EMFILE, etc.) or the input
|
|
33
|
+
// stream fails, we need to abort the pending `once(ws, 'drain')` await.
|
|
34
|
+
// An AbortController gives us one signal to cancel all pending waits
|
|
35
|
+
// without a custom state machine. NOTE(review): only pair WriteStreams are wired to markStreamError; inputStream is not — confirm.
|
|
36
|
+
const abortOnError = new AbortController();
|
|
37
|
+
let streamError = null;
|
|
38
|
+
const markStreamError = (err) => {
|
|
39
|
+
streamError ??= err;
|
|
40
|
+
abortOnError.abort(err);
|
|
41
|
+
};
|
|
42
|
+
try {
|
|
43
|
+
// `for await (const line of rl)` replaces the old manual
|
|
44
|
+
// on('line')/pause()/resume()/waitingForDrain state machine: readline's
|
|
45
|
+
// async iterator naturally serializes line delivery with our awaits, so
|
|
46
|
+
// at most one ws can be in backpressure at a time and we just await its
|
|
47
|
+
// 'drain' event.
|
|
48
|
+
let isFirst = true;
|
|
49
|
+
for await (const line of rl) {
|
|
50
|
+
if (streamError)
|
|
51
|
+
throw streamError;
|
|
52
|
+
if (isFirst) {
|
|
53
|
+
relHeader = line;
|
|
54
|
+
isFirst = false;
|
|
55
|
+
continue;
|
|
56
|
+
}
|
|
57
|
+
if (!line.trim())
|
|
58
|
+
continue;
|
|
59
|
+
const match = line.match(/"([^"]*)","([^"]*)"/);
|
|
60
|
+
if (!match) {
|
|
61
|
+
skippedRels++;
|
|
62
|
+
continue;
|
|
63
|
+
}
|
|
64
|
+
const fromLabel = getNodeLabel(match[1]);
|
|
65
|
+
const toLabel = getNodeLabel(match[2]);
|
|
66
|
+
if (!validTables.has(fromLabel) || !validTables.has(toLabel)) {
|
|
67
|
+
skippedRels++;
|
|
68
|
+
continue;
|
|
69
|
+
}
|
|
70
|
+
const pairKey = `${fromLabel}|${toLabel}`;
|
|
71
|
+
let ws = pairWriteStreams.get(pairKey);
|
|
72
|
+
if (!ws) {
|
|
73
|
+
const pairCsvPath = path.join(csvDir, `rel_${fromLabel}_${toLabel}.csv`);
|
|
74
|
+
ws = wsFactory(pairCsvPath);
|
|
75
|
+
ws.on('error', markStreamError);
|
|
76
|
+
pairWriteStreams.set(pairKey, ws);
|
|
77
|
+
relsByPairMeta.set(pairKey, { csvPath: pairCsvPath, rows: 0 });
|
|
78
|
+
if (!ws.write(relHeader + '\n')) {
|
|
79
|
+
await once(ws, 'drain', { signal: abortOnError.signal });
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
if (!ws.write(line + '\n')) {
|
|
83
|
+
await once(ws, 'drain', { signal: abortOnError.signal });
|
|
84
|
+
}
|
|
85
|
+
relsByPairMeta.get(pairKey).rows++;
|
|
86
|
+
totalValidRels++;
|
|
87
|
+
}
|
|
88
|
+
if (streamError)
|
|
89
|
+
throw streamError;
|
|
90
|
+
}
|
|
91
|
+
catch (err) {
|
|
92
|
+
// Tear down everything so no fd is left dangling. If the abort was caused
|
|
93
|
+
// by a stream error, rethrow that error (more actionable than AbortError).
|
|
94
|
+
for (const ws of pairWriteStreams.values())
|
|
95
|
+
ws.destroy();
|
|
96
|
+
inputStream.destroy();
|
|
97
|
+
throw streamError ?? err;
|
|
98
|
+
}
|
|
99
|
+
finally {
|
|
100
|
+
// Readline 'close' fires before the underlying fs.ReadStream releases its
|
|
101
|
+
// fd — on Windows that race caused ENOTEMPTY on the parent dir.
|
|
102
|
+
// stream/promises.finished is the stdlib "wait until this stream is fully
|
|
103
|
+
// closed" primitive and handles both success and error paths.
|
|
104
|
+
await finished(inputStream).catch(() => { });
|
|
105
|
+
}
|
|
106
|
+
return { relHeader, relsByPairMeta, pairWriteStreams, skippedRels, totalValidRels };
|
|
107
|
+
};
|
|
8
108
|
let db = null;
|
|
9
109
|
let conn = null;
|
|
10
110
|
let currentDbPath = null;
|
|
11
111
|
let ftsLoaded = false;
|
|
12
112
|
let vectorExtensionLoaded = false;
|
|
113
|
+
/**
|
|
114
|
+
* Check if an error indicates a missing column or table (schema-level problem)
|
|
115
|
+
* rather than a transient/connection error. Used for legacy DB fallback logic.
|
|
116
|
+
*/
|
|
117
|
+
const isMissingColumnOrTableError = (msg) => msg.includes('does not exist') ||
|
|
118
|
+
// Kuzu-specific: "(table|column|property) ... not found" — narrow enough to avoid
|
|
119
|
+
// matching transient errors like "connection not found" or "key not found".
|
|
120
|
+
/(table|column|property).*not found/i.test(msg);
|
|
13
121
|
/** Expose the current Database for pool adapter reuse in tests. */
|
|
14
122
|
export const getDatabase = () => db;
|
|
15
123
|
// Global session lock for operations that touch module-level lbug globals.
|
|
@@ -215,69 +323,14 @@ export const loadGraphToLbug = async (graph, repoPath, storagePath, onProgress)
|
|
|
215
323
|
}
|
|
216
324
|
}
|
|
217
325
|
// Bulk COPY relationships — split by FROM→TO label pair (LadybugDB requires it)
|
|
218
|
-
|
|
219
|
-
//
|
|
220
|
-
//
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
await new Promise((resolve, reject) => {
|
|
227
|
-
const rl = createInterface({
|
|
228
|
-
input: createReadStream(csvResult.relCsvPath, 'utf-8'),
|
|
229
|
-
crlfDelay: Infinity,
|
|
230
|
-
});
|
|
231
|
-
let isFirst = true;
|
|
232
|
-
rl.on('line', (line) => {
|
|
233
|
-
if (isFirst) {
|
|
234
|
-
relHeader = line;
|
|
235
|
-
isFirst = false;
|
|
236
|
-
return;
|
|
237
|
-
}
|
|
238
|
-
if (!line.trim())
|
|
239
|
-
return;
|
|
240
|
-
const match = line.match(/"([^"]*)","([^"]*)"/);
|
|
241
|
-
if (!match) {
|
|
242
|
-
skippedRels++;
|
|
243
|
-
return;
|
|
244
|
-
}
|
|
245
|
-
const fromLabel = getNodeLabel(match[1]);
|
|
246
|
-
const toLabel = getNodeLabel(match[2]);
|
|
247
|
-
if (!validTables.has(fromLabel) || !validTables.has(toLabel)) {
|
|
248
|
-
skippedRels++;
|
|
249
|
-
return;
|
|
250
|
-
}
|
|
251
|
-
const pairKey = `${fromLabel}|${toLabel}`;
|
|
252
|
-
let ws = pairWriteStreams.get(pairKey);
|
|
253
|
-
if (!ws) {
|
|
254
|
-
const pairCsvPath = path.join(csvDir, `rel_${fromLabel}_${toLabel}.csv`);
|
|
255
|
-
ws = createWriteStream(pairCsvPath, 'utf-8');
|
|
256
|
-
ws.write(relHeader + '\n');
|
|
257
|
-
pairWriteStreams.set(pairKey, ws);
|
|
258
|
-
relsByPairMeta.set(pairKey, { csvPath: pairCsvPath, rows: 0 });
|
|
259
|
-
}
|
|
260
|
-
const ok = ws.write(line + '\n');
|
|
261
|
-
relsByPairMeta.get(pairKey).rows++;
|
|
262
|
-
totalValidRels++;
|
|
263
|
-
// Handle backpressure: pause reading when the write buffer is full,
|
|
264
|
-
// resume when the stream drains. Prevents unbounded memory growth
|
|
265
|
-
// on repos with millions of relationships.
|
|
266
|
-
if (!ok) {
|
|
267
|
-
rl.pause();
|
|
268
|
-
ws.once('drain', () => rl.resume());
|
|
269
|
-
}
|
|
270
|
-
});
|
|
271
|
-
rl.on('close', resolve);
|
|
272
|
-
rl.on('error', (err) => {
|
|
273
|
-
// Destroy all open write streams to avoid resource leaks
|
|
274
|
-
for (const ws of pairWriteStreams.values())
|
|
275
|
-
ws.destroy();
|
|
276
|
-
reject(err);
|
|
277
|
-
});
|
|
278
|
-
});
|
|
279
|
-
// Close all per-pair write streams before COPY
|
|
280
|
-
await Promise.all(Array.from(pairWriteStreams.values()).map((ws) => new Promise((resolve, reject) => ws.end((err) => (err ? reject(err) : resolve())))));
|
|
326
|
+
const { relHeader, relsByPairMeta, pairWriteStreams, skippedRels, totalValidRels } = await splitRelCsvByLabelPair(csvResult.relCsvPath, csvDir, validTables, getNodeLabel);
|
|
327
|
+
// Close all per-pair write streams before COPY. `stream/promises.finished`
|
|
328
|
+
// resolves on the stream's 'finish' event and rejects on 'error' — replaces
|
|
329
|
+
// a hand-rolled promisification with the stdlib primitive.
|
|
330
|
+
await Promise.all(Array.from(pairWriteStreams.values()).map(async (ws) => {
|
|
331
|
+
ws.end();
|
|
332
|
+
await finished(ws);
|
|
333
|
+
}));
|
|
281
334
|
const insertedRels = totalValidRels;
|
|
282
335
|
const warnings = [];
|
|
283
336
|
if (insertedRels > 0) {
|
|
@@ -746,7 +799,24 @@ export const loadCachedEmbeddings = async () => {
|
|
|
746
799
|
const embeddingNodeIds = new Set();
|
|
747
800
|
const embeddings = [];
|
|
748
801
|
try {
|
|
749
|
-
|
|
802
|
+
// Try to read contentHash alongside the embedding
|
|
803
|
+
let rows;
|
|
804
|
+
let hasContentHash = true;
|
|
805
|
+
try {
|
|
806
|
+
rows = await conn.query(`MATCH (e:${EMBEDDING_TABLE_NAME}) RETURN e.nodeId AS nodeId, e.embedding AS embedding, e.contentHash AS contentHash`);
|
|
807
|
+
}
|
|
808
|
+
catch (err) {
|
|
809
|
+
// Only fall back for missing-column errors (legacy DBs without contentHash).
|
|
810
|
+
// Rethrow transient / connection errors so callers see them.
|
|
811
|
+
const msg = err?.message ?? '';
|
|
812
|
+
if (isMissingColumnOrTableError(msg)) {
|
|
813
|
+
hasContentHash = false;
|
|
814
|
+
rows = await conn.query(`MATCH (e:${EMBEDDING_TABLE_NAME}) RETURN e.nodeId AS nodeId, e.embedding AS embedding`);
|
|
815
|
+
}
|
|
816
|
+
else {
|
|
817
|
+
throw err;
|
|
818
|
+
}
|
|
819
|
+
}
|
|
750
820
|
const result = Array.isArray(rows) ? rows[0] : rows;
|
|
751
821
|
for (const row of await result.getAll()) {
|
|
752
822
|
const nodeId = String(row.nodeId ?? row[0] ?? '');
|
|
@@ -760,6 +830,7 @@ export const loadCachedEmbeddings = async () => {
|
|
|
760
830
|
embedding: Array.isArray(embedding)
|
|
761
831
|
? embedding.map(Number)
|
|
762
832
|
: Array.from(embedding).map(Number),
|
|
833
|
+
contentHash: hasContentHash ? (row.contentHash ?? row[2] ?? undefined) : undefined,
|
|
763
834
|
});
|
|
764
835
|
}
|
|
765
836
|
}
|
|
@@ -769,6 +840,59 @@ export const loadCachedEmbeddings = async () => {
|
|
|
769
840
|
}
|
|
770
841
|
return { embeddingNodeIds, embeddings };
|
|
771
842
|
};
|
|
843
|
+
/**
|
|
844
|
+
* Fetch existing embedding hashes from CodeEmbedding table for incremental embedding.
|
|
845
|
+
* Returns a Map<nodeId, contentHash> suitable for passing to `runEmbeddingPipeline`.
|
|
846
|
+
* Handles legacy DBs without the `contentHash` column (all rows treated as stale with empty hash).
|
|
847
|
+
* Returns undefined if the CodeEmbedding table does not exist or contains no rows.
|
|
848
|
+
*
|
|
849
|
+
* @param execQuery - Cypher query executor (typically pool-adapter's `executeQuery`)
|
|
850
|
+
*/
|
|
851
|
+
export const fetchExistingEmbeddingHashes = async (execQuery) => {
|
|
852
|
+
try {
|
|
853
|
+
const rows = await execQuery(`MATCH (e:${EMBEDDING_TABLE_NAME}) RETURN e.nodeId AS nodeId, e.contentHash AS contentHash`);
|
|
854
|
+
if (!rows || rows.length === 0)
|
|
855
|
+
return undefined;
|
|
856
|
+
const map = new Map();
|
|
857
|
+
for (const r of rows) {
|
|
858
|
+
const nodeId = r.nodeId ?? r[0];
|
|
859
|
+
const hash = r.contentHash ?? r[1] ?? STALE_HASH_SENTINEL;
|
|
860
|
+
if (nodeId) {
|
|
861
|
+
// Empty/null contentHash means legacy row — treat as stale so it gets re-embedded
|
|
862
|
+
map.set(nodeId, hash || STALE_HASH_SENTINEL);
|
|
863
|
+
}
|
|
864
|
+
}
|
|
865
|
+
return map;
|
|
866
|
+
}
|
|
867
|
+
catch (err) {
|
|
868
|
+
const msg = err?.message ?? '';
|
|
869
|
+
if (isMissingColumnOrTableError(msg)) {
|
|
870
|
+
// Column or table missing — try fallback without contentHash
|
|
871
|
+
try {
|
|
872
|
+
const rows = await execQuery(`MATCH (e:${EMBEDDING_TABLE_NAME}) RETURN e.nodeId AS nodeId`);
|
|
873
|
+
if (!rows || rows.length === 0)
|
|
874
|
+
return undefined;
|
|
875
|
+
const map = new Map();
|
|
876
|
+
for (const r of rows) {
|
|
877
|
+
const nodeId = r.nodeId ?? r[0];
|
|
878
|
+
if (nodeId)
|
|
879
|
+
map.set(nodeId, STALE_HASH_SENTINEL); // no contentHash — treat as stale
|
|
880
|
+
}
|
|
881
|
+
console.log(`[embed] ${map.size} nodes in legacy DB (no contentHash) — all treated as stale`);
|
|
882
|
+
return map;
|
|
883
|
+
}
|
|
884
|
+
catch (fallbackErr) {
|
|
885
|
+
const fallbackMsg = fallbackErr?.message ?? '';
|
|
886
|
+
if (isMissingColumnOrTableError(fallbackMsg)) {
|
|
887
|
+
console.log(`[embed] CodeEmbedding table not yet present — full embedding run (${fallbackMsg})`);
|
|
888
|
+
return undefined;
|
|
889
|
+
}
|
|
890
|
+
throw fallbackErr;
|
|
891
|
+
}
|
|
892
|
+
}
|
|
893
|
+
throw err;
|
|
894
|
+
}
|
|
895
|
+
};
|
|
772
896
|
export const closeLbug = async () => {
|
|
773
897
|
if (conn) {
|
|
774
898
|
try {
|
|
@@ -43,6 +43,13 @@ export declare const TOOL_SCHEMA = "\nCREATE NODE TABLE Tool (\n id STRING,\n
|
|
|
43
43
|
export declare const SECTION_SCHEMA = "\nCREATE NODE TABLE Section (\n id STRING,\n name STRING,\n filePath STRING,\n startLine INT64,\n endLine INT64,\n level INT64,\n content STRING,\n description STRING,\n PRIMARY KEY (id)\n)";
|
|
44
44
|
export declare const RELATION_SCHEMA = "\nCREATE REL TABLE CodeRelation (\n FROM File TO File,\n FROM File TO Folder,\n FROM File TO Function,\n FROM File TO Class,\n FROM File TO Interface,\n FROM File TO Method,\n FROM File TO CodeElement,\n FROM File TO `Struct`,\n FROM File TO `Enum`,\n FROM File TO `Macro`,\n FROM File TO `Typedef`,\n FROM File TO `Union`,\n FROM File TO `Namespace`,\n FROM File TO `Trait`,\n FROM File TO `Impl`,\n FROM File TO `TypeAlias`,\n FROM File TO `Const`,\n FROM File TO `Static`,\n FROM File TO `Property`,\n FROM File TO `Record`,\n FROM File TO `Delegate`,\n FROM File TO `Annotation`,\n FROM File TO `Constructor`,\n FROM File TO `Template`,\n FROM File TO `Module`,\n FROM File TO Section,\n FROM Folder TO Folder,\n FROM Folder TO File,\n FROM Function TO Function,\n FROM Function TO Method,\n FROM Function TO Class,\n FROM Function TO Community,\n FROM Function TO `Macro`,\n FROM Function TO `Struct`,\n FROM Function TO `Template`,\n FROM Function TO `Enum`,\n FROM Function TO `Namespace`,\n FROM Function TO `TypeAlias`,\n FROM Function TO `Module`,\n FROM Function TO `Impl`,\n FROM Function TO Interface,\n FROM Function TO `Constructor`,\n FROM Function TO `Const`,\n FROM Function TO `Typedef`,\n FROM Function TO `Union`,\n FROM Function TO `Property`,\n FROM Function TO CodeElement,\n FROM Class TO Method,\n FROM Class TO Function,\n FROM Class TO Class,\n FROM Class TO Interface,\n FROM Class TO Community,\n FROM Class TO `Template`,\n FROM Class TO `TypeAlias`,\n FROM Class TO `Struct`,\n FROM Class TO `Enum`,\n FROM Class TO `Annotation`,\n FROM Class TO `Constructor`,\n FROM Class TO `Trait`,\n FROM Class TO `Macro`,\n FROM Class TO `Impl`,\n FROM Class TO `Union`,\n FROM Class TO `Namespace`,\n FROM Class TO `Typedef`,\n FROM Class TO `Property`,\n FROM Method TO Function,\n FROM Method TO Method,\n FROM Method TO Class,\n FROM Method TO Community,\n FROM Method TO `Template`,\n FROM Method TO `Struct`,\n FROM Method TO 
`TypeAlias`,\n FROM Method TO `Enum`,\n FROM Method TO `Macro`,\n FROM Method TO `Namespace`,\n FROM Method TO `Module`,\n FROM Method TO `Impl`,\n FROM Method TO Interface,\n FROM Method TO `Constructor`,\n FROM Method TO `Property`,\n FROM Method TO CodeElement,\n FROM `Template` TO `Template`,\n FROM `Template` TO Function,\n FROM `Template` TO Method,\n FROM `Template` TO Class,\n FROM `Template` TO `Struct`,\n FROM `Template` TO `TypeAlias`,\n FROM `Template` TO `Enum`,\n FROM `Template` TO `Macro`,\n FROM `Template` TO Interface,\n FROM `Template` TO `Constructor`,\n FROM `Module` TO `Module`,\n FROM Section TO Section,\n FROM Section TO File,\n FROM File TO Route,\n FROM Function TO Route,\n FROM Method TO Route,\n FROM File TO Tool,\n FROM Function TO Tool,\n FROM Method TO Tool,\n FROM CodeElement TO Community,\n FROM Interface TO Community,\n FROM Interface TO Function,\n FROM Interface TO Method,\n FROM Interface TO Class,\n FROM Interface TO Interface,\n FROM Interface TO `TypeAlias`,\n FROM Interface TO `Struct`,\n FROM Interface TO `Constructor`,\n FROM Interface TO `Property`,\n FROM `Struct` TO Community,\n FROM `Struct` TO `Trait`,\n FROM `Struct` TO `Struct`,\n FROM `Struct` TO Class,\n FROM `Struct` TO `Enum`,\n FROM `Struct` TO Function,\n FROM `Struct` TO Method,\n FROM `Struct` TO Interface,\n FROM `Struct` TO `Constructor`,\n FROM `Struct` TO `Property`,\n FROM `Enum` TO `Enum`,\n FROM `Enum` TO Community,\n FROM `Enum` TO Class,\n FROM `Enum` TO Interface,\n FROM `Macro` TO Community,\n FROM `Macro` TO Function,\n FROM `Macro` TO Method,\n FROM `Module` TO Function,\n FROM `Module` TO Method,\n FROM `Typedef` TO Community,\n FROM `Union` TO Community,\n FROM `Namespace` TO Community,\n FROM `Namespace` TO `Struct`,\n FROM `Trait` TO Method,\n FROM `Trait` TO `Constructor`,\n FROM `Trait` TO `Property`,\n FROM `Trait` TO Community,\n FROM `Impl` TO Method,\n FROM `Impl` TO `Constructor`,\n FROM `Impl` TO `Property`,\n FROM `Impl` TO 
Community,\n FROM `Impl` TO `Trait`,\n FROM `Impl` TO `Struct`,\n FROM `Impl` TO `Impl`,\n FROM `TypeAlias` TO Community,\n FROM `TypeAlias` TO `Trait`,\n FROM `TypeAlias` TO Class,\n FROM `Const` TO Community,\n FROM `Static` TO Community,\n FROM `Property` TO Community,\n FROM `Record` TO Method,\n FROM `Record` TO `Constructor`,\n FROM `Record` TO `Property`,\n FROM `Record` TO Community,\n FROM `Delegate` TO Community,\n FROM `Annotation` TO Community,\n FROM `Constructor` TO Community,\n FROM `Constructor` TO Interface,\n FROM `Constructor` TO Class,\n FROM `Constructor` TO Method,\n FROM `Constructor` TO Function,\n FROM `Constructor` TO `Constructor`,\n FROM `Constructor` TO `Struct`,\n FROM `Constructor` TO `Macro`,\n FROM `Constructor` TO `Template`,\n FROM `Constructor` TO `TypeAlias`,\n FROM `Constructor` TO `Enum`,\n FROM `Constructor` TO `Annotation`,\n FROM `Constructor` TO `Impl`,\n FROM `Constructor` TO `Namespace`,\n FROM `Constructor` TO `Module`,\n FROM `Constructor` TO `Property`,\n FROM `Constructor` TO `Typedef`,\n FROM `Template` TO Community,\n FROM `Module` TO Community,\n FROM Function TO Process,\n FROM Method TO Process,\n FROM Class TO Process,\n FROM Interface TO Process,\n FROM `Struct` TO Process,\n FROM `Constructor` TO Process,\n FROM `Module` TO Process,\n FROM `Macro` TO Process,\n FROM `Impl` TO Process,\n FROM `Typedef` TO Process,\n FROM `TypeAlias` TO Process,\n FROM `Enum` TO Process,\n FROM `Union` TO Process,\n FROM `Namespace` TO Process,\n FROM `Trait` TO Process,\n FROM `Const` TO Process,\n FROM `Static` TO Process,\n FROM `Property` TO Process,\n FROM `Record` TO Process,\n FROM `Delegate` TO Process,\n FROM `Annotation` TO Process,\n FROM `Template` TO Process,\n FROM CodeElement TO Process,\n FROM Route TO Process,\n FROM Tool TO Process,\n type STRING,\n confidence DOUBLE,\n reason STRING,\n step INT32\n)";
|
|
45
45
|
export declare const EMBEDDING_DIMS: number;
|
|
46
|
+
/** HNSW vector index name for the CodeEmbedding table. */
|
|
47
|
+
export declare const EMBEDDING_INDEX_NAME = "code_embedding_idx";
|
|
48
|
+
/**
|
|
49
|
+
* Sentinel value for "no content hash available" — used in legacy DBs and null rows.
|
|
50
|
+
* Nodes with this hash are always treated as stale and re-embedded.
|
|
51
|
+
*/
|
|
52
|
+
export declare const STALE_HASH_SENTINEL = "";
|
|
46
53
|
export declare const EMBEDDING_SCHEMA: string;
|
|
47
54
|
/**
|
|
48
55
|
* Create vector index for semantic search
|
package/dist/core/lbug/schema.js
CHANGED
|
@@ -410,10 +410,18 @@ if (Number.isNaN(_rawDims) || _rawDims <= 0) {
|
|
|
410
410
|
throw new Error(`GITNEXUS_EMBEDDING_DIMS must be a positive integer, got "${process.env.GITNEXUS_EMBEDDING_DIMS}"`);
|
|
411
411
|
}
|
|
412
412
|
export const EMBEDDING_DIMS = _rawDims;
|
|
413
|
+
/** HNSW vector index name for the CodeEmbedding table. */
|
|
414
|
+
export const EMBEDDING_INDEX_NAME = 'code_embedding_idx';
|
|
415
|
+
/**
|
|
416
|
+
* Sentinel value for "no content hash available" — used in legacy DBs and null rows.
|
|
417
|
+
* Nodes with this hash are always treated as stale and re-embedded.
|
|
418
|
+
*/
|
|
419
|
+
export const STALE_HASH_SENTINEL = '';
|
|
413
420
|
export const EMBEDDING_SCHEMA = `
|
|
414
421
|
CREATE NODE TABLE ${EMBEDDING_TABLE_NAME} (
|
|
415
422
|
nodeId STRING,
|
|
416
423
|
embedding FLOAT[${EMBEDDING_DIMS}],
|
|
424
|
+
contentHash STRING,
|
|
417
425
|
PRIMARY KEY (nodeId)
|
|
418
426
|
)`;
|
|
419
427
|
/**
|
|
@@ -421,7 +429,7 @@ CREATE NODE TABLE ${EMBEDDING_TABLE_NAME} (
|
|
|
421
429
|
* Uses HNSW (Hierarchical Navigable Small World) algorithm with cosine similarity
|
|
422
430
|
*/
|
|
423
431
|
export const CREATE_VECTOR_INDEX_QUERY = `
|
|
424
|
-
CALL CREATE_VECTOR_INDEX('${EMBEDDING_TABLE_NAME}', '
|
|
432
|
+
CALL CREATE_VECTOR_INDEX('${EMBEDDING_TABLE_NAME}', '${EMBEDDING_INDEX_NAME}', 'embedding', metric := 'cosine')
|
|
425
433
|
`;
|
|
426
434
|
// ============================================================================
|
|
427
435
|
// ALL SCHEMA QUERIES IN ORDER
|
package/dist/core/run-analyze.js
CHANGED
|
@@ -15,6 +15,8 @@ import { initLbug, loadGraphToLbug, getLbugStats, executeQuery, executeWithReuse
|
|
|
15
15
|
import { getStoragePaths, saveMeta, loadMeta, addToGitignore, registerRepo, cleanupOldKuzuFiles, } from '../storage/repo-manager.js';
|
|
16
16
|
import { getCurrentCommit, hasGitDir } from '../storage/git.js';
|
|
17
17
|
import { generateAIContextFiles } from '../cli/ai-context.js';
|
|
18
|
+
import { EMBEDDING_TABLE_NAME } from './lbug/schema.js';
|
|
19
|
+
import { STALE_HASH_SENTINEL } from './lbug/schema.js';
|
|
18
20
|
/** Threshold: auto-skip embeddings for repos with more nodes than this */
|
|
19
21
|
const EMBEDDING_NODE_LIMIT = 50_000;
|
|
20
22
|
export const PHASE_LABELS = {
|
|
@@ -147,9 +149,13 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
|
|
|
147
149
|
const EMBED_BATCH = 200;
|
|
148
150
|
for (let i = 0; i < cachedEmbeddings.length; i += EMBED_BATCH) {
|
|
149
151
|
const batch = cachedEmbeddings.slice(i, i + EMBED_BATCH);
|
|
150
|
-
const paramsList = batch.map((e) => ({
|
|
152
|
+
const paramsList = batch.map((e) => ({
|
|
153
|
+
nodeId: e.nodeId,
|
|
154
|
+
embedding: e.embedding,
|
|
155
|
+
contentHash: e.contentHash ?? STALE_HASH_SENTINEL,
|
|
156
|
+
}));
|
|
151
157
|
try {
|
|
152
|
-
await executeWithReusedStatement(`
|
|
158
|
+
await executeWithReusedStatement(`MERGE (e:${EMBEDDING_TABLE_NAME} {nodeId: $nodeId}) SET e.embedding = $embedding, e.contentHash = $contentHash`, paramsList);
|
|
153
159
|
}
|
|
154
160
|
catch {
|
|
155
161
|
/* some may fail if node was removed, that's fine */
|
|
@@ -170,6 +176,14 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
|
|
|
170
176
|
const httpMode = isHttpMode();
|
|
171
177
|
progress('embeddings', 90, httpMode ? 'Connecting to embedding endpoint...' : 'Loading embedding model...');
|
|
172
178
|
const { runEmbeddingPipeline } = await import('./embeddings/embedding-pipeline.js');
|
|
179
|
+
// Build a Map<nodeId, contentHash> from cached embeddings for incremental mode
|
|
180
|
+
let existingEmbeddings;
|
|
181
|
+
if (cachedEmbeddingNodeIds.size > 0) {
|
|
182
|
+
existingEmbeddings = new Map();
|
|
183
|
+
for (const e of cachedEmbeddings) {
|
|
184
|
+
existingEmbeddings.set(e.nodeId, e.contentHash ?? STALE_HASH_SENTINEL);
|
|
185
|
+
}
|
|
186
|
+
}
|
|
173
187
|
await runEmbeddingPipeline(executeQuery, executeWithReusedStatement, (p) => {
|
|
174
188
|
const scaled = 90 + Math.round((p.percent / 100) * 8);
|
|
175
189
|
const label = p.phase === 'loading-model'
|
|
@@ -178,14 +192,14 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
|
|
|
178
192
|
: 'Loading embedding model...'
|
|
179
193
|
: `Embedding ${p.nodesProcessed || 0}/${p.totalNodes || '?'}`;
|
|
180
194
|
progress('embeddings', scaled, label);
|
|
181
|
-
}, {},
|
|
195
|
+
}, {}, existingEmbeddings);
|
|
182
196
|
}
|
|
183
197
|
// ── Phase 5: Finalize (98–100%) ───────────────────────────────────
|
|
184
198
|
progress('done', 98, 'Saving metadata...');
|
|
185
199
|
// Count embeddings in the index (cached + newly generated)
|
|
186
200
|
let embeddingCount = 0;
|
|
187
201
|
try {
|
|
188
|
-
const embResult = await executeQuery(`MATCH (e
|
|
202
|
+
const embResult = await executeQuery(`MATCH (e:${EMBEDDING_TABLE_NAME}) RETURN count(e) AS cnt`);
|
|
189
203
|
embeddingCount = embResult?.[0]?.cnt ?? 0;
|
|
190
204
|
}
|
|
191
205
|
catch {
|
|
@@ -30,6 +30,11 @@ export const initEmbedder = async () => {
|
|
|
30
30
|
initPromise = (async () => {
|
|
31
31
|
try {
|
|
32
32
|
env.allowLocalModels = false;
|
|
33
|
+
// Default cache to user-writable location. transformers.js defaults to
|
|
34
|
+
// ./node_modules/.cache inside its own install dir, which is unwritable
|
|
35
|
+
// when gitnexus is installed globally (e.g. /usr/lib/node_modules/).
|
|
36
|
+
// Respect HF_HOME if set, otherwise fall back to ~/.cache/huggingface.
|
|
37
|
+
env.cacheDir = process.env.HF_HOME ?? `${process.env.HOME}/.cache/huggingface`;
|
|
33
38
|
console.error('GitNexus: Loading embedding model (first search may take a moment)...');
|
|
34
39
|
// Try GPU first (DirectML on Windows, CUDA on Linux), fall back to CPU
|
|
35
40
|
const isWindows = process.platform === 'win32';
|
package/dist/server/api.js
CHANGED
|
@@ -1277,6 +1277,13 @@ export const createServer = async (port, host = '127.0.0.1') => {
|
|
|
1277
1277
|
const lbugPath = path.join(entry.storagePath, 'lbug');
|
|
1278
1278
|
await withLbugDb(lbugPath, async () => {
|
|
1279
1279
|
const { runEmbeddingPipeline } = await import('../core/embeddings/embedding-pipeline.js');
|
|
1280
|
+
// Fetch existing content hashes for incremental embedding.
|
|
1281
|
+
// Delegated to lbug-adapter which owns the DB query logic and legacy-fallback handling.
|
|
1282
|
+
const { fetchExistingEmbeddingHashes } = await import('../core/lbug/lbug-adapter.js');
|
|
1283
|
+
const existingEmbeddings = await fetchExistingEmbeddingHashes(executeQuery);
|
|
1284
|
+
if (existingEmbeddings && existingEmbeddings.size > 0) {
|
|
1285
|
+
console.log(`[embed] ${existingEmbeddings.size} nodes already embedded — incremental run with content-hash comparison`);
|
|
1286
|
+
}
|
|
1280
1287
|
await runEmbeddingPipeline(executeQuery, executeWithReusedStatement, (p) => {
|
|
1281
1288
|
embedJobManager.updateJob(job.id, {
|
|
1282
1289
|
progress: {
|
|
@@ -1293,7 +1300,8 @@ export const createServer = async (port, host = '127.0.0.1') => {
|
|
|
1293
1300
|
: `${p.phase} (${p.percent}%)`,
|
|
1294
1301
|
},
|
|
1295
1302
|
});
|
|
1296
|
-
}
|
|
1303
|
+
}, {}, // config: use defaults
|
|
1304
|
+
existingEmbeddings);
|
|
1297
1305
|
});
|
|
1298
1306
|
clearTimeout(embedTimeout);
|
|
1299
1307
|
releaseRepoLock(repoLockPath);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "gitnexus",
|
|
3
|
-
"version": "1.6.1",
|
|
3
|
+
"version": "1.6.2-rc.10",
|
|
4
4
|
"description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
|
|
5
5
|
"author": "Abhigyan Patwari",
|
|
6
6
|
"license": "PolyForm-Noncommercial-1.0.0",
|
|
@@ -46,7 +46,7 @@
|
|
|
46
46
|
"test:integration": "vitest run test/integration",
|
|
47
47
|
"test:watch": "vitest",
|
|
48
48
|
"test:coverage": "vitest run --coverage",
|
|
49
|
-
"postinstall": "node scripts/patch-tree-sitter-swift.cjs",
|
|
49
|
+
"postinstall": "node scripts/patch-tree-sitter-swift.cjs && node scripts/build-tree-sitter-proto.cjs",
|
|
50
50
|
"prepare": "node scripts/build.js",
|
|
51
51
|
"prepack": "node scripts/build.js"
|
|
52
52
|
},
|
|
@@ -71,7 +71,7 @@
|
|
|
71
71
|
"pandemonium": "^2.4.0",
|
|
72
72
|
"tree-sitter": "^0.21.1",
|
|
73
73
|
"tree-sitter-c": "0.23.2",
|
|
74
|
-
"tree-sitter-c-sharp": "
|
|
74
|
+
"tree-sitter-c-sharp": "0.23.1",
|
|
75
75
|
"tree-sitter-cpp": "^0.23.4",
|
|
76
76
|
"tree-sitter-go": "^0.23.0",
|
|
77
77
|
"tree-sitter-java": "^0.23.5",
|
|
@@ -84,7 +84,9 @@
|
|
|
84
84
|
"uuid": "^13.0.0"
|
|
85
85
|
},
|
|
86
86
|
"optionalDependencies": {
|
|
87
|
-
"
|
|
87
|
+
"node-addon-api": "^8.0.0",
|
|
88
|
+
"node-gyp-build": "^4.8.0",
|
|
89
|
+
"tree-sitter-dart": "git+https://github.com/UserNobody14/tree-sitter-dart.git#80e23c07b64494f7e21090bb3450223ef0b192f4",
|
|
88
90
|
"tree-sitter-kotlin": "^0.3.8",
|
|
89
91
|
"tree-sitter-proto": "file:./vendor/tree-sitter-proto",
|
|
90
92
|
"tree-sitter-swift": "^0.6.0"
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Build tree-sitter-proto native binding.
|
|
4
|
+
*
|
|
5
|
+
* Why this script exists:
|
|
6
|
+
* tree-sitter-proto is vendored under gitnexus/vendor/tree-sitter-proto/
|
|
7
|
+
* and declared as a `file:` optionalDependency. Previously, the vendored
|
|
8
|
+
* package had its own `dependencies` and `install` script, which caused
|
|
9
|
+
* npm to create `vendor/tree-sitter-proto/node_modules/` and
|
|
10
|
+
* `vendor/tree-sitter-proto/build/` during install. Those directories
|
|
11
|
+
* blocked `rmdir` on global-install upgrade, producing:
|
|
12
|
+
*
|
|
13
|
+
* ENOTEMPTY: directory not empty, rmdir
|
|
14
|
+
* '.../gitnexus/vendor/tree-sitter-proto/node_modules/node-addon-api'
|
|
15
|
+
*
|
|
16
|
+
* (See https://github.com/abhigyanpatwari/GitNexus/issues/836.)
|
|
17
|
+
*
|
|
18
|
+
* We stripped `dependencies` and the `install` script from the vendored
|
|
19
|
+
* package.json, hoisted `node-addon-api` and `node-gyp-build` into
|
|
20
|
+
* gitnexus's own optionalDependencies, and moved native compilation here.
|
|
21
|
+
*
|
|
22
|
+
* What this does:
|
|
23
|
+
* Runs `npx node-gyp rebuild` inside `node_modules/tree-sitter-proto/`
|
|
24
|
+
* (which npm creates as a copy of vendor/tree-sitter-proto/ when
|
|
25
|
+
* resolving the file: dep). Build output lands in
|
|
26
|
+
* `node_modules/tree-sitter-proto/build/Release/tree_sitter_proto_binding.node`
|
|
27
|
+
* — under npm-managed territory, safe on upgrade.
|
|
28
|
+
*
|
|
29
|
+
* Mirrors scripts/patch-tree-sitter-swift.cjs. Best-effort: if any
|
|
30
|
+
* precondition fails (optional dep absent, no toolchain, --ignore-scripts),
|
|
31
|
+
* warn and exit 0 so gitnexus install still succeeds.
|
|
32
|
+
*/
|
|
33
|
+
const fs = require('fs');
|
|
34
|
+
const path = require('path');
|
|
35
|
+
const { execSync } = require('child_process');
|
|
36
|
+
|
|
37
|
+
// Where the tree-sitter-proto package is expected once the `file:` optional
// dependency has been installed next to this script's parent directory.
const protoPackageDir = path.join(__dirname, '..', 'node_modules', 'tree-sitter-proto');
const gypManifest = path.join(protoPackageDir, 'binding.gyp');
const compiledBinding = path.join(
  protoPackageDir,
  'build',
  'Release',
  'tree_sitter_proto_binding.node',
);

/**
 * Probe the hoisted build dependencies in order.
 *
 * @returns {Error|null} the error raised by the first dependency that cannot
 *   be resolved from this script, or null when both resolve.
 */
function findUnresolvableBuildDep() {
  for (const depName of ['node-addon-api', 'node-gyp-build']) {
    try {
      require.resolve(depName);
    } catch (resolveErr) {
      return resolveErr;
    }
  }
  return null;
}

try {
  // Quiet no-op when the optional dependency is not installed (no binding.gyp)
  // or when a previous run already produced the compiled binding.
  const isPackagePresent = fs.existsSync(gypManifest);
  if (!isPackagePresent || fs.existsSync(compiledBinding)) {
    process.exit(0);
  }

  // Without the hoisted build deps the compile cannot succeed, so warn and
  // bail out before spawning node-gyp.
  const unresolvedDepError = findUnresolvableBuildDep();
  if (unresolvedDepError !== null) {
    console.warn(
      '[tree-sitter-proto] Skipping build: hoisted build deps not resolvable (%s).',
      unresolvedDepError.message,
    );
    console.warn(
      '[tree-sitter-proto] Proto parsing will be unavailable. Install without --no-optional and with scripts enabled to build.',
    );
    process.exit(0);
  }

  console.log('[tree-sitter-proto] Building native binding...');
  // Output is captured rather than streamed; the build is abandoned after
  // 180000 ms (three minutes).
  const gypRunOptions = {
    cwd: protoPackageDir,
    stdio: 'pipe',
    timeout: 180000,
  };
  execSync('npx node-gyp rebuild', gypRunOptions);
  console.log('[tree-sitter-proto] Native binding built successfully');
} catch (err) {
  // Best-effort: any failure above is reported as a warning and the script
  // still exits 0 so the surrounding install is not failed by it.
  console.warn('[tree-sitter-proto] Could not build native binding:', err.message);
  console.warn(
    '[tree-sitter-proto] Proto (.proto) parsing will be unavailable. Non-proto gitnexus functionality is unaffected.',
  );
  process.exit(0);
}
|