gitnexus 1.6.1 → 1.6.2-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/README.md +73 -0
  2. package/dist/cli/analyze.js +23 -1
  3. package/dist/core/embeddings/embedder.js +5 -0
  4. package/dist/core/embeddings/embedding-pipeline.d.ts +12 -3
  5. package/dist/core/embeddings/embedding-pipeline.js +79 -29
  6. package/dist/core/group/extractors/grpc-extractor.d.ts +1 -1
  7. package/dist/core/group/extractors/grpc-extractor.js +28 -13
  8. package/dist/core/group/extractors/http-route-extractor.js +35 -5
  9. package/dist/core/group/extractors/manifest-extractor.js +66 -9
  10. package/dist/core/group/sync.js +49 -1
  11. package/dist/core/ingestion/language-provider.d.ts +24 -5
  12. package/dist/core/ingestion/languages/c-cpp.js +2 -2
  13. package/dist/core/ingestion/languages/dart.d.ts +1 -1
  14. package/dist/core/ingestion/languages/dart.js +2 -2
  15. package/dist/core/ingestion/languages/go.d.ts +1 -1
  16. package/dist/core/ingestion/languages/go.js +2 -2
  17. package/dist/core/ingestion/languages/ruby.js +1 -1
  18. package/dist/core/ingestion/languages/swift.d.ts +1 -1
  19. package/dist/core/ingestion/languages/swift.js +2 -2
  20. package/dist/core/ingestion/pipeline-phases/wildcard-synthesis.d.ts +36 -1
  21. package/dist/core/ingestion/pipeline-phases/wildcard-synthesis.js +143 -5
  22. package/dist/core/lbug/csv-generator.js +7 -4
  23. package/dist/core/lbug/lbug-adapter.d.ts +38 -0
  24. package/dist/core/lbug/lbug-adapter.js +189 -65
  25. package/dist/core/lbug/schema.d.ts +7 -0
  26. package/dist/core/lbug/schema.js +9 -1
  27. package/dist/core/run-analyze.js +18 -4
  28. package/dist/mcp/core/embedder.js +5 -0
  29. package/dist/server/api.js +9 -1
  30. package/package.json +6 -4
  31. package/scripts/build-tree-sitter-proto.cjs +82 -0
  32. package/vendor/node_modules/node-addon-api/node_addon_api.Makefile +6 -0
  33. package/vendor/node_modules/node-addon-api/node_addon_api.target.mk +104 -0
  34. package/vendor/node_modules/node-addon-api/node_addon_api_except.target.mk +108 -0
  35. package/vendor/node_modules/node-addon-api/node_addon_api_except_all.target.mk +104 -0
  36. package/vendor/node_modules/node-addon-api/node_addon_api_maybe.target.mk +104 -0
  37. package/vendor/tree-sitter-proto/package.json +1 -7
@@ -1,15 +1,123 @@
1
1
  import fs from 'fs/promises';
2
2
  import { createReadStream, createWriteStream } from 'fs';
3
3
  import { createInterface } from 'readline';
4
+ import { once } from 'events';
5
+ import { finished } from 'stream/promises';
4
6
  import path from 'path';
5
7
  import lbug from '@ladybugdb/core';
6
- import { NODE_TABLES, REL_TABLE_NAME, SCHEMA_QUERIES, EMBEDDING_TABLE_NAME, } from './schema.js';
8
+ import { NODE_TABLES, REL_TABLE_NAME, SCHEMA_QUERIES, EMBEDDING_TABLE_NAME, STALE_HASH_SENTINEL, } from './schema.js';
7
9
  import { streamAllCSVsToDisk } from './csv-generator.js';
10
+ /**
11
+ * Split a relationship CSV into per-label-pair files on disk.
12
+ *
13
+ * Streams the CSV line-by-line, routing each relationship to a file named
14
+ * `rel_{fromLabel}_{toLabel}.csv`. Handles backpressure by awaiting the
15
+ * target stream's 'drain' event before processing the next line, so at most
16
+ * one stream is backpressured (and has a pending drain wait) at a time.
17
+ *
18
+ * @param csvPath Path to the combined relationship CSV
19
+ * @param csvDir Directory to write per-pair CSV files
20
+ * @param validTables Set of valid node table names
21
+ * @param getNodeLabel Function to extract the label from a node ID
22
+ * @param wsFactory Optional WriteStream factory (defaults to fs.createWriteStream)
23
+ */
24
+ export const splitRelCsvByLabelPair = async (csvPath, csvDir, validTables, getNodeLabel, wsFactory = (p) => createWriteStream(p, 'utf-8')) => {
25
+ let relHeader = '';
26
+ const relsByPairMeta = new Map();
27
+ const pairWriteStreams = new Map();
28
+ let skippedRels = 0;
29
+ let totalValidRels = 0;
30
+ const inputStream = createReadStream(csvPath, 'utf-8');
31
+ const rl = createInterface({ input: inputStream, crlfDelay: Infinity });
32
+ // If any pair WriteStream errors (disk full, EMFILE, etc.) or the input
33
+ // stream fails, we need to abort the pending `once(ws, 'drain')` await.
34
+ // An AbortController gives us one signal to cancel all pending waits
35
+ // without a custom state machine.
36
+ const abortOnError = new AbortController();
37
+ let streamError = null;
38
+ const markStreamError = (err) => {
39
+ streamError ??= err;
40
+ abortOnError.abort(err);
41
+ };
42
+ try {
43
+ // `for await (const line of rl)` replaces the old manual
44
+ // on('line')/pause()/resume()/waitingForDrain state machine: readline's
45
+ // async iterator naturally serializes line delivery with our awaits, so
46
+ // at most one ws can be in backpressure at a time and we just await its
47
+ // 'drain' event.
48
+ let isFirst = true;
49
+ for await (const line of rl) {
50
+ if (streamError)
51
+ throw streamError;
52
+ if (isFirst) {
53
+ relHeader = line;
54
+ isFirst = false;
55
+ continue;
56
+ }
57
+ if (!line.trim())
58
+ continue;
59
+ const match = line.match(/"([^"]*)","([^"]*)"/);
60
+ if (!match) {
61
+ skippedRels++;
62
+ continue;
63
+ }
64
+ const fromLabel = getNodeLabel(match[1]);
65
+ const toLabel = getNodeLabel(match[2]);
66
+ if (!validTables.has(fromLabel) || !validTables.has(toLabel)) {
67
+ skippedRels++;
68
+ continue;
69
+ }
70
+ const pairKey = `${fromLabel}|${toLabel}`;
71
+ let ws = pairWriteStreams.get(pairKey);
72
+ if (!ws) {
73
+ const pairCsvPath = path.join(csvDir, `rel_${fromLabel}_${toLabel}.csv`);
74
+ ws = wsFactory(pairCsvPath);
75
+ ws.on('error', markStreamError);
76
+ pairWriteStreams.set(pairKey, ws);
77
+ relsByPairMeta.set(pairKey, { csvPath: pairCsvPath, rows: 0 });
78
+ if (!ws.write(relHeader + '\n')) {
79
+ await once(ws, 'drain', { signal: abortOnError.signal });
80
+ }
81
+ }
82
+ if (!ws.write(line + '\n')) {
83
+ await once(ws, 'drain', { signal: abortOnError.signal });
84
+ }
85
+ relsByPairMeta.get(pairKey).rows++;
86
+ totalValidRels++;
87
+ }
88
+ if (streamError)
89
+ throw streamError;
90
+ }
91
+ catch (err) {
92
+ // Tear down everything so no fd is left dangling. If the abort was caused
93
+ // by a stream error, rethrow that error (more actionable than AbortError).
94
+ for (const ws of pairWriteStreams.values())
95
+ ws.destroy();
96
+ inputStream.destroy();
97
+ throw streamError ?? err;
98
+ }
99
+ finally {
100
+ // Readline 'close' fires before the underlying fs.ReadStream releases its
101
+ // fd — on Windows that race caused ENOTEMPTY on the parent dir.
102
+ // stream/promises.finished is the stdlib "wait until this stream is fully
103
+ // closed" primitive and handles both success and error paths.
104
+ await finished(inputStream).catch(() => { });
105
+ }
106
+ return { relHeader, relsByPairMeta, pairWriteStreams, skippedRels, totalValidRels };
107
+ };
8
108
  let db = null;
9
109
  let conn = null;
10
110
  let currentDbPath = null;
11
111
  let ftsLoaded = false;
12
112
  let vectorExtensionLoaded = false;
113
+ /**
114
+ * Check if an error indicates a missing column or table (schema-level problem)
115
+ * rather than a transient/connection error. Used for legacy DB fallback logic.
116
+ */
117
+ const isMissingColumnOrTableError = (msg) => msg.includes('does not exist') ||
118
+ // Kuzu-specific: "(table|column|property) ... not found" — narrow enough to avoid
119
+ // matching transient errors like "connection not found" or "key not found".
120
+ /(table|column|property).*not found/i.test(msg);
13
121
  /** Expose the current Database for pool adapter reuse in tests. */
14
122
  export const getDatabase = () => db;
15
123
  // Global session lock for operations that touch module-level lbug globals.
@@ -215,69 +323,14 @@ export const loadGraphToLbug = async (graph, repoPath, storagePath, onProgress)
215
323
  }
216
324
  }
217
325
  // Bulk COPY relationships — split by FROM→TO label pair (LadybugDB requires it)
218
- // Stream-read the relation CSV line by line and write directly to per-pair
219
- // temp files on disk. This avoids accumulating potentially millions of CSV
220
- // lines in memory which could exceed V8 Map or array limits on large repos.
221
- let relHeader = '';
222
- const relsByPairMeta = new Map();
223
- const pairWriteStreams = new Map();
224
- let skippedRels = 0;
225
- let totalValidRels = 0;
226
- await new Promise((resolve, reject) => {
227
- const rl = createInterface({
228
- input: createReadStream(csvResult.relCsvPath, 'utf-8'),
229
- crlfDelay: Infinity,
230
- });
231
- let isFirst = true;
232
- rl.on('line', (line) => {
233
- if (isFirst) {
234
- relHeader = line;
235
- isFirst = false;
236
- return;
237
- }
238
- if (!line.trim())
239
- return;
240
- const match = line.match(/"([^"]*)","([^"]*)"/);
241
- if (!match) {
242
- skippedRels++;
243
- return;
244
- }
245
- const fromLabel = getNodeLabel(match[1]);
246
- const toLabel = getNodeLabel(match[2]);
247
- if (!validTables.has(fromLabel) || !validTables.has(toLabel)) {
248
- skippedRels++;
249
- return;
250
- }
251
- const pairKey = `${fromLabel}|${toLabel}`;
252
- let ws = pairWriteStreams.get(pairKey);
253
- if (!ws) {
254
- const pairCsvPath = path.join(csvDir, `rel_${fromLabel}_${toLabel}.csv`);
255
- ws = createWriteStream(pairCsvPath, 'utf-8');
256
- ws.write(relHeader + '\n');
257
- pairWriteStreams.set(pairKey, ws);
258
- relsByPairMeta.set(pairKey, { csvPath: pairCsvPath, rows: 0 });
259
- }
260
- const ok = ws.write(line + '\n');
261
- relsByPairMeta.get(pairKey).rows++;
262
- totalValidRels++;
263
- // Handle backpressure: pause reading when the write buffer is full,
264
- // resume when the stream drains. Prevents unbounded memory growth
265
- // on repos with millions of relationships.
266
- if (!ok) {
267
- rl.pause();
268
- ws.once('drain', () => rl.resume());
269
- }
270
- });
271
- rl.on('close', resolve);
272
- rl.on('error', (err) => {
273
- // Destroy all open write streams to avoid resource leaks
274
- for (const ws of pairWriteStreams.values())
275
- ws.destroy();
276
- reject(err);
277
- });
278
- });
279
- // Close all per-pair write streams before COPY
280
- await Promise.all(Array.from(pairWriteStreams.values()).map((ws) => new Promise((resolve, reject) => ws.end((err) => (err ? reject(err) : resolve())))));
326
+ const { relHeader, relsByPairMeta, pairWriteStreams, skippedRels, totalValidRels } = await splitRelCsvByLabelPair(csvResult.relCsvPath, csvDir, validTables, getNodeLabel);
327
+ // Close all per-pair write streams before COPY. `stream/promises.finished`
328
+ // resolves on the stream's 'finish' event and rejects on 'error'; this replaces
329
+ // a hand-rolled promisification with the stdlib primitive.
330
+ await Promise.all(Array.from(pairWriteStreams.values()).map(async (ws) => {
331
+ ws.end();
332
+ await finished(ws);
333
+ }));
281
334
  const insertedRels = totalValidRels;
282
335
  const warnings = [];
283
336
  if (insertedRels > 0) {
@@ -746,7 +799,24 @@ export const loadCachedEmbeddings = async () => {
746
799
  const embeddingNodeIds = new Set();
747
800
  const embeddings = [];
748
801
  try {
749
- const rows = await conn.query(`MATCH (e:${EMBEDDING_TABLE_NAME}) RETURN e.nodeId AS nodeId, e.embedding AS embedding`);
802
+ // Try to read contentHash alongside the embedding
803
+ let rows;
804
+ let hasContentHash = true;
805
+ try {
806
+ rows = await conn.query(`MATCH (e:${EMBEDDING_TABLE_NAME}) RETURN e.nodeId AS nodeId, e.embedding AS embedding, e.contentHash AS contentHash`);
807
+ }
808
+ catch (err) {
809
+ // Only fall back for missing-column errors (legacy DBs without contentHash).
810
+ // Rethrow transient / connection errors so callers see them.
811
+ const msg = err?.message ?? '';
812
+ if (isMissingColumnOrTableError(msg)) {
813
+ hasContentHash = false;
814
+ rows = await conn.query(`MATCH (e:${EMBEDDING_TABLE_NAME}) RETURN e.nodeId AS nodeId, e.embedding AS embedding`);
815
+ }
816
+ else {
817
+ throw err;
818
+ }
819
+ }
750
820
  const result = Array.isArray(rows) ? rows[0] : rows;
751
821
  for (const row of await result.getAll()) {
752
822
  const nodeId = String(row.nodeId ?? row[0] ?? '');
@@ -760,6 +830,7 @@ export const loadCachedEmbeddings = async () => {
760
830
  embedding: Array.isArray(embedding)
761
831
  ? embedding.map(Number)
762
832
  : Array.from(embedding).map(Number),
833
+ contentHash: hasContentHash ? (row.contentHash ?? row[2] ?? undefined) : undefined,
763
834
  });
764
835
  }
765
836
  }
@@ -769,6 +840,59 @@ export const loadCachedEmbeddings = async () => {
769
840
  }
770
841
  return { embeddingNodeIds, embeddings };
771
842
  };
843
+ /**
844
+ * Fetch existing embedding hashes from CodeEmbedding table for incremental embedding.
845
+ * Returns a Map<nodeId, contentHash> suitable for passing to `runEmbeddingPipeline`.
846
+ * Handles legacy DBs without the `contentHash` column (all rows treated as stale with empty hash).
847
+ * Returns undefined if the CodeEmbedding table does not exist or contains no rows.
848
+ *
849
+ * @param execQuery - Cypher query executor (typically pool-adapter's `executeQuery`)
850
+ */
851
+ export const fetchExistingEmbeddingHashes = async (execQuery) => {
852
+ try {
853
+ const rows = await execQuery(`MATCH (e:${EMBEDDING_TABLE_NAME}) RETURN e.nodeId AS nodeId, e.contentHash AS contentHash`);
854
+ if (!rows || rows.length === 0)
855
+ return undefined;
856
+ const map = new Map();
857
+ for (const r of rows) {
858
+ const nodeId = r.nodeId ?? r[0];
859
+ const hash = r.contentHash ?? r[1] ?? STALE_HASH_SENTINEL;
860
+ if (nodeId) {
861
+ // Empty/null contentHash means legacy row — treat as stale so it gets re-embedded
862
+ map.set(nodeId, hash || STALE_HASH_SENTINEL);
863
+ }
864
+ }
865
+ return map;
866
+ }
867
+ catch (err) {
868
+ const msg = err?.message ?? '';
869
+ if (isMissingColumnOrTableError(msg)) {
870
+ // Column or table missing — try fallback without contentHash
871
+ try {
872
+ const rows = await execQuery(`MATCH (e:${EMBEDDING_TABLE_NAME}) RETURN e.nodeId AS nodeId`);
873
+ if (!rows || rows.length === 0)
874
+ return undefined;
875
+ const map = new Map();
876
+ for (const r of rows) {
877
+ const nodeId = r.nodeId ?? r[0];
878
+ if (nodeId)
879
+ map.set(nodeId, STALE_HASH_SENTINEL); // no contentHash — treat as stale
880
+ }
881
+ console.log(`[embed] ${map.size} nodes in legacy DB (no contentHash) — all treated as stale`);
882
+ return map;
883
+ }
884
+ catch (fallbackErr) {
885
+ const fallbackMsg = fallbackErr?.message ?? '';
886
+ if (isMissingColumnOrTableError(fallbackMsg)) {
887
+ console.log(`[embed] CodeEmbedding table not yet present — full embedding run (${fallbackMsg})`);
888
+ return undefined;
889
+ }
890
+ throw fallbackErr;
891
+ }
892
+ }
893
+ throw err;
894
+ }
895
+ };
772
896
  export const closeLbug = async () => {
773
897
  if (conn) {
774
898
  try {
@@ -43,6 +43,13 @@ export declare const TOOL_SCHEMA = "\nCREATE NODE TABLE Tool (\n id STRING,\n
43
43
  export declare const SECTION_SCHEMA = "\nCREATE NODE TABLE Section (\n id STRING,\n name STRING,\n filePath STRING,\n startLine INT64,\n endLine INT64,\n level INT64,\n content STRING,\n description STRING,\n PRIMARY KEY (id)\n)";
44
44
  export declare const RELATION_SCHEMA = "\nCREATE REL TABLE CodeRelation (\n FROM File TO File,\n FROM File TO Folder,\n FROM File TO Function,\n FROM File TO Class,\n FROM File TO Interface,\n FROM File TO Method,\n FROM File TO CodeElement,\n FROM File TO `Struct`,\n FROM File TO `Enum`,\n FROM File TO `Macro`,\n FROM File TO `Typedef`,\n FROM File TO `Union`,\n FROM File TO `Namespace`,\n FROM File TO `Trait`,\n FROM File TO `Impl`,\n FROM File TO `TypeAlias`,\n FROM File TO `Const`,\n FROM File TO `Static`,\n FROM File TO `Property`,\n FROM File TO `Record`,\n FROM File TO `Delegate`,\n FROM File TO `Annotation`,\n FROM File TO `Constructor`,\n FROM File TO `Template`,\n FROM File TO `Module`,\n FROM File TO Section,\n FROM Folder TO Folder,\n FROM Folder TO File,\n FROM Function TO Function,\n FROM Function TO Method,\n FROM Function TO Class,\n FROM Function TO Community,\n FROM Function TO `Macro`,\n FROM Function TO `Struct`,\n FROM Function TO `Template`,\n FROM Function TO `Enum`,\n FROM Function TO `Namespace`,\n FROM Function TO `TypeAlias`,\n FROM Function TO `Module`,\n FROM Function TO `Impl`,\n FROM Function TO Interface,\n FROM Function TO `Constructor`,\n FROM Function TO `Const`,\n FROM Function TO `Typedef`,\n FROM Function TO `Union`,\n FROM Function TO `Property`,\n FROM Function TO CodeElement,\n FROM Class TO Method,\n FROM Class TO Function,\n FROM Class TO Class,\n FROM Class TO Interface,\n FROM Class TO Community,\n FROM Class TO `Template`,\n FROM Class TO `TypeAlias`,\n FROM Class TO `Struct`,\n FROM Class TO `Enum`,\n FROM Class TO `Annotation`,\n FROM Class TO `Constructor`,\n FROM Class TO `Trait`,\n FROM Class TO `Macro`,\n FROM Class TO `Impl`,\n FROM Class TO `Union`,\n FROM Class TO `Namespace`,\n FROM Class TO `Typedef`,\n FROM Class TO `Property`,\n FROM Method TO Function,\n FROM Method TO Method,\n FROM Method TO Class,\n FROM Method TO Community,\n FROM Method TO `Template`,\n FROM Method TO `Struct`,\n FROM Method TO 
`TypeAlias`,\n FROM Method TO `Enum`,\n FROM Method TO `Macro`,\n FROM Method TO `Namespace`,\n FROM Method TO `Module`,\n FROM Method TO `Impl`,\n FROM Method TO Interface,\n FROM Method TO `Constructor`,\n FROM Method TO `Property`,\n FROM Method TO CodeElement,\n FROM `Template` TO `Template`,\n FROM `Template` TO Function,\n FROM `Template` TO Method,\n FROM `Template` TO Class,\n FROM `Template` TO `Struct`,\n FROM `Template` TO `TypeAlias`,\n FROM `Template` TO `Enum`,\n FROM `Template` TO `Macro`,\n FROM `Template` TO Interface,\n FROM `Template` TO `Constructor`,\n FROM `Module` TO `Module`,\n FROM Section TO Section,\n FROM Section TO File,\n FROM File TO Route,\n FROM Function TO Route,\n FROM Method TO Route,\n FROM File TO Tool,\n FROM Function TO Tool,\n FROM Method TO Tool,\n FROM CodeElement TO Community,\n FROM Interface TO Community,\n FROM Interface TO Function,\n FROM Interface TO Method,\n FROM Interface TO Class,\n FROM Interface TO Interface,\n FROM Interface TO `TypeAlias`,\n FROM Interface TO `Struct`,\n FROM Interface TO `Constructor`,\n FROM Interface TO `Property`,\n FROM `Struct` TO Community,\n FROM `Struct` TO `Trait`,\n FROM `Struct` TO `Struct`,\n FROM `Struct` TO Class,\n FROM `Struct` TO `Enum`,\n FROM `Struct` TO Function,\n FROM `Struct` TO Method,\n FROM `Struct` TO Interface,\n FROM `Struct` TO `Constructor`,\n FROM `Struct` TO `Property`,\n FROM `Enum` TO `Enum`,\n FROM `Enum` TO Community,\n FROM `Enum` TO Class,\n FROM `Enum` TO Interface,\n FROM `Macro` TO Community,\n FROM `Macro` TO Function,\n FROM `Macro` TO Method,\n FROM `Module` TO Function,\n FROM `Module` TO Method,\n FROM `Typedef` TO Community,\n FROM `Union` TO Community,\n FROM `Namespace` TO Community,\n FROM `Namespace` TO `Struct`,\n FROM `Trait` TO Method,\n FROM `Trait` TO `Constructor`,\n FROM `Trait` TO `Property`,\n FROM `Trait` TO Community,\n FROM `Impl` TO Method,\n FROM `Impl` TO `Constructor`,\n FROM `Impl` TO `Property`,\n FROM `Impl` TO 
Community,\n FROM `Impl` TO `Trait`,\n FROM `Impl` TO `Struct`,\n FROM `Impl` TO `Impl`,\n FROM `TypeAlias` TO Community,\n FROM `TypeAlias` TO `Trait`,\n FROM `TypeAlias` TO Class,\n FROM `Const` TO Community,\n FROM `Static` TO Community,\n FROM `Property` TO Community,\n FROM `Record` TO Method,\n FROM `Record` TO `Constructor`,\n FROM `Record` TO `Property`,\n FROM `Record` TO Community,\n FROM `Delegate` TO Community,\n FROM `Annotation` TO Community,\n FROM `Constructor` TO Community,\n FROM `Constructor` TO Interface,\n FROM `Constructor` TO Class,\n FROM `Constructor` TO Method,\n FROM `Constructor` TO Function,\n FROM `Constructor` TO `Constructor`,\n FROM `Constructor` TO `Struct`,\n FROM `Constructor` TO `Macro`,\n FROM `Constructor` TO `Template`,\n FROM `Constructor` TO `TypeAlias`,\n FROM `Constructor` TO `Enum`,\n FROM `Constructor` TO `Annotation`,\n FROM `Constructor` TO `Impl`,\n FROM `Constructor` TO `Namespace`,\n FROM `Constructor` TO `Module`,\n FROM `Constructor` TO `Property`,\n FROM `Constructor` TO `Typedef`,\n FROM `Template` TO Community,\n FROM `Module` TO Community,\n FROM Function TO Process,\n FROM Method TO Process,\n FROM Class TO Process,\n FROM Interface TO Process,\n FROM `Struct` TO Process,\n FROM `Constructor` TO Process,\n FROM `Module` TO Process,\n FROM `Macro` TO Process,\n FROM `Impl` TO Process,\n FROM `Typedef` TO Process,\n FROM `TypeAlias` TO Process,\n FROM `Enum` TO Process,\n FROM `Union` TO Process,\n FROM `Namespace` TO Process,\n FROM `Trait` TO Process,\n FROM `Const` TO Process,\n FROM `Static` TO Process,\n FROM `Property` TO Process,\n FROM `Record` TO Process,\n FROM `Delegate` TO Process,\n FROM `Annotation` TO Process,\n FROM `Template` TO Process,\n FROM CodeElement TO Process,\n FROM Route TO Process,\n FROM Tool TO Process,\n type STRING,\n confidence DOUBLE,\n reason STRING,\n step INT32\n)";
45
45
  export declare const EMBEDDING_DIMS: number;
46
+ /** HNSW vector index name for the CodeEmbedding table. */
47
+ export declare const EMBEDDING_INDEX_NAME = "code_embedding_idx";
48
+ /**
49
+ * Sentinel value for "no content hash available" — used in legacy DBs and null rows.
50
+ * Nodes with this hash are always treated as stale and re-embedded.
51
+ */
52
+ export declare const STALE_HASH_SENTINEL = "";
46
53
  export declare const EMBEDDING_SCHEMA: string;
47
54
  /**
48
55
  * Create vector index for semantic search
@@ -410,10 +410,18 @@ if (Number.isNaN(_rawDims) || _rawDims <= 0) {
410
410
  throw new Error(`GITNEXUS_EMBEDDING_DIMS must be a positive integer, got "${process.env.GITNEXUS_EMBEDDING_DIMS}"`);
411
411
  }
412
412
  export const EMBEDDING_DIMS = _rawDims;
413
+ /** HNSW vector index name for the CodeEmbedding table. */
414
+ export const EMBEDDING_INDEX_NAME = 'code_embedding_idx';
415
+ /**
416
+ * Sentinel value for "no content hash available" — used in legacy DBs and null rows.
417
+ * Nodes with this hash are always treated as stale and re-embedded.
418
+ */
419
+ export const STALE_HASH_SENTINEL = '';
413
420
  export const EMBEDDING_SCHEMA = `
414
421
  CREATE NODE TABLE ${EMBEDDING_TABLE_NAME} (
415
422
  nodeId STRING,
416
423
  embedding FLOAT[${EMBEDDING_DIMS}],
424
+ contentHash STRING,
417
425
  PRIMARY KEY (nodeId)
418
426
  )`;
419
427
  /**
@@ -421,7 +429,7 @@ CREATE NODE TABLE ${EMBEDDING_TABLE_NAME} (
421
429
  * Uses HNSW (Hierarchical Navigable Small World) algorithm with cosine similarity
422
430
  */
423
431
  export const CREATE_VECTOR_INDEX_QUERY = `
424
- CALL CREATE_VECTOR_INDEX('${EMBEDDING_TABLE_NAME}', 'code_embedding_idx', 'embedding', metric := 'cosine')
432
+ CALL CREATE_VECTOR_INDEX('${EMBEDDING_TABLE_NAME}', '${EMBEDDING_INDEX_NAME}', 'embedding', metric := 'cosine')
425
433
  `;
426
434
  // ============================================================================
427
435
  // ALL SCHEMA QUERIES IN ORDER
@@ -15,6 +15,8 @@ import { initLbug, loadGraphToLbug, getLbugStats, executeQuery, executeWithReuse
15
15
  import { getStoragePaths, saveMeta, loadMeta, addToGitignore, registerRepo, cleanupOldKuzuFiles, } from '../storage/repo-manager.js';
16
16
  import { getCurrentCommit, hasGitDir } from '../storage/git.js';
17
17
  import { generateAIContextFiles } from '../cli/ai-context.js';
18
+ import { EMBEDDING_TABLE_NAME } from './lbug/schema.js';
19
+ import { STALE_HASH_SENTINEL } from './lbug/schema.js';
18
20
  /** Threshold: auto-skip embeddings for repos with more nodes than this */
19
21
  const EMBEDDING_NODE_LIMIT = 50_000;
20
22
  export const PHASE_LABELS = {
@@ -147,9 +149,13 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
147
149
  const EMBED_BATCH = 200;
148
150
  for (let i = 0; i < cachedEmbeddings.length; i += EMBED_BATCH) {
149
151
  const batch = cachedEmbeddings.slice(i, i + EMBED_BATCH);
150
- const paramsList = batch.map((e) => ({ nodeId: e.nodeId, embedding: e.embedding }));
152
+ const paramsList = batch.map((e) => ({
153
+ nodeId: e.nodeId,
154
+ embedding: e.embedding,
155
+ contentHash: e.contentHash ?? STALE_HASH_SENTINEL,
156
+ }));
151
157
  try {
152
- await executeWithReusedStatement(`CREATE (e:CodeEmbedding {nodeId: $nodeId, embedding: $embedding})`, paramsList);
158
+ await executeWithReusedStatement(`MERGE (e:${EMBEDDING_TABLE_NAME} {nodeId: $nodeId}) SET e.embedding = $embedding, e.contentHash = $contentHash`, paramsList);
153
159
  }
154
160
  catch {
155
161
  /* some may fail if node was removed, that's fine */
@@ -170,6 +176,14 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
170
176
  const httpMode = isHttpMode();
171
177
  progress('embeddings', 90, httpMode ? 'Connecting to embedding endpoint...' : 'Loading embedding model...');
172
178
  const { runEmbeddingPipeline } = await import('./embeddings/embedding-pipeline.js');
179
+ // Build a Map<nodeId, contentHash> from cached embeddings for incremental mode
180
+ let existingEmbeddings;
181
+ if (cachedEmbeddingNodeIds.size > 0) {
182
+ existingEmbeddings = new Map();
183
+ for (const e of cachedEmbeddings) {
184
+ existingEmbeddings.set(e.nodeId, e.contentHash ?? STALE_HASH_SENTINEL);
185
+ }
186
+ }
173
187
  await runEmbeddingPipeline(executeQuery, executeWithReusedStatement, (p) => {
174
188
  const scaled = 90 + Math.round((p.percent / 100) * 8);
175
189
  const label = p.phase === 'loading-model'
@@ -178,14 +192,14 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
178
192
  : 'Loading embedding model...'
179
193
  : `Embedding ${p.nodesProcessed || 0}/${p.totalNodes || '?'}`;
180
194
  progress('embeddings', scaled, label);
181
- }, {}, cachedEmbeddingNodeIds.size > 0 ? cachedEmbeddingNodeIds : undefined);
195
+ }, {}, existingEmbeddings);
182
196
  }
183
197
  // ── Phase 5: Finalize (98–100%) ───────────────────────────────────
184
198
  progress('done', 98, 'Saving metadata...');
185
199
  // Count embeddings in the index (cached + newly generated)
186
200
  let embeddingCount = 0;
187
201
  try {
188
- const embResult = await executeQuery(`MATCH (e:CodeEmbedding) RETURN count(e) AS cnt`);
202
+ const embResult = await executeQuery(`MATCH (e:${EMBEDDING_TABLE_NAME}) RETURN count(e) AS cnt`);
189
203
  embeddingCount = embResult?.[0]?.cnt ?? 0;
190
204
  }
191
205
  catch {
@@ -30,6 +30,11 @@ export const initEmbedder = async () => {
30
30
  initPromise = (async () => {
31
31
  try {
32
32
  env.allowLocalModels = false;
33
+ // Default cache to user-writable location. transformers.js defaults to
34
+ // ./node_modules/.cache inside its own install dir, which is unwritable
35
+ // when gitnexus is installed globally (e.g. /usr/lib/node_modules/).
36
+ // Respect HF_HOME if set, otherwise fall back to ~/.cache/huggingface.
37
+ env.cacheDir = process.env.HF_HOME ?? `${process.env.HOME}/.cache/huggingface`;
33
38
  console.error('GitNexus: Loading embedding model (first search may take a moment)...');
34
39
  // Try GPU first (DirectML on Windows, CUDA on Linux), fall back to CPU
35
40
  const isWindows = process.platform === 'win32';
@@ -1277,6 +1277,13 @@ export const createServer = async (port, host = '127.0.0.1') => {
1277
1277
  const lbugPath = path.join(entry.storagePath, 'lbug');
1278
1278
  await withLbugDb(lbugPath, async () => {
1279
1279
  const { runEmbeddingPipeline } = await import('../core/embeddings/embedding-pipeline.js');
1280
+ // Fetch existing content hashes for incremental embedding.
1281
+ // Delegated to lbug-adapter which owns the DB query logic and legacy-fallback handling.
1282
+ const { fetchExistingEmbeddingHashes } = await import('../core/lbug/lbug-adapter.js');
1283
+ const existingEmbeddings = await fetchExistingEmbeddingHashes(executeQuery);
1284
+ if (existingEmbeddings && existingEmbeddings.size > 0) {
1285
+ console.log(`[embed] ${existingEmbeddings.size} nodes already embedded — incremental run with content-hash comparison`);
1286
+ }
1280
1287
  await runEmbeddingPipeline(executeQuery, executeWithReusedStatement, (p) => {
1281
1288
  embedJobManager.updateJob(job.id, {
1282
1289
  progress: {
@@ -1293,7 +1300,8 @@ export const createServer = async (port, host = '127.0.0.1') => {
1293
1300
  : `${p.phase} (${p.percent}%)`,
1294
1301
  },
1295
1302
  });
1296
- });
1303
+ }, {}, // config: use defaults
1304
+ existingEmbeddings);
1297
1305
  });
1298
1306
  clearTimeout(embedTimeout);
1299
1307
  releaseRepoLock(repoLockPath);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "gitnexus",
3
- "version": "1.6.1",
3
+ "version": "1.6.2-rc.10",
4
4
  "description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
5
5
  "author": "Abhigyan Patwari",
6
6
  "license": "PolyForm-Noncommercial-1.0.0",
@@ -46,7 +46,7 @@
46
46
  "test:integration": "vitest run test/integration",
47
47
  "test:watch": "vitest",
48
48
  "test:coverage": "vitest run --coverage",
49
- "postinstall": "node scripts/patch-tree-sitter-swift.cjs",
49
+ "postinstall": "node scripts/patch-tree-sitter-swift.cjs && node scripts/build-tree-sitter-proto.cjs",
50
50
  "prepare": "node scripts/build.js",
51
51
  "prepack": "node scripts/build.js"
52
52
  },
@@ -71,7 +71,7 @@
71
71
  "pandemonium": "^2.4.0",
72
72
  "tree-sitter": "^0.21.1",
73
73
  "tree-sitter-c": "0.23.2",
74
- "tree-sitter-c-sharp": "^0.23.1",
74
+ "tree-sitter-c-sharp": "0.23.1",
75
75
  "tree-sitter-cpp": "^0.23.4",
76
76
  "tree-sitter-go": "^0.23.0",
77
77
  "tree-sitter-java": "^0.23.5",
@@ -84,7 +84,9 @@
84
84
  "uuid": "^13.0.0"
85
85
  },
86
86
  "optionalDependencies": {
87
- "tree-sitter-dart": "https://github.com/UserNobody14/tree-sitter-dart/archive/80e23c07b64494f7e21090bb3450223ef0b192f4.tar.gz",
87
+ "node-addon-api": "^8.0.0",
88
+ "node-gyp-build": "^4.8.0",
89
+ "tree-sitter-dart": "git+https://github.com/UserNobody14/tree-sitter-dart.git#80e23c07b64494f7e21090bb3450223ef0b192f4",
88
90
  "tree-sitter-kotlin": "^0.3.8",
89
91
  "tree-sitter-proto": "file:./vendor/tree-sitter-proto",
90
92
  "tree-sitter-swift": "^0.6.0"
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Build tree-sitter-proto native binding.
4
+ *
5
+ * Why this script exists:
6
+ * tree-sitter-proto is vendored under gitnexus/vendor/tree-sitter-proto/
7
+ * and declared as a `file:` optionalDependency. Previously, the vendored
8
+ * package had its own `dependencies` and `install` script, which caused
9
+ * npm to create `vendor/tree-sitter-proto/node_modules/` and
10
+ * `vendor/tree-sitter-proto/build/` during install. Those directories
11
+ * blocked `rmdir` on global-install upgrade, producing:
12
+ *
13
+ * ENOTEMPTY: directory not empty, rmdir
14
+ * '.../gitnexus/vendor/tree-sitter-proto/node_modules/node-addon-api'
15
+ *
16
+ * (See https://github.com/abhigyanpatwari/GitNexus/issues/836.)
17
+ *
18
+ * We stripped `dependencies` and the `install` script from the vendored
19
+ * package.json, hoisted `node-addon-api` and `node-gyp-build` into
20
+ * gitnexus's own optionalDependencies, and moved native compilation here.
21
+ *
22
+ * What this does:
23
+ * Runs `npx node-gyp rebuild` inside `node_modules/tree-sitter-proto/`
24
+ * (which npm creates as a copy of vendor/tree-sitter-proto/ when
25
+ * resolving the file: dep). Build output lands in
26
+ * `node_modules/tree-sitter-proto/build/Release/tree_sitter_proto_binding.node`
27
+ * — under npm-managed territory, safe on upgrade.
28
+ *
29
+ * Mirrors scripts/patch-tree-sitter-swift.cjs. Best-effort: if any
30
+ * precondition fails (optional dep absent, no toolchain, --ignore-scripts),
31
+ * warn and exit 0 so gitnexus install still succeeds.
32
+ */
33
+ const fs = require('fs');
34
+ const path = require('path');
35
+ const { execSync } = require('child_process');
36
+
37
+ const protoDir = path.join(__dirname, '..', 'node_modules', 'tree-sitter-proto');
38
+ const bindingGyp = path.join(protoDir, 'binding.gyp');
39
+ const bindingNode = path.join(protoDir, 'build', 'Release', 'tree_sitter_proto_binding.node');
40
+
41
+ try {
42
+ if (!fs.existsSync(bindingGyp)) {
43
+ // tree-sitter-proto is an optionalDependency; absent when install
44
+ // skipped optional deps or the file: dep was not resolved.
45
+ process.exit(0);
46
+ }
47
+
48
+ // Skip if the native binding already exists (idempotent re-run).
49
+ if (fs.existsSync(bindingNode)) {
50
+ process.exit(0);
51
+ }
52
+
53
+ // Pre-flight: the hoisted build deps must be resolvable.
54
+ try {
55
+ require.resolve('node-addon-api');
56
+ require.resolve('node-gyp-build');
57
+ } catch (resolveErr) {
58
+ console.warn(
59
+ '[tree-sitter-proto] Skipping build: hoisted build deps not resolvable (%s).',
60
+ resolveErr.message,
61
+ );
62
+ console.warn(
63
+ '[tree-sitter-proto] Proto parsing will be unavailable. Install without --no-optional and with scripts enabled to build.',
64
+ );
65
+ process.exit(0);
66
+ }
67
+
68
+ console.log('[tree-sitter-proto] Building native binding...');
69
+ execSync('npx node-gyp rebuild', {
70
+ cwd: protoDir,
71
+ stdio: 'pipe',
72
+ timeout: 180000,
73
+ });
74
+ console.log('[tree-sitter-proto] Native binding built successfully');
75
+ } catch (err) {
76
+ console.warn('[tree-sitter-proto] Could not build native binding:', err.message);
77
+ console.warn(
78
+ '[tree-sitter-proto] Proto (.proto) parsing will be unavailable. Non-proto gitnexus functionality is unaffected.',
79
+ );
80
+ // Exit 0: optionalDependency failures must not fail the gitnexus install.
81
+ process.exit(0);
82
+ }
@@ -0,0 +1,6 @@
1
+ # This file is generated by gyp; do not edit.
2
+
3
+ export builddir_name ?= ./build/../../node_modules/node-addon-api/.
4
+ .PHONY: all
5
+ all:
6
+ $(MAKE) -C ../../tree-sitter-proto/build node_addon_api_except