gitnexus 1.6.2-rc.2 → 1.6.2-rc.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. package/dist/_shared/lbug/schema-constants.d.ts +1 -1
  2. package/dist/_shared/lbug/schema-constants.d.ts.map +1 -1
  3. package/dist/_shared/lbug/schema-constants.js +1 -0
  4. package/dist/_shared/lbug/schema-constants.js.map +1 -1
  5. package/dist/cli/analyze.js +3 -0
  6. package/dist/core/embeddings/ast-utils.d.ts +22 -0
  7. package/dist/core/embeddings/ast-utils.js +105 -0
  8. package/dist/core/embeddings/character-chunk.d.ts +12 -0
  9. package/dist/core/embeddings/character-chunk.js +43 -0
  10. package/dist/core/embeddings/chunker.d.ts +14 -0
  11. package/dist/core/embeddings/chunker.js +234 -0
  12. package/dist/core/embeddings/embedder.js +5 -0
  13. package/dist/core/embeddings/embedding-pipeline.d.ts +29 -24
  14. package/dist/core/embeddings/embedding-pipeline.js +244 -125
  15. package/dist/core/embeddings/line-index.d.ts +7 -0
  16. package/dist/core/embeddings/line-index.js +42 -0
  17. package/dist/core/embeddings/server-mapping.d.ts +15 -0
  18. package/dist/core/embeddings/server-mapping.js +33 -0
  19. package/dist/core/embeddings/structural-extractor.d.ts +15 -0
  20. package/dist/core/embeddings/structural-extractor.js +58 -0
  21. package/dist/core/embeddings/text-generator.d.ts +20 -13
  22. package/dist/core/embeddings/text-generator.js +151 -119
  23. package/dist/core/embeddings/types.d.ts +81 -3
  24. package/dist/core/embeddings/types.js +105 -3
  25. package/dist/core/group/extractors/http-patterns/node.js +130 -0
  26. package/dist/core/group/extractors/manifest-extractor.js +20 -5
  27. package/dist/core/group/sync.js +49 -1
  28. package/dist/core/ingestion/call-extractors/configs/c-cpp.d.ts +3 -0
  29. package/dist/core/ingestion/call-extractors/configs/c-cpp.js +8 -0
  30. package/dist/core/ingestion/call-extractors/configs/csharp.d.ts +2 -0
  31. package/dist/core/ingestion/call-extractors/configs/csharp.js +6 -0
  32. package/dist/core/ingestion/call-extractors/configs/dart.d.ts +2 -0
  33. package/dist/core/ingestion/call-extractors/configs/dart.js +5 -0
  34. package/dist/core/ingestion/call-extractors/configs/go.d.ts +2 -0
  35. package/dist/core/ingestion/call-extractors/configs/go.js +5 -0
  36. package/dist/core/ingestion/call-extractors/configs/jvm.d.ts +3 -0
  37. package/dist/core/ingestion/call-extractors/configs/jvm.js +51 -0
  38. package/dist/core/ingestion/call-extractors/configs/php.d.ts +2 -0
  39. package/dist/core/ingestion/call-extractors/configs/php.js +5 -0
  40. package/dist/core/ingestion/call-extractors/configs/python.d.ts +2 -0
  41. package/dist/core/ingestion/call-extractors/configs/python.js +5 -0
  42. package/dist/core/ingestion/call-extractors/configs/ruby.d.ts +2 -0
  43. package/dist/core/ingestion/call-extractors/configs/ruby.js +5 -0
  44. package/dist/core/ingestion/call-extractors/configs/rust.d.ts +2 -0
  45. package/dist/core/ingestion/call-extractors/configs/rust.js +5 -0
  46. package/dist/core/ingestion/call-extractors/configs/swift.d.ts +2 -0
  47. package/dist/core/ingestion/call-extractors/configs/swift.js +5 -0
  48. package/dist/core/ingestion/call-extractors/configs/typescript-javascript.d.ts +3 -0
  49. package/dist/core/ingestion/call-extractors/configs/typescript-javascript.js +8 -0
  50. package/dist/core/ingestion/call-extractors/generic.d.ts +5 -0
  51. package/dist/core/ingestion/call-extractors/generic.js +59 -0
  52. package/dist/core/ingestion/call-processor.d.ts +1 -3
  53. package/dist/core/ingestion/call-processor.js +49 -47
  54. package/dist/core/ingestion/call-types.d.ts +60 -0
  55. package/dist/core/ingestion/call-types.js +2 -0
  56. package/dist/core/ingestion/class-extractors/configs/c-cpp.d.ts +3 -0
  57. package/dist/core/ingestion/class-extractors/configs/c-cpp.js +11 -0
  58. package/dist/core/ingestion/class-extractors/configs/csharp.d.ts +2 -0
  59. package/dist/core/ingestion/class-extractors/configs/csharp.js +21 -0
  60. package/dist/core/ingestion/class-extractors/configs/dart.d.ts +2 -0
  61. package/dist/core/ingestion/class-extractors/configs/dart.js +7 -0
  62. package/dist/core/ingestion/class-extractors/configs/go.d.ts +2 -0
  63. package/dist/core/ingestion/class-extractors/configs/go.js +20 -0
  64. package/dist/core/ingestion/class-extractors/configs/jvm.d.ts +3 -0
  65. package/dist/core/ingestion/class-extractors/configs/jvm.js +35 -0
  66. package/dist/core/ingestion/class-extractors/configs/php.d.ts +2 -0
  67. package/dist/core/ingestion/class-extractors/configs/php.js +7 -0
  68. package/dist/core/ingestion/class-extractors/configs/python.d.ts +2 -0
  69. package/dist/core/ingestion/class-extractors/configs/python.js +7 -0
  70. package/dist/core/ingestion/class-extractors/configs/ruby.d.ts +2 -0
  71. package/dist/core/ingestion/class-extractors/configs/ruby.js +7 -0
  72. package/dist/core/ingestion/class-extractors/configs/rust.d.ts +2 -0
  73. package/dist/core/ingestion/class-extractors/configs/rust.js +7 -0
  74. package/dist/core/ingestion/class-extractors/configs/swift.d.ts +2 -0
  75. package/dist/core/ingestion/class-extractors/configs/swift.js +18 -0
  76. package/dist/core/ingestion/class-extractors/configs/typescript-javascript.d.ts +4 -0
  77. package/dist/core/ingestion/class-extractors/configs/typescript-javascript.js +28 -0
  78. package/dist/core/ingestion/field-types.d.ts +1 -1
  79. package/dist/core/ingestion/import-resolvers/configs/c-cpp.d.ts +7 -0
  80. package/dist/core/ingestion/import-resolvers/configs/c-cpp.js +14 -0
  81. package/dist/core/ingestion/import-resolvers/configs/csharp.d.ts +8 -0
  82. package/dist/core/ingestion/import-resolvers/configs/csharp.js +27 -0
  83. package/dist/core/ingestion/import-resolvers/configs/dart.d.ts +17 -0
  84. package/dist/core/ingestion/import-resolvers/{dart.js → configs/dart.js} +26 -16
  85. package/dist/core/ingestion/import-resolvers/configs/go.d.ts +8 -0
  86. package/dist/core/ingestion/import-resolvers/configs/go.js +26 -0
  87. package/dist/core/ingestion/import-resolvers/configs/jvm.d.ts +13 -0
  88. package/dist/core/ingestion/import-resolvers/configs/jvm.js +68 -0
  89. package/dist/core/ingestion/import-resolvers/configs/php.d.ts +8 -0
  90. package/dist/core/ingestion/import-resolvers/configs/php.js +15 -0
  91. package/dist/core/ingestion/import-resolvers/configs/python.d.ts +12 -0
  92. package/dist/core/ingestion/import-resolvers/configs/python.js +27 -0
  93. package/dist/core/ingestion/import-resolvers/configs/ruby.d.ts +8 -0
  94. package/dist/core/ingestion/import-resolvers/configs/ruby.js +16 -0
  95. package/dist/core/ingestion/import-resolvers/configs/rust.d.ts +8 -0
  96. package/dist/core/ingestion/import-resolvers/configs/rust.js +54 -0
  97. package/dist/core/ingestion/import-resolvers/configs/swift.d.ts +8 -0
  98. package/dist/core/ingestion/import-resolvers/{swift.js → configs/swift.js} +10 -5
  99. package/dist/core/ingestion/import-resolvers/configs/typescript-javascript.d.ts +9 -0
  100. package/dist/core/ingestion/import-resolvers/configs/typescript-javascript.js +23 -0
  101. package/dist/core/ingestion/import-resolvers/csharp.d.ts +4 -5
  102. package/dist/core/ingestion/import-resolvers/csharp.js +4 -20
  103. package/dist/core/ingestion/import-resolvers/go.d.ts +4 -5
  104. package/dist/core/ingestion/import-resolvers/go.js +4 -19
  105. package/dist/core/ingestion/import-resolvers/jvm.d.ts +5 -10
  106. package/dist/core/ingestion/import-resolvers/jvm.js +5 -58
  107. package/dist/core/ingestion/import-resolvers/php.d.ts +4 -5
  108. package/dist/core/ingestion/import-resolvers/php.js +4 -7
  109. package/dist/core/ingestion/import-resolvers/python.d.ts +3 -6
  110. package/dist/core/ingestion/import-resolvers/python.js +3 -18
  111. package/dist/core/ingestion/import-resolvers/resolver-factory.d.ts +24 -0
  112. package/dist/core/ingestion/import-resolvers/resolver-factory.js +33 -0
  113. package/dist/core/ingestion/import-resolvers/ruby.d.ts +4 -5
  114. package/dist/core/ingestion/import-resolvers/ruby.js +4 -7
  115. package/dist/core/ingestion/import-resolvers/rust.d.ts +4 -5
  116. package/dist/core/ingestion/import-resolvers/rust.js +4 -47
  117. package/dist/core/ingestion/import-resolvers/standard.d.ts +3 -9
  118. package/dist/core/ingestion/import-resolvers/standard.js +7 -8
  119. package/dist/core/ingestion/import-resolvers/types.d.ts +24 -0
  120. package/dist/core/ingestion/language-provider.d.ts +12 -0
  121. package/dist/core/ingestion/languages/c-cpp.js +15 -12
  122. package/dist/core/ingestion/languages/csharp.js +11 -21
  123. package/dist/core/ingestion/languages/dart.js +11 -7
  124. package/dist/core/ingestion/languages/go.js +11 -20
  125. package/dist/core/ingestion/languages/java.js +11 -18
  126. package/dist/core/ingestion/languages/kotlin.js +11 -13
  127. package/dist/core/ingestion/languages/php.js +11 -7
  128. package/dist/core/ingestion/languages/python.js +11 -7
  129. package/dist/core/ingestion/languages/ruby.js +11 -7
  130. package/dist/core/ingestion/languages/rust.js +11 -7
  131. package/dist/core/ingestion/languages/swift.js +11 -18
  132. package/dist/core/ingestion/languages/typescript.js +15 -23
  133. package/dist/core/ingestion/languages/vue.js +11 -17
  134. package/dist/core/ingestion/model/index.d.ts +2 -2
  135. package/dist/core/ingestion/model/index.js +1 -1
  136. package/dist/core/ingestion/model/resolve.d.ts +3 -0
  137. package/dist/core/ingestion/model/resolve.js +6 -2
  138. package/dist/core/ingestion/parsing-processor.d.ts +1 -2
  139. package/dist/core/ingestion/tree-sitter-queries.d.ts +11 -11
  140. package/dist/core/ingestion/tree-sitter-queries.js +81 -0
  141. package/dist/core/ingestion/type-env.d.ts +1 -1
  142. package/dist/core/ingestion/utils/ast-helpers.d.ts +1 -1
  143. package/dist/core/ingestion/utils/ast-helpers.js +3 -0
  144. package/dist/core/ingestion/variable-extractors/configs/c-cpp.d.ts +3 -0
  145. package/dist/core/ingestion/variable-extractors/configs/c-cpp.js +81 -0
  146. package/dist/core/ingestion/variable-extractors/configs/csharp.d.ts +9 -0
  147. package/dist/core/ingestion/variable-extractors/configs/csharp.js +63 -0
  148. package/dist/core/ingestion/variable-extractors/configs/dart.d.ts +2 -0
  149. package/dist/core/ingestion/variable-extractors/configs/dart.js +94 -0
  150. package/dist/core/ingestion/variable-extractors/configs/go.d.ts +2 -0
  151. package/dist/core/ingestion/variable-extractors/configs/go.js +83 -0
  152. package/dist/core/ingestion/variable-extractors/configs/jvm.d.ts +18 -0
  153. package/dist/core/ingestion/variable-extractors/configs/jvm.js +115 -0
  154. package/dist/core/ingestion/variable-extractors/configs/php.d.ts +14 -0
  155. package/dist/core/ingestion/variable-extractors/configs/php.js +58 -0
  156. package/dist/core/ingestion/variable-extractors/configs/python.d.ts +2 -0
  157. package/dist/core/ingestion/variable-extractors/configs/python.js +101 -0
  158. package/dist/core/ingestion/variable-extractors/configs/ruby.d.ts +11 -0
  159. package/dist/core/ingestion/variable-extractors/configs/ruby.js +52 -0
  160. package/dist/core/ingestion/variable-extractors/configs/rust.d.ts +2 -0
  161. package/dist/core/ingestion/variable-extractors/configs/rust.js +76 -0
  162. package/dist/core/ingestion/variable-extractors/configs/swift.d.ts +2 -0
  163. package/dist/core/ingestion/variable-extractors/configs/swift.js +88 -0
  164. package/dist/core/ingestion/variable-extractors/configs/typescript-javascript.d.ts +3 -0
  165. package/dist/core/ingestion/variable-extractors/configs/typescript-javascript.js +83 -0
  166. package/dist/core/ingestion/variable-extractors/generic.d.ts +5 -0
  167. package/dist/core/ingestion/variable-extractors/generic.js +80 -0
  168. package/dist/core/ingestion/variable-types.d.ts +82 -0
  169. package/dist/core/ingestion/variable-types.js +2 -0
  170. package/dist/core/ingestion/workers/parse-worker.js +196 -166
  171. package/dist/core/ingestion/workers/worker-pool.js +3 -0
  172. package/dist/core/lbug/csv-generator.js +1 -0
  173. package/dist/core/lbug/lbug-adapter.d.ts +13 -4
  174. package/dist/core/lbug/lbug-adapter.js +166 -81
  175. package/dist/core/lbug/schema.d.ts +9 -1
  176. package/dist/core/lbug/schema.js +19 -2
  177. package/dist/core/run-analyze.js +17 -4
  178. package/dist/core/tree-sitter/parser-loader.d.ts +3 -0
  179. package/dist/core/tree-sitter/parser-loader.js +17 -8
  180. package/dist/mcp/core/embedder.js +5 -0
  181. package/dist/mcp/local/local-backend.js +29 -19
  182. package/dist/server/api.js +10 -21
  183. package/package.json +5 -3
  184. package/scripts/build-tree-sitter-proto.cjs +82 -0
  185. package/vendor/node_modules/node-addon-api/node_addon_api.Makefile +6 -0
  186. package/vendor/node_modules/node-addon-api/node_addon_api.target.mk +104 -0
  187. package/vendor/node_modules/node-addon-api/node_addon_api_except.target.mk +108 -0
  188. package/vendor/node_modules/node-addon-api/node_addon_api_except_all.target.mk +104 -0
  189. package/vendor/node_modules/node-addon-api/node_addon_api_maybe.target.mk +104 -0
  190. package/vendor/tree-sitter-proto/package.json +1 -7
  191. package/dist/core/ingestion/call-sites/extract-language-call-site.d.ts +0 -10
  192. package/dist/core/ingestion/call-sites/extract-language-call-site.js +0 -22
  193. package/dist/core/ingestion/call-sites/java.d.ts +0 -9
  194. package/dist/core/ingestion/call-sites/java.js +0 -30
  195. package/dist/core/ingestion/import-resolvers/dart.d.ts +0 -7
  196. package/dist/core/ingestion/import-resolvers/swift.d.ts +0 -7
  197. package/dist/core/ingestion/import-resolvers/vue.d.ts +0 -8
  198. package/dist/core/ingestion/import-resolvers/vue.js +0 -9
@@ -3,44 +3,76 @@
3
3
  *
4
4
  * Orchestrates the background embedding process:
5
5
  * 1. Query embeddable nodes from LadybugDB
6
- * 2. Generate text representations
7
- * 3. Batch embed using transformers.js
8
- * 4. Update LadybugDB with embeddings
6
+ * 2. Generate text representations with enriched metadata
7
+ * 3. Chunk long nodes, batch embed
8
+ * 4. Update LadybugDB with chunk-aware embeddings
9
9
  * 5. Create vector index for semantic search
10
10
  */
11
+ import { createHash } from 'crypto';
11
12
  import { initEmbedder, embedBatch, embedText, embeddingToArray, isEmbedderReady, } from './embedder.js';
12
- import { generateBatchEmbeddingTexts } from './text-generator.js';
13
- import { DEFAULT_EMBEDDING_CONFIG, EMBEDDABLE_LABELS, } from './types.js';
13
+ import { generateEmbeddingText } from './text-generator.js';
14
+ import { chunkNode, characterChunk } from './chunker.js';
15
+ import { extractStructuralNames } from './structural-extractor.js';
16
+ import { DEFAULT_EMBEDDING_CONFIG, EMBEDDABLE_LABELS, isShortLabel, LABELS_WITH_EXPORTED, STRUCTURAL_LABELS, collectBestChunks, } from './types.js';
17
+ import { EMBEDDING_TABLE_NAME, EMBEDDING_INDEX_NAME, CREATE_VECTOR_INDEX_QUERY, STALE_HASH_SENTINEL, } from '../lbug/schema.js';
18
+ import { loadVectorExtension } from '../lbug/lbug-adapter.js';
14
19
  const isDev = process.env.NODE_ENV === 'development';
20
+ /**
21
+ * Compute a stable content fingerprint for an embeddable node.
22
+ * Used to detect when the underlying text has changed so stale vectors
23
+ * can be replaced (DELETE-then-INSERT, the Kuzu-sanctioned pattern for
24
+ * vector-indexed rows).
25
+ */
26
+ export const contentHashForNode = (node, config = {}) => {
27
+ // Hash must be deterministic across runs, so exclude methodNames/fieldNames
28
+ // which are populated during the batch loop via AST extraction.
29
+ // Using only node.content ensures the hash stays stable.
30
+ const text = generateEmbeddingText({ ...node, methodNames: undefined, fieldNames: undefined }, node.content, config);
31
+ return createHash('sha1').update(text).digest('hex');
32
+ };
15
33
  /**
16
34
  * Query all embeddable nodes from LadybugDB
17
- * Uses table-specific queries (File has different schema than code elements)
35
+ * Uses table-specific queries for different label types
18
36
  */
19
37
  const queryEmbeddableNodes = async (executeQuery) => {
20
38
  const allNodes = [];
21
- // Query each embeddable table with table-specific columns
22
39
  for (const label of EMBEDDABLE_LABELS) {
23
40
  try {
24
41
  let query;
25
- if (label === 'File') {
26
- // File nodes don't have startLine/endLine
42
+ if (label === 'Method') {
43
+ // Method has parameterCount and returnType
27
44
  query = `
28
- MATCH (n:File)
29
- RETURN n.id AS id, n.name AS name, 'File' AS label,
30
- n.filePath AS filePath, n.content AS content
45
+ MATCH (n:Method)
46
+ RETURN n.id AS id, n.name AS name, 'Method' AS label,
47
+ n.filePath AS filePath, n.content AS content,
48
+ n.startLine AS startLine, n.endLine AS endLine,
49
+ n.isExported AS isExported, n.description AS description,
50
+ n.parameterCount AS parameterCount, n.returnType AS returnType
51
+ `;
52
+ }
53
+ else if (LABELS_WITH_EXPORTED.has(label)) {
54
+ // Function, Class, Interface have isExported and description
55
+ query = `
56
+ MATCH (n:\`${label}\`)
57
+ RETURN n.id AS id, n.name AS name, '${label}' AS label,
58
+ n.filePath AS filePath, n.content AS content,
59
+ n.startLine AS startLine, n.endLine AS endLine,
60
+ n.isExported AS isExported, n.description AS description
31
61
  `;
32
62
  }
33
63
  else {
34
- // Code elements have startLine/endLine
64
+ // Multi-language tables (Struct, Enum, etc.) — have description but no isExported
35
65
  query = `
36
- MATCH (n:${label})
37
- RETURN n.id AS id, n.name AS name, '${label}' AS label,
66
+ MATCH (n:\`${label}\`)
67
+ RETURN n.id AS id, n.name AS name, '${label}' AS label,
38
68
  n.filePath AS filePath, n.content AS content,
39
- n.startLine AS startLine, n.endLine AS endLine
69
+ n.startLine AS startLine, n.endLine AS endLine,
70
+ n.description AS description
40
71
  `;
41
72
  }
42
73
  const rows = await executeQuery(query);
43
74
  for (const row of rows) {
75
+ const hasExportedColumn = label === 'Method' || LABELS_WITH_EXPORTED.has(label);
44
76
  allNodes.push({
45
77
  id: row.id ?? row[0],
46
78
  name: row.name ?? row[1],
@@ -49,11 +81,18 @@ const queryEmbeddableNodes = async (executeQuery) => {
49
81
  content: row.content ?? row[4] ?? '',
50
82
  startLine: row.startLine ?? row[5],
51
83
  endLine: row.endLine ?? row[6],
84
+ isExported: hasExportedColumn ? (row.isExported ?? row[7]) : undefined,
85
+ description: row.description ?? (hasExportedColumn ? row[8] : row[7]),
86
+ ...(label === 'Method'
87
+ ? {
88
+ parameterCount: row.parameterCount ?? row[9],
89
+ returnType: row.returnType ?? row[10],
90
+ }
91
+ : {}),
52
92
  });
53
93
  }
54
94
  }
55
95
  catch (error) {
56
- // Table might not exist or be empty, continue
57
96
  if (isDev) {
58
97
  console.warn(`Query for ${label} nodes failed:`, error);
59
98
  }
@@ -62,42 +101,36 @@ const queryEmbeddableNodes = async (executeQuery) => {
62
101
  return allNodes;
63
102
  };
64
103
  /**
65
- * Batch INSERT embeddings into separate CodeEmbedding table
66
- * Using a separate lightweight table avoids copy-on-write overhead
67
- * that occurs when UPDATEing nodes with large content fields
104
+ * Batch INSERT chunk-aware embeddings into CodeEmbedding table
68
105
  */
69
- const batchInsertEmbeddings = async (executeWithReusedStatement, updates) => {
70
- // MERGE instead of CREATE idempotent, handles concurrent analyzes and partial prior runs
71
- const cypher = `MERGE (e:CodeEmbedding {nodeId: $nodeId}) SET e.embedding = $embedding`;
72
- const paramsList = updates.map((u) => ({ nodeId: u.id, embedding: u.embedding }));
106
+ export const batchInsertEmbeddings = async (executeWithReusedStatement, updates) => {
107
+ const cypher = `CREATE (e:${EMBEDDING_TABLE_NAME} {id: $id, nodeId: $nodeId, chunkIndex: $chunkIndex, startLine: $startLine, endLine: $endLine, embedding: $embedding, contentHash: $contentHash})`;
108
+ const paramsList = updates.map((u) => ({
109
+ id: `${u.nodeId}:${u.chunkIndex}`,
110
+ nodeId: u.nodeId,
111
+ chunkIndex: u.chunkIndex,
112
+ startLine: u.startLine,
113
+ endLine: u.endLine,
114
+ embedding: u.embedding,
115
+ contentHash: u.contentHash ?? STALE_HASH_SENTINEL,
116
+ }));
73
117
  await executeWithReusedStatement(cypher, paramsList);
74
118
  };
75
119
  /**
76
120
  * Create the vector index for semantic search
77
- * Now indexes the separate CodeEmbedding table
121
+
122
+ * Now indexes the separate CodeEmbedding table.
123
+ * Delegates extension loading to lbug-adapter's loadVectorExtension(),
124
+ * which owns the VECTOR extension lifecycle and state tracking.
125
+
78
126
  */
79
- let vectorExtensionLoaded = false;
80
127
  const createVectorIndex = async (executeQuery) => {
81
- // LadybugDB v0.15+ requires explicit VECTOR extension loading (once per session)
82
- if (!vectorExtensionLoaded) {
83
- try {
84
- await executeQuery('INSTALL VECTOR');
85
- await executeQuery('LOAD EXTENSION VECTOR');
86
- vectorExtensionLoaded = true;
87
- }
88
- catch {
89
- // Extension may already be loaded — CREATE_VECTOR_INDEX will fail clearly if not
90
- vectorExtensionLoaded = true;
91
- }
92
- }
93
- const cypher = `
94
- CALL CREATE_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', 'embedding', metric := 'cosine')
95
- `;
128
+ // Delegate to the adapter which tracks loaded state and handles DB reconnect resets
129
+ await loadVectorExtension();
96
130
  try {
97
- await executeQuery(cypher);
131
+ await executeQuery(CREATE_VECTOR_INDEX_QUERY);
98
132
  }
99
133
  catch (error) {
100
- // Index might already exist
101
134
  if (isDev) {
102
135
  console.warn('Vector index creation warning:', error);
103
136
  }
@@ -111,8 +144,13 @@ const createVectorIndex = async (executeQuery) => {
111
144
  * @param onProgress - Callback for progress updates
112
145
  * @param config - Optional configuration override
113
146
  * @param skipNodeIds - Optional set of node IDs that already have embeddings (incremental mode)
147
+ * @param context - Optional repo/server context for metadata enrichment
148
+ * @param existingEmbeddings - Optional map of nodeId → contentHash for incremental mode.
149
+ * Nodes whose hash matches are skipped; nodes with a changed hash are DELETE'd
150
+ * and re-embedded; nodes not in the map are embedded fresh.
151
+
114
152
  */
115
- export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatement, onProgress, config = {}, skipNodeIds) => {
153
+ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatement, onProgress, config = {}, skipNodeIds, context, existingEmbeddings) => {
116
154
  const finalConfig = { ...DEFAULT_EMBEDDING_CONFIG, ...config };
117
155
  try {
118
156
  // Phase 1: Load embedding model
@@ -141,12 +179,57 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
141
179
  }
142
180
  // Phase 2: Query embeddable nodes
143
181
  let nodes = await queryEmbeddableNodes(executeQuery);
144
- // Incremental mode: filter out nodes that already have embeddings
145
- if (skipNodeIds && skipNodeIds.size > 0) {
182
+ // Apply context metadata
183
+ if (context?.repoName) {
184
+ for (const node of nodes) {
185
+ node.repoName = context.repoName;
186
+ node.serverName = context.serverName;
187
+ }
188
+ }
189
+ // Incremental mode: compare content hashes, delete stale rows, skip fresh ones.
190
+ // Computed hashes for stale nodes are cached so batchInsertEmbeddings can reuse them
191
+ // (avoids double computation).
192
+ const computedStaleHashes = new Map();
193
+ if (existingEmbeddings && existingEmbeddings.size > 0) {
146
194
  const beforeCount = nodes.length;
147
- nodes = nodes.filter((n) => !skipNodeIds.has(n.id));
195
+ const staleNodeIds = [];
196
+ nodes = nodes.filter((n) => {
197
+ const existingHash = existingEmbeddings.get(n.id);
198
+ if (existingHash === undefined) {
199
+ // New node — needs embedding
200
+ return true;
201
+ }
202
+ const currentHash = contentHashForNode(n, finalConfig);
203
+ if (currentHash !== existingHash) {
204
+ // Content changed — cache hash for reuse during insert, mark for DELETE + re-embed
205
+ computedStaleHashes.set(n.id, currentHash);
206
+ staleNodeIds.push(n.id);
207
+ return true;
208
+ }
209
+ // Hash matches — skip (fresh); no need to cache hash for skipped nodes
210
+ return false;
211
+ });
212
+ // DELETE stale embedding rows so they can be re-inserted
213
+ // (Kuzu forbids SET on vector-indexed properties; DELETE-then-INSERT is the sanctioned pattern)
214
+ if (staleNodeIds.length > 0) {
215
+ if (isDev) {
216
+ console.log(`🔄 Deleting ${staleNodeIds.length} stale embedding rows for re-embed`);
217
+ }
218
+ try {
219
+ await executeWithReusedStatement(`MATCH (e:${EMBEDDING_TABLE_NAME} {nodeId: $nodeId}) DELETE e`, staleNodeIds.map((nodeId) => ({ nodeId })));
220
+ }
221
+ catch (err) {
222
+ // "does not exist" = rows already gone — safe to proceed.
223
+ // All other errors risk vector-index corruption (Kuzu requires DELETE-before-INSERT
224
+ // for vector-indexed properties) — propagate so the pipeline aborts cleanly.
225
+ const msg = err instanceof Error ? err.message : String(err);
226
+ if (!msg.includes('does not exist')) {
227
+ throw new Error(`[embed] Failed to delete stale embedding rows — aborting to prevent vector-index corruption: ${msg}`);
228
+ }
229
+ }
230
+ }
148
231
  if (isDev) {
149
- console.log(`📦 Incremental embeddings: ${beforeCount} total, ${skipNodeIds.size} cached, ${nodes.length} to embed`);
232
+ console.log(`📦 Incremental embeddings: ${beforeCount} total, ${existingEmbeddings.size} cached, ${staleNodeIds.length} stale, ${nodes.length} to embed`);
150
233
  }
151
234
  }
152
235
  const totalNodes = nodes.length;
@@ -154,6 +237,10 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
154
237
  console.log(`📊 Found ${totalNodes} embeddable nodes`);
155
238
  }
156
239
  if (totalNodes === 0) {
240
+ // Ensure the vector index exists even when no new nodes need embedding.
241
+ // A prior crash or first-time incremental run may have left CodeEmbedding
242
+ // rows without ever reaching index creation.
243
+ await createVectorIndex(executeQuery);
157
244
  onProgress({
158
245
  phase: 'ready',
159
246
  percent: 100,
@@ -162,42 +249,99 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
162
249
  });
163
250
  return;
164
251
  }
165
- // Phase 3: Batch embed nodes
252
+ // Phase 3: Chunk + embed nodes
166
253
  const batchSize = finalConfig.batchSize;
167
- const totalBatches = Math.ceil(totalNodes / batchSize);
254
+ const chunkSize = finalConfig.chunkSize;
255
+ const overlap = finalConfig.overlap;
168
256
  let processedNodes = 0;
257
+ let totalChunks = 0;
169
258
  onProgress({
170
259
  phase: 'embedding',
171
260
  percent: 20,
172
261
  nodesProcessed: 0,
173
262
  totalNodes,
174
263
  currentBatch: 0,
175
- totalBatches,
264
+ totalBatches: Math.ceil(totalNodes / batchSize),
176
265
  });
177
- for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) {
178
- const start = batchIndex * batchSize;
179
- const end = Math.min(start + batchSize, totalNodes);
180
- const batch = nodes.slice(start, end);
181
- // Generate texts for this batch
182
- const texts = generateBatchEmbeddingTexts(batch, finalConfig);
183
- // Embed the batch
184
- const embeddings = await embedBatch(texts);
185
- // Update LadybugDB with embeddings
186
- const updates = batch.map((node, i) => ({
187
- id: node.id,
188
- embedding: embeddingToArray(embeddings[i]),
189
- }));
190
- await batchInsertEmbeddings(executeWithReusedStatement, updates);
266
+ // Process in batches of nodes
267
+ for (let batchIndex = 0; batchIndex < totalNodes; batchIndex += batchSize) {
268
+ const batch = nodes.slice(batchIndex, batchIndex + batchSize);
269
+ // Chunk each node and generate text
270
+ const allTexts = [];
271
+ const allUpdates = [];
272
+ for (const node of batch) {
273
+ const isShort = isShortLabel(node.label);
274
+ const startLine = node.startLine ?? 0;
275
+ const endLine = node.endLine ?? 0;
276
+ // Extract structural names for class-like nodes via AST extractors
277
+ if (!isShort && STRUCTURAL_LABELS.has(node.label)) {
278
+ try {
279
+ const names = await extractStructuralNames(node.content, node.filePath);
280
+ node.methodNames = names.methodNames;
281
+ node.fieldNames = names.fieldNames;
282
+ }
283
+ catch {
284
+ // AST extraction failed — names stay undefined, text-generator handles gracefully
285
+ }
286
+ }
287
+ // Compute content hash once per node (re-use cached value for stale nodes)
288
+ const hash = computedStaleHashes.get(node.id) ?? contentHashForNode(node, finalConfig);
289
+ let chunks;
290
+ if (isShort) {
291
+ chunks = [{ text: node.content, chunkIndex: 0, startLine, endLine }];
292
+ }
293
+ else {
294
+ try {
295
+ chunks = await chunkNode(node.label, node.content, node.filePath, startLine, endLine, chunkSize, overlap);
296
+ }
297
+ catch (chunkErr) {
298
+ if (isDev) {
299
+ console.warn(`⚠️ AST chunking failed for ${node.label} "${node.name}" (${node.filePath}), falling back to character-based chunking:`, chunkErr);
300
+ }
301
+ chunks = characterChunk(node.content, startLine, endLine, chunkSize, overlap);
302
+ }
303
+ }
304
+ for (const chunk of chunks) {
305
+ const text = generateEmbeddingText(node, chunk.text, finalConfig);
306
+ allTexts.push(text);
307
+ allUpdates.push({
308
+ nodeId: node.id,
309
+ chunkIndex: chunk.chunkIndex,
310
+ startLine: chunk.startLine,
311
+ endLine: chunk.endLine,
312
+ contentHash: hash,
313
+ });
314
+ }
315
+ }
316
+ // Embed chunk texts in sub-batches to control memory
317
+ const EMBED_SUB_BATCH = 8;
318
+ for (let si = 0; si < allTexts.length; si += EMBED_SUB_BATCH) {
319
+ const subTexts = allTexts.slice(si, si + EMBED_SUB_BATCH);
320
+ const subUpdates = allUpdates.slice(si, si + EMBED_SUB_BATCH);
321
+ let embeddings;
322
+ try {
323
+ embeddings = await embedBatch(subTexts);
324
+ }
325
+ catch (embedErr) {
326
+ console.error(`❌ embedBatch failed for ${subTexts.length} texts (first: "${subTexts[0]?.substring(0, 80)}..."):`, embedErr);
327
+ throw embedErr;
328
+ }
329
+ const dbUpdates = subUpdates.map((u, i) => ({
330
+ ...u,
331
+ embedding: embeddingToArray(embeddings[i]),
332
+ }));
333
+ await batchInsertEmbeddings(executeWithReusedStatement, dbUpdates);
334
+ }
191
335
  processedNodes += batch.length;
192
- // Report progress (20-90% for embedding phase)
336
+ totalChunks += allUpdates.length;
193
337
  const embeddingProgress = 20 + (processedNodes / totalNodes) * 70;
194
338
  onProgress({
195
339
  phase: 'embedding',
196
340
  percent: Math.round(embeddingProgress),
197
341
  nodesProcessed: processedNodes,
198
342
  totalNodes,
199
- currentBatch: batchIndex + 1,
200
- totalBatches,
343
+ currentBatch: Math.floor(batchIndex / batchSize) + 1,
344
+ totalBatches: Math.ceil(totalNodes / batchSize),
201
345
  });
202
346
  }
203
347
  // Phase 4: Create vector index
@@ -211,7 +355,6 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
211
355
  console.log('📇 Creating vector index...');
212
356
  }
213
357
  await createVectorIndex(executeQuery);
214
- // Complete
215
358
  onProgress({
216
359
  phase: 'ready',
217
360
  percent: 100,
@@ -219,7 +362,7 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
219
362
  totalNodes,
220
363
  });
221
364
  if (isDev) {
222
- console.log('✅ Embedding pipeline complete!');
365
+ console.log(`✅ Embedding pipeline complete! (${totalChunks} chunks from ${totalNodes} nodes)`);
223
366
  }
224
367
  }
225
368
  catch (error) {
@@ -236,68 +379,57 @@ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatem
236
379
  }
237
380
  };
238
381
  /**
239
- * Perform semantic search using the vector index
240
- *
241
- * Uses CodeEmbedding table and queries each node table to get metadata
242
- *
243
- * @param executeQuery - Function to execute Cypher queries
244
- * @param query - Search query text
245
- * @param k - Number of results to return (default: 10)
246
- * @param maxDistance - Maximum distance threshold (default: 0.5)
247
- * @returns Array of search results ordered by relevance
382
+ * Perform semantic search using the vector index with chunk deduplication
248
383
  */
249
384
  export const semanticSearch = async (executeQuery, query, k = 10, maxDistance = 0.5) => {
250
385
  if (!isEmbedderReady()) {
251
386
  throw new Error('Embedding model not initialized. Run embedding pipeline first.');
252
387
  }
253
- // Embed the query
254
388
  const queryEmbedding = await embedText(query);
255
389
  const queryVec = embeddingToArray(queryEmbedding);
256
390
  const queryVecStr = `[${queryVec.join(',')}]`;
257
- // Query the vector index on CodeEmbedding to get nodeIds and distances
258
- const vectorQuery = `
259
- CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx',
260
- CAST(${queryVecStr} AS FLOAT[${queryVec.length}]), ${k})
261
- YIELD node AS emb, distance
262
- WITH emb, distance
263
- WHERE distance < ${maxDistance}
264
- RETURN emb.nodeId AS nodeId, distance
265
- ORDER BY distance
266
- `;
267
- const embResults = await executeQuery(vectorQuery);
268
- if (embResults.length === 0) {
391
+ const bestChunks = await collectBestChunks(k, async (fetchLimit) => {
392
+ const vectorQuery = `
393
+ CALL QUERY_VECTOR_INDEX('${EMBEDDING_TABLE_NAME}', '${EMBEDDING_INDEX_NAME}',
394
+ CAST(${queryVecStr} AS FLOAT[${queryVec.length}]), ${fetchLimit})
395
+ YIELD node AS emb, distance
396
+ WITH emb, distance
397
+ WHERE distance < ${maxDistance}
398
+ RETURN emb.nodeId AS nodeId, emb.chunkIndex AS chunkIndex,
399
+ emb.startLine AS startLine, emb.endLine AS endLine, distance
400
+ ORDER BY distance
401
+ `;
402
+ const embResults = await executeQuery(vectorQuery);
403
+ return embResults.map((row) => ({
404
+ nodeId: row.nodeId ?? row[0],
405
+ chunkIndex: row.chunkIndex ?? row[1] ?? 0,
406
+ startLine: row.startLine ?? row[2] ?? 0,
407
+ endLine: row.endLine ?? row[3] ?? 0,
408
+ distance: row.distance ?? row[4],
409
+ }));
410
+ });
411
+ if (bestChunks.size === 0) {
269
412
  return [];
270
413
  }
271
414
  // Group results by label for batched metadata queries
272
415
  const byLabel = new Map();
273
- for (const embRow of embResults) {
274
- const nodeId = embRow.nodeId ?? embRow[0];
275
- const distance = embRow.distance ?? embRow[1];
416
+ for (const [nodeId, chunk] of Array.from(bestChunks.entries()).slice(0, k)) {
276
417
  const labelEndIdx = nodeId.indexOf(':');
277
418
  const label = labelEndIdx > 0 ? nodeId.substring(0, labelEndIdx) : 'Unknown';
278
419
  if (!byLabel.has(label))
279
420
  byLabel.set(label, []);
280
- byLabel.get(label).push({ nodeId, distance });
421
+ byLabel.get(label).push({ nodeId, ...chunk });
281
422
  }
282
423
  // Batch-fetch metadata per label
283
424
  const results = [];
284
425
  for (const [label, items] of byLabel) {
285
426
  const idList = items.map((i) => `'${i.nodeId.replace(/'/g, "''")}'`).join(', ');
286
427
  try {
287
- let nodeQuery;
288
- if (label === 'File') {
289
- nodeQuery = `
290
- MATCH (n:File) WHERE n.id IN [${idList}]
291
- RETURN n.id AS id, n.name AS name, n.filePath AS filePath
292
- `;
293
- }
294
- else {
295
- nodeQuery = `
296
- MATCH (n:${label}) WHERE n.id IN [${idList}]
297
- RETURN n.id AS id, n.name AS name, n.filePath AS filePath,
298
- n.startLine AS startLine, n.endLine AS endLine
299
- `;
300
- }
428
+ const nodeQuery = `
429
+ MATCH (n:\`${label}\`) WHERE n.id IN [${idList}]
430
+ RETURN n.id AS id, n.name AS name, n.filePath AS filePath,
431
+ n.startLine AS startLine, n.endLine AS endLine
432
+ `;
301
433
  const nodeRows = await executeQuery(nodeQuery);
302
434
  const rowMap = new Map();
303
435
  for (const row of nodeRows) {
@@ -313,8 +445,8 @@ export const semanticSearch = async (executeQuery, query, k = 10, maxDistance =
313
445
  label,
314
446
  filePath: nodeRow.filePath ?? nodeRow[2] ?? '',
315
447
  distance: item.distance,
316
- startLine: label !== 'File' ? (nodeRow.startLine ?? nodeRow[3]) : undefined,
317
- endLine: label !== 'File' ? (nodeRow.endLine ?? nodeRow[4]) : undefined,
448
+ startLine: item.startLine,
449
+ endLine: item.endLine,
318
450
  });
319
451
  }
320
452
  }
@@ -323,26 +455,13 @@ export const semanticSearch = async (executeQuery, query, k = 10, maxDistance =
323
455
  // Table might not exist, skip
324
456
  }
325
457
  }
326
- // Re-sort by distance since batch queries may have mixed order
327
458
  results.sort((a, b) => a.distance - b.distance);
328
459
  return results;
329
460
  };
330
461
  /**
331
462
  * Semantic search with graph expansion (flattened results)
332
- *
333
- * Note: With multi-table schema, graph traversal is simplified.
334
- * Returns semantic matches with their metadata.
335
- * For full graph traversal, use execute_vector_cypher tool directly.
336
- *
337
- * @param executeQuery - Function to execute Cypher queries
338
- * @param query - Search query text
339
- * @param k - Number of initial semantic matches (default: 5)
340
- * @param _hops - Unused (kept for API compatibility).
341
- * @returns Semantic matches with metadata
342
463
  */
343
464
  export const semanticSearchWithContext = async (executeQuery, query, k = 5, _hops = 1) => {
344
- // For multi-table schema, just return semantic search results
345
- // Graph traversal is complex with separate tables - use execute_vector_cypher instead
346
465
  const results = await semanticSearch(executeQuery, query, k, 0.5);
347
466
  return results.map((r) => ({
348
467
  matchId: r.nodeId,
@@ -0,0 +1,7 @@
1
+ export interface ResolvedLineRange {
2
+ startLine: number;
3
+ endLine: number;
4
+ }
5
+ export declare const buildLineIndex: (content: string) => Int32Array;
6
+ export declare const lineFromOffset: (lineOffsets: Int32Array, charOffset: number) => number;
7
+ export declare const resolveChunkLines: (lineOffsets: Int32Array, startOffset: number, endOffset: number, baseStartLine: number) => ResolvedLineRange;
@@ -0,0 +1,42 @@
1
/**
 * Build a table of line-start offsets for `content`.
 *
 * Entry 0 is always 0 (the first line starts at the first character); every
 * later entry is the offset of the character immediately after a `\n`.
 * A `\r\n` pair therefore counts as a single line break, because only the
 * `\n` is looked for.
 *
 * @param content - Text to index.
 * @returns Int32Array of ascending line-start offsets (length = number of `\n` + 1).
 */
export const buildLineIndex = (content) => {
    const lineStarts = [0];
    let newlineAt = content.indexOf('\n');
    while (newlineAt !== -1) {
        // The next line begins right after the newline character.
        lineStarts.push(newlineAt + 1);
        newlineAt = content.indexOf('\n', newlineAt + 1);
    }
    return Int32Array.from(lineStarts);
};
9
/**
 * Clamp `charOffset` into the range [0, start of the last indexed line].
 *
 * The upper bound is the last entry of `lineOffsets`, i.e. the offset at
 * which the final line begins, so any offset at or beyond it maps to that
 * final line. An empty index yields 0.
 *
 * @param lineOffsets - Line-start offsets in ascending order.
 * @param charOffset - Character offset to clamp.
 * @returns The clamped offset.
 */
const clampOffset = (lineOffsets, charOffset) => {
    const lastIndex = lineOffsets.length - 1;
    if (lastIndex < 0)
        return 0;
    const upperBound = lineOffsets[lastIndex];
    return charOffset < 0 ? 0 : charOffset > upperBound ? upperBound : charOffset;
};
19
/**
 * Map a character offset to the zero-based index of the line containing it.
 *
 * Performs a binary search for the last entry of `lineOffsets` that is less
 * than or equal to the (clamped) offset. An empty index yields 0.
 *
 * @param lineOffsets - Line-start offsets in ascending order.
 * @param charOffset - Character offset to locate; out-of-range values are clamped first.
 * @returns Zero-based line index.
 */
export const lineFromOffset = (lineOffsets, charOffset) => {
    if (lineOffsets.length === 0)
        return 0;
    const target = clampOffset(lineOffsets, charOffset);
    let floor = 0;
    for (let ceiling = lineOffsets.length - 1; floor < ceiling;) {
        // Round the midpoint up so that `floor = probe` always makes progress.
        const probe = (floor + ceiling + 1) >> 1;
        if (lineOffsets[probe] <= target) {
            floor = probe;
        }
        else {
            ceiling = probe - 1;
        }
    }
    return floor;
};
34
/**
 * Convert a chunk's character range into a line range.
 *
 * Both offsets are resolved to line indices through `lineFromOffset` and then
 * shifted by `baseStartLine`.
 *
 * @param lineOffsets - Line-start offsets for the text the offsets refer to.
 * @param startOffset - Offset of the chunk's first character.
 * @param endOffset - Offset just past the chunk's last character.
 * @param baseStartLine - Line number to add to both resolved line indices.
 * @returns `{ startLine, endLine }` for the chunk.
 */
export const resolveChunkLines = (lineOffsets, startOffset, endOffset, baseStartLine) => {
    // For a non-empty range, step back one character so that a range ending
    // exactly at a line start does not spill onto that following line.
    const lastCharOffset = endOffset > startOffset ? endOffset - 1 : startOffset;
    const firstLine = lineFromOffset(lineOffsets, startOffset);
    const lastLine = lineFromOffset(lineOffsets, lastCharOffset);
    return {
        startLine: baseStartLine + firstLine,
        endLine: baseStartLine + lastLine,
    };
};
@@ -0,0 +1,15 @@
1
+ /**
2
+ * Server Mapping Configuration
3
+ *
4
+ * Reads ~/.gitnexus/server-mapping.json to map repo names to service names.
5
+ * Used in embedding text to enrich metadata with microservice context.
6
+ */
7
+ /**
8
+ * Read the server mapping file and return the serverName for a given repoName.
9
+ * Returns undefined if no mapping exists.
10
+ */
11
+ export declare const readServerMapping: (repoName: string) => Promise<string | undefined>;
12
+ /**
13
+ * Clear the cached mapping (useful for testing or after file changes)
14
+ */
15
+ export declare const clearServerMappingCache: () => void;
@@ -0,0 +1,33 @@
1
+ /**
2
+ * Server Mapping Configuration
3
+ *
4
+ * Reads ~/.gitnexus/server-mapping.json to map repo names to service names.
5
+ * Used in embedding text to enrich metadata with microservice context.
6
+ */
7
+ import fs from 'fs/promises';
8
+ import path from 'path';
9
+ import os from 'os';
10
+ const MAPPING_FILE = path.join(os.homedir(), '.gitnexus', 'server-mapping.json');
11
+ let cachedMapping = null;
12
/**
 * Read the server mapping file and return the serverName for a given repoName.
 *
 * The file is read and parsed on first use and the parsed object is kept in
 * `cachedMapping` for later calls. Any failure (missing file, unreadable
 * file, invalid JSON) is treated as "no mapping" rather than an error.
 *
 * @param repoName - Repository name to look up.
 * @returns The mapped server name, or undefined when the file cannot be
 *   read, is not a JSON object, or has no string entry for `repoName`.
 */
export const readServerMapping = async (repoName) => {
    try {
        if (!cachedMapping) {
            const raw = await fs.readFile(MAPPING_FILE, 'utf-8');
            const parsed = JSON.parse(raw);
            // Only a plain JSON object can act as a name -> name mapping; do not
            // cache arrays, strings, numbers or null as if they were one.
            if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
                return undefined;
            }
            cachedMapping = parsed;
        }
        // Own-property check: a repo called "constructor" or "toString" must not
        // resolve to something inherited from Object.prototype.
        if (!Object.prototype.hasOwnProperty.call(cachedMapping, repoName)) {
            return undefined;
        }
        const serverName = cachedMapping[repoName];
        // The declared contract is string | undefined; ignore malformed entries.
        return typeof serverName === 'string' ? serverName : undefined;
    }
    catch {
        // Best-effort lookup: the mapping file is optional.
        return undefined;
    }
};
28
/**
 * Drop the in-memory mapping so that the next `readServerMapping` call
 * re-reads the mapping file (useful for testing or after file changes).
 */
export const clearServerMappingCache = () => {
    cachedMapping = null;
};
@@ -0,0 +1,15 @@
1
+ /**
2
+ * Structural Extractor Module
3
+ *
4
+ * Reuses ingestion pipeline's AST-based MethodExtractor / FieldExtractor
5
+ * to extract method and field names for embedding text generation.
6
+ */
7
+ export interface StructuralNames {
8
+ methodNames: string[];
9
+ fieldNames: string[];
10
+ }
11
+ /**
12
+ * Extract method and field names from a class/struct/interface node
13
+ * using the ingestion pipeline's AST extractors.
14
+ */
15
+ export declare const extractStructuralNames: (content: string, filePath: string) => Promise<StructuralNames>;