@duytransipher/gitnexus 1.4.6-sipher.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224) hide show
  1. package/LICENSE +73 -0
  2. package/README.md +261 -0
  3. package/dist/cli/ai-context.d.ts +23 -0
  4. package/dist/cli/ai-context.js +265 -0
  5. package/dist/cli/analyze.d.ts +12 -0
  6. package/dist/cli/analyze.js +345 -0
  7. package/dist/cli/augment.d.ts +13 -0
  8. package/dist/cli/augment.js +33 -0
  9. package/dist/cli/clean.d.ts +10 -0
  10. package/dist/cli/clean.js +60 -0
  11. package/dist/cli/eval-server.d.ts +37 -0
  12. package/dist/cli/eval-server.js +389 -0
  13. package/dist/cli/index.d.ts +2 -0
  14. package/dist/cli/index.js +137 -0
  15. package/dist/cli/lazy-action.d.ts +6 -0
  16. package/dist/cli/lazy-action.js +18 -0
  17. package/dist/cli/list.d.ts +6 -0
  18. package/dist/cli/list.js +30 -0
  19. package/dist/cli/mcp.d.ts +8 -0
  20. package/dist/cli/mcp.js +36 -0
  21. package/dist/cli/serve.d.ts +4 -0
  22. package/dist/cli/serve.js +6 -0
  23. package/dist/cli/setup.d.ts +8 -0
  24. package/dist/cli/setup.js +367 -0
  25. package/dist/cli/sipher-patched.d.ts +2 -0
  26. package/dist/cli/sipher-patched.js +77 -0
  27. package/dist/cli/skill-gen.d.ts +26 -0
  28. package/dist/cli/skill-gen.js +549 -0
  29. package/dist/cli/status.d.ts +6 -0
  30. package/dist/cli/status.js +36 -0
  31. package/dist/cli/tool.d.ts +60 -0
  32. package/dist/cli/tool.js +180 -0
  33. package/dist/cli/wiki.d.ts +15 -0
  34. package/dist/cli/wiki.js +365 -0
  35. package/dist/config/ignore-service.d.ts +26 -0
  36. package/dist/config/ignore-service.js +284 -0
  37. package/dist/config/supported-languages.d.ts +15 -0
  38. package/dist/config/supported-languages.js +16 -0
  39. package/dist/core/augmentation/engine.d.ts +26 -0
  40. package/dist/core/augmentation/engine.js +240 -0
  41. package/dist/core/embeddings/embedder.d.ts +60 -0
  42. package/dist/core/embeddings/embedder.js +251 -0
  43. package/dist/core/embeddings/embedding-pipeline.d.ts +51 -0
  44. package/dist/core/embeddings/embedding-pipeline.js +356 -0
  45. package/dist/core/embeddings/index.d.ts +9 -0
  46. package/dist/core/embeddings/index.js +9 -0
  47. package/dist/core/embeddings/text-generator.d.ts +24 -0
  48. package/dist/core/embeddings/text-generator.js +182 -0
  49. package/dist/core/embeddings/types.d.ts +87 -0
  50. package/dist/core/embeddings/types.js +32 -0
  51. package/dist/core/graph/graph.d.ts +2 -0
  52. package/dist/core/graph/graph.js +66 -0
  53. package/dist/core/graph/types.d.ts +66 -0
  54. package/dist/core/graph/types.js +1 -0
  55. package/dist/core/ingestion/ast-cache.d.ts +11 -0
  56. package/dist/core/ingestion/ast-cache.js +35 -0
  57. package/dist/core/ingestion/call-processor.d.ts +23 -0
  58. package/dist/core/ingestion/call-processor.js +793 -0
  59. package/dist/core/ingestion/call-routing.d.ts +68 -0
  60. package/dist/core/ingestion/call-routing.js +129 -0
  61. package/dist/core/ingestion/cluster-enricher.d.ts +38 -0
  62. package/dist/core/ingestion/cluster-enricher.js +170 -0
  63. package/dist/core/ingestion/community-processor.d.ts +39 -0
  64. package/dist/core/ingestion/community-processor.js +312 -0
  65. package/dist/core/ingestion/constants.d.ts +16 -0
  66. package/dist/core/ingestion/constants.js +16 -0
  67. package/dist/core/ingestion/entry-point-scoring.d.ts +40 -0
  68. package/dist/core/ingestion/entry-point-scoring.js +353 -0
  69. package/dist/core/ingestion/export-detection.d.ts +18 -0
  70. package/dist/core/ingestion/export-detection.js +231 -0
  71. package/dist/core/ingestion/filesystem-walker.d.ts +28 -0
  72. package/dist/core/ingestion/filesystem-walker.js +81 -0
  73. package/dist/core/ingestion/framework-detection.d.ts +54 -0
  74. package/dist/core/ingestion/framework-detection.js +411 -0
  75. package/dist/core/ingestion/heritage-processor.d.ts +28 -0
  76. package/dist/core/ingestion/heritage-processor.js +251 -0
  77. package/dist/core/ingestion/import-processor.d.ts +34 -0
  78. package/dist/core/ingestion/import-processor.js +398 -0
  79. package/dist/core/ingestion/language-config.d.ts +46 -0
  80. package/dist/core/ingestion/language-config.js +167 -0
  81. package/dist/core/ingestion/mro-processor.d.ts +45 -0
  82. package/dist/core/ingestion/mro-processor.js +369 -0
  83. package/dist/core/ingestion/named-binding-extraction.d.ts +61 -0
  84. package/dist/core/ingestion/named-binding-extraction.js +363 -0
  85. package/dist/core/ingestion/parsing-processor.d.ts +19 -0
  86. package/dist/core/ingestion/parsing-processor.js +315 -0
  87. package/dist/core/ingestion/pipeline.d.ts +6 -0
  88. package/dist/core/ingestion/pipeline.js +401 -0
  89. package/dist/core/ingestion/process-processor.d.ts +51 -0
  90. package/dist/core/ingestion/process-processor.js +315 -0
  91. package/dist/core/ingestion/resolution-context.d.ts +53 -0
  92. package/dist/core/ingestion/resolution-context.js +132 -0
  93. package/dist/core/ingestion/resolvers/csharp.d.ts +22 -0
  94. package/dist/core/ingestion/resolvers/csharp.js +109 -0
  95. package/dist/core/ingestion/resolvers/go.d.ts +19 -0
  96. package/dist/core/ingestion/resolvers/go.js +42 -0
  97. package/dist/core/ingestion/resolvers/index.d.ts +18 -0
  98. package/dist/core/ingestion/resolvers/index.js +13 -0
  99. package/dist/core/ingestion/resolvers/jvm.d.ts +23 -0
  100. package/dist/core/ingestion/resolvers/jvm.js +87 -0
  101. package/dist/core/ingestion/resolvers/php.d.ts +15 -0
  102. package/dist/core/ingestion/resolvers/php.js +35 -0
  103. package/dist/core/ingestion/resolvers/python.d.ts +19 -0
  104. package/dist/core/ingestion/resolvers/python.js +52 -0
  105. package/dist/core/ingestion/resolvers/ruby.d.ts +12 -0
  106. package/dist/core/ingestion/resolvers/ruby.js +15 -0
  107. package/dist/core/ingestion/resolvers/rust.d.ts +15 -0
  108. package/dist/core/ingestion/resolvers/rust.js +73 -0
  109. package/dist/core/ingestion/resolvers/standard.d.ts +28 -0
  110. package/dist/core/ingestion/resolvers/standard.js +123 -0
  111. package/dist/core/ingestion/resolvers/utils.d.ts +33 -0
  112. package/dist/core/ingestion/resolvers/utils.js +122 -0
  113. package/dist/core/ingestion/structure-processor.d.ts +2 -0
  114. package/dist/core/ingestion/structure-processor.js +36 -0
  115. package/dist/core/ingestion/symbol-table.d.ts +63 -0
  116. package/dist/core/ingestion/symbol-table.js +85 -0
  117. package/dist/core/ingestion/tree-sitter-queries.d.ts +15 -0
  118. package/dist/core/ingestion/tree-sitter-queries.js +888 -0
  119. package/dist/core/ingestion/type-env.d.ts +49 -0
  120. package/dist/core/ingestion/type-env.js +613 -0
  121. package/dist/core/ingestion/type-extractors/c-cpp.d.ts +2 -0
  122. package/dist/core/ingestion/type-extractors/c-cpp.js +385 -0
  123. package/dist/core/ingestion/type-extractors/csharp.d.ts +2 -0
  124. package/dist/core/ingestion/type-extractors/csharp.js +383 -0
  125. package/dist/core/ingestion/type-extractors/go.d.ts +2 -0
  126. package/dist/core/ingestion/type-extractors/go.js +467 -0
  127. package/dist/core/ingestion/type-extractors/index.d.ts +22 -0
  128. package/dist/core/ingestion/type-extractors/index.js +31 -0
  129. package/dist/core/ingestion/type-extractors/jvm.d.ts +3 -0
  130. package/dist/core/ingestion/type-extractors/jvm.js +681 -0
  131. package/dist/core/ingestion/type-extractors/php.d.ts +2 -0
  132. package/dist/core/ingestion/type-extractors/php.js +549 -0
  133. package/dist/core/ingestion/type-extractors/python.d.ts +2 -0
  134. package/dist/core/ingestion/type-extractors/python.js +455 -0
  135. package/dist/core/ingestion/type-extractors/ruby.d.ts +2 -0
  136. package/dist/core/ingestion/type-extractors/ruby.js +389 -0
  137. package/dist/core/ingestion/type-extractors/rust.d.ts +2 -0
  138. package/dist/core/ingestion/type-extractors/rust.js +456 -0
  139. package/dist/core/ingestion/type-extractors/shared.d.ts +145 -0
  140. package/dist/core/ingestion/type-extractors/shared.js +810 -0
  141. package/dist/core/ingestion/type-extractors/swift.d.ts +2 -0
  142. package/dist/core/ingestion/type-extractors/swift.js +137 -0
  143. package/dist/core/ingestion/type-extractors/types.d.ts +127 -0
  144. package/dist/core/ingestion/type-extractors/types.js +1 -0
  145. package/dist/core/ingestion/type-extractors/typescript.d.ts +2 -0
  146. package/dist/core/ingestion/type-extractors/typescript.js +494 -0
  147. package/dist/core/ingestion/utils.d.ts +138 -0
  148. package/dist/core/ingestion/utils.js +1290 -0
  149. package/dist/core/ingestion/workers/parse-worker.d.ts +122 -0
  150. package/dist/core/ingestion/workers/parse-worker.js +1126 -0
  151. package/dist/core/ingestion/workers/worker-pool.d.ts +16 -0
  152. package/dist/core/ingestion/workers/worker-pool.js +128 -0
  153. package/dist/core/lbug/csv-generator.d.ts +33 -0
  154. package/dist/core/lbug/csv-generator.js +366 -0
  155. package/dist/core/lbug/lbug-adapter.d.ts +103 -0
  156. package/dist/core/lbug/lbug-adapter.js +769 -0
  157. package/dist/core/lbug/schema.d.ts +53 -0
  158. package/dist/core/lbug/schema.js +430 -0
  159. package/dist/core/search/bm25-index.d.ts +23 -0
  160. package/dist/core/search/bm25-index.js +96 -0
  161. package/dist/core/search/hybrid-search.d.ts +49 -0
  162. package/dist/core/search/hybrid-search.js +118 -0
  163. package/dist/core/tree-sitter/parser-loader.d.ts +5 -0
  164. package/dist/core/tree-sitter/parser-loader.js +63 -0
  165. package/dist/core/wiki/generator.d.ts +120 -0
  166. package/dist/core/wiki/generator.js +939 -0
  167. package/dist/core/wiki/graph-queries.d.ts +80 -0
  168. package/dist/core/wiki/graph-queries.js +238 -0
  169. package/dist/core/wiki/html-viewer.d.ts +10 -0
  170. package/dist/core/wiki/html-viewer.js +297 -0
  171. package/dist/core/wiki/llm-client.d.ts +43 -0
  172. package/dist/core/wiki/llm-client.js +186 -0
  173. package/dist/core/wiki/prompts.d.ts +53 -0
  174. package/dist/core/wiki/prompts.js +174 -0
  175. package/dist/lib/utils.d.ts +1 -0
  176. package/dist/lib/utils.js +3 -0
  177. package/dist/mcp/compatible-stdio-transport.d.ts +25 -0
  178. package/dist/mcp/compatible-stdio-transport.js +200 -0
  179. package/dist/mcp/core/embedder.d.ts +27 -0
  180. package/dist/mcp/core/embedder.js +108 -0
  181. package/dist/mcp/core/lbug-adapter.d.ts +57 -0
  182. package/dist/mcp/core/lbug-adapter.js +455 -0
  183. package/dist/mcp/local/local-backend.d.ts +181 -0
  184. package/dist/mcp/local/local-backend.js +1722 -0
  185. package/dist/mcp/resources.d.ts +31 -0
  186. package/dist/mcp/resources.js +411 -0
  187. package/dist/mcp/server.d.ts +23 -0
  188. package/dist/mcp/server.js +296 -0
  189. package/dist/mcp/staleness.d.ts +15 -0
  190. package/dist/mcp/staleness.js +29 -0
  191. package/dist/mcp/tools.d.ts +24 -0
  192. package/dist/mcp/tools.js +292 -0
  193. package/dist/server/api.d.ts +10 -0
  194. package/dist/server/api.js +344 -0
  195. package/dist/server/mcp-http.d.ts +13 -0
  196. package/dist/server/mcp-http.js +100 -0
  197. package/dist/storage/git.d.ts +6 -0
  198. package/dist/storage/git.js +35 -0
  199. package/dist/storage/repo-manager.d.ts +138 -0
  200. package/dist/storage/repo-manager.js +299 -0
  201. package/dist/types/pipeline.d.ts +32 -0
  202. package/dist/types/pipeline.js +18 -0
  203. package/dist/unreal/bridge.d.ts +4 -0
  204. package/dist/unreal/bridge.js +113 -0
  205. package/dist/unreal/config.d.ts +6 -0
  206. package/dist/unreal/config.js +55 -0
  207. package/dist/unreal/types.d.ts +105 -0
  208. package/dist/unreal/types.js +1 -0
  209. package/hooks/claude/gitnexus-hook.cjs +238 -0
  210. package/hooks/claude/pre-tool-use.sh +79 -0
  211. package/hooks/claude/session-start.sh +42 -0
  212. package/package.json +100 -0
  213. package/scripts/ensure-cli-executable.cjs +21 -0
  214. package/scripts/patch-tree-sitter-swift.cjs +74 -0
  215. package/scripts/setup-unreal-gitnexus.ps1 +191 -0
  216. package/skills/gitnexus-cli.md +82 -0
  217. package/skills/gitnexus-debugging.md +89 -0
  218. package/skills/gitnexus-exploring.md +78 -0
  219. package/skills/gitnexus-guide.md +64 -0
  220. package/skills/gitnexus-impact-analysis.md +97 -0
  221. package/skills/gitnexus-pr-review.md +163 -0
  222. package/skills/gitnexus-refactoring.md +121 -0
  223. package/vendor/leiden/index.cjs +355 -0
  224. package/vendor/leiden/utils.cjs +392 -0
@@ -0,0 +1,251 @@
1
+ /**
2
+ * Embedder Module
3
+ *
4
+ * Singleton factory for transformers.js embedding pipeline.
5
+ * Handles model loading, caching, and both single and batch embedding operations.
6
+ *
7
+ * Uses snowflake-arctic-embed-xs by default (22M params, 384 dims, ~90MB)
8
+ */
9
// Quiet ONNX Runtime's native warnings (e.g. VerifyEachNodeIsAssignedToAnEp).
// Severity level 3 means "errors only", so Warning/Info output is dropped.
// The value is meant to be in place before transformers.js pulls in onnxruntime-node.
// NOTE(review): static ESM imports are evaluated before this module body runs —
// confirm the runtime reads the variable lazily, otherwise this assignment is too late.
// A value already supplied by the environment is left untouched.
if (process.env.ORT_LOG_LEVEL === undefined || process.env.ORT_LOG_LEVEL === '') {
    process.env.ORT_LOG_LEVEL = '3';
}
15
+ import { pipeline, env } from '@huggingface/transformers';
16
+ import { existsSync } from 'fs';
17
+ import { execFileSync } from 'child_process';
18
+ import { join } from 'path';
19
+ import { DEFAULT_EMBEDDING_CONFIG } from './types.js';
20
/**
 * Check whether the CUDA libraries ONNX Runtime needs are present on this system.
 *
 * ONNX Runtime's native layer crashes (uncatchable) if CUDA is requested without
 * the required shared libraries, so callers probe with this function first.
 *
 * The probe is restricted to Linux x64, the only platform on which
 * onnxruntime-node offers CUDA; everywhere else it returns false immediately
 * instead of spawning `ldconfig`.
 *
 * Lookup order:
 *  1. the dynamic linker cache (`ldconfig -p`), which covers distro layouts and
 *     custom install paths registered with ldconfig;
 *  2. every directory listed in CUDA_PATH / LD_LIBRARY_PATH (checked directly and
 *     under `lib64/` and `lib/`), for installs ldconfig does not know about
 *     (conda, manual /opt/cuda, etc.).
 *
 * @returns {boolean} true when libcublasLt.so.12 was found, false otherwise
 */
function isCudaAvailable() {
    if (process.platform !== 'linux' || process.arch !== 'x64') {
        return false;
    }
    const cudaLibrary = 'libcublasLt.so.12';
    try {
        const linkerCache = execFileSync('ldconfig', ['-p'], { timeout: 3000, encoding: 'utf-8' });
        if (linkerCache.includes(cudaLibrary)) {
            return true;
        }
    }
    catch {
        // ldconfig not available (e.g. non-standard container) — fall through to the env probe
    }
    const candidateDirs = ['CUDA_PATH', 'LD_LIBRARY_PATH']
        .flatMap((envVar) => (process.env[envVar] ?? '').split(':'))
        .filter(Boolean);
    // An empty sub-directory segment makes join() test the directory itself.
    const subDirs = ['lib64', 'lib', ''];
    return candidateDirs.some((dir) => subDirs.some((sub) => existsSync(join(dir, sub, cudaLibrary))));
}
54
// Singleton bookkeeping shared by the exports of this module.
let embedderInstance = null; // the loaded pipeline, or null when none is held
let isInitializing = false; // true while a model load is in flight
let initPromise = null; // promise of the current load, cleared on failure or disposal
let currentDevice = null; // device name recorded when a load succeeds
/**
 * Report the device recorded for inference.
 *
 * @returns The device name stored when the model was last loaded, or null when no device is recorded
 */
export const getCurrentDevice = () => {
    return currentDevice;
};
63
/**
 * Initialize the embedding model (singleton).
 *
 * The first call loads the pipeline; later calls return the cached instance, and
 * calls made while a load is in flight share the same promise. The `config` and
 * `forceDevice` arguments of those later calls are therefore ignored.
 *
 * Device selection: `forceDevice` wins, then an explicit `config.device`. With
 * 'auto' the choice is DirectML ('dml') on Windows, otherwise CUDA when the
 * library probe succeeds, otherwise CPU. The probe only runs in the 'auto' case,
 * so an explicit device never spawns the `ldconfig` subprocess. A 'dml' or 'cuda'
 * attempt that fails is retried once on 'cpu'.
 *
 * @param onProgress - Optional callback for model download progress
 * @param config - Optional configuration override (merged over DEFAULT_EMBEDDING_CONFIG)
 * @param forceDevice - Force a specific device
 * @returns Promise resolving to the embedder pipeline
 * @throws Rethrows the load error of the last device tried; the singleton state
 *         is reset first so a later call can retry
 */
export const initEmbedder = async (onProgress, config = {}, forceDevice) => {
    // Return existing instance if available
    if (embedderInstance) {
        return embedderInstance;
    }
    // If already initializing, wait for that promise
    if (isInitializing && initPromise) {
        return initPromise;
    }
    isInitializing = true;
    const finalConfig = { ...DEFAULT_EMBEDDING_CONFIG, ...config };
    initPromise = (async () => {
        try {
            // Resolved inside the try block so that a failing probe still resets
            // the initialization flag in `finally`.
            const detectGpuDevice = () => {
                if (process.platform === 'win32') {
                    return 'dml';
                }
                // Probing first matters: requesting CUDA without its shared
                // libraries crashes ONNX Runtime with an uncatchable native error.
                return isCudaAvailable() ? 'cuda' : 'cpu';
            };
            const requestedDevice = forceDevice
                || (finalConfig.device === 'auto' ? detectGpuDevice() : finalConfig.device);
            // Configure transformers.js environment
            env.allowLocalModels = false;
            const isDev = process.env.NODE_ENV === 'development';
            if (isDev) {
                console.log(`🧠 Loading embedding model: ${finalConfig.modelId}`);
            }
            const progressCallback = onProgress ? (data) => {
                onProgress({
                    status: data.status || 'progress',
                    file: data.file,
                    progress: data.progress,
                    loaded: data.loaded,
                    total: data.total,
                });
            } : undefined;
            const startBanner = new Map([
                ['dml', '🔧 Trying DirectML (DirectX12) GPU backend...'],
                ['cuda', '🔧 Trying CUDA GPU backend...'],
                ['cpu', '🔧 Using CPU backend...'],
                ['wasm', '🔧 Using WASM backend (slower)...'],
            ]);
            // GPU backends get a CPU fallback; anything else is tried on its own.
            const devicesToTry = (requestedDevice === 'dml' || requestedDevice === 'cuda')
                ? [requestedDevice, 'cpu']
                : [requestedDevice];
            for (const [index, device] of devicesToTry.entries()) {
                try {
                    if (isDev && startBanner.has(device)) {
                        console.log(startBanner.get(device));
                    }
                    embedderInstance = await pipeline('feature-extraction', finalConfig.modelId, {
                        device: device,
                        dtype: 'fp32',
                        progress_callback: progressCallback,
                        session_options: { logSeverityLevel: 3 },
                    });
                    currentDevice = device;
                    if (isDev) {
                        const label = device === 'dml' ? 'GPU (DirectML/DirectX12)'
                            : device === 'cuda' ? 'GPU (CUDA)'
                                : String(device).toUpperCase();
                        console.log(`✅ Using ${label} backend`);
                        console.log('✅ Embedding model loaded successfully');
                    }
                    return embedderInstance;
                }
                catch (deviceError) {
                    if (isDev && (device === 'cuda' || device === 'dml')) {
                        const gpuType = device === 'dml' ? 'DirectML' : 'CUDA';
                        console.log(`⚠️ ${gpuType} not available, falling back to CPU...`);
                    }
                    // Only the failure of the last candidate is fatal.
                    if (index === devicesToTry.length - 1) {
                        throw deviceError;
                    }
                }
            }
            throw new Error('No suitable device found for embedding model');
        }
        catch (error) {
            // Forget the failed attempt so the next call starts a fresh load.
            initPromise = null;
            embedderInstance = null;
            throw error;
        }
        finally {
            isInitializing = false;
        }
    })();
    return initPromise;
};
168
/**
 * Tell whether this module currently holds a loaded embedding pipeline.
 *
 * @returns true when a pipeline instance is stored, false otherwise
 */
export const isEmbedderReady = () => embedderInstance !== null;
174
/**
 * Return the loaded embedder pipeline.
 *
 * @returns The stored pipeline instance
 * @throws Error when no pipeline has been stored yet (initEmbedder() has to run first)
 */
export const getEmbedder = () => {
    if (embedderInstance) {
        return embedderInstance;
    }
    throw new Error('Embedder not initialized. Call initEmbedder() first.');
};
183
/**
 * Embed one text string with mean pooling and normalization.
 *
 * @param text - Text to embed
 * @returns Float32Array holding the embedding vector (384 values for the default model)
 * @throws Error from getEmbedder() when the embedder has not been initialized
 */
export const embedText = async (text) => {
    const poolingOptions = { pooling: 'mean', normalize: true };
    const tensor = await getEmbedder()(text, poolingOptions);
    // The pipeline hands back a tensor; copy its values into a Float32Array.
    return new Float32Array(tensor.data);
};
198
/**
 * Embed multiple texts in a single batch.
 * More efficient than calling embedText multiple times.
 *
 * The pipeline returns one flat buffer for the whole batch. The width of each
 * vector is derived from that buffer (values / texts) rather than from the
 * default configuration, so a model with a different dimensionality is split
 * correctly.
 *
 * @param texts - Array of texts to embed
 * @returns Array of Float32Array embedding vectors, one per input text, in input order
 * @throws Error from getEmbedder() when the embedder has not been initialized
 * @throws Error when the output size is not a whole, non-zero multiple of texts.length
 */
export const embedBatch = async (texts) => {
    if (texts.length === 0) {
        return [];
    }
    const embedder = getEmbedder();
    const result = await embedder(texts, {
        pooling: 'mean',
        normalize: true,
    });
    // Flat [batch_size * dimensions] buffer — split it into per-text vectors.
    const data = result.data;
    const dimensions = data.length / texts.length;
    if (!Number.isInteger(dimensions) || dimensions === 0) {
        throw new Error(`Unexpected embedding output: ${data.length} values for ${texts.length} texts`);
    }
    return texts.map((_, index) => {
        const start = index * dimensions;
        return new Float32Array(data.slice(start, start + dimensions));
    });
};
227
/**
 * Copy an embedding into a plain number array (the form used for LadybugDB storage).
 *
 * @param embedding - Embedding vector, typically a Float32Array
 * @returns New array containing the same values in the same order
 */
export const embeddingToArray = (embedding) => Array.from(embedding);
233
/**
 * Cleanup the embedder (free memory).
 * Call this when done with embeddings.
 *
 * Does nothing when no pipeline is held. Otherwise the pipeline's `dispose()`
 * method is awaited if it has one, and the singleton state — instance, cached
 * init promise and recorded device — is cleared so that getCurrentDevice() no
 * longer reports a device for a model that is gone.
 *
 * @returns Promise that resolves once cleanup has finished; disposal errors are ignored
 */
export const disposeEmbedder = async () => {
    if (!embedderInstance) {
        return;
    }
    try {
        // transformers.js pipelines may or may not expose dispose()
        if (typeof embedderInstance.dispose === 'function') {
            await embedderInstance.dispose();
        }
    }
    catch {
        // Best-effort cleanup: a failing dispose() must not keep the state alive
    }
    embedderInstance = null;
    initPromise = null;
    currentDevice = null;
};
@@ -0,0 +1,51 @@
1
+ /**
2
+ * Embedding Pipeline Module
3
+ *
4
+ * Orchestrates the background embedding process:
5
+ * 1. Query embeddable nodes from LadybugDB
6
+ * 2. Generate text representations
7
+ * 3. Batch embed using transformers.js
8
+ * 4. Update LadybugDB with embeddings
9
+ * 5. Create vector index for semantic search
10
+ */
11
+ import { type EmbeddingProgress, type EmbeddingConfig, type SemanticSearchResult } from './types.js';
12
+ /**
13
+ * Callback that receives an EmbeddingProgress value each time progress is reported
14
+ */
15
+ export type EmbeddingProgressCallback = (progress: EmbeddingProgress) => void;
16
+ /**
17
+ * Run the embedding pipeline: load the model, embed the embeddable nodes in batches, store the vectors and create the vector index
18
+ *
19
+ * @param executeQuery - Function to execute Cypher queries against LadybugDB
20
+ * @param executeWithReusedStatement - Function to execute one Cypher statement with a list of parameter sets (reused prepared statement)
21
+ * @param onProgress - Callback for progress updates; on failure it receives an 'error' phase before the error is rethrown
22
+ * @param config - Optional configuration override, merged over the default embedding configuration
23
+ * @param skipNodeIds - Optional set of node IDs that already have embeddings (incremental mode); these nodes are not embedded again
24
+ */
25
+ export declare const runEmbeddingPipeline: (executeQuery: (cypher: string) => Promise<any[]>, executeWithReusedStatement: (cypher: string, paramsList: Array<Record<string, any>>) => Promise<void>, onProgress: EmbeddingProgressCallback, config?: Partial<EmbeddingConfig>, skipNodeIds?: Set<string>) => Promise<void>;
26
+ /**
27
+ * Perform semantic search using the vector index
28
+ *
29
+ * Uses CodeEmbedding table and queries each node table to get metadata
30
+ *
31
+ * @param executeQuery - Function to execute Cypher queries
32
+ * @param query - Search query text; it is embedded with the already-initialised model
33
+ * @param k - Number of nearest neighbours requested from the vector index (default: 10)
34
+ * @param maxDistance - Exclusive distance threshold; only matches with distance below it are kept (default: 0.5)
35
+ * @returns Promise of search results ordered by relevance; rejects if the embedding model has not been initialised
36
+ */
37
+ export declare const semanticSearch: (executeQuery: (cypher: string) => Promise<any[]>, query: string, k?: number, maxDistance?: number) => Promise<SemanticSearchResult[]>;
38
+ /**
39
+ * Semantic search with graph expansion (flattened results)
40
+ *
41
+ * Note: With multi-table schema, graph traversal is simplified.
42
+ * Returns semantic matches with their metadata.
43
+ * For full graph traversal, use execute_vector_cypher tool directly.
44
+ *
45
+ * @param executeQuery - Function to execute Cypher queries
46
+ * @param query - Search query text
47
+ * @param k - Number of initial semantic matches (default: 5)
48
+ * @param _hops - Unused (kept for API compatibility); passing a value has no documented effect.
49
+ * @returns Promise resolving to the semantic matches with their metadata (declared as an untyped any[] array)
50
+ */
51
+ export declare const semanticSearchWithContext: (executeQuery: (cypher: string) => Promise<any[]>, query: string, k?: number, _hops?: number) => Promise<any[]>;
@@ -0,0 +1,356 @@
1
+ /**
2
+ * Embedding Pipeline Module
3
+ *
4
+ * Orchestrates the background embedding process:
5
+ * 1. Query embeddable nodes from LadybugDB
6
+ * 2. Generate text representations
7
+ * 3. Batch embed using transformers.js
8
+ * 4. Update LadybugDB with embeddings
9
+ * 5. Create vector index for semantic search
10
+ */
11
+ import { initEmbedder, embedBatch, embedText, embeddingToArray, isEmbedderReady } from './embedder.js';
12
+ import { generateBatchEmbeddingTexts } from './text-generator.js';
13
+ import { DEFAULT_EMBEDDING_CONFIG, EMBEDDABLE_LABELS, } from './types.js';
14
+ const isDev = process.env.NODE_ENV === 'development';
15
/**
 * Collect every embeddable node from LadybugDB.
 *
 * One query is issued per label in EMBEDDABLE_LABELS. File nodes are queried
 * without startLine/endLine; every other label also returns those two columns.
 * Rows may come back keyed by column alias or as positional arrays — both are
 * accepted. A label whose query fails is skipped (logged in development only),
 * so a missing table does not abort the scan.
 *
 * @param executeQuery - Function to execute Cypher queries against LadybugDB
 * @returns Array of { id, name, label, filePath, content, startLine, endLine } records
 */
const queryEmbeddableNodes = async (executeQuery) => {
    // Read a column by alias first, falling back to its positional index.
    const column = (row, alias, position) => row[alias] ?? row[position];
    const collected = [];
    for (const label of EMBEDDABLE_LABELS) {
        try {
            const query = label === 'File'
                ? `
          MATCH (n:File)
          RETURN n.id AS id, n.name AS name, 'File' AS label,
                 n.filePath AS filePath, n.content AS content
        `
                : `
          MATCH (n:${label})
          RETURN n.id AS id, n.name AS name, '${label}' AS label,
                 n.filePath AS filePath, n.content AS content,
                 n.startLine AS startLine, n.endLine AS endLine
        `;
            const rows = await executeQuery(query);
            rows.forEach((row) => {
                collected.push({
                    id: column(row, 'id', 0),
                    name: column(row, 'name', 1),
                    label: column(row, 'label', 2),
                    filePath: column(row, 'filePath', 3),
                    content: column(row, 'content', 4) ?? '',
                    startLine: column(row, 'startLine', 5),
                    endLine: column(row, 'endLine', 6),
                });
            });
        }
        catch (error) {
            // Table might not exist or be empty, continue with the next label
            if (isDev) {
                console.warn(`Query for ${label} nodes failed:`, error);
            }
        }
    }
    return collected;
};
64
/**
 * Insert a batch of embeddings as CodeEmbedding rows.
 *
 * Vectors are written to their own table instead of being set on the source
 * nodes; the original author notes this avoids the overhead of updating nodes
 * that carry large content fields.
 *
 * @param executeWithReusedStatement - Function that runs one Cypher statement with a list of parameter sets
 * @param updates - Array of { id, embedding } pairs to store
 */
const batchInsertEmbeddings = async (executeWithReusedStatement, updates) => {
    const paramsList = [];
    for (const { id, embedding } of updates) {
        paramsList.push({ nodeId: id, embedding });
    }
    await executeWithReusedStatement(`CREATE (e:CodeEmbedding {nodeId: $nodeId, embedding: $embedding})`, paramsList);
};
75
// Set once the VECTOR extension setup has been attempted in this process.
let vectorExtensionLoaded = false;
/**
 * Create the vector index for semantic search on the CodeEmbedding table.
 *
 * LadybugDB v0.15+ requires the VECTOR extension to be loaded explicitly, so the
 * first call issues INSTALL and LOAD before creating the index. The two
 * statements are attempted independently: a failing INSTALL no longer prevents
 * LOAD from running. Failures of either are ignored because the extension may
 * already be installed or loaded — CREATE_VECTOR_INDEX fails clearly if it is
 * not.
 *
 * A failing index creation is not fatal (the index might already exist); it is
 * only logged in development.
 *
 * @param executeQuery - Function to execute Cypher queries against LadybugDB
 */
const createVectorIndex = async (executeQuery) => {
    if (!vectorExtensionLoaded) {
        for (const statement of ['INSTALL VECTOR', 'LOAD EXTENSION VECTOR']) {
            try {
                await executeQuery(statement);
            }
            catch {
                // Extension may already be installed/loaded — keep going
            }
        }
        // Attempt the setup only once per session, whatever its outcome.
        vectorExtensionLoaded = true;
    }
    const cypher = `
    CALL CREATE_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', 'embedding', metric := 'cosine')
  `;
    try {
        await executeQuery(cypher);
    }
    catch (error) {
        // Index might already exist
        if (isDev) {
            console.warn('Vector index creation warning:', error);
        }
    }
};
106
/**
 * Run the embedding pipeline.
 *
 * Phases reported through onProgress:
 *  - 'loading-model' (0-20%): load the embedding model
 *  - 'embedding' (20-90%): embed the nodes batch by batch and insert the vectors
 *  - 'indexing' (90%): create the vector index
 *  - 'ready' (100%): done (also reported straight away when there is nothing to embed)
 *  - 'error': reported with the error message before the error is rethrown
 *
 * @param executeQuery - Function to execute Cypher queries against LadybugDB
 * @param executeWithReusedStatement - Function to execute with reused prepared statement
 * @param onProgress - Callback for progress updates
 * @param config - Optional configuration override (merged over DEFAULT_EMBEDDING_CONFIG)
 * @param skipNodeIds - Optional set of node IDs that already have embeddings (incremental mode)
 * @throws RangeError when the effective batchSize is not a positive integer
 * @throws Any error raised while loading the model, querying, embedding or inserting
 */
export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatement, onProgress, config = {}, skipNodeIds) => {
    const finalConfig = { ...DEFAULT_EMBEDDING_CONFIG, ...config };
    try {
        // Fail fast on an unusable batch size: 0 would loop forever, NaN would
        // silently embed nothing, and a string would corrupt the slice bounds.
        const batchSize = finalConfig.batchSize;
        if (!Number.isInteger(batchSize) || batchSize <= 0) {
            throw new RangeError(`Invalid embedding batchSize: ${batchSize} (expected a positive integer)`);
        }
        // Phase 1: Load embedding model
        onProgress({
            phase: 'loading-model',
            percent: 0,
            modelDownloadPercent: 0,
        });
        await initEmbedder((modelProgress) => {
            const downloadPercent = modelProgress.progress ?? 0;
            onProgress({
                phase: 'loading-model',
                // Model download accounts for the first 20% of the overall progress
                percent: Math.round(downloadPercent * 0.2),
                modelDownloadPercent: downloadPercent,
            });
        }, finalConfig);
        onProgress({
            phase: 'loading-model',
            percent: 20,
            modelDownloadPercent: 100,
        });
        if (isDev) {
            console.log('🔍 Querying embeddable nodes...');
        }
        // Phase 2: Query embeddable nodes
        let nodes = await queryEmbeddableNodes(executeQuery);
        // Incremental mode: filter out nodes that already have embeddings
        if (skipNodeIds && skipNodeIds.size > 0) {
            const beforeCount = nodes.length;
            nodes = nodes.filter((node) => !skipNodeIds.has(node.id));
            if (isDev) {
                console.log(`📦 Incremental embeddings: ${beforeCount} total, ${skipNodeIds.size} cached, ${nodes.length} to embed`);
            }
        }
        const totalNodes = nodes.length;
        if (isDev) {
            console.log(`📊 Found ${totalNodes} embeddable nodes`);
        }
        if (totalNodes === 0) {
            onProgress({
                phase: 'ready',
                percent: 100,
                nodesProcessed: 0,
                totalNodes: 0,
            });
            return;
        }
        // Phase 3: Batch embed nodes
        const totalBatches = Math.ceil(totalNodes / batchSize);
        let processedNodes = 0;
        onProgress({
            phase: 'embedding',
            percent: 20,
            nodesProcessed: 0,
            totalNodes,
            currentBatch: 0,
            totalBatches,
        });
        for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) {
            const start = batchIndex * batchSize;
            const batch = nodes.slice(start, Math.min(start + batchSize, totalNodes));
            // Generate texts for this batch, embed them, then persist the vectors
            const texts = generateBatchEmbeddingTexts(batch, finalConfig);
            const embeddings = await embedBatch(texts);
            const updates = batch.map((node, i) => ({
                id: node.id,
                embedding: embeddingToArray(embeddings[i]),
            }));
            await batchInsertEmbeddings(executeWithReusedStatement, updates);
            processedNodes += batch.length;
            // Report progress (20-90% for embedding phase)
            const embeddingProgress = 20 + ((processedNodes / totalNodes) * 70);
            onProgress({
                phase: 'embedding',
                percent: Math.round(embeddingProgress),
                nodesProcessed: processedNodes,
                totalNodes,
                currentBatch: batchIndex + 1,
                totalBatches,
            });
        }
        // Phase 4: Create vector index
        onProgress({
            phase: 'indexing',
            percent: 90,
            nodesProcessed: totalNodes,
            totalNodes,
        });
        if (isDev) {
            console.log('📇 Creating vector index...');
        }
        await createVectorIndex(executeQuery);
        // Complete
        onProgress({
            phase: 'ready',
            percent: 100,
            nodesProcessed: totalNodes,
            totalNodes,
        });
        if (isDev) {
            console.log('✅ Embedding pipeline complete!');
        }
    }
    catch (error) {
        const errorMessage = error instanceof Error ? error.message : 'Unknown error';
        if (isDev) {
            console.error('❌ Embedding pipeline error:', error);
        }
        onProgress({
            phase: 'error',
            percent: 0,
            error: errorMessage,
        });
        throw error;
    }
};
236
/**
 * Perform semantic search using the vector index
 *
 * Embeds the query text, asks the `code_embedding_idx` vector index on the
 * CodeEmbedding table for the nearest entries, then looks up metadata for the
 * hits with one query per node label.
 *
 * Node ids are parsed as `<Label>:<rest>`: the text before the first ':' is
 * used as the node table name for the metadata lookup.
 *
 * @param executeQuery - Function to execute Cypher queries
 * @param query - Search query text
 * @param k - Number of results to return (default: 10); must be a positive integer
 * @param maxDistance - Maximum distance threshold (default: 0.5); must be a finite number
 * @returns Array of search results ordered by relevance (ascending distance)
 * @throws {Error} If the embedding model has not been initialized
 * @throws {RangeError} If `k` or `maxDistance` is not a usable number
 */
export const semanticSearch = async (executeQuery, query, k = 10, maxDistance = 0.5) => {
    if (!isEmbedderReady()) {
        throw new Error('Embedding model not initialized. Run embedding pipeline first.');
    }
    // k and maxDistance are spliced into the Cypher text below, so coerce them
    // to numbers and reject anything else instead of interpolating raw input.
    const limit = Number(k);
    if (!Number.isSafeInteger(limit) || limit <= 0) {
        throw new RangeError(`semanticSearch: k must be a positive integer, got ${String(k)}`);
    }
    const threshold = Number(maxDistance);
    if (!Number.isFinite(threshold)) {
        throw new RangeError(`semanticSearch: maxDistance must be a finite number, got ${String(maxDistance)}`);
    }
    // Embed the query
    const queryEmbedding = await embedText(query);
    const queryVec = embeddingToArray(queryEmbedding);
    const queryVecStr = `[${queryVec.join(',')}]`;
    // Query the vector index on CodeEmbedding to get nodeIds and distances
    const vectorQuery = `
    CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx',
      CAST(${queryVecStr} AS FLOAT[384]), ${limit})
    YIELD node AS emb, distance
    WITH emb, distance
    WHERE distance < ${threshold}
    RETURN emb.nodeId AS nodeId, distance
    ORDER BY distance
  `;
    const embResults = await executeQuery(vectorQuery);
    if (embResults.length === 0) {
        return [];
    }
    // Group results by label for batched metadata queries
    const byLabel = new Map();
    for (const embRow of embResults) {
        // Rows may be keyed objects or positional arrays
        const nodeId = embRow.nodeId ?? embRow[0];
        const distance = embRow.distance ?? embRow[1];
        if (typeof nodeId !== 'string') {
            // Malformed row: without an id there is nothing to look up
            continue;
        }
        const labelEndIdx = nodeId.indexOf(':');
        const label = labelEndIdx > 0 ? nodeId.substring(0, labelEndIdx) : 'Unknown';
        if (!byLabel.has(label)) {
            byLabel.set(label, []);
        }
        byLabel.get(label).push({ nodeId, distance });
    }
    // The label is interpolated as a table name, so only plain identifiers are queried
    const labelPattern = /^[A-Za-z_][A-Za-z0-9_]*$/;
    // Batch-fetch metadata per label
    const results = [];
    for (const [label, items] of byLabel) {
        if (!labelPattern.test(label)) {
            // Such a label could only yield a malformed (or unintended) query; skip it
            continue;
        }
        const idList = items.map(i => `'${i.nodeId.replace(/'/g, "''")}'`).join(', ');
        try {
            let nodeQuery;
            if (label === 'File') {
                // File nodes are queried without line ranges
                nodeQuery = `
          MATCH (n:File) WHERE n.id IN [${idList}]
          RETURN n.id AS id, n.name AS name, n.filePath AS filePath
        `;
            }
            else {
                nodeQuery = `
          MATCH (n:${label}) WHERE n.id IN [${idList}]
          RETURN n.id AS id, n.name AS name, n.filePath AS filePath,
                 n.startLine AS startLine, n.endLine AS endLine
        `;
            }
            const nodeRows = await executeQuery(nodeQuery);
            const rowMap = new Map();
            for (const row of nodeRows) {
                const id = row.id ?? row[0];
                rowMap.set(id, row);
            }
            for (const item of items) {
                const nodeRow = rowMap.get(item.nodeId);
                if (nodeRow) {
                    results.push({
                        nodeId: item.nodeId,
                        name: nodeRow.name ?? nodeRow[1] ?? '',
                        label,
                        filePath: nodeRow.filePath ?? nodeRow[2] ?? '',
                        distance: item.distance,
                        startLine: label !== 'File' ? (nodeRow.startLine ?? nodeRow[3]) : undefined,
                        endLine: label !== 'File' ? (nodeRow.endLine ?? nodeRow[4]) : undefined,
                    });
                }
            }
        }
        catch {
            // Table might not exist, skip
        }
    }
    // Re-sort by distance since batch queries may have mixed order
    results.sort((a, b) => a.distance - b.distance);
    return results;
};
328
/**
 * Semantic search shaped like a graph-expansion result (flattened rows).
 *
 * With the multi-table schema no traversal is performed here: every row is a
 * plain semantic match and all `connected*` / `relationType` fields are null.
 * For full graph traversal, use the execute_vector_cypher tool directly.
 *
 * @param executeQuery - Function to execute Cypher queries
 * @param query - Search query text
 * @param k - Number of initial semantic matches (default: 5)
 * @param _hops - Unused (kept for API compatibility).
 * @returns Semantic matches with metadata
 */
export const semanticSearchWithContext = async (executeQuery, query, k = 5, _hops = 1) => {
    // Delegate to the plain semantic search with a fixed 0.5 distance cutoff
    const matches = await semanticSearch(executeQuery, query, k, 0.5);
    const flattened = [];
    for (const match of matches) {
        flattened.push({
            matchId: match.nodeId,
            matchName: match.name,
            matchLabel: match.label,
            matchPath: match.filePath,
            distance: match.distance,
            // No traversal is done, so there is never a connected node
            connectedId: null,
            connectedName: null,
            connectedLabel: null,
            relationType: null,
        });
    }
    return flattened;
};
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Embeddings Module
3
+ *
4
+ * Re-exports for the embedding pipeline system.
5
+ */
6
+ export * from './types.js';
7
+ export * from './embedder.js';
8
+ export * from './text-generator.js';
9
+ export * from './embedding-pipeline.js';
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Embeddings Module
3
+ *
4
+ * Re-exports for the embedding pipeline system.
5
+ */
6
+ export * from './types.js';
7
+ export * from './embedder.js';
8
+ export * from './text-generator.js';
9
+ export * from './embedding-pipeline.js';