@zuvia-software-solutions/code-mapper 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213) hide show
  1. package/README.md +215 -0
  2. package/dist/cli/ai-context.d.ts +19 -0
  3. package/dist/cli/ai-context.js +168 -0
  4. package/dist/cli/analyze.d.ts +7 -0
  5. package/dist/cli/analyze.js +325 -0
  6. package/dist/cli/augment.d.ts +7 -0
  7. package/dist/cli/augment.js +27 -0
  8. package/dist/cli/clean.d.ts +5 -0
  9. package/dist/cli/clean.js +56 -0
  10. package/dist/cli/eval-server.d.ts +25 -0
  11. package/dist/cli/eval-server.js +365 -0
  12. package/dist/cli/index.d.ts +6 -0
  13. package/dist/cli/index.js +102 -0
  14. package/dist/cli/lazy-action.d.ts +6 -0
  15. package/dist/cli/lazy-action.js +19 -0
  16. package/dist/cli/list.d.ts +2 -0
  17. package/dist/cli/list.js +27 -0
  18. package/dist/cli/mcp.d.ts +8 -0
  19. package/dist/cli/mcp.js +35 -0
  20. package/dist/cli/refresh.d.ts +12 -0
  21. package/dist/cli/refresh.js +165 -0
  22. package/dist/cli/serve.d.ts +5 -0
  23. package/dist/cli/serve.js +8 -0
  24. package/dist/cli/setup.d.ts +6 -0
  25. package/dist/cli/setup.js +218 -0
  26. package/dist/cli/status.d.ts +2 -0
  27. package/dist/cli/status.js +33 -0
  28. package/dist/cli/tool.d.ts +28 -0
  29. package/dist/cli/tool.js +87 -0
  30. package/dist/config/ignore-service.d.ts +32 -0
  31. package/dist/config/ignore-service.js +282 -0
  32. package/dist/config/supported-languages.d.ts +23 -0
  33. package/dist/config/supported-languages.js +52 -0
  34. package/dist/core/augmentation/engine.d.ts +22 -0
  35. package/dist/core/augmentation/engine.js +232 -0
  36. package/dist/core/embeddings/embedder.d.ts +35 -0
  37. package/dist/core/embeddings/embedder.js +171 -0
  38. package/dist/core/embeddings/embedding-pipeline.d.ts +41 -0
  39. package/dist/core/embeddings/embedding-pipeline.js +402 -0
  40. package/dist/core/embeddings/index.d.ts +5 -0
  41. package/dist/core/embeddings/index.js +6 -0
  42. package/dist/core/embeddings/text-generator.d.ts +20 -0
  43. package/dist/core/embeddings/text-generator.js +159 -0
  44. package/dist/core/embeddings/types.d.ts +60 -0
  45. package/dist/core/embeddings/types.js +23 -0
  46. package/dist/core/graph/graph.d.ts +4 -0
  47. package/dist/core/graph/graph.js +65 -0
  48. package/dist/core/graph/types.d.ts +69 -0
  49. package/dist/core/graph/types.js +3 -0
  50. package/dist/core/incremental/child-process.d.ts +8 -0
  51. package/dist/core/incremental/child-process.js +649 -0
  52. package/dist/core/incremental/refresh-coordinator.d.ts +32 -0
  53. package/dist/core/incremental/refresh-coordinator.js +147 -0
  54. package/dist/core/incremental/types.d.ts +78 -0
  55. package/dist/core/incremental/types.js +153 -0
  56. package/dist/core/incremental/watcher.d.ts +63 -0
  57. package/dist/core/incremental/watcher.js +338 -0
  58. package/dist/core/ingestion/ast-cache.d.ts +12 -0
  59. package/dist/core/ingestion/ast-cache.js +34 -0
  60. package/dist/core/ingestion/call-processor.d.ts +34 -0
  61. package/dist/core/ingestion/call-processor.js +937 -0
  62. package/dist/core/ingestion/call-routing.d.ts +40 -0
  63. package/dist/core/ingestion/call-routing.js +97 -0
  64. package/dist/core/ingestion/cluster-enricher.d.ts +30 -0
  65. package/dist/core/ingestion/cluster-enricher.js +151 -0
  66. package/dist/core/ingestion/community-processor.d.ts +26 -0
  67. package/dist/core/ingestion/community-processor.js +272 -0
  68. package/dist/core/ingestion/constants.d.ts +5 -0
  69. package/dist/core/ingestion/constants.js +8 -0
  70. package/dist/core/ingestion/entry-point-scoring.d.ts +23 -0
  71. package/dist/core/ingestion/entry-point-scoring.js +317 -0
  72. package/dist/core/ingestion/export-detection.d.ts +11 -0
  73. package/dist/core/ingestion/export-detection.js +203 -0
  74. package/dist/core/ingestion/filesystem-walker.d.ts +18 -0
  75. package/dist/core/ingestion/filesystem-walker.js +64 -0
  76. package/dist/core/ingestion/framework-detection.d.ts +42 -0
  77. package/dist/core/ingestion/framework-detection.js +405 -0
  78. package/dist/core/ingestion/heritage-processor.d.ts +15 -0
  79. package/dist/core/ingestion/heritage-processor.js +237 -0
  80. package/dist/core/ingestion/import-processor.d.ts +31 -0
  81. package/dist/core/ingestion/import-processor.js +416 -0
  82. package/dist/core/ingestion/language-config.d.ts +32 -0
  83. package/dist/core/ingestion/language-config.js +161 -0
  84. package/dist/core/ingestion/mro-processor.d.ts +32 -0
  85. package/dist/core/ingestion/mro-processor.js +343 -0
  86. package/dist/core/ingestion/named-binding-extraction.d.ts +51 -0
  87. package/dist/core/ingestion/named-binding-extraction.js +343 -0
  88. package/dist/core/ingestion/parsing-processor.d.ts +20 -0
  89. package/dist/core/ingestion/parsing-processor.js +282 -0
  90. package/dist/core/ingestion/pipeline.d.ts +3 -0
  91. package/dist/core/ingestion/pipeline.js +416 -0
  92. package/dist/core/ingestion/process-processor.d.ts +42 -0
  93. package/dist/core/ingestion/process-processor.js +357 -0
  94. package/dist/core/ingestion/resolution-context.d.ts +40 -0
  95. package/dist/core/ingestion/resolution-context.js +171 -0
  96. package/dist/core/ingestion/resolvers/csharp.d.ts +10 -0
  97. package/dist/core/ingestion/resolvers/csharp.js +101 -0
  98. package/dist/core/ingestion/resolvers/go.d.ts +8 -0
  99. package/dist/core/ingestion/resolvers/go.js +33 -0
  100. package/dist/core/ingestion/resolvers/index.d.ts +14 -0
  101. package/dist/core/ingestion/resolvers/index.js +10 -0
  102. package/dist/core/ingestion/resolvers/jvm.d.ts +9 -0
  103. package/dist/core/ingestion/resolvers/jvm.js +74 -0
  104. package/dist/core/ingestion/resolvers/php.d.ts +7 -0
  105. package/dist/core/ingestion/resolvers/php.js +30 -0
  106. package/dist/core/ingestion/resolvers/ruby.d.ts +9 -0
  107. package/dist/core/ingestion/resolvers/ruby.js +13 -0
  108. package/dist/core/ingestion/resolvers/rust.d.ts +5 -0
  109. package/dist/core/ingestion/resolvers/rust.js +62 -0
  110. package/dist/core/ingestion/resolvers/standard.d.ts +16 -0
  111. package/dist/core/ingestion/resolvers/standard.js +144 -0
  112. package/dist/core/ingestion/resolvers/utils.d.ts +18 -0
  113. package/dist/core/ingestion/resolvers/utils.js +113 -0
  114. package/dist/core/ingestion/structure-processor.d.ts +4 -0
  115. package/dist/core/ingestion/structure-processor.js +39 -0
  116. package/dist/core/ingestion/symbol-table.d.ts +34 -0
  117. package/dist/core/ingestion/symbol-table.js +48 -0
  118. package/dist/core/ingestion/tree-sitter-queries.d.ts +20 -0
  119. package/dist/core/ingestion/tree-sitter-queries.js +691 -0
  120. package/dist/core/ingestion/type-env.d.ts +52 -0
  121. package/dist/core/ingestion/type-env.js +349 -0
  122. package/dist/core/ingestion/type-extractors/c-cpp.d.ts +4 -0
  123. package/dist/core/ingestion/type-extractors/c-cpp.js +214 -0
  124. package/dist/core/ingestion/type-extractors/csharp.d.ts +4 -0
  125. package/dist/core/ingestion/type-extractors/csharp.js +224 -0
  126. package/dist/core/ingestion/type-extractors/go.d.ts +4 -0
  127. package/dist/core/ingestion/type-extractors/go.js +261 -0
  128. package/dist/core/ingestion/type-extractors/index.d.ts +20 -0
  129. package/dist/core/ingestion/type-extractors/index.js +30 -0
  130. package/dist/core/ingestion/type-extractors/jvm.d.ts +5 -0
  131. package/dist/core/ingestion/type-extractors/jvm.js +386 -0
  132. package/dist/core/ingestion/type-extractors/php.d.ts +4 -0
  133. package/dist/core/ingestion/type-extractors/php.js +280 -0
  134. package/dist/core/ingestion/type-extractors/python.d.ts +4 -0
  135. package/dist/core/ingestion/type-extractors/python.js +175 -0
  136. package/dist/core/ingestion/type-extractors/ruby.d.ts +12 -0
  137. package/dist/core/ingestion/type-extractors/ruby.js +218 -0
  138. package/dist/core/ingestion/type-extractors/rust.d.ts +4 -0
  139. package/dist/core/ingestion/type-extractors/rust.js +290 -0
  140. package/dist/core/ingestion/type-extractors/shared.d.ts +81 -0
  141. package/dist/core/ingestion/type-extractors/shared.js +322 -0
  142. package/dist/core/ingestion/type-extractors/swift.d.ts +4 -0
  143. package/dist/core/ingestion/type-extractors/swift.js +140 -0
  144. package/dist/core/ingestion/type-extractors/types.d.ts +111 -0
  145. package/dist/core/ingestion/type-extractors/types.js +4 -0
  146. package/dist/core/ingestion/type-extractors/typescript.d.ts +4 -0
  147. package/dist/core/ingestion/type-extractors/typescript.js +227 -0
  148. package/dist/core/ingestion/utils.d.ts +73 -0
  149. package/dist/core/ingestion/utils.js +992 -0
  150. package/dist/core/ingestion/workers/parse-worker.d.ts +99 -0
  151. package/dist/core/ingestion/workers/parse-worker.js +1055 -0
  152. package/dist/core/ingestion/workers/worker-pool.d.ts +15 -0
  153. package/dist/core/ingestion/workers/worker-pool.js +123 -0
  154. package/dist/core/lbug/csv-generator.d.ts +28 -0
  155. package/dist/core/lbug/csv-generator.js +355 -0
  156. package/dist/core/lbug/lbug-adapter.d.ts +96 -0
  157. package/dist/core/lbug/lbug-adapter.js +753 -0
  158. package/dist/core/lbug/schema.d.ts +46 -0
  159. package/dist/core/lbug/schema.js +402 -0
  160. package/dist/core/search/bm25-index.d.ts +20 -0
  161. package/dist/core/search/bm25-index.js +123 -0
  162. package/dist/core/search/hybrid-search.d.ts +32 -0
  163. package/dist/core/search/hybrid-search.js +131 -0
  164. package/dist/core/search/query-cache.d.ts +18 -0
  165. package/dist/core/search/query-cache.js +47 -0
  166. package/dist/core/search/query-expansion.d.ts +19 -0
  167. package/dist/core/search/query-expansion.js +75 -0
  168. package/dist/core/search/reranker.d.ts +29 -0
  169. package/dist/core/search/reranker.js +122 -0
  170. package/dist/core/search/types.d.ts +154 -0
  171. package/dist/core/search/types.js +51 -0
  172. package/dist/core/semantic/tsgo-service.d.ts +67 -0
  173. package/dist/core/semantic/tsgo-service.js +355 -0
  174. package/dist/core/tree-sitter/parser-loader.d.ts +12 -0
  175. package/dist/core/tree-sitter/parser-loader.js +71 -0
  176. package/dist/lib/memory-guard.d.ts +35 -0
  177. package/dist/lib/memory-guard.js +70 -0
  178. package/dist/lib/utils.d.ts +3 -0
  179. package/dist/lib/utils.js +6 -0
  180. package/dist/mcp/compatible-stdio-transport.d.ts +32 -0
  181. package/dist/mcp/compatible-stdio-transport.js +209 -0
  182. package/dist/mcp/core/embedder.d.ts +24 -0
  183. package/dist/mcp/core/embedder.js +168 -0
  184. package/dist/mcp/core/lbug-adapter.d.ts +29 -0
  185. package/dist/mcp/core/lbug-adapter.js +330 -0
  186. package/dist/mcp/local/local-backend.d.ts +188 -0
  187. package/dist/mcp/local/local-backend.js +2759 -0
  188. package/dist/mcp/resources.d.ts +22 -0
  189. package/dist/mcp/resources.js +379 -0
  190. package/dist/mcp/server.d.ts +10 -0
  191. package/dist/mcp/server.js +217 -0
  192. package/dist/mcp/staleness.d.ts +10 -0
  193. package/dist/mcp/staleness.js +25 -0
  194. package/dist/mcp/tools.d.ts +21 -0
  195. package/dist/mcp/tools.js +202 -0
  196. package/dist/server/api.d.ts +5 -0
  197. package/dist/server/api.js +340 -0
  198. package/dist/server/mcp-http.d.ts +7 -0
  199. package/dist/server/mcp-http.js +95 -0
  200. package/dist/storage/git.d.ts +6 -0
  201. package/dist/storage/git.js +35 -0
  202. package/dist/storage/repo-manager.d.ts +87 -0
  203. package/dist/storage/repo-manager.js +249 -0
  204. package/dist/types/pipeline.d.ts +35 -0
  205. package/dist/types/pipeline.js +20 -0
  206. package/hooks/claude/code-mapper-hook.cjs +238 -0
  207. package/hooks/claude/pre-tool-use.sh +79 -0
  208. package/hooks/claude/session-start.sh +42 -0
  209. package/models/mlx-embedder.py +185 -0
  210. package/package.json +100 -0
  211. package/scripts/patch-tree-sitter-swift.cjs +74 -0
  212. package/vendor/leiden/index.cjs +355 -0
  213. package/vendor/leiden/utils.cjs +392 -0
@@ -0,0 +1,171 @@
1
+ // code-mapper/src/core/embeddings/embedder.ts
2
+ /**
3
+ * @file embedder.ts
4
+ * @description MLX-accelerated code embedder via Python subprocess
5
+ *
6
+ * Replaces the previous ONNX/transformers.js embedder with Jina Code 1.5B
7
+ * running on Apple Silicon Metal via MLX. Fail-fast — no fallback.
8
+ *
9
+ * Model: jinaai/jina-code-embeddings-1.5b-mlx (1.54B params, 1536 dims, 32K context)
10
+ * Matryoshka truncation to 256 dims for optimal speed/quality tradeoff
11
+ */
12
+ import { spawn, execFileSync } from 'child_process';
13
+ import path from 'path';
14
+ import { fileURLToPath } from 'url';
15
+ import { DEFAULT_EMBEDDING_CONFIG } from './types.js';
16
+ const __filename = fileURLToPath(import.meta.url);
17
+ const __dirname = path.dirname(__filename);
18
+ // Path to MLX embedder script (relative to compiled dist/)
19
+ const MLX_SCRIPT = path.resolve(__dirname, '..', '..', '..', 'models', 'mlx-embedder.py');
20
+ // Singleton subprocess
21
+ let mlxProcess = null;
22
+ let pendingResolve = null;
23
+ let pendingReject = null;
24
+ let lineBuffer = '';
25
+ let ready = false;
26
+ /** Get the current inference device */
27
+ export const getCurrentDevice = () => ready ? 'mlx-metal' : null;
28
+ function ensureProcess() {
29
+ if (mlxProcess && !mlxProcess.killed)
30
+ return mlxProcess;
31
+ // Check prerequisites
32
+ try {
33
+ execFileSync('python3', ['-c', 'import mlx; import tokenizers'], {
34
+ timeout: 5000,
35
+ stdio: ['pipe', 'pipe', 'pipe'],
36
+ });
37
+ }
38
+ catch {
39
+ throw new Error('MLX embedder requires Python 3 + MLX on Apple Silicon.\n' +
40
+ 'Install: pip3 install mlx tokenizers huggingface_hub\n' +
41
+ 'The embedding model will download automatically on first use (~3GB).');
42
+ }
43
+ mlxProcess = spawn('python3', [MLX_SCRIPT], {
44
+ stdio: ['pipe', 'pipe', 'pipe'],
45
+ env: { ...process.env, TOKENIZERS_PARALLELISM: 'false' },
46
+ });
47
+ lineBuffer = '';
48
+ mlxProcess.stdout.on('data', (chunk) => {
49
+ lineBuffer += chunk.toString();
50
+ const lines = lineBuffer.split('\n');
51
+ lineBuffer = lines.pop() || '';
52
+ for (const line of lines) {
53
+ if (!line.trim())
54
+ continue;
55
+ try {
56
+ const msg = JSON.parse(line);
57
+ if (msg.status === 'ready' && !ready) {
58
+ ready = true;
59
+ console.error(`Code Mapper: MLX embedder ready (${msg.device}, loaded in ${msg.load_ms}ms)`);
60
+ }
61
+ if (pendingResolve) {
62
+ const resolve = pendingResolve;
63
+ pendingResolve = null;
64
+ pendingReject = null;
65
+ resolve(msg);
66
+ }
67
+ }
68
+ catch {
69
+ // Non-JSON output — ignore
70
+ }
71
+ }
72
+ });
73
+ mlxProcess.stderr.on('data', (chunk) => {
74
+ // Forward stderr for debugging
75
+ const msg = chunk.toString().trim();
76
+ if (msg)
77
+ console.error(`[mlx-embedder] ${msg}`);
78
+ });
79
+ mlxProcess.on('exit', (code) => {
80
+ ready = false;
81
+ mlxProcess = null;
82
+ if (pendingReject) {
83
+ const reject = pendingReject;
84
+ pendingResolve = null;
85
+ pendingReject = null;
86
+ reject(new Error(`MLX embedder exited with code ${code}`));
87
+ }
88
+ });
89
+ return mlxProcess;
90
+ }
91
+ function sendAndReceive(request) {
92
+ return new Promise((resolve, reject) => {
93
+ const proc = ensureProcess();
94
+ pendingResolve = resolve;
95
+ pendingReject = reject;
96
+ proc.stdin.write(JSON.stringify(request) + '\n');
97
+ });
98
+ }
99
+ /**
100
+ * Initialize the MLX embedder (spawns Python subprocess, loads model)
101
+ */
102
+ export const initEmbedder = async (_onProgress, _config = {}) => {
103
+ if (ready)
104
+ return;
105
+ ensureProcess();
106
+ // Wait for the "ready" message from the Python process
107
+ const msg = await sendAndReceive({ cmd: 'ping' });
108
+ if (msg.error) {
109
+ throw new Error(`MLX embedder failed: ${msg.error}`);
110
+ }
111
+ return msg;
112
+ };
113
+ /** Check if the embedder is initialized and ready */
114
+ export const isEmbedderReady = () => ready;
115
+ /** Get the embedder instance — not applicable for MLX, returns null */
116
+ export const getEmbedder = () => {
117
+ if (!ready)
118
+ throw new Error('MLX embedder not initialized. Call initEmbedder() first.');
119
+ return null; // No JS-side instance — inference happens in Python
120
+ };
121
+ /**
122
+ * Embed a single text string
123
+ */
124
+ export const embedText = async (text) => {
125
+ const result = await sendAndReceive({
126
+ texts: [text],
127
+ task: 'nl2code',
128
+ type: 'passage',
129
+ dims: DEFAULT_EMBEDDING_CONFIG.dimensions,
130
+ });
131
+ if (result.error)
132
+ throw new Error(`Embedding failed: ${result.error}`);
133
+ return new Float32Array(result.embeddings[0]);
134
+ };
135
+ /**
136
+ * Embed multiple texts in a single batch
137
+ */
138
+ export const embedBatch = async (texts) => {
139
+ if (texts.length === 0)
140
+ return [];
141
+ const result = await sendAndReceive({
142
+ texts,
143
+ task: 'nl2code',
144
+ type: 'passage',
145
+ dims: DEFAULT_EMBEDDING_CONFIG.dimensions,
146
+ });
147
+ if (result.error)
148
+ throw new Error(`Batch embedding failed: ${result.error}`);
149
+ return result.embeddings.map((e) => new Float32Array(e));
150
+ };
151
+ /** Convert Float32Array to number[] for LadybugDB storage */
152
+ export const embeddingToArray = (embedding) => {
153
+ return Array.from(embedding);
154
+ };
155
+ /** Dispose the embedder subprocess */
156
+ export const disposeEmbedder = async () => {
157
+ if (mlxProcess && !mlxProcess.killed) {
158
+ try {
159
+ mlxProcess.stdin.write(JSON.stringify({ cmd: 'quit' }) + '\n');
160
+ // Give it a moment to exit gracefully
161
+ await new Promise(resolve => setTimeout(resolve, 500));
162
+ }
163
+ catch { }
164
+ try {
165
+ mlxProcess.kill();
166
+ }
167
+ catch { }
168
+ mlxProcess = null;
169
+ }
170
+ ready = false;
171
+ };
@@ -0,0 +1,41 @@
1
+ /**
2
+ * @file embedding-pipeline.ts
3
+ * @description Orchestrates the background embedding process:
4
+ * 1) Query embeddable nodes from LadybugDB
5
+ * 2) Generate text representations
6
+ * 3) Batch embed using the MLX embedder (Python subprocess)
7
+ * 4) Store embeddings in LadybugDB
8
+ * 5) Create vector index for semantic search
9
+ */
10
+ import { type EmbeddingProgress, type EmbeddingConfig, type SemanticSearchResult } from './types.js';
11
+ /** Progress callback type */
12
+ export type EmbeddingProgressCallback = (progress: EmbeddingProgress) => void;
13
+ /**
14
+ * Run the full embedding pipeline (load model, embed nodes, create index)
15
+ * @param executeQuery - Execute Cypher queries against LadybugDB
16
+ * @param executeWithReusedStatement - Execute with reused prepared statement
17
+ * @param onProgress - Progress callback
18
+ * @param config - Configuration override
19
+ * @param skipNodeIds - Node IDs that already have embeddings (incremental mode)
20
+ */
21
+ export declare const runEmbeddingPipeline: (executeQuery: (cypher: string) => Promise<any[]>, executeWithReusedStatement: (cypher: string, paramsList: Array<Record<string, any>>) => Promise<void>, onProgress: EmbeddingProgressCallback, config?: Partial<EmbeddingConfig>, skipNodeIds?: Set<string>) => Promise<void>;
22
+ /**
23
+ * Perform semantic search via the CodeEmbedding vector index
24
+ * @param executeQuery - Execute Cypher queries
25
+ * @param query - Search query text
26
+ * @param k - Number of results (default: 10)
27
+ * @param maxDistance - Maximum cosine distance threshold (default: 0.5)
28
+ * @returns Search results ordered by relevance
29
+ */
30
+ export declare const semanticSearch: (executeQuery: (cypher: string) => Promise<any[]>, query: string, k?: number, maxDistance?: number) => Promise<SemanticSearchResult[]>;
31
+ /**
32
+ * Semantic search with flattened results (graph expansion placeholder)
33
+ *
34
+ * For full graph traversal, use the execute_vector_cypher tool directly
35
+ *
36
+ * @param executeQuery - Execute Cypher queries
37
+ * @param query - Search query text
38
+ * @param k - Number of semantic matches (default: 5)
39
+ * @param _hops - Unused, kept for API compatibility
40
+ */
41
+ export declare const semanticSearchWithContext: (executeQuery: (cypher: string) => Promise<any[]>, query: string, k?: number, _hops?: number) => Promise<any[]>;
@@ -0,0 +1,402 @@
1
+ // code-mapper/src/core/embeddings/embedding-pipeline.ts
2
+ /**
3
+ * @file embedding-pipeline.ts
4
+ * @description Orchestrates the background embedding process:
5
+ * 1) Query embeddable nodes from LadybugDB
6
+ * 2) Generate text representations
7
+ * 3) Batch embed using the MLX embedder (Python subprocess)
8
+ * 4) Store embeddings in LadybugDB
9
+ * 5) Create vector index for semantic search
10
+ */
11
+ import { initEmbedder, embedBatch, embedText, embeddingToArray, isEmbedderReady } from './embedder.js';
12
+ import { generateEmbeddingText } from './text-generator.js';
13
+ import { DEFAULT_EMBEDDING_CONFIG, EMBEDDABLE_LABELS, } from './types.js';
14
+ const isDev = process.env.NODE_ENV === 'development';
15
+ /** Query all embeddable nodes from LadybugDB (File has different schema than code elements) */
16
+ const queryEmbeddableNodes = async (executeQuery) => {
17
+ const allNodes = [];
18
+ for (const label of EMBEDDABLE_LABELS) {
19
+ try {
20
+ // All embeddable labels are code elements with startLine/endLine
21
+ const query = `
22
+ MATCH (n:${label})
23
+ RETURN n.id AS id, n.name AS name, '${label}' AS label,
24
+ n.filePath AS filePath, n.content AS content,
25
+ n.startLine AS startLine, n.endLine AS endLine
26
+ `;
27
+ const rows = await executeQuery(query);
28
+ for (const row of rows) {
29
+ allNodes.push({
30
+ id: row.id ?? row[0],
31
+ name: row.name ?? row[1],
32
+ label: row.label ?? row[2],
33
+ filePath: row.filePath ?? row[3],
34
+ content: row.content ?? row[4] ?? '',
35
+ startLine: row.startLine ?? row[5],
36
+ endLine: row.endLine ?? row[6],
37
+ });
38
+ }
39
+ }
40
+ catch (error) {
41
+ // Table might not exist or be empty — continue
42
+ if (isDev) {
43
+ console.warn(`Query for ${label} nodes failed:`, error);
44
+ }
45
+ }
46
+ }
47
+ return allNodes;
48
+ };
49
+ /**
50
+ * Batch INSERT embeddings into the CodeEmbedding table
51
+ *
52
+ * Separate lightweight table avoids copy-on-write overhead from
53
+ * UPDATEing nodes with large content fields
54
+ */
55
+ const batchInsertEmbeddings = async (executeWithReusedStatement, updates) => {
56
+ // INSERT into separate embedding table — avoids large-row COW overhead
57
+ const cypher = `CREATE (e:CodeEmbedding {nodeId: $nodeId, embedding: $embedding})`;
58
+ const paramsList = updates.map(u => ({ nodeId: u.id, embedding: u.embedding }));
59
+ await executeWithReusedStatement(cypher, paramsList);
60
+ };
61
+ /** Create the HNSW vector index on the CodeEmbedding table */
62
+ let vectorExtensionLoaded = false;
63
+ const createVectorIndex = async (executeQuery) => {
64
+ // LadybugDB v0.15+ requires explicit VECTOR extension load (once per session)
65
+ if (!vectorExtensionLoaded) {
66
+ try {
67
+ await executeQuery('INSTALL VECTOR');
68
+ await executeQuery('LOAD EXTENSION VECTOR');
69
+ vectorExtensionLoaded = true;
70
+ }
71
+ catch {
72
+ // Extension may already be loaded — index creation will fail clearly if not
73
+ vectorExtensionLoaded = true;
74
+ }
75
+ }
76
+ const cypher = `
77
+ CALL CREATE_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', 'embedding', metric := 'cosine')
78
+ `;
79
+ try {
80
+ await executeQuery(cypher);
81
+ }
82
+ catch (error) {
83
+ // Index might already exist
84
+ if (isDev) {
85
+ console.warn('Vector index creation warning:', error);
86
+ }
87
+ }
88
+ };
89
+ /**
90
+ * Run the full embedding pipeline (load model, embed nodes, create index)
91
+ * @param executeQuery - Execute Cypher queries against LadybugDB
92
+ * @param executeWithReusedStatement - Execute with reused prepared statement
93
+ * @param onProgress - Progress callback
94
+ * @param config - Configuration override
95
+ * @param skipNodeIds - Node IDs that already have embeddings (incremental mode)
96
+ */
97
+ export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatement, onProgress, config = {}, skipNodeIds) => {
98
+ const finalConfig = { ...DEFAULT_EMBEDDING_CONFIG, ...config };
99
+ try {
100
+ // Phase 1: Load model
101
+ onProgress({
102
+ phase: 'loading-model',
103
+ percent: 0,
104
+ modelDownloadPercent: 0,
105
+ });
106
+ await initEmbedder((modelProgress) => {
107
+ const downloadPercent = modelProgress.progress ?? 0;
108
+ onProgress({
109
+ phase: 'loading-model',
110
+ percent: Math.round(downloadPercent * 0.2),
111
+ modelDownloadPercent: downloadPercent,
112
+ });
113
+ }, finalConfig);
114
+ onProgress({
115
+ phase: 'loading-model',
116
+ percent: 20,
117
+ modelDownloadPercent: 100,
118
+ });
119
+ if (isDev) {
120
+ console.log('🔍 Querying embeddable nodes...');
121
+ }
122
+ // Phase 2: Query nodes
123
+ let nodes = await queryEmbeddableNodes(executeQuery);
124
+ // Incremental mode: skip already-embedded nodes
125
+ if (skipNodeIds && skipNodeIds.size > 0) {
126
+ const beforeCount = nodes.length;
127
+ nodes = nodes.filter(n => !skipNodeIds.has(n.id));
128
+ if (isDev) {
129
+ console.log(`📦 Incremental embeddings: ${beforeCount} total, ${skipNodeIds.size} cached, ${nodes.length} to embed`);
130
+ }
131
+ }
132
+ const totalNodes = nodes.length;
133
+ // Enrich nodes with graph context (callers, callees, module) for better embeddings
134
+ // This adds relationship context so "import resolution pipeline" matches processImports
135
+ // because its caller "runPipelineFromRepo" contains "pipeline"
136
+ const graphContext = new Map();
137
+ if (totalNodes > 0) {
138
+ try {
139
+ const nodeIds = nodes.map(n => `'${String(n.id).replace(/'/g, "''")}'`).join(', ');
140
+ // Batch fetch callers
141
+ const callerRows = await executeQuery(`
142
+ MATCH (caller)-[r:CodeRelation {type: 'CALLS'}]->(n) WHERE n.id IN [${nodeIds}] AND r.confidence >= 0.7
143
+ RETURN n.id AS nid, caller.name AS name LIMIT ${totalNodes * 3}
144
+ `);
145
+ const callerMap = new Map();
146
+ for (const r of callerRows) {
147
+ const nid = String(r.nid ?? r[0]);
148
+ if (!callerMap.has(nid))
149
+ callerMap.set(nid, []);
150
+ callerMap.get(nid).push(String(r.name ?? r[1]));
151
+ }
152
+ // Batch fetch callees
153
+ const calleeRows = await executeQuery(`
154
+ MATCH (n)-[r:CodeRelation {type: 'CALLS'}]->(callee) WHERE n.id IN [${nodeIds}] AND r.confidence >= 0.7
155
+ RETURN n.id AS nid, callee.name AS name LIMIT ${totalNodes * 3}
156
+ `);
157
+ const calleeMap = new Map();
158
+ for (const r of calleeRows) {
159
+ const nid = String(r.nid ?? r[0]);
160
+ if (!calleeMap.has(nid))
161
+ calleeMap.set(nid, []);
162
+ calleeMap.get(nid).push(String(r.name ?? r[1]));
163
+ }
164
+ // Batch fetch module
165
+ const moduleRows = await executeQuery(`
166
+ MATCH (n)-[:CodeRelation {type: 'MEMBER_OF'}]->(c:Community) WHERE n.id IN [${nodeIds}]
167
+ RETURN n.id AS nid, c.heuristicLabel AS module LIMIT ${totalNodes}
168
+ `);
169
+ const moduleMap = new Map();
170
+ for (const r of moduleRows) {
171
+ moduleMap.set(String(r.nid ?? r[0]), String(r.module ?? r[1] ?? ''));
172
+ }
173
+ // Assemble
174
+ for (const node of nodes) {
175
+ graphContext.set(node.id, {
176
+ callers: (callerMap.get(node.id) || []).slice(0, 3),
177
+ callees: (calleeMap.get(node.id) || []).slice(0, 3),
178
+ module: moduleMap.get(node.id) || '',
179
+ });
180
+ }
181
+ }
182
+ catch { } // Non-fatal — embeddings work without graph context
183
+ }
184
+ if (isDev) {
185
+ console.log(`📊 Found ${totalNodes} embeddable nodes (${graphContext.size} with graph context)`);
186
+ }
187
+ if (totalNodes === 0) {
188
+ onProgress({
189
+ phase: 'ready',
190
+ percent: 100,
191
+ nodesProcessed: 0,
192
+ totalNodes: 0,
193
+ });
194
+ return;
195
+ }
196
+ // Phase 3: Batch embed
197
+ const batchSize = finalConfig.batchSize;
198
+ const totalBatches = Math.ceil(totalNodes / batchSize);
199
+ let processedNodes = 0;
200
+ onProgress({
201
+ phase: 'embedding',
202
+ percent: 20,
203
+ nodesProcessed: 0,
204
+ totalNodes,
205
+ });
206
+ // Generate ALL text representations with graph context enrichment
207
+ const allTexts = nodes.map(node => {
208
+ const ctx = graphContext.get(node.id);
209
+ let text = generateEmbeddingText(node, finalConfig);
210
+ if (ctx) {
211
+ const parts = [];
212
+ if (ctx.module)
213
+ parts.push(`Module: ${ctx.module}`);
214
+ if (ctx.callers.length > 0)
215
+ parts.push(`Called by: ${ctx.callers.join(', ')}`);
216
+ if (ctx.callees.length > 0)
217
+ parts.push(`Calls: ${ctx.callees.join(', ')}`);
218
+ if (parts.length > 0) {
219
+ const lines = text.split('\n');
220
+ const insertIdx = lines.findIndex(l => l === '') || 2;
221
+ lines.splice(insertIdx, 0, ...parts);
222
+ text = lines.join('\n');
223
+ }
224
+ }
225
+ return text;
226
+ });
227
+ // Send ALL texts to the MLX embedder in one call — it does length-tiered
228
+ // batching internally for optimal Metal GPU utilization
229
+ const allEmbeddings = await embedBatch(allTexts);
230
+ onProgress({
231
+ phase: 'embedding',
232
+ percent: 85,
233
+ nodesProcessed: totalNodes,
234
+ totalNodes,
235
+ });
236
+ // Insert all embeddings into LadybugDB in batches
237
+ const DB_BATCH = 200;
238
+ for (let i = 0; i < nodes.length; i += DB_BATCH) {
239
+ const batchNodes = nodes.slice(i, i + DB_BATCH);
240
+ const batchEmbeddings = allEmbeddings.slice(i, i + DB_BATCH);
241
+ const updates = batchNodes.map((node, j) => ({
242
+ id: node.id,
243
+ embedding: embeddingToArray(batchEmbeddings[j]),
244
+ }));
245
+ await batchInsertEmbeddings(executeWithReusedStatement, updates);
246
+ processedNodes = Math.min(i + DB_BATCH, nodes.length);
247
+ onProgress({
248
+ phase: 'embedding',
249
+ percent: Math.round(85 + ((processedNodes / totalNodes) * 5)),
250
+ nodesProcessed: processedNodes,
251
+ totalNodes,
252
+ });
253
+ }
254
+ // Phase 4: Create HNSW vector index
255
+ onProgress({
256
+ phase: 'indexing',
257
+ percent: 90,
258
+ nodesProcessed: totalNodes,
259
+ totalNodes,
260
+ });
261
+ if (isDev) {
262
+ console.log('📇 Creating vector index...');
263
+ }
264
+ await createVectorIndex(executeQuery);
265
+ // Done
266
+ onProgress({
267
+ phase: 'ready',
268
+ percent: 100,
269
+ nodesProcessed: totalNodes,
270
+ totalNodes,
271
+ });
272
+ if (isDev) {
273
+ console.log('✅ Embedding pipeline complete!');
274
+ }
275
+ }
276
+ catch (error) {
277
+ const errorMessage = error instanceof Error ? error.message : 'Unknown error';
278
+ if (isDev) {
279
+ console.error('❌ Embedding pipeline error:', error);
280
+ }
281
+ onProgress({
282
+ phase: 'error',
283
+ percent: 0,
284
+ error: errorMessage,
285
+ });
286
+ throw error;
287
+ }
288
+ };
289
+ /**
290
+ * Perform semantic search via the CodeEmbedding vector index
291
+ * @param executeQuery - Execute Cypher queries
292
+ * @param query - Search query text
293
+ * @param k - Number of results (default: 10)
294
+ * @param maxDistance - Maximum cosine distance threshold (default: 0.5)
295
+ * @returns Search results ordered by relevance
296
+ */
297
+ export const semanticSearch = async (executeQuery, query, k = 10, maxDistance = 0.5) => {
298
+ if (!isEmbedderReady()) {
299
+ throw new Error('Embedding model not initialized. Run embedding pipeline first.');
300
+ }
301
+ // Embed query text
302
+ const queryEmbedding = await embedText(query);
303
+ const queryVec = embeddingToArray(queryEmbedding);
304
+ const queryVecStr = `[${queryVec.join(',')}]`;
305
+ // Query vector index for nearest neighbors
306
+ const vectorQuery = `
307
+ CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx',
308
+ CAST(${queryVecStr} AS FLOAT[${DEFAULT_EMBEDDING_CONFIG.dimensions}]), ${k})
309
+ YIELD node AS emb, distance
310
+ WITH emb, distance
311
+ WHERE distance < ${maxDistance}
312
+ RETURN emb.nodeId AS nodeId, distance
313
+ ORDER BY distance
314
+ `;
315
+ const embResults = await executeQuery(vectorQuery);
316
+ if (embResults.length === 0) {
317
+ return [];
318
+ }
319
+ // Group by label for batched metadata queries
320
+ const byLabel = new Map();
321
+ for (const embRow of embResults) {
322
+ const nodeId = embRow.nodeId ?? embRow[0];
323
+ const distance = embRow.distance ?? embRow[1];
324
+ const labelEndIdx = nodeId.indexOf(':');
325
+ const label = labelEndIdx > 0 ? nodeId.substring(0, labelEndIdx) : 'Unknown';
326
+ if (!byLabel.has(label))
327
+ byLabel.set(label, []);
328
+ byLabel.get(label).push({ nodeId, distance });
329
+ }
330
+ // Batch-fetch node metadata per label
331
+ const results = [];
332
+ for (const [label, items] of byLabel) {
333
+ const idList = items.map(i => `'${i.nodeId.replace(/'/g, "''")}'`).join(', ');
334
+ try {
335
+ let nodeQuery;
336
+ if (label === 'File') {
337
+ nodeQuery = `
338
+ MATCH (n:File) WHERE n.id IN [${idList}]
339
+ RETURN n.id AS id, n.name AS name, n.filePath AS filePath
340
+ `;
341
+ }
342
+ else {
343
+ nodeQuery = `
344
+ MATCH (n:${label}) WHERE n.id IN [${idList}]
345
+ RETURN n.id AS id, n.name AS name, n.filePath AS filePath,
346
+ n.startLine AS startLine, n.endLine AS endLine
347
+ `;
348
+ }
349
+ const nodeRows = await executeQuery(nodeQuery);
350
+ const rowMap = new Map();
351
+ for (const row of nodeRows) {
352
+ const id = row.id ?? row[0];
353
+ rowMap.set(id, row);
354
+ }
355
+ for (const item of items) {
356
+ const nodeRow = rowMap.get(item.nodeId);
357
+ if (nodeRow) {
358
+ results.push({
359
+ nodeId: item.nodeId,
360
+ name: nodeRow.name ?? nodeRow[1] ?? '',
361
+ label,
362
+ filePath: nodeRow.filePath ?? nodeRow[2] ?? '',
363
+ distance: item.distance,
364
+ startLine: label !== 'File' ? (nodeRow.startLine ?? nodeRow[3]) : undefined,
365
+ endLine: label !== 'File' ? (nodeRow.endLine ?? nodeRow[4]) : undefined,
366
+ });
367
+ }
368
+ }
369
+ }
370
+ catch {
371
+ // Table might not exist — skip
372
+ }
373
+ }
374
+ // Re-sort by distance (batch queries may have mixed order)
375
+ results.sort((a, b) => a.distance - b.distance);
376
+ return results;
377
+ };
378
+ /**
379
+ * Semantic search with flattened results (graph expansion placeholder)
380
+ *
381
+ * For full graph traversal, use the execute_vector_cypher tool directly
382
+ *
383
+ * @param executeQuery - Execute Cypher queries
384
+ * @param query - Search query text
385
+ * @param k - Number of semantic matches (default: 5)
386
+ * @param _hops - Unused, kept for API compatibility
387
+ */
388
+ export const semanticSearchWithContext = async (executeQuery, query, k = 5, _hops = 1) => {
389
+ // Return semantic results directly — use execute_vector_cypher for graph traversal
390
+ const results = await semanticSearch(executeQuery, query, k, 0.5);
391
+ return results.map(r => ({
392
+ matchId: r.nodeId,
393
+ matchName: r.name,
394
+ matchLabel: r.label,
395
+ matchPath: r.filePath,
396
+ distance: r.distance,
397
+ connectedId: null,
398
+ connectedName: null,
399
+ connectedLabel: null,
400
+ relationType: null,
401
+ }));
402
+ };
@@ -0,0 +1,5 @@
1
+ /** @file index.ts @description Barrel re-exports for the embedding pipeline system */
2
+ export * from './types.js';
3
+ export * from './embedder.js';
4
+ export * from './text-generator.js';
5
+ export * from './embedding-pipeline.js';
@@ -0,0 +1,6 @@
1
+ // code-mapper/src/core/embeddings/index.ts
2
+ /** @file index.ts @description Barrel re-exports for the embedding pipeline system */
3
+ export * from './types.js';
4
+ export * from './embedder.js';
5
+ export * from './text-generator.js';
6
+ export * from './embedding-pipeline.js';
@@ -0,0 +1,20 @@
1
+ /**
2
+ * @file text-generator.ts
3
+ * @description Pure functions to generate embedding text from code nodes,
4
+ * combining node metadata with code snippets for semantic matching
5
+ */
6
+ import type { EmbeddableNode, EmbeddingConfig } from './types.js';
7
+ /**
8
+ * Generate embedding text for any embeddable node (dispatches by label)
9
+ * @param node - The node to generate text for
10
+ * @param config - Optional configuration for max snippet length
11
+ * @returns Text suitable for embedding
12
+ */
13
+ export declare const generateEmbeddingText: (node: EmbeddableNode, config?: Partial<EmbeddingConfig>) => string;
14
+ /**
15
+ * Generate embedding texts for a batch of nodes
16
+ * @param nodes - Nodes to generate text for
17
+ * @param config - Optional configuration
18
+ * @returns Texts in the same order as input nodes
19
+ */
20
+ export declare const generateBatchEmbeddingTexts: (nodes: EmbeddableNode[], config?: Partial<EmbeddingConfig>) => string[];