@veewo/gitnexus 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231) hide show
  1. package/README.md +234 -0
  2. package/dist/benchmark/agent-context/evaluators.d.ts +9 -0
  3. package/dist/benchmark/agent-context/evaluators.js +196 -0
  4. package/dist/benchmark/agent-context/evaluators.test.d.ts +1 -0
  5. package/dist/benchmark/agent-context/evaluators.test.js +39 -0
  6. package/dist/benchmark/agent-context/io.d.ts +2 -0
  7. package/dist/benchmark/agent-context/io.js +23 -0
  8. package/dist/benchmark/agent-context/io.test.d.ts +1 -0
  9. package/dist/benchmark/agent-context/io.test.js +19 -0
  10. package/dist/benchmark/agent-context/report.d.ts +2 -0
  11. package/dist/benchmark/agent-context/report.js +59 -0
  12. package/dist/benchmark/agent-context/report.test.d.ts +1 -0
  13. package/dist/benchmark/agent-context/report.test.js +85 -0
  14. package/dist/benchmark/agent-context/runner.d.ts +46 -0
  15. package/dist/benchmark/agent-context/runner.js +111 -0
  16. package/dist/benchmark/agent-context/runner.test.d.ts +1 -0
  17. package/dist/benchmark/agent-context/runner.test.js +79 -0
  18. package/dist/benchmark/agent-context/tool-runner.d.ts +7 -0
  19. package/dist/benchmark/agent-context/tool-runner.js +18 -0
  20. package/dist/benchmark/agent-context/tool-runner.test.d.ts +1 -0
  21. package/dist/benchmark/agent-context/tool-runner.test.js +11 -0
  22. package/dist/benchmark/agent-context/types.d.ts +40 -0
  23. package/dist/benchmark/agent-context/types.js +1 -0
  24. package/dist/benchmark/analyze-runner.d.ts +16 -0
  25. package/dist/benchmark/analyze-runner.js +51 -0
  26. package/dist/benchmark/analyze-runner.test.d.ts +1 -0
  27. package/dist/benchmark/analyze-runner.test.js +37 -0
  28. package/dist/benchmark/evaluators.d.ts +6 -0
  29. package/dist/benchmark/evaluators.js +10 -0
  30. package/dist/benchmark/evaluators.test.d.ts +1 -0
  31. package/dist/benchmark/evaluators.test.js +12 -0
  32. package/dist/benchmark/io.d.ts +7 -0
  33. package/dist/benchmark/io.js +25 -0
  34. package/dist/benchmark/io.test.d.ts +1 -0
  35. package/dist/benchmark/io.test.js +35 -0
  36. package/dist/benchmark/neonspark-candidates.d.ts +19 -0
  37. package/dist/benchmark/neonspark-candidates.js +94 -0
  38. package/dist/benchmark/neonspark-candidates.test.d.ts +1 -0
  39. package/dist/benchmark/neonspark-candidates.test.js +43 -0
  40. package/dist/benchmark/neonspark-materialize.d.ts +19 -0
  41. package/dist/benchmark/neonspark-materialize.js +111 -0
  42. package/dist/benchmark/neonspark-materialize.test.d.ts +1 -0
  43. package/dist/benchmark/neonspark-materialize.test.js +124 -0
  44. package/dist/benchmark/neonspark-sync.d.ts +3 -0
  45. package/dist/benchmark/neonspark-sync.js +53 -0
  46. package/dist/benchmark/neonspark-sync.test.d.ts +1 -0
  47. package/dist/benchmark/neonspark-sync.test.js +20 -0
  48. package/dist/benchmark/report.d.ts +1 -0
  49. package/dist/benchmark/report.js +7 -0
  50. package/dist/benchmark/runner.d.ts +48 -0
  51. package/dist/benchmark/runner.js +302 -0
  52. package/dist/benchmark/runner.test.d.ts +1 -0
  53. package/dist/benchmark/runner.test.js +50 -0
  54. package/dist/benchmark/scoring.d.ts +16 -0
  55. package/dist/benchmark/scoring.js +27 -0
  56. package/dist/benchmark/scoring.test.d.ts +1 -0
  57. package/dist/benchmark/scoring.test.js +24 -0
  58. package/dist/benchmark/tool-runner.d.ts +6 -0
  59. package/dist/benchmark/tool-runner.js +17 -0
  60. package/dist/benchmark/types.d.ts +36 -0
  61. package/dist/benchmark/types.js +1 -0
  62. package/dist/cli/ai-context.d.ts +22 -0
  63. package/dist/cli/ai-context.js +184 -0
  64. package/dist/cli/ai-context.test.d.ts +1 -0
  65. package/dist/cli/ai-context.test.js +30 -0
  66. package/dist/cli/analyze-multi-scope-regression.test.d.ts +1 -0
  67. package/dist/cli/analyze-multi-scope-regression.test.js +22 -0
  68. package/dist/cli/analyze-options.d.ts +7 -0
  69. package/dist/cli/analyze-options.js +56 -0
  70. package/dist/cli/analyze-options.test.d.ts +1 -0
  71. package/dist/cli/analyze-options.test.js +36 -0
  72. package/dist/cli/analyze.d.ts +14 -0
  73. package/dist/cli/analyze.js +384 -0
  74. package/dist/cli/augment.d.ts +13 -0
  75. package/dist/cli/augment.js +33 -0
  76. package/dist/cli/benchmark-agent-context.d.ts +29 -0
  77. package/dist/cli/benchmark-agent-context.js +61 -0
  78. package/dist/cli/benchmark-agent-context.test.d.ts +1 -0
  79. package/dist/cli/benchmark-agent-context.test.js +80 -0
  80. package/dist/cli/benchmark-unity.d.ts +15 -0
  81. package/dist/cli/benchmark-unity.js +31 -0
  82. package/dist/cli/benchmark-unity.test.d.ts +1 -0
  83. package/dist/cli/benchmark-unity.test.js +18 -0
  84. package/dist/cli/claude-hooks.d.ts +22 -0
  85. package/dist/cli/claude-hooks.js +97 -0
  86. package/dist/cli/clean.d.ts +10 -0
  87. package/dist/cli/clean.js +60 -0
  88. package/dist/cli/eval-server.d.ts +30 -0
  89. package/dist/cli/eval-server.js +372 -0
  90. package/dist/cli/index.d.ts +2 -0
  91. package/dist/cli/index.js +182 -0
  92. package/dist/cli/list.d.ts +6 -0
  93. package/dist/cli/list.js +33 -0
  94. package/dist/cli/mcp.d.ts +8 -0
  95. package/dist/cli/mcp.js +34 -0
  96. package/dist/cli/repo-manager-alias.test.d.ts +1 -0
  97. package/dist/cli/repo-manager-alias.test.js +40 -0
  98. package/dist/cli/scope-filter.test.d.ts +1 -0
  99. package/dist/cli/scope-filter.test.js +49 -0
  100. package/dist/cli/serve.d.ts +4 -0
  101. package/dist/cli/serve.js +6 -0
  102. package/dist/cli/setup.d.ts +8 -0
  103. package/dist/cli/setup.js +311 -0
  104. package/dist/cli/setup.test.d.ts +1 -0
  105. package/dist/cli/setup.test.js +31 -0
  106. package/dist/cli/status.d.ts +6 -0
  107. package/dist/cli/status.js +27 -0
  108. package/dist/cli/tool.d.ts +40 -0
  109. package/dist/cli/tool.js +94 -0
  110. package/dist/cli/version.test.d.ts +1 -0
  111. package/dist/cli/version.test.js +19 -0
  112. package/dist/cli/wiki.d.ts +15 -0
  113. package/dist/cli/wiki.js +361 -0
  114. package/dist/config/ignore-service.d.ts +1 -0
  115. package/dist/config/ignore-service.js +210 -0
  116. package/dist/config/supported-languages.d.ts +12 -0
  117. package/dist/config/supported-languages.js +15 -0
  118. package/dist/core/augmentation/engine.d.ts +26 -0
  119. package/dist/core/augmentation/engine.js +213 -0
  120. package/dist/core/embeddings/embedder.d.ts +60 -0
  121. package/dist/core/embeddings/embedder.js +251 -0
  122. package/dist/core/embeddings/embedding-pipeline.d.ts +51 -0
  123. package/dist/core/embeddings/embedding-pipeline.js +329 -0
  124. package/dist/core/embeddings/index.d.ts +9 -0
  125. package/dist/core/embeddings/index.js +9 -0
  126. package/dist/core/embeddings/text-generator.d.ts +24 -0
  127. package/dist/core/embeddings/text-generator.js +182 -0
  128. package/dist/core/embeddings/types.d.ts +87 -0
  129. package/dist/core/embeddings/types.js +32 -0
  130. package/dist/core/graph/graph.d.ts +2 -0
  131. package/dist/core/graph/graph.js +66 -0
  132. package/dist/core/graph/types.d.ts +61 -0
  133. package/dist/core/graph/types.js +1 -0
  134. package/dist/core/ingestion/ast-cache.d.ts +11 -0
  135. package/dist/core/ingestion/ast-cache.js +34 -0
  136. package/dist/core/ingestion/call-processor.d.ts +15 -0
  137. package/dist/core/ingestion/call-processor.js +327 -0
  138. package/dist/core/ingestion/cluster-enricher.d.ts +38 -0
  139. package/dist/core/ingestion/cluster-enricher.js +170 -0
  140. package/dist/core/ingestion/community-processor.d.ts +39 -0
  141. package/dist/core/ingestion/community-processor.js +312 -0
  142. package/dist/core/ingestion/entry-point-scoring.d.ts +39 -0
  143. package/dist/core/ingestion/entry-point-scoring.js +260 -0
  144. package/dist/core/ingestion/filesystem-walker.d.ts +28 -0
  145. package/dist/core/ingestion/filesystem-walker.js +80 -0
  146. package/dist/core/ingestion/framework-detection.d.ts +39 -0
  147. package/dist/core/ingestion/framework-detection.js +235 -0
  148. package/dist/core/ingestion/heritage-processor.d.ts +20 -0
  149. package/dist/core/ingestion/heritage-processor.js +197 -0
  150. package/dist/core/ingestion/import-processor.d.ts +38 -0
  151. package/dist/core/ingestion/import-processor.js +778 -0
  152. package/dist/core/ingestion/parsing-processor.d.ts +15 -0
  153. package/dist/core/ingestion/parsing-processor.js +291 -0
  154. package/dist/core/ingestion/pipeline.d.ts +5 -0
  155. package/dist/core/ingestion/pipeline.js +323 -0
  156. package/dist/core/ingestion/process-processor.d.ts +51 -0
  157. package/dist/core/ingestion/process-processor.js +309 -0
  158. package/dist/core/ingestion/scope-filter.d.ts +25 -0
  159. package/dist/core/ingestion/scope-filter.js +100 -0
  160. package/dist/core/ingestion/structure-processor.d.ts +2 -0
  161. package/dist/core/ingestion/structure-processor.js +36 -0
  162. package/dist/core/ingestion/symbol-table.d.ts +33 -0
  163. package/dist/core/ingestion/symbol-table.js +38 -0
  164. package/dist/core/ingestion/tree-sitter-queries.d.ts +12 -0
  165. package/dist/core/ingestion/tree-sitter-queries.js +398 -0
  166. package/dist/core/ingestion/utils.d.ts +10 -0
  167. package/dist/core/ingestion/utils.js +50 -0
  168. package/dist/core/ingestion/workers/parse-worker.d.ts +59 -0
  169. package/dist/core/ingestion/workers/parse-worker.js +672 -0
  170. package/dist/core/ingestion/workers/worker-pool.d.ts +16 -0
  171. package/dist/core/ingestion/workers/worker-pool.js +120 -0
  172. package/dist/core/kuzu/csv-generator.d.ts +29 -0
  173. package/dist/core/kuzu/csv-generator.js +336 -0
  174. package/dist/core/kuzu/kuzu-adapter.d.ts +101 -0
  175. package/dist/core/kuzu/kuzu-adapter.js +753 -0
  176. package/dist/core/kuzu/schema.d.ts +53 -0
  177. package/dist/core/kuzu/schema.js +407 -0
  178. package/dist/core/search/bm25-index.d.ts +23 -0
  179. package/dist/core/search/bm25-index.js +95 -0
  180. package/dist/core/search/hybrid-search.d.ts +49 -0
  181. package/dist/core/search/hybrid-search.js +118 -0
  182. package/dist/core/tree-sitter/parser-loader.d.ts +4 -0
  183. package/dist/core/tree-sitter/parser-loader.js +44 -0
  184. package/dist/core/wiki/generator.d.ts +110 -0
  185. package/dist/core/wiki/generator.js +786 -0
  186. package/dist/core/wiki/graph-queries.d.ts +80 -0
  187. package/dist/core/wiki/graph-queries.js +238 -0
  188. package/dist/core/wiki/html-viewer.d.ts +10 -0
  189. package/dist/core/wiki/html-viewer.js +297 -0
  190. package/dist/core/wiki/llm-client.d.ts +40 -0
  191. package/dist/core/wiki/llm-client.js +162 -0
  192. package/dist/core/wiki/prompts.d.ts +53 -0
  193. package/dist/core/wiki/prompts.js +174 -0
  194. package/dist/lib/utils.d.ts +1 -0
  195. package/dist/lib/utils.js +3 -0
  196. package/dist/mcp/core/embedder.d.ts +27 -0
  197. package/dist/mcp/core/embedder.js +108 -0
  198. package/dist/mcp/core/kuzu-adapter.d.ts +34 -0
  199. package/dist/mcp/core/kuzu-adapter.js +231 -0
  200. package/dist/mcp/local/local-backend.d.ts +160 -0
  201. package/dist/mcp/local/local-backend.js +1646 -0
  202. package/dist/mcp/resources.d.ts +31 -0
  203. package/dist/mcp/resources.js +407 -0
  204. package/dist/mcp/server.d.ts +23 -0
  205. package/dist/mcp/server.js +251 -0
  206. package/dist/mcp/staleness.d.ts +15 -0
  207. package/dist/mcp/staleness.js +29 -0
  208. package/dist/mcp/tools.d.ts +24 -0
  209. package/dist/mcp/tools.js +195 -0
  210. package/dist/server/api.d.ts +10 -0
  211. package/dist/server/api.js +344 -0
  212. package/dist/server/mcp-http.d.ts +13 -0
  213. package/dist/server/mcp-http.js +100 -0
  214. package/dist/storage/git.d.ts +6 -0
  215. package/dist/storage/git.js +32 -0
  216. package/dist/storage/repo-manager.d.ts +125 -0
  217. package/dist/storage/repo-manager.js +257 -0
  218. package/dist/types/pipeline.d.ts +34 -0
  219. package/dist/types/pipeline.js +18 -0
  220. package/hooks/claude/gitnexus-hook.cjs +135 -0
  221. package/hooks/claude/pre-tool-use.sh +78 -0
  222. package/hooks/claude/session-start.sh +42 -0
  223. package/package.json +92 -0
  224. package/skills/gitnexus-cli.md +82 -0
  225. package/skills/gitnexus-debugging.md +89 -0
  226. package/skills/gitnexus-exploring.md +78 -0
  227. package/skills/gitnexus-guide.md +64 -0
  228. package/skills/gitnexus-impact-analysis.md +97 -0
  229. package/skills/gitnexus-refactoring.md +121 -0
  230. package/vendor/leiden/index.cjs +355 -0
  231. package/vendor/leiden/utils.cjs +392 -0
@@ -0,0 +1,213 @@
1
+ /**
2
+ * Augmentation Engine
3
+ *
4
+ * Lightweight, fast-path enrichment of search patterns with knowledge graph context.
5
+ * Designed to be called from platform hooks (Claude Code PreToolUse, Cursor beforeShellExecution)
6
+ * when an agent runs grep/glob/search.
7
+ *
8
+ * Performance target: <500ms cold start, <200ms warm.
9
+ *
10
+ * Design decisions:
11
+ * - Uses only BM25 search (no semantic/embedding) for speed
12
+ * - Clusters used internally for ranking, NEVER in output
13
+ * - Output is pure relationships: callers, callees, process participation
14
+ * - Graceful failure: any error → return empty string
15
+ */
16
+ import path from 'path';
17
+ import { listRegisteredRepos } from '../../storage/repo-manager.js';
18
/**
 * Pick the registered repository that belongs to a working directory.
 *
 * A repo is a candidate when its root equals `cwd`, contains `cwd`, or lies
 * underneath `cwd`. Containment is checked on path-separator boundaries so a
 * sibling such as /projects/gitnexusv2 never matches /projects/gitnexus.
 * Among candidates the one with the longest root path wins.
 *
 * @param cwd - Directory to locate a repo for (resolved to an absolute path).
 * @returns `{ name, storagePath, kuzuPath }` for the winning repo, or `null`
 *          when nothing matches or any step throws.
 */
async function findRepoForCwd(cwd) {
    try {
        const registered = await listRegisteredRepos({ validate: true });
        // Windows paths compare case-insensitively (drive letters may be D: or d:).
        const foldCase = process.platform === 'win32';
        const canonical = (p) => {
            const absolute = path.resolve(p);
            return foldCase ? absolute.toLowerCase() : absolute;
        };
        const here = canonical(cwd);
        const boundary = path.sep;
        let winner = null;
        let winnerLen = 0;
        for (const candidate of registered) {
            const root = canonical(candidate.path);
            const related = here === root
                || here.startsWith(root + boundary)
                || root.startsWith(here + boundary);
            // Keep only strictly longer roots so the most specific repo wins.
            if (!related || root.length <= winnerLen) {
                continue;
            }
            winner = candidate;
            winnerLen = root.length;
        }
        if (!winner) {
            return null;
        }
        const { name, storagePath } = winner;
        return {
            name,
            storagePath,
            kuzuPath: path.join(storagePath, 'kuzu'),
        };
    }
    catch {
        return null;
    }
}
66
/**
 * Escape a value for use inside a single-quoted Cypher string literal by
 * doubling every single quote.
 */
function escapeCypherLiteral(value) {
    return value.replace(/'/g, "''");
}
/**
 * Run one Cypher query and map each returned row.
 * Any failure (query or mapping) yields an empty list so a single bad lookup
 * never aborts the whole augmentation.
 */
async function queryRows(executeQuery, repoId, cypher, mapRow) {
    try {
        const rows = await executeQuery(repoId, cypher);
        return rows.map((row) => mapRow(row));
    }
    catch {
        return [];
    }
}
/**
 * Augment a search pattern with knowledge graph context.
 *
 * 1. BM25 search for the pattern
 * 2. Map the top file hits to symbols whose name contains the pattern's first word
 * 3. For those symbols, fetch callers/callees/processes and cluster cohesion
 * 4. Rank by cohesion (never shown) and format as a text block
 *
 * @param pattern - Search text; patterns shorter than 3 characters are ignored.
 * @param cwd - Directory used to pick the repo; defaults to `process.cwd()`.
 * @returns The formatted context block, or an empty string when there is
 *          nothing to report or anything fails (graceful failure).
 */
export async function augment(pattern, cwd) {
    if (!pattern || pattern.length < 3)
        return '';
    const workDir = cwd || process.cwd();
    try {
        // Only the first whitespace-delimited word filters symbol names. Trim
        // first: with leading whitespace the first split element is '' and
        // `CONTAINS ''` would match every symbol in the file.
        const nameToken = pattern.trim().split(/\s+/)[0];
        if (!nameToken)
            return '';
        const repo = await findRepoForCwd(workDir);
        if (!repo)
            return '';
        // Lazy-load the database layers so the early exits above stay cheap
        const { initKuzu, executeQuery, isKuzuReady } = await import('../../mcp/core/kuzu-adapter.js');
        const { searchFTSFromKuzu } = await import('../search/bm25-index.js');
        const repoId = repo.name.toLowerCase();
        if (!isKuzuReady(repoId)) {
            await initKuzu(repoId, repo.kuzuPath);
        }
        // Step 1: BM25 search (fast, no embeddings)
        const bm25Results = await searchFTSFromKuzu(pattern, 10, repoId);
        if (bm25Results.length === 0)
            return '';
        // Step 2: Map BM25 file results to symbols
        const escapedToken = escapeCypherLiteral(nameToken);
        const symbolMatches = [];
        for (const result of bm25Results.slice(0, 5)) {
            const escapedFile = escapeCypherLiteral(result.filePath);
            const symbols = await queryRows(executeQuery, repoId, `
          MATCH (n) WHERE n.filePath = '${escapedFile}'
          AND n.name CONTAINS '${escapedToken}'
          RETURN n.id AS id, n.name AS name, labels(n)[0] AS type, n.filePath AS filePath
          LIMIT 3
        `, (sym) => ({
                // Rows may arrive keyed by alias or by column position
                nodeId: sym.id || sym[0],
                name: sym.name || sym[1],
                type: sym.type || sym[2],
                filePath: sym.filePath || sym[3],
                score: result.score,
            }));
            symbolMatches.push(...symbols);
        }
        if (symbolMatches.length === 0)
            return '';
        // Step 3: For top matches, fetch callers/callees/processes
        // plus cluster cohesion, which is used for ranking only
        const rowName = (row) => row.name || row[0];
        const enriched = [];
        const seen = new Set();
        for (const sym of symbolMatches.slice(0, 5)) {
            if (seen.has(sym.nodeId))
                continue;
            seen.add(sym.nodeId);
            const id = escapeCypherLiteral(sym.nodeId);
            const callers = (await queryRows(executeQuery, repoId, `
          MATCH (caller)-[:CodeRelation {type: 'CALLS'}]->(n {id: '${id}'})
          RETURN caller.name AS name
          LIMIT 3
        `, rowName)).filter(Boolean);
            const callees = (await queryRows(executeQuery, repoId, `
          MATCH (n {id: '${id}'})-[:CodeRelation {type: 'CALLS'}]->(callee)
          RETURN callee.name AS name
          LIMIT 3
        `, rowName)).filter(Boolean);
            const processes = await queryRows(executeQuery, repoId, `
          MATCH (n {id: '${id}'})-[r:CodeRelation {type: 'STEP_IN_PROCESS'}]->(p:Process)
          RETURN p.heuristicLabel AS label, r.step AS step, p.stepCount AS stepCount
        `, (row) => {
                const label = row.label || row[0];
                // `??` rather than `||`: a step or count of 0 is a real value
                const step = row.step ?? row[1];
                const stepCount = row.stepCount ?? row[2];
                return `${label} (step ${step}/${stepCount})`;
            });
            // Query is LIMIT 1; default to 0 when the symbol has no community
            const [cohesion = 0] = await queryRows(executeQuery, repoId, `
          MATCH (n {id: '${id}'})-[:CodeRelation {type: 'MEMBER_OF'}]->(c:Community)
          RETURN c.cohesion AS cohesion
          LIMIT 1
        `, (row) => (row.cohesion ?? row[0]) || 0);
            enriched.push({
                name: sym.name,
                filePath: sym.filePath,
                callers,
                callees,
                processes,
                cohesion,
            });
        }
        if (enriched.length === 0)
            return '';
        // Step 4: Rank by cohesion (internal signal) and format
        enriched.sort((a, b) => b.cohesion - a.cohesion);
        const lines = [`[GitNexus] ${enriched.length} related symbols found:`, ''];
        for (const item of enriched) {
            lines.push(`${item.name} (${item.filePath})`);
            if (item.callers.length > 0) {
                lines.push(` Called by: ${item.callers.join(', ')}`);
            }
            if (item.callees.length > 0) {
                lines.push(` Calls: ${item.callees.join(', ')}`);
            }
            if (item.processes.length > 0) {
                lines.push(` Flows: ${item.processes.join(', ')}`);
            }
            lines.push('');
        }
        return lines.join('\n').trim();
    }
    catch {
        // Graceful failure — never break the original tool
        return '';
    }
}
@@ -0,0 +1,60 @@
1
+ /**
2
+ * Embedder Module
3
+ *
4
+ * Singleton factory for transformers.js embedding pipeline.
5
+ * Handles model loading, caching, and both single and batch embedding operations.
6
+ *
7
+ * Uses snowflake-arctic-embed-xs by default (22M params, 384 dims, ~90MB)
8
+ */
9
+ import { type FeatureExtractionPipeline } from '@huggingface/transformers';
10
+ import { type EmbeddingConfig, type ModelProgress } from './types.js';
11
+ /**
12
+ * Callback that receives a ModelProgress update each time the model loader
+ * reports progress while initEmbedder() is loading the model.
13
+ */
14
+ export type ModelProgressCallback = (progress: ModelProgress) => void;
15
+ /**
16
+ * Get the device the loaded embedding model is running on.
+ *
+ * @returns The device chosen by the last successful initEmbedder() call,
+ * or null if no model has been loaded yet.
17
+ */
18
+ export declare const getCurrentDevice: () => "dml" | "cuda" | "cpu" | "wasm" | null;
19
+ /**
20
+ * Initialize the embedding model
21
+ * Uses singleton pattern - only loads once, subsequent calls return cached instance.
+ * Calls made while a load is in flight share that load's promise, and all
+ * arguments are ignored once an instance exists.
+ * When the chosen device is a GPU backend ("dml" or "cuda") and loading on it
+ * fails, loading is retried on "cpu" before the promise rejects.
22
+ *
23
+ * @param onProgress - Optional callback for model download progress
24
+ * @param config - Optional configuration override (merged over the default embedding config)
25
+ * @param forceDevice - Force a specific device (takes precedence over config.device)
26
+ * @returns Promise resolving to the embedder pipeline; rejects if the model cannot be loaded
27
+ */
28
+ export declare const initEmbedder: (onProgress?: ModelProgressCallback, config?: Partial<EmbeddingConfig>, forceDevice?: "dml" | "cuda" | "cpu" | "wasm") => Promise<FeatureExtractionPipeline>;
29
+ /**
30
+ * Check if the embedder is initialized and ready
+ *
+ * @returns true while a loaded embedder instance is held, false otherwise
31
+ */
32
+ export declare const isEmbedderReady: () => boolean;
33
+ /**
34
+ * Get the embedder instance
+ *
+ * @returns The loaded embedder pipeline
+ * @throws Error if no embedder is loaded (call initEmbedder() first)
35
+ */
36
+ export declare const getEmbedder: () => FeatureExtractionPipeline;
37
+ /**
38
+ * Embed a single text string using mean pooling and normalization
39
+ *
40
+ * @param text - Text to embed
41
+ * @returns Float32Array of embedding vector (384 dimensions with the default model)
+ * @throws Error if the embedder has not been initialized
42
+ */
43
+ export declare const embedText: (text: string) => Promise<Float32Array>;
44
+ /**
45
+ * Embed multiple texts in a single batch
46
+ * More efficient than calling embedText multiple times
47
+ *
48
+ * @param texts - Array of texts to embed
49
+ * @returns One Float32Array per input text, in input order; an empty array
+ * when texts is empty (the embedder is not consulted in that case)
+ * @throws Error if texts is non-empty and the embedder has not been initialized
50
+ */
51
+ export declare const embedBatch: (texts: string[]) => Promise<Float32Array[]>;
52
+ /**
53
+ * Convert Float32Array to regular number array (for KuzuDB storage)
+ *
+ * @param embedding - Embedding vector to convert; it is not modified
+ * @returns A new number[] holding the same values in the same order
54
+ */
55
+ export declare const embeddingToArray: (embedding: Float32Array) => number[];
56
+ /**
57
+ * Cleanup the embedder (free memory)
58
+ * Call this when done with embeddings.
+ * Does nothing when no embedder is loaded; errors raised while disposing the
+ * underlying pipeline are ignored. A later initEmbedder() call loads a fresh instance.
59
+ */
60
+ export declare const disposeEmbedder: () => Promise<void>;
@@ -0,0 +1,251 @@
1
+ /**
2
+ * Embedder Module
3
+ *
4
+ * Singleton factory for transformers.js embedding pipeline.
5
+ * Handles model loading, caching, and both single and batch embedding operations.
6
+ *
7
+ * Uses snowflake-arctic-embed-xs by default (22M params, 384 dims, ~90MB)
8
+ */
9
// Quiet ONNX Runtime's native warnings (e.g. VerifyEachNodeIsAssignedToAnEp).
// Level 3 = Error only (skips Warning/Info). A value the user already
// exported is left untouched; an unset or empty variable gets the default.
// The intent is for this to be in place before onnxruntime-node is loaded by
// transformers.js.
// NOTE(review): in an ES module, static `import` declarations are evaluated
// before any statement in the module body, so this assignment runs after
// '@huggingface/transformers' has been loaded. Confirm that ONNX Runtime reads
// the variable lazily, or move this into a module imported first.
process.env.ORT_LOG_LEVEL = process.env.ORT_LOG_LEVEL || '3';
15
+ import { pipeline, env } from '@huggingface/transformers';
16
+ import { existsSync } from 'fs';
17
+ import { execFileSync } from 'child_process';
18
+ import { join } from 'path';
19
+ import { DEFAULT_EMBEDDING_CONFIG } from './types.js';
20
/**
 * Probe for the CUDA shared library before asking ONNX Runtime for a CUDA
 * session. The native layer crashes (uncatchable) when CUDA is requested
 * without its libraries, so the probe has to happen first.
 *
 * Two sources are consulted, in order:
 *   1. the dynamic linker cache (`ldconfig -p`), which covers any install
 *      path registered with the linker;
 *   2. every directory listed in CUDA_PATH and LD_LIBRARY_PATH, for installs
 *      the linker cache does not know about (conda, manual /opt/cuda, etc.).
 *
 * @returns true when libcublasLt.so.12 is found by either source.
 */
function isCudaAvailable() {
    const cudaLibrary = 'libcublasLt.so.12';
    let linkerCache = '';
    try {
        linkerCache = execFileSync('ldconfig', ['-p'], { timeout: 3000, encoding: 'utf-8' });
    }
    catch {
        // ldconfig not available (e.g. non-standard container)
    }
    if (linkerCache.includes(cudaLibrary)) {
        return true;
    }
    // Each listed directory may be an install root (lib64/ or lib/ below it)
    // or the library directory itself.
    const subdirs = ['lib64', 'lib', ''];
    for (const variable of ['CUDA_PATH', 'LD_LIBRARY_PATH']) {
        const roots = (process.env[variable] ?? '').split(':').filter(Boolean);
        const found = roots.some((root) => subdirs.some((sub) => existsSync(join(root, sub, cudaLibrary))));
        if (found) {
            return true;
        }
    }
    return false;
}
54
// Singleton state shared by the functions in this module.
// The loaded pipeline, or null while nothing is loaded.
let embedderInstance = null;
// True while a model load is in flight.
let isInitializing = false;
// Promise for the in-flight (or last started) load.
let initPromise = null;
// Device the loaded pipeline was created on; null before the first successful load.
let currentDevice = null;
/**
 * Report which device inference runs on.
 *
 * @returns The device recorded by the last successful model load, or null
 *          when no model has been loaded yet.
 */
export const getCurrentDevice = () => {
    return currentDevice;
};
63
/**
 * Initialize the embedding model.
 *
 * Singleton: once a pipeline is loaded every later call returns it and ignores
 * its arguments; calls made while a load is in flight share that load's promise.
 *
 * Device selection: `forceDevice` wins, then an explicit `config.device`, and
 * only for `'auto'` is the platform probed (DirectML on Windows, otherwise CUDA
 * when its libraries are present, else CPU). A GPU device ('dml' or 'cuda')
 * that fails to load is retried on 'cpu'.
 *
 * @param onProgress - Optional callback for model download progress
 * @param config - Optional configuration override, merged over DEFAULT_EMBEDDING_CONFIG
 * @param forceDevice - Force a specific device
 * @returns Promise resolving to the embedder pipeline; rejects with the last
 *          device's error when every attempted device fails
 */
export const initEmbedder = async (onProgress, config = {}, forceDevice) => {
    // Return existing instance if available
    if (embedderInstance) {
        return embedderInstance;
    }
    // If already initializing, wait for that promise
    if (isInitializing && initPromise) {
        return initPromise;
    }
    isInitializing = true;
    const finalConfig = { ...DEFAULT_EMBEDDING_CONFIG, ...config };
    // Resolved lazily: the CUDA probe spawns a subprocess, so skip it whenever
    // the caller has already named a device.
    // Probing matters because ONNX Runtime crashes (uncatchable native error)
    // if CUDA is attempted without the required shared libraries.
    const detectGpuDevice = () => {
        if (process.platform === 'win32') {
            return 'dml'; // DirectML (DirectX12)
        }
        return isCudaAvailable() ? 'cuda' : 'cpu';
    };
    const requestedDevice = forceDevice
        || (finalConfig.device === 'auto' ? detectGpuDevice() : finalConfig.device);
    initPromise = (async () => {
        try {
            // Configure transformers.js environment
            env.allowLocalModels = false;
            const isDev = process.env.NODE_ENV === 'development';
            if (isDev) {
                console.log(`🧠 Loading embedding model: ${finalConfig.modelId}`);
            }
            // Adapt the loader's progress events to the ModelProgress shape
            const progressCallback = onProgress ? (data) => {
                onProgress({
                    status: data.status || 'progress',
                    file: data.file,
                    progress: data.progress,
                    loaded: data.loaded,
                    total: data.total,
                });
            } : undefined;
            // GPU devices get a CPU fallback; anything else is tried alone
            const devicesToTry = (requestedDevice === 'dml' || requestedDevice === 'cuda')
                ? [requestedDevice, 'cpu']
                : [requestedDevice];
            const lastDevice = devicesToTry[devicesToTry.length - 1];
            for (const device of devicesToTry) {
                try {
                    if (isDev && device === 'dml') {
                        console.log('🔧 Trying DirectML (DirectX12) GPU backend...');
                    }
                    else if (isDev && device === 'cuda') {
                        console.log('🔧 Trying CUDA GPU backend...');
                    }
                    else if (isDev && device === 'cpu') {
                        console.log('🔧 Using CPU backend...');
                    }
                    else if (isDev && device === 'wasm') {
                        console.log('🔧 Using WASM backend (slower)...');
                    }
                    embedderInstance = await pipeline('feature-extraction', finalConfig.modelId, {
                        device: device,
                        dtype: 'fp32',
                        progress_callback: progressCallback,
                        session_options: { logSeverityLevel: 3 },
                    });
                    currentDevice = device;
                    if (isDev) {
                        const label = device === 'dml' ? 'GPU (DirectML/DirectX12)'
                            : device === 'cuda' ? 'GPU (CUDA)'
                                : device.toUpperCase();
                        console.log(`✅ Using ${label} backend`);
                        console.log('✅ Embedding model loaded successfully');
                    }
                    return embedderInstance;
                }
                catch (deviceError) {
                    if (isDev && (device === 'cuda' || device === 'dml')) {
                        const gpuType = device === 'dml' ? 'DirectML' : 'CUDA';
                        console.log(`⚠️ ${gpuType} not available, falling back to CPU...`);
                    }
                    // Move on to the next device unless this was the last one
                    if (device === lastDevice) {
                        throw deviceError;
                    }
                }
            }
            throw new Error('No suitable device found for embedding model');
        }
        catch (error) {
            // Clear cached state so a later call can retry from scratch
            initPromise = null;
            embedderInstance = null;
            throw error;
        }
        finally {
            isInitializing = false;
        }
    })();
    return initPromise;
};
168
/**
 * Tell whether an embedder instance is currently loaded.
 *
 * @returns true while a loaded pipeline is held, false otherwise
 */
export const isEmbedderReady = () => embedderInstance !== null;
174
/**
 * Return the loaded embedder pipeline.
 *
 * @returns The pipeline loaded by initEmbedder()
 * @throws Error when no pipeline is loaded
 */
export const getEmbedder = () => {
    if (embedderInstance) {
        return embedderInstance;
    }
    throw new Error('Embedder not initialized. Call initEmbedder() first.');
};
183
/**
 * Embed one text string with mean pooling and normalization.
 *
 * @param text - Text to embed
 * @returns Float32Array copy of the embedding vector (384 dimensions with the default model)
 * @throws Error when the embedder has not been initialized
 */
export const embedText = async (text) => {
    const options = { pooling: 'mean', normalize: true };
    const tensor = await getEmbedder()(text, options);
    // The pipeline yields a Tensor; copy its backing data into a Float32Array
    return new Float32Array(tensor.data);
};
198
/**
 * Embed multiple texts in a single batch.
 * More efficient than calling embedText multiple times.
 *
 * @param texts - Array of texts to embed
 * @returns One Float32Array per input text, in input order; an empty array
 *          when `texts` is empty (the embedder is not consulted in that case)
 * @throws Error when `texts` is non-empty and the embedder has not been initialized
 */
export const embedBatch = async (texts) => {
    if (texts.length === 0) {
        return [];
    }
    const embedder = getEmbedder();
    const result = await embedder(texts, {
        pooling: 'mean',
        normalize: true,
    });
    // Result shape is [batch_size, dimensions], flattened row-major in `data`.
    // Take the vector width from the tensor itself so a model configured with a
    // non-default width is split correctly; fall back to the default config
    // only when the tensor carries no shape.
    const data = result.data;
    const dims = result.dims;
    const dimensions = Array.isArray(dims) && dims.length > 0
        ? dims[dims.length - 1]
        : DEFAULT_EMBEDDING_CONFIG.dimensions;
    const embeddings = [];
    for (let i = 0; i < texts.length; i++) {
        const start = i * dimensions;
        embeddings.push(new Float32Array(data.slice(start, start + dimensions)));
    }
    return embeddings;
};
227
/**
 * Copy an embedding into a plain number array (for KuzuDB storage).
 *
 * @param embedding - Embedding vector; it is not modified
 * @returns A new array with the same values in the same order
 */
export const embeddingToArray = (embedding) => Array.from(embedding);
233
/**
 * Cleanup the embedder (free memory).
 * Call this when done with embeddings.
 *
 * Does nothing when no embedder is loaded. Errors raised while disposing the
 * pipeline are ignored. Afterwards the module is back in its unloaded state,
 * so a later initEmbedder() call loads a fresh instance and
 * getCurrentDevice() returns null.
 */
export const disposeEmbedder = async () => {
    if (!embedderInstance) {
        return;
    }
    try {
        // transformers.js pipelines may have a dispose method
        if (typeof embedderInstance.dispose === 'function') {
            await embedderInstance.dispose();
        }
    }
    catch {
        // Ignore disposal errors
    }
    embedderInstance = null;
    initPromise = null;
    // No model is loaded any more, so no device is in use either
    currentDevice = null;
};
@@ -0,0 +1,51 @@
1
+ /**
2
+ * Embedding Pipeline Module
3
+ *
4
+ * Orchestrates the background embedding process:
5
+ * 1. Query embeddable nodes from KuzuDB
6
+ * 2. Generate text representations
7
+ * 3. Batch embed using transformers.js
8
+ * 4. Update KuzuDB with embeddings
9
+ * 5. Create vector index for semantic search
10
+ */
11
+ import { type EmbeddingProgress, type EmbeddingConfig, type SemanticSearchResult } from './types.js';
12
+ /**
13
+ * Progress callback type: receives an EmbeddingProgress update and returns nothing
14
+ */
15
+ export type EmbeddingProgressCallback = (progress: EmbeddingProgress) => void;
16
+ /**
17
+ * Run the embedding pipeline
18
+ *
19
+ * @param executeQuery - Function to execute Cypher queries against KuzuDB; resolves to the result rows
20
+ * @param executeWithReusedStatement - Function to execute with reused prepared statement, once per entry of its parameter list
21
+ * @param onProgress - Callback for progress updates (required)
22
+ * @param config - Optional configuration override
23
+ * @param skipNodeIds - Optional set of node IDs that already have embeddings (incremental mode)
+ * @returns Promise that carries no result value
24
+ */
25
+ export declare const runEmbeddingPipeline: (executeQuery: (cypher: string) => Promise<any[]>, executeWithReusedStatement: (cypher: string, paramsList: Array<Record<string, any>>) => Promise<void>, onProgress: EmbeddingProgressCallback, config?: Partial<EmbeddingConfig>, skipNodeIds?: Set<string>) => Promise<void>;
26
+ /**
27
+ * Perform semantic search using the vector index
28
+ *
29
+ * Uses CodeEmbedding table and queries each node table to get metadata
30
+ *
31
+ * @param executeQuery - Function to execute Cypher queries
32
+ * @param query - Search query text
33
+ * @param k - Optional number of results to return (default: 10)
34
+ * @param maxDistance - Optional maximum distance threshold (default: 0.5)
35
+ * @returns Array of search results ordered by relevance
+ *
+ * NOTE(review): the default values above are set by the implementation and
+ * cannot be seen in this declaration - confirm they are still current.
36
+ */
37
+ export declare const semanticSearch: (executeQuery: (cypher: string) => Promise<any[]>, query: string, k?: number, maxDistance?: number) => Promise<SemanticSearchResult[]>;
38
+ /**
39
+ * Semantic search with graph expansion (flattened results)
40
+ *
41
+ * Note: With multi-table schema, graph traversal is simplified.
42
+ * Returns semantic matches with their metadata.
43
+ * For full graph traversal, use execute_vector_cypher tool directly.
44
+ *
45
+ * @param executeQuery - Function to execute Cypher queries
46
+ * @param query - Search query text
47
+ * @param k - Optional number of initial semantic matches (default: 5)
48
+ * @param _hops - Unused (kept for API compatibility).
49
+ * @returns Semantic matches with metadata; the element type is untyped (any[])
50
+ */
51
+ export declare const semanticSearchWithContext: (executeQuery: (cypher: string) => Promise<any[]>, query: string, k?: number, _hops?: number) => Promise<any[]>;