gitnexus 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. package/README.md +181 -0
  2. package/dist/cli/ai-context.d.ts +21 -0
  3. package/dist/cli/ai-context.js +219 -0
  4. package/dist/cli/analyze.d.ts +10 -0
  5. package/dist/cli/analyze.js +118 -0
  6. package/dist/cli/clean.d.ts +8 -0
  7. package/dist/cli/clean.js +29 -0
  8. package/dist/cli/index.d.ts +2 -0
  9. package/dist/cli/index.js +42 -0
  10. package/dist/cli/list.d.ts +6 -0
  11. package/dist/cli/list.js +27 -0
  12. package/dist/cli/mcp.d.ts +7 -0
  13. package/dist/cli/mcp.js +85 -0
  14. package/dist/cli/serve.d.ts +3 -0
  15. package/dist/cli/serve.js +5 -0
  16. package/dist/cli/status.d.ts +6 -0
  17. package/dist/cli/status.js +27 -0
  18. package/dist/config/ignore-service.d.ts +1 -0
  19. package/dist/config/ignore-service.js +208 -0
  20. package/dist/config/supported-languages.d.ts +11 -0
  21. package/dist/config/supported-languages.js +15 -0
  22. package/dist/core/embeddings/embedder.d.ts +60 -0
  23. package/dist/core/embeddings/embedder.js +205 -0
  24. package/dist/core/embeddings/embedding-pipeline.d.ts +50 -0
  25. package/dist/core/embeddings/embedding-pipeline.js +321 -0
  26. package/dist/core/embeddings/index.d.ts +9 -0
  27. package/dist/core/embeddings/index.js +9 -0
  28. package/dist/core/embeddings/text-generator.d.ts +24 -0
  29. package/dist/core/embeddings/text-generator.js +182 -0
  30. package/dist/core/embeddings/types.d.ts +87 -0
  31. package/dist/core/embeddings/types.js +32 -0
  32. package/dist/core/graph/graph.d.ts +2 -0
  33. package/dist/core/graph/graph.js +61 -0
  34. package/dist/core/graph/types.d.ts +50 -0
  35. package/dist/core/graph/types.js +1 -0
  36. package/dist/core/ingestion/ast-cache.d.ts +11 -0
  37. package/dist/core/ingestion/ast-cache.js +34 -0
  38. package/dist/core/ingestion/call-processor.d.ts +8 -0
  39. package/dist/core/ingestion/call-processor.js +269 -0
  40. package/dist/core/ingestion/cluster-enricher.d.ts +38 -0
  41. package/dist/core/ingestion/cluster-enricher.js +170 -0
  42. package/dist/core/ingestion/community-processor.d.ts +39 -0
  43. package/dist/core/ingestion/community-processor.js +269 -0
  44. package/dist/core/ingestion/entry-point-scoring.d.ts +39 -0
  45. package/dist/core/ingestion/entry-point-scoring.js +235 -0
  46. package/dist/core/ingestion/filesystem-walker.d.ts +5 -0
  47. package/dist/core/ingestion/filesystem-walker.js +26 -0
  48. package/dist/core/ingestion/framework-detection.d.ts +38 -0
  49. package/dist/core/ingestion/framework-detection.js +183 -0
  50. package/dist/core/ingestion/heritage-processor.d.ts +14 -0
  51. package/dist/core/ingestion/heritage-processor.js +134 -0
  52. package/dist/core/ingestion/import-processor.d.ts +8 -0
  53. package/dist/core/ingestion/import-processor.js +490 -0
  54. package/dist/core/ingestion/parsing-processor.d.ts +8 -0
  55. package/dist/core/ingestion/parsing-processor.js +249 -0
  56. package/dist/core/ingestion/pipeline.d.ts +2 -0
  57. package/dist/core/ingestion/pipeline.js +228 -0
  58. package/dist/core/ingestion/process-processor.d.ts +51 -0
  59. package/dist/core/ingestion/process-processor.js +278 -0
  60. package/dist/core/ingestion/structure-processor.d.ts +2 -0
  61. package/dist/core/ingestion/structure-processor.js +36 -0
  62. package/dist/core/ingestion/symbol-table.d.ts +33 -0
  63. package/dist/core/ingestion/symbol-table.js +38 -0
  64. package/dist/core/ingestion/tree-sitter-queries.d.ts +11 -0
  65. package/dist/core/ingestion/tree-sitter-queries.js +319 -0
  66. package/dist/core/ingestion/utils.d.ts +10 -0
  67. package/dist/core/ingestion/utils.js +44 -0
  68. package/dist/core/kuzu/csv-generator.d.ts +22 -0
  69. package/dist/core/kuzu/csv-generator.js +272 -0
  70. package/dist/core/kuzu/kuzu-adapter.d.ts +81 -0
  71. package/dist/core/kuzu/kuzu-adapter.js +568 -0
  72. package/dist/core/kuzu/schema.d.ts +53 -0
  73. package/dist/core/kuzu/schema.js +380 -0
  74. package/dist/core/search/bm25-index.d.ts +22 -0
  75. package/dist/core/search/bm25-index.js +52 -0
  76. package/dist/core/search/hybrid-search.d.ts +49 -0
  77. package/dist/core/search/hybrid-search.js +118 -0
  78. package/dist/core/tree-sitter/parser-loader.d.ts +4 -0
  79. package/dist/core/tree-sitter/parser-loader.js +42 -0
  80. package/dist/lib/utils.d.ts +1 -0
  81. package/dist/lib/utils.js +3 -0
  82. package/dist/mcp/core/embedder.d.ts +27 -0
  83. package/dist/mcp/core/embedder.js +93 -0
  84. package/dist/mcp/core/kuzu-adapter.d.ts +23 -0
  85. package/dist/mcp/core/kuzu-adapter.js +62 -0
  86. package/dist/mcp/local/local-backend.d.ts +73 -0
  87. package/dist/mcp/local/local-backend.js +752 -0
  88. package/dist/mcp/resources.d.ts +31 -0
  89. package/dist/mcp/resources.js +279 -0
  90. package/dist/mcp/server.d.ts +12 -0
  91. package/dist/mcp/server.js +130 -0
  92. package/dist/mcp/staleness.d.ts +15 -0
  93. package/dist/mcp/staleness.js +29 -0
  94. package/dist/mcp/tools.d.ts +24 -0
  95. package/dist/mcp/tools.js +160 -0
  96. package/dist/server/api.d.ts +6 -0
  97. package/dist/server/api.js +156 -0
  98. package/dist/storage/git.d.ts +7 -0
  99. package/dist/storage/git.js +39 -0
  100. package/dist/storage/repo-manager.d.ts +61 -0
  101. package/dist/storage/repo-manager.js +106 -0
  102. package/dist/types/pipeline.d.ts +28 -0
  103. package/dist/types/pipeline.js +16 -0
  104. package/package.json +80 -0
  105. package/skills/debugging.md +104 -0
  106. package/skills/exploring.md +112 -0
  107. package/skills/impact-analysis.md +114 -0
  108. package/skills/refactoring.md +119 -0
  109. package/vendor/leiden/index.cjs +355 -0
  110. package/vendor/leiden/utils.cjs +392 -0
@@ -0,0 +1,50 @@
1
+ /**
2
+ * Embedding Pipeline Module
3
+ *
4
+ * Orchestrates the background embedding process:
5
+ * 1. Query embeddable nodes from KuzuDB
6
+ * 2. Generate text representations
7
+ * 3. Batch embed using transformers.js
8
+ * 4. Update KuzuDB with embeddings
9
+ * 5. Create vector index for semantic search
10
+ */
11
+ import { type EmbeddingProgress, type EmbeddingConfig, type SemanticSearchResult } from './types.js';
12
+ /**
13
+ * Progress callback type
14
+ */
15
+ export type EmbeddingProgressCallback = (progress: EmbeddingProgress) => void;
16
+ /**
17
+ * Run the embedding pipeline
18
+ *
19
+ * @param executeQuery - Function to execute Cypher queries against KuzuDB
20
+ * @param executeWithReusedStatement - Function to execute with reused prepared statement
21
+ * @param onProgress - Callback for progress updates
22
+ * @param config - Optional configuration override
23
+ */
24
+ export declare const runEmbeddingPipeline: (executeQuery: (cypher: string) => Promise<any[]>, executeWithReusedStatement: (cypher: string, paramsList: Array<Record<string, any>>) => Promise<void>, onProgress: EmbeddingProgressCallback, config?: Partial<EmbeddingConfig>) => Promise<void>;
25
+ /**
26
+ * Perform semantic search using the vector index
27
+ *
28
+ * Uses CodeEmbedding table and queries each node table to get metadata
29
+ *
30
+ * @param executeQuery - Function to execute Cypher queries
31
+ * @param query - Search query text
32
+ * @param k - Number of results to return (default: 10)
33
+ * @param maxDistance - Maximum distance threshold (default: 0.5)
34
+ * @returns Array of search results ordered by relevance
35
+ */
36
+ export declare const semanticSearch: (executeQuery: (cypher: string) => Promise<any[]>, query: string, k?: number, maxDistance?: number) => Promise<SemanticSearchResult[]>;
37
+ /**
38
+ * Semantic search with graph expansion (flattened results)
39
+ *
40
+ * Note: With multi-table schema, graph traversal is simplified.
41
+ * Returns semantic matches with their metadata.
42
+ * For full graph traversal, use execute_vector_cypher tool directly.
43
+ *
44
+ * @param executeQuery - Function to execute Cypher queries
45
+ * @param query - Search query text
46
+ * @param k - Number of initial semantic matches (default: 5)
47
+ * @param _hops - Unused (kept for API compatibility).
48
+ * @returns Semantic matches with metadata
49
+ */
50
+ export declare const semanticSearchWithContext: (executeQuery: (cypher: string) => Promise<any[]>, query: string, k?: number, _hops?: number) => Promise<any[]>;
@@ -0,0 +1,321 @@
1
+ /**
2
+ * Embedding Pipeline Module
3
+ *
4
+ * Orchestrates the background embedding process:
5
+ * 1. Query embeddable nodes from KuzuDB
6
+ * 2. Generate text representations
7
+ * 3. Batch embed using transformers.js
8
+ * 4. Update KuzuDB with embeddings
9
+ * 5. Create vector index for semantic search
10
+ */
11
+ import { initEmbedder, embedBatch, embedText, embeddingToArray, isEmbedderReady } from './embedder.js';
12
+ import { generateBatchEmbeddingTexts } from './text-generator.js';
13
+ import { DEFAULT_EMBEDDING_CONFIG, EMBEDDABLE_LABELS, } from './types.js';
14
+ const isDev = process.env.NODE_ENV !== 'production';
/**
 * Collect every embeddable node from KuzuDB, one table at a time.
 *
 * Each label in EMBEDDABLE_LABELS is queried separately because the File
 * table has no startLine/endLine columns while the code-element tables do.
 * A failing table query is skipped (and only logged in development), so the
 * result holds the nodes of every table that could be read.
 *
 * @param executeQuery - Function to execute Cypher queries against KuzuDB
 * @returns Flat list of nodes with id, name, label, filePath, content and,
 *          for non-File tables, startLine/endLine
 */
const queryEmbeddableNodes = async (executeQuery) => {
    const collected = [];
    for (const label of EMBEDDABLE_LABELS) {
        // File rows carry no line range; every other table does.
        const cypher = label === 'File'
            ? `
            MATCH (n:File)
            RETURN n.id AS id, n.name AS name, 'File' AS label,
                   n.filePath AS filePath, n.content AS content
          `
            : `
            MATCH (n:${label})
            RETURN n.id AS id, n.name AS name, '${label}' AS label,
                   n.filePath AS filePath, n.content AS content,
                   n.startLine AS startLine, n.endLine AS endLine
          `;
        try {
            const rows = await executeQuery(cypher);
            for (const row of rows) {
                // Rows may be keyed by alias or be positional; accept both shapes.
                collected.push({
                    id: row.id ?? row[0],
                    name: row.name ?? row[1],
                    label: row.label ?? row[2],
                    filePath: row.filePath ?? row[3],
                    content: row.content ?? row[4] ?? '',
                    startLine: row.startLine ?? row[5],
                    endLine: row.endLine ?? row[6],
                });
            }
        }
        catch (error) {
            // Table might not exist or be empty - move on to the next label.
            if (isDev) {
                console.warn(`Query for ${label} nodes failed:`, error);
            }
        }
    }
    return collected;
};
64
+ /**
65
+ * Batch INSERT embeddings into separate CodeEmbedding table
66
+ * Using a separate lightweight table avoids copy-on-write overhead
67
+ * that occurs when UPDATEing nodes with large content fields
68
+ */
69
+ const batchInsertEmbeddings = async (executeWithReusedStatement, updates) => {
70
+ // INSERT into separate embedding table - much more memory efficient!
71
+ const cypher = `CREATE (e:CodeEmbedding {nodeId: $nodeId, embedding: $embedding})`;
72
+ const paramsList = updates.map(u => ({ nodeId: u.id, embedding: u.embedding }));
73
+ await executeWithReusedStatement(cypher, paramsList);
74
+ };
/**
 * Build the cosine vector index (`code_embedding_idx`) over the `embedding`
 * column of the CodeEmbedding table.
 *
 * Any failure is treated as non-fatal (the index might already exist) and is
 * only logged in development.
 *
 * @param executeQuery - Function to execute Cypher queries against KuzuDB
 */
const createVectorIndex = async (executeQuery) => {
    const createIndexStatement = `
    CALL CREATE_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', 'embedding', metric := 'cosine')
  `;
    try {
        await executeQuery(createIndexStatement);
    }
    catch (indexError) {
        // Index might already exist - stay quiet outside development.
        if (!isDev) {
            return;
        }
        console.warn('Vector index creation warning:', indexError);
    }
};
/**
 * Run the embedding pipeline
 *
 * Phases and the overall `percent` reported through `onProgress`:
 *   - loading-model: 0-20  (model download/initialisation)
 *   - embedding:     20-90 (one progress event per embedded batch)
 *   - indexing:      90    (vector index creation)
 *   - ready:         100
 * On failure an `error` phase event is emitted and the error is rethrown.
 *
 * @param executeQuery - Function to execute Cypher queries against KuzuDB
 * @param executeWithReusedStatement - Function to execute with reused prepared statement
 * @param onProgress - Callback for progress updates
 * @param config - Optional configuration override (merged over DEFAULT_EMBEDDING_CONFIG)
 * @returns Resolves when all nodes are embedded and the index step has run
 * @throws Rethrows any error raised while loading the model, embedding or inserting
 */
export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatement, onProgress, config = {}) => {
    const finalConfig = { ...DEFAULT_EMBEDDING_CONFIG, ...config };
    // An override of `undefined`, 0, a negative or a fractional batch size would
    // either skip the batch loop entirely (NaN batches, yet "ready" is reported)
    // or never terminate (Infinity batches). Fall back to the default instead.
    if (!Number.isInteger(finalConfig.batchSize) || finalConfig.batchSize <= 0) {
        finalConfig.batchSize = DEFAULT_EMBEDDING_CONFIG.batchSize;
    }
    try {
        // Phase 1: Load embedding model
        onProgress({
            phase: 'loading-model',
            percent: 0,
            modelDownloadPercent: 0,
        });
        await initEmbedder((modelProgress) => {
            // Report model download progress
            const downloadPercent = modelProgress.progress ?? 0;
            onProgress({
                phase: 'loading-model',
                percent: Math.round(downloadPercent * 0.2), // 0-20% for model loading
                modelDownloadPercent: downloadPercent,
            });
        }, finalConfig);
        onProgress({
            phase: 'loading-model',
            percent: 20,
            modelDownloadPercent: 100,
        });
        if (isDev) {
            console.log('🔍 Querying embeddable nodes...');
        }
        // Phase 2: Query embeddable nodes
        const nodes = await queryEmbeddableNodes(executeQuery);
        const totalNodes = nodes.length;
        if (isDev) {
            console.log(`📊 Found ${totalNodes} embeddable nodes`);
        }
        if (totalNodes === 0) {
            // Nothing to embed: report completion without creating an index.
            onProgress({
                phase: 'ready',
                percent: 100,
                nodesProcessed: 0,
                totalNodes: 0,
            });
            return;
        }
        // Phase 3: Batch embed nodes
        const batchSize = finalConfig.batchSize;
        const totalBatches = Math.ceil(totalNodes / batchSize);
        let processedNodes = 0;
        onProgress({
            phase: 'embedding',
            percent: 20,
            nodesProcessed: 0,
            totalNodes,
            currentBatch: 0,
            totalBatches,
        });
        for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) {
            const start = batchIndex * batchSize;
            const end = Math.min(start + batchSize, totalNodes);
            const batch = nodes.slice(start, end);
            // Generate texts for this batch
            const texts = generateBatchEmbeddingTexts(batch, finalConfig);
            // Embed the batch
            const embeddings = await embedBatch(texts);
            // Pair each node with the embedding at the same index
            const updates = batch.map((node, i) => ({
                id: node.id,
                embedding: embeddingToArray(embeddings[i]),
            }));
            await batchInsertEmbeddings(executeWithReusedStatement, updates);
            processedNodes += batch.length;
            // Report progress (20-90% for embedding phase)
            const embeddingProgress = 20 + ((processedNodes / totalNodes) * 70);
            onProgress({
                phase: 'embedding',
                percent: Math.round(embeddingProgress),
                nodesProcessed: processedNodes,
                totalNodes,
                currentBatch: batchIndex + 1,
                totalBatches,
            });
        }
        // Phase 4: Create vector index
        onProgress({
            phase: 'indexing',
            percent: 90,
            nodesProcessed: totalNodes,
            totalNodes,
        });
        if (isDev) {
            console.log('📇 Creating vector index...');
        }
        await createVectorIndex(executeQuery);
        // Complete
        onProgress({
            phase: 'ready',
            percent: 100,
            nodesProcessed: totalNodes,
            totalNodes,
        });
        if (isDev) {
            console.log('✅ Embedding pipeline complete!');
        }
    }
    catch (error) {
        const errorMessage = error instanceof Error ? error.message : 'Unknown error';
        if (isDev) {
            console.error('❌ Embedding pipeline error:', error);
        }
        // Surface the failure to the progress listener, then let the caller handle it.
        onProgress({
            phase: 'error',
            percent: 0,
            error: errorMessage,
        });
        throw error;
    }
};
/**
 * Perform semantic search using the vector index
 *
 * Embeds the query text, asks the `code_embedding_idx` index on CodeEmbedding
 * for the nearest rows, then looks up name/path (and line range for non-File
 * nodes) in the table named by the node id's label prefix. Rows whose
 * metadata lookup fails or finds nothing are left out of the result.
 *
 * @param executeQuery - Function to execute Cypher queries
 * @param query - Search query text
 * @param k - Number of results to return (default: 10); must be a positive integer
 * @param maxDistance - Maximum distance threshold (default: 0.5); must be finite
 * @returns Array of search results ordered by relevance
 * @throws Error when the embedding model has not been initialised
 * @throws RangeError when `k` or `maxDistance` is not a usable number
 */
export const semanticSearch = async (executeQuery, query, k = 10, maxDistance = 0.5) => {
    if (!isEmbedderReady()) {
        throw new Error('Embedding model not initialized. Run embedding pipeline first.');
    }
    // k and maxDistance are spliced into the Cypher text below, so only plain
    // numbers may pass. Numeric strings are coerced rather than rejected.
    const limit = Number(k);
    if (!Number.isInteger(limit) || limit <= 0) {
        throw new RangeError(`k must be a positive integer, received: ${k}`);
    }
    const threshold = Number(maxDistance);
    if (!Number.isFinite(threshold)) {
        throw new RangeError(`maxDistance must be a finite number, received: ${maxDistance}`);
    }
    // Embed the query
    const queryEmbedding = await embedText(query);
    const queryVec = embeddingToArray(queryEmbedding);
    const queryVecStr = `[${queryVec.join(',')}]`;
    // Query the vector index on CodeEmbedding to get nodeIds and distances.
    // The cast width follows the actual vector length instead of a fixed 384.
    const vectorQuery = `
    CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx',
      CAST(${queryVecStr} AS FLOAT[${queryVec.length}]), ${limit})
    YIELD node AS emb, distance
    WITH emb, distance
    WHERE distance < ${threshold}
    RETURN emb.nodeId AS nodeId, distance
    ORDER BY distance
  `;
    const embResults = await executeQuery(vectorQuery);
    if (embResults.length === 0) {
        return [];
    }
    // A label becomes a table name inside the query text, so it must be a bare identifier.
    const safeLabelPattern = /^[A-Za-z_][A-Za-z0-9_]*$/;
    // Get metadata for each result by querying each node table
    const results = [];
    for (const embRow of embResults) {
        const nodeId = embRow.nodeId ?? embRow[0];
        const distance = embRow.distance ?? embRow[1];
        if (typeof nodeId !== 'string') {
            // A malformed row must not abort the whole search.
            continue;
        }
        // Extract label from node ID (format: Label:path:name)
        const labelEndIdx = nodeId.indexOf(':');
        const label = labelEndIdx > 0 ? nodeId.substring(0, labelEndIdx) : 'Unknown';
        if (!safeLabelPattern.test(label)) {
            continue;
        }
        // NOTE(review): quotes are escaped by doubling, as before - confirm this is
        // the escaping KuzuDB expects and whether backslashes in ids need handling.
        const escapedId = nodeId.replace(/'/g, "''");
        // Query the specific table for this node
        // File nodes don't have startLine/endLine
        try {
            let nodeQuery;
            if (label === 'File') {
                nodeQuery = `
          MATCH (n:File {id: '${escapedId}'})
          RETURN n.name AS name, n.filePath AS filePath
        `;
            }
            else {
                nodeQuery = `
          MATCH (n:${label} {id: '${escapedId}'})
          RETURN n.name AS name, n.filePath AS filePath,
                 n.startLine AS startLine, n.endLine AS endLine
        `;
            }
            const nodeRows = await executeQuery(nodeQuery);
            if (nodeRows.length > 0) {
                const nodeRow = nodeRows[0];
                results.push({
                    nodeId,
                    name: nodeRow.name ?? nodeRow[0] ?? '',
                    label,
                    filePath: nodeRow.filePath ?? nodeRow[1] ?? '',
                    distance,
                    startLine: label !== 'File' ? (nodeRow.startLine ?? nodeRow[2]) : undefined,
                    endLine: label !== 'File' ? (nodeRow.endLine ?? nodeRow[3]) : undefined,
                });
            }
        }
        catch (error) {
            // Table might not exist, skip - but leave a trace while developing.
            if (isDev) {
                console.warn(`Metadata lookup for ${nodeId} failed:`, error);
            }
        }
    }
    return results;
};
/**
 * Semantic search reshaped into the flattened "match + connection" row format.
 *
 * No graph expansion is performed: every row is a plain semantic match and
 * all `connected*` / `relationType` fields are null. The distance threshold
 * passed to semanticSearch is fixed at 0.5.
 * For full graph traversal, use execute_vector_cypher tool directly.
 *
 * @param executeQuery - Function to execute Cypher queries
 * @param query - Search query text
 * @param k - Number of initial semantic matches (default: 5)
 * @param _hops - Unused (kept for API compatibility).
 * @returns Semantic matches with metadata
 */
export const semanticSearchWithContext = async (executeQuery, query, k = 5, _hops = 1) => {
    const matches = await semanticSearch(executeQuery, query, k, 0.5);
    return matches.map(({ nodeId, name, label, filePath, distance }) => ({
        matchId: nodeId,
        matchName: name,
        matchLabel: label,
        matchPath: filePath,
        distance,
        // Placeholders for the graph-expansion columns this function no longer fills.
        connectedId: null,
        connectedName: null,
        connectedLabel: null,
        relationType: null,
    }));
};
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Embeddings Module
3
+ *
4
+ * Re-exports for the embedding pipeline system.
5
+ */
6
+ export * from './types.js';
7
+ export * from './embedder.js';
8
+ export * from './text-generator.js';
9
+ export * from './embedding-pipeline.js';
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Embeddings Module
3
+ *
4
+ * Re-exports for the embedding pipeline system.
5
+ */
6
+ export * from './types.js';
7
+ export * from './embedder.js';
8
+ export * from './text-generator.js';
9
+ export * from './embedding-pipeline.js';
@@ -0,0 +1,24 @@
1
+ /**
2
+ * Text Generator Module
3
+ *
4
+ * Pure functions to generate embedding text from code nodes.
5
+ * Combines node metadata with code snippets for semantic matching.
6
+ */
7
+ import type { EmbeddableNode, EmbeddingConfig } from './types.js';
8
+ /**
9
+ * Generate embedding text for any embeddable node
10
+ * Dispatches to the appropriate generator based on node label
11
+ *
12
+ * @param node - The node to generate text for
13
+ * @param config - Optional configuration for max snippet length
14
+ * @returns Text suitable for embedding
15
+ */
16
+ export declare const generateEmbeddingText: (node: EmbeddableNode, config?: Partial<EmbeddingConfig>) => string;
17
+ /**
18
+ * Generate embedding texts for a batch of nodes
19
+ *
20
+ * @param nodes - Array of nodes to generate text for
21
+ * @param config - Optional configuration
22
+ * @returns Array of texts in the same order as input nodes
23
+ */
24
+ export declare const generateBatchEmbeddingTexts: (nodes: EmbeddableNode[], config?: Partial<EmbeddingConfig>) => string[];
@@ -0,0 +1,182 @@
1
+ /**
2
+ * Text Generator Module
3
+ *
4
+ * Pure functions to generate embedding text from code nodes.
5
+ * Combines node metadata with code snippets for semantic matching.
6
+ */
7
+ import { DEFAULT_EMBEDDING_CONFIG } from './types.js';
/**
 * Extract the filename (last path segment) from a file path.
 *
 * Both `/` and `\` count as separators, so Windows-style paths work too.
 * When the last segment is empty (for example a trailing separator) the
 * whole path is returned unchanged.
 *
 * @param filePath - Path to take the file name from
 * @returns The last path segment, or `filePath` itself if that segment is empty
 */
const getFileName = (filePath) => {
    const segments = filePath.split(/[\\/]/);
    return segments[segments.length - 1] || filePath;
};
/**
 * Extract the directory part of a file path (everything before the last separator).
 *
 * Both `/` and `\` count as separators, so Windows-style paths work too.
 *
 * @param filePath - Path to take the directory from
 * @returns The directory portion without a trailing separator, or '' when
 *          the path has no directory part
 */
const getDirectory = (filePath) => {
    const lastSeparator = Math.max(filePath.lastIndexOf('/'), filePath.lastIndexOf('\\'));
    // -1 means no separator at all; 0 means the only separator is the leading one.
    return lastSeparator > 0 ? filePath.slice(0, lastSeparator) : '';
};
/**
 * Truncate content to roughly `maxLength` characters and append '...'.
 *
 * Content that already fits is returned untouched. Otherwise the text is cut
 * at `maxLength`, backing up to the last space when that space lies in the
 * final 20% of the allowed length so a word is not cut in half.
 * The returned string can be up to three characters longer than `maxLength`
 * because of the appended ellipsis.
 *
 * @param content - Text to shorten
 * @param maxLength - Maximum number of characters to keep before the ellipsis
 * @returns The original content, or a shortened copy ending in '...'
 */
const truncateContent = (content, maxLength) => {
    if (content.length <= maxLength) {
        return content;
    }
    // Clamp so a negative limit cannot make slice() count from the end of the string.
    let cutoff = Math.max(0, Math.floor(maxLength));
    // Never cut between the two halves of a surrogate pair (would leave a lone surrogate).
    const lastUnit = content.charCodeAt(cutoff - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        cutoff -= 1;
    }
    const truncated = content.slice(0, cutoff);
    // Find last space before maxLength to avoid cutting words
    const lastSpace = truncated.lastIndexOf(' ');
    if (lastSpace > maxLength * 0.8) {
        return truncated.slice(0, lastSpace) + '...';
    }
    return truncated + '...';
};
/**
 * Clean code content for embedding.
 *
 * Steps, in order:
 *   1. normalise `\r\n` and lone `\r` line endings to `\n`
 *   2. strip trailing whitespace from every line (leading indentation is kept)
 *   3. collapse runs of blank lines down to a single blank line
 *   4. trim leading/trailing whitespace of the whole text
 *
 * Trailing whitespace is stripped before blank lines are collapsed so that
 * whitespace-only lines count as blank.
 *
 * @param content - Raw source text
 * @returns The cleaned text
 */
const cleanContent = (content) => {
    const lines = content
        .replace(/\r\n?/g, '\n')
        .split('\n')
        .map(line => line.trimEnd());
    return lines
        .join('\n')
        .replace(/\n{3,}/g, '\n\n')
        .trim();
};
/**
 * Shared builder behind the Function/Class/Method/Interface text generators.
 *
 * Produces, one item per line:
 *   `<kind>: <name>`, `File: <file name>`, `Directory: <dir>` (only when the
 *   path has a directory part), then a blank line and a cleaned, truncated
 *   snippet of the node's content (only when the node has content).
 *
 * @param kind - Heading word placed before the node name (e.g. 'Function')
 * @param node - Node providing name, filePath and content
 * @param maxSnippetLength - Maximum snippet length passed to truncateContent
 * @returns Text suitable for embedding
 */
const generateCodeElementText = (kind, node, maxSnippetLength) => {
    const parts = [
        `${kind}: ${node.name}`,
        `File: ${getFileName(node.filePath)}`,
    ];
    const dir = getDirectory(node.filePath);
    if (dir) {
        parts.push(`Directory: ${dir}`);
    }
    if (node.content) {
        const snippet = truncateContent(cleanContent(node.content), maxSnippetLength);
        // The empty entry becomes a blank line separating metadata from code.
        parts.push('', snippet);
    }
    return parts.join('\n');
};
/**
 * Generate embedding text for a Function node
 */
const generateFunctionText = (node, maxSnippetLength) => generateCodeElementText('Function', node, maxSnippetLength);
/**
 * Generate embedding text for a Class node
 */
const generateClassText = (node, maxSnippetLength) => generateCodeElementText('Class', node, maxSnippetLength);
/**
 * Generate embedding text for a Method node
 */
const generateMethodText = (node, maxSnippetLength) => generateCodeElementText('Method', node, maxSnippetLength);
/**
 * Generate embedding text for an Interface node
 */
const generateInterfaceText = (node, maxSnippetLength) => generateCodeElementText('Interface', node, maxSnippetLength);
/**
 * Generate embedding text for a File node.
 *
 * Output is `File: <name>` and `Path: <filePath>`, followed - when the node
 * has content - by a blank line and a cleaned snippet. The snippet limit is
 * capped at 300 characters because whole files can be very long.
 *
 * @param node - File node providing name, filePath and content
 * @param maxSnippetLength - Configured snippet limit (capped at 300 here)
 * @returns Text suitable for embedding
 */
const generateFileText = (node, maxSnippetLength) => {
    const header = `File: ${node.name}\nPath: ${node.filePath}`;
    if (!node.content) {
        return header;
    }
    const snippetLimit = Math.min(maxSnippetLength, 300);
    const snippet = truncateContent(cleanContent(node.content), snippetLimit);
    return `${header}\n\n${snippet}`;
};
/**
 * Label-to-generator lookup used by generateEmbeddingText.
 */
const TEXT_GENERATORS_BY_LABEL = new Map([
    ['Function', generateFunctionText],
    ['Class', generateClassText],
    ['Method', generateMethodText],
    ['Interface', generateInterfaceText],
    ['File', generateFileText],
]);
/**
 * Generate embedding text for any embeddable node.
 *
 * Picks the generator matching `node.label`; labels without a dedicated
 * generator fall back to a two-line `<label>: <name>` / `Path: <filePath>` text.
 *
 * @param node - The node to generate text for
 * @param config - Optional configuration; only `maxSnippetLength` is read,
 *                 defaulting to DEFAULT_EMBEDDING_CONFIG.maxSnippetLength
 * @returns Text suitable for embedding
 */
export const generateEmbeddingText = (node, config = {}) => {
    const maxSnippetLength = config.maxSnippetLength ?? DEFAULT_EMBEDDING_CONFIG.maxSnippetLength;
    const generator = TEXT_GENERATORS_BY_LABEL.get(node.label);
    if (generator === undefined) {
        // Fallback for any other embeddable type
        return `${node.label}: ${node.name}\nPath: ${node.filePath}`;
    }
    return generator(node, maxSnippetLength);
};
/**
 * Generate embedding texts for a batch of nodes.
 *
 * Each node is passed through generateEmbeddingText with the same config.
 *
 * @param nodes - Array of nodes to generate text for
 * @param config - Optional configuration
 * @returns Array of texts in the same order as input nodes
 */
export const generateBatchEmbeddingTexts = (nodes, config = {}) => {
    const texts = [];
    for (const node of nodes) {
        texts.push(generateEmbeddingText(node, config));
    }
    return texts;
};
@@ -0,0 +1,87 @@
1
+ /**
2
+ * Embedding Pipeline Types
3
+ *
4
+ * Type definitions for the embedding generation and semantic search system.
5
+ */
6
+ /**
7
+ * Node labels that should be embedded for semantic search
8
+ * These are code elements that benefit from semantic matching
9
+ */
10
+ export declare const EMBEDDABLE_LABELS: readonly ["Function", "Class", "Method", "Interface", "File"];
11
+ export type EmbeddableLabel = typeof EMBEDDABLE_LABELS[number];
12
+ /**
13
+ * Check if a label should be embedded
14
+ */
15
+ export declare const isEmbeddableLabel: (label: string) => label is EmbeddableLabel;
16
+ /**
17
+ * Embedding pipeline phases
18
+ */
19
+ export type EmbeddingPhase = 'idle' | 'loading-model' | 'embedding' | 'indexing' | 'ready' | 'error';
20
+ /**
21
+ * Progress information for the embedding pipeline
22
+ */
23
+ export interface EmbeddingProgress {
24
+ phase: EmbeddingPhase;
25
+ percent: number;
26
+ modelDownloadPercent?: number;
27
+ nodesProcessed?: number;
28
+ totalNodes?: number;
29
+ currentBatch?: number;
30
+ totalBatches?: number;
31
+ error?: string;
32
+ }
33
+ /**
34
+ * Configuration for the embedding pipeline
35
+ */
36
+ export interface EmbeddingConfig {
37
+ /** Model identifier for transformers.js */
38
+ modelId: string;
39
+ /** Number of nodes to embed in each batch */
40
+ batchSize: number;
41
+ /** Embedding vector dimensions */
42
+ dimensions: number;
43
+ /** Device to use for inference: 'auto' tries GPU first, falls back to CPU */
44
+ device: 'auto' | 'webgpu' | 'cuda' | 'cpu' | 'wasm';
45
+ /** Maximum characters of code snippet to include */
46
+ maxSnippetLength: number;
47
+ }
48
+ /**
49
+ * Default embedding configuration
50
+ * Uses snowflake-arctic-embed-xs for browser efficiency
51
+ * Tries WebGPU first (fast), user can choose WASM fallback if unavailable
52
+ */
53
+ export declare const DEFAULT_EMBEDDING_CONFIG: EmbeddingConfig;
54
+ /**
55
+ * Result from semantic search
56
+ */
57
+ export interface SemanticSearchResult {
58
+ nodeId: string;
59
+ name: string;
60
+ label: string;
61
+ filePath: string;
62
+ distance: number;
63
+ startLine?: number;
64
+ endLine?: number;
65
+ }
66
+ /**
67
+ * Node data for embedding (minimal structure from KuzuDB query)
68
+ */
69
+ export interface EmbeddableNode {
70
+ id: string;
71
+ name: string;
72
+ label: string;
73
+ filePath: string;
74
+ content: string;
75
+ startLine?: number;
76
+ endLine?: number;
77
+ }
78
+ /**
79
+ * Model download progress from transformers.js
80
+ */
81
+ export interface ModelProgress {
82
+ status: 'initiate' | 'download' | 'progress' | 'done' | 'ready';
83
+ file?: string;
84
+ progress?: number;
85
+ loaded?: number;
86
+ total?: number;
87
+ }