gitnexus 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +181 -0
- package/dist/cli/ai-context.d.ts +21 -0
- package/dist/cli/ai-context.js +219 -0
- package/dist/cli/analyze.d.ts +10 -0
- package/dist/cli/analyze.js +118 -0
- package/dist/cli/clean.d.ts +8 -0
- package/dist/cli/clean.js +29 -0
- package/dist/cli/index.d.ts +2 -0
- package/dist/cli/index.js +42 -0
- package/dist/cli/list.d.ts +6 -0
- package/dist/cli/list.js +27 -0
- package/dist/cli/mcp.d.ts +7 -0
- package/dist/cli/mcp.js +85 -0
- package/dist/cli/serve.d.ts +3 -0
- package/dist/cli/serve.js +5 -0
- package/dist/cli/status.d.ts +6 -0
- package/dist/cli/status.js +27 -0
- package/dist/config/ignore-service.d.ts +1 -0
- package/dist/config/ignore-service.js +208 -0
- package/dist/config/supported-languages.d.ts +11 -0
- package/dist/config/supported-languages.js +15 -0
- package/dist/core/embeddings/embedder.d.ts +60 -0
- package/dist/core/embeddings/embedder.js +205 -0
- package/dist/core/embeddings/embedding-pipeline.d.ts +50 -0
- package/dist/core/embeddings/embedding-pipeline.js +321 -0
- package/dist/core/embeddings/index.d.ts +9 -0
- package/dist/core/embeddings/index.js +9 -0
- package/dist/core/embeddings/text-generator.d.ts +24 -0
- package/dist/core/embeddings/text-generator.js +182 -0
- package/dist/core/embeddings/types.d.ts +87 -0
- package/dist/core/embeddings/types.js +32 -0
- package/dist/core/graph/graph.d.ts +2 -0
- package/dist/core/graph/graph.js +61 -0
- package/dist/core/graph/types.d.ts +50 -0
- package/dist/core/graph/types.js +1 -0
- package/dist/core/ingestion/ast-cache.d.ts +11 -0
- package/dist/core/ingestion/ast-cache.js +34 -0
- package/dist/core/ingestion/call-processor.d.ts +8 -0
- package/dist/core/ingestion/call-processor.js +269 -0
- package/dist/core/ingestion/cluster-enricher.d.ts +38 -0
- package/dist/core/ingestion/cluster-enricher.js +170 -0
- package/dist/core/ingestion/community-processor.d.ts +39 -0
- package/dist/core/ingestion/community-processor.js +269 -0
- package/dist/core/ingestion/entry-point-scoring.d.ts +39 -0
- package/dist/core/ingestion/entry-point-scoring.js +235 -0
- package/dist/core/ingestion/filesystem-walker.d.ts +5 -0
- package/dist/core/ingestion/filesystem-walker.js +26 -0
- package/dist/core/ingestion/framework-detection.d.ts +38 -0
- package/dist/core/ingestion/framework-detection.js +183 -0
- package/dist/core/ingestion/heritage-processor.d.ts +14 -0
- package/dist/core/ingestion/heritage-processor.js +134 -0
- package/dist/core/ingestion/import-processor.d.ts +8 -0
- package/dist/core/ingestion/import-processor.js +490 -0
- package/dist/core/ingestion/parsing-processor.d.ts +8 -0
- package/dist/core/ingestion/parsing-processor.js +249 -0
- package/dist/core/ingestion/pipeline.d.ts +2 -0
- package/dist/core/ingestion/pipeline.js +228 -0
- package/dist/core/ingestion/process-processor.d.ts +51 -0
- package/dist/core/ingestion/process-processor.js +278 -0
- package/dist/core/ingestion/structure-processor.d.ts +2 -0
- package/dist/core/ingestion/structure-processor.js +36 -0
- package/dist/core/ingestion/symbol-table.d.ts +33 -0
- package/dist/core/ingestion/symbol-table.js +38 -0
- package/dist/core/ingestion/tree-sitter-queries.d.ts +11 -0
- package/dist/core/ingestion/tree-sitter-queries.js +319 -0
- package/dist/core/ingestion/utils.d.ts +10 -0
- package/dist/core/ingestion/utils.js +44 -0
- package/dist/core/kuzu/csv-generator.d.ts +22 -0
- package/dist/core/kuzu/csv-generator.js +272 -0
- package/dist/core/kuzu/kuzu-adapter.d.ts +81 -0
- package/dist/core/kuzu/kuzu-adapter.js +568 -0
- package/dist/core/kuzu/schema.d.ts +53 -0
- package/dist/core/kuzu/schema.js +380 -0
- package/dist/core/search/bm25-index.d.ts +22 -0
- package/dist/core/search/bm25-index.js +52 -0
- package/dist/core/search/hybrid-search.d.ts +49 -0
- package/dist/core/search/hybrid-search.js +118 -0
- package/dist/core/tree-sitter/parser-loader.d.ts +4 -0
- package/dist/core/tree-sitter/parser-loader.js +42 -0
- package/dist/lib/utils.d.ts +1 -0
- package/dist/lib/utils.js +3 -0
- package/dist/mcp/core/embedder.d.ts +27 -0
- package/dist/mcp/core/embedder.js +93 -0
- package/dist/mcp/core/kuzu-adapter.d.ts +23 -0
- package/dist/mcp/core/kuzu-adapter.js +62 -0
- package/dist/mcp/local/local-backend.d.ts +73 -0
- package/dist/mcp/local/local-backend.js +752 -0
- package/dist/mcp/resources.d.ts +31 -0
- package/dist/mcp/resources.js +279 -0
- package/dist/mcp/server.d.ts +12 -0
- package/dist/mcp/server.js +130 -0
- package/dist/mcp/staleness.d.ts +15 -0
- package/dist/mcp/staleness.js +29 -0
- package/dist/mcp/tools.d.ts +24 -0
- package/dist/mcp/tools.js +160 -0
- package/dist/server/api.d.ts +6 -0
- package/dist/server/api.js +156 -0
- package/dist/storage/git.d.ts +7 -0
- package/dist/storage/git.js +39 -0
- package/dist/storage/repo-manager.d.ts +61 -0
- package/dist/storage/repo-manager.js +106 -0
- package/dist/types/pipeline.d.ts +28 -0
- package/dist/types/pipeline.js +16 -0
- package/package.json +80 -0
- package/skills/debugging.md +104 -0
- package/skills/exploring.md +112 -0
- package/skills/impact-analysis.md +114 -0
- package/skills/refactoring.md +119 -0
- package/vendor/leiden/index.cjs +355 -0
- package/vendor/leiden/utils.cjs +392 -0
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
 * Embedding Pipeline Module
 *
 * Orchestrates the background embedding process:
 * 1. Query embeddable nodes from KuzuDB
 * 2. Generate text representations
 * 3. Batch embed using transformers.js
 * 4. Update KuzuDB with embeddings
 * 5. Create vector index for semantic search
 */
import { type EmbeddingProgress, type EmbeddingConfig, type SemanticSearchResult } from './types.js';
/**
 * Progress callback type; invoked on every phase/percent change of the pipeline.
 */
export type EmbeddingProgressCallback = (progress: EmbeddingProgress) => void;
/**
 * Run the embedding pipeline.
 *
 * @param executeQuery - Function to execute Cypher queries against KuzuDB
 * @param executeWithReusedStatement - Function to execute with reused prepared statement
 * @param onProgress - Callback for progress updates
 * @param config - Optional configuration override (merged over defaults)
 */
export declare const runEmbeddingPipeline: (executeQuery: (cypher: string) => Promise<any[]>, executeWithReusedStatement: (cypher: string, paramsList: Array<Record<string, any>>) => Promise<void>, onProgress: EmbeddingProgressCallback, config?: Partial<EmbeddingConfig>) => Promise<void>;
/**
 * Perform semantic search using the vector index.
 *
 * Uses CodeEmbedding table and queries each node table to get metadata.
 *
 * @param executeQuery - Function to execute Cypher queries
 * @param query - Search query text
 * @param k - Number of results to return (default: 10)
 * @param maxDistance - Maximum distance threshold (default: 0.5)
 * @returns Array of search results ordered by relevance (ascending distance)
 */
export declare const semanticSearch: (executeQuery: (cypher: string) => Promise<any[]>, query: string, k?: number, maxDistance?: number) => Promise<SemanticSearchResult[]>;
/**
 * Semantic search with graph expansion (flattened results).
 *
 * Note: With multi-table schema, graph traversal is simplified.
 * Returns semantic matches with their metadata; the `connected*` fields
 * of each result are always null in this implementation.
 * For full graph traversal, use execute_vector_cypher tool directly.
 *
 * @param executeQuery - Function to execute Cypher queries
 * @param query - Search query text
 * @param k - Number of initial semantic matches (default: 5)
 * @param _hops - Unused (kept for API compatibility).
 * @returns Semantic matches with metadata
 */
export declare const semanticSearchWithContext: (executeQuery: (cypher: string) => Promise<any[]>, query: string, k?: number, _hops?: number) => Promise<any[]>;
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedding Pipeline Module
|
|
3
|
+
*
|
|
4
|
+
* Orchestrates the background embedding process:
|
|
5
|
+
* 1. Query embeddable nodes from KuzuDB
|
|
6
|
+
* 2. Generate text representations
|
|
7
|
+
* 3. Batch embed using transformers.js
|
|
8
|
+
* 4. Update KuzuDB with embeddings
|
|
9
|
+
* 5. Create vector index for semantic search
|
|
10
|
+
*/
|
|
11
|
+
import { initEmbedder, embedBatch, embedText, embeddingToArray, isEmbedderReady } from './embedder.js';
|
|
12
|
+
import { generateBatchEmbeddingTexts } from './text-generator.js';
|
|
13
|
+
import { DEFAULT_EMBEDDING_CONFIG, EMBEDDABLE_LABELS, } from './types.js';
|
|
14
|
+
const isDev = process.env.NODE_ENV !== 'production';
|
|
15
|
+
/**
 * Query all embeddable nodes from KuzuDB.
 *
 * Each label in EMBEDDABLE_LABELS lives in its own table, and File rows
 * lack startLine/endLine, so a per-table projection is chosen. Rows are
 * also read positionally (row[0], row[1], ...) as a fallback because the
 * adapter may return arrays instead of keyed objects.
 */
const queryEmbeddableNodes = async (executeQuery) => {
    const collected = [];
    for (const label of EMBEDDABLE_LABELS) {
        // File nodes don't have startLine/endLine; code elements do.
        const query = label === 'File'
            ? `
        MATCH (n:File)
        RETURN n.id AS id, n.name AS name, 'File' AS label,
               n.filePath AS filePath, n.content AS content
      `
            : `
        MATCH (n:${label})
        RETURN n.id AS id, n.name AS name, '${label}' AS label,
               n.filePath AS filePath, n.content AS content,
               n.startLine AS startLine, n.endLine AS endLine
      `;
        try {
            const rows = await executeQuery(query);
            for (const row of rows) {
                collected.push({
                    id: row.id ?? row[0],
                    name: row.name ?? row[1],
                    label: row.label ?? row[2],
                    filePath: row.filePath ?? row[3],
                    content: row.content ?? row[4] ?? '',
                    startLine: row.startLine ?? row[5],
                    endLine: row.endLine ?? row[6],
                });
            }
        }
        catch (error) {
            // Table might not exist or be empty for some repos — keep going.
            if (isDev) {
                console.warn(`Query for ${label} nodes failed:`, error);
            }
        }
    }
    return collected;
};
|
|
64
|
+
/**
 * Batch INSERT embeddings into the separate CodeEmbedding table.
 *
 * A separate lightweight table avoids the copy-on-write overhead that
 * occurs when UPDATEing nodes that carry large content fields.
 *
 * @param executeWithReusedStatement - Runs one prepared statement per params entry
 * @param updates - Array of { id, embedding } pairs to persist
 */
const batchInsertEmbeddings = async (executeWithReusedStatement, updates) => {
    const cypher = `CREATE (e:CodeEmbedding {nodeId: $nodeId, embedding: $embedding})`;
    const paramsList = updates.map(({ id, embedding }) => ({ nodeId: id, embedding }));
    await executeWithReusedStatement(cypher, paramsList);
};
|
|
75
|
+
/**
 * Create the vector index for semantic search.
 *
 * Indexes the separate CodeEmbedding table; failures are tolerated because
 * the index may already exist from a previous run.
 */
const createVectorIndex = async (executeQuery) => {
    try {
        await executeQuery(`
    CALL CREATE_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', 'embedding', metric := 'cosine')
  `);
    }
    catch (error) {
        // Index might already exist — log in dev builds only.
        if (isDev) {
            console.warn('Vector index creation warning:', error);
        }
    }
};
|
|
93
|
+
/**
 * Run the embedding pipeline.
 *
 * Overall percent budget: model loading 0-20%, embedding 20-90%,
 * indexing 90%, ready 100%. On any failure an 'error' progress event is
 * emitted and the error is rethrown to the caller.
 *
 * @param executeQuery - Function to execute Cypher queries against KuzuDB
 * @param executeWithReusedStatement - Function to execute with reused prepared statement
 * @param onProgress - Callback for progress updates
 * @param config - Optional configuration override (merged over defaults)
 */
export const runEmbeddingPipeline = async (executeQuery, executeWithReusedStatement, onProgress, config = {}) => {
    const settings = { ...DEFAULT_EMBEDDING_CONFIG, ...config };
    try {
        // Phase 1: load the embedding model (0-20% of overall progress).
        onProgress({ phase: 'loading-model', percent: 0, modelDownloadPercent: 0 });
        await initEmbedder((modelProgress) => {
            // Forward model download progress, scaled into the 0-20% band.
            const downloadPercent = modelProgress.progress ?? 0;
            onProgress({
                phase: 'loading-model',
                percent: Math.round(downloadPercent * 0.2),
                modelDownloadPercent: downloadPercent,
            });
        }, settings);
        onProgress({ phase: 'loading-model', percent: 20, modelDownloadPercent: 100 });
        if (isDev) {
            console.log('🔍 Querying embeddable nodes...');
        }
        // Phase 2: collect every node that should be embedded.
        const nodes = await queryEmbeddableNodes(executeQuery);
        const totalNodes = nodes.length;
        if (isDev) {
            console.log(`📊 Found ${totalNodes} embeddable nodes`);
        }
        if (totalNodes === 0) {
            // Nothing to embed — report completion immediately.
            onProgress({ phase: 'ready', percent: 100, nodesProcessed: 0, totalNodes: 0 });
            return;
        }
        // Phase 3: embed in batches, mapping progress onto the 20-90% band.
        const { batchSize } = settings;
        const totalBatches = Math.ceil(totalNodes / batchSize);
        let processedNodes = 0;
        onProgress({
            phase: 'embedding',
            percent: 20,
            nodesProcessed: 0,
            totalNodes,
            currentBatch: 0,
            totalBatches,
        });
        for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) {
            const batch = nodes.slice(batchIndex * batchSize, (batchIndex + 1) * batchSize);
            // Generate texts, embed them, then persist the vectors.
            const texts = generateBatchEmbeddingTexts(batch, settings);
            const embeddings = await embedBatch(texts);
            const updates = batch.map((node, i) => ({
                id: node.id,
                embedding: embeddingToArray(embeddings[i]),
            }));
            await batchInsertEmbeddings(executeWithReusedStatement, updates);
            processedNodes += batch.length;
            onProgress({
                phase: 'embedding',
                percent: Math.round(20 + (processedNodes / totalNodes) * 70),
                nodesProcessed: processedNodes,
                totalNodes,
                currentBatch: batchIndex + 1,
                totalBatches,
            });
        }
        // Phase 4: build the vector index so semantic search can run.
        onProgress({ phase: 'indexing', percent: 90, nodesProcessed: totalNodes, totalNodes });
        if (isDev) {
            console.log('📇 Creating vector index...');
        }
        await createVectorIndex(executeQuery);
        // Complete.
        onProgress({ phase: 'ready', percent: 100, nodesProcessed: totalNodes, totalNodes });
        if (isDev) {
            console.log('✅ Embedding pipeline complete!');
        }
    }
    catch (error) {
        const errorMessage = error instanceof Error ? error.message : 'Unknown error';
        if (isDev) {
            console.error('❌ Embedding pipeline error:', error);
        }
        onProgress({ phase: 'error', percent: 0, error: errorMessage });
        throw error;
    }
};
|
|
215
|
+
/**
 * Perform semantic search using the vector index.
 *
 * Queries the CodeEmbedding vector index for the nearest nodeIds, then
 * resolves each hit's metadata from its per-label node table.
 * Metadata lookups are issued in parallel (previously a sequential N+1
 * loop); Promise.all preserves input order, so results remain sorted by
 * ascending distance.
 *
 * @param executeQuery - Function to execute Cypher queries
 * @param query - Search query text
 * @param k - Number of results to return (default: 10)
 * @param maxDistance - Maximum distance threshold (default: 0.5)
 * @returns Array of search results ordered by relevance
 * @throws Error if the embedding model has not been initialized
 */
export const semanticSearch = async (executeQuery, query, k = 10, maxDistance = 0.5) => {
    if (!isEmbedderReady()) {
        throw new Error('Embedding model not initialized. Run embedding pipeline first.');
    }
    // Embed the query and render the vector as a Cypher array literal.
    const queryEmbedding = await embedText(query);
    const queryVec = embeddingToArray(queryEmbedding);
    const queryVecStr = `[${queryVec.join(',')}]`;
    // Query the vector index on CodeEmbedding to get nodeIds and distances.
    const vectorQuery = `
    CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx',
      CAST(${queryVecStr} AS FLOAT[384]), ${k})
    YIELD node AS emb, distance
    WITH emb, distance
    WHERE distance < ${maxDistance}
    RETURN emb.nodeId AS nodeId, distance
    ORDER BY distance
  `;
    const embResults = await executeQuery(vectorQuery);
    if (embResults.length === 0) {
        return [];
    }
    // Resolve metadata for every hit in parallel; failed/missing lookups
    // yield null and are filtered out below.
    const lookups = embResults.map(async (embRow) => {
        const nodeId = embRow.nodeId ?? embRow[0];
        const distance = embRow.distance ?? embRow[1];
        // Node IDs are formatted as "Label:path:name" — recover the table name.
        const labelEndIdx = nodeId.indexOf(':');
        const label = labelEndIdx > 0 ? nodeId.substring(0, labelEndIdx) : 'Unknown';
        try {
            // Escape single quotes for the Cypher string literal.
            const escapedId = nodeId.replace(/'/g, "''");
            // File nodes don't have startLine/endLine.
            const nodeQuery = label === 'File'
                ? `
          MATCH (n:File {id: '${escapedId}'})
          RETURN n.name AS name, n.filePath AS filePath
        `
                : `
          MATCH (n:${label} {id: '${escapedId}'})
          RETURN n.name AS name, n.filePath AS filePath,
                 n.startLine AS startLine, n.endLine AS endLine
        `;
            const nodeRows = await executeQuery(nodeQuery);
            if (nodeRows.length === 0) {
                return null;
            }
            const nodeRow = nodeRows[0];
            return {
                nodeId,
                name: nodeRow.name ?? nodeRow[0] ?? '',
                label,
                filePath: nodeRow.filePath ?? nodeRow[1] ?? '',
                distance,
                startLine: label !== 'File' ? (nodeRow.startLine ?? nodeRow[2]) : undefined,
                endLine: label !== 'File' ? (nodeRow.endLine ?? nodeRow[3]) : undefined,
            };
        }
        catch {
            // Table might not exist for this label — skip the hit.
            return null;
        }
    });
    const resolved = await Promise.all(lookups);
    return resolved.filter((result) => result !== null);
};
|
|
293
|
+
/**
 * Semantic search with graph expansion (flattened results).
 *
 * Note: With multi-table schema, graph traversal is simplified.
 * Returns semantic matches with their metadata; all `connected*` fields
 * are always null here. For full graph traversal, use the
 * execute_vector_cypher tool directly.
 *
 * @param executeQuery - Function to execute Cypher queries
 * @param query - Search query text
 * @param k - Number of initial semantic matches (default: 5)
 * @param _hops - Unused (kept for API compatibility).
 * @returns Semantic matches with metadata
 */
export const semanticSearchWithContext = async (executeQuery, query, k = 5, _hops = 1) => {
    // Graph traversal is complex with separate tables — delegate to plain
    // semantic search and flatten into the legacy result shape.
    const matches = await semanticSearch(executeQuery, query, k, 0.5);
    return matches.map((match) => ({
        matchId: match.nodeId,
        matchName: match.name,
        matchLabel: match.label,
        matchPath: match.filePath,
        distance: match.distance,
        connectedId: null,
        connectedName: null,
        connectedLabel: null,
        relationType: null,
    }));
};
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
 * Text Generator Module
 *
 * Pure functions to generate embedding text from code nodes.
 * Combines node metadata with code snippets for semantic matching.
 */
import type { EmbeddableNode, EmbeddingConfig } from './types.js';
/**
 * Generate embedding text for any embeddable node.
 * Dispatches to the appropriate generator based on node label.
 *
 * @param node - The node to generate text for
 * @param config - Optional configuration for max snippet length
 * @returns Text suitable for embedding
 */
export declare const generateEmbeddingText: (node: EmbeddableNode, config?: Partial<EmbeddingConfig>) => string;
/**
 * Generate embedding texts for a batch of nodes.
 *
 * @param nodes - Array of nodes to generate text for
 * @param config - Optional configuration
 * @returns Array of texts in the same order as input nodes
 */
export declare const generateBatchEmbeddingTexts: (nodes: EmbeddableNode[], config?: Partial<EmbeddingConfig>) => string[];
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Text Generator Module
|
|
3
|
+
*
|
|
4
|
+
* Pure functions to generate embedding text from code nodes.
|
|
5
|
+
* Combines node metadata with code snippets for semantic matching.
|
|
6
|
+
*/
|
|
7
|
+
import { DEFAULT_EMBEDDING_CONFIG } from './types.js';
|
|
8
|
+
/**
 * Extract the filename from a file path.
 *
 * Splits on both POSIX ('/') and Windows ('\') separators so paths
 * produced on either platform yield the bare filename. Falls back to the
 * full input when the final segment is empty (e.g. trailing separator).
 */
const getFileName = (filePath) => {
    const segments = filePath.split(/[/\\]/);
    return segments[segments.length - 1] || filePath;
};
|
|
15
|
+
/**
 * Extract the directory path from a file path.
 * Returns '' when the path has no '/' separator.
 */
const getDirectory = (filePath) => {
    const lastSlash = filePath.lastIndexOf('/');
    return lastSlash >= 0 ? filePath.slice(0, lastSlash) : '';
};
|
|
23
|
+
/**
 * Truncate content to max length, preserving word boundaries.
 * Appends '...' whenever truncation occurs.
 */
const truncateContent = (content, maxLength) => {
    if (content.length <= maxLength) {
        return content;
    }
    const head = content.slice(0, maxLength);
    const breakAt = head.lastIndexOf(' ');
    // Only break at the space when doing so keeps >80% of the budget;
    // otherwise a hard cut loses less text.
    return breakAt > maxLength * 0.8
        ? head.slice(0, breakAt) + '...'
        : head + '...';
};
|
|
38
|
+
/**
 * Clean code content for embedding.
 * Normalizes CRLF line endings, collapses runs of 3+ blank lines to one
 * blank line, strips trailing whitespace per line, and trims the result.
 */
const cleanContent = (content) => {
    const normalized = content
        .replace(/\r\n/g, '\n')
        .replace(/\n{3,}/g, '\n\n');
    const trimmedLines = normalized.split('\n').map((line) => line.trimEnd());
    return trimmedLines.join('\n').trim();
};
|
|
54
|
+
/**
 * Generate embedding text for a Function node:
 * name + filename (+ directory) header followed by a cleaned,
 * length-capped code snippet when content is present.
 */
const generateFunctionText = (node, maxSnippetLength) => {
    const lines = [`Function: ${node.name}`, `File: ${getFileName(node.filePath)}`];
    const directory = getDirectory(node.filePath);
    if (directory) {
        lines.push(`Directory: ${directory}`);
    }
    if (node.content) {
        // Blank line separates the header from the code snippet.
        lines.push('', truncateContent(cleanContent(node.content), maxSnippetLength));
    }
    return lines.join('\n');
};
|
|
73
|
+
/**
 * Generate embedding text for a Class node:
 * name + filename (+ directory) header followed by a cleaned,
 * length-capped code snippet when content is present.
 */
const generateClassText = (node, maxSnippetLength) => {
    const lines = [`Class: ${node.name}`, `File: ${getFileName(node.filePath)}`];
    const directory = getDirectory(node.filePath);
    if (directory) {
        lines.push(`Directory: ${directory}`);
    }
    if (node.content) {
        // Blank line separates the header from the code snippet.
        lines.push('', truncateContent(cleanContent(node.content), maxSnippetLength));
    }
    return lines.join('\n');
};
|
|
92
|
+
/**
 * Generate embedding text for a Method node:
 * name + filename (+ directory) header followed by a cleaned,
 * length-capped code snippet when content is present.
 */
const generateMethodText = (node, maxSnippetLength) => {
    const lines = [`Method: ${node.name}`, `File: ${getFileName(node.filePath)}`];
    const directory = getDirectory(node.filePath);
    if (directory) {
        lines.push(`Directory: ${directory}`);
    }
    if (node.content) {
        // Blank line separates the header from the code snippet.
        lines.push('', truncateContent(cleanContent(node.content), maxSnippetLength));
    }
    return lines.join('\n');
};
|
|
111
|
+
/**
 * Generate embedding text for an Interface node:
 * name + filename (+ directory) header followed by a cleaned,
 * length-capped code snippet when content is present.
 */
const generateInterfaceText = (node, maxSnippetLength) => {
    const lines = [`Interface: ${node.name}`, `File: ${getFileName(node.filePath)}`];
    const directory = getDirectory(node.filePath);
    if (directory) {
        lines.push(`Directory: ${directory}`);
    }
    if (node.content) {
        // Blank line separates the header from the code snippet.
        lines.push('', truncateContent(cleanContent(node.content), maxSnippetLength));
    }
    return lines.join('\n');
};
|
|
130
|
+
/**
 * Generate embedding text for a File node.
 * Uses file name, full path, and a short snippet of the content.
 */
const generateFileText = (node, maxSnippetLength) => {
    const lines = [`File: ${node.name}`, `Path: ${node.filePath}`];
    if (node.content) {
        // Files can be very long — cap the snippet at 300 chars regardless
        // of the configured maximum.
        const budget = Math.min(maxSnippetLength, 300);
        lines.push('', truncateContent(cleanContent(node.content), budget));
    }
    return lines.join('\n');
};
|
|
147
|
+
/**
 * Generate embedding text for any embeddable node.
 * Dispatches to the label-specific generator; unknown labels fall back to
 * a minimal "Label: name / Path" representation.
 *
 * @param node - The node to generate text for
 * @param config - Optional configuration for max snippet length
 * @returns Text suitable for embedding
 */
export const generateEmbeddingText = (node, config = {}) => {
    const maxSnippetLength = config.maxSnippetLength ?? DEFAULT_EMBEDDING_CONFIG.maxSnippetLength;
    // Label → generator lookup replaces a switch statement.
    const generators = {
        Function: generateFunctionText,
        Class: generateClassText,
        Method: generateMethodText,
        Interface: generateInterfaceText,
        File: generateFileText,
    };
    const generator = generators[node.label];
    return generator
        ? generator(node, maxSnippetLength)
        : `${node.label}: ${node.name}\nPath: ${node.filePath}`;
};
|
|
173
|
+
/**
 * Generate embedding texts for a batch of nodes.
 *
 * @param nodes - Array of nodes to generate text for
 * @param config - Optional configuration
 * @returns Array of texts in the same order as input nodes
 */
export const generateBatchEmbeddingTexts = (nodes, config = {}) => {
    const texts = [];
    for (const node of nodes) {
        texts.push(generateEmbeddingText(node, config));
    }
    return texts;
};
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
/**
 * Embedding Pipeline Types
 *
 * Type definitions for the embedding generation and semantic search system.
 */
/**
 * Node labels that should be embedded for semantic search.
 * These are code elements that benefit from semantic matching.
 */
export declare const EMBEDDABLE_LABELS: readonly ["Function", "Class", "Method", "Interface", "File"];
export type EmbeddableLabel = typeof EMBEDDABLE_LABELS[number];
/**
 * Check if a label should be embedded (type guard over EMBEDDABLE_LABELS).
 */
export declare const isEmbeddableLabel: (label: string) => label is EmbeddableLabel;
/**
 * Embedding pipeline phases.
 */
export type EmbeddingPhase = 'idle' | 'loading-model' | 'embedding' | 'indexing' | 'ready' | 'error';
/**
 * Progress information for the embedding pipeline.
 * Optional fields are populated only in the phases where they apply
 * (e.g. modelDownloadPercent during 'loading-model', batch counters
 * during 'embedding', error during 'error').
 */
export interface EmbeddingProgress {
    phase: EmbeddingPhase;
    percent: number;
    modelDownloadPercent?: number;
    nodesProcessed?: number;
    totalNodes?: number;
    currentBatch?: number;
    totalBatches?: number;
    error?: string;
}
/**
 * Configuration for the embedding pipeline.
 */
export interface EmbeddingConfig {
    /** Model identifier for transformers.js */
    modelId: string;
    /** Number of nodes to embed in each batch */
    batchSize: number;
    /** Embedding vector dimensions */
    dimensions: number;
    /** Device to use for inference: 'auto' tries GPU first, falls back to CPU */
    device: 'auto' | 'webgpu' | 'cuda' | 'cpu' | 'wasm';
    /** Maximum characters of code snippet to include */
    maxSnippetLength: number;
}
/**
 * Default embedding configuration.
 * Uses snowflake-arctic-embed-xs for browser efficiency.
 * Tries WebGPU first (fast); user can choose WASM fallback if unavailable.
 */
export declare const DEFAULT_EMBEDDING_CONFIG: EmbeddingConfig;
/**
 * Result from semantic search.
 * startLine/endLine are absent for File nodes.
 */
export interface SemanticSearchResult {
    nodeId: string;
    name: string;
    label: string;
    filePath: string;
    distance: number;
    startLine?: number;
    endLine?: number;
}
/**
 * Node data for embedding (minimal structure from KuzuDB query).
 * startLine/endLine are absent for File nodes.
 */
export interface EmbeddableNode {
    id: string;
    name: string;
    label: string;
    filePath: string;
    content: string;
    startLine?: number;
    endLine?: number;
}
/**
 * Model download progress from transformers.js.
 */
export interface ModelProgress {
    status: 'initiate' | 'download' | 'progress' | 'done' | 'ready';
    file?: string;
    progress?: number;
    loaded?: number;
    total?: number;
}
|