codevault 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211)
  1. package/.env.example +75 -40
  2. package/README.md +112 -345
  3. package/package.json +4 -3
  4. package/dist/chunking/file-grouper.d.ts +0 -39
  5. package/dist/chunking/file-grouper.d.ts.map +0 -1
  6. package/dist/chunking/file-grouper.js +0 -181
  7. package/dist/chunking/file-grouper.js.map +0 -1
  8. package/dist/chunking/semantic-chunker.d.ts +0 -37
  9. package/dist/chunking/semantic-chunker.d.ts.map +0 -1
  10. package/dist/chunking/semantic-chunker.js +0 -172
  11. package/dist/chunking/semantic-chunker.js.map +0 -1
  12. package/dist/chunking/token-counter.d.ts +0 -28
  13. package/dist/chunking/token-counter.d.ts.map +0 -1
  14. package/dist/chunking/token-counter.js +0 -207
  15. package/dist/chunking/token-counter.js.map +0 -1
  16. package/dist/cli/commands/ask-cmd.d.ts +0 -3
  17. package/dist/cli/commands/ask-cmd.d.ts.map +0 -1
  18. package/dist/cli/commands/ask-cmd.js +0 -130
  19. package/dist/cli/commands/ask-cmd.js.map +0 -1
  20. package/dist/cli/commands/config-cmd.d.ts +0 -3
  21. package/dist/cli/commands/config-cmd.d.ts.map +0 -1
  22. package/dist/cli/commands/config-cmd.js +0 -245
  23. package/dist/cli/commands/config-cmd.js.map +0 -1
  24. package/dist/cli/commands/context.d.ts +0 -3
  25. package/dist/cli/commands/context.d.ts.map +0 -1
  26. package/dist/cli/commands/context.js +0 -98
  27. package/dist/cli/commands/context.js.map +0 -1
  28. package/dist/cli/commands/interactive-config.d.ts +0 -2
  29. package/dist/cli/commands/interactive-config.d.ts.map +0 -1
  30. package/dist/cli/commands/interactive-config.js +0 -274
  31. package/dist/cli/commands/interactive-config.js.map +0 -1
  32. package/dist/cli.d.ts +0 -3
  33. package/dist/cli.d.ts.map +0 -1
  34. package/dist/cli.js +0 -398
  35. package/dist/cli.js.map +0 -1
  36. package/dist/codemap/io.d.ts +0 -5
  37. package/dist/codemap/io.d.ts.map +0 -1
  38. package/dist/codemap/io.js +0 -30
  39. package/dist/codemap/io.js.map +0 -1
  40. package/dist/config/apply-env.d.ts +0 -15
  41. package/dist/config/apply-env.d.ts.map +0 -1
  42. package/dist/config/apply-env.js +0 -91
  43. package/dist/config/apply-env.js.map +0 -1
  44. package/dist/config/loader.d.ts +0 -57
  45. package/dist/config/loader.d.ts.map +0 -1
  46. package/dist/config/loader.js +0 -279
  47. package/dist/config/loader.js.map +0 -1
  48. package/dist/config/types.d.ts +0 -46
  49. package/dist/config/types.d.ts.map +0 -1
  50. package/dist/config/types.js +0 -2
  51. package/dist/config/types.js.map +0 -1
  52. package/dist/context/packs.d.ts +0 -33
  53. package/dist/context/packs.d.ts.map +0 -1
  54. package/dist/context/packs.js +0 -180
  55. package/dist/context/packs.js.map +0 -1
  56. package/dist/core/batch-indexer.d.ts +0 -44
  57. package/dist/core/batch-indexer.d.ts.map +0 -1
  58. package/dist/core/batch-indexer.js +0 -161
  59. package/dist/core/batch-indexer.js.map +0 -1
  60. package/dist/core/indexer.d.ts +0 -3
  61. package/dist/core/indexer.d.ts.map +0 -1
  62. package/dist/core/indexer.js +0 -624
  63. package/dist/core/indexer.js.map +0 -1
  64. package/dist/core/metadata.d.ts +0 -19
  65. package/dist/core/metadata.d.ts.map +0 -1
  66. package/dist/core/metadata.js +0 -161
  67. package/dist/core/metadata.js.map +0 -1
  68. package/dist/core/search.d.ts +0 -7
  69. package/dist/core/search.d.ts.map +0 -1
  70. package/dist/core/search.js +0 -542
  71. package/dist/core/search.js.map +0 -1
  72. package/dist/core/symbol-extractor.d.ts +0 -3
  73. package/dist/core/symbol-extractor.d.ts.map +0 -1
  74. package/dist/core/symbol-extractor.js +0 -78
  75. package/dist/core/symbol-extractor.js.map +0 -1
  76. package/dist/core/types.d.ts +0 -104
  77. package/dist/core/types.d.ts.map +0 -1
  78. package/dist/core/types.js +0 -2
  79. package/dist/core/types.js.map +0 -1
  80. package/dist/database/db.d.ts +0 -63
  81. package/dist/database/db.d.ts.map +0 -1
  82. package/dist/database/db.js +0 -205
  83. package/dist/database/db.js.map +0 -1
  84. package/dist/indexer/merkle.d.ts +0 -13
  85. package/dist/indexer/merkle.d.ts.map +0 -1
  86. package/dist/indexer/merkle.js +0 -86
  87. package/dist/indexer/merkle.js.map +0 -1
  88. package/dist/indexer/update.d.ts +0 -19
  89. package/dist/indexer/update.d.ts.map +0 -1
  90. package/dist/indexer/update.js +0 -40
  91. package/dist/indexer/update.js.map +0 -1
  92. package/dist/indexer/watch.d.ts +0 -21
  93. package/dist/indexer/watch.d.ts.map +0 -1
  94. package/dist/indexer/watch.js +0 -222
  95. package/dist/indexer/watch.js.map +0 -1
  96. package/dist/languages/rules.d.ts +0 -11
  97. package/dist/languages/rules.d.ts.map +0 -1
  98. package/dist/languages/rules.js +0 -371
  99. package/dist/languages/rules.js.map +0 -1
  100. package/dist/languages/tree-sitter-loader.d.ts +0 -27
  101. package/dist/languages/tree-sitter-loader.d.ts.map +0 -1
  102. package/dist/languages/tree-sitter-loader.js +0 -76
  103. package/dist/languages/tree-sitter-loader.js.map +0 -1
  104. package/dist/mcp/tools/ask-codebase.d.ts +0 -85
  105. package/dist/mcp/tools/ask-codebase.d.ts.map +0 -1
  106. package/dist/mcp/tools/ask-codebase.js +0 -125
  107. package/dist/mcp/tools/ask-codebase.js.map +0 -1
  108. package/dist/mcp/tools/use-context-pack.d.ts +0 -57
  109. package/dist/mcp/tools/use-context-pack.d.ts.map +0 -1
  110. package/dist/mcp/tools/use-context-pack.js +0 -91
  111. package/dist/mcp/tools/use-context-pack.js.map +0 -1
  112. package/dist/mcp-server.d.ts +0 -3
  113. package/dist/mcp-server.d.ts.map +0 -1
  114. package/dist/mcp-server.js +0 -518
  115. package/dist/mcp-server.js.map +0 -1
  116. package/dist/providers/base.d.ts +0 -39
  117. package/dist/providers/base.d.ts.map +0 -1
  118. package/dist/providers/base.js +0 -198
  119. package/dist/providers/base.js.map +0 -1
  120. package/dist/providers/chat-llm.d.ts +0 -30
  121. package/dist/providers/chat-llm.d.ts.map +0 -1
  122. package/dist/providers/chat-llm.js +0 -82
  123. package/dist/providers/chat-llm.js.map +0 -1
  124. package/dist/providers/index.d.ts +0 -5
  125. package/dist/providers/index.d.ts.map +0 -1
  126. package/dist/providers/index.js +0 -12
  127. package/dist/providers/index.js.map +0 -1
  128. package/dist/providers/ollama.d.ts +0 -13
  129. package/dist/providers/ollama.d.ts.map +0 -1
  130. package/dist/providers/ollama.js +0 -50
  131. package/dist/providers/ollama.js.map +0 -1
  132. package/dist/providers/openai.d.ts +0 -14
  133. package/dist/providers/openai.d.ts.map +0 -1
  134. package/dist/providers/openai.js +0 -122
  135. package/dist/providers/openai.js.map +0 -1
  136. package/dist/providers/token-counter.d.ts +0 -2
  137. package/dist/providers/token-counter.d.ts.map +0 -1
  138. package/dist/providers/token-counter.js +0 -18
  139. package/dist/providers/token-counter.js.map +0 -1
  140. package/dist/ranking/api-reranker.d.ts +0 -18
  141. package/dist/ranking/api-reranker.d.ts.map +0 -1
  142. package/dist/ranking/api-reranker.js +0 -134
  143. package/dist/ranking/api-reranker.js.map +0 -1
  144. package/dist/ranking/symbol-boost.d.ts +0 -15
  145. package/dist/ranking/symbol-boost.d.ts.map +0 -1
  146. package/dist/ranking/symbol-boost.js +0 -154
  147. package/dist/ranking/symbol-boost.js.map +0 -1
  148. package/dist/search/bm25.d.ts +0 -17
  149. package/dist/search/bm25.d.ts.map +0 -1
  150. package/dist/search/bm25.js +0 -56
  151. package/dist/search/bm25.js.map +0 -1
  152. package/dist/search/hybrid.d.ts +0 -21
  153. package/dist/search/hybrid.d.ts.map +0 -1
  154. package/dist/search/hybrid.js +0 -50
  155. package/dist/search/hybrid.js.map +0 -1
  156. package/dist/search/scope.d.ts +0 -5
  157. package/dist/search/scope.d.ts.map +0 -1
  158. package/dist/search/scope.js +0 -107
  159. package/dist/search/scope.js.map +0 -1
  160. package/dist/storage/encrypted-chunks.d.ts +0 -40
  161. package/dist/storage/encrypted-chunks.d.ts.map +0 -1
  162. package/dist/storage/encrypted-chunks.js +0 -237
  163. package/dist/storage/encrypted-chunks.js.map +0 -1
  164. package/dist/symbols/extract.d.ts +0 -15
  165. package/dist/symbols/extract.d.ts.map +0 -1
  166. package/dist/symbols/extract.js +0 -187
  167. package/dist/symbols/extract.js.map +0 -1
  168. package/dist/symbols/graph.d.ts +0 -3
  169. package/dist/symbols/graph.d.ts.map +0 -1
  170. package/dist/symbols/graph.js +0 -89
  171. package/dist/symbols/graph.js.map +0 -1
  172. package/dist/synthesis/markdown-formatter.d.ts +0 -13
  173. package/dist/synthesis/markdown-formatter.d.ts.map +0 -1
  174. package/dist/synthesis/markdown-formatter.js +0 -104
  175. package/dist/synthesis/markdown-formatter.js.map +0 -1
  176. package/dist/synthesis/prompt-builder.d.ts +0 -21
  177. package/dist/synthesis/prompt-builder.d.ts.map +0 -1
  178. package/dist/synthesis/prompt-builder.js +0 -129
  179. package/dist/synthesis/prompt-builder.js.map +0 -1
  180. package/dist/synthesis/synthesizer.d.ts +0 -30
  181. package/dist/synthesis/synthesizer.d.ts.map +0 -1
  182. package/dist/synthesis/synthesizer.js +0 -210
  183. package/dist/synthesis/synthesizer.js.map +0 -1
  184. package/dist/types/ast.d.ts +0 -3
  185. package/dist/types/ast.d.ts.map +0 -1
  186. package/dist/types/ast.js +0 -2
  187. package/dist/types/ast.js.map +0 -1
  188. package/dist/types/codemap.d.ts +0 -58
  189. package/dist/types/codemap.d.ts.map +0 -1
  190. package/dist/types/codemap.js +0 -224
  191. package/dist/types/codemap.js.map +0 -1
  192. package/dist/types/context-pack.d.ts +0 -47
  193. package/dist/types/context-pack.d.ts.map +0 -1
  194. package/dist/types/context-pack.js +0 -44
  195. package/dist/types/context-pack.js.map +0 -1
  196. package/dist/types/search.d.ts +0 -15
  197. package/dist/types/search.d.ts.map +0 -1
  198. package/dist/types/search.js +0 -11
  199. package/dist/types/search.js.map +0 -1
  200. package/dist/utils/cli-ui.d.ts +0 -44
  201. package/dist/utils/cli-ui.d.ts.map +0 -1
  202. package/dist/utils/cli-ui.js +0 -139
  203. package/dist/utils/cli-ui.js.map +0 -1
  204. package/dist/utils/indexer-with-progress.d.ts +0 -10
  205. package/dist/utils/indexer-with-progress.d.ts.map +0 -1
  206. package/dist/utils/indexer-with-progress.js +0 -58
  207. package/dist/utils/indexer-with-progress.js.map +0 -1
  208. package/dist/utils/rate-limiter.d.ts +0 -34
  209. package/dist/utils/rate-limiter.d.ts.map +0 -1
  210. package/dist/utils/rate-limiter.js +0 -178
  211. package/dist/utils/rate-limiter.js.map +0 -1
@@ -1,624 +0,0 @@
1
- import crypto from 'crypto';
2
- import fg from 'fast-glob';
3
- import fs from 'fs';
4
- import path from 'path';
5
- import Parser from 'tree-sitter';
6
- import { createEmbeddingProvider, getModelProfile, getSizeLimits } from '../providers/index.js';
7
- import { BATCH_SIZE } from '../providers/base.js';
8
- import { analyzeNodeForChunking, batchAnalyzeNodes, yieldStatementChunks } from '../chunking/semantic-chunker.js';
9
- import { groupNodesForChunking, createCombinedChunk } from '../chunking/file-grouper.js';
10
- import { getTokenCountStats } from '../chunking/token-counter.js';
11
- import { readCodemap, writeCodemap } from '../codemap/io.js';
12
- import { normalizeChunkMetadata } from '../types/codemap.js';
13
- import { LANG_RULES, getSupportedLanguageExtensions } from '../languages/rules.js';
14
- import { cloneMerkle, computeFastHash, loadMerkle, normalizeToProjectPath, removeMerkleEntry, saveMerkle } from '../indexer/merkle.js';
15
- import { extractSymbolMetadata } from '../symbols/extract.js';
16
- import { attachSymbolGraphToCodemap } from '../symbols/graph.js';
17
- import { resolveEncryptionPreference, writeChunkToDisk, removeChunkArtifacts } from '../storage/encrypted-chunks.js';
18
- import { extractSymbolName } from './symbol-extractor.js';
19
- import { extractCodevaultMetadata, extractSemanticTags, extractImportantVariables, extractDocComments, generateEnhancedEmbeddingText } from './metadata.js';
20
- import { Database, initDatabase } from '../database/db.js';
21
- import { BatchEmbeddingProcessor } from './batch-indexer.js';
22
- export async function indexProject({ repoPath = '.', provider = 'auto', onProgress = null, changedFiles = null, deletedFiles = [], embeddingProviderOverride = null, encryptMode = undefined } = {}) {
23
- const repo = path.resolve(repoPath);
24
- if (!fs.existsSync(repo)) {
25
- throw new Error(`Directory ${repo} does not exist`);
26
- }
27
- const normalizedChanged = Array.isArray(changedFiles)
28
- ? Array.from(new Set(changedFiles
29
- .map(file => normalizeToProjectPath(repo, file))
30
- .filter(Boolean)))
31
- : null;
32
- const normalizedDeleted = Array.from(new Set((Array.isArray(deletedFiles) ? deletedFiles : [])
33
- .map(file => normalizeToProjectPath(repo, file))
34
- .filter(Boolean)));
35
- const deletedSet = new Set(normalizedDeleted);
36
- const languagePatterns = getSupportedLanguageExtensions().map(ext => `**/*${ext}`);
37
- let files = [];
38
- if (normalizedChanged === null) {
39
- files = await fg(languagePatterns, {
40
- cwd: repo,
41
- absolute: false,
42
- followSymbolicLinks: false,
43
- ignore: [
44
- '**/vendor/**',
45
- '**/node_modules/**',
46
- '**/.git/**',
47
- '**/storage/**',
48
- '**/dist/**',
49
- '**/build/**',
50
- '**/tmp/**',
51
- '**/temp/**',
52
- '**/.npm/**',
53
- '**/.yarn/**',
54
- '**/Library/**',
55
- '**/System/**',
56
- '**/.Trash/**',
57
- '**/.codevault/**',
58
- '**/codevault.codemap.json',
59
- '**/codevault.codemap.json.backup-*',
60
- '**/package-lock.json',
61
- '**/yarn.lock',
62
- '**/pnpm-lock.yaml',
63
- '**/*.json',
64
- '**/*.sh',
65
- '**/examples/**',
66
- '**/assets/**'
67
- ],
68
- onlyFiles: true,
69
- dot: false
70
- });
71
- }
72
- else {
73
- files = normalizedChanged.filter(rel => {
74
- const ext = path.extname(rel).toLowerCase();
75
- return !!LANG_RULES[ext];
76
- });
77
- }
78
- const uniqueFiles = [];
79
- const seenFiles = new Set();
80
- for (const rel of files) {
81
- if (!rel || seenFiles.has(rel)) {
82
- continue;
83
- }
84
- const absPath = path.join(repo, rel);
85
- if (!fs.existsSync(absPath)) {
86
- deletedSet.add(rel);
87
- continue;
88
- }
89
- seenFiles.add(rel);
90
- uniqueFiles.push(rel);
91
- }
92
- files = uniqueFiles;
93
- const isPartialUpdate = normalizedChanged !== null;
94
- const embeddingProvider = embeddingProviderOverride || createEmbeddingProvider(provider);
95
- if (!embeddingProviderOverride && embeddingProvider.init) {
96
- await embeddingProvider.init();
97
- }
98
- const providerName = embeddingProvider.getName();
99
- const modelName = embeddingProvider.getModelName ? embeddingProvider.getModelName() : null;
100
- const modelProfile = await getModelProfile(providerName, modelName || providerName);
101
- const limits = getSizeLimits(modelProfile);
102
- if (!process.env.CODEVAULT_QUIET) {
103
- console.log(`\n📊 Chunking Configuration:`);
104
- console.log(` Provider: ${providerName}`);
105
- if (modelName)
106
- console.log(` Model: ${modelName}`);
107
- console.log(` Dimensions: ${embeddingProvider.getDimensions()}`);
108
- console.log(` Chunking mode: ${limits.unit}`);
109
- console.log(` Optimal size: ${limits.optimal} ${limits.unit}`);
110
- console.log(` Min/Max: ${limits.min}-${limits.max} ${limits.unit}`);
111
- console.log(` Overlap: ${limits.overlap} ${limits.unit}`);
112
- console.log(` Batch size: ${BATCH_SIZE} chunks per API call`);
113
- if (modelProfile.useTokens && modelProfile.tokenCounter) {
114
- console.log(` ✓ Token counting enabled`);
115
- }
116
- else {
117
- console.log(` ℹ Using character estimation (token counting unavailable)`);
118
- }
119
- if (embeddingProvider.rateLimiter) {
120
- const rateLimiterStats = embeddingProvider.rateLimiter.getStats();
121
- if (rateLimiterStats.isLimited) {
122
- console.log(` 🔒 Rate limiting: ${rateLimiterStats.rpm} requests/minute`);
123
- }
124
- else {
125
- console.log(` ⚔ Rate limiting: disabled (local model)`);
126
- }
127
- }
128
- console.log('');
129
- }
130
- await initDatabase(embeddingProvider.getDimensions(), repo);
131
- const codemapPath = path.join(repo, 'codevault.codemap.json');
132
- const chunkDir = path.join(repo, '.codevault/chunks');
133
- const dbPath = path.join(repo, '.codevault/codevault.db');
134
- if (fs.existsSync(dbPath)) {
135
- const db = new Database(dbPath);
136
- try {
137
- const existingDimensions = await db.getExistingDimensions();
138
- if (existingDimensions.length > 0) {
139
- const currentProvider = embeddingProvider.getName();
140
- const currentDimensions = embeddingProvider.getDimensions();
141
- const hasMismatch = existingDimensions.some(row => row.embedding_provider !== currentProvider ||
142
- row.embedding_dimensions !== currentDimensions);
143
- if (hasMismatch) {
144
- console.log('\n⚠️ WARNING: Dimension/Provider Mismatch Detected!');
145
- console.log('='.repeat(60));
146
- console.log('Existing index:');
147
- existingDimensions.forEach(row => {
148
- console.log(` ${row.embedding_provider} (${row.embedding_dimensions}D)`);
149
- });
150
- console.log(`Current config: ${currentProvider} (${currentDimensions}D)`);
151
- console.log('\nRecommendation: Full re-index for consistent results');
152
- console.log('='.repeat(60) + '\n');
153
- await new Promise(resolve => setTimeout(resolve, 2000));
154
- }
155
- }
156
- }
157
- catch (error) {
158
- // Ignore migration check errors
159
- }
160
- finally {
161
- db.close();
162
- }
163
- }
164
- const encryptionPreference = resolveEncryptionPreference({ mode: encryptMode, logger: console });
165
- let codemap = readCodemap(codemapPath);
166
- const merkle = loadMerkle(repo);
167
- const updatedMerkle = cloneMerkle(merkle);
168
- let merkleDirty = false;
169
- let indexMutated = false;
170
- const parser = new Parser();
171
- let processedChunks = 0;
172
- const errors = [];
173
- const chunkingStats = {
174
- totalNodes: 0,
175
- skippedSmall: 0,
176
- subdivided: 0,
177
- statementFallback: 0,
178
- normalChunks: 0,
179
- mergedSmall: 0
180
- };
181
- const db = new Database(dbPath);
182
- // Create batch processor for efficient embedding generation
183
- const batchProcessor = new BatchEmbeddingProcessor(embeddingProvider, db, BATCH_SIZE);
184
- async function deleteChunks(chunkIds, metadataLookup = new Map()) {
185
- if (!Array.isArray(chunkIds) || chunkIds.length === 0) {
186
- return;
187
- }
188
- await db.deleteChunks(chunkIds);
189
- for (const chunkId of chunkIds) {
190
- const metadata = metadataLookup.get(chunkId) || codemap[chunkId];
191
- if (metadata && metadata.sha) {
192
- removeChunkArtifacts(chunkDir, metadata.sha);
193
- }
194
- delete codemap[chunkId];
195
- }
196
- }
197
- async function embedAndStore(params) {
198
- try {
199
- // Add to batch processor instead of immediate embedding
200
- await batchProcessor.addChunk({
201
- chunkId: params.chunkId,
202
- enhancedEmbeddingText: params.enhancedEmbeddingText,
203
- params: {
204
- code: params.code,
205
- sha: params.sha,
206
- lang: params.lang,
207
- rel: params.rel,
208
- symbol: params.symbol,
209
- chunkType: params.chunkType,
210
- codevaultMetadata: params.codevaultMetadata,
211
- importantVariables: params.importantVariables,
212
- docComments: params.docComments,
213
- contextInfo: params.contextInfo
214
- }
215
- });
216
- indexMutated = true;
217
- fs.mkdirSync(chunkDir, { recursive: true });
218
- const writeResult = writeChunkToDisk({
219
- chunkDir,
220
- sha: params.sha,
221
- code: params.code,
222
- encryption: encryptionPreference
223
- });
224
- const previousMetadata = codemap[params.chunkId];
225
- codemap[params.chunkId] = normalizeChunkMetadata({
226
- file: params.rel,
227
- symbol: params.symbol,
228
- sha: params.sha,
229
- lang: params.lang,
230
- chunkType: params.chunkType,
231
- provider: embeddingProvider.getName(),
232
- dimensions: embeddingProvider.getDimensions(),
233
- hasCodevaultTags: Array.isArray(params.codevaultMetadata.tags) && params.codevaultMetadata.tags.length > 0,
234
- hasIntent: !!params.codevaultMetadata.intent,
235
- hasDocumentation: !!params.docComments,
236
- variableCount: Array.isArray(params.importantVariables) ? params.importantVariables.length : 0,
237
- encrypted: !!(writeResult && writeResult.encrypted),
238
- symbol_signature: params.symbolData && params.symbolData.signature ? params.symbolData.signature : undefined,
239
- symbol_parameters: params.symbolData && Array.isArray(params.symbolData.parameters) ? params.symbolData.parameters : undefined,
240
- symbol_return: params.symbolData && params.symbolData.returnType ? params.symbolData.returnType : undefined,
241
- symbol_calls: params.symbolData && Array.isArray(params.symbolData.calls) ? params.symbolData.calls : undefined
242
- }, previousMetadata);
243
- }
244
- catch (error) {
245
- errors.push({ type: 'indexing_error', chunkId: params.chunkId, error: error.message });
246
- throw error;
247
- }
248
- }
249
- async function removeFileArtifacts(fileRel) {
250
- const entries = Object.entries(codemap)
251
- .filter(([, metadata]) => metadata && metadata.file === fileRel);
252
- if (entries.length > 0) {
253
- const metadataLookup = new Map(entries);
254
- await deleteChunks(entries.map(([chunkId]) => chunkId), metadataLookup);
255
- indexMutated = true;
256
- }
257
- if (removeMerkleEntry(updatedMerkle, fileRel)) {
258
- merkleDirty = true;
259
- }
260
- }
261
- for (const rel of files) {
262
- deletedSet.delete(rel);
263
- const abs = path.join(repo, rel);
264
- const ext = path.extname(rel).toLowerCase();
265
- const rule = LANG_RULES[ext];
266
- if (!rule)
267
- continue;
268
- const existingChunks = new Map(Object.entries(codemap)
269
- .filter(([, metadata]) => metadata && metadata.file === rel));
270
- const staleChunkIds = new Set(existingChunks.keys());
271
- const chunkMerkleHashes = [];
272
- let fileHash = null;
273
- try {
274
- const source = fs.readFileSync(abs, 'utf8');
275
- fileHash = await computeFastHash(source);
276
- const previousMerkle = merkle[rel];
277
- if (previousMerkle && previousMerkle.shaFile === fileHash) {
278
- continue;
279
- }
280
- const SIZE_THRESHOLD = 30000;
281
- const CHUNK_SIZE = 30000;
282
- let tree;
283
- try {
284
- parser.setLanguage(rule.ts);
285
- if (source.length > SIZE_THRESHOLD) {
286
- tree = parser.parse((index) => {
287
- if (index < source.length) {
288
- return source.slice(index, Math.min(index + CHUNK_SIZE, source.length));
289
- }
290
- return null;
291
- });
292
- }
293
- else {
294
- tree = parser.parse(source);
295
- }
296
- if (!tree || !tree.rootNode) {
297
- throw new Error('Failed to create syntax tree');
298
- }
299
- }
300
- catch (parseError) {
301
- throw parseError;
302
- }
303
- const collectedNodes = [];
304
- function collectNodes(node) {
305
- if (node.type === 'export_statement') {
306
- let hasDeclaration = false;
307
- for (let i = 0; i < node.childCount; i++) {
308
- const child = node.child(i);
309
- if (child && ['function_declaration', 'class_declaration', 'method_definition'].includes(child.type)) {
310
- hasDeclaration = true;
311
- break;
312
- }
313
- }
314
- if (!hasDeclaration && rule.nodeTypes.includes(node.type)) {
315
- collectedNodes.push(node);
316
- return;
317
- }
318
- if (hasDeclaration) {
319
- for (let i = 0; i < node.childCount; i++) {
320
- const child = node.child(i);
321
- if (child) {
322
- collectNodes(child);
323
- }
324
- }
325
- return;
326
- }
327
- }
328
- if (rule.nodeTypes.includes(node.type)) {
329
- collectedNodes.push(node);
330
- }
331
- for (let i = 0; i < node.childCount; i++) {
332
- const child = node.child(i);
333
- if (child) {
334
- collectNodes(child);
335
- }
336
- }
337
- }
338
- collectNodes(tree.rootNode);
339
- const nodeGroups = await groupNodesForChunking(collectedNodes, source, modelProfile, rule);
340
- const processedNodes = new Set();
341
- async function processNodeGroup(nodeGroup) {
342
- if (nodeGroup.nodes.length === 1) {
343
- await yieldChunk(nodeGroup.nodes[0]);
344
- return;
345
- }
346
- const combinedChunk = createCombinedChunk(nodeGroup, source, rel);
347
- if (combinedChunk) {
348
- chunkingStats.totalNodes += nodeGroup.nodes.length;
349
- chunkingStats.fileGrouped = (chunkingStats.fileGrouped || 0) + 1;
350
- chunkingStats.functionsGrouped = (chunkingStats.functionsGrouped || 0) + nodeGroup.nodes.length;
351
- await processChunk(combinedChunk.node, combinedChunk.code, `group_${nodeGroup.nodes.length}funcs`, null);
352
- }
353
- }
354
- async function yieldChunk(node, parentNode = null) {
355
- chunkingStats.totalNodes++;
356
- const analysis = await analyzeNodeForChunking(node, source, rule, modelProfile);
357
- if (analysis.size < limits.min && parentNode !== null) {
358
- chunkingStats.skippedSmall++;
359
- return;
360
- }
361
- if (analysis.needsSubdivision && analysis.subdivisionCandidates.length > 0) {
362
- chunkingStats.subdivided++;
363
- const subAnalyses = await batchAnalyzeNodes(analysis.subdivisionCandidates, source, rule, modelProfile, true);
364
- const smallChunks = [];
365
- for (let i = 0; i < subAnalyses.length; i++) {
366
- const subAnalysis = subAnalyses[i];
367
- const subNode = subAnalysis.node;
368
- if (subAnalysis.size < limits.min) {
369
- const subCode = source.slice(subNode.startIndex, subNode.endIndex);
370
- smallChunks.push({
371
- node: subNode,
372
- code: subCode,
373
- size: subAnalysis.size
374
- });
375
- if (subNode.id !== undefined) {
376
- processedNodes.add(subNode.id);
377
- }
378
- }
379
- else {
380
- if (subNode.id !== undefined) {
381
- processedNodes.add(subNode.id);
382
- }
383
- await yieldChunk(subNode, node);
384
- }
385
- }
386
- if (smallChunks.length > 0) {
387
- const totalSmallSize = smallChunks.reduce((sum, c) => sum + c.size, 0);
388
- if (totalSmallSize >= limits.min || smallChunks.length >= 3) {
389
- const mergedCode = smallChunks.map((c) => c.code).join('\n\n');
390
- const mergedNode = {
391
- ...node,
392
- type: `${node.type}_merged`,
393
- startIndex: smallChunks[0].node.startIndex,
394
- endIndex: smallChunks[smallChunks.length - 1].node.endIndex
395
- };
396
- const suffix = `small_methods_${smallChunks.length}`;
397
- chunkingStats.mergedSmall++;
398
- await processChunk(mergedNode, mergedCode, suffix, parentNode);
399
- }
400
- else {
401
- chunkingStats.skippedSmall += smallChunks.length;
402
- }
403
- }
404
- return;
405
- }
406
- else if (analysis.size > limits.max) {
407
- chunkingStats.statementFallback++;
408
- const code = source.slice(node.startIndex, node.endIndex);
409
- const statementChunks = await yieldStatementChunks(node, source, limits.max, limits.overlap, modelProfile);
410
- for (let i = 0; i < statementChunks.length; i++) {
411
- const stmtChunk = statementChunks[i];
412
- await processChunk(node, stmtChunk.code, `${i + 1}`, parentNode);
413
- }
414
- return;
415
- }
416
- chunkingStats.normalChunks++;
417
- const code = source.slice(node.startIndex, node.endIndex);
418
- await processChunk(node, code, null, parentNode);
419
- }
420
- async function processChunk(node, code, suffix = null, parentNode = null) {
421
- let symbol = extractSymbolName(node, source);
422
- if (!symbol)
423
- return;
424
- if (suffix) {
425
- symbol = `${symbol}_part${suffix}`;
426
- }
427
- const docComments = extractDocComments(source, node, rule);
428
- const codevaultMetadata = extractCodevaultMetadata(docComments);
429
- const automaticTags = extractSemanticTags(rel, symbol, code);
430
- const allTags = [...new Set([...codevaultMetadata.tags, ...automaticTags])];
431
- codevaultMetadata.tags = allTags;
432
- const importantVariables = extractImportantVariables(node, source, rule);
433
- const symbolData = extractSymbolMetadata({ node, source, symbol });
434
- const enhancedEmbeddingText = generateEnhancedEmbeddingText(code, codevaultMetadata, importantVariables, docComments);
435
- const chunkType = node.type.includes('class') ? 'class' :
436
- node.type.includes('method') ? 'method' : 'function';
437
- const contextInfo = {
438
- nodeType: node.type,
439
- startLine: source.slice(0, node.startIndex).split('\n').length,
440
- endLine: source.slice(0, node.endIndex).split('\n').length,
441
- codeLength: code.length,
442
- hasDocumentation: !!docComments,
443
- variableCount: importantVariables.length,
444
- isSubdivision: !!suffix,
445
- hasParentContext: !!parentNode
446
- };
447
- const sha = crypto.createHash('sha1').update(code).digest('hex');
448
- const chunkId = `${rel}:${symbol}:${sha.substring(0, 8)}`;
449
- const chunkMerkleHash = await computeFastHash(code);
450
- if (codemap[chunkId]?.sha === sha) {
451
- staleChunkIds.delete(chunkId);
452
- chunkMerkleHashes.push(chunkMerkleHash);
453
- return;
454
- }
455
- await embedAndStore({
456
- code,
457
- enhancedEmbeddingText,
458
- chunkId,
459
- sha,
460
- lang: rule.lang,
461
- rel,
462
- symbol,
463
- chunkType,
464
- codevaultMetadata,
465
- importantVariables,
466
- docComments,
467
- contextInfo,
468
- symbolData
469
- });
470
- staleChunkIds.delete(chunkId);
471
- chunkMerkleHashes.push(chunkMerkleHash);
472
- processedChunks++;
473
- if (onProgress) {
474
- onProgress({ type: 'chunk_processed', file: rel, symbol, chunkId });
475
- }
476
- }
477
- for (const nodeGroup of nodeGroups) {
478
- await processNodeGroup(nodeGroup);
479
- }
480
- if (staleChunkIds.size > 0) {
481
- await deleteChunks(Array.from(staleChunkIds), existingChunks);
482
- indexMutated = true;
483
- }
484
- if (fileHash) {
485
- updatedMerkle[rel] = {
486
- shaFile: fileHash,
487
- chunkShas: chunkMerkleHashes
488
- };
489
- merkleDirty = true;
490
- }
491
- }
492
- catch (error) {
493
- errors.push({ type: 'processing_error', file: rel, error: error.message });
494
- try {
495
- const abs = path.join(repo, rel);
496
- if (!fs.existsSync(abs)) {
497
- continue;
498
- }
499
- const source = fs.readFileSync(abs, 'utf8');
500
- const fallbackSymbol = path.basename(rel) || rel;
501
- const sha = crypto.createHash('sha1').update(source).digest('hex');
502
- const chunkId = `${rel}:fallback:${sha.substring(0, 8)}`;
503
- const chunkMerkleHash = await computeFastHash(source);
504
- const fallbackMetadata = { tags: [], intent: null, description: null };
505
- const contextInfo = {
506
- nodeType: 'file',
507
- startLine: 1,
508
- endLine: source.split('\n').length,
509
- codeLength: source.length,
510
- hasDocumentation: false,
511
- variableCount: 0
512
- };
513
- await embedAndStore({
514
- code: source,
515
- enhancedEmbeddingText: source,
516
- chunkId,
517
- sha,
518
- lang: rule.lang,
519
- rel,
520
- symbol: fallbackSymbol,
521
- chunkType: 'file',
522
- codevaultMetadata: fallbackMetadata,
523
- importantVariables: [],
524
- docComments: null,
525
- contextInfo,
526
- symbolData: {
527
- signature: `${fallbackSymbol}()`,
528
- parameters: [],
529
- returnType: null,
530
- calls: []
531
- }
532
- });
533
- processedChunks++;
534
- indexMutated = true;
535
- if (onProgress) {
536
- onProgress({ type: 'chunk_processed', file: rel, symbol: fallbackSymbol, chunkId });
537
- }
538
- staleChunkIds.delete(chunkId);
539
- if (staleChunkIds.size > 0) {
540
- await deleteChunks(Array.from(staleChunkIds), existingChunks);
541
- indexMutated = true;
542
- }
543
- chunkMerkleHashes.length = 0;
544
- chunkMerkleHashes.push(chunkMerkleHash);
545
- fileHash = chunkMerkleHash;
546
- updatedMerkle[rel] = {
547
- shaFile: chunkMerkleHash,
548
- chunkShas: [...chunkMerkleHashes]
549
- };
550
- merkleDirty = true;
551
- }
552
- catch (fallbackError) {
553
- errors.push({ type: 'fallback_error', file: rel, error: fallbackError.message });
554
- }
555
- }
556
- }
557
- for (const fileRel of deletedSet) {
558
- await removeFileArtifacts(fileRel);
559
- }
560
- if (!isPartialUpdate) {
561
- const existingFilesSet = new Set(files);
562
- for (const fileRel of Object.keys(merkle)) {
563
- if (!existingFilesSet.has(fileRel)) {
564
- await removeFileArtifacts(fileRel);
565
- }
566
- }
567
- }
568
- // Notify that we're starting finalization
569
- if (onProgress) {
570
- onProgress({ type: 'finalizing' });
571
- }
572
- // Process any remaining chunks in the batch
573
- await batchProcessor.flush();
574
- if (merkleDirty) {
575
- saveMerkle(repo, updatedMerkle);
576
- }
577
- db.close();
578
- attachSymbolGraphToCodemap(codemap);
579
- codemap = writeCodemap(codemapPath, codemap);
580
- const tokenStats = getTokenCountStats();
581
- if (!process.env.CODEVAULT_QUIET) {
582
- if (chunkingStats.totalNodes > 0) {
583
- console.log(`\n📈 Chunking Statistics:`);
584
- console.log(` Total AST nodes analyzed: ${chunkingStats.totalNodes}`);
585
- if (chunkingStats.fileGrouped && chunkingStats.functionsGrouped) {
586
- console.log(` 🎯 File-grouped chunks: ${chunkingStats.fileGrouped} (${chunkingStats.functionsGrouped} functions combined)`);
587
- }
588
- console.log(` Normal chunks (optimal size): ${chunkingStats.normalChunks || 0}`);
589
- console.log(` Subdivided (too large): ${chunkingStats.subdivided || 0}`);
590
- console.log(` Merged small chunks: ${chunkingStats.mergedSmall || 0}`);
591
- console.log(` Statement-level fallback: ${chunkingStats.statementFallback || 0}`);
592
- console.log(` Skipped (too small): ${chunkingStats.skippedSmall || 0}`);
593
- console.log(` Final chunk count: ${processedChunks}`);
594
- const reductionRatio = chunkingStats.totalNodes > 0
595
- ? ((1 - processedChunks / chunkingStats.totalNodes) * 100).toFixed(1)
596
- : 0;
597
- console.log(` Chunk reduction: ${reductionRatio}% fewer chunks vs naive approach`);
598
- console.log('');
599
- }
600
- if (modelProfile.useTokens && tokenStats.totalRequests > 0) {
601
- console.log(`⚔ Token Counting Performance:`);
602
- console.log(` Total size checks: ${tokenStats.totalRequests}`);
603
- console.log(` Character pre-filter: ${tokenStats.charFilterRate} (${tokenStats.charFilterSkips} skipped)`);
604
- console.log(` Cache hits: ${tokenStats.cacheHitRate} (${tokenStats.cacheHits} cached)`);
605
- console.log(` Actual tokenizations: ${tokenStats.actualTokenizations}`);
606
- console.log(` Batch operations: ${tokenStats.batchTokenizations}`);
607
- const efficiency = tokenStats.totalRequests > 0
608
- ? (((tokenStats.charFilterSkips + tokenStats.cacheHits) / tokenStats.totalRequests) * 100).toFixed(1)
609
- : 0;
610
- console.log(` Overall efficiency: ${efficiency}% avoided expensive tokenization`);
611
- console.log('');
612
- }
613
- }
614
- return {
615
- success: true,
616
- processedChunks,
617
- totalChunks: Object.keys(codemap).length,
618
- provider: embeddingProvider.getName(),
619
- errors,
620
- chunkingStats,
621
- tokenStats: modelProfile.useTokens ? tokenStats : undefined
622
- };
623
- }
624
- //# sourceMappingURL=indexer.js.map