codesummary 1.2.1 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +26 -213
- package/README.md +61 -395
- package/features.md +25 -386
- package/package.json +13 -17
- package/src/ai/errors.js +85 -0
- package/src/ai/featureFlags.js +8 -0
- package/src/ai/promptTemplates.js +337 -0
- package/src/ai/providerClient.js +81 -0
- package/src/ai/providers/ollama.js +92 -0
- package/src/ai/providers/openaiCompatible.js +96 -0
- package/src/analysis/repositorySignals.js +196 -0
- package/src/cli.js +819 -77
- package/src/configManager.js +21 -0
- package/src/graph/adapters/baseAdapter.js +24 -0
- package/src/graph/adapters/javascriptAdapter.js +53 -0
- package/src/graph/adapters/pythonAdapter.js +77 -0
- package/src/graph/graphEngine.js +151 -0
- package/src/graph/graphMetrics.js +79 -0
- package/src/graph/graphSchema.js +30 -0
- package/src/graph/universalExtractor.js +29 -0
- package/src/llmGenerator.js +723 -8
- package/src/pdfGenerator.js +1189 -275
- package/src/renderers/llmSummaryRenderer.js +14 -0
- package/src/renderers/pdfThemeRenderer.js +685 -0
- package/src/scanner.js +115 -8
- package/rag-schema.json +0 -114
- package/src/ragConfig.js +0 -369
- package/src/ragGenerator.js +0 -1740
package/src/ragGenerator.js
DELETED
@@ -1,1740 +0,0 @@
import fs from 'fs-extra';
import path from 'path';
import crypto from 'crypto';
import os from 'os';
import { createReadStream } from 'fs';
import ErrorHandler from './errorHandler.js';
import RagConfigManager from './ragConfig.js';
import { formatFileSize } from './utils.js';

/**
 * Professional RAG Generator for CodeSummary
 * Generates streaming JSON output optimized for vector database ingestion
 * Follows deterministic, AI-free approach with efficient memory usage
 */
export class RagGenerator {
  constructor() {
    // Global parameters
    this.maxTokensPerChunk = 1000;
    this.overlapTokens = 200;
    this.maxWorkers = Math.min(8, os.cpus().length);

    // Extension to language mapping (deterministic)
    this.extensionToLanguage = {
      '.js': 'JavaScript', '.jsx': 'JavaScript', '.ts': 'TypeScript', '.tsx': 'TypeScript',
      '.py': 'Python', '.java': 'Java', '.cs': 'C#', '.cpp': 'C++', '.c': 'C', '.h': 'C/C++',
      '.html': 'HTML', '.xml': 'XML', '.css': 'CSS', '.scss': 'SCSS',
      '.json': 'JSON', '.yaml': 'YAML', '.yml': 'YAML',
      '.md': 'Markdown', '.txt': 'Text',
      '.sh': 'Shell', '.bat': 'Batch'
    };

    // Initialize handlers
    this.handlers = this.initializeHandlers();

    // Statistics tracking
    this.stats = {
      filesProcessed: 0,
      chunksGenerated: 0,
      bytesWritten: 0,
      startTime: null,
      endTime: null
    };

    // Error collection
    this.errors = [];
  }

  /**
   * Main entry point - generates streaming RAG JSON
   * @param {object} filesByExtension - Files grouped by extension
   * @param {Array} selectedExtensions - Selected extensions to process
   * @param {string} outputPath - Output JSON file path
   * @param {string} projectName - Project name
   * @param {string} scanPath - Root scan path
   * @returns {object} Generation result
   */
  async generateRagOutput(filesByExtension, selectedExtensions, outputPath, projectName, scanPath) {
    this.stats.startTime = Date.now();

    try {
      console.log(`🚀 Starting RAG generation for ${projectName}`);

      // Load RAG configuration
      this.ragConfigManager = new RagConfigManager();
      const config = await this.ragConfigManager.loadConfig();
      this.updateConfigFromYAML(config);

      // Display configuration
      this.ragConfigManager.displayConfig();

      // Phase 1: Discovery and file preparation
      const discoveredFiles = await this.discoveryPhase(filesByExtension, selectedExtensions, scanPath);

      // Phase 2: Atomic JSON generation (thread-safe)
      const result = await this.generate(discoveredFiles, outputPath, projectName, scanPath);

      this.stats.endTime = Date.now();
      const duration = (this.stats.endTime - this.stats.startTime) / 1000;

      console.log(`✅ RAG generation completed in ${duration.toFixed(2)}s`);
      console.log(`📊 Stats: ${this.stats.filesProcessed} files, ${this.stats.chunksGenerated} chunks`);

      return {
        outputPath,
        totalFiles: this.stats.filesProcessed,
        totalChunks: this.stats.chunksGenerated,
        duration,
        success: true
      };

    } catch (error) {
      ErrorHandler.handleError(error, 'RAG Generation');
      throw error;
    }
  }

  /**
   * Update internal configuration from loaded YAML config
   * @param {object} yamlConfig - Configuration from YAML
   */
  updateConfigFromYAML(yamlConfig) {
    if (yamlConfig.chunking) {
      this.maxTokensPerChunk = yamlConfig.chunking.maxTokens || this.maxTokensPerChunk;
      this.overlapTokens = yamlConfig.chunking.overlap || this.overlapTokens;
    }

    if (yamlConfig.performance) {
      this.maxWorkers = Math.min(
        yamlConfig.performance.maxWorkers || this.maxWorkers,
        os.cpus().length
      );
    }

    // Store full config for handlers to use
    this.yamlConfig = yamlConfig;
  }

  /**
   * Phase 1: Discovery - BFS traversal and file metadata collection
   * @param {object} filesByExtension - Files by extension
   * @param {Array} selectedExtensions - Selected extensions
   * @param {string} scanPath - Root scan path
   * @returns {Array} Discovered files with metadata
   */
  async discoveryPhase(filesByExtension, selectedExtensions, scanPath) {
    console.log('🔍 Discovery phase: collecting file metadata...');

    const discoveredFiles = [];
    let processed = 0;
    const totalFiles = selectedExtensions.reduce((sum, ext) => sum + (filesByExtension[ext]?.length || 0), 0);

    // Process files concurrently but limit memory usage
    const batchSize = 50;

    for (const extension of selectedExtensions) {
      const files = filesByExtension[extension] || [];

      for (let i = 0; i < files.length; i += batchSize) {
        const batch = files.slice(i, i + batchSize);
        const batchResults = await Promise.all(
          batch.map(fileInfo => this.enrichFileMetadata(fileInfo, extension, scanPath))
        );

        discoveredFiles.push(...batchResults.filter(Boolean));
        processed += batch.length;

        // Progress reporting with validation
        const progress = (processed / totalFiles * 100).toFixed(1);
        const validFiles = discoveredFiles.length;
        const skippedFiles = processed - validFiles;
        process.stdout.write(`\r📊 Discovery: ${progress}% (${validFiles} valid, ${skippedFiles} skipped)`);

        // Internal validation
        if (processed % 50 === 0) {
          this.validateDiscoveryProgress(discoveredFiles, processed);
        }
      }
    }

    console.log(`\n✅ Discovery completed: ${discoveredFiles.length} files enriched`);
    return discoveredFiles;
  }

  /**
   * Enrich file with metadata including hash, tags, and analysis
   * @param {object} fileInfo - Basic file info from scanner
   * @param {string} extension - File extension
   * @param {string} scanPath - Root scan path
   * @returns {object} Enriched file metadata
   */
  async enrichFileMetadata(fileInfo, extension, scanPath) {
    try {
      // Calculate SHA-256 hash in streaming mode
      const hash = await this.calculateFileHash(fileInfo.absolutePath);

      // Determine language and tags
      const language = this.extensionToLanguage[extension] || 'Unknown';
      const tags = this.extractFileTags(fileInfo.relativePath, extension);

      // Basic file stats
      const stats = await fs.stat(fileInfo.absolutePath);

      return {
        id: hash.substring(0, 16), // Use first 16 chars of hash as unique ID
        path: fileInfo.relativePath,
        absolutePath: fileInfo.absolutePath,
        extension,
        language,
        size: stats.size,
        hash: `sha256-${hash}`,
        modified: stats.mtime.toISOString(),
        tags,
        // Will be populated during chunking
        chunks: null,
        // Metadata for processing
        _stats: stats
      };

    } catch (error) {
      console.warn(`⚠️ Could not process file ${fileInfo.relativePath}: ${error.message}`);
      return null;
    }
  }

  /**
   * Calculate SHA-256 hash of file in streaming mode
   * @param {string} filePath - File path
   * @returns {string} SHA-256 hash (hex)
   */
  async calculateFileHash(filePath) {
    return new Promise((resolve, reject) => {
      const hash = crypto.createHash('sha256');
      const stream = createReadStream(filePath);

      stream.on('data', data => hash.update(data));
      stream.on('end', () => resolve(hash.digest('hex')));
      stream.on('error', reject);
    });
  }

  /**
   * Extract file tags based on path heuristics
   * @param {string} relativePath - Relative file path
   * @param {string} extension - File extension
   * @returns {Array} Array of tags
   */
  extractFileTags(relativePath, extension) {
    const tags = [];
    const pathLower = relativePath.toLowerCase();
    const fileName = path.basename(relativePath, extension).toLowerCase();
    const fullPath = relativePath.toLowerCase();

    // Path-based tags (enhanced)
    if (pathLower.includes('/test/') || pathLower.includes('\\test\\')) tags.push('test');
    if (pathLower.includes('/spec/') || pathLower.includes('\\spec\\')) tags.push('test');
    if (pathLower.includes('/__tests__/') || pathLower.includes('\\__tests__\\')) tags.push('test');
    if (pathLower.includes('/scripts/') || pathLower.includes('\\scripts\\')) tags.push('script');
    if (pathLower.includes('/config/') || pathLower.includes('\\config\\')) tags.push('config');
    if (pathLower.includes('/lib/') || pathLower.includes('\\lib\\')) tags.push('library');
    if (pathLower.includes('/utils/') || pathLower.includes('\\utils\\')) tags.push('utility');
    if (pathLower.includes('/helpers/') || pathLower.includes('\\helpers\\')) tags.push('utility');

    // Framework-specific tags
    if (pathLower.includes('/pages/') || pathLower.includes('\\pages\\')) tags.push('page');
    if (pathLower.includes('/components/') || pathLower.includes('\\components\\')) tags.push('component');
    if (pathLower.includes('/shared/') || pathLower.includes('\\shared\\')) tags.push('shared');
    if (pathLower.includes('/common/') || pathLower.includes('\\common\\')) tags.push('shared');
    if (pathLower.includes('/hooks/') || pathLower.includes('\\hooks\\')) tags.push('hook');
    if (pathLower.includes('/services/') || pathLower.includes('\\services\\')) tags.push('service');
    if (pathLower.includes('/api/') || pathLower.includes('\\api\\')) tags.push('api');
    if (pathLower.includes('/routes/') || pathLower.includes('\\routes\\')) tags.push('route');
    if (pathLower.includes('/controllers/') || pathLower.includes('\\controllers\\')) tags.push('controller');
    if (pathLower.includes('/models/') || pathLower.includes('\\models\\')) tags.push('model');
    if (pathLower.includes('/views/') || pathLower.includes('\\views\\')) tags.push('view');
    if (pathLower.includes('/layouts/') || pathLower.includes('\\layouts\\')) tags.push('layout');
    if (pathLower.includes('/middleware/') || pathLower.includes('\\middleware\\')) tags.push('middleware');

    // Build and tooling
    if (pathLower.includes('/build/') || pathLower.includes('\\build\\')) tags.push('build');
    if (pathLower.includes('/dist/') || pathLower.includes('\\dist\\')) tags.push('build');
    if (pathLower.includes('/.github/') || pathLower.includes('\\.github\\')) tags.push('ci');
    if (pathLower.includes('/workflows/') || pathLower.includes('\\workflows\\')) tags.push('ci');

    // Filename-based tags (enhanced)
    if (fileName.includes('config')) tags.push('config');
    if (fileName.includes('test') || fileName.includes('spec')) tags.push('test');
    if (fileName.includes('index')) tags.push('entry');
    if (fileName.includes('main')) tags.push('entry');
    if (fileName.includes('app')) tags.push('application');
    if (fileName.includes('component')) tags.push('component');
    if (fileName.includes('page')) tags.push('page');
    if (fileName.includes('layout')) tags.push('layout');
    if (fileName.includes('service')) tags.push('service');
    if (fileName.includes('util') || fileName.includes('helper')) tags.push('utility');
    if (fileName.includes('hook')) tags.push('hook');
    if (fileName.includes('api')) tags.push('api');
    if (fileName.includes('route')) tags.push('route');
    if (fileName.includes('model')) tags.push('model');
    if (fileName.includes('controller')) tags.push('controller');
    if (fileName.includes('middleware')) tags.push('middleware');
    if (fileName.includes('store') || fileName.includes('state')) tags.push('state');
    if (fileName.includes('context')) tags.push('context');
    if (fileName.includes('provider')) tags.push('provider');

    // Extension-based tags (enhanced)
    if (['.test.js', '.spec.js', '.test.ts', '.spec.ts', '.test.tsx', '.spec.tsx'].some(ext => fullPath.endsWith(ext))) {
      tags.push('test');
    }
    if (['.d.ts'].some(ext => fullPath.endsWith(ext))) {
      tags.push('types');
    }
    if (['.stories.js', '.stories.ts', '.stories.tsx'].some(ext => fullPath.endsWith(ext))) {
      tags.push('storybook');
    }
    if (['.cy.js', '.cy.ts'].some(ext => fullPath.endsWith(ext))) {
      tags.push('e2e');
    }

    // Framework detection
    if (extension === '.tsx' || extension === '.jsx') {
      tags.push('react');
    }
    if (fullPath.includes('vue') || extension === '.vue') {
      tags.push('vue');
    }
    if (fullPath.includes('angular') || fullPath.includes('.component.') || fullPath.includes('.service.')) {
      tags.push('angular');
    }
    if (fullPath.includes('next') || fullPath.includes('_app.') || fullPath.includes('_document.')) {
      tags.push('nextjs');
    }

    // Special files
    if (['readme', 'license', 'changelog', 'contributing'].includes(fileName)) {
      tags.push('documentation');
    }
    if (['dockerfile', 'docker-compose', '.dockerignore'].includes(fileName)) {
      tags.push('docker');
    }
    if (['package.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml'].includes(path.basename(relativePath))) {
      tags.push('package');
    }
    if (['tsconfig.json', 'jsconfig.json', 'webpack.config.js', 'vite.config.js'].includes(path.basename(relativePath))) {
      tags.push('config');
    }

    // Infrastructure files
    if (extension === '.bat' || extension === '.cmd') {
      tags.push('infrastructure', 'script', 'windows');
    }
    if (extension === '.sh') {
      tags.push('infrastructure', 'script', 'unix');
    }
    if (extension === '.json' && (fileName.includes('config') || fileName.includes('settings') || fileName.includes('.config.'))) {
      tags.push('infrastructure', 'config');
    }
    if (['makefile', 'makefile.am', 'cmake', 'cmakelists.txt'].includes(fileName)) {
      tags.push('infrastructure', 'build');
    }
    if (['readme', 'license', 'changelog', 'contributing'].includes(fileName)) {
      tags.push('documentation');
    }

    return [...new Set(tags)]; // Remove duplicates
  }

  /**
   * Initialize specialized handlers for different file types
   * @returns {object} Handler registry
   */
  initializeHandlers() {
    return {
      'code-c-like': new CLikeHandler(),
      'code-script': new ScriptHandler(),
      'markup': new MarkupHandler(),
      'styling': new StylingHandler(),
      'config-plain': new ConfigPlainHandler()
    };
  }

  /**
   * Get appropriate handler for file extension with full coverage
   * @param {string} extension - File extension
   * @returns {object} Handler instance
   */
  getHandler(extension) {
    // Complete mapping for all 22 target extensions
    const handlerMap = {
      // Code-C like (5 extensions)
      '.c': 'code-c-like', '.h': 'code-c-like', '.cpp': 'code-c-like',
      '.cs': 'code-c-like', '.java': 'code-c-like',

      // Code-Script (7 extensions)
      '.js': 'code-script', '.jsx': 'code-script', '.ts': 'code-script',
      '.tsx': 'code-script', '.py': 'code-script', '.sh': 'code-script', '.bat': 'code-script',

      // Markup (2 extensions)
      '.html': 'markup', '.xml': 'markup',

      // Styling (2 extensions)
      '.css': 'styling', '.scss': 'styling',

      // Config/Plain (6 extensions)
      '.json': 'config-plain', '.yaml': 'config-plain', '.yml': 'config-plain',
      '.md': 'config-plain', '.txt': 'config-plain'
    };

    const handlerType = handlerMap[extension];

    if (!handlerType) {
      console.warn(`⚠️ No handler found for extension: ${extension}`);
      return this.handlers['config-plain']; // Fallback
    }

    return this.handlers[handlerType];
  }

  /**
   * Verify extension coverage against target list
   * @param {Array} processedExtensions - Extensions found in processing
   */
  verifyExtensionCoverage(processedExtensions) {
    const targetExtensions = [
      '.json', '.ts', '.js', '.jsx', '.tsx', '.xml', '.html', '.css', '.scss',
      '.md', '.txt', '.py', '.java', '.cs', '.cpp', '.c', '.h', '.yaml', '.yml',
      '.sh', '.bat'
    ]; // 22 total extensions

    const missing = targetExtensions.filter(ext => !processedExtensions.includes(ext));
    const extra = processedExtensions.filter(ext => !targetExtensions.includes(ext));

    console.log(`\n📊 Extension Coverage Analysis:`);
    console.log(` Target extensions: ${targetExtensions.length}`);
    console.log(` Processed extensions: ${processedExtensions.length}`);

    if (missing.length > 0) {
      console.warn(` ⚠️ Missing: ${missing.join(', ')}`);
    }

    if (extra.length > 0) {
      console.log(` ➕ Extra: ${extra.join(', ')}`);
    }

    if (missing.length === 0) {
      console.log(` ✅ Full coverage achieved!`);
    }

    return {
      targetCount: targetExtensions.length,
      processedCount: processedExtensions.length,
      missing,
      extra,
      coverage: ((targetExtensions.length - missing.length) / targetExtensions.length * 100).toFixed(1)
    };
  }

  /**
   * Improved token estimation using multiple heuristics
   * @param {string} content - Text content
   * @param {string} language - Programming language for context
   * @returns {number} Estimated token count
   */
  safeEstimateTokens(content, language = 'text') {
    try {
      if (typeof content !== 'string') {
        console.warn('⚠️ Non-string content passed to token estimator');
        return 0;
      }

      if (content.length === 0) return 0;

      // Base estimation using multiple factors
      const charCount = content.length;
      const wordCount = content.trim().split(/\s+/).length;
      const lineCount = content.split('\n').length;

      // Language-specific adjustments
      let tokensPerChar = 0.25; // Default: ~4 chars per token
      let tokensPerWord = 1.3; // Default: ~1.3 tokens per word

      // Adjust based on content type
      if (['javascript', 'typescript', 'python', 'java', 'c++', 'c#'].includes(language.toLowerCase())) {
        // Code tends to have more symbols and operators
        tokensPerChar = 0.28;
        tokensPerWord = 1.4;

        // Additional tokens for common code patterns
        const brackets = (content.match(/[{}()\[\]]/g) || []).length;
        const operators = (content.match(/[+\-*/%=<>!&|^~]/g) || []).length;
        const dots = (content.match(/\./g) || []).length;

        const syntaxTokens = Math.ceil((brackets + operators + dots) * 0.15);

        // Character-based estimation with syntax bonus
        const charEstimate = Math.ceil(charCount * tokensPerChar) + syntaxTokens;
        const wordEstimate = Math.ceil(wordCount * tokensPerWord);

        return Math.max(charEstimate, wordEstimate);
      } else if (['json', 'yaml', 'xml', 'html'].includes(language.toLowerCase())) {
        // Structured data tends to be more compact in tokens
        tokensPerChar = 0.22;
        tokensPerWord = 1.1;
      } else if (language.toLowerCase() === 'markdown') {
        // Markdown has formatting symbols but is mostly text
        tokensPerChar = 0.26;
        tokensPerWord = 1.2;
      }

      // Calculate estimates using both methods
      const charEstimate = Math.ceil(charCount * tokensPerChar);
      const wordEstimate = Math.ceil(wordCount * tokensPerWord);

      // Return the higher estimate for safety (avoid truncation)
      return Math.max(charEstimate, wordEstimate, Math.ceil(charCount / 4));

    } catch (error) {
      console.warn(`⚠️ Token estimation error: ${error.message}`);
      return Math.ceil((content?.length || 0) / 4);
    }
  }

  /**
   * Estimate token count using simple heuristic
   * @param {string} content - Text content
   * @returns {number} Estimated token count
   */
  estimateTokens(content) {
    return Math.ceil(content.length / 4);
  }

  /**
   * Extract imports from content using simple regex
   * @param {string} content - File content
   * @param {string} extension - File extension
   * @returns {Array} Array of import statements
   */
  extractImports(content, extension) {
    const imports = [];

    switch (extension) {
      case '.js':
      case '.jsx':
      case '.ts':
      case '.tsx':
        // import ... from '...'
        const importRegex = /import\s+.*?from\s+['"]([^'"]+)['"]/g;
        let match;
        while ((match = importRegex.exec(content)) !== null) {
          imports.push(match[1]);
        }

        // require('...')
        const requireRegex = /require\s*\(\s*['"]([^'"]+)['"]\s*\)/g;
        while ((match = requireRegex.exec(content)) !== null) {
          imports.push(match[1]);
        }
        break;

      case '.py':
        // import ... / from ... import ...
        const pyImportRegex = /(?:from\s+(\S+)\s+import|import\s+(\S+))/g;
        while ((match = pyImportRegex.exec(content)) !== null) {
          imports.push(match[1] || match[2]);
        }
        break;

      case '.c':
      case '.cpp':
      case '.h':
        // #include "..." / #include <...>
        const includeRegex = /#include\s*[<"]([^>"]+)[>"]/g;
        while ((match = includeRegex.exec(content)) !== null) {
          imports.push(match[1]);
        }
        break;
    }

    return [...new Set(imports)]; // Remove duplicates
  }

  /**
   * Extract function/method calls using simple regex
   * @param {string} content - File content
   * @param {string} extension - File extension
   * @returns {Array} Array of function calls
   */
  extractCalls(content, extension) {
    const calls = [];

    // Generic function call pattern: identifier followed by (
    const callRegex = /\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(/g;
    let match;

    while ((match = callRegex.exec(content)) !== null) {
      const funcName = match[1];

      // Filter out language keywords
      const keywords = ['if', 'for', 'while', 'switch', 'catch', 'typeof', 'return', 'new'];
      if (!keywords.includes(funcName) && funcName.length > 1) {
        calls.push(funcName);
      }
    }

    // Return unique calls, limited to prevent noise
    return [...new Set(calls)].slice(0, 20);
  }

  /**
   * Validate generated JSON file
   * @param {string} outputPath - Path to generated JSON
   */
  async validateGeneratedJSON(outputPath) {
    try {
      // Check file exists and is readable
      const stats = await fs.stat(outputPath);

      // Check file size warnings
      const maxSize = this.ragConfigManager.parseFileSize(this.yamlConfig?.quality?.maxOutputSize || '250MB');
      if (stats.size > maxSize) {
        console.warn(`⚠️ Generated file is large: ${formatFileSize(stats.size)} (>${formatFileSize(maxSize)})`);
      }

      // Simple validation: read first 1KB to check JSON structure
      const stream = createReadStream(outputPath, { encoding: 'utf8', start: 0, end: 1023 });
      let sampleText = '';

      for await (const chunk of stream) {
        sampleText += chunk;
      }

      // Check for basic JSON structure
      if (!sampleText.trim().startsWith('{')) {
        throw new Error('Generated file does not start with valid JSON');
      }

      // Check for expected structure
      if (!sampleText.includes('"metadata"') || !sampleText.includes('"files"')) {
        console.warn('⚠️ JSON structure may be incomplete - expected sections not found in sample');
      }

      console.log('✅ JSON validation passed');

    } catch (error) {
      console.error(`❌ JSON validation failed: ${error.message}`);
      // Don't re-throw - just warn since file was successfully written
      console.warn('⚠️ Continuing despite validation warning - file was generated successfully');
    }
  }

  /**
   * Validate discovery progress for quality assurance
   * @param {Array} discoveredFiles - Files discovered so far
   * @param {number} processedCount - Total files processed
   */
  validateDiscoveryProgress(discoveredFiles, processedCount) {
    const issues = [];

    // Check for duplicate hashes
    const hashes = new Set();
    const duplicates = [];

    for (const file of discoveredFiles) {
      if (hashes.has(file.hash)) {
        duplicates.push(file.hash.substring(0, 8));
      } else {
        hashes.add(file.hash);
      }
    }

    if (duplicates.length > 0) {
      issues.push(`Duplicate hashes detected: ${duplicates.join(', ')}`);
    }

    // Check file size distribution
    const largeSizeThreshold = this.ragConfigManager.parseFileSize(this.yamlConfig?.performance?.maxFileSize || '100MB');
    const largeFiles = discoveredFiles.filter(f => f.size > largeSizeThreshold);

    if (largeFiles.length > 0) {
      issues.push(`${largeFiles.length} files exceed size threshold`);
    }

    // Check tag distribution
    const tagCounts = {};
    discoveredFiles.forEach(file => {
      file.tags.forEach(tag => {
        tagCounts[tag] = (tagCounts[tag] || 0) + 1;
      });
    });

    const untaggedFiles = discoveredFiles.filter(f => f.tags.length === 0);
    if (untaggedFiles.length > discoveredFiles.length * 0.5) {
      issues.push(`High untagged ratio: ${untaggedFiles.length}/${discoveredFiles.length}`);
    }

    // Report issues if any
    if (issues.length > 0) {
      console.warn(`\n⚠️ Discovery validation issues: ${issues.join(', ')}`);
    }
  }

  /**
   * Validate processing progress for quality assurance
   * @param {Map} chunkOffsets - Current chunk offsets
   */
  validateProcessingProgress(chunkOffsets) {
    const issues = [];

    // Check chunk size distribution
    const chunkSizes = [];
    for (const [chunkId, offsetData] of chunkOffsets.entries()) {
      const size = offsetData.contentEnd - offsetData.contentStart;
      chunkSizes.push(size);
    }

    if (chunkSizes.length > 0) {
      const avgChunkSize = chunkSizes.reduce((a, b) => a + b, 0) / chunkSizes.length;
      const maxChunkSize = Math.max(...chunkSizes);
      const maxChunkThreshold = this.ragConfigManager.parseFileSize(this.yamlConfig?.quality?.maxChunkSize || '50KB');

      if (maxChunkSize > maxChunkThreshold) {
        issues.push(`Large chunk detected: ${formatFileSize(maxChunkSize)}`);
      }

      if (avgChunkSize < 100) {
        issues.push(`Small average chunk size: ${formatFileSize(avgChunkSize)}`);
      }
    }

    // Check offset consistency
    let invalidOffsets = 0;
    for (const [chunkId, offsetData] of chunkOffsets.entries()) {
      if (offsetData.contentStart >= offsetData.contentEnd) {
        invalidOffsets++;
      }
    }

    if (invalidOffsets > 0) {
      issues.push(`Invalid offsets: ${invalidOffsets} chunks`);
    }

    // Report issues if any
    if (issues.length > 0) {
      console.warn(`\n⚠️ Processing validation issues: ${issues.join(', ')}`);
    }
  }

  /**
   * Final validation of generated output with seek inverse testing
   * @param {string} outputPath - Generated file path
   * @param {Array} discoveredFiles - All processed files
   * @param {Map} chunkOffsets - All chunk offsets
   */
  async validateFinalOutput(outputPath, discoveredFiles, chunkOffsets) {
    const issues = [];

    try {
      const stats = await fs.stat(outputPath);

      // Check file size
      const maxOutputSize = this.ragConfigManager.parseFileSize(this.yamlConfig?.quality?.maxOutputSize || '250MB');
      if (stats.size > maxOutputSize) {
        issues.push(`Output size (${formatFileSize(stats.size)}) exceeds threshold`);
      }

      // Check completeness
      const expectedChunks = discoveredFiles.reduce((sum, file) => sum + (file.chunks?.length || 0), 0);
      const actualChunks = chunkOffsets.size;

      if (expectedChunks !== actualChunks) {
        issues.push(`Chunk count mismatch: expected ${expectedChunks}, got ${actualChunks}`);
      }

      // Check for empty chunks
      const emptyChunks = Array.from(chunkOffsets.values()).filter(offset =>
        offset.contentEnd - offset.contentStart < 10
      ).length;

      if (emptyChunks > 0) {
        issues.push(`${emptyChunks} near-empty chunks detected`);
      }

      // SEEK INVERSE TESTING - Test random chunk offsets
      await this.validateSeekInverse(outputPath, chunkOffsets);

      // JSON Schema validation (basic)
      await this.validateJsonStructure(outputPath);

      // Report final validation
      if (issues.length > 0) {
        console.warn(`\n⚠️ Final validation issues:`);
        issues.forEach(issue => console.warn(` • ${issue}`));
      } else {
        console.log('✅ Final validation passed - output is healthy');
      }

    } catch (error) {
      console.error(`❌ Final validation failed: ${error.message}`);
    }
  }

  /**
   * Test seek operations on random chunk offsets to verify accuracy
   * @param {string} outputPath - Generated JSON file path
   * @param {Map} chunkOffsets - Chunk offset map
   */
  async validateSeekInverse(outputPath, chunkOffsets) {
    const chunkIds = Array.from(chunkOffsets.keys());
    const testCount = Math.min(3, chunkIds.length); // Test 2-3 random chunks

    if (testCount === 0) {
      console.warn('⚠️ No chunks to test for seek validation');
      return;
    }

    console.log(`🔍 Testing seek inverse on ${testCount} random chunks...`);

    for (let i = 0; i < testCount; i++) {
      const randomIndex = Math.floor(Math.random() * chunkIds.length);
      const chunkId = chunkIds[randomIndex];
      const offsetData = chunkOffsets.get(chunkId);

      try {
        // Read the specific chunk content using simple file read
        const fullContent = await fs.readFile(outputPath, 'utf8');
        const seekContent = fullContent.slice(offsetData.contentStart, offsetData.contentEnd);

        // Verify it's valid JSON content (should be a JSON string value)
        try {
          // Try to parse as JSON - if it's valid JSON string content, this should work
          const parsed = JSON.parse(seekContent);
          if (typeof parsed === 'string') {
            console.log(` ✅ Chunk ${chunkId}: seek successful, valid JSON string (${seekContent.length} bytes)`);
          } else {
            console.log(` ✅ Chunk ${chunkId}: seek successful, valid JSON (${typeof parsed}, ${seekContent.length} bytes)`);
          }
        } catch (parseError) {
          // If it doesn't parse as JSON, it might be a partial chunk
          console.log(` ✅ Chunk ${chunkId}: seek successful, partial content (${seekContent.length} bytes)`);
        }

      } catch (error) {
        console.error(` ❌ Chunk ${chunkId}: seek failed - ${error.message}`);
      }
    }
  }

  /**
   * Basic JSON structure validation
   * @param {string} outputPath - Generated JSON file path
   */
  async validateJsonStructure(outputPath) {
    try {
      // Read full content for validation (simpler approach)
      const fullContent = await fs.readFile(outputPath, 'utf8');
      const startText = fullContent.slice(0, 1024).trim();
      const endText = fullContent.slice(-1024).trim();

      // Basic structure checks
      const issues = [];

      if (!startText.startsWith('{')) {
        issues.push('File does not start with {');
      }

      if (!endText.endsWith('}')) {
        issues.push('File does not end with }');
      }

      if (!startText.includes('"metadata"')) {
        issues.push('Missing metadata section');
      }

      if (!startText.includes('"files"')) {
        issues.push('Missing files section');
      }

      if (!fullContent.includes('"index"')) {
        issues.push('Missing index section');
      }

      if (issues.length === 0) {
        console.log('✅ JSON structure validation passed');
      } else {
        console.warn(`⚠️ JSON structure issues: ${issues.join(', ')}`);
      }

    } catch (error) {
      console.error(`❌ JSON structure validation failed: ${error.message}`);
    }
  }

  /**
   * Generate RAG output atomically - build complete structure in memory (thread-safe)
   */
  async generate(discoveredFiles, outputPath, projectName, scanPath) {
    console.log('📝 Atomic generation: processing all files in memory...');

    await fs.ensureDir(path.dirname(outputPath));

    const processedFiles = [];
    let totalChunks = 0;

    for (let i = 0; i < discoveredFiles.length; i++) {
      const fileData = discoveredFiles[i];
      const progress = ((i + 1) / discoveredFiles.length * 100).toFixed(1);

      process.stdout.write(`\r📊 Processing: ${progress}% (${i + 1}/${discoveredFiles.length})`);

      try {
        const processedFile = await this.processFileInMemory(fileData);
        processedFiles.push(processedFile);
        totalChunks += processedFile.chunks?.length || 0;
        this.stats.filesProcessed++;
        this.stats.chunksGenerated += processedFile.chunks?.length || 0;
      } catch (error) {
        console.warn(`\n⚠️ Error processing ${fileData.path}: ${error.message}`);
        this.errors.push({ file: fileData.path, error: error.message });
        processedFiles.push({ ...fileData, chunks: [], error: error.message });
      }
    }

    console.log(`\n✅ All files processed: ${processedFiles.length} files, ${totalChunks} chunks`);

    const completeJSON = this.buildCompleteJSON(processedFiles, projectName, scanPath);
    const finalJSON = this.calculateAndInjectOffsets(completeJSON);

    await fs.writeFile(outputPath, finalJSON, 'utf8');
    this.stats.bytesWritten = finalJSON.length;

    if (this.yamlConfig?.output?.validation) {
      console.log('🔍 Validating generated output...');
      await this.validateGeneratedJSON(outputPath);
    }

    console.log(`✅ JSON written successfully to ${outputPath}`);

    return {
      outputPath,
      totalFiles: processedFiles.length,
      totalChunks,
      bytesWritten: finalJSON.length,
      extensionCoverage: this.verifyExtensionCoverage([...new Set(processedFiles.map(f => f.extension))])
    };
  }

  async processFileInMemory(fileData) {
    const content = await fs.readFile(fileData.absolutePath, 'utf8');
    const handler = this.getHandler(fileData.extension);

    console.log(`🔍 ${handler.constructor.name} processing ${fileData.extension} file: ${fileData.path}`);

    const chunks = await handler.generateChunks(content, {
      fileId: fileData.id,
      filePath: fileData.path,
      extension: fileData.extension,
      language: fileData.language,
      maxTokens: this.maxTokensPerChunk,
      overlap: this.overlapTokens
    });

    if (chunks.length > 0) {
      console.log(` 📝 Found ${chunks.length} semantic chunks`);
    }

    const enrichedChunks = chunks.map((chunk, index) => ({
      ...chunk,
      id: `chunk_${fileData.id}_${index}`,
      tokenEstimate: this.safeEstimateTokens(chunk.content, fileData.language),
      imports: this.extractImports(chunk.content, fileData.extension),
      calls: this.extractCalls(chunk.content, fileData.extension)
    }));

    return {
      id: fileData.id,
      path: fileData.path,
      language: fileData.language,
      extension: fileData.extension,
      size: fileData.size,
      lines: content.split('\n').length,
      hash: fileData.hash,
      modified: fileData.modified,
      tags: fileData.tags,
      chunks: enrichedChunks
    };
  }

  buildCompleteJSON(processedFiles, projectName, scanPath) {
    const totalChunks = processedFiles.reduce((sum, file) => sum + (file.chunks?.length || 0), 0);
    const emptyFiles = processedFiles.filter(f => (f.chunks?.length || 0) === 0).length;

    this.stats.endTime = Date.now();
    const processingTimeMs = Math.max(1, this.stats.endTime - this.stats.startTime);

    return {
      metadata: {
        projectName,
        generatedAt: new Date().toISOString(),
        scanPath,
        generator: 'CodeSummary RAG Generator',
        version: '3.1.0',
        config: {
          maxTokensPerChunk: this.maxTokensPerChunk,
          overlapTokens: this.overlapTokens,
          tokenEstimationMethod: 'enhanced_heuristic_v1.0'
        },
        summary: {
          totalFiles: processedFiles.length,
          languages: [...new Set(processedFiles.map(f => f.language))],
          extensions: [...new Set(processedFiles.map(f => f.extension))]
        },
        schemaVersion: "1.0",
        schemaUrl: "https://github.com/skamoll/CodeSummary/schemas/rag-output.json"
      },
      files: processedFiles,
      index: {
        version: "3.1.0",
        generatedAt: new Date().toISOString(),
        schemaUrl: "https://github.com/skamoll/CodeSummary/schemas/rag-output.json",
        summary: {
          fileCount: processedFiles.length - emptyFiles,
          chunkCount: totalChunks,
          totalBytes: 0,
          languages: [...new Set(processedFiles.map(f => f.language))],
          extensions: [...new Set(processedFiles.map(f => f.extension))],
          avgFileSize: 0,
          avgChunksPerFile: processedFiles.length > 0 ? Math.round(totalChunks / processedFiles.length) : 0
        },
        chunkOffsets: {},
        fileOffsets: {},
        seekInfo: {
          instructions: "Use chunkOffsets[chunkId].contentStart and contentEnd to seek directly to chunk content",
          format: "All offsets are absolute byte positions in this JSON file",
          chunkFormat: "Object with jsonStart, jsonEnd, contentStart, contentEnd (absolute JSON positions)",
          fileFormat: "Array [start, end] for each file in JSON"
        },
        statistics: {
          processingTimeMs,
          bytesPerSecond: 0,
          bytesWritten: 0,
          chunksWithValidOffsets: totalChunks,
          filesWithValidOffsets: processedFiles.length - emptyFiles,
          totalFiles: processedFiles.length,
          emptyFiles: emptyFiles,
          totalChunksGenerated: totalChunks,
          errors: this.errors
        }
      }
    };
  }

  /**
   * Finalizes the JSON structure by calculating and re-injecting the correct offsets.
   * This approach guarantees maximum precision by operating on the final JSON string.
   * @param {object} jsonStructure - The complete JSON object with data but without offsets.
   * @returns {string} The final JSON string, formatted and with precise offsets.
   */
  calculateAndInjectOffsets(jsonStructure) {
    console.log('🔍 Calculating precise byte offsets and building complete index...');

    // STEP 1: Build a preliminary JSON without the index to measure exact positions
    const jsonWithoutIndex = {
      metadata: jsonStructure.metadata,
      files: jsonStructure.files
    };

    const preliminaryJsonString = JSON.stringify(jsonWithoutIndex, null, 2);
    const preliminaryBytes = Buffer.byteLength(preliminaryJsonString, 'utf8');

    // STEP 2: Calculate precise offsets for files and chunks
    const fileOffsets = {};
    const chunkOffsets = {};
    let totalChunks = 0;
    let validChunks = 0;

    for (const file of jsonStructure.files) {
      // Find the start of the file object by its ID
      const filePattern = `"id": "${file.id}"`;
      const fileStartPos = preliminaryJsonString.indexOf(filePattern);

      if (fileStartPos !== -1) {
        // Find the approximate end of the file object
        const nextFilePattern = preliminaryJsonString.indexOf(' {\n "id":', fileStartPos + 1);
        const fileEndPos = nextFilePattern !== -1 ? nextFilePattern : preliminaryJsonString.lastIndexOf(' ]');

        // Schema format: fileId -> [start, end]
        fileOffsets[file.id] = [fileStartPos, fileEndPos];

        // Calculate chunk offsets within this file
        for (const chunk of file.chunks) {
          const chunkPattern = `"id": "${chunk.id}"`;
          const chunkStartPos = preliminaryJsonString.indexOf(chunkPattern, fileStartPos);

          if (chunkStartPos !== -1) {
            // Find the "content" field within this chunk
            const contentPattern = '"content": "';
            const contentStartSearch = preliminaryJsonString.indexOf(contentPattern, chunkStartPos);

            if (contentStartSearch !== -1) {
              const contentStart = contentStartSearch + contentPattern.length;

              // Find the end of the content (the closing quote of the JSON string)
              let contentEnd = contentStart;
              let inEscape = false;

              for (let i = contentStart; i < preliminaryJsonString.length; i++) {
                const char = preliminaryJsonString[i];
                if (inEscape) {
                  inEscape = false;
                  continue;
                }
                if (char === '\\') {
                  inEscape = true;
                  continue;
                }
                if (char === '"') {
                  contentEnd = i;
                  break;
                }
              }

              // Find the end of the complete chunk object
              const chunkEndPattern = '},';
              const chunkEndSearch = preliminaryJsonString.indexOf(chunkEndPattern, contentEnd);
              const chunkEnd = chunkEndSearch !== -1 ? chunkEndSearch + 1 : contentEnd + 100;

              // Schema format: chunkId -> object with precise offsets
              chunkOffsets[chunk.id] = {
                jsonStart: chunkStartPos,
                jsonEnd: chunkEnd,
                contentStart: contentStart,
                contentEnd: contentEnd,
                filePath: file.path
              };

              validChunks++;
            }
          }
          totalChunks++;
        }
      }
    }

    // STEP 3: Build complete statistics
    const processingTimeMs = Math.max(1, this.stats.endTime - this.stats.startTime);
    const emptyFiles = jsonStructure.files.filter(f => f.chunks.length === 0).length;

    // STEP 4: Build the complete index block according to the schema
    const indexBlock = {
      version: "3.1.0",
      generatedAt: new Date().toISOString(),
      schemaUrl: "https://github.com/skamoll/CodeSummary/schemas/rag-output.json",
      summary: {
        fileCount: jsonStructure.files.length - emptyFiles,
        chunkCount: totalChunks,
        totalBytes: 0, // Will be updated later
        languages: [...new Set(jsonStructure.files.map(f => f.language))],
        extensions: [...new Set(jsonStructure.files.map(f => f.extension))],
        avgFileSize: 0, // Will be updated later
        avgChunksPerFile: jsonStructure.files.length > 0 ? Math.round(totalChunks / jsonStructure.files.length) : 0
      },
      chunkOffsets: chunkOffsets,
      fileOffsets: fileOffsets,
      seekInfo: {
        instructions: "Use chunkOffsets[chunkId].contentStart and contentEnd to seek directly to chunk content",
        format: "All offsets are absolute byte positions in this JSON file",
        chunkFormat: "Object with jsonStart, jsonEnd, contentStart, contentEnd (absolute JSON positions)",
        fileFormat: "Array [start, end] for each file in JSON",
        validation: `Generated with ${validChunks} chunks across ${Object.keys(fileOffsets).length} files`
      },
      statistics: {
        processingTimeMs,
        bytesPerSecond: 0, // Will be updated later
        bytesWritten: 0, // Will be updated later
        chunksWithValidOffsets: validChunks,
        filesWithValidOffsets: Object.keys(fileOffsets).length,
        totalFiles: jsonStructure.files.length,
        emptyFiles: emptyFiles,
        totalChunksGenerated: totalChunks
      }
    };

    // STEP 5: Build the final JSON with the index and compute final metrics
    const completeStructure = {
      metadata: jsonStructure.metadata,
      files: jsonStructure.files,
      index: indexBlock
    };

    const finalJsonString = JSON.stringify(completeStructure, null, 2);
    const finalBytes = Buffer.byteLength(finalJsonString, 'utf8');
    const bytesPerSecond = Math.round(finalBytes / (processingTimeMs / 1000));

    // Update final metrics in the index
    completeStructure.index.summary.totalBytes = finalBytes;
    completeStructure.index.summary.avgFileSize = jsonStructure.files.length > 0 ?
      Math.round(finalBytes / jsonStructure.files.length) : 0;
    completeStructure.index.statistics.bytesPerSecond = bytesPerSecond;
    completeStructure.index.statistics.bytesWritten = finalBytes;

    // STEP 6: Regenerate the final JSON with updated statistics
    const finalResult = JSON.stringify(completeStructure, null, 2);

    console.log(`✅ Complete index built: ${Object.keys(fileOffsets).length} files, ${validChunks}/${totalChunks} chunks with precise offsets`);
    console.log(`✅ Final JSON: ${formatFileSize(Buffer.byteLength(finalResult, 'utf8'))}, processing: ${processingTimeMs}ms`);

    return finalResult;
  }
}
|
|
1197
|
-
|
|
1198
|
-
// Specialized Handler Classes
|
|
1199
|
-
|
|
1200
|
-
class BaseHandler {
|
|
1201
|
-
async generateChunks(content, options) {
|
|
1202
|
-
// Fallback: split by lines if no specific logic
|
|
1203
|
-
return this.chunkByLines(content, options);
|
|
1204
|
-
}
|
|
1205
|
-
|
|
1206
|
-
chunkByLines(content, options) {
|
|
1207
|
-
const lines = content.split('\n');
|
|
1208
|
-
const chunks = [];
|
|
1209
|
-
const maxLines = Math.ceil(options.maxTokens / 20); // ~20 tokens per line estimate
|
|
1210
|
-
|
|
1211
|
-
for (let i = 0; i < lines.length; i += maxLines) {
|
|
1212
|
-
const chunkLines = lines.slice(i, Math.min(i + maxLines, lines.length));
|
|
1213
|
-
const chunkContent = chunkLines.join('\n');
|
|
1214
|
-
|
|
1215
|
-
chunks.push({
|
|
1216
|
-
content: chunkContent,
|
|
1217
|
-
lineStart: i + 1,
|
|
1218
|
-
lineEnd: Math.min(i + maxLines, lines.length),
|
|
1219
|
-
chunkingMethod: 'line-based'
|
|
1220
|
-
});
|
|
1221
|
-
}
|
|
1222
|
-
|
|
1223
|
-
return chunks;
|
|
1224
|
-
}
|
|
1225
|
-
|
|
1226
|
-
/**
|
|
1227
|
-
* Estimate token count for chunking decisions
|
|
1228
|
-
* @param {string} content - Text content
|
|
1229
|
-
* @returns {number} Estimated token count
|
|
1230
|
-
*/
|
|
1231
|
-
estimateTokens(content) {
|
|
1232
|
-
return Math.ceil(content.length / 4);
|
|
1233
|
-
}
|
|
1234
|
-
}
|
|
1235
|
-
|
|
1236
|
-
class CLikeHandler extends BaseHandler {
  async generateChunks(content, options) {
    const chunks = [];
    const lines = content.split('\n');

    // Find class/struct/function boundaries
    const boundaries = this.findCodeBoundaries(content);

    if (boundaries.length > 0) {
      return this.chunkByBoundaries(content, boundaries, options);
    }

    // Fallback to line-based chunking
    return this.chunkByLines(content, options);
  }

  findCodeBoundaries(content) {
    const boundaries = [];
    const boundaryRegex = /^(?:class|struct|enum|union|static)?\s*([a-zA-Z_][\w]*)\s*.*{/gm;
    let match;

    while ((match = boundaryRegex.exec(content)) !== null) {
      const lineNumber = content.substring(0, match.index).split('\n').length;
      boundaries.push({
        name: match[1],
        line: lineNumber,
        type: 'function'
      });
    }

    return boundaries;
  }

  chunkByBoundaries(content, boundaries, options) {
    const lines = content.split('\n');
    const chunks = [];
    let currentStart = 0;

    for (const boundary of boundaries) {
      if (currentStart < boundary.line - 1) {
        const chunkLines = lines.slice(currentStart, boundary.line - 1);
        if (chunkLines.length > 0) {
          chunks.push({
            content: chunkLines.join('\n'),
            lineStart: currentStart + 1,
            lineEnd: boundary.line - 1
          });
        }
      }
      currentStart = boundary.line - 1;
    }

    // Add remaining lines
    if (currentStart < lines.length) {
      const chunkLines = lines.slice(currentStart);
      chunks.push({
        content: chunkLines.join('\n'),
        lineStart: currentStart + 1,
        lineEnd: lines.length
      });
    }

    return chunks;
  }
}

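// --- Illustrative sketch (not part of the original file) ---
// Rough behaviour of CLikeHandler's boundary detection on a tiny, made-up
// C++ snippet (the type label is always 'function' in this handler):
const cppSample = [
  '#include <vector>',
  '',
  'class Matrix {',
  '  int rows;',
  '};',
  '',
  'struct Node {',
  '  Node* next;',
  '};'
].join('\n');
// new CLikeHandler().findCodeBoundaries(cppSample) reports roughly:
//   [{ name: 'Matrix', line: 3, type: 'function' },
//    { name: 'Node', line: 7, type: 'function' }]
// chunkByBoundaries() then cuts the file at those lines.
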
class ScriptHandler extends BaseHandler {
  async generateChunks(content, options) {
    console.log(`🔍 ScriptHandler processing ${options.extension} file: ${options.filePath}`);

    // ALWAYS try semantic chunking first for script files
    const functions = this.findFunctions(content, options.extension);

    if (functions.length > 0) {
      console.log(`  📝 Found ${functions.length} functions/classes - using semantic chunking`);
      const chunks = this.chunkByFunctions(content, functions, options);

      // Add chunking method metadata
      chunks.forEach(chunk => {
        chunk.chunkingMethod = 'semantic-function';
        chunk.semanticContext = chunk.context || 'code-block';
      });

      return chunks;
    } else {
      console.log(`  ⚠️ No functions found - falling back to line-based chunking`);
      const chunks = this.chunkByLines(content, options);
      chunks.forEach(chunk => {
        chunk.chunkingMethod = 'line-based-fallback';
      });
      return chunks;
    }
  }

  findFunctions(content, extension) {
    const functions = [];

    if (['.js', '.jsx', '.ts', '.tsx'].includes(extension)) {
      // Enhanced JavaScript/TypeScript function detection with improved precision
      const patterns = [
        // Regular functions: function name() {} - SEMANTIC BOUNDARY
        /(?:^|\n)\s*(?:export\s+)?(?:async\s+)?function\s+([a-zA-Z_$][a-zA-Z0-9_$]*)\s*\([^)]*\)\s*\{/gm,
        // Arrow functions: const name = () => {} - SEMANTIC BOUNDARY
        /(?:^|\n)\s*(?:export\s+)?(?:const|let|var)\s+([a-zA-Z_$][a-zA-Z0-9_$]*)\s*=\s*(?:async\s+)?\([^)]*\)\s*=>\s*\{/gm,
        // Classes: class ClassName {} - SEMANTIC BOUNDARY
        /(?:^|\n)\s*(?:export\s+)?(?:abstract\s+)?class\s+([a-zA-Z_$][a-zA-Z0-9_$]*)(?:\s+extends\s+[a-zA-Z_$][a-zA-Z0-9_$]*)?(?:\s+implements\s+[^{]+)?\s*\{/gm,
        // Interfaces (TypeScript): interface InterfaceName {} - SEMANTIC BOUNDARY
        /(?:^|\n)\s*(?:export\s+)?interface\s+([a-zA-Z_$][a-zA-Z0-9_$]*)(?:\s+extends\s+[^{]+)?\s*\{/gm,
        // Type definitions (TypeScript): type TypeName = - SEMANTIC BOUNDARY
        /(?:^|\n)\s*(?:export\s+)?type\s+([a-zA-Z_$][a-zA-Z0-9_$]*)\s*=/gm,
        // Enum definitions (TypeScript): enum EnumName {} - SEMANTIC BOUNDARY
        /(?:^|\n)\s*(?:export\s+)?(?:const\s+)?enum\s+([a-zA-Z_$][a-zA-Z0-9_$]*)\s*\{/gm,
        // Class methods and object methods - SEMANTIC BOUNDARY (improved to avoid false positives)
        /(?:^|\n)\s*(?:public\s+|private\s+|protected\s+|static\s+|async\s+)*([a-zA-Z_$][a-zA-Z0-9_$]*)\s*\([^)]*\)\s*\{/gm
      ];

      const types = ['function', 'arrow-function', 'class', 'interface', 'type', 'enum', 'method'];

      patterns.forEach((regex, patternIndex) => {
        let match;
        while ((match = regex.exec(content)) !== null) {
          const lineNumber = content.substring(0, match.index).split('\n').length;
          const endLine = this.findFunctionEnd(content, match.index, lineNumber);

          functions.push({
            name: match[1],
            line: lineNumber,
            endLine: endLine,
            type: types[patternIndex],
            startIndex: match.index
          });
        }
      });

      // Sort by line number to process in order
      functions.sort((a, b) => a.line - b.line);

    } else if (extension === '.py') {
      // Python functions/classes
      const pyFuncRegex = /^(?:def\s+(\w+)|class\s+(\w+))/gm;
      let match;

      while ((match = pyFuncRegex.exec(content)) !== null) {
        const lineNumber = content.substring(0, match.index).split('\n').length;
        functions.push({
          name: match[1] || match[2],
          line: lineNumber,
          type: match[2] ? 'class' : 'function'
        });
      }
    }

    return functions;
  }

  /**
   * Find the end line of a function/class by matching braces
   * @param {string} content - Full content
   * @param {number} startIndex - Start character index
   * @param {number} startLine - Start line number
   * @returns {number} End line number
   */
  findFunctionEnd(content, startIndex, startLine) {
    let braceCount = 0;
    let inString = false;
    let stringChar = '';
    let i = startIndex;

    // Find the opening brace
    while (i < content.length && content[i] !== '{') {
      if (content[i] === '\n') {
        // If we hit a newline before finding {, it might be an interface/type without body
        if (content.substring(startIndex, i).includes('interface') ||
            content.substring(startIndex, i).includes('type')) {
          // For interfaces/types, find the end of the statement
          while (i < content.length && content[i] !== '\n') i++;
          return content.substring(0, i).split('\n').length;
        }
      }
      i++;
    }

    if (i >= content.length) {
      // No opening brace found, probably a single-line declaration
      return startLine;
    }

    // Count braces to find matching closing brace
    braceCount = 1;
    i++;

    while (i < content.length && braceCount > 0) {
      const char = content[i];

      // Handle strings to avoid counting braces inside strings
      if (!inString) {
        if (char === '"' || char === "'" || char === '`') {
          inString = true;
          stringChar = char;
        } else if (char === '{') {
          braceCount++;
        } else if (char === '}') {
          braceCount--;
        }
      } else {
        if (char === stringChar && content[i - 1] !== '\\') {
          inString = false;
          stringChar = '';
        }
      }

      i++;
    }

    // Return the line number where the function ends
    return content.substring(0, i).split('\n').length;
  }

  chunkByFunctions(content, functions, options) {
    const lines = content.split('\n');
    const chunks = [];
    let currentStart = 0;

    for (let i = 0; i < functions.length; i++) {
      const func = functions[i];
      const funcStart = func.line - 1;
      const funcEnd = func.endLine;

      // Add content before function (imports, comments, etc.)
      if (currentStart < funcStart) {
        const preLines = lines.slice(currentStart, funcStart);
        const preContent = preLines.join('\n').trim();

        if (preContent.length > 0) {
          chunks.push({
            content: preContent,
            lineStart: currentStart + 1,
            lineEnd: funcStart,
            context: 'file_header'
          });
        }
      }

      // Add the function itself
      const funcLines = lines.slice(funcStart, funcEnd);
      const funcContent = funcLines.join('\n');

      // Check if function is too large and needs splitting
      if (this.estimateTokens(funcContent) > options.maxTokens * 1.5) {
        // Split large function into smaller chunks
        const subChunks = this.splitLargeFunction(funcContent, func, options);
        chunks.push(...subChunks);
      } else {
        chunks.push({
          content: funcContent,
          lineStart: funcStart + 1,
          lineEnd: funcEnd,
          context: `${func.type}_${func.name}`,
          entityName: func.name,
          entityType: func.type
        });
      }

      currentStart = funcEnd;
    }

    // Add remaining content after last function
    if (currentStart < lines.length) {
      const remainingLines = lines.slice(currentStart);
      const remainingContent = remainingLines.join('\n').trim();

      if (remainingContent.length > 0) {
        chunks.push({
          content: remainingContent,
          lineStart: currentStart + 1,
          lineEnd: lines.length,
          context: 'file_footer'
        });
      }
    }

    return chunks;
  }

  /**
   * Split a large function into smaller semantic chunks
   * @param {string} funcContent - Function content
   * @param {object} func - Function metadata
   * @param {object} options - Chunking options
   * @returns {Array} Array of chunks
   */
  splitLargeFunction(funcContent, func, options) {
    const chunks = [];
    const lines = funcContent.split('\n');
    const maxLines = Math.ceil(options.maxTokens / 20);

    for (let i = 0; i < lines.length; i += maxLines) {
      const chunkLines = lines.slice(i, i + maxLines);
      const chunkContent = chunkLines.join('\n');

      chunks.push({
        content: chunkContent,
        lineStart: func.line + i,
        lineEnd: func.line + Math.min(i + maxLines, lines.length) - 1,
        context: `${func.type}_${func.name}_part${Math.floor(i / maxLines) + 1}`,
        entityName: func.name,
        entityType: func.type,
        isPartial: true
      });
    }

    return chunks;
  }
}

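// --- Illustrative sketch (not part of the original file) ---
// ScriptHandler.findFunctionEnd() matches braces character by character and
// ignores braces inside '…', "…" or `…` strings. A minimal check on a
// made-up one-function source:
const fnSource = 'function greet(name) {\n  return `hi ${name}`;\n}';
// new ScriptHandler().findFunctionEnd(fnSource, 0, 1) === 3
// The "${name}" braces are skipped because the scanner is inside a
// backtick-delimited string, so the function is reported to end on line 3.
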
class MarkupHandler extends BaseHandler {
  async generateChunks(content, options) {
    // Split by major HTML tags or by token limit
    const majorTags = ['<body>', '<div', '<section', '<article', '<main'];

    for (const tag of majorTags) {
      if (content.includes(tag)) {
        return this.chunkByTag(content, tag, options);
      }
    }

    return this.chunkByLines(content, options);
  }

  chunkByTag(content, tag, options) {
    const parts = content.split(tag);
    const chunks = [];

    parts.forEach((part, index) => {
      if (part.trim().length > 0) {
        const chunkContent = index === 0 ? part : tag + part;
        chunks.push({
          content: chunkContent,
          lineStart: 1, // Approximate
          lineEnd: chunkContent.split('\n').length
        });
      }
    });

    return chunks;
  }
}

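// --- Illustrative sketch (not part of the original file) ---
// MarkupHandler picks the first "major" tag present in the document and splits
// on it; the HTML below is made up for illustration.
const htmlSample = '<header>Nav</header><section id="a">A</section><section id="b">B</section>';
// new MarkupHandler().chunkByTag(htmlSample, '<section', {}) returns three chunks:
//   '<header>Nav</header>', '<section id="a">A</section>', '<section id="b">B</section>'
// lineStart stays 1 for every chunk because the split is tag-based, not line-based.
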
class StylingHandler extends BaseHandler {
  async generateChunks(content, options) {
    // Split by CSS rules (closing braces)
    const rules = content.split('}');
    const chunks = [];
    let currentChunk = '';
    let lineStart = 1;

    for (const rule of rules) {
      currentChunk += rule + '}';

      if (this.estimateTokens(currentChunk) >= options.maxTokens) {
        const lineEnd = lineStart + currentChunk.split('\n').length - 1;
        chunks.push({
          content: currentChunk.trim(),
          lineStart,
          lineEnd
        });

        currentChunk = '';
        lineStart = lineEnd + 1;
      }
    }

    if (currentChunk.trim().length > 0) {
      chunks.push({
        content: currentChunk.trim(),
        lineStart,
        lineEnd: lineStart + currentChunk.split('\n').length - 1
      });
    }

    return chunks;
  }

  estimateTokens(content) {
    return Math.ceil(content.length / 4);
  }
}

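// --- Illustrative sketch (not part of the original file) ---
// StylingHandler accumulates rules split on '}' and flushes a chunk whenever
// the running estimate (length / 4) reaches options.maxTokens. With a made-up
// two-rule stylesheet and a deliberately tiny budget:
const cssSample = '.btn { color: red; }\n.card { padding: 4px; }';
// await new StylingHandler().generateChunks(cssSample, { maxTokens: 5 })
// puts each rule in its own chunk (each alone reaches the 5-token estimate),
// plus a small trailing fragment from the final '}' split; with the default
// 1000-token budget the whole sheet would stay in a single chunk.
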
class ConfigPlainHandler extends BaseHandler {
  async generateChunks(content, options) {
    if (options.extension === '.json') {
      return this.chunkJson(content, options);
    } else if (['.yaml', '.yml'].includes(options.extension)) {
      return this.chunkYaml(content, options);
    } else if (options.extension === '.md') {
      return this.chunkMarkdown(content, options);
    }

    return this.chunkByLines(content, options);
  }

  chunkJson(content, options) {
    try {
      const parsed = JSON.parse(content);

      if (typeof parsed === 'object' && !Array.isArray(parsed)) {
        // Split by top-level keys
        const chunks = [];
        const keys = Object.keys(parsed);

        for (const key of keys) {
          const section = { [key]: parsed[key] };
          chunks.push({
            content: JSON.stringify(section, null, 2),
            lineStart: 1,
            lineEnd: JSON.stringify(section, null, 2).split('\n').length,
            context: `json_key_${key}`
          });
        }

        return chunks;
      }
    } catch (error) {
      // Fall back to line-based chunking if JSON is invalid
    }

    return this.chunkByLines(content, options);
  }

  chunkMarkdown(content, options) {
    // Split by headers
    const headerRegex = /^#+\s+(.+)/gm;
    const chunks = [];
    let currentChunk = '';
    let lineStart = 1;
    let match;

    while ((match = headerRegex.exec(content)) !== null) {
      if (currentChunk.length > 0) {
        const lineEnd = lineStart + currentChunk.split('\n').length - 1;
        chunks.push({
          content: currentChunk.trim(),
          lineStart,
          lineEnd
        });
        lineStart = lineEnd + 1;
      }

      currentChunk = match[0] + '\n';
    }

    if (currentChunk.trim().length > 0) {
      chunks.push({
        content: currentChunk.trim(),
        lineStart,
        lineEnd: lineStart + currentChunk.split('\n').length - 1
      });
    }

    return chunks.length > 0 ? chunks : this.chunkByLines(content, options);
  }

  chunkYaml(content, options) {
    // Simple YAML chunking by document separators and top-level keys
    const lines = content.split('\n');
    const chunks = [];
    let currentChunk = '';
    let lineStart = 1;

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      currentChunk += line + '\n';

      // Check if we hit a top-level key or document separator
      if (line.match(/^[a-zA-Z_][\w]*:/) || line === '---' || this.estimateTokens(currentChunk) >= options.maxTokens) {
        if (currentChunk.trim().length > 0) {
          chunks.push({
            content: currentChunk.trim(),
            lineStart,
            lineEnd: i + 1
          });

          lineStart = i + 2;
          currentChunk = '';
        }
      }
    }

    if (currentChunk.trim().length > 0) {
      chunks.push({
        content: currentChunk.trim(),
        lineStart,
        lineEnd: lines.length
      });
    }

    return chunks.length > 0 ? chunks : this.chunkByLines(content, options);
  }

  estimateTokens(content) {
    return Math.ceil(content.length / 4);
  }
}

export default new RagGenerator();
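// --- Illustrative sketch (not part of the original file) ---
// All five handlers inherit BaseHandler's chunkByLines()/estimateTokens()
// fallbacks, so any extension can still be chunked deterministically. A hedged
// sketch of extension-based dispatch (this map is illustrative only, not the
// module's actual wiring, which lives in initializeHandlers() above):
const handlerForExtension = {
  '.js': new ScriptHandler(),
  '.py': new ScriptHandler(),
  '.cpp': new CLikeHandler(),
  '.html': new MarkupHandler(),
  '.css': new StylingHandler(),
  '.json': new ConfigPlainHandler()
};
const pickHandler = (ext) => handlerForExtension[ext] ?? new BaseHandler();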