codesummary 1.2.0 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1740 +0,0 @@
1
- import fs from 'fs-extra';
2
- import path from 'path';
3
- import crypto from 'crypto';
4
- import os from 'os';
5
- import { createReadStream } from 'fs';
6
- import ErrorHandler from './errorHandler.js';
7
- import RagConfigManager from './ragConfig.js';
8
- import { formatFileSize } from './utils.js';
9
-
10
- /**
11
- * Professional RAG Generator for CodeSummary
12
- * Generates streaming JSON output optimized for vector database ingestion
13
- * Follows deterministic, AI-free approach with efficient memory usage
14
- */
15
- export class RagGenerator {
16
- constructor() {
17
- // Global parameters
18
- this.maxTokensPerChunk = 1000;
19
- this.overlapTokens = 200;
20
- this.maxWorkers = Math.min(8, os.cpus().length);
21
-
22
- // Extension to language mapping (deterministic)
23
- this.extensionToLanguage = {
24
- '.js': 'JavaScript', '.jsx': 'JavaScript', '.ts': 'TypeScript', '.tsx': 'TypeScript',
25
- '.py': 'Python', '.java': 'Java', '.cs': 'C#', '.cpp': 'C++', '.c': 'C', '.h': 'C/C++',
26
- '.html': 'HTML', '.xml': 'XML', '.css': 'CSS', '.scss': 'SCSS',
27
- '.json': 'JSON', '.yaml': 'YAML', '.yml': 'YAML',
28
- '.md': 'Markdown', '.txt': 'Text',
29
- '.sh': 'Shell', '.bat': 'Batch'
30
- };
31
-
32
- // Initialize handlers
33
- this.handlers = this.initializeHandlers();
34
-
35
- // Statistics tracking
36
- this.stats = {
37
- filesProcessed: 0,
38
- chunksGenerated: 0,
39
- bytesWritten: 0,
40
- startTime: null,
41
- endTime: null
42
- };
43
-
44
-
45
- // Error collection
46
- this.errors = [];
47
- }
48
-
49
- /**
50
- * Main entry point - generates streaming RAG JSON
51
- * @param {object} filesByExtension - Files grouped by extension
52
- * @param {Array} selectedExtensions - Selected extensions to process
53
- * @param {string} outputPath - Output JSON file path
54
- * @param {string} projectName - Project name
55
- * @param {string} scanPath - Root scan path
56
- * @returns {object} Generation result
57
- */
58
- async generateRagOutput(filesByExtension, selectedExtensions, outputPath, projectName, scanPath) {
59
- this.stats.startTime = Date.now();
60
-
61
- try {
62
- console.log(`🚀 Starting RAG generation for ${projectName}`);
63
-
64
- // Load RAG configuration
65
- this.ragConfigManager = new RagConfigManager();
66
- const config = await this.ragConfigManager.loadConfig();
67
- this.updateConfigFromYAML(config);
68
-
69
- // Display configuration
70
- this.ragConfigManager.displayConfig();
71
-
72
- // Phase 1: Discovery and file preparation
73
- const discoveredFiles = await this.discoveryPhase(filesByExtension, selectedExtensions, scanPath);
74
-
75
- // Phase 2: Atomic JSON generation (thread-safe)
76
- const result = await this.generate(discoveredFiles, outputPath, projectName, scanPath);
77
-
78
- this.stats.endTime = Date.now();
79
- const duration = (this.stats.endTime - this.stats.startTime) / 1000;
80
-
81
- console.log(`✅ RAG generation completed in ${duration.toFixed(2)}s`);
82
- console.log(`📊 Stats: ${this.stats.filesProcessed} files, ${this.stats.chunksGenerated} chunks`);
83
-
84
- return {
85
- outputPath,
86
- totalFiles: this.stats.filesProcessed,
87
- totalChunks: this.stats.chunksGenerated,
88
- duration,
89
- success: true
90
- };
91
-
92
- } catch (error) {
93
- ErrorHandler.handleError(error, 'RAG Generation');
94
- throw error;
95
- }
96
- }
97
-
98
- /**
99
- * Update internal configuration from loaded YAML config
100
- * @param {object} yamlConfig - Configuration from YAML
101
- */
102
- updateConfigFromYAML(yamlConfig) {
103
- if (yamlConfig.chunking) {
104
- this.maxTokensPerChunk = yamlConfig.chunking.maxTokens || this.maxTokensPerChunk;
105
- this.overlapTokens = yamlConfig.chunking.overlap || this.overlapTokens;
106
- }
107
-
108
- if (yamlConfig.performance) {
109
- this.maxWorkers = Math.min(
110
- yamlConfig.performance.maxWorkers || this.maxWorkers,
111
- os.cpus().length
112
- );
113
- }
114
-
115
- // Store full config for handlers to use
116
- this.yamlConfig = yamlConfig;
117
- }
118
-
119
-
120
-
121
- /**
122
- * Phase 1: Discovery - BFS traversal and file metadata collection
123
- * @param {object} filesByExtension - Files by extension
124
- * @param {Array} selectedExtensions - Selected extensions
125
- * @param {string} scanPath - Root scan path
126
- * @returns {Array} Discovered files with metadata
127
- */
128
- async discoveryPhase(filesByExtension, selectedExtensions, scanPath) {
129
- console.log('🔍 Discovery phase: collecting file metadata...');
130
-
131
- const discoveredFiles = [];
132
- let processed = 0;
133
- const totalFiles = selectedExtensions.reduce((sum, ext) => sum + (filesByExtension[ext]?.length || 0), 0);
134
-
135
- // Process files concurrently but limit memory usage
136
- const batchSize = 50;
137
-
138
- for (const extension of selectedExtensions) {
139
- const files = filesByExtension[extension] || [];
140
-
141
- for (let i = 0; i < files.length; i += batchSize) {
142
- const batch = files.slice(i, i + batchSize);
143
- const batchResults = await Promise.all(
144
- batch.map(fileInfo => this.enrichFileMetadata(fileInfo, extension, scanPath))
145
- );
146
-
147
- discoveredFiles.push(...batchResults.filter(Boolean));
148
- processed += batch.length;
149
-
150
- // Progress reporting with validation
151
- const progress = (processed / totalFiles * 100).toFixed(1);
152
- const validFiles = discoveredFiles.length;
153
- const skippedFiles = processed - validFiles;
154
- process.stdout.write(`\r📊 Discovery: ${progress}% (${validFiles} valid, ${skippedFiles} skipped)`);
155
-
156
- // Internal validation
157
- if (processed % 50 === 0) {
158
- this.validateDiscoveryProgress(discoveredFiles, processed);
159
- }
160
- }
161
- }
162
-
163
- console.log(`\n✅ Discovery completed: ${discoveredFiles.length} files enriched`);
164
- return discoveredFiles;
165
- }
166
-
167
- /**
168
- * Enrich file with metadata including hash, tags, and analysis
169
- * @param {object} fileInfo - Basic file info from scanner
170
- * @param {string} extension - File extension
171
- * @param {string} scanPath - Root scan path
172
- * @returns {object} Enriched file metadata
173
- */
174
- async enrichFileMetadata(fileInfo, extension, scanPath) {
175
- try {
176
- // Calculate SHA-256 hash in streaming mode
177
- const hash = await this.calculateFileHash(fileInfo.absolutePath);
178
-
179
- // Determine language and tags
180
- const language = this.extensionToLanguage[extension] || 'Unknown';
181
- const tags = this.extractFileTags(fileInfo.relativePath, extension);
182
-
183
- // Basic file stats
184
- const stats = await fs.stat(fileInfo.absolutePath);
185
-
186
- return {
187
- id: hash.substring(0, 16), // Use first 16 chars of hash as unique ID
188
- path: fileInfo.relativePath,
189
- absolutePath: fileInfo.absolutePath,
190
- extension,
191
- language,
192
- size: stats.size,
193
- hash: `sha256-${hash}`,
194
- modified: stats.mtime.toISOString(),
195
- tags,
196
- // Will be populated during chunking
197
- chunks: null,
198
- // Metadata for processing
199
- _stats: stats
200
- };
201
-
202
- } catch (error) {
203
- console.warn(`⚠️ Could not process file ${fileInfo.relativePath}: ${error.message}`);
204
- return null;
205
- }
206
- }
207
-
208
- /**
209
- * Calculate SHA-256 hash of file in streaming mode
210
- * @param {string} filePath - File path
211
- * @returns {string} SHA-256 hash (hex)
212
- */
213
- async calculateFileHash(filePath) {
214
- return new Promise((resolve, reject) => {
215
- const hash = crypto.createHash('sha256');
216
- const stream = createReadStream(filePath);
217
-
218
- stream.on('data', data => hash.update(data));
219
- stream.on('end', () => resolve(hash.digest('hex')));
220
- stream.on('error', reject);
221
- });
222
- }
223
-
224
- /**
225
- * Extract file tags based on path heuristics
226
- * @param {string} relativePath - Relative file path
227
- * @param {string} extension - File extension
228
- * @returns {Array} Array of tags
229
- */
230
- extractFileTags(relativePath, extension) {
231
- const tags = [];
232
- const pathLower = relativePath.toLowerCase();
233
- const fileName = path.basename(relativePath, extension).toLowerCase();
234
- const fullPath = relativePath.toLowerCase();
235
-
236
- // Path-based tags (enhanced)
237
- if (pathLower.includes('/test/') || pathLower.includes('\\test\\')) tags.push('test');
238
- if (pathLower.includes('/spec/') || pathLower.includes('\\spec\\')) tags.push('test');
239
- if (pathLower.includes('/__tests__/') || pathLower.includes('\\__tests__\\')) tags.push('test');
240
- if (pathLower.includes('/scripts/') || pathLower.includes('\\scripts\\')) tags.push('script');
241
- if (pathLower.includes('/config/') || pathLower.includes('\\config\\')) tags.push('config');
242
- if (pathLower.includes('/lib/') || pathLower.includes('\\lib\\')) tags.push('library');
243
- if (pathLower.includes('/utils/') || pathLower.includes('\\utils\\')) tags.push('utility');
244
- if (pathLower.includes('/helpers/') || pathLower.includes('\\helpers\\')) tags.push('utility');
245
-
246
- // Framework-specific tags
247
- if (pathLower.includes('/pages/') || pathLower.includes('\\pages\\')) tags.push('page');
248
- if (pathLower.includes('/components/') || pathLower.includes('\\components\\')) tags.push('component');
249
- if (pathLower.includes('/shared/') || pathLower.includes('\\shared\\')) tags.push('shared');
250
- if (pathLower.includes('/common/') || pathLower.includes('\\common\\')) tags.push('shared');
251
- if (pathLower.includes('/hooks/') || pathLower.includes('\\hooks\\')) tags.push('hook');
252
- if (pathLower.includes('/services/') || pathLower.includes('\\services\\')) tags.push('service');
253
- if (pathLower.includes('/api/') || pathLower.includes('\\api\\')) tags.push('api');
254
- if (pathLower.includes('/routes/') || pathLower.includes('\\routes\\')) tags.push('route');
255
- if (pathLower.includes('/controllers/') || pathLower.includes('\\controllers\\')) tags.push('controller');
256
- if (pathLower.includes('/models/') || pathLower.includes('\\models\\')) tags.push('model');
257
- if (pathLower.includes('/views/') || pathLower.includes('\\views\\')) tags.push('view');
258
- if (pathLower.includes('/layouts/') || pathLower.includes('\\layouts\\')) tags.push('layout');
259
- if (pathLower.includes('/middleware/') || pathLower.includes('\\middleware\\')) tags.push('middleware');
260
-
261
- // Build and tooling
262
- if (pathLower.includes('/build/') || pathLower.includes('\\build\\')) tags.push('build');
263
- if (pathLower.includes('/dist/') || pathLower.includes('\\dist\\')) tags.push('build');
264
- if (pathLower.includes('/.github/') || pathLower.includes('\\.github\\')) tags.push('ci');
265
- if (pathLower.includes('/workflows/') || pathLower.includes('\\workflows\\')) tags.push('ci');
266
-
267
- // Filename-based tags (enhanced)
268
- if (fileName.includes('config')) tags.push('config');
269
- if (fileName.includes('test') || fileName.includes('spec')) tags.push('test');
270
- if (fileName.includes('index')) tags.push('entry');
271
- if (fileName.includes('main')) tags.push('entry');
272
- if (fileName.includes('app')) tags.push('application');
273
- if (fileName.includes('component')) tags.push('component');
274
- if (fileName.includes('page')) tags.push('page');
275
- if (fileName.includes('layout')) tags.push('layout');
276
- if (fileName.includes('service')) tags.push('service');
277
- if (fileName.includes('util') || fileName.includes('helper')) tags.push('utility');
278
- if (fileName.includes('hook')) tags.push('hook');
279
- if (fileName.includes('api')) tags.push('api');
280
- if (fileName.includes('route')) tags.push('route');
281
- if (fileName.includes('model')) tags.push('model');
282
- if (fileName.includes('controller')) tags.push('controller');
283
- if (fileName.includes('middleware')) tags.push('middleware');
284
- if (fileName.includes('store') || fileName.includes('state')) tags.push('state');
285
- if (fileName.includes('context')) tags.push('context');
286
- if (fileName.includes('provider')) tags.push('provider');
287
-
288
- // Extension-based tags (enhanced)
289
- if (['.test.js', '.spec.js', '.test.ts', '.spec.ts', '.test.tsx', '.spec.tsx'].some(ext => fullPath.endsWith(ext))) {
290
- tags.push('test');
291
- }
292
- if (['.d.ts'].some(ext => fullPath.endsWith(ext))) {
293
- tags.push('types');
294
- }
295
- if (['.stories.js', '.stories.ts', '.stories.tsx'].some(ext => fullPath.endsWith(ext))) {
296
- tags.push('storybook');
297
- }
298
- if (['.cy.js', '.cy.ts'].some(ext => fullPath.endsWith(ext))) {
299
- tags.push('e2e');
300
- }
301
-
302
- // Framework detection
303
- if (extension === '.tsx' || extension === '.jsx') {
304
- tags.push('react');
305
- }
306
- if (fullPath.includes('vue') || extension === '.vue') {
307
- tags.push('vue');
308
- }
309
- if (fullPath.includes('angular') || fullPath.includes('.component.') || fullPath.includes('.service.')) {
310
- tags.push('angular');
311
- }
312
- if (fullPath.includes('next') || fullPath.includes('_app.') || fullPath.includes('_document.')) {
313
- tags.push('nextjs');
314
- }
315
-
316
- // Special files
317
- if (['readme', 'license', 'changelog', 'contributing'].includes(fileName)) {
318
- tags.push('documentation');
319
- }
320
- if (['dockerfile', 'docker-compose', '.dockerignore'].includes(fileName)) {
321
- tags.push('docker');
322
- }
323
- if (['package.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml'].includes(path.basename(relativePath))) {
324
- tags.push('package');
325
- }
326
- if (['tsconfig.json', 'jsconfig.json', 'webpack.config.js', 'vite.config.js'].includes(path.basename(relativePath))) {
327
- tags.push('config');
328
- }
329
-
330
- // Infrastructure files
331
- if (extension === '.bat' || extension === '.cmd') {
332
- tags.push('infrastructure', 'script', 'windows');
333
- }
334
- if (extension === '.sh') {
335
- tags.push('infrastructure', 'script', 'unix');
336
- }
337
- if (extension === '.json' && (fileName.includes('config') || fileName.includes('settings') || fileName.includes('.config.'))) {
338
- tags.push('infrastructure', 'config');
339
- }
340
- if (['makefile', 'makefile.am', 'cmake', 'cmakelists.txt'].includes(fileName)) {
341
- tags.push('infrastructure', 'build');
342
- }
343
- if (['readme', 'license', 'changelog', 'contributing'].includes(fileName)) {
344
- tags.push('documentation');
345
- }
346
-
347
- return [...new Set(tags)]; // Remove duplicates
348
- }
349
-
350
-
351
-
352
-
353
-
354
-
355
- /**
356
- * Initialize specialized handlers for different file types
357
- * @returns {object} Handler registry
358
- */
359
- initializeHandlers() {
360
- return {
361
- 'code-c-like': new CLikeHandler(),
362
- 'code-script': new ScriptHandler(),
363
- 'markup': new MarkupHandler(),
364
- 'styling': new StylingHandler(),
365
- 'config-plain': new ConfigPlainHandler()
366
- };
367
- }
368
-
369
- /**
370
- * Get appropriate handler for file extension with full coverage
371
- * @param {string} extension - File extension
372
- * @returns {object} Handler instance
373
- */
374
- getHandler(extension) {
375
- // Complete mapping for all 22 target extensions
376
- const handlerMap = {
377
- // Code-C like (5 extensions)
378
- '.c': 'code-c-like', '.h': 'code-c-like', '.cpp': 'code-c-like',
379
- '.cs': 'code-c-like', '.java': 'code-c-like',
380
-
381
- // Code-Script (7 extensions)
382
- '.js': 'code-script', '.jsx': 'code-script', '.ts': 'code-script',
383
- '.tsx': 'code-script', '.py': 'code-script', '.sh': 'code-script', '.bat': 'code-script',
384
-
385
- // Markup (2 extensions)
386
- '.html': 'markup', '.xml': 'markup',
387
-
388
- // Styling (2 extensions)
389
- '.css': 'styling', '.scss': 'styling',
390
-
391
- // Config/Plain (6 extensions)
392
- '.json': 'config-plain', '.yaml': 'config-plain', '.yml': 'config-plain',
393
- '.md': 'config-plain', '.txt': 'config-plain'
394
- };
395
-
396
- const handlerType = handlerMap[extension];
397
-
398
- if (!handlerType) {
399
- console.warn(`⚠️ No handler found for extension: ${extension}`);
400
- return this.handlers['config-plain']; // Fallback
401
- }
402
-
403
- return this.handlers[handlerType];
404
- }
405
-
406
- /**
407
- * Verify extension coverage against target list
408
- * @param {Array} processedExtensions - Extensions found in processing
409
- */
410
- verifyExtensionCoverage(processedExtensions) {
411
- const targetExtensions = [
412
- '.json', '.ts', '.js', '.jsx', '.tsx', '.xml', '.html', '.css', '.scss',
413
- '.md', '.txt', '.py', '.java', '.cs', '.cpp', '.c', '.h', '.yaml', '.yml',
414
- '.sh', '.bat'
415
- ]; // 22 total extensions
416
-
417
- const missing = targetExtensions.filter(ext => !processedExtensions.includes(ext));
418
- const extra = processedExtensions.filter(ext => !targetExtensions.includes(ext));
419
-
420
- console.log(`\n📊 Extension Coverage Analysis:`);
421
- console.log(` Target extensions: ${targetExtensions.length}`);
422
- console.log(` Processed extensions: ${processedExtensions.length}`);
423
-
424
- if (missing.length > 0) {
425
- console.warn(` ⚠️ Missing: ${missing.join(', ')}`);
426
- }
427
-
428
- if (extra.length > 0) {
429
- console.log(` ➕ Extra: ${extra.join(', ')}`);
430
- }
431
-
432
- if (missing.length === 0) {
433
- console.log(` ✅ Full coverage achieved!`);
434
- }
435
-
436
- return {
437
- targetCount: targetExtensions.length,
438
- processedCount: processedExtensions.length,
439
- missing,
440
- extra,
441
- coverage: ((targetExtensions.length - missing.length) / targetExtensions.length * 100).toFixed(1)
442
- };
443
- }
444
-
445
- /**
446
- * Improved token estimation using multiple heuristics
447
- * @param {string} content - Text content
448
- * @param {string} language - Programming language for context
449
- * @returns {number} Estimated token count
450
- */
451
- safeEstimateTokens(content, language = 'text') {
452
- try {
453
- if (typeof content !== 'string') {
454
- console.warn('⚠️ Non-string content passed to token estimator');
455
- return 0;
456
- }
457
-
458
- if (content.length === 0) return 0;
459
-
460
- // Base estimation using multiple factors
461
- const charCount = content.length;
462
- const wordCount = content.trim().split(/\s+/).length;
463
- const lineCount = content.split('\n').length;
464
-
465
- // Language-specific adjustments
466
- let tokensPerChar = 0.25; // Default: ~4 chars per token
467
- let tokensPerWord = 1.3; // Default: ~1.3 tokens per word
468
-
469
- // Adjust based on content type
470
- if (['javascript', 'typescript', 'python', 'java', 'c++', 'c#'].includes(language.toLowerCase())) {
471
- // Code tends to have more symbols and operators
472
- tokensPerChar = 0.28;
473
- tokensPerWord = 1.4;
474
-
475
- // Additional tokens for common code patterns
476
- const brackets = (content.match(/[{}()\[\]]/g) || []).length;
477
- const operators = (content.match(/[+\-*/%=<>!&|^~]/g) || []).length;
478
- const dots = (content.match(/\./g) || []).length;
479
-
480
- const syntaxTokens = Math.ceil((brackets + operators + dots) * 0.15);
481
-
482
- // Character-based estimation with syntax bonus
483
- const charEstimate = Math.ceil(charCount * tokensPerChar) + syntaxTokens;
484
- const wordEstimate = Math.ceil(wordCount * tokensPerWord);
485
-
486
- return Math.max(charEstimate, wordEstimate);
487
- } else if (['json', 'yaml', 'xml', 'html'].includes(language.toLowerCase())) {
488
- // Structured data tends to be more compact in tokens
489
- tokensPerChar = 0.22;
490
- tokensPerWord = 1.1;
491
- } else if (language.toLowerCase() === 'markdown') {
492
- // Markdown has formatting symbols but is mostly text
493
- tokensPerChar = 0.26;
494
- tokensPerWord = 1.2;
495
- }
496
-
497
- // Calculate estimates using both methods
498
- const charEstimate = Math.ceil(charCount * tokensPerChar);
499
- const wordEstimate = Math.ceil(wordCount * tokensPerWord);
500
-
501
- // Return the higher estimate for safety (avoid truncation)
502
- return Math.max(charEstimate, wordEstimate, Math.ceil(charCount / 4));
503
-
504
- } catch (error) {
505
- console.warn(`⚠️ Token estimation error: ${error.message}`);
506
- return Math.ceil((content?.length || 0) / 4);
507
- }
508
- }
509
-
510
- /**
511
- * Estimate token count using simple heuristic
512
- * @param {string} content - Text content
513
- * @returns {number} Estimated token count
514
- */
515
- estimateTokens(content) {
516
- return Math.ceil(content.length / 4);
517
- }
518
-
519
- /**
520
- * Extract imports from content using simple regex
521
- * @param {string} content - File content
522
- * @param {string} extension - File extension
523
- * @returns {Array} Array of import statements
524
- */
525
- extractImports(content, extension) {
526
- const imports = [];
527
-
528
- switch (extension) {
529
- case '.js':
530
- case '.jsx':
531
- case '.ts':
532
- case '.tsx':
533
- // import ... from '...'
534
- const importRegex = /import\s+.*?from\s+['"]([^'"]+)['"]/g;
535
- let match;
536
- while ((match = importRegex.exec(content)) !== null) {
537
- imports.push(match[1]);
538
- }
539
-
540
- // require('...')
541
- const requireRegex = /require\s*\(\s*['"]([^'"]+)['"]\s*\)/g;
542
- while ((match = requireRegex.exec(content)) !== null) {
543
- imports.push(match[1]);
544
- }
545
- break;
546
-
547
- case '.py':
548
- // import ... / from ... import ...
549
- const pyImportRegex = /(?:from\s+(\S+)\s+import|import\s+(\S+))/g;
550
- while ((match = pyImportRegex.exec(content)) !== null) {
551
- imports.push(match[1] || match[2]);
552
- }
553
- break;
554
-
555
- case '.c':
556
- case '.cpp':
557
- case '.h':
558
- // #include "..." / #include <...>
559
- const includeRegex = /#include\s*[<"]([^>"]+)[>"]/g;
560
- while ((match = includeRegex.exec(content)) !== null) {
561
- imports.push(match[1]);
562
- }
563
- break;
564
- }
565
-
566
- return [...new Set(imports)]; // Remove duplicates
567
- }
568
-
569
- /**
570
- * Extract function/method calls using simple regex
571
- * @param {string} content - File content
572
- * @param {string} extension - File extension
573
- * @returns {Array} Array of function calls
574
- */
575
- extractCalls(content, extension) {
576
- const calls = [];
577
-
578
- // Generic function call pattern: identifier followed by (
579
- const callRegex = /\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(/g;
580
- let match;
581
-
582
- while ((match = callRegex.exec(content)) !== null) {
583
- const funcName = match[1];
584
-
585
- // Filter out language keywords
586
- const keywords = ['if', 'for', 'while', 'switch', 'catch', 'typeof', 'return', 'new'];
587
- if (!keywords.includes(funcName) && funcName.length > 1) {
588
- calls.push(funcName);
589
- }
590
- }
591
-
592
- // Return unique calls, limited to prevent noise
593
- return [...new Set(calls)].slice(0, 20);
594
- }
595
-
596
- /**
597
- * Validate generated JSON file
598
- * @param {string} outputPath - Path to generated JSON
599
- */
600
- async validateGeneratedJSON(outputPath) {
601
- try {
602
- // Check file exists and is readable
603
- const stats = await fs.stat(outputPath);
604
-
605
- // Check file size warnings
606
- const maxSize = this.ragConfigManager.parseFileSize(this.yamlConfig?.quality?.maxOutputSize || '250MB');
607
- if (stats.size > maxSize) {
608
- console.warn(`⚠️ Generated file is large: ${formatFileSize(stats.size)} (>${formatFileSize(maxSize)})`);
609
- }
610
-
611
- // Simple validation: read first 1KB to check JSON structure
612
- const stream = createReadStream(outputPath, { encoding: 'utf8', start: 0, end: 1023 });
613
- let sampleText = '';
614
-
615
- for await (const chunk of stream) {
616
- sampleText += chunk;
617
- }
618
-
619
- // Check for basic JSON structure
620
- if (!sampleText.trim().startsWith('{')) {
621
- throw new Error('Generated file does not start with valid JSON');
622
- }
623
-
624
- // Check for expected structure
625
- if (!sampleText.includes('"metadata"') || !sampleText.includes('"files"')) {
626
- console.warn('⚠️ JSON structure may be incomplete - expected sections not found in sample');
627
- }
628
-
629
- console.log('✅ JSON validation passed');
630
-
631
- } catch (error) {
632
- console.error(`❌ JSON validation failed: ${error.message}`);
633
- // Don't re-throw - just warn since file was successfully written
634
- console.warn('⚠️ Continuing despite validation warning - file was generated successfully');
635
- }
636
- }
637
-
638
- /**
639
- * Validate discovery progress for quality assurance
640
- * @param {Array} discoveredFiles - Files discovered so far
641
- * @param {number} processedCount - Total files processed
642
- */
643
- validateDiscoveryProgress(discoveredFiles, processedCount) {
644
- const issues = [];
645
-
646
- // Check for duplicate hashes
647
- const hashes = new Set();
648
- const duplicates = [];
649
-
650
- for (const file of discoveredFiles) {
651
- if (hashes.has(file.hash)) {
652
- duplicates.push(file.hash.substring(0, 8));
653
- } else {
654
- hashes.add(file.hash);
655
- }
656
- }
657
-
658
- if (duplicates.length > 0) {
659
- issues.push(`Duplicate hashes detected: ${duplicates.join(', ')}`);
660
- }
661
-
662
- // Check file size distribution
663
- const largeSizeThreshold = this.ragConfigManager.parseFileSize(this.yamlConfig?.performance?.maxFileSize || '100MB');
664
- const largeFiles = discoveredFiles.filter(f => f.size > largeSizeThreshold);
665
-
666
- if (largeFiles.length > 0) {
667
- issues.push(`${largeFiles.length} files exceed size threshold`);
668
- }
669
-
670
- // Check tag distribution
671
- const tagCounts = {};
672
- discoveredFiles.forEach(file => {
673
- file.tags.forEach(tag => {
674
- tagCounts[tag] = (tagCounts[tag] || 0) + 1;
675
- });
676
- });
677
-
678
- const untaggedFiles = discoveredFiles.filter(f => f.tags.length === 0);
679
- if (untaggedFiles.length > discoveredFiles.length * 0.5) {
680
- issues.push(`High untagged ratio: ${untaggedFiles.length}/${discoveredFiles.length}`);
681
- }
682
-
683
- // Report issues if any
684
- if (issues.length > 0) {
685
- console.warn(`\n⚠️ Discovery validation issues: ${issues.join(', ')}`);
686
- }
687
- }
688
-
689
- /**
690
- * Validate processing progress for quality assurance
691
- * @param {Map} chunkOffsets - Current chunk offsets
692
- */
693
- validateProcessingProgress(chunkOffsets) {
694
- const issues = [];
695
-
696
- // Check chunk size distribution
697
- const chunkSizes = [];
698
- for (const [chunkId, offsetData] of chunkOffsets.entries()) {
699
- const size = offsetData.contentEnd - offsetData.contentStart;
700
- chunkSizes.push(size);
701
- }
702
-
703
- if (chunkSizes.length > 0) {
704
- const avgChunkSize = chunkSizes.reduce((a, b) => a + b, 0) / chunkSizes.length;
705
- const maxChunkSize = Math.max(...chunkSizes);
706
- const maxChunkThreshold = this.ragConfigManager.parseFileSize(this.yamlConfig?.quality?.maxChunkSize || '50KB');
707
-
708
- if (maxChunkSize > maxChunkThreshold) {
709
- issues.push(`Large chunk detected: ${formatFileSize(maxChunkSize)}`);
710
- }
711
-
712
- if (avgChunkSize < 100) {
713
- issues.push(`Small average chunk size: ${formatFileSize(avgChunkSize)}`);
714
- }
715
- }
716
-
717
- // Check offset consistency
718
- let invalidOffsets = 0;
719
- for (const [chunkId, offsetData] of chunkOffsets.entries()) {
720
- if (offsetData.contentStart >= offsetData.contentEnd) {
721
- invalidOffsets++;
722
- }
723
- }
724
-
725
- if (invalidOffsets > 0) {
726
- issues.push(`Invalid offsets: ${invalidOffsets} chunks`);
727
- }
728
-
729
- // Report issues if any
730
- if (issues.length > 0) {
731
- console.warn(`\n⚠️ Processing validation issues: ${issues.join(', ')}`);
732
- }
733
- }
734
-
735
- /**
736
- * Final validation of generated output with seek inverse testing
737
- * @param {string} outputPath - Generated file path
738
- * @param {Array} discoveredFiles - All processed files
739
- * @param {Map} chunkOffsets - All chunk offsets
740
- */
741
- async validateFinalOutput(outputPath, discoveredFiles, chunkOffsets) {
742
- const issues = [];
743
-
744
- try {
745
- const stats = await fs.stat(outputPath);
746
-
747
- // Check file size
748
- const maxOutputSize = this.ragConfigManager.parseFileSize(this.yamlConfig?.quality?.maxOutputSize || '250MB');
749
- if (stats.size > maxOutputSize) {
750
- issues.push(`Output size (${formatFileSize(stats.size)}) exceeds threshold`);
751
- }
752
-
753
- // Check completeness
754
- const expectedChunks = discoveredFiles.reduce((sum, file) => sum + (file.chunks?.length || 0), 0);
755
- const actualChunks = chunkOffsets.size;
756
-
757
- if (expectedChunks !== actualChunks) {
758
- issues.push(`Chunk count mismatch: expected ${expectedChunks}, got ${actualChunks}`);
759
- }
760
-
761
- // Check for empty chunks
762
- const emptyChunks = Array.from(chunkOffsets.values()).filter(offset =>
763
- offset.contentEnd - offset.contentStart < 10
764
- ).length;
765
-
766
- if (emptyChunks > 0) {
767
- issues.push(`${emptyChunks} near-empty chunks detected`);
768
- }
769
-
770
- // SEEK INVERSE TESTING - Test random chunk offsets
771
- await this.validateSeekInverse(outputPath, chunkOffsets);
772
-
773
- // JSON Schema validation (basic)
774
- await this.validateJsonStructure(outputPath);
775
-
776
- // Report final validation
777
- if (issues.length > 0) {
778
- console.warn(`\n⚠️ Final validation issues:`);
779
- issues.forEach(issue => console.warn(` • ${issue}`));
780
- } else {
781
- console.log('✅ Final validation passed - output is healthy');
782
- }
783
-
784
- } catch (error) {
785
- console.error(`❌ Final validation failed: ${error.message}`);
786
- }
787
- }
788
-
789
- /**
790
- * Test seek operations on random chunk offsets to verify accuracy
791
- * @param {string} outputPath - Generated JSON file path
792
- * @param {Map} chunkOffsets - Chunk offset map
793
- */
794
- async validateSeekInverse(outputPath, chunkOffsets) {
795
- const chunkIds = Array.from(chunkOffsets.keys());
796
- const testCount = Math.min(3, chunkIds.length); // Test 2-3 random chunks
797
-
798
- if (testCount === 0) {
799
- console.warn('⚠️ No chunks to test for seek validation');
800
- return;
801
- }
802
-
803
- console.log(`🔍 Testing seek inverse on ${testCount} random chunks...`);
804
-
805
- for (let i = 0; i < testCount; i++) {
806
- const randomIndex = Math.floor(Math.random() * chunkIds.length);
807
- const chunkId = chunkIds[randomIndex];
808
- const offsetData = chunkOffsets.get(chunkId);
809
-
810
- try {
811
- // Read the specific chunk content using simple file read
812
- const fullContent = await fs.readFile(outputPath, 'utf8');
813
- const seekContent = fullContent.slice(offsetData.contentStart, offsetData.contentEnd);
814
-
815
- // Verify it's valid JSON content (should be a JSON string value)
816
- try {
817
- // Try to parse as JSON - if it's valid JSON string content, this should work
818
- const parsed = JSON.parse(seekContent);
819
- if (typeof parsed === 'string') {
820
- console.log(` ✅ Chunk ${chunkId}: seek successful, valid JSON string (${seekContent.length} bytes)`);
821
- } else {
822
- console.log(` ✅ Chunk ${chunkId}: seek successful, valid JSON (${typeof parsed}, ${seekContent.length} bytes)`);
823
- }
824
- } catch (parseError) {
825
- // If it doesn't parse as JSON, it might be a partial chunk
826
- console.log(` ✅ Chunk ${chunkId}: seek successful, partial content (${seekContent.length} bytes)`);
827
- }
828
-
829
- } catch (error) {
830
- console.error(` ❌ Chunk ${chunkId}: seek failed - ${error.message}`);
831
- }
832
- }
833
- }
834
-
835
- /**
836
- * Basic JSON structure validation
837
- * @param {string} outputPath - Generated JSON file path
838
- */
839
- async validateJsonStructure(outputPath) {
840
- try {
841
- // Read full content for validation (simpler approach)
842
- const fullContent = await fs.readFile(outputPath, 'utf8');
843
- const startText = fullContent.slice(0, 1024).trim();
844
- const endText = fullContent.slice(-1024).trim();
845
-
846
- // Basic structure checks
847
- const issues = [];
848
-
849
- if (!startText.startsWith('{')) {
850
- issues.push('File does not start with {');
851
- }
852
-
853
- if (!endText.endsWith('}')) {
854
- issues.push('File does not end with }');
855
- }
856
-
857
- if (!startText.includes('"metadata"')) {
858
- issues.push('Missing metadata section');
859
- }
860
-
861
- if (!startText.includes('"files"')) {
862
- issues.push('Missing files section');
863
- }
864
-
865
- if (!fullContent.includes('"index"')) {
866
- issues.push('Missing index section');
867
- }
868
-
869
- if (issues.length === 0) {
870
- console.log('✅ JSON structure validation passed');
871
- } else {
872
- console.warn(`⚠️ JSON structure issues: ${issues.join(', ')}`);
873
- }
874
-
875
- } catch (error) {
876
- console.error(`❌ JSON structure validation failed: ${error.message}`);
877
- }
878
- }
879
-
880
- /**
881
- * Generate RAG output atomically - build complete structure in memory (thread-safe)
882
- */
883
- async generate(discoveredFiles, outputPath, projectName, scanPath) {
884
- console.log('📝 Atomic generation: processing all files in memory...');
885
-
886
- await fs.ensureDir(path.dirname(outputPath));
887
-
888
- const processedFiles = [];
889
- let totalChunks = 0;
890
-
891
- for (let i = 0; i < discoveredFiles.length; i++) {
892
- const fileData = discoveredFiles[i];
893
- const progress = ((i + 1) / discoveredFiles.length * 100).toFixed(1);
894
-
895
- process.stdout.write(`\r📊 Processing: ${progress}% (${i + 1}/${discoveredFiles.length})`);
896
-
897
- try {
898
- const processedFile = await this.processFileInMemory(fileData);
899
- processedFiles.push(processedFile);
900
- totalChunks += processedFile.chunks?.length || 0;
901
- this.stats.filesProcessed++;
902
- this.stats.chunksGenerated += processedFile.chunks?.length || 0;
903
- } catch (error) {
904
- console.warn(`\n⚠️ Error processing ${fileData.path}: ${error.message}`);
905
- this.errors.push({ file: fileData.path, error: error.message });
906
- processedFiles.push({ ...fileData, chunks: [], error: error.message });
907
- }
908
- }
909
-
910
- console.log(`\n✅ All files processed: ${processedFiles.length} files, ${totalChunks} chunks`);
911
-
912
- const completeJSON = this.buildCompleteJSON(processedFiles, projectName, scanPath);
913
- const finalJSON = this.calculateAndInjectOffsets(completeJSON);
914
-
915
- await fs.writeFile(outputPath, finalJSON, 'utf8');
916
- this.stats.bytesWritten = finalJSON.length;
917
-
918
- if (this.yamlConfig?.output?.validation) {
919
- console.log('🔍 Validating generated output...');
920
- await this.validateGeneratedJSON(outputPath);
921
- }
922
-
923
- console.log(`✅ JSON written successfully to ${outputPath}`);
924
-
925
- return {
926
- outputPath,
927
- totalFiles: processedFiles.length,
928
- totalChunks,
929
- bytesWritten: finalJSON.length,
930
- extensionCoverage: this.verifyExtensionCoverage([...new Set(processedFiles.map(f => f.extension))])
931
- };
932
- }
933
-
934
- async processFileInMemory(fileData) {
935
- const content = await fs.readFile(fileData.absolutePath, 'utf8');
936
- const handler = this.getHandler(fileData.extension);
937
-
938
- console.log(`🔍 ${handler.constructor.name} processing ${fileData.extension} file: ${fileData.path}`);
939
-
940
- const chunks = await handler.generateChunks(content, {
941
- fileId: fileData.id,
942
- filePath: fileData.path,
943
- extension: fileData.extension,
944
- language: fileData.language,
945
- maxTokens: this.maxTokensPerChunk,
946
- overlap: this.overlapTokens
947
- });
948
-
949
- if (chunks.length > 0) {
950
- console.log(` 📝 Found ${chunks.length} semantic chunks`);
951
- }
952
-
953
- const enrichedChunks = chunks.map((chunk, index) => ({
954
- ...chunk,
955
- id: `chunk_${fileData.id}_${index}`,
956
- tokenEstimate: this.safeEstimateTokens(chunk.content, fileData.language),
957
- imports: this.extractImports(chunk.content, fileData.extension),
958
- calls: this.extractCalls(chunk.content, fileData.extension)
959
- }));
960
-
961
- return {
962
- id: fileData.id,
963
- path: fileData.path,
964
- language: fileData.language,
965
- extension: fileData.extension,
966
- size: fileData.size,
967
- lines: content.split('\n').length,
968
- hash: fileData.hash,
969
- modified: fileData.modified,
970
- tags: fileData.tags,
971
- chunks: enrichedChunks
972
- };
973
- }
974
-
975
- buildCompleteJSON(processedFiles, projectName, scanPath) {
976
- const totalChunks = processedFiles.reduce((sum, file) => sum + (file.chunks?.length || 0), 0);
977
- const emptyFiles = processedFiles.filter(f => (f.chunks?.length || 0) === 0).length;
978
-
979
- this.stats.endTime = Date.now();
980
- const processingTimeMs = Math.max(1, this.stats.endTime - this.stats.startTime);
981
-
982
- return {
983
- metadata: {
984
- projectName,
985
- generatedAt: new Date().toISOString(),
986
- scanPath,
987
- generator: 'CodeSummary RAG Generator',
988
- version: '3.1.0',
989
- config: {
990
- maxTokensPerChunk: this.maxTokensPerChunk,
991
- overlapTokens: this.overlapTokens,
992
- tokenEstimationMethod: 'enhanced_heuristic_v1.0'
993
- },
994
- summary: {
995
- totalFiles: processedFiles.length,
996
- languages: [...new Set(processedFiles.map(f => f.language))],
997
- extensions: [...new Set(processedFiles.map(f => f.extension))]
998
- },
999
- schemaVersion: "1.0",
1000
- schemaUrl: "https://github.com/skamoll/CodeSummary/schemas/rag-output.json"
1001
- },
1002
- files: processedFiles,
1003
- index: {
1004
- version: "3.1.0",
1005
- generatedAt: new Date().toISOString(),
1006
- schemaUrl: "https://github.com/skamoll/CodeSummary/schemas/rag-output.json",
1007
- summary: {
1008
- fileCount: processedFiles.length - emptyFiles,
1009
- chunkCount: totalChunks,
1010
- totalBytes: 0,
1011
- languages: [...new Set(processedFiles.map(f => f.language))],
1012
- extensions: [...new Set(processedFiles.map(f => f.extension))],
1013
- avgFileSize: 0,
1014
- avgChunksPerFile: processedFiles.length > 0 ? Math.round(totalChunks / processedFiles.length) : 0
1015
- },
1016
- chunkOffsets: {},
1017
- fileOffsets: {},
1018
- seekInfo: {
1019
- instructions: "Use chunkOffsets[chunkId].contentStart and contentEnd to seek directly to chunk content",
1020
- format: "All offsets are absolute byte positions in this JSON file",
1021
- chunkFormat: "Object with jsonStart, jsonEnd, contentStart, contentEnd (absolute JSON positions)",
1022
- fileFormat: "Array [start, end] for each file in JSON"
1023
- },
1024
- statistics: {
1025
- processingTimeMs,
1026
- bytesPerSecond: 0,
1027
- bytesWritten: 0,
1028
- chunksWithValidOffsets: totalChunks,
1029
- filesWithValidOffsets: processedFiles.length - emptyFiles,
1030
- totalFiles: processedFiles.length,
1031
- emptyFiles: emptyFiles,
1032
- totalChunksGenerated: totalChunks,
1033
- errors: this.errors
1034
- }
1035
- }
1036
- };
1037
- }
1038
-
1039
  /**
   * Finalize the JSON structure by computing and re-injecting the correct offsets.
   * This approach guarantees maximum precision by operating on the final JSON string.
   * @param {object} jsonStructure - The complete JSON object with data but without offsets.
   * @returns {string} The final JSON string, formatted and with precise offsets.
   */
  calculateAndInjectOffsets(jsonStructure) {
    console.log('🔍 Calculating precise byte offsets and building complete index...');

    // STEP 1: Build a preliminary JSON without the index to measure exact positions.
    // Because the final document appends `index` AFTER `metadata` and `files`,
    // positions measured in this prefix remain valid in the final string.
    const jsonWithoutIndex = {
      metadata: jsonStructure.metadata,
      files: jsonStructure.files
    };

    const preliminaryJsonString = JSON.stringify(jsonWithoutIndex, null, 2);
    // NOTE(review): preliminaryBytes is computed but never used — confirm whether it can be dropped
    const preliminaryBytes = Buffer.byteLength(preliminaryJsonString, 'utf8');

    // STEP 2: Compute precise file and chunk offsets.
    // NOTE(review): the "offsets" below are String.indexOf indices (UTF-16 code
    // units); they equal byte offsets only for ASCII-only output — confirm for
    // non-ASCII source files.
    const fileOffsets = {};
    const chunkOffsets = {};
    let totalChunks = 0;
    let validChunks = 0;

    for (const file of jsonStructure.files) {
      // Locate the start of the file object by its ID
      const filePattern = `"id": "${file.id}"`;
      const fileStartPos = preliminaryJsonString.indexOf(filePattern);

      if (fileStartPos !== -1) {
        // Locate the approximate end of the file object (start of the next
        // file entry, or the closing bracket of the files array)
        const nextFilePattern = preliminaryJsonString.indexOf(' {\n "id":', fileStartPos + 1);
        const fileEndPos = nextFilePattern !== -1 ? nextFilePattern : preliminaryJsonString.lastIndexOf(' ]');

        // Schema format: fileId -> [start, end]
        fileOffsets[file.id] = [fileStartPos, fileEndPos];

        // Compute chunk offsets within this file
        for (const chunk of file.chunks) {
          const chunkPattern = `"id": "${chunk.id}"`;
          const chunkStartPos = preliminaryJsonString.indexOf(chunkPattern, fileStartPos);

          if (chunkStartPos !== -1) {
            // Find the "content" field inside this chunk
            const contentPattern = '"content": "';
            const contentStartSearch = preliminaryJsonString.indexOf(contentPattern, chunkStartPos);

            if (contentStartSearch !== -1) {
              const contentStart = contentStartSearch + contentPattern.length;

              // Find the end of the content (closing quote of the JSON string),
              // honouring backslash escapes so an escaped quote does not end the scan
              let contentEnd = contentStart;
              let inEscape = false;

              for (let i = contentStart; i < preliminaryJsonString.length; i++) {
                const char = preliminaryJsonString[i];
                if (inEscape) {
                  inEscape = false;
                  continue;
                }
                if (char === '\\') {
                  inEscape = true;
                  continue;
                }
                if (char === '"') {
                  contentEnd = i;
                  break;
                }
              }

              // Find the end of the complete chunk object.
              // NOTE(review): '},' can occur inside escaped content or nested
              // objects; contentEnd + 100 is a rough fallback — confirm acceptable.
              const chunkEndPattern = '},';
              const chunkEndSearch = preliminaryJsonString.indexOf(chunkEndPattern, contentEnd);
              const chunkEnd = chunkEndSearch !== -1 ? chunkEndSearch + 1 : contentEnd + 100;

              // Schema format: chunkId -> object with precise offsets
              chunkOffsets[chunk.id] = {
                jsonStart: chunkStartPos,
                jsonEnd: chunkEnd,
                contentStart: contentStart,
                contentEnd: contentEnd,
                filePath: file.path
              };

              validChunks++;
            }
          }
          totalChunks++;
        }
      }
    }

    // STEP 3: Build complete statistics (clamped to 1ms to avoid divide-by-zero)
    const processingTimeMs = Math.max(1, this.stats.endTime - this.stats.startTime);
    const emptyFiles = jsonStructure.files.filter(f => f.chunks.length === 0).length;

    // STEP 4: Build the complete index block according to the schema
    const indexBlock = {
      version: "3.1.0",
      generatedAt: new Date().toISOString(),
      schemaUrl: "https://github.com/skamoll/CodeSummary/schemas/rag-output.json",
      summary: {
        fileCount: jsonStructure.files.length - emptyFiles,
        chunkCount: totalChunks,
        totalBytes: 0, // Updated below
        languages: [...new Set(jsonStructure.files.map(f => f.language))],
        extensions: [...new Set(jsonStructure.files.map(f => f.extension))],
        avgFileSize: 0, // Updated below
        avgChunksPerFile: jsonStructure.files.length > 0 ? Math.round(totalChunks / jsonStructure.files.length) : 0
      },
      chunkOffsets: chunkOffsets,
      fileOffsets: fileOffsets,
      seekInfo: {
        instructions: "Use chunkOffsets[chunkId].contentStart and contentEnd to seek directly to chunk content",
        format: "All offsets are absolute byte positions in this JSON file",
        chunkFormat: "Object with jsonStart, jsonEnd, contentStart, contentEnd (absolute JSON positions)",
        fileFormat: "Array [start, end] for each file in JSON",
        validation: `Generated with ${validChunks} chunks across ${Object.keys(fileOffsets).length} files`
      },
      statistics: {
        processingTimeMs,
        bytesPerSecond: 0, // Updated below
        bytesWritten: 0, // Updated below
        chunksWithValidOffsets: validChunks,
        filesWithValidOffsets: Object.keys(fileOffsets).length,
        totalFiles: jsonStructure.files.length,
        emptyFiles: emptyFiles,
        totalChunksGenerated: totalChunks
      }
    };

    // STEP 5: Build the final JSON with the index and compute final metrics
    const completeStructure = {
      metadata: jsonStructure.metadata,
      files: jsonStructure.files,
      index: indexBlock
    };

    const finalJsonString = JSON.stringify(completeStructure, null, 2);
    const finalBytes = Buffer.byteLength(finalJsonString, 'utf8');
    const bytesPerSecond = Math.round(finalBytes / (processingTimeMs / 1000));

    // Update final metrics in the index
    completeStructure.index.summary.totalBytes = finalBytes;
    completeStructure.index.summary.avgFileSize = jsonStructure.files.length > 0 ?
      Math.round(finalBytes / jsonStructure.files.length) : 0;
    completeStructure.index.statistics.bytesPerSecond = bytesPerSecond;
    completeStructure.index.statistics.bytesWritten = finalBytes;

    // STEP 6: Regenerate the final JSON with updated statistics.
    // NOTE(review): injecting the numbers changes the serialized length, so the
    // recorded byte counts refer to the pre-update stringify — confirm acceptable.
    const finalResult = JSON.stringify(completeStructure, null, 2);

    console.log(`✅ Complete index built: ${Object.keys(fileOffsets).length} files, ${validChunks}/${totalChunks} chunks with precise offsets`);
    console.log(`✅ Final JSON: ${formatFileSize(Buffer.byteLength(finalResult, 'utf8'))}, processing: ${processingTimeMs}ms`);

    return finalResult;
  }
1196
- }
1197
-
1198
- // Specialized Handler Classes
1199
-
1200
/**
 * Base chunking strategy shared by all specialized handlers.
 * Provides a fixed-size line-window fallback and a cheap token estimator.
 */
class BaseHandler {
  /**
   * Default chunk generation: no structural knowledge of the format, so fall
   * back to line-window chunking.
   * @param {string} content - Full file content
   * @param {object} options - Chunking options (must provide maxTokens)
   * @returns {Promise<Array>} Chunk descriptors
   */
  async generateChunks(content, options) {
    return this.chunkByLines(content, options);
  }

  /**
   * Split content into consecutive windows of roughly maxTokens worth of lines.
   * @param {string} content - Full file content
   * @param {object} options - Must provide maxTokens
   * @returns {Array} Chunk descriptors with 1-based inclusive line ranges
   */
  chunkByLines(content, options) {
    const allLines = content.split('\n');
    const windowSize = Math.ceil(options.maxTokens / 20); // ~20 tokens per line estimate
    const chunks = [];

    let start = 0;
    while (start < allLines.length) {
      const end = Math.min(start + windowSize, allLines.length);
      chunks.push({
        content: allLines.slice(start, end).join('\n'),
        lineStart: start + 1,
        lineEnd: end,
        chunkingMethod: 'line-based'
      });
      start += windowSize;
    }

    return chunks;
  }

  /**
   * Rough token estimate (~4 characters per token).
   * @param {string} content - Text content
   * @returns {number} Estimated token count
   */
  estimateTokens(content) {
    return Math.ceil(content.length / 4);
  }
}
1235
-
1236
/**
 * Handler for C-like languages (C, C++, C#, Java): chunks at class/struct/
 * function boundaries when any are found, otherwise falls back to
 * line-window chunking.
 */
class CLikeHandler extends BaseHandler {
  /**
   * @param {string} content - File source
   * @param {object} options - Chunking options (must provide maxTokens)
   * @returns {Promise<Array>} Chunk descriptors
   */
  async generateChunks(content, options) {
    // Fix: removed the unused `chunks` and `lines` locals the old version declared
    const boundaries = this.findCodeBoundaries(content);

    if (boundaries.length > 0) {
      return this.chunkByBoundaries(content, boundaries, options);
    }

    // Fallback to line-based chunking
    return this.chunkByLines(content, options);
  }

  /**
   * Scan for declaration-like lines that open a brace.
   * NOTE(review): the regex is permissive — any `name ... {` line matches, so
   * control statements can be reported as boundaries; confirm intended.
   * @param {string} content - File source
   * @returns {Array<{name, line, type}>} Boundary descriptors (1-based lines)
   */
  findCodeBoundaries(content) {
    const boundaries = [];
    const boundaryRegex = /^(?:class|struct|enum|union|static)?\s*([a-zA-Z_][\w]*)\s*.*{/gm;
    let match;

    while ((match = boundaryRegex.exec(content)) !== null) {
      const lineNumber = content.substring(0, match.index).split('\n').length;
      boundaries.push({
        name: match[1],
        line: lineNumber,
        type: 'function'
      });
    }

    return boundaries;
  }

  /**
   * Cut the content at each boundary line; each segment (plus any trailing
   * remainder) becomes one chunk.
   * @param {string} content - File source
   * @param {Array} boundaries - Output of findCodeBoundaries()
   * @param {object} options - Chunking options (unused here, kept for symmetry)
   * @returns {Array} Chunk descriptors with 1-based line ranges
   */
  chunkByBoundaries(content, boundaries, options) {
    const lines = content.split('\n');
    const chunks = [];
    let currentStart = 0;

    for (const boundary of boundaries) {
      if (currentStart < boundary.line - 1) {
        const chunkLines = lines.slice(currentStart, boundary.line - 1);
        if (chunkLines.length > 0) {
          chunks.push({
            content: chunkLines.join('\n'),
            lineStart: currentStart + 1,
            lineEnd: boundary.line - 1
          });
        }
      }
      currentStart = boundary.line - 1;
    }

    // Add remaining lines after the last boundary
    if (currentStart < lines.length) {
      const chunkLines = lines.slice(currentStart);
      chunks.push({
        content: chunkLines.join('\n'),
        lineStart: currentStart + 1,
        lineEnd: lines.length
      });
    }

    return chunks;
  }
}
1301
-
1302
/**
 * Handler for script languages (JavaScript/TypeScript/Python): prefers
 * semantic chunking at function/class boundaries, with a line-window fallback.
 */
class ScriptHandler extends BaseHandler {
  /**
   * @param {string} content - File source
   * @param {object} options - Chunking options (filePath, extension, maxTokens, ...)
   * @returns {Promise<Array>} Chunk descriptors tagged with chunkingMethod
   */
  async generateChunks(content, options) {
    console.log(`🔍 ScriptHandler processing ${options.extension} file: ${options.filePath}`);

    // ALWAYS try semantic chunking first for script files
    const functions = this.findFunctions(content, options.extension);

    if (functions.length > 0) {
      console.log(`  📝 Found ${functions.length} functions/classes - using semantic chunking`);
      const chunks = this.chunkByFunctions(content, functions, options);

      // Add chunking method metadata
      chunks.forEach(chunk => {
        chunk.chunkingMethod = 'semantic-function';
        chunk.semanticContext = chunk.context || 'code-block';
      });

      return chunks;
    } else {
      console.log(`  ⚠️ No functions found - falling back to line-based chunking`);
      const chunks = this.chunkByLines(content, options);
      chunks.forEach(chunk => {
        chunk.chunkingMethod = 'line-based-fallback';
      });
      return chunks;
    }
  }

  /**
   * Detect function/class-like declaration boundaries for the given extension.
   * @param {string} content - File source
   * @param {string} extension - File extension (e.g. '.js', '.py')
   * @returns {Array<{name, line, endLine, type}>} Matches (JS/TS sorted by line)
   */
  findFunctions(content, extension) {
    const functions = [];

    if (['.js', '.jsx', '.ts', '.tsx'].includes(extension)) {
      // Enhanced JavaScript/TypeScript function detection
      const patterns = [
        // Regular functions: function name() {}
        /(?:^|\n)\s*(?:export\s+)?(?:async\s+)?function\s+([a-zA-Z_$][a-zA-Z0-9_$]*)\s*\([^)]*\)\s*\{/gm,
        // Arrow functions: const name = () => {}
        /(?:^|\n)\s*(?:export\s+)?(?:const|let|var)\s+([a-zA-Z_$][a-zA-Z0-9_$]*)\s*=\s*(?:async\s+)?\([^)]*\)\s*=>\s*\{/gm,
        // Classes: class ClassName {}
        /(?:^|\n)\s*(?:export\s+)?(?:abstract\s+)?class\s+([a-zA-Z_$][a-zA-Z0-9_$]*)(?:\s+extends\s+[a-zA-Z_$][a-zA-Z0-9_$]*)?(?:\s+implements\s+[^{]+)?\s*\{/gm,
        // Interfaces (TypeScript): interface InterfaceName {}
        /(?:^|\n)\s*(?:export\s+)?interface\s+([a-zA-Z_$][a-zA-Z0-9_$]*)(?:\s+extends\s+[^{]+)?\s*\{/gm,
        // Type definitions (TypeScript): type TypeName =
        /(?:^|\n)\s*(?:export\s+)?type\s+([a-zA-Z_$][a-zA-Z0-9_$]*)\s*=/gm,
        // Enum definitions (TypeScript): enum EnumName {}
        /(?:^|\n)\s*(?:export\s+)?(?:const\s+)?enum\s+([a-zA-Z_$][a-zA-Z0-9_$]*)\s*\{/gm,
        // Class methods and object methods.
        // Fix: the negative lookahead excludes control-flow keywords, which the
        // old pattern reported as methods (`if (...) {`, `for (...) {`, etc.).
        /(?:^|\n)\s*(?:public\s+|private\s+|protected\s+|static\s+|async\s+)*(?!(?:if|for|while|switch|catch|return)\s*\()([a-zA-Z_$][a-zA-Z0-9_$]*)\s*\([^)]*\)\s*\{/gm
      ];

      const types = ['function', 'arrow-function', 'class', 'interface', 'type', 'enum', 'method'];

      patterns.forEach((regex, patternIndex) => {
        let match;
        while ((match = regex.exec(content)) !== null) {
          const lineNumber = content.substring(0, match.index).split('\n').length;
          const endLine = this.findFunctionEnd(content, match.index, lineNumber);

          functions.push({
            name: match[1],
            line: lineNumber,
            endLine: endLine,
            type: types[patternIndex],
            startIndex: match.index
          });
        }
      });

      // Sort by line number to process in order
      functions.sort((a, b) => a.line - b.line);

    } else if (extension === '.py') {
      // Python functions/classes
      const pyFuncRegex = /^(?:def\s+(\w+)|class\s+(\w+))/gm;
      let match;

      while ((match = pyFuncRegex.exec(content)) !== null) {
        const lineNumber = content.substring(0, match.index).split('\n').length;
        functions.push({
          name: match[1] || match[2],
          line: lineNumber,
          type: match[2] ? 'class' : 'function'
        });
      }

      // Fix: Python matches previously carried no endLine, so chunkByFunctions()
      // sliced with `undefined` and produced overlapping chunks. Each definition
      // now ends just before the next one; the last one runs to end of file.
      const totalLines = content.split('\n').length;
      functions.forEach((fn, idx) => {
        fn.endLine = idx + 1 < functions.length ? functions[idx + 1].line - 1 : totalLines;
      });
    }

    return functions;
  }

  /**
   * Find the end line of a function/class by matching braces.
   * Skips braces inside '\''/'"'/'`' strings (but not comments or regex literals).
   * @param {string} content - Full content
   * @param {number} startIndex - Start character index of the declaration
   * @param {number} startLine - Start line number (1-based)
   * @returns {number} End line number
   */
  findFunctionEnd(content, startIndex, startLine) {
    let braceCount = 0;
    let inString = false;
    let stringChar = '';
    let i = startIndex;

    // Find the opening brace
    while (i < content.length && content[i] !== '{') {
      if (content[i] === '\n') {
        // If we hit a newline before finding {, it might be an interface/type without body
        if (content.substring(startIndex, i).includes('interface') ||
            content.substring(startIndex, i).includes('type')) {
          // For interfaces/types, find the end of the statement
          while (i < content.length && content[i] !== '\n') i++;
          return content.substring(0, i).split('\n').length;
        }
      }
      i++;
    }

    if (i >= content.length) {
      // No opening brace found, probably a single-line declaration
      return startLine;
    }

    // Count braces to find matching closing brace
    braceCount = 1;
    i++;

    while (i < content.length && braceCount > 0) {
      const char = content[i];

      // Handle strings to avoid counting braces inside strings
      if (!inString) {
        if (char === '"' || char === "'" || char === '`') {
          inString = true;
          stringChar = char;
        } else if (char === '{') {
          braceCount++;
        } else if (char === '}') {
          braceCount--;
        }
      } else {
        if (char === stringChar && content[i - 1] !== '\\') {
          inString = false;
          stringChar = '';
        }
      }

      i++;
    }

    // Return the line number where the function ends
    return content.substring(0, i).split('\n').length;
  }

  /**
   * Build chunks from the detected declarations: header text before the first
   * declaration, one chunk per declaration (split if oversized), and a footer
   * chunk for any trailing content.
   * @param {string} content - File source
   * @param {Array} functions - Output of findFunctions(), sorted by line
   * @param {object} options - Must provide maxTokens
   * @returns {Array} Chunk descriptors
   */
  chunkByFunctions(content, functions, options) {
    const lines = content.split('\n');
    const chunks = [];
    let currentStart = 0;

    for (let i = 0; i < functions.length; i++) {
      const func = functions[i];
      const funcStart = func.line - 1;
      const funcEnd = func.endLine;

      // Add content before function (imports, comments, etc.)
      if (currentStart < funcStart) {
        const preLines = lines.slice(currentStart, funcStart);
        const preContent = preLines.join('\n').trim();

        if (preContent.length > 0) {
          chunks.push({
            content: preContent,
            lineStart: currentStart + 1,
            lineEnd: funcStart,
            context: 'file_header'
          });
        }
      }

      // Add the function itself
      const funcLines = lines.slice(funcStart, funcEnd);
      const funcContent = funcLines.join('\n');

      // Check if function is too large and needs splitting
      if (this.estimateTokens(funcContent) > options.maxTokens * 1.5) {
        // Split large function into smaller chunks
        const subChunks = this.splitLargeFunction(funcContent, func, options);
        chunks.push(...subChunks);
      } else {
        chunks.push({
          content: funcContent,
          lineStart: funcStart + 1,
          lineEnd: funcEnd,
          context: `${func.type}_${func.name}`,
          entityName: func.name,
          entityType: func.type
        });
      }

      currentStart = funcEnd;
    }

    // Add remaining content after last function
    if (currentStart < lines.length) {
      const remainingLines = lines.slice(currentStart);
      const remainingContent = remainingLines.join('\n').trim();

      if (remainingContent.length > 0) {
        chunks.push({
          content: remainingContent,
          lineStart: currentStart + 1,
          lineEnd: lines.length,
          context: 'file_footer'
        });
      }
    }

    return chunks;
  }

  /**
   * Split a large function into smaller semantic chunks
   * @param {string} funcContent - Function content
   * @param {object} func - Function metadata
   * @param {object} options - Chunking options
   * @returns {Array} Array of chunks (flagged isPartial)
   */
  splitLargeFunction(funcContent, func, options) {
    const chunks = [];
    const lines = funcContent.split('\n');
    const maxLines = Math.ceil(options.maxTokens / 20);

    for (let i = 0; i < lines.length; i += maxLines) {
      const chunkLines = lines.slice(i, i + maxLines);
      const chunkContent = chunkLines.join('\n');

      chunks.push({
        content: chunkContent,
        lineStart: func.line + i,
        lineEnd: func.line + Math.min(i + maxLines, lines.length) - 1,
        context: `${func.type}_${func.name}_part${Math.floor(i / maxLines) + 1}`,
        entityName: func.name,
        entityType: func.type,
        isPartial: true
      });
    }

    return chunks;
  }
}
1550
-
1551
/**
 * Handler for markup documents (HTML/XML): splits on the first major
 * structural tag present, otherwise falls back to line-window chunking.
 */
class MarkupHandler extends BaseHandler {
  /**
   * @param {string} content - Markup source
   * @param {object} options - Chunking options
   * @returns {Promise<Array>} Chunk descriptors
   */
  async generateChunks(content, options) {
    const majorTags = ['<body>', '<div', '<section', '<article', '<main'];
    const splitTag = majorTags.find((tag) => content.includes(tag));

    return splitTag
      ? this.chunkByTag(content, splitTag, options)
      : this.chunkByLines(content, options);
  }

  /**
   * Split content at each occurrence of the tag; non-empty segments become
   * chunks (the tag is re-prefixed onto every segment except the first).
   * @param {string} content - Markup source
   * @param {string} tag - Tag text to split on
   * @param {object} options - Chunking options (unused here)
   * @returns {Array} Chunk descriptors (line numbers are approximate)
   */
  chunkByTag(content, tag, options) {
    const segments = content.split(tag);
    const chunks = [];

    segments.forEach((segment, index) => {
      if (segment.trim().length === 0) return;

      const chunkContent = index === 0 ? segment : tag + segment;
      chunks.push({
        content: chunkContent,
        lineStart: 1, // Approximate
        lineEnd: chunkContent.split('\n').length
      });
    });

    return chunks;
  }
}
1583
-
1584
/**
 * Handler for stylesheets (CSS/SCSS): accumulates rules (split on '}') into
 * chunks of roughly maxTokens each.
 */
class StylingHandler extends BaseHandler {
  /**
   * @param {string} content - Stylesheet source
   * @param {object} options - Must provide maxTokens
   * @returns {Promise<Array>} Chunk descriptors
   */
  async generateChunks(content, options) {
    // Split by CSS rules (closing braces)
    const rules = content.split('}');
    const chunks = [];
    let currentChunk = '';
    let lineStart = 1;

    for (let i = 0; i < rules.length; i++) {
      // Fix: only re-append the '}' that split() removed for fragments that
      // were actually followed by one; the old code unconditionally appended,
      // adding a spurious trailing '}' after the final fragment.
      currentChunk += i < rules.length - 1 ? rules[i] + '}' : rules[i];

      if (this.estimateTokens(currentChunk) >= options.maxTokens) {
        const lineEnd = lineStart + currentChunk.split('\n').length - 1;
        chunks.push({
          content: currentChunk.trim(),
          lineStart,
          lineEnd
        });

        currentChunk = '';
        lineStart = lineEnd + 1;
      }
    }

    // Flush whatever is left over
    if (currentChunk.trim().length > 0) {
      chunks.push({
        content: currentChunk.trim(),
        lineStart,
        lineEnd: lineStart + currentChunk.split('\n').length - 1
      });
    }

    return chunks;
  }

  // Fix: removed the estimateTokens override — it duplicated
  // BaseHandler.estimateTokens verbatim; the inherited method is used instead.
}
1623
-
1624
/**
 * Handler for configuration and plain-text formats: dispatches to a
 * format-specific chunker for JSON, YAML, and Markdown, otherwise falls back
 * to line-window chunking.
 */
class ConfigPlainHandler extends BaseHandler {
  /**
   * @param {string} content - File source
   * @param {object} options - Chunking options (extension, maxTokens, ...)
   * @returns {Promise<Array>} Chunk descriptors
   */
  async generateChunks(content, options) {
    if (options.extension === '.json') {
      return this.chunkJson(content, options);
    } else if (['.yaml', '.yml'].includes(options.extension)) {
      return this.chunkYaml(content, options);
    } else if (options.extension === '.md') {
      return this.chunkMarkdown(content, options);
    }

    return this.chunkByLines(content, options);
  }

  /**
   * Chunk a JSON document by its top-level keys.
   * NOTE(review): lineStart/lineEnd describe the re-serialized section, not
   * positions in the original file — confirm consumers expect this.
   * @param {string} content - JSON source
   * @param {object} options - Chunking options
   * @returns {Array} One chunk per top-level key, or a line-based fallback
   */
  chunkJson(content, options) {
    try {
      const parsed = JSON.parse(content);

      if (typeof parsed === 'object' && !Array.isArray(parsed)) {
        // Split by top-level keys
        const chunks = [];
        const keys = Object.keys(parsed);

        for (const key of keys) {
          const section = { [key]: parsed[key] };
          chunks.push({
            content: JSON.stringify(section, null, 2),
            lineStart: 1,
            lineEnd: JSON.stringify(section, null, 2).split('\n').length,
            context: `json_key_${key}`
          });
        }

        return chunks;
      }
    } catch (error) {
      // Fall back to line-based chunking if JSON is invalid (deliberate swallow)
    }

    return this.chunkByLines(content, options);
  }

  /**
   * Chunk a Markdown document by its headers; each chunk contains a header
   * plus its section body up to the next header.
   * Fix: the previous implementation accumulated only the header lines
   * themselves, so all section body text between headers was dropped.
   * @param {string} content - Markdown source
   * @param {object} options - Chunking options
   * @returns {Array} Section chunks, or a line-based fallback when no headers exist
   */
  chunkMarkdown(content, options) {
    const headerRegex = /^#+\s+(.+)/gm;
    const headerIndices = [];
    let match;

    while ((match = headerRegex.exec(content)) !== null) {
      headerIndices.push(match.index);
    }

    if (headerIndices.length === 0) {
      return this.chunkByLines(content, options);
    }

    // Any preamble before the first header becomes its own chunk
    const starts = headerIndices[0] > 0 ? [0, ...headerIndices] : headerIndices;
    const chunks = [];

    for (let i = 0; i < starts.length; i++) {
      const start = starts[i];
      const end = i + 1 < starts.length ? starts[i + 1] : content.length;
      const sectionText = content.slice(start, end).trim();
      if (sectionText.length === 0) continue;

      // 1-based line of the section start in the original document
      const lineStart = content.slice(0, start).split('\n').length;
      chunks.push({
        content: sectionText,
        lineStart,
        lineEnd: lineStart + sectionText.split('\n').length - 1
      });
    }

    return chunks.length > 0 ? chunks : this.chunkByLines(content, options);
  }

  /**
   * Simple YAML chunking by document separators, top-level keys, and the
   * token budget.
   * @param {string} content - YAML source
   * @param {object} options - Must provide maxTokens
   * @returns {Array} Chunk descriptors, or a line-based fallback
   */
  chunkYaml(content, options) {
    const lines = content.split('\n');
    const chunks = [];
    let currentChunk = '';
    let lineStart = 1;

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      currentChunk += line + '\n';

      // Check if we hit a top-level key or document separator
      if (line.match(/^[a-zA-Z_][\w]*:/) || line === '---' || this.estimateTokens(currentChunk) >= options.maxTokens) {
        if (currentChunk.trim().length > 0) {
          chunks.push({
            content: currentChunk.trim(),
            lineStart,
            lineEnd: i + 1
          });

          lineStart = i + 2;
          currentChunk = '';
        }
      }
    }

    if (currentChunk.trim().length > 0) {
      chunks.push({
        content: currentChunk.trim(),
        lineStart,
        lineEnd: lines.length
      });
    }

    return chunks.length > 0 ? chunks : this.chunkByLines(content, options);
  }

  // Fix: removed the estimateTokens override — it duplicated
  // BaseHandler.estimateTokens verbatim; the inherited method is used instead.
}
1739
-
1740
// Module-level singleton: all importers share this one RagGenerator instance
// (and therefore its accumulated stats and errors state).
export default new RagGenerator();