@arela/uploader 0.2.0 → 0.2.1

package/src/index.js CHANGED
@@ -43,6 +43,7 @@ const sources = process.env.UPLOAD_SOURCES?.split('|')
  .filter(Boolean);

  // RFC configuration for upload
+ console.log('🔧 Configured RFCs for upload:', process.env.UPLOAD_RFCS);
  const uploadRfcs = process.env.UPLOAD_RFCS?.split('|')
  .map((s) => s.trim())
  .filter(Boolean);
@@ -156,8 +157,8 @@ const checkCredentials = async (forceSupabase = false) => {
  if (!supabaseUrl || !supabaseKey || !bucket) {
  console.error(
  '⚠️ Missing credentials. Please set either:\n' +
- ' - ARELA_API_URL and ARELA_API_TOKEN for API mode, or\n' +
- ' - SUPABASE_URL, SUPABASE_KEY, and SUPABASE_BUCKET for direct mode',
+ ' - ARELA_API_URL and ARELA_API_TOKEN for API mode, or\n' +
+ ' - SUPABASE_URL, SUPABASE_KEY, and SUPABASE_BUCKET for direct mode',
  );
  process.exit(1);
  }
@@ -179,13 +180,107 @@ const checkCredentials = async (forceSupabase = false) => {
  };

  const logFilePath = path.resolve(process.cwd(), 'arela-upload.log');
+
+ /**
+ * OPTIMIZED: Log buffer to reduce I/O operations
+ */
+ let logBuffer = [];
+ const LOG_BUFFER_SIZE = 100; // Flush every 100 log entries
+ let lastFlushTime = Date.now();
+ const LOG_FLUSH_INTERVAL = 5000; // Flush every 5 seconds
+
+ const flushLogBuffer = () => {
+ if (logBuffer.length === 0) return;
+
+ try {
+ const logContent = logBuffer.join('\n') + '\n';
+ fs.appendFileSync(logFilePath, logContent);
+ logBuffer = [];
+ lastFlushTime = Date.now();
+ } catch (error) {
+ console.error(`❌ Error writing to log file: ${error.code} | ${error.message} | path: ${logFilePath}`);
+ }
+ };
+
  const writeLog = (message) => {
  try {
  const timestamp = new Date().toISOString();
- fs.appendFileSync(logFilePath, `[${timestamp}] ${message}\n`);
+ logBuffer.push(`[${timestamp}] ${message}`);
+
+ // Flush if buffer is full or enough time has passed
+ const now = Date.now();
+ if (logBuffer.length >= LOG_BUFFER_SIZE || (now - lastFlushTime) >= LOG_FLUSH_INTERVAL) {
+ flushLogBuffer();
+ }
  } catch (error) {
- console.error(`❌ Error writing to log file: ${error.code} | ${error.message} | path: ${logFilePath}`);
+ console.error(`❌ Error buffering log message: ${error.message}`);
+ }
+ };
+
+ // Ensure logs are flushed on process exit
+ process.on('exit', flushLogBuffer);
+ process.on('SIGINT', () => {
+ flushLogBuffer();
+ process.exit(0);
+ });
+ process.on('SIGTERM', () => {
+ flushLogBuffer();
+ process.exit(0);
+ });
+
+ /**
+ * OPTIMIZED: Conditional logging to reduce console overhead
+ */
+ const VERBOSE_LOGGING = process.env.VERBOSE_LOGGING === 'true';
+ const BATCH_DELAY = parseInt(process.env.BATCH_DELAY) || 100; // Configurable delay between batches
+ const PROGRESS_UPDATE_INTERVAL = parseInt(process.env.PROGRESS_UPDATE_INTERVAL) || 10; // Update progress every N items
+
+ const logVerbose = (message) => {
+ if (VERBOSE_LOGGING) {
+ console.log(message);
+ }
+ };
+ const batchReadFileStats = (filePaths) => {
+ const results = [];
+
+ for (const filePath of filePaths) {
+ try {
+ const stats = fs.statSync(filePath);
+ results.push({ path: filePath, stats, error: null });
+ } catch (error) {
+ results.push({ path: filePath, stats: null, error: error.message });
+ }
+ }
+
+ return results;
+ };
+
+ /**
+ * OPTIMIZED: Cache for year/pedimento detection results to avoid redundant parsing
+ */
+ const pathDetectionCache = new Map();
+
+ /**
+ * OPTIMIZED: Clear the path detection cache (useful for testing or long-running processes)
+ */
+ const clearPathDetectionCache = () => {
+ pathDetectionCache.clear();
+ };
+
+ /**
+ * OPTIMIZED: Get detection results with caching
+ */
+ const getCachedPathDetection = (filePath, basePath) => {
+ const cacheKey = `${filePath}|${basePath}`;
+
+ if (pathDetectionCache.has(cacheKey)) {
+ return pathDetectionCache.get(cacheKey);
  }
+
+ const detection = extractYearAndPedimentoFromPath(filePath, basePath);
+ pathDetectionCache.set(cacheKey, detection);
+
+ return detection;
  };

  /**
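The buffered logger above trades one `appendFileSync` per log line for one write per flush, bounded by both entry count and elapsed time. A minimal standalone sketch of the same flush policy, with illustrative names rather than the package's exports:

```js
// Sketch of a size/time-bounded log buffer, assuming any writable path.
const fs = require('fs');

const LOG_PATH = 'example.log'; // assumption: illustrative path
const MAX_ENTRIES = 100;        // size-based flush threshold
const MAX_AGE_MS = 5000;        // time-based flush threshold

let buffer = [];
let lastFlush = Date.now();

const flush = () => {
  if (buffer.length === 0) return;
  fs.appendFileSync(LOG_PATH, buffer.join('\n') + '\n'); // one write per flush
  buffer = [];
  lastFlush = Date.now();
};

const log = (message) => {
  buffer.push(`[${new Date().toISOString()}] ${message}`);
  if (buffer.length >= MAX_ENTRIES || Date.now() - lastFlush >= MAX_AGE_MS) flush();
};

// Mirror the diff's 'exit' handler so buffered entries are not lost.
process.on('exit', flush);
```

The extra SIGINT/SIGTERM handlers in the diff matter because Node's 'exit' event does not fire when an unhandled signal terminates the process.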
@@ -276,23 +371,49 @@ const extractYearAndPedimentoFromPath = (filePath, basePath) => {
  }
  };

+ /**
+ * OPTIMIZED: Get processed paths with caching and buffered log reading
+ */
+ let processedPathsCache = null;
+ let lastLogModTime = 0;
+
  const getProcessedPaths = () => {
- const processed = new Set();
- const lines = fs.existsSync(logFilePath)
- ? fs.readFileSync(logFilePath, 'utf-8').split('\n')
- : [];
-
- for (const line of lines) {
- const match = line.match(/(SUCCESS|SKIPPED): .*? -> (.+)/);
- if (match) {
- const [, , path] = match;
+ try {
+ // Check if log file exists
+ if (!fs.existsSync(logFilePath)) {
+ return new Set();
+ }
+
+ // Check if cache is still valid
+ const logStats = fs.statSync(logFilePath);
+ if (processedPathsCache && logStats.mtime.getTime() === lastLogModTime) {
+ return processedPathsCache;
+ }
+
+ // Read and parse log file
+ const processed = new Set();
+ const content = fs.readFileSync(logFilePath, 'utf-8');
+
+ // Use more efficient regex with global flag
+ const regex = /(SUCCESS|SKIPPED): .*? -> (.+)/g;
+ let match;
+
+ while ((match = regex.exec(content)) !== null) {
+ const path = match[2];
  if (path) {
  processed.add(path.trim());
  }
  }
- }

- return processed;
+ // Update cache
+ processedPathsCache = processed;
+ lastLogModTime = logStats.mtime.getTime();
+
+ return processed;
+ } catch (error) {
+ console.error(`⚠️ Error reading processed paths: ${error.message}`);
+ return new Set();
+ }
  };

  /**
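`getProcessedPaths` now re-parses the log only when the file's modification time changes. The invalidation pattern in isolation — a sketch with illustrative names, not the package's API:

```js
// mtime-keyed read-through cache: re-parse only when the file changes.
const fs = require('fs');

let cached = null;
let cachedMtime = 0;

const readOnce = (file, parse) => {
  if (!fs.existsSync(file)) return new Set();
  const mtime = fs.statSync(file).mtime.getTime();
  if (cached && mtime === cachedMtime) return cached; // file unchanged
  cached = parse(fs.readFileSync(file, 'utf-8'));
  cachedMtime = mtime;
  return cached;
};

// Example parser matching the diff's SUCCESS/SKIPPED log format.
const parsePaths = (content) => {
  const out = new Set();
  const re = /(SUCCESS|SKIPPED): .*? -> (.+)/g;
  for (let m; (m = re.exec(content)) !== null; ) out.add(m[2].trim());
  return out;
};
```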
@@ -314,6 +435,7 @@ const uploadToApi = async (files, options) => {

  // New feature: custom folder structure
  let combinedStructure = null;
+ let cachedDetection = null; // Cache detection result to avoid redundant calls

  if (
  options.folderStructure &&
@@ -322,12 +444,10 @@ const uploadToApi = async (files, options) => {
  ) {
  // Combine custom folder structure with auto-detection
  const firstFile = files[0];
- const detection = extractYearAndPedimentoFromPath(
- firstFile.path,
- process.cwd(),
- );
- if (detection.detected) {
- const autoStructure = `${detection.year}/${detection.pedimento}`;
+ cachedDetection = getCachedPathDetection(firstFile.path, process.cwd());
+
+ if (cachedDetection.detected) {
+ const autoStructure = `${cachedDetection.year}/${cachedDetection.pedimento}`;
  combinedStructure = `${options.folderStructure}/${autoStructure}`;
  formData.append('folderStructure', combinedStructure);
  console.log(
@@ -346,12 +466,10 @@ const uploadToApi = async (files, options) => {
  } else if (options.autoDetectStructure && files.length > 0) {
  // Try to auto-detect from the first file if no explicit structure is provided
  const firstFile = files[0];
- const detection = extractYearAndPedimentoFromPath(
- firstFile.path,
- process.cwd(),
- );
- if (detection.detected) {
- const autoStructure = `${detection.year}/${detection.pedimento}`;
+ cachedDetection = getCachedPathDetection(firstFile.path, process.cwd());
+
+ if (cachedDetection.detected) {
+ const autoStructure = `${cachedDetection.year}/${cachedDetection.pedimento}`;
  formData.append('folderStructure', autoStructure);
  }
  }
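Both call sites above now share a single memoized parse per `filePath|basePath` pair. The pattern reduced to its core — the wrapper shape is illustrative, while `extractYearAndPedimentoFromPath` is the package's own parser:

```js
// Memoize a pure two-argument function under a composite string key,
// as getCachedPathDetection does for extractYearAndPedimentoFromPath.
const memoize2 = (fn) => {
  const cache = new Map();
  return (a, b) => {
    const key = `${a}|${b}`;
    if (!cache.has(key)) cache.set(key, fn(a, b));
    return cache.get(key);
  };
};

// Usage sketch: const detect = memoize2(extractYearAndPedimentoFromPath);
```

The `|` separator assumes neither argument contains that character; for filesystem paths fed to this parser that holds in practice, but it is worth noting when reusing the pattern.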
@@ -420,9 +538,10 @@ const insertStatsToUploaderTable = async (files, options) => {
  const records = [];

  for (const file of files) {
- const stats = fs.statSync(file.path);
+ // OPTIMIZED: Use pre-computed stats if available, otherwise call fs.statSync
+ const stats = file.stats || fs.statSync(file.path);
  const originalPath = options.clientPath || file.path;
-
+
  // Check if record already exists
  const { data: existingRecords, error: checkError } = await supabase
  .from('uploader')
@@ -439,7 +558,7 @@ const insertStatsToUploaderTable = async (files, options) => {
  console.log(`⏭️ Skipping duplicate: ${path.basename(file.path)}`);
  continue;
  }
-
+
  // Initialize record with basic file stats
  const record = {
  document_type: null,
@@ -457,17 +576,17 @@ const insertStatsToUploaderTable = async (files, options) => {
  if (detectionService.isSupportedFileType(file.path)) {
  try {
  const detection = await detectionService.detectFile(file.path);
-
+
  if (detection.detectedType) {
  record.document_type = detection.detectedType;
  record.num_pedimento = detection.detectedPedimento;
  record.status = 'detected';
-
+
  // Set arela_path for pedimento_simplificado documents
  if (detection.arelaPath) {
  record.arela_path = detection.arelaPath;
  }
-
+
  // Extract RFC from fields if available
  const rfcField = detection.fields.find(f => f.name === 'rfc' && f.found);
  if (rfcField) {
@@ -498,7 +617,7 @@ const insertStatsToUploaderTable = async (files, options) => {
  }

  console.log(`💾 Inserting ${records.length} new records into uploader table...`);
-
+
  const { data, error } = await supabase
  .from('uploader')
  .insert(records)
@@ -511,6 +630,266 @@ const insertStatsToUploaderTable = async (files, options) => {
  return data;
  };

+ /**
+ * OPTIMIZED: Insert ONLY file stats into uploader table (Phase 1)
+ * No file reading, no detection - just filesystem metadata
+ * Returns summary statistics instead of full records for better performance
+ */
+ const insertStatsOnlyToUploaderTable = async (files, options) => {
+ if (!supabase) {
+ throw new Error('Supabase client not initialized. Stats mode requires Supabase connection.');
+ }
+
+ const batchSize = 1000; // Large batch size for performance
+ const allRecords = [];
+
+ // Prepare all file stats data first - OPTIMIZED to use pre-computed stats
+ console.log('📊 Collecting filesystem stats...');
+ for (const file of files) {
+ try {
+ // Use pre-computed stats if available, otherwise call fs.statSync
+ const stats = file.stats || fs.statSync(file.path);
+ const originalPath = options.clientPath || file.path;
+ const fileExtension = path.extname(file.path).toLowerCase().replace('.', '');
+
+ const record = {
+ document_type: null,
+ size: stats.size,
+ num_pedimento: null,
+ filename: file.originalName || path.basename(file.path),
+ original_path: originalPath,
+ arela_path: null,
+ status: 'fs-stats',
+ rfc: null,
+ message: null,
+ file_extension: fileExtension,
+ created_at: new Date().toISOString(),
+ modified_at: stats.mtime.toISOString()
+ };
+
+ allRecords.push(record);
+ } catch (error) {
+ console.error(`❌ Error reading stats for ${file.path}:`, error.message);
+ }
+ }
+
+ if (allRecords.length === 0) {
+ console.log('📝 No file stats to insert');
+ return { totalInserted: 0, totalSkipped: 0, totalProcessed: 0 };
+ }
+
+ console.log(`💾 Bulk inserting ${allRecords.length} file stats in batches of ${batchSize}...`);
+
+ let totalInserted = 0;
+ let totalSkipped = 0;
+
+ // Process in batches for optimal performance
+ for (let i = 0; i < allRecords.length; i += batchSize) {
+ const batch = allRecords.slice(i, i + batchSize);
+
+ try {
+ // OPTIMIZED: Use upsert without select to avoid unnecessary data transfer
+ const { error, count } = await supabase
+ .from('uploader')
+ .upsert(batch, {
+ onConflict: 'original_path',
+ ignoreDuplicates: false,
+ count: 'exact'
+ });
+
+ if (error) {
+ console.error(`❌ Error inserting batch ${Math.floor(i / batchSize) + 1}:`, error.message);
+ continue;
+ }
+
+ // For upsert operations, we can't easily distinguish between inserts and updates
+ // from the count alone, but we can estimate based on the assumption that most
+ // operations in --stats-only mode are likely new inserts
+ const batchProcessed = batch.length;
+
+ // Since we're using upsert with ignoreDuplicates: false, the count represents
+ // the actual number of rows affected (both inserts and updates)
+ const affected = count || batchProcessed;
+
+ // For simplicity and performance, we'll assume most are new inserts in stats-only mode
+ // This is reasonable since stats-only is typically run on new file sets
+ totalInserted += affected;
+
+ console.log(`✅ Batch ${Math.floor(i / batchSize) + 1}: ${affected} rows processed`);
+ } catch (error) {
+ console.error(`❌ Unexpected error in batch ${Math.floor(i / batchSize) + 1}:`, error.message);
+ }
+ }
+
+ // Calculate skipped as difference between total records and inserted
+ totalSkipped = allRecords.length - totalInserted;
+
+ console.log(`📊 Phase 1 Summary: ${totalInserted} records processed, estimated ${totalSkipped} were updates`);
+
+ return {
+ totalInserted,
+ totalSkipped,
+ totalProcessed: allRecords.length
+ };
+ };
+
+ /**
+ * PHASE 2: Process PDF files for pedimento-simplificado detection
+ * Only processes files with status 'fs-stats' and file_extension 'pdf'
+ */
+ const detectPedimentosInDatabase = async (options = {}) => {
+ if (!supabase) {
+ throw new Error('Supabase client not initialized.');
+ }
+
+ console.log('🔍 Phase 2: Starting PDF detection for pedimento-simplificado documents...');
+
+ // Get all PDF files that need detection (status = 'fs-stats' and extension = 'pdf')
+ let allPdfRecords = [];
+ let hasMore = true;
+ let offset = 0;
+ const queryBatchSize = 1000;
+
+ console.log('📥 Fetching PDF files from database...');
+
+ while (hasMore) {
+ const { data: batch, error: queryError } = await supabase
+ .from('uploader')
+ .select('id, original_path, filename, file_extension, status')
+ .eq('status', 'fs-stats')
+ .eq('file_extension', 'pdf')
+ .ilike('filename', '%simp%')
+ .range(offset, offset + queryBatchSize - 1);
+
+ if (queryError) {
+ throw new Error(`Failed to fetch PDF records: ${queryError.message}`);
+ }
+
+ if (!batch || batch.length === 0) {
+ hasMore = false;
+ } else {
+ allPdfRecords.push(...batch);
+ offset += queryBatchSize;
+ console.log(`📄 Fetched ${batch.length} PDF records (total: ${allPdfRecords.length})`);
+ }
+ }
+
+ if (allPdfRecords.length === 0) {
+ console.log('📝 No PDF files found for detection');
+ return { detectedCount: 0, processedCount: 0, errorCount: 0 };
+ }
+
+ console.log(`🔍 Processing ${allPdfRecords.length} PDF files for detection...`);
+
+ const detectionService = new FileDetectionService();
+ const batchSize = parseInt(options.batchSize) || 10; // Smaller batches for file I/O
+ let totalDetected = 0;
+ let totalProcessed = 0;
+ let totalErrors = 0;
+
+ // Create progress bar
+ const progressBar = new cliProgress.SingleBar({
+ format: '🔍 PDF Detection |{bar}| {percentage}% | {value}/{total} | Detected: {detected} | Errors: {errors}',
+ barCompleteChar: '█',
+ barIncompleteChar: '░',
+ hideCursor: true,
+ });
+
+ progressBar.start(allPdfRecords.length, 0, { detected: 0, errors: 0 });
+
+ // Process files in smaller batches to avoid overwhelming the system
+ for (let i = 0; i < allPdfRecords.length; i += batchSize) {
+ const batch = allPdfRecords.slice(i, i + batchSize);
+ const updatePromises = [];
+
+ for (const record of batch) {
+ try {
+ // Check if file still exists
+ if (!fs.existsSync(record.original_path)) {
+ updatePromises.push(
+ supabase
+ .from('uploader')
+ .update({
+ status: 'file-not-found',
+ message: 'File no longer exists at original path'
+ })
+ .eq('id', record.id)
+ );
+ totalErrors++;
+ continue;
+ }
+
+ // Perform detection
+ const detection = await detectionService.detectFile(record.original_path);
+ totalProcessed++;
+
+ const updateData = {
+ status: detection.detectedType ? 'detected' : 'not-detected',
+ document_type: detection.detectedType,
+ num_pedimento: detection.detectedPedimento,
+ arela_path: detection.arelaPath,
+ message: detection.error || null
+ };
+
+ // Extract RFC from fields if available
+ if (detection.fields) {
+ const rfcField = detection.fields.find(f => f.name === 'rfc' && f.found);
+ if (rfcField) {
+ updateData.rfc = rfcField.value;
+ }
+ }
+
+ if (detection.detectedType) {
+ totalDetected++;
+ }
+
+ updatePromises.push(
+ supabase
+ .from('uploader')
+ .update(updateData)
+ .eq('id', record.id)
+ );
+
+ } catch (error) {
+ console.error(`❌ Error detecting ${record.filename}:`, error.message);
+ totalErrors++;
+
+ updatePromises.push(
+ supabase
+ .from('uploader')
+ .update({
+ status: 'detection-error',
+ message: error.message
+ })
+ .eq('id', record.id)
+ );
+ }
+ }
+
+ // Execute all updates in parallel for this batch
+ try {
+ await Promise.all(updatePromises);
+ } catch (error) {
+ console.error(`❌ Error updating batch:`, error.message);
+ }
+
+ // Update progress
+ progressBar.update(Math.min(i + batchSize, allPdfRecords.length), {
+ detected: totalDetected,
+ errors: totalErrors
+ });
+ }
+
+ progressBar.stop();
+
+ console.log(`📊 Phase 2 Summary: ${totalDetected} detected, ${totalProcessed} processed, ${totalErrors} errors`);
+ return {
+ detectedCount: totalDetected,
+ processedCount: totalProcessed,
+ errorCount: totalErrors
+ };
+ };
+
  const processFilesInBatches = async (
  files,
  batchSize,
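The two new phases lean on two @supabase/supabase-js patterns: a bulk upsert keyed on `original_path` that requests only an affected-row count, and offset pagination with `.range()` until a short page arrives. Stripped-down sketches under those assumptions — table and column names come from the diff, the helper names are illustrative:

```js
const { createClient } = require('@supabase/supabase-js');

const supabase = createClient(process.env.SUPABASE_URL, process.env.SUPABASE_KEY);

// Phase 1 style: upsert a batch and ask PostgREST for an exact count
// instead of transferring the inserted rows back.
async function upsertBatch(records) {
  const { error, count } = await supabase
    .from('uploader')
    .upsert(records, { onConflict: 'original_path', count: 'exact' });
  if (error) throw error;
  return count; // rows affected: inserts and updates are not distinguished
}

// Phase 2 style: page through matching rows 1000 at a time.
async function fetchPending(pageSize = 1000) {
  const rows = [];
  for (let offset = 0; ; offset += pageSize) {
    const { data, error } = await supabase
      .from('uploader')
      .select('id, original_path')
      .eq('status', 'fs-stats')
      .range(offset, offset + pageSize - 1);
    if (error) throw error;
    rows.push(...data);
    if (data.length < pageSize) break; // short page: no more rows
  }
  return rows;
}
```

That count ambiguity is exactly why the diff's Phase 1 summary can only estimate how many affected rows were updates rather than fresh inserts.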
@@ -528,7 +907,7 @@ const processFilesInBatches = async (

  const messageBuffer = [];

- const progressBarFormat = options.statsOnly
+ const progressBarFormat = options.statsOnly
  ? '📊 Processing [{bar}] {percentage}% | {value}/{total} files | Stats: {successCount} | Errors: {failureCount} | Duplicates: {skippedCount}'
  : '📂 Processing [{bar}] {percentage}% | {value}/{total} files | Success: {successCount} | Errors: {failureCount} | Skipped: {skippedCount}';

@@ -546,98 +925,59 @@ const processFilesInBatches = async (
  });

  if (options.statsOnly) {
- // Stats-only mode - Read file stats and insert to uploader table
- console.log('📊 Processing files in stats-only mode...');
-
- let totalDetected = 0;
- let totalNotDetected = 0;
- let totalUnsupported = 0;
- let totalDetectionErrors = 0;
-
+ // OPTIMIZED Stats-only mode - Only read filesystem stats, no file detection
+ console.log('📊 Phase 1: Processing files in optimized stats-only mode (no detection)...');
+
  for (let i = 0; i < files.length; i += batchSize) {
  const batch = files.slice(i, i + batchSize);
-
- const statsFiles = batch.map((file) => {
- const originalFileName = path.basename(file);
-
- return {
- path: file,
- originalName: originalFileName,
- };
- });

- try {
- const insertedRecords = await insertStatsToUploaderTable(statsFiles, options);
- const actualInserted = insertedRecords.length;
- const skippedDuplicates = statsFiles.length - actualInserted;
-
- totalUploaded += actualInserted;
- totalSkipped += skippedDuplicates;
-
- // Count detection results from inserted records
- insertedRecords.forEach(record => {
- switch (record.status) {
- case 'detected':
- totalDetected++;
- break;
- case 'not-detected':
- totalNotDetected++;
- break;
- case 'unsupported':
- totalUnsupported++;
- break;
- case 'detection-error':
- totalDetectionErrors++;
- break;
- }
- });
-
- statsFiles.forEach((file) => {
- const wasInserted = insertedRecords.some(record =>
- record.original_path === (options.clientPath || file.path)
- );
- if (wasInserted) {
- writeLog(`STATS: ${file.path} -> uploader table`);
- } else {
- writeLog(`DUPLICATE: ${file.path} -> already exists in uploader table`);
- }
+ // OPTIMIZED: Batch read file stats to reduce I/O overhead
+ const fileStatsResults = batchReadFileStats(batch);
+ const statsFiles = fileStatsResults
+ .filter(result => result.stats !== null) // Only include files with valid stats
+ .map((result) => {
+ const originalFileName = path.basename(result.path);
+
+ return {
+ path: result.path,
+ originalName: originalFileName,
+ stats: result.stats, // Pass pre-computed stats to avoid redundant calls
+ };
  });
-
- if (actualInserted > 0) {
- console.log(`📈 Inserted ${actualInserted} stats records`);
- }
- if (skippedDuplicates > 0) {
- console.log(`⏭️ Skipped ${skippedDuplicates} duplicates`);
- }
- if (options.detect !== false) {
- console.log(` 🔍 Detected: ${totalDetected}, Not detected: ${totalNotDetected}, Unsupported: ${totalUnsupported}, Errors: ${totalDetectionErrors}`);
- }
-
- } catch (error) {
- totalErrors += statsFiles.length;
- statsFiles.forEach((file) => {
- writeLog(`ERROR: ${file.path}: ${error.message}`);
- messageBuffer.push(`❌ ${file.originalName}: ${error.message}`);
+
+ // Log any files that couldn't be read
+ const failedFiles = fileStatsResults.filter(result => result.error !== null);
+ if (failedFiles.length > 0) {
+ console.log(`⚠️ Could not read stats for ${failedFiles.length} files in batch`);
+ failedFiles.forEach(failed => {
+ console.error(` ❌ ${failed.path}: ${failed.error}`);
  });
  }

- progressBar.update(i + batch.length, {
- successCount: totalUploaded,
- failureCount: totalErrors,
- skippedCount: totalSkipped,
- });
+ try {
+ const result = await insertStatsOnlyToUploaderTable(statsFiles, options);

- if (i + batchSize < files.length) {
- await new Promise((resolve) => setTimeout(resolve, 200));
+ totalUploaded += result.totalInserted;
+ totalSkipped += result.totalSkipped;
+ totalErrors += failedFiles.length; // Count failed file reads as errors
+
+ progressBar.update(Math.min(i + batch.length, files.length), {
+ successCount: totalUploaded,
+ failureCount: totalErrors,
+ skippedCount: totalSkipped,
+ });
+
+ } catch (error) {
+ console.error(`❌ Error processing stats batch:`, error.message);
+ totalErrors += batch.length;
+
+ progressBar.update(Math.min(i + batch.length, files.length), {
+ successCount: totalUploaded,
+ failureCount: totalErrors,
+ skippedCount: totalSkipped,
+ });
  }
  }
-
- // Store detection stats for summary
- totalDetected = totalDetected || 0;
- totalNotDetected = totalNotDetected || 0;
- totalUnsupported = totalUnsupported || 0;
- totalDetectionErrors = totalDetectionErrors || 0;
-
  } else if (apiMode && !options.forceSupabase) {
  // API Mode - Process in batches
  for (let i = 0; i < files.length; i += batchSize) {
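The stats-only loop drives a cli-progress SingleBar with a custom payload (`successCount`/`failureCount`/`skippedCount`). The cli-progress calls below are the library's real API; the workload is synthetic:

```js
const cliProgress = require('cli-progress');

const total = 250; // synthetic file count for illustration
const bar = new cliProgress.SingleBar({
  format: '📊 Processing [{bar}] {percentage}% | {value}/{total} files | Stats: {successCount}',
  barCompleteChar: '█',
  barIncompleteChar: '░',
  hideCursor: true,
});

bar.start(total, 0, { successCount: 0 });
let done = 0;
while (done < total) {
  done = Math.min(done + 50, total); // pretend a batch of 50 completed
  bar.update(done, { successCount: done });
}
bar.stop();
```

One loose end worth noting: `PROGRESS_UPDATE_INTERVAL` is introduced earlier in this release, but in the hunks shown here the bar is still updated once per batch rather than every N items.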
@@ -661,7 +1001,8 @@ const processFilesInBatches = async (

  // Handle combined folder structure + auto-detection
  if (options.folderStructure && options.autoDetectStructure) {
- const detection = extractYearAndPedimentoFromPath(file, basePath);
+ // OPTIMIZED: Use cached detection to avoid redundant parsing
+ const detection = getCachedPathDetection(file, basePath);
  if (detection.detected) {
  const autoStructure = `${detection.year}/${detection.pedimento}`;
  const combinedStructure = `${options.folderStructure}/${autoStructure}`;
@@ -669,7 +1010,7 @@ const processFilesInBatches = async (
  combinedStructure,
  sanitizedFileName,
  );
- console.log(
+ logVerbose(
  `📁 Combined structure: ${options.folderStructure}/${autoStructure} for ${originalFileName} -> ${uploadPath}`,
  );
  } else {
@@ -678,7 +1019,7 @@ const processFilesInBatches = async (
  options.folderStructure,
  sanitizedFileName,
  );
- console.log(
+ logVerbose(
  `📁 Custom structure (auto-detection failed): ${uploadPath}`,
  );
  }
@@ -688,10 +1029,10 @@ const processFilesInBatches = async (
  options.folderStructure,
  sanitizedFileName,
  );
- console.log(`📁 Custom structure: ${uploadPath}`);
+ logVerbose(`📁 Custom structure: ${uploadPath}`);
  } else if (options.autoDetectStructure) {
- // Auto-detect structure from path if enabled
- const detection = extractYearAndPedimentoFromPath(file, basePath);
+ // Auto-detect structure from path if enabled - OPTIMIZED: Use cached detection
+ const detection = getCachedPathDetection(file, basePath);
  if (detection.detected) {
  const autoStructure = `${detection.year}/${detection.pedimento}`;
  uploadPath = path.posix.join(autoStructure, sanitizedFileName);
@@ -737,10 +1078,8 @@ const processFilesInBatches = async (

  if (!clientPath && apiFiles.length > 0) {
  const firstFile = apiFiles[0];
- const detection = extractYearAndPedimentoFromPath(
- firstFile.path,
- basePath,
- );
+ // OPTIMIZED: Use cached detection to avoid redundant parsing
+ const detection = getCachedPathDetection(firstFile.path, basePath);
  if (detection.detected) {
  // clientPath = `${detection.year}/${detection.pedimento}/`;
  clientPath = path
@@ -796,7 +1135,7 @@ const processFilesInBatches = async (
  });

  if (i + batchSize < files.length) {
- await new Promise((resolve) => setTimeout(resolve, 200));
+ await new Promise((resolve) => setTimeout(resolve, BATCH_DELAY));
  }
  }
  } else {
@@ -809,7 +1148,7 @@ const processFilesInBatches = async (

  // Handle combined folder structure + auto-detection
  if (options.folderStructure && options.autoDetectStructure) {
- const detection = extractYearAndPedimentoFromPath(file, basePath);
+ const detection = getCachedPathDetection(file, basePath);
  if (detection.detected) {
  const autoStructure = `${detection.year}/${detection.pedimento}`;
  const combinedStructure = `${options.folderStructure}/${autoStructure}`;
@@ -832,8 +1171,8 @@ const processFilesInBatches = async (
  uploadPath = path.join(options.folderStructure, fileName);
  console.log(`📁 Custom structure: ${uploadPath}`);
  } else if (options.autoDetectStructure) {
- // Auto-detect structure from path if enabled
- const detection = extractYearAndPedimentoFromPath(file, basePath);
+ // Auto-detect structure from path if enabled - OPTIMIZED: Use cached detection
+ const detection = getCachedPathDetection(file, basePath);
  if (detection.detected) {
  const autoStructure = `${detection.year}/${detection.pedimento}`;
  const fileName = path.basename(file);
@@ -943,7 +1282,7 @@ const uploadFilesByRfc = async (options = {}) => {
  const queryBatchSize = 1000;

  console.log('📥 Fetching all related files (with pagination)...');
-
+
  while (hasMore) {
  const { data: batch, error: queryError } = await supabase
  .from('uploader')
@@ -962,7 +1301,7 @@ const uploadFilesByRfc = async (options = {}) => {
  } else {
  allRelatedFiles = allRelatedFiles.concat(batch);
  offset += queryBatchSize;
-
+
  // If we got less than queryBatchSize, we've reached the end
  if (batch.length < queryBatchSize) {
  hasMore = false;
@@ -976,7 +1315,7 @@ const uploadFilesByRfc = async (options = {}) => {
  }

  console.log(`📁 Found ${allRelatedFiles.length} total files to upload (including supporting documents)`);
-
+
  // Group by RFC and arela_path for better organization
  const filesByRfc = allRelatedFiles.reduce((acc, record) => {
  const rfc = record.rfc || 'No RFC';
@@ -1037,18 +1376,18 @@ const uploadFilesByRfc = async (options = {}) => {
  const batch = allRelatedFiles.slice(i, i + batchSize);
  const batchNumber = Math.floor(i / batchSize) + 1;
  const totalBatches = Math.ceil(allRelatedFiles.length / batchSize);
-
+
  console.log(`\n📦 Processing batch ${batchNumber}/${totalBatches} (${batch.length} files)`);

  // Prepare files for upload
  const filesToUpload = [];
-
+
  for (const record of batch) {
  totalProcessed++;
-
+
  try {
  const originalPath = record.original_path;
-
+
  // Check if file exists
  if (!fs.existsSync(originalPath)) {
  console.log(` ⚠️ File not found: ${originalPath}`);
@@ -1056,24 +1395,24 @@ const uploadFilesByRfc = async (options = {}) => {
  continue;
  }

- const fileStats = fs.statSync(originalPath);
+ // OPTIMIZED: Read file and get size from buffer instead of separate fs.statSync call
  const fileBuffer = fs.readFileSync(originalPath);
-
+
  filesToUpload.push({
  path: originalPath,
  buffer: fileBuffer,
- size: fileStats.size,
+ size: fileBuffer.length, // Get size from buffer instead of fs.statSync
  name: record.filename,
  arelaPath: record.arela_path,
  rfc: record.rfc,
  documentType: record.document_type,
  });
-
+
  } catch (error) {
  console.error(` ❌ Error reading file ${record.original_path}:`, error.message);
  totalErrors++;
  }
-
+
  if (options.showProgress !== false) {
  progressBar.update(totalProcessed, {
  uploaded: totalUploaded,
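Swapping `fs.statSync(...).size` for `fileBuffer.length` works because a Buffer's length is its byte count, and for a regular file read in full that equals the stat size — the read was already required, so one syscall per file is saved. In isolation:

```js
const fs = require('fs');

// One syscall path: the byte size comes from the buffer just read.
const buffer = fs.readFileSync('some-file.pdf'); // illustrative path
const sizeBytes = buffer.length; // equals fs.statSync('some-file.pdf').size
```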
@@ -1087,9 +1426,9 @@ const uploadFilesByRfc = async (options = {}) => {
  if (filesToUpload.length > 0) {
  try {
  console.log(` 🚀 Uploading ${filesToUpload.length} files to Arela API...`);
-
+
  const formData = new FormData();
-
+
  // Add files to form data
  filesToUpload.forEach((file, index) => {
  formData.append(`files`, file.buffer, {
@@ -1112,7 +1451,7 @@ const uploadFilesByRfc = async (options = {}) => {
  // Upload each group separately with its folder structure
  for (const [arelaPath, pathFiles] of Object.entries(filesByPath)) {
  const pathFormData = new FormData();
-
+
  pathFiles.forEach((file) => {
  pathFormData.append('files', file.buffer, {
  filename: file.name,
@@ -1121,7 +1460,7 @@ const uploadFilesByRfc = async (options = {}) => {
  });
  });

  // Set folder structure for this group - concatenate custom prefix with arela_path
- const folderStructure = options.folderStructure
+ const folderStructure = options.folderStructure
  ? `${options.folderStructure}/${arelaPath}`.replace(/\/+/g, '/').replace(/\/$/, '')
  : arelaPath;
@@ -1149,14 +1488,14 @@ const uploadFilesByRfc = async (options = {}) => {
  }

  const result = await response.json();
-
+
  // Check if upload was successful based on stats rather than success field
  const isSuccessful = result.stats && result.stats.uploadedCount > 0 && result.stats.errorCount === 0;
-
+
  if (isSuccessful) {
  console.log(` ✅ Group uploaded: ${result.stats.uploadedCount} files to ${folderStructure}`);
  totalUploaded += result.stats.uploadedCount;
-
+
  if (result.stats.detectedCount > 0) {
  console.log(` 🔍 Files detected: ${result.stats.detectedCount}`);
  }
@@ -1185,7 +1524,7 @@ const uploadFilesByRfc = async (options = {}) => {

  // Small delay between batches
  if (i + batchSize < allRelatedFiles.length) {
- await new Promise(resolve => setTimeout(resolve, 200));
+ await new Promise(resolve => setTimeout(resolve, BATCH_DELAY));
  }
  }

@@ -1263,19 +1602,19 @@ const propagateArelaPath = async (options = {}) => {
  for (const pedimento of pedimentoRecords) {
  try {
  totalProcessed++;
-
+
  // Extract base path from original_path (remove filename)
  const basePath = path.dirname(pedimento.original_path);
-
+
  console.log(`\n🔍 Processing: ${pedimento.filename}`);
  console.log(` 📁 Base path: ${basePath}`);
-
+
  // Extract folder part from existing arela_path by removing the filename
  const existingPath = pedimento.arela_path;
- const folderArelaPath = existingPath.includes('/') ?
- existingPath.substring(0, existingPath.lastIndexOf('/')) + '/' :
+ const folderArelaPath = existingPath.includes('/') ?
+ existingPath.substring(0, existingPath.lastIndexOf('/')) + '/' :
  existingPath.endsWith('/') ? existingPath : existingPath + '/';
-
+
  console.log(` 🎯 Original arela path: ${existingPath}`);
  console.log(` 📁 Folder arela path: ${folderArelaPath}`);

@@ -1299,13 +1638,13 @@ const propagateArelaPath = async (options = {}) => {
  }

  console.log(` 📄 Found ${relatedFiles.length} related files to update:`);
-
+
  // Show first 10 files, then indicate if there are more
  const filesToShow = relatedFiles.slice(0, 10);
  filesToShow.forEach(file => {
  console.log(` - ${file.filename}`);
  });
-
+
  if (relatedFiles.length > 10) {
  console.log(` ... and ${relatedFiles.length - 10} more files`);
  }
@@ -1322,7 +1661,7 @@ const propagateArelaPath = async (options = {}) => {
  const batchIds = fileIds.slice(i, i + BATCH_SIZE);
  const batchNumber = Math.floor(i / BATCH_SIZE) + 1;
  const totalBatches = Math.ceil(fileIds.length / BATCH_SIZE);
-
+
  console.log(` 📦 Batch ${batchNumber}/${totalBatches}: Updating ${batchIds.length} files...`);

  try {
@@ -1422,25 +1761,53 @@ program
  'Automatically detect year/pedimento from file paths',
  )
  .option('--client-path <path>', 'Client path for metadata tracking')
- .option('--stats-only', 'Only read file stats and insert to uploader table, skip file upload')
+ .option('--stats-only', 'Phase 1: Only read filesystem stats and insert to database (no file reading or detection)')
  .option('--no-detect', 'Disable document type detection in stats-only mode')
- .option('--propagate-arela-path', 'Propagate arela_path from pedimento_simplificado records to related files with same base path')
- .option('--upload-by-rfc', 'Upload files to Arela API based on RFC values from UPLOAD_RFCS environment variable')
+ .option('--detect-pdfs', 'Phase 2: Process PDF files in database for pedimento-simplificado detection')
+ .option('--propagate-arela-path', 'Phase 3: Propagate arela_path from pedimento_simplificado records to related files with same base path')
+ .option('--upload-by-rfc', 'Phase 4: Upload files to Arela API based on RFC values from UPLOAD_RFCS environment variable')
+ .option('--run-all-phases', 'Run all 4 phases in sequence: stats → detect → propagate → upload')
  .action(async (options) => {
  if (options.version) {
  console.log(packageVersion);
  process.exit(0);
  }

+ // Handle detect-pdfs option (Phase 2)
+ if (options.detectPdfs) {
+ console.log('🔍 Starting Phase 2: PDF Detection');
+ await checkCredentials(true); // Force Supabase mode
+
+ const result = await detectPedimentosInDatabase({
+ batchSize: parseInt(options.batchSize) || 10,
+ });
+
+ console.log(`✅ Phase 2 Complete: ${result.detectedCount} detected, ${result.errorCount} errors`);
+ return;
+ }
+
+ // Handle run-all-phases option
+ if (options.runAllPhases) {
+ console.log('🚀 Starting all 4 phases in sequence...');
+ await checkCredentials(true); // Force Supabase mode
+
+ // Phase 1: Stats collection
+ console.log('\n📊 === PHASE 1: Filesystem Stats ===');
+ options.statsOnly = true;
+ // Continue with normal processing to run Phase 1
+
+ // The rest will be handled after Phase 1 completes
+ }
+
  // Handle propagate-arela-path option
  if (options.propagateArelaPath) {
  // Initialize Supabase credentials for propagation
  await checkCredentials(true); // Force Supabase mode
-
+
  const result = await propagateArelaPath({
  showProgress: options.showStats || true,
  });
-
+
  if (result.errorCount > 0) {
  process.exit(1);
  }
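Taken together, the new flags split the pipeline into four explicitly ordered phases. A hedged usage sketch — the flags come from this diff, but invoking the package via npx is an assumption, since the bin name isn't shown here:

```
npx @arela/uploader --stats-only            # Phase 1: filesystem stats into the uploader table
npx @arela/uploader --detect-pdfs           # Phase 2: pedimento-simplificado detection on PDFs
npx @arela/uploader --propagate-arela-path  # Phase 3: propagate arela_path to related files
npx @arela/uploader --upload-by-rfc         # Phase 4: upload by RFC (needs UPLOAD_RFCS)
npx @arela/uploader --run-all-phases        # all four in sequence
```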
@@ -1451,7 +1818,7 @@ program
  if (options.uploadByRfc) {
  // RFC upload needs both Supabase (for database queries) and API (for uploads)
  await checkCredentials(false); // Initialize API mode
-
+
  // Also initialize Supabase for database queries
  if (!supabase) {
  if (!supabaseUrl || !supabaseKey) {
@@ -1459,17 +1826,17 @@ program
  console.error(' Please set SUPABASE_URL and SUPABASE_KEY environment variables.');
  process.exit(1);
  }
-
+
  supabase = createClient(supabaseUrl, supabaseKey);
  console.log('✅ Connected to Supabase for database queries');
  }
-
+
  const result = await uploadFilesByRfc({
  showProgress: options.showStats || true,
  batchSize: parseInt(options.batchSize) || 10,
  folderStructure: options.folderStructure,
  });
-
+
  if (result.errorCount > 0) {
  process.exit(1);
  }
@@ -1590,12 +1957,65 @@ program
  console.log(` 📜 Log file: ${logFilePath}`);
  console.log(`${'='.repeat(60)}\n`);

- if (options.showStats && sanitizationCache.size > 0) {
+ // Continue with remaining phases if running all phases
+ if (options.runAllPhases && options.statsOnly) {
+ try {
+ // Phase 2: PDF Detection
+ console.log('\n🔍 === PHASE 2: PDF Detection ===');
+ const detectionResult = await detectPedimentosInDatabase({
+ batchSize: parseInt(options.batchSize) || 10,
+ });
+ console.log(`✅ Phase 2 Complete: ${detectionResult.detectedCount} detected, ${detectionResult.errorCount} errors`);
+
+ // Phase 3: Propagate arela_path
+ console.log('\n📁 === PHASE 3: Propagate Arela Paths ===');
+ const propagateResult = await propagateArelaPath({
+ showProgress: options.showStats || true,
+ });
+ console.log(`✅ Phase 3 Complete: ${propagateResult.updatedCount || 0} paths propagated`);
+
+ // Phase 4: Upload by RFC
+ if (uploadRfcs && uploadRfcs.length > 0) {
+ console.log('\n🚀 === PHASE 4: Upload by RFC ===');
+
+ // Initialize API mode for uploads
+ await checkCredentials(false);
+
+ const uploadResult = await uploadFilesByRfc({
+ showProgress: options.showStats || true,
+ batchSize: parseInt(options.batchSize) || 10,
+ folderStructure: options.folderStructure,
+ });
+ console.log(`✅ Phase 4 Complete: Upload finished`);
+ } else {
+ console.log('\n⚠️ === PHASE 4: Upload by RFC ===');
+ console.log('⚠️ UPLOAD_RFCS environment variable not configured, skipping Phase 4');
+ }
+
+ console.log('\n🎉 All 4 phases completed successfully!');
+
+ } catch (error) {
+ console.error(`❌ Error in multi-phase execution:`, error.message);
+ process.exit(1);
+ }
+ }
+
+ if (options.showStats && (sanitizationCache.size > 0 || pathDetectionCache.size > 0)) {
  console.log(`📊 Performance Statistics:`);
- console.log(
- ` 🗂️ Sanitization cache entries: ${sanitizationCache.size}`,
- );
+ if (sanitizationCache.size > 0) {
+ console.log(
+ ` 🗂️ Sanitization cache entries: ${sanitizationCache.size}`,
+ );
+ }
+ if (pathDetectionCache.size > 0) {
+ console.log(
+ ` 📁 Path detection cache entries: ${pathDetectionCache.size}`,
+ );
+ }
  }
+
+ // OPTIMIZED: Ensure log buffer is flushed before exit
+ flushLogBuffer();
  });

  program.parse();