@arela/uploader 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,486 @@
1
+ import cliProgress from 'cli-progress';
2
+ import fs from 'fs';
3
+ import pLimit from 'p-limit';
4
+
5
+ import logger from '../services/LoggingService.js';
6
+
7
+ import appConfig from '../config/config.js';
8
+ import ErrorHandler from '../errors/ErrorHandler.js';
9
+ import { ConfigurationError } from '../errors/ErrorTypes.js';
10
+ import FileDetectionService from '../file-detection.js';
11
+
12
/**
 * Identify Command Handler
 * Optimized replacement for "detect --detect-pdfs"
 * Identifies pedimento-simplificado documents in files scanned by "arela scan"
 *
 * Key improvements over legacy detect command:
 * - Works with dynamic scan_* tables instead of uploader table
 * - Uses configured API instead of direct Supabase access
 * - Detects locally to leverage CLI host resources
 * - Batch processing for efficient API communication
 * - Real-time progress tracking
 */
export class IdentifyCommand {
  /** Batch size used when `options.batchSize` is missing, non-numeric, or not positive. */
  static #DEFAULT_BATCH_SIZE = 100;

  /** Number of files detected in parallel within one batch. */
  static #DETECTION_CONCURRENCY = 10;

  /** Files larger than this (50MB) are reported as FILE_TOO_LARGE and not parsed. */
  static #MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024;

  /** `detectedType` value that counts as a successful detection. */
  static #PEDIMENTO_TYPE = 'pedimento_simplificado';

  /** Marker text whose absence means a file is definitely not a pedimento. */
  static #PEDIMENTO_MARKER = /FORMA SIMPLIFICADA DE PEDIMENTO/i;

  constructor() {
    this.errorHandler = new ErrorHandler(logger);
    this.scanApiService = null; // Will be initialized in execute
    this.detectionService = new FileDetectionService();
  }

  /**
   * Execute the identify command
   * @param {Object} options - Command options
   * @param {string} options.api - API target (default, agencia, cliente)
   * @param {number} options.batchSize - Batch size for API operations
   * @param {boolean} options.showStats - Show performance statistics
   * @returns {Promise<void>}
   * @throws Rethrows any failure (including ConfigurationError) after logging it
   */
  async execute(options = {}) {
    const startTime = Date.now();

    try {
      // Validate scan configuration (need same config as scan command)
      appConfig.validateScanConfig();

      // Import ScanApiService dynamically
      const { default: ScanApiService } = await import(
        '../services/ScanApiService.js'
      );

      // Select the API target before building the service so that a single
      // instance is created. The original re-created the service after
      // setApiTarget(), so the target is presumably read at construction time.
      if (options.api) {
        appConfig.setApiTarget(options.api);
      }
      this.scanApiService = new ScanApiService();

      const scanConfig = appConfig.getScanConfig();

      // Explicit radix; NaN, zero and negative values fall back to the default.
      const requestedBatchSize = Number.parseInt(options.batchSize, 10);
      const batchSize =
        requestedBatchSize > 0
          ? requestedBatchSize
          : IdentifyCommand.#DEFAULT_BATCH_SIZE;

      logger.info('🔍 Starting arela identify command');
      logger.info(`🎯 API Target: ${options.api || 'default'}`);
      logger.info(`📦 Batch Size: ${batchSize}`);

      // Fetch all tables for this instance
      logger.info('\n📊 Fetching instance tables...');
      const tables = await this.scanApiService.getInstanceTables(
        scanConfig.companySlug,
        scanConfig.serverId,
        scanConfig.basePathLabel,
      );

      if (!Array.isArray(tables) || tables.length === 0) {
        throw new ConfigurationError(
          'No tables found for this instance. Run "arela scan" first.',
        );
      }

      logger.info(
        `📋 Found ${tables.length} table${tables.length === 1 ? '' : 's'} to process`,
      );
      for (const table of tables) {
        logger.info(` - ${table.tableName}`);
      }

      // Process each table sequentially and accumulate the totals
      const totalStats = { processed: 0, detected: 0, errors: 0 };

      for (const table of tables) {
        logger.info(`\n🔍 Processing table: ${table.tableName}`);

        const stats = await this.#processTable(table.tableName, batchSize);

        totalStats.processed += stats.processed;
        totalStats.detected += stats.detected;
        totalStats.errors += stats.errors;
      }

      // Keep the numeric value for arithmetic; format only for display.
      const elapsedSeconds = (Date.now() - startTime) / 1000;
      const duration = elapsedSeconds.toFixed(2);
      const avgSpeed =
        elapsedSeconds > 0
          ? Math.round(totalStats.processed / elapsedSeconds)
          : 0;

      logger.success(`\n✅ Identification Complete!`);
      logger.info(`\n📊 Total Results:`);
      logger.info(` Tables Processed: ${tables.length}`);
      logger.info(` Files Processed: ${totalStats.processed}`);
      logger.info(` Pedimentos Detected: ${totalStats.detected}`);
      logger.info(` Errors: ${totalStats.errors}`);
      logger.info(` Duration: ${duration}s`);
      logger.info(` Speed: ${avgSpeed} files/sec`);

      if (options.showStats) {
        this.#showDetailedStats(
          startTime,
          totalStats.processed,
          totalStats.detected,
          totalStats.errors,
        );
      }
    } catch (error) {
      logger.error('❌ Identification failed:', error?.message ?? error);

      // A ConfigurationError may be built from a message only (as above), so
      // the detail list is optional. Guarding it keeps this handler from
      // throwing a TypeError that would mask the original failure.
      if (
        error instanceof ConfigurationError &&
        Array.isArray(error.errors) &&
        error.errors.length > 0
      ) {
        logger.error('\nConfiguration errors:');
        for (const detail of error.errors) {
          logger.error(` - ${detail}`);
        }
      }

      throw error;
    }
  }

  /**
   * Process a single table: log its detection statistics, then repeatedly
   * fetch pending PDFs, detect them locally and push the results to the API.
   * @private
   * @param {string} tableName - Table name to process
   * @param {number} batchSize - Batch size
   * @returns {Promise<Object>} Processing statistics ({ processed, detected, errors })
   */
  async #processTable(tableName, batchSize) {
    // Get detection statistics first
    const initialStats = await this.scanApiService.getDetectionStats(tableName);
    logger.info(` Total PDFs: ${initialStats.totalPdfs}`);
    logger.info(` Detected: ${initialStats.detected}`);
    logger.info(` Pending: ${initialStats.pending}`);
    logger.info(` Not Pedimento: ${initialStats.notPedimento || 0}`);
    logger.info(
      ` Max Attempts Reached: ${initialStats.maxAttemptsReached || 0}`,
    );
    logger.info(` Errors: ${initialStats.errors}`);

    if (initialStats.pending === 0) {
      logger.info(' ✅ All PDFs processed. Skipping.');

      if (initialStats.maxAttemptsReached > 0) {
        logger.info(
          ` ⚠️ ${initialStats.maxAttemptsReached} PDFs reached max attempts.`,
        );
      }

      return { processed: 0, detected: 0, errors: 0 };
    }

    logger.info(` 🚀 Processing ${initialStats.pending} pending PDFs...`);

    // Setup progress bar
    const progressBar = new cliProgress.SingleBar({
      format:
        ' 📄 |{bar}| {percentage}% | {value}/{total} files | {speed} files/sec',
      barCompleteChar: '\u2588',
      barIncompleteChar: '\u2591',
      hideCursor: true,
    });

    // Speed is measured against this table's own clock: processedCount only
    // counts this table's files, so dividing by the command-wide elapsed time
    // would understate the rate for every table after the first.
    const tableStartTime = Date.now();

    let processedCount = 0;
    let detectedCount = 0;
    let errorCount = 0;

    progressBar.start(initialStats.pending, 0, { speed: 0 });

    try {
      let hasMore = true;

      // Process in batches
      while (hasMore) {
        // Fetch from offset 0 since processed files are filtered out.
        // NOTE(review): termination relies on the API no longer returning
        // files once their results are submitted — confirm against backend.
        const response = await this.scanApiService.fetchPdfsForDetection(
          tableName,
          0,
          batchSize,
        );

        const files = response?.data;
        if (!files || files.length === 0) {
          break;
        }

        // Detect files locally with concurrent processing
        const detectionResults = await this.#detectFilesLocally(
          files,
          IdentifyCommand.#DETECTION_CONCURRENCY,
        );

        // Batch update to API
        await this.scanApiService.batchUpdateDetection(
          tableName,
          detectionResults,
        );

        // Update statistics
        processedCount += files.length;
        for (const outcome of detectionResults) {
          if (outcome.detectedType === IdentifyCommand.#PEDIMENTO_TYPE) {
            detectedCount += 1;
          }
          if (outcome.detectionError) {
            errorCount += 1;
          }
        }

        // Update progress bar
        const elapsed = (Date.now() - tableStartTime) / 1000;
        const speed = elapsed > 0 ? Math.round(processedCount / elapsed) : 0;
        progressBar.update(processedCount, { speed });

        // Check if there are more files
        hasMore = response.hasMore;
      }
    } finally {
      // Always stop the bar: it was created with hideCursor, so leaving it
      // running after an API failure would leave the terminal cursor hidden.
      progressBar.stop();
    }

    return {
      processed: processedCount,
      detected: detectedCount,
      errors: errorCount,
    };
  }

  /**
   * Detect files locally using FileDetectionService
   * @private
   * @param {Array} files - Files to detect
   * @param {number} concurrency - Maximum concurrent detections
   * @returns {Promise<Array>} Detection results, one per input file, in input order
   */
  async #detectFilesLocally(
    files,
    concurrency = IdentifyCommand.#DETECTION_CONCURRENCY,
  ) {
    const limit = pLimit(concurrency);

    return Promise.all(
      files.map((file) => limit(() => this.#detectSingleFile(file))),
    );
  }

  /**
   * Detect one file. Never rejects: every failure is converted into a result
   * record carrying a categorized `detectionError`.
   * @private
   * @param {Object} file - File row; reads `id`, `absolute_path`, `relative_path`
   * @returns {Promise<Object>} Detection result for the API batch update
   */
  async #detectSingleFile(file) {
    try {
      const absolutePath = file.absolute_path;

      // One asynchronous stat replaces existsSync + statSync: it does not
      // block the event loop and avoids the check-then-use race between them.
      const stats = await this.#statIfPresent(absolutePath);

      if (!stats) {
        return this.#failedResult(
          file.id,
          'FILE_NOT_FOUND: File does not exist on filesystem. May have been moved or deleted after scan.',
        );
      }

      // Check file size - skip very large files
      const maxSizeBytes = IdentifyCommand.#MAX_FILE_SIZE_BYTES;
      if (stats.size > maxSizeBytes) {
        return this.#failedResult(
          file.id,
          `FILE_TOO_LARGE: File size ${(stats.size / 1024 / 1024).toFixed(2)}MB exceeds ${maxSizeBytes / 1024 / 1024}MB limit.`,
        );
      }

      // Detect using existing FileDetectionService
      const result = await this.detectionService.detectFile(absolutePath);

      // If detection succeeded and found a pedimento
      if (result.detectedType === IdentifyCommand.#PEDIMENTO_TYPE) {
        return {
          id: file.id,
          detectedType: result.detectedType,
          detectedPedimento: result.detectedPedimento,
          detectedPedimentoYear: result.detectedPedimentoYear,
          rfc: result.rfc,
          arelaPath: result.arelaPath,
          detectionError: result.error,
          isNotPedimento: false,
        };
      }

      // If no detection, determine if it's definitely not a pedimento
      // This helps avoid re-processing files we know aren't pedimentos
      const isNotPedimento = this.#isDefinitelyNotPedimento(result);

      return {
        id: file.id,
        detectedType: result.detectedType,
        detectedPedimento: result.detectedPedimento,
        detectedPedimentoYear: result.detectedPedimentoYear,
        rfc: result.rfc,
        arelaPath: result.arelaPath,
        detectionError: this.#describeNonDetection(result, isNotPedimento),
        isNotPedimento,
      };
    } catch (error) {
      // Thrown values are not guaranteed to be Error instances; normalize so
      // this handler cannot itself throw and reject the whole batch.
      const message = error?.message ?? String(error);

      logger.warn(`Failed to detect ${file.relative_path}: ${message}`);

      return this.#failedResult(
        file.id,
        `${this.#categorizeError(error, message)}: ${message}`,
      );
    }
  }

  /**
   * Stat a path, treating a missing file as an expected outcome.
   * @private
   * @param {string} filePath - Path to inspect
   * @returns {Promise<fs.Stats|null>} Stats, or null when the path is not a
   *   usable string or does not exist (ENOENT / ENOTDIR)
   * @throws Any other stat failure (e.g. permission errors)
   */
  async #statIfPresent(filePath) {
    if (typeof filePath !== 'string' || filePath.length === 0) {
      return null;
    }

    try {
      return await fs.promises.stat(filePath);
    } catch (error) {
      if (error?.code === 'ENOENT' || error?.code === 'ENOTDIR') {
        return null;
      }
      throw error;
    }
  }

  /**
   * Build the result record for a file that could not be analysed.
   * @private
   * @param {*} id - File id
   * @param {string} detectionError - Categorized error description
   * @returns {Object} Result with all detection fields null
   */
  #failedResult(id, detectionError) {
    return {
      id,
      detectedType: null,
      detectedPedimento: null,
      detectedPedimentoYear: null,
      rfc: null,
      arelaPath: null,
      detectionError,
      isNotPedimento: false,
    };
  }

  /**
   * Build the descriptive error message for a file where detection ran but
   * did not yield a pedimento.
   * @private
   * @param {Object} result - Detection result
   * @param {boolean} isNotPedimento - Outcome of #isDefinitelyNotPedimento
   * @returns {string} Categorized error description
   */
  #describeNonDetection(result, isNotPedimento) {
    if (result.error) {
      return `DETECTION_ERROR: ${result.error}`;
    }

    if (isNotPedimento) {
      return 'NOT_PEDIMENTO: File does not match pedimento-simplificado pattern. Missing key markers: "FORMA SIMPLIFICADA DE PEDIMENTO".';
    }

    // Partial match - might be a pedimento with missing fields
    const missingFields = this.#getMissingFields(result);
    if (missingFields.length > 0) {
      return `INCOMPLETE_PEDIMENTO: Detected as potential pedimento but missing fields: ${missingFields.join(', ')}. Matcher may need improvement.`;
    }

    return 'UNKNOWN_ERROR: Detection completed but no pedimento found. Check file content and matcher patterns.';
  }

  /**
   * Map a thrown value to one of the error category prefixes.
   * @private
   * @param {*} error - Thrown value
   * @param {string} message - Normalized error message
   * @returns {string} Error category
   */
  #categorizeError(error, message) {
    if (error?.code === 'ENOENT' || message.includes('ENOENT')) {
      return 'FILE_NOT_FOUND';
    }
    if (message.includes('timeout')) {
      return 'TIMEOUT';
    }
    if (message.includes('PDF')) {
      return 'PDF_PARSE_ERROR';
    }
    if (message.includes('extract')) {
      return 'TEXT_EXTRACTION_ERROR';
    }
    return 'UNKNOWN_ERROR';
  }

  /**
   * Determine if a file is definitely not a pedimento
   * @private
   * @param {Object} result - Detection result
   * @returns {boolean} True if definitely not a pedimento
   */
  #isDefinitelyNotPedimento(result) {
    // If we got any pedimento-related fields, it might be a pedimento
    if (result.detectedPedimento || result.rfc || result.arelaPath) {
      return false;
    }

    // Check if the text contains the required pedimento marker
    // This must match the criteria in pedimento-simplificado.js match function
    const text = result.text || '';

    // If the required marker is not found, it's definitely not a pedimento
    return !IdentifyCommand.#PEDIMENTO_MARKER.test(text);
  }

  /**
   * Get list of missing required fields for pedimento
   * @private
   * @param {Object} result - Detection result
   * @returns {Array<string>} Missing field names
   */
  #getMissingFields(result) {
    const requiredFields = [
      { key: 'detectedPedimento', name: 'numPedimento' },
      { key: 'rfc', name: 'rfc' },
      { key: 'detectedPedimentoYear', name: 'year' },
    ];

    return requiredFields
      .filter(({ key }) => !result[key])
      .map(({ name }) => name);
  }

  /**
   * Generate table name from scan config (same logic as scan command)
   * NOTE(review): not called anywhere in this class; tables come from
   * getInstanceTables() instead.
   * @private
   * @param {Object} scanConfig - Scan configuration
   * @returns {string} Table name
   */
  #generateTableName(scanConfig) {
    const { companySlug, serverId, basePathLabel } = scanConfig;

    // Combine components
    const rawName = `${companySlug}_${serverId}_${basePathLabel}`;

    // Sanitize: lowercase, replace special chars with underscore
    const sanitized = rawName
      .toLowerCase()
      .replace(/[^a-z0-9_]/g, '_')
      .replace(/_+/g, '_')
      .replace(/^_|_$/g, '');

    // Note: Hash truncation logic should match backend
    // For simplicity, we rely on backend validation
    return `scan_${sanitized}`;
  }

  /**
   * Show detailed performance statistics
   * @private
   * @param {number} startTime - Command start time (ms since epoch)
   * @param {number} processedCount - Files processed
   * @param {number} detectedCount - Pedimentos detected
   * @param {number} errorCount - Files that ended with a detection error
   */
  #showDetailedStats(startTime, processedCount, detectedCount, errorCount) {
    const duration = (Date.now() - startTime) / 1000;
    const avgSpeed = duration > 0 ? (processedCount / duration).toFixed(2) : 0;
    const detectionRate =
      processedCount > 0
        ? ((detectedCount / processedCount) * 100).toFixed(1)
        : 0;
    // Guard the divisor as well so the rate can never print as Infinity/NaN.
    const errorRate =
      errorCount > 0 && processedCount > 0
        ? ((errorCount / processedCount) * 100).toFixed(1)
        : 0;

    logger.info('\n📈 Detailed Statistics:');
    logger.info(` Total Processing Time: ${duration.toFixed(2)}s`);
    logger.info(` Average Speed: ${avgSpeed} files/sec`);
    logger.info(` Detection Rate: ${detectionRate}%`);
    logger.info(` Error Rate: ${errorRate}%`);

    // Memory usage
    const memUsage = process.memoryUsage();
    const toMegabytes = (bytes) => (bytes / 1024 / 1024).toFixed(2);

    logger.info('\n💾 Memory Usage:');
    logger.info(` RSS: ${toMegabytes(memUsage.rss)} MB`);
    logger.info(` Heap Used: ${toMegabytes(memUsage.heapUsed)} MB`);
    logger.info(` Heap Total: ${toMegabytes(memUsage.heapTotal)} MB`);
  }
}
483
+
484
// Default export: one IdentifyCommand instance created at module load and
// shared by every importer of this module.
const sharedIdentifyCommand = new IdentifyCommand();

export default sharedIdentifyCommand;