@arela/uploader 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. package/.env.local +316 -0
  2. package/.env.template +70 -0
  3. package/coverage/IdentifyCommand.js.html +1462 -0
  4. package/coverage/PropagateCommand.js.html +1507 -0
  5. package/coverage/PushCommand.js.html +1504 -0
  6. package/coverage/ScanCommand.js.html +1654 -0
  7. package/coverage/UploadCommand.js.html +1846 -0
  8. package/coverage/WatchCommand.js.html +4111 -0
  9. package/coverage/base.css +224 -0
  10. package/coverage/block-navigation.js +87 -0
  11. package/coverage/favicon.png +0 -0
  12. package/coverage/index.html +191 -0
  13. package/coverage/lcov-report/IdentifyCommand.js.html +1462 -0
  14. package/coverage/lcov-report/PropagateCommand.js.html +1507 -0
  15. package/coverage/lcov-report/PushCommand.js.html +1504 -0
  16. package/coverage/lcov-report/ScanCommand.js.html +1654 -0
  17. package/coverage/lcov-report/UploadCommand.js.html +1846 -0
  18. package/coverage/lcov-report/WatchCommand.js.html +4111 -0
  19. package/coverage/lcov-report/base.css +224 -0
  20. package/coverage/lcov-report/block-navigation.js +87 -0
  21. package/coverage/lcov-report/favicon.png +0 -0
  22. package/coverage/lcov-report/index.html +191 -0
  23. package/coverage/lcov-report/prettify.css +1 -0
  24. package/coverage/lcov-report/prettify.js +2 -0
  25. package/coverage/lcov-report/sort-arrow-sprite.png +0 -0
  26. package/coverage/lcov-report/sorter.js +210 -0
  27. package/coverage/lcov.info +1937 -0
  28. package/coverage/prettify.css +1 -0
  29. package/coverage/prettify.js +2 -0
  30. package/coverage/sort-arrow-sprite.png +0 -0
  31. package/coverage/sorter.js +210 -0
  32. package/docs/API_RETRY_MECHANISM.md +338 -0
  33. package/docs/ARELA_IDENTIFY_IMPLEMENTATION.md +489 -0
  34. package/docs/ARELA_IDENTIFY_QUICKREF.md +186 -0
  35. package/docs/ARELA_PROPAGATE_IMPLEMENTATION.md +581 -0
  36. package/docs/ARELA_PROPAGATE_QUICKREF.md +272 -0
  37. package/docs/ARELA_PUSH_IMPLEMENTATION.md +577 -0
  38. package/docs/ARELA_PUSH_QUICKREF.md +322 -0
  39. package/docs/ARELA_SCAN_IMPLEMENTATION.md +373 -0
  40. package/docs/ARELA_SCAN_QUICKREF.md +139 -0
  41. package/docs/CROSS_PLATFORM_PATH_HANDLING.md +593 -0
  42. package/docs/DETECTION_ATTEMPT_TRACKING.md +414 -0
  43. package/docs/MIGRATION_UPLOADER_TO_FILE_STATS.md +1020 -0
  44. package/docs/MULTI_LEVEL_DIRECTORY_SCANNING.md +494 -0
  45. package/docs/STATS_COMMAND_SEQUENCE_DIAGRAM.md +287 -0
  46. package/docs/STATS_COMMAND_SIMPLE.md +93 -0
  47. package/package.json +31 -3
  48. package/src/commands/IdentifyCommand.js +459 -0
  49. package/src/commands/PropagateCommand.js +474 -0
  50. package/src/commands/PushCommand.js +473 -0
  51. package/src/commands/ScanCommand.js +523 -0
  52. package/src/config/config.js +154 -7
  53. package/src/file-detection.js +9 -10
  54. package/src/index.js +150 -0
  55. package/src/services/ScanApiService.js +645 -0
  56. package/src/utils/PathNormalizer.js +220 -0
  57. package/tests/commands/IdentifyCommand.test.js +570 -0
  58. package/tests/commands/PropagateCommand.test.js +568 -0
  59. package/tests/commands/PushCommand.test.js +754 -0
  60. package/tests/commands/ScanCommand.test.js +382 -0
  61. package/tests/unit/PathAndTableNameGeneration.test.js +1211 -0
package/src/config/config.js CHANGED
@@ -3,6 +3,8 @@ import fs from 'fs';
3
3
  import path from 'path';
4
4
  import { fileURLToPath } from 'url';
5
5
 
6
+ import PathNormalizer from '../utils/PathNormalizer.js';
7
+
6
8
  config();
7
9
 
8
10
  /**
@@ -15,6 +17,8 @@ class Config {
15
17
  this.supabase = this.#loadSupabaseConfig();
16
18
  this.api = this.#loadApiConfig();
17
19
  this.upload = this.#loadUploadConfig();
20
+ this.scan = this.#loadScanConfig();
21
+ this.push = this.#loadPushConfig();
18
22
  this.performance = this.#loadPerformanceConfig();
19
23
  this.logging = this.#loadLoggingConfig();
20
24
  this.watch = this.#loadWatchConfig();
@@ -30,10 +34,10 @@ class Config {
30
34
  const __dirname = path.dirname(__filename);
31
35
  const packageJsonPath = path.resolve(__dirname, '../../package.json');
32
36
  const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf-8'));
33
- return packageJson.version || '1.0.2';
37
+ return packageJson.version || '1.0.4';
34
38
  } catch (error) {
35
39
  console.warn('⚠️ Could not read package.json version, using fallback');
36
- return '1.0.2';
40
+ return '1.0.4';
37
41
  }
38
42
  }
39
43
 
@@ -239,6 +243,151 @@ class Config {
239
243
  };
240
244
  }
241
245
 
246
+ /**
247
+ * Load scan configuration
248
+ * @private
249
+ */
250
+ #loadScanConfig() {
251
+ const companySlug = process.env.ARELA_COMPANY_SLUG;
252
+ const serverId = process.env.ARELA_SERVER_ID;
253
+ let basePathLabel = process.env.ARELA_BASE_PATH_LABEL;
254
+
255
+ // Auto-derive basePathLabel from UPLOAD_BASE_PATH if not set
256
+ // IMPORTANT: Always resolve to absolute path for uniqueness
257
+ if (!basePathLabel && process.env.UPLOAD_BASE_PATH) {
258
+ const basePath = process.env.UPLOAD_BASE_PATH;
259
+ // Resolve to absolute path (handles ../sample vs ./sample correctly)
260
+ basePathLabel = PathNormalizer.toAbsolutePath(basePath);
261
+ }
262
+
263
+ // If basePathLabel is provided, ensure it's absolute
264
+ if (basePathLabel && !path.isAbsolute(basePathLabel)) {
265
+ basePathLabel = PathNormalizer.toAbsolutePath(basePathLabel);
266
+ }
267
+
268
+ // Parse exclude patterns
269
+ const defaultExcludePatterns =
270
+ '.DS_Store,Thumbs.db,desktop.ini,__pycache__,.pyc,.tmp,.swp,$RECYCLE.BIN,System Volume Information,~$*';
271
+ const excludePatterns = (
272
+ process.env.SCAN_EXCLUDE_PATTERNS || defaultExcludePatterns
273
+ )
274
+ .split(',')
275
+ .map((p) => p.trim())
276
+ .filter(Boolean);
277
+
278
+ // Generate table name if all components are available
279
+ // Note: This is just for reference; actual table names are generated dynamically
280
+ // in ScanCommand based on discovered directories and levels
281
+ let tableName = null;
282
+ if (companySlug && serverId && basePathLabel) {
283
+ tableName = PathNormalizer.generateTableName({
284
+ companySlug,
285
+ serverId,
286
+ basePathLabel,
287
+ });
288
+ }
289
+
290
+ return {
291
+ companySlug,
292
+ serverId,
293
+ basePathFull: basePathLabel, // Renamed for consistency
294
+ tableName,
295
+ excludePatterns,
296
+ batchSize: parseInt(process.env.SCAN_BATCH_SIZE) || 2000,
297
+ directoryLevel: parseInt(process.env.SCAN_DIRECTORY_LEVEL) || 0,
298
+ };
299
+ }
300
+
301
+ /**
302
+ * Load push configuration
303
+ * @private
304
+ */
305
+ #loadPushConfig() {
306
+ const pushRfcs = process.env.PUSH_RFCS?.split('|')
307
+ .map((s) => s.trim().toUpperCase())
308
+ .filter(Boolean);
309
+
310
+ const pushYears = process.env.PUSH_YEARS?.split('|')
311
+ .map((s) => parseInt(s.trim(), 10))
312
+ .filter((y) => !isNaN(y));
313
+
314
+ return {
315
+ rfcs: pushRfcs,
316
+ years: pushYears,
317
+ batchSize: parseInt(process.env.PUSH_BATCH_SIZE) || 50,
318
+ uploadBatchSize: parseInt(process.env.PUSH_UPLOAD_BATCH_SIZE) || 10,
319
+ bucket:
320
+ process.env.PUSH_BUCKET || process.env.SUPABASE_BUCKET || 'archivos',
321
+ };
322
+ }
323
+
324
+ /**
325
+ * Validate scan configuration
326
+ * @throws {Error} If required scan configuration is missing
327
+ */
328
+ validateScanConfig() {
329
+ const errors = [];
330
+
331
+ if (!this.scan.companySlug) {
332
+ errors.push(
333
+ 'ARELA_COMPANY_SLUG is required (e.g., "acme_corp", "cliente_123")',
334
+ );
335
+ } else if (!/^[a-zA-Z0-9_-]+$/.test(this.scan.companySlug)) {
336
+ errors.push(
337
+ 'ARELA_COMPANY_SLUG must contain only alphanumeric characters, dashes, and underscores',
338
+ );
339
+ }
340
+
341
+ if (!this.scan.serverId) {
342
+ errors.push(
343
+ 'ARELA_SERVER_ID is required (e.g., "nas01", "server-mx", "storage-01")',
344
+ );
345
+ } else if (!/^[a-zA-Z0-9_-]+$/.test(this.scan.serverId)) {
346
+ errors.push(
347
+ 'ARELA_SERVER_ID must contain only alphanumeric characters, dashes, and underscores',
348
+ );
349
+ }
350
+
351
+ if (!this.upload.basePath) {
352
+ errors.push(
353
+ 'UPLOAD_BASE_PATH is required to determine the scan base path',
354
+ );
355
+ }
356
+
357
+ if (!this.scan.basePathFull) {
358
+ errors.push(
359
+ 'Could not determine base path. Set ARELA_BASE_PATH_LABEL or UPLOAD_BASE_PATH',
360
+ );
361
+ }
362
+
363
+ if (errors.length > 0) {
364
+ throw new Error(
365
+ '⚠️ Scan configuration errors:\n - ' + errors.join('\n - '),
366
+ );
367
+ }
368
+ }
369
+
370
+ /**
371
+ * Get scan configuration
372
+ * @returns {Object} Scan configuration
373
+ */
374
+ getScanConfig() {
375
+ // Return scan config with basePathFull already resolved to absolute path
376
+ return this.scan;
377
+ }
378
+
379
+ /**
380
+ * Get push configuration
381
+ * @returns {Object} Push configuration
382
+ */
383
+ getPushConfig() {
384
+ return {
385
+ ...this.push,
386
+ rfcs: this.push.rfcs || [],
387
+ years: this.push.years || [],
388
+ };
389
+ }
390
+
242
391
  /**
243
392
  * Load performance configuration
244
393
  * @private
@@ -365,20 +514,18 @@ class Config {
365
514
 
366
515
  /**
367
516
  * Get upload sources with validation
368
- * @returns {string[]} Array of upload sources
369
- * @throws {Error} If sources are not configured
517
+ * @returns {string[]} Array of upload sources (defaults to ['.'] if not configured)
370
518
  */
371
519
  getUploadSources() {
372
520
  if (!this.upload.sources || this.upload.sources.length === 0) {
373
- throw new Error(
374
- '⚠️ No upload sources configured. Please set UPLOAD_SOURCES environment variable.',
375
- );
521
+ return ['.'];
376
522
  }
377
523
  return this.upload.sources;
378
524
  }
379
525
 
380
526
  /**
381
527
  * Get base path with validation
528
+ * Returns the path as configured (may be relative for legacy compatibility)
382
529
  * @returns {string} Base path for uploads
383
530
  * @throws {Error} If base path is not configured
384
531
  */
package/src/file-detection.js CHANGED
@@ -1,11 +1,9 @@
1
1
  import fs from 'fs';
2
- import { getTextExtractor } from 'office-text-extractor';
3
2
  import path from 'path';
3
+ import { PDFParse } from 'pdf-parse';
4
4
 
5
5
  import { extractDocumentFields } from './document-type-shared.js';
6
6
 
7
- const extractor = getTextExtractor();
8
-
9
7
  /**
10
8
  * Compose arela_path from extracted pedimento fields
11
9
  * Format: RFC/Year/Patente/Aduana/Pedimento/
@@ -151,18 +149,19 @@ export class FileDetectionService {
151
149
  }
152
150
 
153
151
  /**
154
- * Extract text from PDF file
152
+ * Extract text from PDF file using pdf-parse
153
+ * More reliable for concurrent operations than office-text-extractor
155
154
  * @param {string} filePath - Path to PDF file
156
155
  * @returns {Promise<string>} - Extracted text
157
156
  */
158
157
  async extractTextFromPDF(filePath) {
159
158
  try {
160
- const buffer = fs.readFileSync(filePath);
161
- const text = await extractor.extractText({
162
- input: buffer,
163
- type: 'file',
164
- });
165
- return text;
159
+ const dataBuffer = fs.readFileSync(filePath);
160
+ // Convert Buffer to Uint8Array as required by pdf-parse
161
+ const uint8Array = new Uint8Array(dataBuffer);
162
+ const pdfParse = new PDFParse(uint8Array);
163
+ const result = await pdfParse.getText();
164
+ return result.text;
166
165
  } catch (error) {
167
166
  console.error(
168
167
  `Error extracting text from PDF ${filePath}:`,
package/src/index.js CHANGED
@@ -1,6 +1,10 @@
1
1
  #!/usr/bin/env node
2
2
  import { Command } from 'commander';
3
3
 
4
+ import identifyCommand from './commands/IdentifyCommand.js';
5
+ import PropagateCommand from './commands/PropagateCommand.js';
6
+ import PushCommand from './commands/PushCommand.js';
7
+ import scanCommand from './commands/ScanCommand.js';
4
8
  import UploadCommand from './commands/UploadCommand.js';
5
9
  import watchCommand from './commands/WatchCommand.js';
6
10
  import appConfig from './config/config.js';
@@ -15,6 +19,8 @@ class ArelaUploaderCLI {
15
19
  constructor() {
16
20
  this.program = new Command();
17
21
  this.errorHandler = new ErrorHandler(logger);
22
+ this.identifyCommand = identifyCommand;
23
+ this.scanCommand = scanCommand;
18
24
  this.uploadCommand = new UploadCommand();
19
25
  this.watchCommand = watchCommand;
20
26
 
@@ -164,6 +170,33 @@ class ArelaUploaderCLI {
164
170
  }
165
171
  });
166
172
 
173
+ // Scan command (optimized stats collection with streaming)
174
+ this.program
175
+ .command('scan')
176
+ .description(
177
+ 'Scan filesystem and collect file statistics (optimized with streaming)',
178
+ )
179
+ .option(
180
+ '--api <target>',
181
+ 'API target: agencia|cliente|default',
182
+ 'default',
183
+ )
184
+ .option(
185
+ '--count-first',
186
+ 'Count files first for percentage-based progress (slower start)',
187
+ )
188
+ .action(async (options) => {
189
+ try {
190
+ // Set API target if specified
191
+ if (options.api && options.api !== 'default') {
192
+ appConfig.setApiTarget(options.api);
193
+ }
194
+ await this.scanCommand.execute(options);
195
+ } catch (error) {
196
+ this.errorHandler.handleFatalError(error, { command: 'scan' });
197
+ }
198
+ });
199
+
167
200
  // Detection command
168
201
  this.program
169
202
  .command('detect')
@@ -277,6 +310,123 @@ class ArelaUploaderCLI {
277
310
  }
278
311
  });
279
312
 
313
+ // ============================================================================
314
+ // NEW SIMPLIFIED COMMANDS (Optimized versions with better naming)
315
+ // ============================================================================
316
+
317
+ // Identify command - simplified version of "detect --detect-pdfs"
318
+ this.program
319
+ .command('identify')
320
+ .description('🔍 Identify document types using matchers (optimized)')
321
+ .option(
322
+ '--api <target>',
323
+ 'API target: agencia|cliente|default',
324
+ 'default',
325
+ )
326
+ .option(
327
+ '-b, --batch-size <size>',
328
+ 'Number of files to process in each batch',
329
+ '100',
330
+ )
331
+ .option('--show-stats', 'Show performance statistics')
332
+ .action(async (options) => {
333
+ try {
334
+ await this.identifyCommand.execute(options);
335
+ } catch (error) {
336
+ this.errorHandler.handleFatalError(error, { command: 'identify' });
337
+ }
338
+ });
339
+
340
+ // Propagate command - simplified version of "detect --propagate-arela-path"
341
+ this.program
342
+ .command('propagate')
343
+ .description(
344
+ '🔄 Propagate arela_path from pedimentos to related files (optimized)',
345
+ )
346
+ .option(
347
+ '--api <target>',
348
+ 'API target: agencia|cliente|default',
349
+ 'default',
350
+ )
351
+ .option(
352
+ '-b, --batch-size <size>',
353
+ 'Number of pedimentos to process per batch',
354
+ '50',
355
+ )
356
+ .option('--show-stats', 'Show performance statistics')
357
+ .action(async (options) => {
358
+ try {
359
+ const propagateCommand = new PropagateCommand(options);
360
+ await propagateCommand.execute();
361
+ } catch (error) {
362
+ this.errorHandler.handleFatalError(error, { command: 'propagate' });
363
+ }
364
+ });
365
+
366
+ // Push command - simplified version of "upload --upload-by-rfc"
367
+ this.program
368
+ .command('push')
369
+ .description('📤 Upload files by RFC to Arela API (optimized)')
370
+ .option(
371
+ '--api <target>',
372
+ 'API target for scan operations: default|agencia|cliente',
373
+ 'default',
374
+ )
375
+ .option(
376
+ '--scan-api <target>',
377
+ 'API for reading scan table: default|agencia|cliente',
378
+ 'default',
379
+ )
380
+ .option(
381
+ '--push-api <target>',
382
+ 'API for uploading files: default|agencia|cliente',
383
+ )
384
+ .option(
385
+ '-b, --batch-size <size>',
386
+ 'Number of files to fetch per batch',
387
+ '100',
388
+ )
389
+ .option(
390
+ '--upload-batch-size <size>',
391
+ 'Number of files to upload concurrently',
392
+ '10',
393
+ )
394
+ .option(
395
+ '--rfcs <rfcs>',
396
+ 'Comma-separated RFCs to filter (overrides PUSH_RFCS env var)',
397
+ )
398
+ .option(
399
+ '--years <years>',
400
+ 'Comma-separated years to filter (overrides PUSH_YEARS env var)',
401
+ )
402
+ .option('--show-stats', 'Show performance statistics')
403
+ .action(async (options) => {
404
+ try {
405
+ // Parse comma-separated values
406
+ if (options.rfcs) {
407
+ options.rfcs = options.rfcs
408
+ .split(',')
409
+ .map((r) => r.trim().toUpperCase())
410
+ .filter(Boolean);
411
+ }
412
+ if (options.years) {
413
+ options.years = options.years
414
+ .split(',')
415
+ .map((y) => parseInt(y.trim(), 10))
416
+ .filter((y) => !isNaN(y));
417
+ }
418
+
419
+ const pushCommand = new PushCommand();
420
+ await pushCommand.execute(options);
421
+ } catch (error) {
422
+ this.errorHandler.handleFatalError(error, { command: 'push' });
423
+ }
424
+ });
425
+
426
+ // ============================================================================
427
+ // END OF NEW SIMPLIFIED COMMANDS
428
+ // ============================================================================
429
+
280
430
  // Watch command
281
431
  this.program
282
432
  .command('watch')