@arela/uploader 1.0.1 → 1.0.3

This diff shows the changes between publicly available versions of this package that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
@@ -15,6 +15,8 @@ class Config {
15
15
  this.supabase = this.#loadSupabaseConfig();
16
16
  this.api = this.#loadApiConfig();
17
17
  this.upload = this.#loadUploadConfig();
18
+ this.scan = this.#loadScanConfig();
19
+ this.push = this.#loadPushConfig();
18
20
  this.performance = this.#loadPerformanceConfig();
19
21
  this.logging = this.#loadLoggingConfig();
20
22
  this.watch = this.#loadWatchConfig();
@@ -30,10 +32,10 @@ class Config {
30
32
  const __dirname = path.dirname(__filename);
31
33
  const packageJsonPath = path.resolve(__dirname, '../../package.json');
32
34
  const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf-8'));
33
- return packageJson.version || '1.0.1';
35
+ return packageJson.version || '1.0.3';
34
36
  } catch (error) {
35
37
  console.warn('⚠️ Could not read package.json version, using fallback');
36
- return '1.0.1';
38
+ return '1.0.3';
37
39
  }
38
40
  }
39
41
 
@@ -239,6 +241,177 @@ class Config {
239
241
  };
240
242
  }
241
243
 
244
+ /**
245
+ * Load scan configuration
246
+ * @private
247
+ */
248
+ #loadScanConfig() {
249
+ const companySlug = process.env.ARELA_COMPANY_SLUG;
250
+ const serverId = process.env.ARELA_SERVER_ID;
251
+ let basePathLabel = process.env.ARELA_BASE_PATH_LABEL;
252
+
253
+ // Auto-derive basePathLabel from UPLOAD_BASE_PATH if not set
254
+ if (!basePathLabel && process.env.UPLOAD_BASE_PATH) {
255
+ const basePath = process.env.UPLOAD_BASE_PATH;
256
+ // Get the last segment of the path
257
+ const segments = basePath.split(path.sep).filter(Boolean);
258
+ basePathLabel = segments[segments.length - 1] || 'root';
259
+ // Sanitize the label
260
+ basePathLabel = basePathLabel.replace(/[^a-zA-Z0-9_-]/g, '_');
261
+ }
262
+
263
+ // Parse exclude patterns
264
+ const defaultExcludePatterns =
265
+ '.DS_Store,Thumbs.db,desktop.ini,__pycache__,.pyc,.tmp,.swp,$RECYCLE.BIN,System Volume Information,~$*';
266
+ const excludePatterns = (
267
+ process.env.SCAN_EXCLUDE_PATTERNS || defaultExcludePatterns
268
+ )
269
+ .split(',')
270
+ .map((p) => p.trim())
271
+ .filter(Boolean);
272
+
273
+ // Generate table name if all components are available
274
+ let tableName = null;
275
+ if (companySlug && serverId && basePathLabel) {
276
+ const rawName = `${companySlug}_${serverId}_${basePathLabel}`;
277
+ tableName = this.#sanitizeTableName(rawName);
278
+ }
279
+
280
+ return {
281
+ companySlug,
282
+ serverId,
283
+ basePathLabel,
284
+ tableName,
285
+ excludePatterns,
286
+ batchSize: parseInt(process.env.SCAN_BATCH_SIZE) || 2000,
287
+ directoryLevel: parseInt(process.env.SCAN_DIRECTORY_LEVEL) || 0,
288
+ };
289
+ }
290
+
291
+ /**
292
+ * Load push configuration
293
+ * @private
294
+ */
295
+ #loadPushConfig() {
296
+ const pushRfcs = process.env.PUSH_RFCS?.split('|')
297
+ .map((s) => s.trim().toUpperCase())
298
+ .filter(Boolean);
299
+
300
+ const pushYears = process.env.PUSH_YEARS?.split('|')
301
+ .map((s) => parseInt(s.trim(), 10))
302
+ .filter((y) => !isNaN(y));
303
+
304
+ return {
305
+ rfcs: pushRfcs,
306
+ years: pushYears,
307
+ batchSize: parseInt(process.env.PUSH_BATCH_SIZE) || 50,
308
+ uploadBatchSize: parseInt(process.env.PUSH_UPLOAD_BATCH_SIZE) || 10,
309
+ bucket:
310
+ process.env.PUSH_BUCKET || process.env.SUPABASE_BUCKET || 'archivos',
311
+ };
312
+ }
313
+
314
+ /**
315
+ * Sanitize and generate table name
316
+ * @private
317
+ */
318
+ #sanitizeTableName(rawName) {
319
+ // Sanitize: lowercase, replace special chars with underscore
320
+ let sanitized = rawName
321
+ .toLowerCase()
322
+ .replace(/[^a-z0-9_]/g, '_')
323
+ .replace(/_+/g, '_')
324
+ .replace(/^_|_$/g, '');
325
+
326
+ const prefix = 'scan_';
327
+ let tableName = prefix + sanitized;
328
+
329
+ // PostgreSQL table name limit is 63 characters
330
+ if (tableName.length > 63) {
331
+ // Simple hash without crypto module
332
+ let hash = 0;
333
+ for (let i = 0; i < rawName.length; i++) {
334
+ const char = rawName.charCodeAt(i);
335
+ hash = (hash << 5) - hash + char;
336
+ hash = hash & hash; // Convert to 32bit integer
337
+ }
338
+ const hashStr = Math.abs(hash).toString(36).substring(0, 8);
339
+ const maxBaseLength = 63 - hashStr.length - 1;
340
+ tableName = tableName.substring(0, maxBaseLength) + '_' + hashStr;
341
+ }
342
+
343
+ return tableName;
344
+ }
345
+
346
+ /**
347
+ * Validate scan configuration
348
+ * @throws {Error} If required scan configuration is missing
349
+ */
350
+ validateScanConfig() {
351
+ const errors = [];
352
+
353
+ if (!this.scan.companySlug) {
354
+ errors.push(
355
+ 'ARELA_COMPANY_SLUG is required (e.g., "acme_corp", "cliente_123")',
356
+ );
357
+ } else if (!/^[a-zA-Z0-9_-]+$/.test(this.scan.companySlug)) {
358
+ errors.push(
359
+ 'ARELA_COMPANY_SLUG must contain only alphanumeric characters, dashes, and underscores',
360
+ );
361
+ }
362
+
363
+ if (!this.scan.serverId) {
364
+ errors.push(
365
+ 'ARELA_SERVER_ID is required (e.g., "nas01", "server-mx", "storage-01")',
366
+ );
367
+ } else if (!/^[a-zA-Z0-9_-]+$/.test(this.scan.serverId)) {
368
+ errors.push(
369
+ 'ARELA_SERVER_ID must contain only alphanumeric characters, dashes, and underscores',
370
+ );
371
+ }
372
+
373
+ if (!this.upload.basePath) {
374
+ errors.push(
375
+ 'UPLOAD_BASE_PATH is required to determine the scan base path',
376
+ );
377
+ }
378
+
379
+ if (!this.scan.basePathLabel) {
380
+ errors.push(
381
+ 'Could not determine base path label. Set ARELA_BASE_PATH_LABEL or UPLOAD_BASE_PATH',
382
+ );
383
+ }
384
+
385
+ if (errors.length > 0) {
386
+ throw new Error(
387
+ '⚠️ Scan configuration errors:\n - ' + errors.join('\n - '),
388
+ );
389
+ }
390
+ }
391
+
392
+ /**
393
+ * Get scan configuration
394
+ * @returns {Object} Scan configuration
395
+ */
396
+ getScanConfig() {
397
+ return {
398
+ ...this.scan,
399
+ basePathFull: this.upload.basePath,
400
+ };
401
+ }
402
+
403
+ /**
404
+ * Get push configuration
405
+ * @returns {Object} Push configuration
406
+ */
407
+ getPushConfig() {
408
+ return {
409
+ ...this.push,
410
+ rfcs: this.push.rfcs || [],
411
+ years: this.push.years || [],
412
+ };
413
+ }
414
+
242
415
  /**
243
416
  * Load performance configuration
244
417
  * @private
@@ -365,14 +538,11 @@ class Config {
365
538
 
366
539
  /**
367
540
  * Get upload sources with validation
368
- * @returns {string[]} Array of upload sources
369
- * @throws {Error} If sources are not configured
541
+ * @returns {string[]} Array of upload sources (defaults to ['.'] if not configured)
370
542
  */
371
543
  getUploadSources() {
372
544
  if (!this.upload.sources || this.upload.sources.length === 0) {
373
- throw new Error(
374
- '⚠️ No upload sources configured. Please set UPLOAD_SOURCES environment variable.',
375
- );
545
+ return ['.'];
376
546
  }
377
547
  return this.upload.sources;
378
548
  }
@@ -1,11 +1,9 @@
1
1
  import fs from 'fs';
2
- import { getTextExtractor } from 'office-text-extractor';
3
2
  import path from 'path';
3
+ import { PDFParse } from 'pdf-parse';
4
4
 
5
5
  import { extractDocumentFields } from './document-type-shared.js';
6
6
 
7
- const extractor = getTextExtractor();
8
-
9
7
  /**
10
8
  * Compose arela_path from extracted pedimento fields
11
9
  * Format: RFC/Year/Patente/Aduana/Pedimento/
@@ -151,18 +149,19 @@ export class FileDetectionService {
151
149
  }
152
150
 
153
151
  /**
154
- * Extract text from PDF file
152
+ * Extract text from PDF file using pdf-parse
153
+ * More reliable for concurrent operations than office-text-extractor
155
154
  * @param {string} filePath - Path to PDF file
156
155
  * @returns {Promise<string>} - Extracted text
157
156
  */
158
157
  async extractTextFromPDF(filePath) {
159
158
  try {
160
- const buffer = fs.readFileSync(filePath);
161
- const text = await extractor.extractText({
162
- input: buffer,
163
- type: 'file',
164
- });
165
- return text;
159
+ const dataBuffer = fs.readFileSync(filePath);
160
+ // Convert Buffer to Uint8Array as required by pdf-parse
161
+ const uint8Array = new Uint8Array(dataBuffer);
162
+ const pdfParse = new PDFParse(uint8Array);
163
+ const result = await pdfParse.getText();
164
+ return result.text;
166
165
  } catch (error) {
167
166
  console.error(
168
167
  `Error extracting text from PDF ${filePath}:`,
package/src/index.js CHANGED
@@ -1,6 +1,10 @@
1
1
  #!/usr/bin/env node
2
2
  import { Command } from 'commander';
3
3
 
4
+ import identifyCommand from './commands/IdentifyCommand.js';
5
+ import PropagateCommand from './commands/PropagateCommand.js';
6
+ import PushCommand from './commands/PushCommand.js';
7
+ import scanCommand from './commands/ScanCommand.js';
4
8
  import UploadCommand from './commands/UploadCommand.js';
5
9
  import watchCommand from './commands/WatchCommand.js';
6
10
  import appConfig from './config/config.js';
@@ -15,6 +19,8 @@ class ArelaUploaderCLI {
15
19
  constructor() {
16
20
  this.program = new Command();
17
21
  this.errorHandler = new ErrorHandler(logger);
22
+ this.identifyCommand = identifyCommand;
23
+ this.scanCommand = scanCommand;
18
24
  this.uploadCommand = new UploadCommand();
19
25
  this.watchCommand = watchCommand;
20
26
 
@@ -164,6 +170,33 @@ class ArelaUploaderCLI {
164
170
  }
165
171
  });
166
172
 
173
+ // Scan command (optimized stats collection with streaming)
174
+ this.program
175
+ .command('scan')
176
+ .description(
177
+ 'Scan filesystem and collect file statistics (optimized with streaming)',
178
+ )
179
+ .option(
180
+ '--api <target>',
181
+ 'API target: agencia|cliente|default',
182
+ 'default',
183
+ )
184
+ .option(
185
+ '--count-first',
186
+ 'Count files first for percentage-based progress (slower start)',
187
+ )
188
+ .action(async (options) => {
189
+ try {
190
+ // Set API target if specified
191
+ if (options.api && options.api !== 'default') {
192
+ appConfig.setApiTarget(options.api);
193
+ }
194
+ await this.scanCommand.execute(options);
195
+ } catch (error) {
196
+ this.errorHandler.handleFatalError(error, { command: 'scan' });
197
+ }
198
+ });
199
+
167
200
  // Detection command
168
201
  this.program
169
202
  .command('detect')
@@ -277,6 +310,123 @@ class ArelaUploaderCLI {
277
310
  }
278
311
  });
279
312
 
313
+ // ============================================================================
314
+ // NEW SIMPLIFIED COMMANDS (Optimized versions with better naming)
315
+ // ============================================================================
316
+
317
+ // Identify command - simplified version of "detect --detect-pdfs"
318
+ this.program
319
+ .command('identify')
320
+ .description('🔍 Identify document types using matchers (optimized)')
321
+ .option(
322
+ '--api <target>',
323
+ 'API target: agencia|cliente|default',
324
+ 'default',
325
+ )
326
+ .option(
327
+ '-b, --batch-size <size>',
328
+ 'Number of files to process in each batch',
329
+ '100',
330
+ )
331
+ .option('--show-stats', 'Show performance statistics')
332
+ .action(async (options) => {
333
+ try {
334
+ await this.identifyCommand.execute(options);
335
+ } catch (error) {
336
+ this.errorHandler.handleFatalError(error, { command: 'identify' });
337
+ }
338
+ });
339
+
340
+ // Propagate command - simplified version of "detect --propagate-arela-path"
341
+ this.program
342
+ .command('propagate')
343
+ .description(
344
+ '🔄 Propagate arela_path from pedimentos to related files (optimized)',
345
+ )
346
+ .option(
347
+ '--api <target>',
348
+ 'API target: agencia|cliente|default',
349
+ 'default',
350
+ )
351
+ .option(
352
+ '-b, --batch-size <size>',
353
+ 'Number of pedimentos to process per batch',
354
+ '50',
355
+ )
356
+ .option('--show-stats', 'Show performance statistics')
357
+ .action(async (options) => {
358
+ try {
359
+ const propagateCommand = new PropagateCommand(options);
360
+ await propagateCommand.execute();
361
+ } catch (error) {
362
+ this.errorHandler.handleFatalError(error, { command: 'propagate' });
363
+ }
364
+ });
365
+
366
+ // Push command - simplified version of "upload --upload-by-rfc"
367
+ this.program
368
+ .command('push')
369
+ .description('📤 Upload files by RFC to Arela API (optimized)')
370
+ .option(
371
+ '--api <target>',
372
+ 'API target for scan operations: default|agencia|cliente',
373
+ 'default',
374
+ )
375
+ .option(
376
+ '--scan-api <target>',
377
+ 'API for reading scan table: default|agencia|cliente',
378
+ 'default',
379
+ )
380
+ .option(
381
+ '--push-api <target>',
382
+ 'API for uploading files: default|agencia|cliente',
383
+ )
384
+ .option(
385
+ '-b, --batch-size <size>',
386
+ 'Number of files to fetch per batch',
387
+ '100',
388
+ )
389
+ .option(
390
+ '--upload-batch-size <size>',
391
+ 'Number of files to upload concurrently',
392
+ '10',
393
+ )
394
+ .option(
395
+ '--rfcs <rfcs>',
396
+ 'Comma-separated RFCs to filter (overrides PUSH_RFCS env var)',
397
+ )
398
+ .option(
399
+ '--years <years>',
400
+ 'Comma-separated years to filter (overrides PUSH_YEARS env var)',
401
+ )
402
+ .option('--show-stats', 'Show performance statistics')
403
+ .action(async (options) => {
404
+ try {
405
+ // Parse comma-separated values
406
+ if (options.rfcs) {
407
+ options.rfcs = options.rfcs
408
+ .split(',')
409
+ .map((r) => r.trim().toUpperCase())
410
+ .filter(Boolean);
411
+ }
412
+ if (options.years) {
413
+ options.years = options.years
414
+ .split(',')
415
+ .map((y) => parseInt(y.trim(), 10))
416
+ .filter((y) => !isNaN(y));
417
+ }
418
+
419
+ const pushCommand = new PushCommand();
420
+ await pushCommand.execute(options);
421
+ } catch (error) {
422
+ this.errorHandler.handleFatalError(error, { command: 'push' });
423
+ }
424
+ });
425
+
426
+ // ============================================================================
427
+ // END OF NEW SIMPLIFIED COMMANDS
428
+ // ============================================================================
429
+
280
430
  // Watch command
281
431
  this.program
282
432
  .command('watch')
@@ -498,8 +498,8 @@ export class DatabaseService {
498
498
  );
499
499
 
500
500
  const processingBatchSize = parseInt(options.batchSize) || 10;
501
- // Reduced query batch size to avoid timeouts
502
- const queryBatchSize = 100; // Reduced from 500 to 100
501
+ // Query batch size for each API call
502
+ const queryBatchSize = 100;
503
503
 
504
504
  let totalDetected = 0;
505
505
  let totalProcessed = 0;