@arela/uploader 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.template +70 -0
- package/docs/API_RETRY_MECHANISM.md +338 -0
- package/docs/ARELA_IDENTIFY_IMPLEMENTATION.md +489 -0
- package/docs/ARELA_IDENTIFY_QUICKREF.md +186 -0
- package/docs/ARELA_PROPAGATE_IMPLEMENTATION.md +581 -0
- package/docs/ARELA_PROPAGATE_QUICKREF.md +272 -0
- package/docs/ARELA_PUSH_IMPLEMENTATION.md +577 -0
- package/docs/ARELA_PUSH_QUICKREF.md +322 -0
- package/docs/ARELA_SCAN_IMPLEMENTATION.md +373 -0
- package/docs/ARELA_SCAN_QUICKREF.md +139 -0
- package/docs/DETECTION_ATTEMPT_TRACKING.md +414 -0
- package/docs/MIGRATION_UPLOADER_TO_FILE_STATS.md +1020 -0
- package/docs/MULTI_LEVEL_DIRECTORY_SCANNING.md +494 -0
- package/docs/STATS_COMMAND_SEQUENCE_DIAGRAM.md +287 -0
- package/docs/STATS_COMMAND_SIMPLE.md +93 -0
- package/package.json +4 -2
- package/src/commands/IdentifyCommand.js +486 -0
- package/src/commands/PropagateCommand.js +474 -0
- package/src/commands/PushCommand.js +473 -0
- package/src/commands/ScanCommand.js +516 -0
- package/src/config/config.js +177 -7
- package/src/file-detection.js +9 -10
- package/src/index.js +150 -0
- package/src/services/ScanApiService.js +646 -0
package/src/config/config.js
CHANGED
|
@@ -15,6 +15,8 @@ class Config {
|
|
|
15
15
|
this.supabase = this.#loadSupabaseConfig();
|
|
16
16
|
this.api = this.#loadApiConfig();
|
|
17
17
|
this.upload = this.#loadUploadConfig();
|
|
18
|
+
this.scan = this.#loadScanConfig();
|
|
19
|
+
this.push = this.#loadPushConfig();
|
|
18
20
|
this.performance = this.#loadPerformanceConfig();
|
|
19
21
|
this.logging = this.#loadLoggingConfig();
|
|
20
22
|
this.watch = this.#loadWatchConfig();
|
|
@@ -30,10 +32,10 @@ class Config {
|
|
|
30
32
|
const __dirname = path.dirname(__filename);
|
|
31
33
|
const packageJsonPath = path.resolve(__dirname, '../../package.json');
|
|
32
34
|
const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf-8'));
|
|
33
|
-
return packageJson.version || '1.0.
|
|
35
|
+
return packageJson.version || '1.0.3';
|
|
34
36
|
} catch (error) {
|
|
35
37
|
console.warn('⚠️ Could not read package.json version, using fallback');
|
|
36
|
-
return '1.0.
|
|
38
|
+
return '1.0.3';
|
|
37
39
|
}
|
|
38
40
|
}
|
|
39
41
|
|
|
@@ -239,6 +241,177 @@ class Config {
|
|
|
239
241
|
};
|
|
240
242
|
}
|
|
241
243
|
|
|
244
|
+
/**
|
|
245
|
+
* Load scan configuration
|
|
246
|
+
* @private
|
|
247
|
+
*/
|
|
248
|
+
#loadScanConfig() {
|
|
249
|
+
const companySlug = process.env.ARELA_COMPANY_SLUG;
|
|
250
|
+
const serverId = process.env.ARELA_SERVER_ID;
|
|
251
|
+
let basePathLabel = process.env.ARELA_BASE_PATH_LABEL;
|
|
252
|
+
|
|
253
|
+
// Auto-derive basePathLabel from UPLOAD_BASE_PATH if not set
|
|
254
|
+
if (!basePathLabel && process.env.UPLOAD_BASE_PATH) {
|
|
255
|
+
const basePath = process.env.UPLOAD_BASE_PATH;
|
|
256
|
+
// Get the last segment of the path
|
|
257
|
+
const segments = basePath.split(path.sep).filter(Boolean);
|
|
258
|
+
basePathLabel = segments[segments.length - 1] || 'root';
|
|
259
|
+
// Sanitize the label
|
|
260
|
+
basePathLabel = basePathLabel.replace(/[^a-zA-Z0-9_-]/g, '_');
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
// Parse exclude patterns
|
|
264
|
+
const defaultExcludePatterns =
|
|
265
|
+
'.DS_Store,Thumbs.db,desktop.ini,__pycache__,.pyc,.tmp,.swp,$RECYCLE.BIN,System Volume Information,~$*';
|
|
266
|
+
const excludePatterns = (
|
|
267
|
+
process.env.SCAN_EXCLUDE_PATTERNS || defaultExcludePatterns
|
|
268
|
+
)
|
|
269
|
+
.split(',')
|
|
270
|
+
.map((p) => p.trim())
|
|
271
|
+
.filter(Boolean);
|
|
272
|
+
|
|
273
|
+
// Generate table name if all components are available
|
|
274
|
+
let tableName = null;
|
|
275
|
+
if (companySlug && serverId && basePathLabel) {
|
|
276
|
+
const rawName = `${companySlug}_${serverId}_${basePathLabel}`;
|
|
277
|
+
tableName = this.#sanitizeTableName(rawName);
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
return {
|
|
281
|
+
companySlug,
|
|
282
|
+
serverId,
|
|
283
|
+
basePathLabel,
|
|
284
|
+
tableName,
|
|
285
|
+
excludePatterns,
|
|
286
|
+
batchSize: parseInt(process.env.SCAN_BATCH_SIZE) || 2000,
|
|
287
|
+
directoryLevel: parseInt(process.env.SCAN_DIRECTORY_LEVEL) || 0,
|
|
288
|
+
};
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
/**
|
|
292
|
+
* Load push configuration
|
|
293
|
+
* @private
|
|
294
|
+
*/
|
|
295
|
+
#loadPushConfig() {
|
|
296
|
+
const pushRfcs = process.env.PUSH_RFCS?.split('|')
|
|
297
|
+
.map((s) => s.trim().toUpperCase())
|
|
298
|
+
.filter(Boolean);
|
|
299
|
+
|
|
300
|
+
const pushYears = process.env.PUSH_YEARS?.split('|')
|
|
301
|
+
.map((s) => parseInt(s.trim(), 10))
|
|
302
|
+
.filter((y) => !isNaN(y));
|
|
303
|
+
|
|
304
|
+
return {
|
|
305
|
+
rfcs: pushRfcs,
|
|
306
|
+
years: pushYears,
|
|
307
|
+
batchSize: parseInt(process.env.PUSH_BATCH_SIZE) || 50,
|
|
308
|
+
uploadBatchSize: parseInt(process.env.PUSH_UPLOAD_BATCH_SIZE) || 10,
|
|
309
|
+
bucket:
|
|
310
|
+
process.env.PUSH_BUCKET || process.env.SUPABASE_BUCKET || 'archivos',
|
|
311
|
+
};
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
/**
|
|
315
|
+
* Sanitize and generate table name
|
|
316
|
+
* @private
|
|
317
|
+
*/
|
|
318
|
+
#sanitizeTableName(rawName) {
|
|
319
|
+
// Sanitize: lowercase, replace special chars with underscore
|
|
320
|
+
let sanitized = rawName
|
|
321
|
+
.toLowerCase()
|
|
322
|
+
.replace(/[^a-z0-9_]/g, '_')
|
|
323
|
+
.replace(/_+/g, '_')
|
|
324
|
+
.replace(/^_|_$/g, '');
|
|
325
|
+
|
|
326
|
+
const prefix = 'scan_';
|
|
327
|
+
let tableName = prefix + sanitized;
|
|
328
|
+
|
|
329
|
+
// PostgreSQL table name limit is 63 characters
|
|
330
|
+
if (tableName.length > 63) {
|
|
331
|
+
// Simple hash without crypto module
|
|
332
|
+
let hash = 0;
|
|
333
|
+
for (let i = 0; i < rawName.length; i++) {
|
|
334
|
+
const char = rawName.charCodeAt(i);
|
|
335
|
+
hash = (hash << 5) - hash + char;
|
|
336
|
+
hash = hash & hash; // Convert to 32bit integer
|
|
337
|
+
}
|
|
338
|
+
const hashStr = Math.abs(hash).toString(36).substring(0, 8);
|
|
339
|
+
const maxBaseLength = 63 - hashStr.length - 1;
|
|
340
|
+
tableName = tableName.substring(0, maxBaseLength) + '_' + hashStr;
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
return tableName;
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
/**
|
|
347
|
+
* Validate scan configuration
|
|
348
|
+
* @throws {Error} If required scan configuration is missing
|
|
349
|
+
*/
|
|
350
|
+
validateScanConfig() {
|
|
351
|
+
const errors = [];
|
|
352
|
+
|
|
353
|
+
if (!this.scan.companySlug) {
|
|
354
|
+
errors.push(
|
|
355
|
+
'ARELA_COMPANY_SLUG is required (e.g., "acme_corp", "cliente_123")',
|
|
356
|
+
);
|
|
357
|
+
} else if (!/^[a-zA-Z0-9_-]+$/.test(this.scan.companySlug)) {
|
|
358
|
+
errors.push(
|
|
359
|
+
'ARELA_COMPANY_SLUG must contain only alphanumeric characters, dashes, and underscores',
|
|
360
|
+
);
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
if (!this.scan.serverId) {
|
|
364
|
+
errors.push(
|
|
365
|
+
'ARELA_SERVER_ID is required (e.g., "nas01", "server-mx", "storage-01")',
|
|
366
|
+
);
|
|
367
|
+
} else if (!/^[a-zA-Z0-9_-]+$/.test(this.scan.serverId)) {
|
|
368
|
+
errors.push(
|
|
369
|
+
'ARELA_SERVER_ID must contain only alphanumeric characters, dashes, and underscores',
|
|
370
|
+
);
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
if (!this.upload.basePath) {
|
|
374
|
+
errors.push(
|
|
375
|
+
'UPLOAD_BASE_PATH is required to determine the scan base path',
|
|
376
|
+
);
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
if (!this.scan.basePathLabel) {
|
|
380
|
+
errors.push(
|
|
381
|
+
'Could not determine base path label. Set ARELA_BASE_PATH_LABEL or UPLOAD_BASE_PATH',
|
|
382
|
+
);
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
if (errors.length > 0) {
|
|
386
|
+
throw new Error(
|
|
387
|
+
'⚠️ Scan configuration errors:\n - ' + errors.join('\n - '),
|
|
388
|
+
);
|
|
389
|
+
}
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
/**
|
|
393
|
+
* Get scan configuration
|
|
394
|
+
* @returns {Object} Scan configuration
|
|
395
|
+
*/
|
|
396
|
+
getScanConfig() {
|
|
397
|
+
return {
|
|
398
|
+
...this.scan,
|
|
399
|
+
basePathFull: this.upload.basePath,
|
|
400
|
+
};
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
/**
|
|
404
|
+
* Get push configuration
|
|
405
|
+
* @returns {Object} Push configuration
|
|
406
|
+
*/
|
|
407
|
+
getPushConfig() {
|
|
408
|
+
return {
|
|
409
|
+
...this.push,
|
|
410
|
+
rfcs: this.push.rfcs || [],
|
|
411
|
+
years: this.push.years || [],
|
|
412
|
+
};
|
|
413
|
+
}
|
|
414
|
+
|
|
242
415
|
/**
|
|
243
416
|
* Load performance configuration
|
|
244
417
|
* @private
|
|
@@ -365,14 +538,11 @@ class Config {
|
|
|
365
538
|
|
|
366
539
|
/**
|
|
367
540
|
* Get upload sources with validation
|
|
368
|
-
* @returns {string[]} Array of upload sources
|
|
369
|
-
* @throws {Error} If sources are not configured
|
|
541
|
+
* @returns {string[]} Array of upload sources (defaults to ['.'] if not configured)
|
|
370
542
|
*/
|
|
371
543
|
getUploadSources() {
|
|
372
544
|
if (!this.upload.sources || this.upload.sources.length === 0) {
|
|
373
|
-
|
|
374
|
-
'⚠️ No upload sources configured. Please set UPLOAD_SOURCES environment variable.',
|
|
375
|
-
);
|
|
545
|
+
return ['.'];
|
|
376
546
|
}
|
|
377
547
|
return this.upload.sources;
|
|
378
548
|
}
|
package/src/file-detection.js
CHANGED
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
import fs from 'fs';
|
|
2
|
-
import { getTextExtractor } from 'office-text-extractor';
|
|
3
2
|
import path from 'path';
|
|
3
|
+
import { PDFParse } from 'pdf-parse';
|
|
4
4
|
|
|
5
5
|
import { extractDocumentFields } from './document-type-shared.js';
|
|
6
6
|
|
|
7
|
-
const extractor = getTextExtractor();
|
|
8
|
-
|
|
9
7
|
/**
|
|
10
8
|
* Compose arela_path from extracted pedimento fields
|
|
11
9
|
* Format: RFC/Year/Patente/Aduana/Pedimento/
|
|
@@ -151,18 +149,19 @@ export class FileDetectionService {
|
|
|
151
149
|
}
|
|
152
150
|
|
|
153
151
|
/**
|
|
154
|
-
* Extract text from PDF file
|
|
152
|
+
* Extract text from PDF file using pdf-parse
|
|
153
|
+
* More reliable for concurrent operations than office-text-extractor
|
|
155
154
|
* @param {string} filePath - Path to PDF file
|
|
156
155
|
* @returns {Promise<string>} - Extracted text
|
|
157
156
|
*/
|
|
158
157
|
async extractTextFromPDF(filePath) {
|
|
159
158
|
try {
|
|
160
|
-
const
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
return text;
|
|
159
|
+
const dataBuffer = fs.readFileSync(filePath);
|
|
160
|
+
// Convert Buffer to Uint8Array as required by pdf-parse
|
|
161
|
+
const uint8Array = new Uint8Array(dataBuffer);
|
|
162
|
+
const pdfParse = new PDFParse(uint8Array);
|
|
163
|
+
const result = await pdfParse.getText();
|
|
164
|
+
return result.text;
|
|
166
165
|
} catch (error) {
|
|
167
166
|
console.error(
|
|
168
167
|
`Error extracting text from PDF ${filePath}:`,
|
package/src/index.js
CHANGED
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { Command } from 'commander';
|
|
3
3
|
|
|
4
|
+
import identifyCommand from './commands/IdentifyCommand.js';
|
|
5
|
+
import PropagateCommand from './commands/PropagateCommand.js';
|
|
6
|
+
import PushCommand from './commands/PushCommand.js';
|
|
7
|
+
import scanCommand from './commands/ScanCommand.js';
|
|
4
8
|
import UploadCommand from './commands/UploadCommand.js';
|
|
5
9
|
import watchCommand from './commands/WatchCommand.js';
|
|
6
10
|
import appConfig from './config/config.js';
|
|
@@ -15,6 +19,8 @@ class ArelaUploaderCLI {
|
|
|
15
19
|
constructor() {
|
|
16
20
|
this.program = new Command();
|
|
17
21
|
this.errorHandler = new ErrorHandler(logger);
|
|
22
|
+
this.identifyCommand = identifyCommand;
|
|
23
|
+
this.scanCommand = scanCommand;
|
|
18
24
|
this.uploadCommand = new UploadCommand();
|
|
19
25
|
this.watchCommand = watchCommand;
|
|
20
26
|
|
|
@@ -164,6 +170,33 @@ class ArelaUploaderCLI {
|
|
|
164
170
|
}
|
|
165
171
|
});
|
|
166
172
|
|
|
173
|
+
// Scan command (optimized stats collection with streaming)
|
|
174
|
+
this.program
|
|
175
|
+
.command('scan')
|
|
176
|
+
.description(
|
|
177
|
+
'Scan filesystem and collect file statistics (optimized with streaming)',
|
|
178
|
+
)
|
|
179
|
+
.option(
|
|
180
|
+
'--api <target>',
|
|
181
|
+
'API target: agencia|cliente|default',
|
|
182
|
+
'default',
|
|
183
|
+
)
|
|
184
|
+
.option(
|
|
185
|
+
'--count-first',
|
|
186
|
+
'Count files first for percentage-based progress (slower start)',
|
|
187
|
+
)
|
|
188
|
+
.action(async (options) => {
|
|
189
|
+
try {
|
|
190
|
+
// Set API target if specified
|
|
191
|
+
if (options.api && options.api !== 'default') {
|
|
192
|
+
appConfig.setApiTarget(options.api);
|
|
193
|
+
}
|
|
194
|
+
await this.scanCommand.execute(options);
|
|
195
|
+
} catch (error) {
|
|
196
|
+
this.errorHandler.handleFatalError(error, { command: 'scan' });
|
|
197
|
+
}
|
|
198
|
+
});
|
|
199
|
+
|
|
167
200
|
// Detection command
|
|
168
201
|
this.program
|
|
169
202
|
.command('detect')
|
|
@@ -277,6 +310,123 @@ class ArelaUploaderCLI {
|
|
|
277
310
|
}
|
|
278
311
|
});
|
|
279
312
|
|
|
313
|
+
// ============================================================================
|
|
314
|
+
// NEW SIMPLIFIED COMMANDS (Optimized versions with better naming)
|
|
315
|
+
// ============================================================================
|
|
316
|
+
|
|
317
|
+
// Identify command - simplified version of "detect --detect-pdfs"
|
|
318
|
+
this.program
|
|
319
|
+
.command('identify')
|
|
320
|
+
.description('🔍 Identify document types using matchers (optimized)')
|
|
321
|
+
.option(
|
|
322
|
+
'--api <target>',
|
|
323
|
+
'API target: agencia|cliente|default',
|
|
324
|
+
'default',
|
|
325
|
+
)
|
|
326
|
+
.option(
|
|
327
|
+
'-b, --batch-size <size>',
|
|
328
|
+
'Number of files to process in each batch',
|
|
329
|
+
'100',
|
|
330
|
+
)
|
|
331
|
+
.option('--show-stats', 'Show performance statistics')
|
|
332
|
+
.action(async (options) => {
|
|
333
|
+
try {
|
|
334
|
+
await this.identifyCommand.execute(options);
|
|
335
|
+
} catch (error) {
|
|
336
|
+
this.errorHandler.handleFatalError(error, { command: 'identify' });
|
|
337
|
+
}
|
|
338
|
+
});
|
|
339
|
+
|
|
340
|
+
// Propagate command - simplified version of "detect --propagate-arela-path"
|
|
341
|
+
this.program
|
|
342
|
+
.command('propagate')
|
|
343
|
+
.description(
|
|
344
|
+
'🔄 Propagate arela_path from pedimentos to related files (optimized)',
|
|
345
|
+
)
|
|
346
|
+
.option(
|
|
347
|
+
'--api <target>',
|
|
348
|
+
'API target: agencia|cliente|default',
|
|
349
|
+
'default',
|
|
350
|
+
)
|
|
351
|
+
.option(
|
|
352
|
+
'-b, --batch-size <size>',
|
|
353
|
+
'Number of pedimentos to process per batch',
|
|
354
|
+
'50',
|
|
355
|
+
)
|
|
356
|
+
.option('--show-stats', 'Show performance statistics')
|
|
357
|
+
.action(async (options) => {
|
|
358
|
+
try {
|
|
359
|
+
const propagateCommand = new PropagateCommand(options);
|
|
360
|
+
await propagateCommand.execute();
|
|
361
|
+
} catch (error) {
|
|
362
|
+
this.errorHandler.handleFatalError(error, { command: 'propagate' });
|
|
363
|
+
}
|
|
364
|
+
});
|
|
365
|
+
|
|
366
|
+
// Push command - simplified version of "upload --upload-by-rfc"
|
|
367
|
+
this.program
|
|
368
|
+
.command('push')
|
|
369
|
+
.description('📤 Upload files by RFC to Arela API (optimized)')
|
|
370
|
+
.option(
|
|
371
|
+
'--api <target>',
|
|
372
|
+
'API target for scan operations: default|agencia|cliente',
|
|
373
|
+
'default',
|
|
374
|
+
)
|
|
375
|
+
.option(
|
|
376
|
+
'--scan-api <target>',
|
|
377
|
+
'API for reading scan table: default|agencia|cliente',
|
|
378
|
+
'default',
|
|
379
|
+
)
|
|
380
|
+
.option(
|
|
381
|
+
'--push-api <target>',
|
|
382
|
+
'API for uploading files: default|agencia|cliente',
|
|
383
|
+
)
|
|
384
|
+
.option(
|
|
385
|
+
'-b, --batch-size <size>',
|
|
386
|
+
'Number of files to fetch per batch',
|
|
387
|
+
'100',
|
|
388
|
+
)
|
|
389
|
+
.option(
|
|
390
|
+
'--upload-batch-size <size>',
|
|
391
|
+
'Number of files to upload concurrently',
|
|
392
|
+
'10',
|
|
393
|
+
)
|
|
394
|
+
.option(
|
|
395
|
+
'--rfcs <rfcs>',
|
|
396
|
+
'Comma-separated RFCs to filter (overrides PUSH_RFCS env var)',
|
|
397
|
+
)
|
|
398
|
+
.option(
|
|
399
|
+
'--years <years>',
|
|
400
|
+
'Comma-separated years to filter (overrides PUSH_YEARS env var)',
|
|
401
|
+
)
|
|
402
|
+
.option('--show-stats', 'Show performance statistics')
|
|
403
|
+
.action(async (options) => {
|
|
404
|
+
try {
|
|
405
|
+
// Parse comma-separated values
|
|
406
|
+
if (options.rfcs) {
|
|
407
|
+
options.rfcs = options.rfcs
|
|
408
|
+
.split(',')
|
|
409
|
+
.map((r) => r.trim().toUpperCase())
|
|
410
|
+
.filter(Boolean);
|
|
411
|
+
}
|
|
412
|
+
if (options.years) {
|
|
413
|
+
options.years = options.years
|
|
414
|
+
.split(',')
|
|
415
|
+
.map((y) => parseInt(y.trim(), 10))
|
|
416
|
+
.filter((y) => !isNaN(y));
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
const pushCommand = new PushCommand();
|
|
420
|
+
await pushCommand.execute(options);
|
|
421
|
+
} catch (error) {
|
|
422
|
+
this.errorHandler.handleFatalError(error, { command: 'push' });
|
|
423
|
+
}
|
|
424
|
+
});
|
|
425
|
+
|
|
426
|
+
// ============================================================================
|
|
427
|
+
// END OF NEW SIMPLIFIED COMMANDS
|
|
428
|
+
// ============================================================================
|
|
429
|
+
|
|
280
430
|
// Watch command
|
|
281
431
|
this.program
|
|
282
432
|
.command('watch')
|