@arela/uploader 1.0.17 → 1.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@arela/uploader",
3
- "version": "1.0.17",
3
+ "version": "1.0.19",
4
4
  "description": "CLI to upload files/directories to Arela",
5
5
  "bin": {
6
6
  "arela": "./src/index.js"
@@ -166,8 +166,11 @@ export class IdentifyCommand {
166
166
  * @returns {Promise<Object>} Processing statistics
167
167
  */
168
168
  async #processTable(tableName, batchSize, startTime) {
169
- // Get detection statistics first
170
- const initialStats = await this.scanApiService.getDetectionStats(tableName);
169
+ // Get detection statistics first (allTypes=true to count all supported file types)
170
+ const initialStats = await this.scanApiService.getDetectionStats(
171
+ tableName,
172
+ true,
173
+ );
171
174
  logger.info(` Total PDFs: ${initialStats.totalPdfs}`);
172
175
  logger.info(` Detected: ${initialStats.detected}`);
173
176
  logger.info(` Pending: ${initialStats.pending}`);
@@ -224,6 +227,7 @@ export class IdentifyCommand {
224
227
  tableName,
225
228
  0,
226
229
  batchSize,
230
+ true, // allTypes: fetch all supported file types, not just likely-simplificado PDFs
227
231
  );
228
232
 
229
233
  if (!response.data || response.data.length === 0) {
@@ -1,8 +1,11 @@
1
+ import path from 'path';
2
+
1
3
  import logger from '../services/LoggingService.js';
2
4
  import { PipelineApiService } from '../services/PipelineApiService.js';
3
5
 
4
6
  import appConfig from '../config/config.js';
5
7
  import ErrorHandler from '../errors/ErrorHandler.js';
8
+ import { PathNormalizer } from '../utils/PathNormalizer.js';
6
9
 
7
10
  /**
8
11
  * Poll Worker Command Handler
@@ -234,13 +237,74 @@ export class PollWorkerCommand {
234
237
 
235
238
  // Override scan directories if provided
236
239
  if (job.scanDirectories && job.scanDirectories.length > 0) {
237
- process.env.UPLOAD_SOURCES = job.scanDirectories.join('|');
240
+ const allAbsolute = job.scanDirectories.every((d) =>
241
+ PathNormalizer.isAbsolutePath(d),
242
+ );
243
+
244
+ if (allAbsolute) {
245
+ const ancestor = this.#commonAncestor(job.scanDirectories);
246
+ // Check if ancestor is meaningful (not just root or a drive letter)
247
+ const isUseful =
248
+ ancestor.length > 1 && !/^[a-zA-Z]:[/\\]?$/.test(ancestor);
249
+
250
+ if (isUseful) {
251
+ // Common ancestor found — set as base path, make sources relative
252
+ process.env.UPLOAD_BASE_PATH = ancestor;
253
+ process.env.ARELA_BASE_PATH_LABEL = ancestor;
254
+ const relativeSources = job.scanDirectories.map(
255
+ (d) => path.relative(ancestor, d) || '.',
256
+ );
257
+ process.env.UPLOAD_SOURCES = relativeSources.join('|');
258
+ } else {
259
+ // Cross-drive or no common ancestor — wildcard base, absolute sources
260
+ process.env.UPLOAD_BASE_PATH = '*';
261
+ process.env.ARELA_BASE_PATH_LABEL = '*';
262
+ process.env.UPLOAD_SOURCES = job.scanDirectories.join('|');
263
+ }
264
+ } else {
265
+ process.env.UPLOAD_SOURCES = job.scanDirectories.join('|');
266
+ }
238
267
  }
239
268
 
240
269
  // Override file extensions if provided
241
270
  if (job.fileExtensions && job.fileExtensions.length > 0) {
242
271
  process.env.UPLOAD_FILE_EXTENSIONS = job.fileExtensions.join(',');
243
272
  }
273
+
274
+ // Reload cached config from the updated env vars
275
+ appConfig.reloadScanConfig();
276
+ }
277
+
278
+ /**
279
+ * Compute the longest common ancestor directory of a list of absolute paths.
280
+ * Uses '/' as separator (PathNormalizer normalizes Windows \\ to /).
281
+ * @param {string[]} paths
282
+ * @returns {string}
283
+ */
284
+ #commonAncestor(paths) {
285
+ if (paths.length === 0) return '/';
286
+ if (paths.length === 1) return paths[0];
287
+
288
+ // Normalize separators so O:\exp\... becomes O:/exp/...
289
+ const normalized = paths.map((p) => PathNormalizer.normalizeSeparators(p));
290
+ const split = normalized.map((p) => p.split('/').filter(Boolean));
291
+ const minLen = Math.min(...split.map((s) => s.length));
292
+ const common = [];
293
+
294
+ for (let i = 0; i < minLen; i++) {
295
+ const seg = split[0][i];
296
+ if (split.every((s) => s[i] === seg)) {
297
+ common.push(seg);
298
+ } else {
299
+ break;
300
+ }
301
+ }
302
+
303
+ // Preserve drive letter format (e.g., 'O:' → 'O:/')
304
+ if (common.length > 0 && /^[a-zA-Z]:$/.test(common[0])) {
305
+ return common[0] + '/' + common.slice(1).join('/');
306
+ }
307
+ return '/' + common.join('/');
244
308
  }
245
309
 
246
310
  /**
@@ -100,6 +100,15 @@ export class PropagateCommand {
100
100
  totalStats.directoriesProcessed += stats.directoriesProcessed;
101
101
  }
102
102
 
103
+ // Step 5: Cross-table propagation
104
+ // Match files with detected_pedimento in one table to pedimento sources in other tables
105
+ const crossTableStats = await this.#processCrossTablePropagation(
106
+ scanConfig,
107
+ tables,
108
+ );
109
+ totalStats.filesUpdated += crossTableStats.filesUpdated;
110
+ totalStats.filesFailed += crossTableStats.filesFailed;
111
+
103
112
  // Show combined results
104
113
  const duration = ((Date.now() - this.stats.startTime) / 1000).toFixed(2);
105
114
  const filesPerSec =
@@ -439,6 +448,118 @@ export class PropagateCommand {
439
448
  };
440
449
  }
441
450
 
451
+ /**
452
+ * Cross-table propagation phase
453
+ * Matches files with detected_pedimento in one table to pedimento sources in other tables.
454
+ * This enables facturas (in a different directory/table) to get arela_path from their pedimento.
455
+ * @private
456
+ * @param {Object} scanConfig - Scan configuration with companySlug, serverId, basePathFull
457
+ * @param {Array} tables - All tables for this instance
458
+ * @returns {Promise<Object>} { filesUpdated, filesFailed }
459
+ */
460
+ async #processCrossTablePropagation(scanConfig, tables) {
461
+ console.log('\n🔗 Cross-table propagation phase...\n');
462
+
463
+ const stats = { filesUpdated: 0, filesFailed: 0 };
464
+
465
+ // Step 1: Fetch all pedimento sources across all tables
466
+ const pedimentoSources =
467
+ await this.scanApiService.fetchCrossTablePedimentoSources(
468
+ scanConfig.companySlug,
469
+ scanConfig.serverId,
470
+ scanConfig.basePathFull,
471
+ );
472
+
473
+ if (pedimentoSources.length === 0) {
474
+ console.log(
475
+ ' ℹ️ No pedimento sources found across tables. Skipping cross-table phase.\n',
476
+ );
477
+ return stats;
478
+ }
479
+
480
+ // Build a map: detected_pedimento → source info
481
+ const sourceMap = new Map();
482
+ for (const source of pedimentoSources) {
483
+ sourceMap.set(source.detected_pedimento, source);
484
+ }
485
+
486
+ console.log(
487
+ ` 📋 Found ${sourceMap.size} unique pedimento sources across ${tables.length} tables`,
488
+ );
489
+
490
+ // Step 2: For each table, find orphan files (have pedimento, no arela_path)
491
+ let totalOrphans = 0;
492
+
493
+ for (const table of tables) {
494
+ let offset = 0;
495
+ let hasMore = true;
496
+
497
+ while (hasMore) {
498
+ const orphanFiles =
499
+ await this.scanApiService.fetchFilesWithPedimentoNoArelaPath(
500
+ table.tableName,
501
+ offset,
502
+ this.options.batchSize,
503
+ );
504
+
505
+ if (orphanFiles.length === 0) {
506
+ hasMore = false;
507
+ break;
508
+ }
509
+
510
+ // Step 3: Match orphans against pedimento source map
511
+ const updates = [];
512
+ for (const file of orphanFiles) {
513
+ const source = sourceMap.get(file.detected_pedimento);
514
+ if (source) {
515
+ updates.push({
516
+ id: file.id,
517
+ arelaPath: source.arela_path,
518
+ rfc: source.rfc,
519
+ detectedPedimento: file.detected_pedimento,
520
+ detectedPedimentoYear: source.detected_pedimento_year,
521
+ propagatedFromId: source.source_id,
522
+ propagatedFromTable: source.source_table,
523
+ propagationError: null,
524
+ });
525
+ }
526
+ }
527
+
528
+ totalOrphans += orphanFiles.length;
529
+
530
+ // Step 4: Batch update matched files
531
+ if (updates.length > 0) {
532
+ try {
533
+ const result = await this.scanApiService.batchUpdatePropagation(
534
+ table.tableName,
535
+ updates,
536
+ );
537
+ stats.filesUpdated += result.updated;
538
+ stats.filesFailed += result.errors;
539
+ } catch (error) {
540
+ logger.error(
541
+ `Failed cross-table update on ${table.tableName}:`,
542
+ error,
543
+ );
544
+ stats.filesFailed += updates.length;
545
+ }
546
+ }
547
+
548
+ offset += orphanFiles.length;
549
+ if (orphanFiles.length < this.options.batchSize) {
550
+ hasMore = false;
551
+ }
552
+ }
553
+ }
554
+
555
+ console.log(` 📊 Cross-table results:`);
556
+ console.log(` Orphan files checked: ${totalOrphans}`);
557
+ console.log(` Files updated: ${stats.filesUpdated}`);
558
+ console.log(` Files failed: ${stats.filesFailed}\n`);
559
+
560
+ return stats;
561
+ }
562
+
442
563
  /**
443
564
  * Show final propagation statistics
444
565
  * @private
@@ -222,7 +222,10 @@ export class PushCommand {
222
222
  }
223
223
 
224
224
  const scanConfig = appConfig.getScanConfig();
225
- if (!scanConfig.tableName) {
225
+ // When basePathFull is '*' (cross-directory wildcard), tableName is intentionally null.
226
+ // The push command fetches tables dynamically via getInstanceTables, so a static
227
+ // tableName is not required — only companySlug, serverId, and basePathFull matter.
228
+ if (!scanConfig.tableName && scanConfig.basePathFull !== '*') {
226
229
  errors.push('Could not generate table name from configuration');
227
230
  }
228
231
 
@@ -49,7 +49,10 @@ export class ScanCommand {
49
49
 
50
50
  const scanConfig = appConfig.getScanConfig();
51
51
  // Ensure basePath is absolute for scan operations
52
- const basePath = PathNormalizer.toAbsolutePath(appConfig.getBasePath());
52
+ // '*' is a wildcard sentinel for cross-drive pipelines — no real basePath
53
+ const rawBasePath = appConfig.getBasePath();
54
+ const basePath =
55
+ rawBasePath === '*' ? '*' : PathNormalizer.toAbsolutePath(rawBasePath);
53
56
 
54
57
  logger.info('🔍 Starting arela scan command');
55
58
  logger.info(`🎯 API Target: ${apiTarget}`);
@@ -213,8 +216,16 @@ export class ScanCommand {
213
216
  if (level === 0) {
214
217
  // Level 0: Create one entry per source
215
218
  return sources.map((source) => {
216
- const sourcePath =
217
- source === '.' ? basePath : path.resolve(basePath, source);
219
+ let sourcePath;
220
+ if (source === '.') {
221
+ sourcePath = basePath;
222
+ } else if (source.startsWith('..') || path.isAbsolute(source)) {
223
+ // Source is a relative-to-CWD path (e.g., from pipeline UI) or absolute
224
+ sourcePath = PathNormalizer.toAbsolutePath(source);
225
+ } else {
226
+ // Source is a subdirectory of basePath
227
+ sourcePath = path.resolve(basePath, source);
228
+ }
218
229
  // Label is relative path for display purposes only
219
230
  const label = source === '.' ? '' : source;
220
231
  return { path: sourcePath, label };
@@ -238,8 +249,11 @@ export class ScanCommand {
238
249
  // Source is current directory, use discovered path as-is
239
250
  directories.push(levelDir);
240
251
  } else {
241
- // Append source to path
242
- const combinedPath = path.resolve(levelDir.path, source);
252
+ // Resolve source: if it starts with ".." it's relative to CWD, not levelDir
253
+ const combinedPath =
254
+ source.startsWith('..') || path.isAbsolute(source)
255
+ ? PathNormalizer.toAbsolutePath(source)
256
+ : path.resolve(levelDir.path, source);
243
257
 
244
258
  // Only add if the combined path actually exists
245
259
  try {
@@ -290,6 +290,9 @@ export class WorkerCommand {
290
290
  if (scanConfig.directoryLevel !== undefined) {
291
291
  process.env.SCAN_DIRECTORY_LEVEL = String(scanConfig.directoryLevel);
292
292
  }
293
+
294
+ // Reload cached config from the updated env vars
295
+ appConfig.reloadScanConfig();
293
296
  }
294
297
 
295
298
  /**
@@ -36,10 +36,10 @@ class Config {
36
36
  const __dirname = path.dirname(__filename);
37
37
  const packageJsonPath = path.resolve(__dirname, '../../package.json');
38
38
  const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf-8'));
39
- return packageJson.version || '1.0.17';
39
+ return packageJson.version || '1.0.19';
40
40
  } catch (error) {
41
41
  console.warn('⚠️ Could not read package.json version, using fallback');
42
- return '1.0.17';
42
+ return '1.0.19';
43
43
  }
44
44
  }
45
45
 
@@ -263,14 +263,23 @@ class Config {
263
263
  // IMPORTANT: Always resolve to absolute path for uniqueness
264
264
  if (!basePathLabel && process.env.UPLOAD_BASE_PATH) {
265
265
  const basePath = process.env.UPLOAD_BASE_PATH;
266
- // Resolve to absolute path (handles ../sample vs ./sample correctly)
267
- // Note: toAbsolutePath handles Windows paths (O:\...) even on macOS/Linux
268
- basePathLabel = PathNormalizer.toAbsolutePath(basePath);
266
+ // '*' is a wildcard sentinel for cross-drive scenarios — keep as-is
267
+ if (basePath === '*') {
268
+ basePathLabel = '*';
269
+ } else {
270
+ // Resolve to absolute path (handles ../sample vs ./sample correctly)
271
+ // Note: toAbsolutePath handles Windows paths (O:\...) even on macOS/Linux
272
+ basePathLabel = PathNormalizer.toAbsolutePath(basePath);
273
+ }
269
274
  }
270
275
 
271
- // If basePathLabel is provided, ensure it's absolute
276
+ // If basePathLabel is provided, ensure it's absolute (skip wildcard)
272
277
  // Use PathNormalizer.isAbsolutePath for cross-platform Windows path detection
273
- if (basePathLabel && !PathNormalizer.isAbsolutePath(basePathLabel)) {
278
+ if (
279
+ basePathLabel &&
280
+ basePathLabel !== '*' &&
281
+ !PathNormalizer.isAbsolutePath(basePathLabel)
282
+ ) {
274
283
  basePathLabel = PathNormalizer.toAbsolutePath(basePathLabel);
275
284
  }
276
285
 
@@ -288,7 +297,7 @@ class Config {
288
297
  // Note: This is just for reference; actual table names are generated dynamically
289
298
  // in ScanCommand based on discovered directories and levels
290
299
  let tableName = null;
291
- if (companySlug && serverId && basePathLabel) {
300
+ if (companySlug && serverId && basePathLabel && basePathLabel !== '*') {
292
301
  tableName = PathNormalizer.generateTableName({
293
302
  companySlug,
294
303
  serverId,
@@ -658,6 +667,15 @@ class Config {
658
667
  };
659
668
  }
660
669
 
670
+ /**
671
+ * Reload upload and scan config from current process.env values.
672
+ * Must be called after modifying env vars at runtime (e.g., PollWorkerCommand).
673
+ */
674
+ reloadScanConfig() {
675
+ this.upload = this.#loadUploadConfig();
676
+ this.scan = this.#loadScanConfig();
677
+ }
678
+
661
679
  /**
662
680
  * Validate watch configuration
663
681
  * @param {string[]} directories - Directories to validate
@@ -1,4 +1,7 @@
1
1
  // Import all document type definitions
2
+ import { dodaPdfDefinition } from './document-types/doda-pdf.js';
3
+ import { dodaXmlDefinition } from './document-types/doda-xml.js';
4
+ import { facturasComerciales } from './document-types/facturas-comerciales.js';
2
5
  import { pedimentoSimplificadoDefinition } from './document-types/pedimento-simplificado.js';
3
6
  import { proformaDefinition } from './document-types/proforma.js';
4
7
  import { supportDocumentDefinition } from './document-types/support-document.js';
@@ -39,6 +42,9 @@ export class DocumentTypeDefinition {
39
42
  const documentTypes = [
40
43
  pedimentoSimplificadoDefinition,
41
44
  supportDocumentDefinition,
45
+ dodaPdfDefinition,
46
+ dodaXmlDefinition,
47
+ facturasComerciales,
42
48
  // Add more document types here as needed
43
49
  ];
44
50
 
@@ -0,0 +1,121 @@
1
+ import { FieldResult } from '../document-type-shared.js';
2
+
3
/**
 * DODA PDF Document Type Definition
 * Detects DODA (Documento de Operación para Despacho Aduanero) in PDF format.
 * DODAs are validation documents generated by VUCEM for customs clearance.
 * They reside in the SAME directory as the pedimento_simplificado,
 * so within-table propagation handles arela_path assignment.
 */
export const dodaPdfDefinition = {
  type: 'doda_pdf',
  extensions: ['pdf'],

  /**
   * Decide whether the extracted PDF text looks like a DODA.
   * @param {string} source - Text extracted from the PDF
   * @returns {boolean} true when the primary title is present, or when
   *   secondary markers appear together with a pedimento-shaped number
   */
  match: (source) => {
    // DODA PDFs contain specific markers from VUCEM/customs systems
    const markers = [
      /DOCUMENTO DE OPERACI[OÓ]N PARA DESPACHO ADUANERO/i,
      /DODA/i,
      /VUCEM/i,
    ];

    // The full document title alone is decisive
    const primaryMatch = markers[0].test(source);
    if (primaryMatch) return true;

    const secondaryMatches = markers
      .slice(1)
      .filter((m) => m.test(source)).length;

    // Secondary markers only count alongside a pedimento-shaped number
    const hasPedimentoNumber = /\d{2}\s?\d{2}\s?\d{4}\s?\d{7}/.test(source);
    const hasDodaContext =
      /despacho aduanero|operaci[oó]n aduanera|validaci[oó]n/i.test(source);

    return (
      (secondaryMatches >= 2 && hasPedimentoNumber) ||
      (hasDodaContext && hasPedimentoNumber && secondaryMatches >= 1)
    );
  },

  /**
   * @param {string} source - Document text (unused; kept for the shared signature)
   * @param {Array<{name: string, value: *}>} [fields] - Extracted fields
   * @returns {string|null} The extracted pedimento number, or null
   */
  extractNumPedimento: (source, fields) => {
    return fields?.find((f) => f.name === 'numPedimento')?.value ?? null;
  },

  /**
   * Derive the four-digit year from the first two digits of the pedimento.
   * Two-digit years below 50 map to 20xx, the rest to 19xx.
   * @param {string} source - Document text (unused; kept for the shared signature)
   * @param {Array<{name: string, value: *}>} [fields] - Extracted fields
   * @returns {number|null} Year, or null when no usable pedimento is present
   */
  extractPedimentoYear: (source, fields) => {
    const numPedimento = fields?.find((f) => f.name === 'numPedimento')?.value;
    if (!numPedimento) return null;
    const year = Number.parseInt(String(numPedimento).substring(0, 2), 10);
    // A non-numeric prefix must not leak NaN to callers
    if (Number.isNaN(year)) return null;
    return year < 50 ? year + 2000 : year + 1900;
  },

  extractors: [
    {
      field: 'numPedimento',
      extract: (source) => {
        // Try cadena original format: ||aduana|patente|...|pedimentos|integracion|...
        // e.g. ||070|3429|2|4009029,4008062|109335668|A231|
        const cadenaMatch = source.match(
          /\|\|(\d{2,3})\|(\d{4})\|\d\|([\d,]+)\|(\d+)\|/,
        );
        if (cadenaMatch) {
          const rawAduana = cadenaMatch[1];
          // 3-digit code = aduana(2) + section(1), e.g. 070 → aduana 07
          const aduana =
            rawAduana.length === 3
              ? rawAduana.slice(0, 2)
              : rawAduana.padStart(2, '0');
          const patente = cadenaMatch[2];
          const pedNums = cadenaMatch[3].split(',');
          // Use first pedimento number, pad to 7 digits
          const pedNum = pedNums[0].padStart(7, '0');
          // Year comes from the first ISO date found anywhere in the text;
          // the current year is the last-resort fallback
          const yearMatch = source.match(/(\d{4})-\d{2}-\d{2}/);
          const year = yearMatch
            ? yearMatch[1].slice(-2)
            : new Date().getFullYear().toString().slice(-2);
          const full = `${year}${aduana}${patente}${pedNum}`;
          // Reject malformed cadenas (e.g. a pedimento longer than 7 digits)
          if (full.length === 15) {
            return new FieldResult('numPedimento', true, full);
          }
        }

        // Try dash-separated format: YY-AA-PPPP-NNNNNNN
        const dashMatch = source.match(/(\d{2})-(\d{2})-(\d{4})-(\d{7})/);
        if (dashMatch) {
          const full =
            dashMatch[1] + dashMatch[2] + dashMatch[3] + dashMatch[4];
          return new FieldResult('numPedimento', true, full);
        }

        // Try 15-digit near pedimento keyword (avoid matching sello digital)
        const contextMatch = source.match(
          /pedimento[^\d]{0,30}(\d{2}\s?\d{2}\s?\d{4}\s?\d{7})/i,
        );
        if (contextMatch) {
          return new FieldResult(
            'numPedimento',
            true,
            contextMatch[1].replace(/\s/g, ''),
          );
        }

        return new FieldResult('numPedimento', false, null);
      },
    },
    {
      field: 'rfc',
      extract: (source) => {
        // Mexican RFC: 3-4 letters + 6 digits + 3 alphanumeric.
        // `\b` is ASCII-only, so it never anchors in front of 'Ñ' or '&'
        // and would return a truncated RFC; explicit lookarounds are used.
        const match = source.match(
          /(?<![A-Za-z0-9_Ññ&])([A-ZÑ&]{3,4}\d{6}[A-Z0-9]{3})(?![A-Za-z0-9_])/,
        );
        return new FieldResult('rfc', !!match, match ? match[1] : null);
      },
    },
    {
      field: 'aduana',
      extract: (source) => {
        const match = source.match(/aduana[:\s]*(\d{2,4})/i);
        return new FieldResult('aduana', !!match, match ? match[1] : null);
      },
    },
  ],
};
@@ -0,0 +1,118 @@
1
+ import { FieldResult } from '../document-type-shared.js';
2
+
3
/**
 * DODA XML Document Type Definition
 * Detects DODA (Documento de Operación para Despacho Aduanero) in XML format.
 * XML DODAs contain structured data from VUCEM/customs systems.
 * They reside in the SAME directory as the pedimento_simplificado,
 * so within-table propagation handles arela_path assignment.
 */
export const dodaXmlDefinition = {
  type: 'doda_xml',
  extensions: ['xml'],

  /**
   * Decide whether the XML text looks like a DODA.
   * @param {string} source - Raw XML text
   * @returns {boolean} true when any DODA marker is present, or when at
   *   least three pedimento-related names appear in a document that has
   *   an XML declaration
   */
  match: (source) => {
    // DODA XML files contain specific XML tags/namespaces
    const xmlMarkers = [
      /documentoOperacion/i,
      /despachoAduanero/i,
      /<doda\b/i,
      /xmlns[^"]*doda/i,
      /VUCEM/i,
    ];

    // Also check for pedimento-related XML structure
    const pedimentoXmlMarkers = [
      /numPedimento/i,
      /patenteAduanal/i,
      /aduanaDespacho/i,
      /tipoOperacion/i,
    ];

    const dodaMatches = xmlMarkers.filter((m) => m.test(source)).length;
    const pedimentoMatches = pedimentoXmlMarkers.filter((m) =>
      m.test(source),
    ).length;

    // Match if: has DODA-specific markers, or combination of pedimento markers with XML structure
    return (
      dodaMatches >= 1 || (pedimentoMatches >= 3 && /<\?xml/i.test(source))
    );
  },

  /**
   * @param {string} source - Document text (unused; kept for the shared signature)
   * @param {Array<{name: string, value: *}>} [fields] - Extracted fields
   * @returns {string|null} The extracted pedimento number, or null
   */
  extractNumPedimento: (source, fields) => {
    return fields?.find((f) => f.name === 'numPedimento')?.value ?? null;
  },

  /**
   * Derive the year from the pedimento prefix (two-digit years below 50
   * map to 20xx, the rest to 19xx); when there is no usable pedimento,
   * fall back to the first ISO date found in the XML.
   * @param {string} source - Raw XML text
   * @param {Array<{name: string, value: *}>} [fields] - Extracted fields
   * @returns {number|null} Year, or null when nothing usable is found
   */
  extractPedimentoYear: (source, fields) => {
    const numPedimento = fields?.find((f) => f.name === 'numPedimento')?.value;
    if (numPedimento) {
      const year = Number.parseInt(String(numPedimento).substring(0, 2), 10);
      // A non-numeric prefix falls through to the date fallback instead
      // of returning NaN
      if (!Number.isNaN(year)) {
        return year < 50 ? year + 2000 : year + 1900;
      }
    }

    // Try to extract year from date in XML
    const dateMatch = source.match(/(\d{4})-\d{2}-\d{2}/);
    return dateMatch ? Number.parseInt(dateMatch[1], 10) : null;
  },

  extractors: [
    {
      field: 'numPedimento',
      extract: (source) => {
        // Try XML tag format first
        const xmlMatch = source.match(/numPedimento[^>]*>(\d{15})<\/[^>]+>/i);
        if (xmlMatch) {
          return new FieldResult('numPedimento', true, xmlMatch[1]);
        }

        // Try attribute format
        const attrMatch = source.match(
          /numPedimento[=:"]\s*["']?(\d{15})["']?/i,
        );
        if (attrMatch) {
          return new FieldResult('numPedimento', true, attrMatch[1]);
        }

        // Fallback: 15-digit pattern. Digit boundaries keep it from
        // slicing 15 digits out of a longer run (seals, folios, ...)
        const fallback = source.match(
          /(?<!\d)\d{2}\s?\d{2}\s?\d{4}\s?\d{7}(?!\d)/,
        );
        return new FieldResult(
          'numPedimento',
          !!fallback,
          fallback ? fallback[0].replace(/\s/g, '') : null,
        );
      },
    },
    {
      field: 'rfc',
      extract: (source) => {
        // Try XML tag format
        const xmlMatch = source.match(
          /rfc[^>]*>([A-ZÑ&]{3,4}\d{6}[A-Z0-9]{3})<\/[^>]+>/i,
        );
        if (xmlMatch) {
          return new FieldResult('rfc', true, xmlMatch[1]);
        }

        // Fallback: generic RFC pattern. `\b` is ASCII-only and cannot
        // anchor in front of 'Ñ' or '&', so lookarounds are used instead.
        const match = source.match(
          /(?<![A-Za-z0-9_Ññ&])([A-ZÑ&]{3,4}\d{6}[A-Z0-9]{3})(?![A-Za-z0-9_])/,
        );
        return new FieldResult('rfc', !!match, match ? match[1] : null);
      },
    },
    {
      field: 'patente',
      extract: (source) => {
        const match = source.match(/patenteAduanal[^>]*>(\d{4})<\/[^>]+>/i);
        return new FieldResult('patente', !!match, match ? match[1] : null);
      },
    },
    {
      field: 'aduana',
      extract: (source) => {
        const match = source.match(/aduanaDespacho[^>]*>(\d{2,4})<\/[^>]+>/i);
        return new FieldResult('aduana', !!match, match ? match[1] : null);
      },
    },
  ],
};
@@ -0,0 +1,233 @@
1
+ import { FieldResult } from '../document-type-shared.js';
2
+
3
/**
 * Facturas Comerciales Document Type Definition
 * Detects commercial invoices (facturas) related to customs operations.
 *
 * CRITICAL: Facturas reside in a DIFFERENT directory than the pedimento_simplificado,
 * creating a different CLI scan table. Cross-table propagation uses `detected_pedimento`
 * to link facturas to their corresponding pedimento and assign arela_path.
 *
 * Supported formats: PDF (scanned invoices), XML (CFDI/electronic invoices)
 */
export const facturasComerciales = {
  type: 'factura_comercial',
  extensions: ['pdf', 'xml'],

  /**
   * Decide whether the text looks like a customs-related invoice.
   * @param {string} source - Extracted PDF text or raw XML
   * @returns {boolean} true for CFDI structure (two or more CFDI markers)
   *   or for an invoice marker combined with customs context
   */
  match: (source) => {
    // CFDI / electronic invoice markers (XML)
    const cfdiMarkers = [
      /cfdi:Comprobante/i,
      /xmlns:cfdi/i,
      /TipoDeComprobante/i,
      /timbreFiscalDigital/i,
      /SelloSAT/i,
    ];

    // PDF invoice markers
    const invoiceMarkers = [
      /factura\s*(comercial|de\s*venta|de\s*exportaci[oó]n)?/i,
      /commercial\s*invoice/i,
      /invoice\s*number/i,
      /n[uú]mero\s*de\s*factura/i,
    ];

    // Customs-related invoice context
    const customsContext = [
      /pedimento/i,
      /aduana/i,
      /importaci[oó]n|exportaci[oó]n/i,
      /despacho\s*aduanero/i,
      /fracci[oó]n\s*arancelaria/i,
    ];

    const cfdiMatches = cfdiMarkers.filter((m) => m.test(source)).length;
    const invoiceMatches = invoiceMarkers.filter((m) => m.test(source)).length;
    const customsMatches = customsContext.filter((m) => m.test(source)).length;

    // Match if: CFDI structure (>=2 markers), or invoice + customs context
    return cfdiMatches >= 2 || (invoiceMatches >= 1 && customsMatches >= 1);
  },

  /**
   * @param {string} source - Document text (unused; kept for the shared signature)
   * @param {Array<{name: string, value: *}>} [fields] - Extracted fields
   * @returns {string|null} The extracted pedimento number, or null
   */
  extractNumPedimento: (source, fields) => {
    return fields?.find((f) => f.name === 'numPedimento')?.value ?? null;
  },

  /**
   * Resolve the year for this invoice, trying in order: the pedimento
   * prefix (two-digit years below 50 map to 20xx, the rest to 19xx), the
   * extracted `invoiceDate` field, the first ISO date in the text, and
   * the first DD/MMM/YYYY date with a Spanish month abbreviation.
   * @param {string} source - Extracted PDF text or raw XML
   * @param {Array<{name: string, value: *}>} [fields] - Extracted fields
   * @returns {number|null} Year, or null when nothing usable is found
   */
  extractPedimentoYear: (source, fields) => {
    const numPedimento = fields?.find((f) => f.name === 'numPedimento')?.value;
    if (numPedimento) {
      const year = Number.parseInt(String(numPedimento).substring(0, 2), 10);
      // A non-numeric prefix falls through to the date-based fallbacks
      // instead of returning NaN
      if (!Number.isNaN(year)) {
        return year < 50 ? year + 2000 : year + 1900;
      }
    }
    // Try invoice date field
    const invoiceDate = fields?.find((f) => f.name === 'invoiceDate')?.value;
    if (invoiceDate) {
      const yearMatch = invoiceDate.match(/^(\d{4})/);
      if (yearMatch) return Number.parseInt(yearMatch[1], 10);
    }
    // Try to extract year from any date in source
    const dateMatch = source.match(/(\d{4})-\d{2}-\d{2}/);
    if (dateMatch) return Number.parseInt(dateMatch[1], 10);
    const mmmMatch = source.match(
      /\d{1,2}\/(?:ene|feb|mar|abr|may|jun|jul|ago|sep|oct|nov|dic)\/(\d{4})/i,
    );
    if (mmmMatch) return Number.parseInt(mmmMatch[1], 10);
    return null;
  },

  extractors: [
    {
      field: 'numPedimento',
      extract: (source) => {
        // Try dash-separated format: YY-AA-PPPP-NNNNNNN (most common in Mexican import invoices)
        // e.g. "26-07-3429-6016477" — may be wrapped across lines in PDF text.
        // Every candidate is examined so an unrelated short code earlier
        // in the text does not hide a complete pedimento further down.
        const dashPattern = /(\d{2})-(\d{2})-(\d{4})-(\d{1,7})\s*(\d*)/g;
        for (const candidate of source.matchAll(dashPattern)) {
          const lastPart = (candidate[4] + candidate[5]).substring(0, 7);
          if (lastPart.length === 7) {
            const full = candidate[1] + candidate[2] + candidate[3] + lastPart;
            return new FieldResult('numPedimento', true, full);
          }
        }

        // Try CFDI XML: NumPedimento / NumeroPedimento attribute (this also
        // covers cfdi:InformacionAduanera). The digit groups may be
        // contiguous or separated by whitespace, e.g. "21  47  3807  8003832".
        const cfdiMatch = source.match(
          /(?:NumPedimento|NumeroPedimento)[=:"]\s*["']?(\d{2}\s*\d{2}\s*\d{4}\s*\d{7})["']?/i,
        );
        if (cfdiMatch) {
          return new FieldResult(
            'numPedimento',
            true,
            cfdiMatch[1].replace(/\s/g, ''),
          );
        }

        // Try generic XML tag
        const xmlMatch = source.match(/pedimento[^>]*>(\d{15})<\/[^>]+>/i);
        if (xmlMatch) {
          return new FieldResult('numPedimento', true, xmlMatch[1]);
        }

        // Try space-separated near pedimento keyword
        const textMatch = source.match(
          /pedimento[^\d]{0,30}(\d{2}\s?\d{2}\s?\d{4}\s?\d{7})/i,
        );
        if (textMatch) {
          return new FieldResult(
            'numPedimento',
            true,
            textMatch[1].replace(/\s/g, ''),
          );
        }

        return new FieldResult('numPedimento', false, null);
      },
    },
    {
      field: 'rfc',
      extract: (source) => {
        // Try CFDI Rfc attribute (emisor)
        const cfdiMatch = source.match(
          /Emisor[^>]*Rfc[=:"]\s*["']?([A-ZÑ&]{3,4}\d{6}[A-Z0-9]{3})["']?/i,
        );
        if (cfdiMatch) {
          return new FieldResult('rfc', true, cfdiMatch[1]);
        }

        // Try Receptor Rfc (for import invoices, receptor is the importer)
        const receptorMatch = source.match(
          /Receptor[^>]*Rfc[=:"]\s*["']?([A-ZÑ&]{3,4}\d{6}[A-Z0-9]{3})["']?/i,
        );
        if (receptorMatch) {
          return new FieldResult('rfc', true, receptorMatch[1]);
        }

        // Fallback: generic RFC pattern. `\b` is ASCII-only and cannot
        // anchor in front of 'Ñ' or '&', so lookarounds are used instead.
        const match = source.match(
          /(?<![A-Za-z0-9_Ññ&])([A-ZÑ&]{3,4}\d{6}[A-Z0-9]{3})(?![A-Za-z0-9_])/,
        );
        return new FieldResult('rfc', !!match, match ? match[1] : null);
      },
    },
    {
      field: 'invoiceNumber',
      extract: (source) => {
        // CFDI Folio
        const folioMatch = source.match(/Folio[=:"]\s*["']?([A-Z0-9-]+)["']?/i);
        if (folioMatch) {
          return new FieldResult('invoiceNumber', true, folioMatch[1]);
        }

        // Header line followed by invoice number on next line
        // e.g. "FACTURA FECHA ADUANA...\nMIB260064 02/mar/2026..."
        const headerMatch = source.match(
          /FACTURA\s+FECHA[^\n]*\n([A-Z]{2,5}\d{4,10})/i,
        );
        if (headerMatch) {
          return new FieldResult('invoiceNumber', true, headerMatch[1]);
        }

        // Text-based invoice number
        const textMatch = source.match(
          /(?:factura|invoice)\s*(?:no\.?|number|#|num\.?)?[:\s]*([A-Z]{2,5}\d{4,10})/i,
        );
        return new FieldResult(
          'invoiceNumber',
          !!textMatch,
          textMatch ? textMatch[1] : null,
        );
      },
    },
    {
      field: 'invoiceDate',
      extract: (source) => {
        // CFDI Fecha attribute
        const cfdiMatch = source.match(
          /Fecha[=:"]\s*["']?(\d{4}-\d{2}-\d{2})/i,
        );
        if (cfdiMatch) {
          return new FieldResult('invoiceDate', true, cfdiMatch[1]);
        }

        // DD/MMM/YYYY format (e.g. "02/mar/2026"), normalized to YYYY-MM-DD
        const mmmMonths = {
          ene: '01',
          feb: '02',
          mar: '03',
          abr: '04',
          may: '05',
          jun: '06',
          jul: '07',
          ago: '08',
          sep: '09',
          oct: '10',
          nov: '11',
          dic: '12',
        };
        const mmmMatch = source.match(
          /(\d{1,2})\/(ene|feb|mar|abr|may|jun|jul|ago|sep|oct|nov|dic)\/(\d{4})/i,
        );
        if (mmmMatch) {
          const day = mmmMatch[1].padStart(2, '0');
          const month = mmmMonths[mmmMatch[2].toLowerCase()];
          return new FieldResult(
            'invoiceDate',
            true,
            `${mmmMatch[3]}-${month}-${day}`,
          );
        }

        // ISO date
        const isoMatch = source.match(/(\d{4}-\d{2}-\d{2})/);
        return new FieldResult(
          'invoiceDate',
          !!isoMatch,
          isoMatch ? isoMatch[1] : null,
        );
      },
    },
  ],
};
@@ -368,24 +368,32 @@ export class ScanApiService {
368
368
  // ============================================================================
369
369
 
370
370
  /**
371
- * Fetch PDF files for detection
371
+ * Fetch files for detection
372
372
  * @param {string} tableName - Target table name
373
373
  * @param {number} offset - Pagination offset
374
374
  * @param {number} limit - Number of records to fetch
375
+ * @param {boolean} allTypes - When true, fetch all supported file types instead of just likely-simplificado PDFs
375
376
  * @returns {Promise<Object>} { data: Array, hasMore: boolean }
376
377
  */
377
- async fetchPdfsForDetection(tableName, offset = 0, limit = 100) {
378
+ async fetchPdfsForDetection(
379
+ tableName,
380
+ offset = 0,
381
+ limit = 100,
382
+ allTypes = false,
383
+ ) {
378
384
  logger.debug(
379
- `Fetching PDFs for detection (offset: ${offset}, limit: ${limit})...`,
385
+ `Fetching files for detection (offset: ${offset}, limit: ${limit}, allTypes: ${allTypes})...`,
380
386
  );
381
387
 
382
- const result = await this.#request(
383
- `/api/uploader/scan/pdfs-for-detection?tableName=${encodeURIComponent(tableName)}&offset=${offset}&limit=${limit}`,
384
- 'GET',
385
- );
388
+ let url = `/api/uploader/scan/pdfs-for-detection?tableName=${encodeURIComponent(tableName)}&offset=${offset}&limit=${limit}`;
389
+ if (allTypes) {
390
+ url += '&allTypes=true';
391
+ }
392
+
393
+ const result = await this.#request(url, 'GET');
386
394
 
387
395
  logger.debug(
388
- `Fetched ${result.data.length} PDFs, hasMore: ${result.hasMore}`,
396
+ `Fetched ${result.data.length} files, hasMore: ${result.hasMore}`,
389
397
  );
390
398
  return result;
391
399
  }
@@ -420,13 +428,15 @@ export class ScanApiService {
420
428
  * @param {string} tableName - Target table name
421
429
  * @returns {Promise<Object>} { totalPdfs, detected, pending, errors }
422
430
  */
423
- async getDetectionStats(tableName) {
431
+ async getDetectionStats(tableName, allTypes = false) {
424
432
  logger.debug('Fetching detection statistics...');
425
433
 
426
- const result = await this.#request(
427
- `/api/uploader/scan/detection-stats?tableName=${encodeURIComponent(tableName)}`,
428
- 'GET',
429
- );
434
+ let url = `/api/uploader/scan/detection-stats?tableName=${encodeURIComponent(tableName)}`;
435
+ if (allTypes) {
436
+ url += '&allTypes=true';
437
+ }
438
+
439
+ const result = await this.#request(url, 'GET');
430
440
 
431
441
  logger.debug(
432
442
  `Detection stats: ${result.detected}/${result.totalPdfs} detected, ${result.pending} pending`,
@@ -554,6 +564,68 @@ export class ScanApiService {
554
564
  return result;
555
565
  }
556
566
 
567
+ // ============================================================================
568
+ // CROSS-TABLE PROPAGATION
569
+ // ============================================================================
570
+
571
+ /**
572
+ * Fetch pedimento sources across all tables for cross-table propagation
573
+ * @param {string} companySlug - Company slug
574
+ * @param {string} serverId - Server ID
575
+ * @param {string} basePathFull - Base path
576
+ * @returns {Promise<Array>} Array of pedimento sources with source_table info
577
+ */
578
+ async fetchCrossTablePedimentoSources(companySlug, serverId, basePathFull) {
579
+ logger.debug('Fetching cross-table pedimento sources...');
580
+
581
+ const result = await this.#request(
582
+ `/api/uploader/scan/cross-table-pedimento-sources?companySlug=${encodeURIComponent(companySlug)}&serverId=${encodeURIComponent(serverId)}&basePathFull=${encodeURIComponent(basePathFull)}`,
583
+ 'GET',
584
+ );
585
+
586
+ if (!Array.isArray(result)) {
587
+ logger.error(
588
+ 'fetchCrossTablePedimentoSources: Expected array, got:',
589
+ typeof result,
590
+ );
591
+ return [];
592
+ }
593
+
594
+ logger.debug(`Fetched ${result.length} cross-table pedimento sources`);
595
+ return result;
596
+ }
597
+
598
+ /**
599
+ * Fetch files with detected_pedimento but no arela_path (candidates for cross-table propagation)
600
+ * @param {string} tableName - Target table name
601
+ * @param {number} offset - Pagination offset
602
+ * @param {number} limit - Number of records to fetch
603
+ * @returns {Promise<Array>} Array of files needing cross-table propagation
604
+ */
605
+ async fetchFilesWithPedimentoNoArelaPath(tableName, offset = 0, limit = 100) {
606
+ logger.debug(
607
+ `Fetching files with pedimento but no arela_path (offset: ${offset}, limit: ${limit})...`,
608
+ );
609
+
610
+ const result = await this.#request(
611
+ `/api/uploader/scan/files-with-pedimento-no-arela-path?tableName=${encodeURIComponent(tableName)}&offset=${offset}&limit=${limit}`,
612
+ 'GET',
613
+ );
614
+
615
+ if (!Array.isArray(result)) {
616
+ logger.error(
617
+ 'fetchFilesWithPedimentoNoArelaPath: Expected array, got:',
618
+ typeof result,
619
+ );
620
+ return [];
621
+ }
622
+
623
+ logger.debug(
624
+ `Fetched ${result.length} files needing cross-table propagation`,
625
+ );
626
+ return result;
627
+ }
628
+
557
629
  // ============================================================================
558
630
  // PUSH OPERATIONS
559
631
  // ============================================================================