npm - @arela/uploader - Versions diffs - 1.0.19 → 1.0.21 - Mend

@arela/uploader 1.0.19 → 1.0.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

package/package.json +2 -1
package/src/commands/GDriveSyncCommand.js +475 -0
package/src/commands/IdentifyCommand.js +41 -16
package/src/commands/ScanCommand.js +6 -3
package/src/config/config.js +88 -2
package/src/document-type-shared.js +13 -3
package/src/document-types/_pedimento-shared-extractors.js +226 -0
package/src/document-types/pedimento-completo-xml.js +322 -0
package/src/document-types/pedimento-completo.js +68 -0
package/src/document-types/pedimento-simplificado.js +7 -286
package/src/file-detection.js +43 -5
package/src/index.js +27 -0
package/src/services/DatabaseService.js +3 -1
package/src/services/GoogleDriveService.js +217 -0
package/src/services/LoggingService.js +1 -1

package/src/document-types/pedimento-completo.js ADDED

@@ -0,0 +1,68 @@
+import {
+  pedimentoYearFromFields,
+  sharedPedimentoExtractors,
+} from './_pedimento-shared-extractors.js';
+/**
+ * "Pedimento Completo" matcher — the 7-page DEF / SEGUNDA / TERCERA copy
+ * (and the "CoveFact" variant). This is distinct from `pedimento_simplificado`
+ * which uses the "FORMA SIMPLIFICADA DE PEDIMENTO" header.
+ *
+ * Match strategy: require the structural fields that the long-form pedimento
+ * always carries (`NUM. PEDIMENTO:`, `CVE.PEDIMENTO:`, `T.OPER:`) plus at
+ * least one of the printed copy markers, while explicitly excluding any
+ * document that already declares itself as a "FORMA SIMPLIFICADA".
+ *
+ * Members of the definition object:
+ *   - `type` / `extensions`: the declared type id (`pedimento_completo`) and
+ *     the file extensions it applies to (`pdf` only).
+ *   - `match(source)`: boolean test over `source` (presumably the text
+ *     extracted from the PDF — confirm against the caller). All patterns
+ *     are case-insensitive.
+ *   - `resolveType(fields)`: final type once fields have been extracted.
+ *   - `extractNumPedimento(source, fields)`: value of the `numPedimento`
+ *     field, or null; `source` is not used.
+ *   - `extractPedimentoYear(source, fields)`: delegates to
+ *     `pedimentoYearFromFields(fields)`; `source` is not used.
+ *   - `extractors`: the `sharedPedimentoExtractors` list imported from
+ *     `_pedimento-shared-extractors.js`.
+ *
+ * NOTE(review): the `DEFINITIVO` copy marker is a bare, case-insensitive
+ * substring test, so it also accepts longer words that contain it (for
+ * example "definitivos") anywhere in the text — confirm this looseness is
+ * intentional.
+ */
+export const pedimentoCompletoDefinition = {
+  type: 'pedimento_completo',
+  extensions: ['pdf'],
+  match: (source) => {
+    if (/FORMA SIMPLIFICADA DE PEDIMENTO/i.test(source)) return false; // simplified form is never this type
+    const hasHeaderFields = // all three header labels must be present
+      /NUM\.?\s*PEDIMENTO:/i.test(source) &&
+      /CVE\.?\s*PEDIMENTO:/i.test(source) &&
+      /T\.?\s*OPER:/i.test(source);
+    if (!hasHeaderFields) return false;
+    const hasCopyMarker = // any single copy marker is enough
+      /ORIGINAL:\s*ADMINISTRACION GENERAL DE ADUANAS/i.test(source) ||
+      /SEGUNDA COPIA/i.test(source) ||
+      /TERCERA COPIA/i.test(source) ||
+      /COPIA\s+(SIMPLIFICAD[AO])?\s*TRANSPORTISTA/i.test(source) ||
+      /DEFINITIVO/i.test(source);
+    return hasCopyMarker;
+  },
+  /**
+   * Resolve the final document type after fields have been extracted.
+   * Mirrors the simplificado logic:
+   *   - R1 rectifications require fechaPagoRectificacion
+   *   - Everything else requires paymentDate
+   * No payment evidence ⇒ proforma_completo.
+   *
+   * Each entry of `fields` is read through its `name`, `found` and `value`
+   * properties. A missing `fields` array, a missing entry, or a nullish
+   * value all count as "no evidence".
+   *
+   * NOTE(review): the `clavePedimento` lookup does not require `f.found`,
+   * while the two date lookups do — confirm that asymmetry is intended.
+   *
+   * @param {Array<{name: string, found: boolean, value: *}>} [fields]
+   * @returns {string} 'pedimento_completo' or 'proforma_completo'
+   */
+  resolveType: (fields) => {
+    const clavePedimento =
+      fields?.find((f) => f.name === 'clavePedimento')?.value ?? null;
+    const paymentDate =
+      fields?.find((f) => f.name === 'paymentDate' && f.found)?.value ?? null;
+    const fechaPagoRectificacion =
+      fields?.find((f) => f.name === 'fechaPagoRectificacion' && f.found)
+        ?.value ?? null;
+    if (clavePedimento === 'R1') { // rectification: judged by its own payment date
+      return fechaPagoRectificacion
+        ? 'pedimento_completo'
+        : 'proforma_completo';
+    }
+    return paymentDate ? 'pedimento_completo' : 'proforma_completo';
+  },
+  extractNumPedimento: (source, fields) => {
+    return fields?.find((f) => f.name === 'numPedimento')?.value ?? null; // null when the field is absent
+  },
+  extractPedimentoYear: (source, fields) => pedimentoYearFromFields(fields),
+  extractors: sharedPedimentoExtractors,
+};

package/src/document-types/pedimento-simplificado.js CHANGED

@@ -1,4 +1,7 @@
-import { FieldResult } from '../document-type-shared.js';
+import {
+  pedimentoYearFromFields,
+  sharedPedimentoExtractors,
+} from './_pedimento-shared-extractors.js';
 export const pedimentoSimplificadoDefinition = {
   type: 'pedimento_simplificado',
@@ -14,7 +17,7 @@ export const pedimentoSimplificadoDefinition = {
    *   - Otherwise: must have paymentDate
    * If no payment evidence is found, it's a "proforma".
    *
-   * @param {FieldResult[]} fields - Extracted fields
+   * @param {import('../document-type-shared.js').FieldResult[]} fields
    * @returns {string} - 'pedimento_simplificado' or 'proforma'
    */
   resolveType: (fields) => {
@@ -27,297 +30,15 @@ export const pedimentoSimplificadoDefinition = {
         ?.value ?? null;
     if (clavePedimento === 'R1') {
-      // Rectification pedimentos require fechaPagoRectificacion
       return fechaPagoRectificacion ? 'pedimento_simplificado' : 'proforma';
     }
-    // Regular pedimentos require paymentDate
     return paymentDate ? 'pedimento_simplificado' : 'proforma';
   },
   extractNumPedimento: (source, fields) => {
     return fields?.find((f) => f.name === 'numPedimento')?.value ?? null;
   },
-  extractPedimentoYear: (source, fields) => {
-    const numPedimento = fields?.find((f) => f.name === 'numPedimento')?.value;
-    if (!numPedimento) {
-      return null;
-    }
-    const year = parseInt(numPedimento.substring(0, 2), 10);
-    return year < 50 ? year + 2000 : year + 1900;
-  },
-  extractors: [
-    // 1) Número de Pedimento (15 dígitos)
-    {
-      field: 'numPedimento',
-      extract: (source) => {
-        const match = source.match(/\d{2}\s?\d{2}\s?\d{4}\s?\d{7}/);
-        return new FieldResult(
-          'numPedimento',
-          !!match,
-          match ? match[0].replace(/\s/g, '') : null,
-        );
-      },
-    },
-    // 2) Tipo de Operación: los 3 caracteres justo después del número
-    {
-      field: 'tipoOperacion',
-      extract: (source) => {
-        // Look for the pedimento number pattern followed by operation type
-        // Pattern matches: "22 07 3429 2002089 EXP RT"
-        const match = source.match(
-          /\d{2}\s+\d{2}\s+\d{4}\s+\d{7}\s+([A-Z]{3})/,
-        );
-        return new FieldResult(
-          'tipoOperacion',
-          !!match,
-          match ? match[1] : null,
-        );
-      },
-    },
-    // 3) Clave de Pedimento: los 2 caracteres justo después de la operación
-    {
-      field: 'clavePedimento',
-      extract: (source) => {
-        // Look for the pedimento number pattern followed by operation type and then the key
-        // Pattern matches: "22 07 3429 2002089 EXP RT" to capture "RT"
-        const match = source.match(
-          /\d{2}\s+\d{2}\s+\d{4}\s+\d{7}\s+[A-Z]{3}\s+([A-Z0-9]{2})/,
-        );
-        return new FieldResult(
-          'clavePedimento',
-          !!match,
-          match ? match[1] : null,
-        );
-      },
-    },
-    // 4) Aduana E/S: Extract the 3-digit aduana code that appears at the end of the peso bruto line
-    {
-      field: 'aduanaEntradaSalida',
-      extract: (source) => {
-        // Look for the peso bruto line format: number followed by decimal amount followed by 3-digit aduana code
-        // Pattern matches formats like: "7 1.100 071" or "7 19,834.260 071" to capture "071"
-        // Uses multiline flag to match line boundaries precisely
-        const match = source.match(/^\s*\d+\s+[\d,.]+\s+(\d{3})\s*$/m);
-        return new FieldResult(
-          'aduanaEntradaSalida',
-          !!match,
-          match ? match[1] : null,
-        );
-      },
-    },
-    // 5) RFC: línea con 12-13 caracteres alfanuméricos
-    {
-      field: 'rfc',
-      extract: (source) => {
-        const match = source.match(/\n\s*([A-Z0-9]{12,13})\s*\n/);
-        return new FieldResult('rfc', !!match, match ? match[1] : null);
-      },
-    },
-    // 6) Código de Aceptación: línea con 8 caracteres alfanuméricos justo después del RFC
-    {
-      field: 'codigoAceptacion',
-      extract: (source) => {
-        // 1) split into trimmed, non-empty lines
-        const lines = source
-          .split(/\r?\n/)
-          .map((l) => l.trim())
-          .filter((l) => l.length > 0);
-        // 2) find the index of an RFC line (12–13 alnum chars)
-        const rfcIndex = lines.findIndex((l) => /^[A-Z0-9]{12,13}$/.test(l));
-        let code = null;
-        // 3) if next line exists and is exactly 8 alnum chars, that's the code
-        if (rfcIndex >= 0 && /^[A-Z0-9]{8}$/.test(lines[rfcIndex + 1] || '')) {
-          code = lines[rfcIndex + 1];
-        }
-        return new FieldResult('codigoAceptacion', code !== null, code);
-      },
-    },
-    // 7) Num. E-Document: exactamente 13 caracteres tras la etiqueta (puede haber múltiples líneas)
-    // {
-    //   field: 'numEDocumento',
-    //   extract: (source) => {
-    //     // Split into lines and find all lines containing NUM. E-DOCUMENT
-    //     const lines = source.split(/\r?\n/);
-    //     const edocLines = lines.filter((line) =>
-    //       /NUM\.?\s*E-DOCUMENT/i.test(line),
-    //     );
-    //     if (edocLines.length === 0) {
-    //       return new FieldResult('numEDocumento', false, null);
-    //     }
-    //     // Extract all 13-character alphanumeric codes from all NUM. E-DOCUMENT lines
-    //     const extractedCodes = [];
-    //     edocLines.forEach((line) => {
-    //       const afterEdoc = line.replace(/.*NUM\.?\s*E-DOCUMENT\s*/i, '');
-    //       const codes = afterEdoc.match(/[A-Z0-9]{13}/g) || [];
-    //       extractedCodes.push(...codes);
-    //     });
-    //     if (extractedCodes.length === 0) {
-    //       return new FieldResult('numEDocumento', false, null);
-    //     }
-    //     // Remove duplicates using Set
-    //     const uniqueCodes = [...new Set(extractedCodes)];
-    //     const formattedValue = `[${uniqueCodes.join(',')}]`;
-    //     return new FieldResult('numEDocumento', true, formattedValue);
-    //   },
-    // },
-    // 8) Fecha de Pago: Look for various payment date patterns
-    {
-      field: 'paymentDate',
-      extract: (source) => {
-        // Try multiple patterns for payment dates
-        let match = source.match(/2\s+PAGO:\s*(\d{2}\/\d{2}\/\d{4})/);
-        if (!match) {
-          match = source.match(/FECHA DE PAGO:\s*(\d{4}\/\d{2}\/\d{2})/);
-        }
-        if (!match) {
-          match = source.match(/PRESENTACION:\s*(\d{2}\/\d{2}\/\d{4})/);
-        }
-        return new FieldResult('paymentDate', !!match, match ? match[1] : null);
-      },
-    },
-    // 9) COVE: NUMERO DE ACUSE DE VALOR (puede haber múltiples líneas)
-    // {
-    //   field: 'cove',
-    //   extract: (source) => {
-    //     // Split into lines and find all lines containing NUMERO DE ACUSE DE VALOR or COVE
-    //     const lines = source.split(/\r?\n/);
-    //     const coveLines = lines.filter(
-    //       (line) =>
-    //         /COVE/i.test(line) || /NUMERO DE ACUSE DE VALOR/i.test(line),
-    //     );
-    //     if (coveLines.length === 0) {
-    //       return new FieldResult('cove', false, null);
-    //     }
-    //     // Extract all COVE values from all matching lines
-    //     const coveValues = [];
-    //     coveLines.forEach((line) => {
-    //       const coveMatches = line.match(/COVE([A-Z0-9]+)/gi) || [];
-    //       // Extract just the alphanumeric parts (remove the "COVE" prefix)
-    //       const codes = coveMatches.map((match) => match);
-    //       coveValues.push(...codes);
-    //     });
-    //     if (coveValues.length === 0) {
-    //       return new FieldResult('cove', false, null);
-    //     }
-    //     // Remove duplicates using Set
-    //     const uniqueCoveValues = [...new Set(coveValues)];
-    //     const formattedValue = `[${uniqueCoveValues.join(',')}]`;
-    //     return new FieldResult('cove', true, formattedValue);
-    //   },
-    // },
-    // 10) Peso Bruto: Extract weight value
-    // {
-    //   field: 'pesoBruto',
-    //   extract: (source) => {
-    //     // Look for the peso bruto value with decimal format
-    //     const match = source.match(/(\d+\.\d+)\d{3}/);
-    //     return new FieldResult('pesoBruto', !!match, match ? match[1] : null);
-    //   },
-    // },
-    // 11) Patente: Extract patent number
-    {
-      field: 'patente',
-      extract: (source) => {
-        // Look for the PATENTE: PEDIMENTO: ADUANA: header line
-        // Then find the corresponding data line with three numbers
-        const lines = source.split(/\r?\n/);
-        const patenteHeaderIndex = lines.findIndex((line) =>
-          /PATENTE:.*PEDIMENTO:.*ADUANA:/i.test(line),
-        );
-        if (patenteHeaderIndex >= 0) {
-          // Look for the data line after the header (format: "3429 2002089 07")
-          for (let i = patenteHeaderIndex + 1; i < lines.length; i++) {
-            const line = lines[i].trim();
-            if (/^\d+\s+\d+\s+\d+$/.test(line)) {
-              const parts = line.split(/\s+/);
-              return new FieldResult('patente', true, parts[0]); // First number is the PATENTE
-            }
-          }
-        }
-        return new FieldResult('patente', false, null);
-      },
-    },
-    // 12) Numero de Operacion Bancaria
-    // {
-    //   field: 'numeroOperacionBancaria',
-    //   extract: (source) => {
-    //     const match = source.match(
-    //       /NUMERO DE OPERACION BANCARIA:\s*([A-Z0-9]+)/i,
-    //     );
-    //     return new FieldResult(
-    //       'numeroOperacionBancaria',
-    //       !!match,
-    //       match ? match[1] : null,
-    //     );
-    //   },
-    // },
-    // 13) Numero de Transaccion SAT
-    // {
-    //   field: 'numeroTransaccionSAT',
-    //   extract: (source) => {
-    //     const match = source.match(/NUMERO DE TRANSACCION SAT:\s*([A-Z0-9]+)/i);
-    //     return new FieldResult(
-    //       'numeroTransaccionSAT',
-    //       !!match,
-    //       match ? match[1] : null,
-    //     );
-    //   },
-    // },
-    // 14) Fecha de Pago Rectificación
-    {
-      field: 'fechaPagoRectificacion',
-      extract: (source) => {
-        // Look for the RECTIFICACION section header
-        const rectSectionMatch = source.match(
-          /RECTIFICACION[\s\S]{0,500}?(\d{2}\/\d{2}\/\d{4})/i,
-        );
-        if (rectSectionMatch) {
-          return new FieldResult(
-            'fechaPagoRectificacion',
-            true,
-            rectSectionMatch[1],
-          );
-        }
-        // Fallback: look for any date after FECHA PAGO RECT
-        const fechaMatch = source.match(
-          /FECHA PAGO RECT[\s\S]{0,500}?(\d{2}\/\d{2}\/\d{4})/i,
-        );
-        if (fechaMatch) {
-          return new FieldResult('fechaPagoRectificacion', true, fechaMatch[1]);
-        }
+  extractPedimentoYear: (source, fields) => pedimentoYearFromFields(fields),
-        return new FieldResult('fechaPagoRectificacion', false, null);
-      },
-    },
-  ],
+  extractors: sharedPedimentoExtractors,
 };

package/src/file-detection.js CHANGED

@@ -4,6 +4,35 @@ import { PDFParse } from 'pdf-parse';
 import { extractDocumentFields } from './document-type-shared.js';
+// Document types that participate in arela_path composition. The XML type is
+// kept here even though its matcher is currently disabled — once re-enabled
+// in document-type-shared.js no further changes are needed here.
+const ARELA_PATH_TYPES = new Set([ // composeArelaPath returns null for any type not listed
+  'pedimento_simplificado',
+  'pedimento_completo',
+  'pedimento_completo_xml', // patente falls back to the filename for this type
+]);
+/**
+ * For `pedimento_completo_xml` the patente is not present in the XML body —
+ * it must be parsed from the filename. Three known patterns are tried.
+ *
+ * Only the base name of `filePath` is inspected, the whole name must match
+ * (patterns are anchored at both ends) and matching is case-insensitive.
+ * Patterns, tried in this order (first match wins):
+ *   1. `VU_<4 digits>_<3 digits>_<7 digits>.xml`  → the 4-digit group
+ *   2. `<3 digits>-<4 digits>-<7 digits>.xml`     → the 4-digit group
+ *   3. `<4 digits><4 digits><7 digits>.xml`, optionally with a
+ *      `_<15 digits>` suffix before the extension → the second 4-digit group
+ *
+ * Relies on `path` being in scope in this module (its import is not visible
+ * in this hunk).
+ *
+ * @param {string} [filePath] - Path or file name of the XML document.
+ * @returns {string|null} The captured 4-digit patente, or null when
+ *   `filePath` is falsy or no pattern matches.
+ */
+function patenteFromXmlFilename(filePath) {
+  if (!filePath) return null; // nothing to parse
+  const fileName = path.basename(filePath);
+  let m = fileName.match(/^VU_(\d{4})_\d{3}_\d{7}\.xml$/i); // pattern 1
+  if (m) return m[1];
+  m = fileName.match(/^\d{3}-(\d{4})-\d{7}\.xml$/i); // pattern 2
+  if (m) return m[1];
+  m = fileName.match(/^\d{4}(\d{4})\d{7}(?:_\d{15})?\.xml$/i); // pattern 3
+  if (m) return m[1];
+  return null;
+}
 /**
  * Compose arela_path from extracted pedimento fields
  * Format: RFC/Year/Patente/Aduana/Pedimento/
@@ -15,16 +44,21 @@ function composeArelaPath(
   detectedPedimentoYear,
   filePath,
 ) {
-  if (detectedType !== 'pedimento_simplificado') {
+  if (!ARELA_PATH_TYPES.has(detectedType)) {
     return null;
   }
   const rfc = fields?.find((f) => f.name === 'rfc')?.value;
-  const patente = fields?.find((f) => f.name === 'patente')?.value;
+  let patente = fields?.find((f) => f.name === 'patente')?.value;
   const aduana = fields?.find((f) => f.name === 'aduanaEntradaSalida')?.value;
   const pedimento = fields?.find((f) => f.name === 'numPedimento')?.value;
   const year = detectedPedimentoYear;
+  // XML matcher does not extract patente from the body — derive from filename.
+  if (!patente && detectedType === 'pedimento_completo_xml') {
+    patente = patenteFromXmlFilename(filePath);
+  }
   // All components are required for a valid arela_path
   if (!rfc || !year || !patente || !aduana || !pedimento) {
     console.log('⚠️ Missing required fields for arela_path composition:', {
@@ -155,12 +189,12 @@ export class FileDetectionService {
    * @returns {Promise<string>} - Extracted text
    */
   async extractTextFromPDF(filePath) {
+    let parser;
     try {
       const dataBuffer = fs.readFileSync(filePath);
-      // Convert Buffer to Uint8Array as required by pdf-parse
       const uint8Array = new Uint8Array(dataBuffer);
-      const pdfParse = new PDFParse(uint8Array);
-      const result = await pdfParse.getText();
+      parser = new PDFParse({ data: uint8Array });
+      const result = await parser.getText();
       return result.text;
     } catch (error) {
       console.error(
@@ -168,6 +202,10 @@ export class FileDetectionService {
         error.message,
       );
       throw new Error(`Failed to extract text from PDF: ${error.message}`);
+    } finally {
+      if (parser) {
+        await parser.destroy();
+      }
     }
   }

package/src/index.js CHANGED

@@ -1,6 +1,7 @@
 #!/usr/bin/env node
 import { Command } from 'commander';
+import gdriveSyncCommand from './commands/GDriveSyncCommand.js';
 import identifyCommand from './commands/IdentifyCommand.js';
 import pollWorkerCommand from './commands/PollWorkerCommand.js';
 import PropagateCommand from './commands/PropagateCommand.js';
@@ -458,6 +459,32 @@ class ArelaUploaderCLI {
     // END OF NEW SIMPLIFIED COMMANDS
     // ============================================================================
+    // GDrive sync command - mirror a Google Drive folder to local before scan
+    this.program
+      .command('gdrive-sync')
+      .description(
+        '☁️  Mirror a Google Drive folder to local filesystem (pre-scan source)',
+      )
+      .option(
+        '--root-folder <id>',
+        'Drive folder ID to sync (overrides GDRIVE_ROOT_FOLDER_ID)',
+      )
+      .option(
+        '--dest <path>',
+        'Local mirror destination (overrides GDRIVE_LOCAL_MIRROR_PATH)',
+      )
+      .option('--full', 'Ignore state file and re-verify all files')
+      .option('--dry-run', 'List/plan only, no downloads or writes')
+      .action(async (options) => {
+        try {
+          await gdriveSyncCommand.execute(options);
+        } catch (error) {
+          this.errorHandler.handleFatalError(error, {
+            command: 'gdrive-sync',
+          });
+        }
+      });
     // Watch command
     this.program
       .command('watch')

package/src/services/DatabaseService.js CHANGED

@@ -144,7 +144,9 @@ export class DatabaseService {
         rfc: null,
         message: null,
         file_extension: fileExtension,
-        is_like_simplificado: filename.toLowerCase().includes('simp'),
+        // Flag any PDF whose filename hints at a pedimento (simplificado,
+        // completo, or CoveFact). Column name preserved; semantics broadened.
+        is_like_simplificado: /(simp|pedim|covefact)/i.test(filename),
         year: null,
         created_at: new Date().toISOString(),
         updated_at: new Date().toISOString(),