npm - @arela/uploader - Versions diffs - 0.1.0 → 0.2.1 - Mend

@arela/uploader 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/.env.template +20 -0
package/OPTIMIZATION_SUMMARY.md +154 -0
package/PERFORMANCE_OPTIMIZATIONS.md +270 -0
package/README.md +412 -24
package/arela-upload.log +0 -0
package/commands.md +6 -0
package/package.json +12 -9
package/src/document-type-shared.js +80 -0
package/src/document-types/pedimento-simplificado.js +289 -0
package/src/file-detection.js +194 -0
package/src/index.js +1755 -575

package/src/document-types/pedimento-simplificado.js ADDED Viewed

@@ -0,0 +1,289 @@
+import { FieldResult } from '../document-type-shared.js';
+/**
+ * Document-type definition for 'pedimento_simplificado' PDFs.
+ *
+ * Exposes a `match` predicate that recognises the document from its text,
+ * two helpers that read values out of already-extracted fields, and a list
+ * of `extractors`. Each extractor receives the document text and returns a
+ * `FieldResult(name, found, value)`.
+ */
+export const pedimentoSimplificadoDefinition = {
+  type: 'pedimento_simplificado',
+  extensions: ['pdf'],
+  /**
+   * Decide whether the text belongs to this document type.
+   * @param {string} source - Text of the document.
+   * @returns {boolean} True when more than half of the clue patterns are present.
+   */
+  match: (source) => {
+    const clues = [/FORMA SIMPLIFICADA DE PEDIMENTO/i];
+    const found = clues.filter((clue) => clue.test(source));
+    return found.length > clues.length / 2;
+  },
+  /**
+   * Read the pedimento number from the extracted fields.
+   * @param {string} source - Text of the document (unused).
+   * @param {Array<{name: string, value: *}>} fields - Extracted fields.
+   * @returns {*} Value of the 'numPedimento' field, or null when absent.
+   */
+  extractNumPedimento: (source, fields) => {
+    return fields?.find((f) => f.name === 'numPedimento')?.value ?? null;
+  },
+  /**
+   * Derive a four-digit year from the first two characters of the pedimento number.
+   * @param {string} source - Text of the document (unused).
+   * @param {Array<{name: string, value: *}>} fields - Extracted fields.
+   * @returns {number|null} The year, or null when the number is missing or
+   *   does not start with two digits.
+   */
+  extractPedimentoYear: (source, fields) => {
+    const numPedimento = fields?.find((f) => f.name === 'numPedimento')?.value;
+    if (!numPedimento) {
+      return null;
+    }
+    const year = Number.parseInt(String(numPedimento).substring(0, 2), 10);
+    // A non-numeric prefix must not leak out as NaN.
+    if (Number.isNaN(year)) {
+      return null;
+    }
+    // Two-digit pivot: 00-49 map to 20xx, 50-99 map to 19xx.
+    return year < 50 ? year + 2000 : year + 1900;
+  },
+  extractors: [
+    // 1) Pedimento number (15 digits, optionally separated by single whitespace characters)
+    {
+      field: 'numPedimento',
+      extract: (source) => {
+        const match = source.match(/\d{2}\s?\d{2}\s?\d{4}\s?\d{7}/);
+        return new FieldResult(
+          'numPedimento',
+          !!match,
+          // Whitespace between the digit groups is stripped from the value.
+          match ? match[0].replace(/\s/g, '') : null
+        );
+      },
+    },
+    // 2) Operation type: the 3 letters right after the pedimento number
+    {
+      field: 'tipoOperacion',
+      extract: (source) => {
+        // Pattern matches: "22 07 3429 2002089 EXP RT" and captures "EXP"
+        const match = source.match(
+          /\d{2}\s+\d{2}\s+\d{4}\s+\d{7}\s+([A-Z]{3})/,
+        );
+        return new FieldResult(
+          'tipoOperacion',
+          !!match,
+          match ? match[1] : null
+        );
+      },
+    },
+    // 3) Pedimento key: the 2 characters right after the operation type
+    {
+      field: 'clavePedimento',
+      extract: (source) => {
+        // Pattern matches: "22 07 3429 2002089 EXP RT" and captures "RT"
+        const match = source.match(
+          /\d{2}\s+\d{2}\s+\d{4}\s+\d{7}\s+[A-Z]{3}\s+([A-Z0-9]{2})/,
+        );
+        return new FieldResult(
+          'clavePedimento',
+          !!match,
+          match ? match[1] : null
+        );
+      },
+    },
+    // 4) Aduana E/S: the 3-digit code at the end of the peso bruto line
+    {
+      field: 'aduanaEntradaSalida',
+      extract: (source) => {
+        // Line format: number, decimal amount, 3-digit aduana code,
+        // e.g. "7 1.100 071" or "7 19,834.260 071" captures "071".
+        // The multiline flag anchors the pattern to line boundaries.
+        const match = source.match(/^\s*\d+\s+[\d,.]+\s+(\d{3})\s*$/m);
+        return new FieldResult(
+          'aduanaEntradaSalida',
+          !!match,
+          match ? match[1] : null
+        );
+      },
+    },
+    // 5) RFC: a line made of 12-13 uppercase alphanumeric characters
+    {
+      field: 'rfc',
+      extract: (source) => {
+        const match = source.match(/\n\s*([A-Z0-9]{12,13})\s*\n/);
+        return new FieldResult(
+          'rfc',
+          !!match,
+          match ? match[1] : null
+        );
+      },
+    },
+    // 6) Acceptance code: an 8-character alphanumeric line right after the RFC line
+    {
+      field: 'codigoAceptacion',
+      extract: (source) => {
+        // Work on trimmed, non-empty lines so blank lines do not separate RFC and code.
+        const lines = source
+          .split(/\r?\n/)
+          .map((l) => l.trim())
+          .filter((l) => l.length > 0);
+        // First line that looks like an RFC (12-13 alphanumeric characters).
+        const rfcIndex = lines.findIndex((l) =>
+          /^[A-Z0-9]{12,13}$/.test(l),
+        );
+        let code = null;
+        // The code is the following line, only if it is exactly 8 alphanumeric characters.
+        if (rfcIndex >= 0 && /^[A-Z0-9]{8}$/.test(lines[rfcIndex + 1] || '')) {
+          code = lines[rfcIndex + 1];
+        }
+        return new FieldResult(
+          'codigoAceptacion',
+          code !== null,
+          code
+        );
+      },
+    },
+    // 7) Num. E-Document: 13-character codes after the label (may appear on several lines)
+    {
+      field: 'numEDocumento',
+      extract: (source) => {
+        const lines = source.split(/\r?\n/);
+        const edocLines = lines.filter((line) =>
+          /NUM\.?\s*E-DOCUMENT/i.test(line),
+        );
+        if (edocLines.length === 0) {
+          return new FieldResult('numEDocumento', false, null);
+        }
+        // Collect every 13-character alphanumeric run that follows the label on each line.
+        const extractedCodes = edocLines.flatMap((line) => {
+          const afterEdoc = line.replace(/.*NUM\.?\s*E-DOCUMENT\s*/i, '');
+          return afterEdoc.match(/[A-Z0-9]{13}/g) || [];
+        });
+        if (extractedCodes.length === 0) {
+          return new FieldResult('numEDocumento', false, null);
+        }
+        // De-duplicate while keeping first-seen order; value is formatted as "[a,b,...]".
+        const uniqueCodes = [...new Set(extractedCodes)];
+        const formattedValue = `[${uniqueCodes.join(',')}]`;
+        return new FieldResult('numEDocumento', true, formattedValue);
+      },
+    },
+    // 8) Payment date: several label/date layouts are tried in order
+    {
+      field: 'paymentDate',
+      extract: (source) => {
+        let match = source.match(/2\s+PAGO:\s*(\d{2}\/\d{2}\/\d{4})/);
+        if (!match) {
+          match = source.match(/FECHA DE PAGO:\s*(\d{4}\/\d{2}\/\d{2})/);
+        }
+        if (!match) {
+          match = source.match(/PRESENTACION:\s*(\d{2}\/\d{2}\/\d{4})/);
+        }
+        return new FieldResult(
+          'paymentDate',
+          !!match,
+          match ? match[1] : null
+        );
+      },
+    },
+    // 9) COVE: "NUMERO DE ACUSE DE VALOR" (may appear on several lines)
+    {
+      field: 'cove',
+      extract: (source) => {
+        // Only lines mentioning COVE or NUMERO DE ACUSE DE VALOR are scanned.
+        const lines = source.split(/\r?\n/);
+        const coveLines = lines.filter(
+          (line) =>
+            /COVE/i.test(line) || /NUMERO DE ACUSE DE VALOR/i.test(line),
+        );
+        if (coveLines.length === 0) {
+          return new FieldResult('cove', false, null);
+        }
+        // A global match returns the whole matches, so each value keeps its
+        // "COVE" prefix exactly as it appears in the text.
+        const coveValues = coveLines.flatMap(
+          (line) => line.match(/COVE([A-Z0-9]+)/gi) || [],
+        );
+        if (coveValues.length === 0) {
+          return new FieldResult('cove', false, null);
+        }
+        // De-duplicate while keeping first-seen order; value is formatted as "[a,b,...]".
+        const uniqueCoveValues = [...new Set(coveValues)];
+        const formattedValue = `[${uniqueCoveValues.join(',')}]`;
+        return new FieldResult('cove', true, formattedValue);
+      },
+    },
+    // 10) Peso bruto: the decimal amount on the peso bruto line
+    {
+      field: 'pesoBruto',
+      extract: (source) => {
+        // Preferred: the same line the aduana extractor reads, e.g.
+        // "7 1.100 071" or "7 19,834.260 071" captures "1.100" / "19,834.260".
+        // The value is returned as written, thousands separators included.
+        const lineMatch = source.match(
+          /^\s*\d+\s+(\d[\d,]*\.\d+)\s+\d{3}\s*$/m,
+        );
+        // Fallback: a decimal immediately followed by three more digits,
+        // which covers text where the weight and the code are not separated.
+        const match = lineMatch ?? source.match(/(\d+\.\d+)\d{3}/);
+        return new FieldResult(
+          'pesoBruto',
+          !!match,
+          match ? match[1] : null
+        );
+      },
+    },
+    // 11) Patente: first number on the data line under the PATENTE/PEDIMENTO/ADUANA header
+    {
+      field: 'patente',
+      extract: (source) => {
+        const lines = source.split(/\r?\n/);
+        const patenteHeaderIndex = lines.findIndex((line) =>
+          /PATENTE:.*PEDIMENTO:.*ADUANA:/i.test(line),
+        );
+        if (patenteHeaderIndex >= 0) {
+          // The first later line made of exactly three numbers is the data
+          // line (format: "3429 2002089 07"); its first number is the patente.
+          for (let i = patenteHeaderIndex + 1; i < lines.length; i++) {
+            const line = lines[i].trim();
+            if (/^\d+\s+\d+\s+\d+$/.test(line)) {
+              const parts = line.split(/\s+/);
+              return new FieldResult('patente', true, parts[0]);
+            }
+          }
+        }
+        return new FieldResult('patente', false, null);
+      },
+    },
+    // 12) Bank operation number
+    {
+      field: 'numeroOperacionBancaria',
+      extract: (source) => {
+        const match = source.match(
+          /NUMERO DE OPERACION BANCARIA:\s*([A-Z0-9]+)/i,
+        );
+        return new FieldResult(
+          'numeroOperacionBancaria',
+          !!match,
+          match ? match[1] : null
+        );
+      },
+    },
+    // 13) SAT transaction number
+    {
+      field: 'numeroTransaccionSAT',
+      extract: (source) => {
+        const match = source.match(/NUMERO DE TRANSACCION SAT:\s*([A-Z0-9]+)/i);
+        return new FieldResult(
+          'numeroTransaccionSAT',
+          !!match,
+          match ? match[1] : null
+        );
+      },
+    },
+  ],
+};

package/src/file-detection.js ADDED Viewed

@@ -0,0 +1,194 @@
+import fs from 'fs';
+import path from 'path';
+import { getTextExtractor } from 'office-text-extractor';
+import { extractDocumentFields } from './document-type-shared.js';
+const extractor = getTextExtractor(); // module-level extractor from office-text-extractor; used by FileDetectionService.extractTextFromPDF
+/**
+ * Build the arela_path folder string for a detected pedimento.
+ *
+ * Layout: RFC/Year/Patente/Aduana/Pedimento/ (folders only, trailing slash,
+ * no file name). Example: PED781129JT6/2023/3429/07/3019796/
+ * NOTE(review): the example shows a 2-digit aduana and a 7-digit pedimento;
+ * this function uses the field values as given (aduana is only left-padded
+ * to at least 2 characters) - confirm the shape of the incoming fields.
+ *
+ * @param {string|null} detectedType - Only 'pedimento_simplificado' yields a path.
+ * @param {Array<{name: string, value: *}>} fields - Extracted fields; reads
+ *   'rfc', 'patente', 'aduanaEntradaSalida' and 'numPedimento'.
+ * @param {number|null} detectedPedimentoYear - Year segment of the path.
+ * @param {string} filePath - Not used when composing the path.
+ * @returns {string|null} The composed path, or null for other document types
+ *   or when any component is missing.
+ */
+function composeArelaPath(detectedType, fields, detectedPedimentoYear, filePath) {
+  const isPedimento = detectedType === 'pedimento_simplificado';
+  if (!isPedimento) {
+    return null;
+  }
+  const valueOf = (fieldName) =>
+    fields?.find((entry) => entry.name === fieldName)?.value;
+  const parts = {
+    rfc: valueOf('rfc'),
+    year: detectedPedimentoYear,
+    patente: valueOf('patente'),
+    aduana: valueOf('aduanaEntradaSalida'),
+    pedimento: valueOf('numPedimento'),
+  };
+  // Every component must be truthy; the presence map doubles as the log payload.
+  const presence = Object.fromEntries(
+    Object.entries(parts).map(([key, value]) => [key, !!value]),
+  );
+  if (Object.values(presence).includes(false)) {
+    console.log('⚠️ Missing required fields for arela_path composition:', presence);
+    return null;
+  }
+  // Left-pad the aduana so a single digit becomes two (07 instead of 7).
+  const paddedAduana = parts.aduana.toString().padStart(2, '0');
+  const segments = [parts.rfc, parts.year, parts.patente, paddedAduana, parts.pedimento];
+  const arelaPath = `${segments.join('/')}/`;
+  console.log(`✅ Composed arela_path: ${arelaPath}`);
+  return arelaPath;
+}
+/**
+ * Build the result object returned when nothing could be detected.
+ * @param {string} error - Reason the detection produced no data.
+ * @returns {{detectedType: null, fields: Array, detectedPedimento: null, detectedPedimentoYear: null, arelaPath: null, text: string, error: string}}
+ */
+function emptyDetectionResult(error) {
+  return {
+    detectedType: null,
+    fields: [],
+    detectedPedimento: null,
+    detectedPedimentoYear: null,
+    arelaPath: null,
+    text: '',
+    error
+  };
+}
+/**
+ * File Detection Service
+ * Detects document types and extracts metadata from files
+ */
+export class FileDetectionService {
+  /**
+   * Detect document type from a file.
+   *
+   * Never rejects: any failure is logged and reported through the `error`
+   * property of the returned object.
+   *
+   * @param {string} filePath - Path to the file to analyze
+   * @returns {Promise<{detectedType: string|null, fields: Array, detectedPedimento: string|null, detectedPedimentoYear: number|null, arelaPath: string|null, text: string, error: string|null}>}
+   */
+  async detectFile(filePath) {
+    try {
+      const fileExtension = path.extname(filePath).toLowerCase().replace('.', '');
+      const fileName = path.basename(filePath);
+      console.log(`🔍 Analyzing file: ${fileName} (${fileExtension})`);
+      let text = '';
+      // Extract text based on file type
+      switch (fileExtension) {
+        case 'pdf':
+          text = await this.extractTextFromPDF(filePath);
+          break;
+        case 'txt':
+        case 'xml':
+          // Plain-text formats are read as-is.
+          text = await fs.promises.readFile(filePath, 'utf8');
+          break;
+        default:
+          console.log(`⚠️ Unsupported file type: ${fileExtension}`);
+          return emptyDetectionResult(`Unsupported file type: ${fileExtension}`);
+      }
+      if (!text || text.trim().length === 0) {
+        console.log('⚠️ No text extracted from file');
+        return emptyDetectionResult('No text could be extracted from file');
+      }
+      // Extract document fields and detect type
+      const [detectedType, fields, detectedPedimento, detectedPedimentoYear] =
+        extractDocumentFields(text, fileExtension, filePath);
+      // Compose arela_path for pedimento_simplificado documents
+      const arelaPath = composeArelaPath(detectedType, fields, detectedPedimentoYear, filePath);
+      return {
+        detectedType,
+        fields,
+        detectedPedimento,
+        detectedPedimentoYear,
+        arelaPath,
+        text,
+        error: null
+      };
+    } catch (error) {
+      console.error(`❌ Error detecting file ${filePath}:`, error.message);
+      return emptyDetectionResult(error.message);
+    }
+  }
+  /**
+   * Extract text from PDF file.
+   * @param {string} filePath - Path to PDF file
+   * @returns {Promise<string>} - Extracted text
+   * @throws {Error} When the file cannot be read or parsed; the original
+   *   error is attached as `cause`.
+   */
+  async extractTextFromPDF(filePath) {
+    try {
+      const buffer = await fs.promises.readFile(filePath);
+      // NOTE(review): a Buffer is passed together with type 'file' - confirm
+      // against the office-text-extractor API that this pairing is intended.
+      return await extractor.extractText({
+        input: buffer,
+        type: 'file'
+      });
+    } catch (error) {
+      console.error(`Error extracting text from PDF ${filePath}:`, error.message);
+      throw new Error(`Failed to extract text from PDF: ${error.message}`, { cause: error });
+    }
+  }
+  /**
+   * Run detection over a list of files, one at a time and in order.
+   * @param {Array<string>} filePaths - Array of file paths to analyze
+   * @returns {Promise<Array>} - One entry per input path: the detectFile
+   *   result with `filePath` added
+   */
+  async detectFiles(filePaths) {
+    const results = [];
+    for (const filePath of filePaths) {
+      const result = await this.detectFile(filePath);
+      results.push({
+        filePath,
+        ...result
+      });
+    }
+    return results;
+  }
+  /**
+   * Check if file type is supported for detection.
+   * NOTE(review): detectFile also reads 'txt' and 'xml', but only 'pdf' is
+   * reported as supported here - confirm this is intended.
+   * @param {string} filePath - Path to file
+   * @returns {boolean} - True if file type is supported
+   */
+  isSupportedFileType(filePath) {
+    const fileExtension = path.extname(filePath).toLowerCase().replace('.', '');
+    const supportedExtensions = ['pdf'];
+    return supportedExtensions.includes(fileExtension);
+  }
+  /**
+   * Filter files to only include supported types.
+   * @param {Array<string>} filePaths - Array of file paths
+   * @returns {Array<string>} - Filtered array of supported file paths
+   */
+  filterSupportedFiles(filePaths) {
+    return filePaths.filter(filePath => this.isSupportedFileType(filePath));
+  }
+}
+export default FileDetectionService;
+export { composeArelaPath };