npm - @arela/uploader - Versions diffs - 1.0.22 → 1.0.24 - Mend

@arela/uploader 1.0.22 → 1.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

package/package.json +1 -1
package/scripts/scoring-compare.js +243 -0
package/scripts/scoring-phase4-check.js +96 -0
package/src/commands/IdentifyCommand.js +34 -6
package/src/commands/ScanCommand.js +15 -0
package/src/config/config.js +28 -2
package/src/document-type-shared.js +15 -7
package/src/document-types/_pedimento-shared-extractors.js +27 -8
package/src/document-types/factura-inter-agencia.js +186 -0
package/src/document-types/pedimento-completo-xml.js +62 -12
package/src/document-types/pedimento-completo.js +5 -3
package/src/document-types/pedimento-simplificado.js +5 -2
package/src/document-types/proforma.js +2 -2
package/src/file-detection.js +30 -6
package/src/scoring/db-matcher-adapter.js +98 -0
package/src/scoring/matchers-seed.js +386 -0
package/src/scoring/scoring-engine.js +218 -0
package/src/services/ScanApiService.js +14 -0
package/tests/unit/factura-inter-agencia.test.js +218 -0
package/tests/unit/pedimento-completo-xml-matcher.test.js +271 -0
package/tests/unit/pedimento-simplificado-matcher.test.js +185 -0
package/tests/unit/scoring-engine.test.js +221 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@arela/uploader",
-  "version": "1.0.22",
+  "version": "1.0.24",
   "description": "CLI to upload files/directories to Arela",
   "bin": {
     "arela": "./src/index.js"

package/scripts/scoring-compare.js ADDED Viewed

@@ -0,0 +1,243 @@
+/**
+ * Scoring engine validation harness (PROTOTYPE).
+ *
+ * Runs the CURRENT first-match-wins engine (`extractDocumentFields`) and the new
+ * best-match scoring engine (`classifyDocument`) over the same corpus and prints
+ * a side-by-side comparison so we can confirm best-match reproduces (or
+ * improves on) the current behaviour before wiring it into the pipeline.
+ *
+ * Usage:
+ *   node scripts/scoring-compare.js                # built-in synthetic samples
+ *   node scripts/scoring-compare.js <folder>       # + real .pdf/.xml/.txt files
+ *
+ * The built-in samples include the `factura_inter_agencia` vs
+ * `factura_comercial` case, which the current engine only resolves via
+ * registration order — the harness shows best-match resolving it by score,
+ * independent of matcher order.
+ */
+import fs from 'fs';
+import path from 'path';
+import { extractDocumentFields } from '../src/document-type-shared.js';
+import FileDetectionService from '../src/file-detection.js';
+import { classifyDocument, scoreAll } from '../src/scoring/scoring-engine.js';
+import { scoringMatchers } from '../src/scoring/matchers-seed.js';
+// --------------------------- synthetic corpus -------------------------------
+// Compact, representative texts that trigger the relevant clues. Real pdf-parse
+// output is messier — pass a folder to validate against production documents.
+//
+// Each sample carries:
+//   name      - label printed in the comparison row
+//   extension - file extension (no leading dot) handed to both engines
+//   expected  - document type the best-match result is checked against
+//   text      - document content handed to both engines as `source`
+//   filePath  - optional path handed to both engines (only `completo-xml`
+//               sets it; the other samples leave it undefined)
+const SAMPLES = [
+  {
+    name: 'simplificado-paid',
+    extension: 'pdf',
+    expected: 'pedimento_simplificado',
+    text: `FORMA SIMPLIFICADA DEL PEDIMENTO
+NUM. PEDIMENTO: 26 07 3429 6000079
+CVE. PEDIMENTO: A1
+T. OPER: IMP
+RFC: CSM9204097Q1
+FECHA DE PAGO: 04/03/2026
+*** PAGO ELECTRONICO ***`,
+  },
+  {
+    name: 'simplificado-unpaid (proforma)',
+    extension: 'pdf',
+    expected: 'proforma',
+    text: `FORMA SIMPLIFICADA DE PEDIMENTO
+NUM. PEDIMENTO: 26 07 3429 6000080
+CVE. PEDIMENTO: A1
+T. OPER: IMP
+RFC: CSM9204097Q1
+*** NO PAGADO ***`,
+  },
+  {
+    name: 'completo',
+    extension: 'pdf',
+    expected: 'pedimento_completo',
+    text: `NUM. PEDIMENTO: 26 07 3429 2002089
+CVE. PEDIMENTO: A1
+T. OPER: IMP
+SEGUNDA COPIA TRANSPORTISTA
+CERTIFICACIONES
+CUADRO DE LIQUIDACION
+*** PAGO ELECTRONICO ***
+FECHA DE PAGO: 02/03/2026`,
+  },
+  {
+    name: 'completo-xml',
+    extension: 'xml',
+    filePath: '/tmp/260734296016642.xml',
+    expected: 'pedimento_completo_xml',
+    text: `<?xml version="1.0"?>
+<ns2:consultarPedimentoCompletoRespuesta>
+  <ns2:pedimento>6016642</ns2:pedimento>
+  <ns2:aduanaEntradaSalida><ns2:clave>70</ns2:clave></ns2:aduanaEntradaSalida>
+  <ns2:fechas><ns2:clave>2</ns2:clave><ns2:fecha>2026-03-02-06:00</ns2:fecha></ns2:fechas>
+  <ns2:fechas><ns2:clave>5</ns2:clave><ns2:fecha>2026-02-20-06:00</ns2:fecha></ns2:fechas>
+  <ns2:rfc>CSM9204097Q1</ns2:rfc>
+</ns2:consultarPedimentoCompletoRespuesta>`,
+  },
+  {
+    name: 'doda-pdf',
+    extension: 'pdf',
+    expected: 'doda_pdf',
+    text: `DOCUMENTO DE OPERACION PARA DESPACHO ADUANERO
+DODA
+VUCEM
+||070|3429|2|4009029|109335668|A231|
+2026-03-02`,
+  },
+  {
+    name: 'doda-xml',
+    extension: 'xml',
+    expected: 'doda_xml',
+    text: `<?xml version="1.0"?>
+<documentoOperacion>
+  <numPedimento>260734292002089</numPedimento>
+  <patenteAduanal>3429</patenteAduanal>
+  <aduanaDespacho>07</aduanaDespacho>
+</documentoOperacion>`,
+  },
+  {
+    name: 'inter-agencia (vs comercial)',
+    extension: 'xml',
+    expected: 'factura_inter_agencia',
+    text: `<cfdi:Comprobante xmlns:cfdi="..." TipoDeComprobante="I">
+  <cfdi:Emisor Rfc="NAA120215F20"/>
+  <cfdi:Receptor Rfc="PCC1008161WA"/>
+  <cfdi:Concepto ClaveProdServ="78141502" Descripcion="Servicios de agente aduanal"/>
+</cfdi:Comprobante>`,
+  },
+  {
+    name: 'factura-comercial',
+    extension: 'xml',
+    expected: 'factura_comercial',
+    text: `<cfdi:Comprobante xmlns:cfdi="..." TipoDeComprobante="I">
+  <cfdi:Emisor Rfc="ABC010101AB1"/>
+  <cfdi:Receptor Rfc="XYZ020202CD2"/>
+  <tfd:TimbreFiscalDigital/>
+  pedimento 26 07 3429 6016477
+</cfdi:Comprobante>`,
+  },
+  {
+    name: 'support-document',
+    extension: 'xml',
+    expected: 'support_document',
+    text: `<?xml version="1.0"?>
+<soapenv:Envelope xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/">
+  <oxml:tipoOperacion>IMP</oxml:tipoOperacion>
+  <oxml:patenteAduanal>3429</oxml:patenteAduanal>
+</soapenv:Envelope>`,
+  },
+];
+// --------------------------- comparison -------------------------------------
+/**
+ * Classifies a document with the current first-match-wins engine.
+ *
+ * @param {string} source - Document text.
+ * @param {string} extension - File extension without the leading dot.
+ * @param {string} [filePath] - Path of the file, when there is one.
+ * @returns {*} The first element of the array returned by
+ *   `extractDocumentFields` (the detected type).
+ */
+function firstMatchType(source, extension, filePath) {
+  const legacyResult = extractDocumentFields(source, extension, filePath);
+  return legacyResult[0];
+}
+/**
+ * Classifies a document with the best-match scoring engine, using the local
+ * seed matchers.
+ *
+ * @param {string} source - Document text.
+ * @param {string} extension - File extension without the leading dot.
+ * @param {string} [filePath] - Path of the file, when there is one.
+ * @returns {*} Whatever `classifyDocument` returns; callers in this script
+ *   read its `detectedType` property.
+ */
+function bestMatchResult(source, extension, filePath) {
+  const input = { source, extension, filePath };
+  return classifyDocument(scoringMatchers, input);
+}
+/**
+ * Formats the leading candidates produced by `scoreAll` for display.
+ *
+ * NOTE(review): the caller labels the output "por score", so `scoreAll`
+ * presumably returns candidates already ordered by score — confirm in
+ * scoring-engine.js.
+ *
+ * @param {string} source - Document text.
+ * @param {string} extension - File extension without the leading dot.
+ * @param {string} [filePath] - Path of the file; only its basename is passed
+ *   on (as `fileName`), or an empty string when no path is given.
+ * @param {number} [n=3] - How many candidates to include.
+ * @returns {string} Comma-separated `documentType:score` pairs.
+ */
+function topCandidates(source, extension, filePath, n = 3) {
+  const fileName = filePath ? path.basename(filePath) : '';
+  const candidates = scoreAll(scoringMatchers, { source, extension, fileName });
+  const labels = [];
+  for (const candidate of candidates.slice(0, n)) {
+    labels.push(`${candidate.documentType}:${candidate.score}`);
+  }
+  return labels.join(', ');
+}
+/**
+ * Builds one fixed-width line of the comparison table.
+ *
+ * @param {string} name - Sample or file label (padded to 34 columns).
+ * @param {*} first - Type reported by the first-match engine.
+ * @param {*} best - Type reported by the best-match engine.
+ * @param {?string} expected - Expected type; when falsy the "esperado"
+ *   column is left blank.
+ * @returns {string} The formatted row.
+ */
+function row(name, first, best, expected) {
+  let agreement = 'NO ';
+  if (first === best) {
+    agreement = 'sí ';
+  }
+  let verdict = '   ';
+  if (expected) {
+    verdict = best === expected ? 'ok ' : '⚠️ ';
+  }
+  const columns = [
+    name.padEnd(34),
+    `first=${String(first).padEnd(24)}`,
+    `best=${String(best).padEnd(24)}`,
+    `coinciden=${agreement}`,
+    `esperado=${verdict}`,
+  ];
+  return columns.join(' ');
+}
+/**
+ * Entry point. Prints a first-match vs best-match comparison row for every
+ * built-in sample, then an order-independence check for the inter-agencia
+ * sample, then — when a folder is given as the first CLI argument — one row
+ * per .pdf/.xml/.txt file found under it, and finally a summary line.
+ *
+ * @returns {Promise<void>}
+ */
+async function run() {
+  const folder = process.argv[2];
+  // Counters cover both the synthetic samples and any real files.
+  let total = 0;
+  let disagreements = 0;
+  console.log('\n=== Muestras sintéticas ===');
+  for (const s of SAMPLES) {
+    const first = firstMatchType(s.text, s.extension, s.filePath);
+    const best = bestMatchResult(s.text, s.extension, s.filePath).detectedType;
+    total++;
+    if (first !== best) disagreements++;
+    console.log(row(s.name, first, best, s.expected));
+  }
+  // Order-independence demonstration for the inter-agencia/comercial case.
+  // The same sample is classified against the seed in its original order and
+  // in reversed order; no filePath is passed for either call.
+  const ia = SAMPLES.find((s) => s.name.startsWith('inter-agencia'));
+  const reversed = [...scoringMatchers].reverse();
+  const normalWinner = classifyDocument(scoringMatchers, {
+    source: ia.text,
+    extension: ia.extension,
+  }).detectedType;
+  const reversedWinner = classifyDocument(reversed, {
+    source: ia.text,
+    extension: ia.extension,
+  }).detectedType;
+  console.log('\n=== Independencia de orden (inter-agencia) ===');
+  console.log(`candidatos (por score): ${topCandidates(ia.text, ia.extension)}`);
+  console.log(`seed normal   -> ${normalWinner}`);
+  console.log(`seed invertido-> ${reversedWinner}`);
+  console.log(
+    `order-independent: ${normalWinner === reversedWinner ? 'sí ✅' : 'NO ❌'}`,
+  );
+  // Optional: real files from a folder.
+  if (folder) {
+    if (!fs.existsSync(folder)) {
+      // A missing folder is reported but does not abort; the summary below
+      // still prints for the synthetic samples.
+      console.error(`\nCarpeta no existe: ${folder}`);
+    } else {
+      console.log(`\n=== Archivos reales (${folder}) ===`);
+      const detection = new FileDetectionService();
+      const files = walk(folder).filter((f) =>
+        ['.pdf', '.xml', '.txt'].includes(path.extname(f).toLowerCase()),
+      );
+      for (const file of files) {
+        const ext = path.extname(file).toLowerCase().replace('.', '');
+        let text = '';
+        try {
+          // PDFs go through the detection service's text extraction; other
+          // files are read directly as UTF-8.
+          text =
+            ext === 'pdf'
+              ? await detection.extractTextFromPDF(file)
+              : fs.readFileSync(file, 'utf8');
+        } catch (err) {
+          // Report the unreadable file and skip it; it is not counted in
+          // `total`.
+          console.log(`${path.basename(file).padEnd(34)} ERROR: ${err.message}`);
+          continue;
+        }
+        const first = firstMatchType(text, ext, file);
+        const best = bestMatchResult(text, ext, file).detectedType;
+        total++;
+        if (first !== best) disagreements++;
+        // No expected type is known for real files, so that column is blank.
+        console.log(row(path.basename(file), first, best, null));
+      }
+    }
+  }
+  console.log(
+    `\n=== Resumen: ${total} documentos, ${disagreements} divergencias first-vs-best ===\n`,
+  );
+}
+/**
+ * Recursively lists every non-directory entry under `dir`.
+ *
+ * @param {string} dir - Directory to traverse.
+ * @returns {string[]} Paths (joined onto `dir`) in depth-first order, following
+ *   the order `fs.readdirSync` reports entries.
+ */
+function walk(dir) {
+  return fs.readdirSync(dir, { withFileTypes: true }).flatMap((entry) => {
+    const full = path.join(dir, entry.name);
+    return entry.isDirectory() ? walk(full) : [full];
+  });
+}
+// Surface any failure from the async entry point and exit non-zero instead
+// of leaving the returned promise unhandled.
+run().catch((err) => {
+  console.error(err);
+  process.exitCode = 1;
+});

package/scripts/scoring-phase4-check.js ADDED Viewed

@@ -0,0 +1,96 @@
+/**
+ * Phase 4 validation: runs the REAL runtime path the uploader now uses
+ * (DB-shape matchers -> adaptDbMatchers -> classifyDocument with rich extraction)
+ * against a corpus and compares it to the legacy engine (extractDocumentFields).
+ *
+ * Usage: node scripts/scoring-phase4-check.js <folder>
+ */
+import fs from 'fs';
+import path from 'path';
+import { extractDocumentFields } from '../src/document-type-shared.js';
+import FileDetectionService from '../src/file-detection.js';
+import { adaptDbMatchers } from '../src/scoring/db-matcher-adapter.js';
+import { scoringMatchers } from '../src/scoring/matchers-seed.js';
+import { classifyDocument } from '../src/scoring/scoring-engine.js';
+/**
+ * Serializes the local seed matchers into plain objects shaped like the rows
+ * the API `/resolved` endpoint returns, so the adapter is exercised exactly as
+ * in production.
+ *
+ * RegExp clue patterns are split into `pattern` (source) and `flags`; string
+ * patterns are passed through with the clue's own `flags` (or ''). Missing
+ * optional values get defaults: `minScore`/`qualify`/`group` -> null,
+ * `priority` -> 0, `weight` -> 1, `required`/`negative` -> boolean.
+ *
+ * @param {Array<object>} matchers - Seed matcher definitions.
+ * @returns {Array<object>} One serializable object per matcher.
+ */
+function toDbShape(matchers) {
+  const serializeClue = (clue) => {
+    const isRegex = clue.pattern instanceof RegExp;
+    return {
+      kind: clue.kind,
+      pattern: isRegex ? clue.pattern.source : clue.pattern,
+      flags: isRegex ? clue.pattern.flags : clue.flags || '',
+      weight: clue.weight ?? 1,
+      group: clue.group ?? null,
+      required: Boolean(clue.required),
+      negative: Boolean(clue.negative),
+    };
+  };
+  return matchers.map((matcher) => {
+    const { documentType, extensions, minScore, priority, qualify, clues } =
+      matcher;
+    return {
+      documentType,
+      extensions,
+      minScore: minScore ?? null,
+      priority: priority ?? 0,
+      qualify: qualify ?? null,
+      clues: (clues || []).map((clue) => serializeClue(clue)),
+      // Rich extraction comes from the registry by documentType.
+      fieldExtractors: [],
+    };
+  });
+}
+// Seed matchers serialized to the DB shape and passed back through the
+// adapter; this is the matcher set `run` classifies every file with.
+const adapted = adaptDbMatchers(toDbShape(scoringMatchers));
+/**
+ * Recursively collects the paths of all non-directory entries under `dir`.
+ *
+ * @param {string} dir - Directory to traverse.
+ * @returns {string[]} Paths (joined onto `dir`), depth-first, in the order
+ *   `fs.readdirSync` reports entries.
+ */
+function walk(dir) {
+  const found = [];
+  const entries = fs.readdirSync(dir, { withFileTypes: true });
+  entries.forEach((entry) => {
+    const target = path.join(dir, entry.name);
+    if (!entry.isDirectory()) {
+      found.push(target);
+      return;
+    }
+    for (const nested of walk(target)) {
+      found.push(nested);
+    }
+  });
+  return found;
+}
+/**
+ * Entry point. Classifies every .pdf/.xml/.txt file under the folder given as
+ * the first CLI argument with both the legacy engine (`extractDocumentFields`)
+ * and the adapted matchers (`classifyDocument`), prints one line per
+ * divergence, and finishes with a summary grouped by "legacy -> phase4" pair.
+ *
+ * Exits with code 1 when no folder is given or the folder does not exist.
+ * Files whose text cannot be read are reported, counted as skipped, and left
+ * out of the comparison totals.
+ *
+ * @returns {Promise<void>}
+ */
+async function run() {
+  const folder = process.argv[2];
+  if (!folder) {
+    console.error('Pass a folder: node scripts/scoring-phase4-check.js <folder>');
+    process.exit(1);
+  }
+  // Fail with a readable message instead of an ENOENT stack from walk().
+  if (!fs.existsSync(folder)) {
+    console.error(`Carpeta no existe: ${folder}`);
+    process.exit(1);
+  }
+  const detection = new FileDetectionService();
+  const files = walk(folder).filter((f) =>
+    ['.pdf', '.xml', '.txt'].includes(path.extname(f).toLowerCase()),
+  );
+  let total = 0;
+  let diverge = 0;
+  let skipped = 0;
+  const patterns = {};
+  for (const file of files) {
+    const ext = path.extname(file).toLowerCase().replace('.', '');
+    let text = '';
+    try {
+      // PDFs go through the detection service's text extraction; other files
+      // are read directly as UTF-8.
+      text =
+        ext === 'pdf'
+          ? await detection.extractTextFromPDF(file)
+          : fs.readFileSync(file, 'utf8');
+    } catch (err) {
+      // Keep going, but make the skip visible so the totals are not
+      // misread as covering the whole corpus.
+      skipped++;
+      console.log(`ERR ${path.basename(file).padEnd(40)} ${err.message}`);
+      continue;
+    }
+    const legacy = extractDocumentFields(text, ext, file)[0];
+    const phase4 = classifyDocument(adapted, {
+      source: text,
+      extension: ext,
+      filePath: file,
+    }).detectedType;
+    total++;
+    if (legacy !== phase4) {
+      diverge++;
+      // Tally divergences by "legacy -> phase4" pair for the summary.
+      const key = `${legacy} -> ${phase4}`;
+      patterns[key] = (patterns[key] || 0) + 1;
+      console.log(`NO  ${path.basename(file).padEnd(40)} ${key}`);
+    }
+  }
+  console.log(`\n=== Fase 4 vs legacy: ${total} docs, ${diverge} divergencias ===`);
+  if (skipped > 0) {
+    console.log(`  ${skipped} archivos omitidos por error de lectura`);
+  }
+  for (const [k, n] of Object.entries(patterns)) console.log(`  ${n}×  ${k}`);
+}
+// Surface any failure from the async entry point and exit non-zero instead
+// of leaving the returned promise unhandled.
+run().catch((err) => {
+  console.error(err);
+  process.exitCode = 1;
+});

package/src/commands/IdentifyCommand.js CHANGED Viewed

@@ -8,11 +8,11 @@ import appConfig from '../config/config.js';
 import ErrorHandler from '../errors/ErrorHandler.js';
 import { ConfigurationError } from '../errors/ErrorTypes.js';
 import FileDetectionService from '../file-detection.js';
+import { adaptDbMatchers } from '../scoring/db-matcher-adapter.js';
+import { scoringMatchers } from '../scoring/matchers-seed.js';
 /**
- * Paid pedimento detected_type values. `pedimento_completo_xml` is included
- * even though the XML matcher is currently disabled in the registry so that
- * re-enabling it requires no changes here.
+ * Paid pedimento detected_type values.
  */
 const DETECTED_PEDIMENTO_TYPES = new Set([
   'pedimento_simplificado',
@@ -71,6 +71,32 @@ export class IdentifyCommand {
       );
       this.scanApiService = new ScanApiService(apiTarget);
+      // Load matchers for best-match classification (phase 4 hybrid). Prefer the
+      // DB-resolved set (this RFC + globals); fall back to the validated local
+      // seed; set DISABLE_SCORING_MATCHERS=true to force legacy first-match.
+      if (process.env.DISABLE_SCORING_MATCHERS === 'true') {
+        logger.info('🧩 Scoring matchers disabled — legacy detection');
+      } else {
+        let matchers = null;
+        try {
+          const rfc = process.env.MATCHER_RFC || null;
+          const dbMatchers = await this.scanApiService.getResolvedMatchers(rfc);
+          if (dbMatchers.length) {
+            matchers = adaptDbMatchers(dbMatchers);
+            logger.info(`🧩 Loaded ${matchers.length} matchers from API`);
+          }
+        } catch (err) {
+          logger.warn(`🧩 Could not load matchers from API: ${err.message}`);
+        }
+        if (!matchers) {
+          matchers = scoringMatchers;
+          logger.info(`🧩 Using local seed matchers (${matchers.length})`);
+        }
+        if (typeof this.detectionService.setMatchers === 'function') {
+          this.detectionService.setMatchers(matchers);
+        }
+      }
       const scanConfig = appConfig.getScanConfig();
       const batchSize = parseInt(options.batchSize) || 100;
@@ -543,13 +569,15 @@ export class IdentifyCommand {
     // Check if the text contains any required pedimento marker. This must
     // stay aligned with the `match()` predicates in pedimento-simplificado.js
-    // and pedimento-completo.js.
+    // and pedimento-completo.js (which accept both "DE" and "DEL" in the
+    // title, and treat the colon after "T. OPER" as optional).
     const text = result.text || '';
-    const hasSimplificadoMarker = /FORMA SIMPLIFICADA DE PEDIMENTO/i.test(text);
+    const hasSimplificadoMarker =
+      /FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i.test(text);
     const hasCompletoMarkers =
       /NUM\.?\s*PEDIMENTO:/i.test(text) &&
       /CVE\.?\s*PEDIMENTO:/i.test(text) &&
-      /T\.?\s*OPER:/i.test(text);
+      /T\.?\s*OPER:?/i.test(text);
     return !hasSimplificadoMarker && !hasCompletoMarkers;
   }

package/src/commands/ScanCommand.js CHANGED Viewed

@@ -579,6 +579,9 @@ export class ScanCommand {
    * Normalize file record for database insertion
    * Stores paths with forward slashes for consistency but keeps them absolute
    * Sets likelySimplificado to true if file is a PDF and filename contains 'simp'
+   * Sets likelyInterAgencia to true if filename matches an inter-agency CFDI
+   * pattern (e.g. SICINGR*), so the API forces these XML/PDF through detection
+   * even though they lack the 'simp/pedim/covefact' heuristic.
    * @private
    */
   #normalizeFileRecord(filePath, fileStats, basePath, scanTimestamp) {
@@ -600,6 +603,17 @@ export class ScanCommand {
     const likelySimplificado =
       fileExtension === 'pdf' && /(simp|pedim|covefact)/i.test(fileName);
+    // Flag inter-agency CFDIs by filename so detection picks them up.
+    // Patterns are configurable via SCAN_INTER_AGENCIA_PATTERNS env var
+    // (see config.js). Only meaningful for PDF and XML.
+    let likelyInterAgencia = false;
+    if (fileExtension === 'pdf' || fileExtension === 'xml') {
+      const patterns = appConfig.scan.interAgenciaPatterns;
+      if (patterns && patterns.length > 0) {
+        likelyInterAgencia = patterns.some((re) => re.test(fileName));
+      }
+    }
     return {
       fileName,
       fileExtension,
@@ -610,6 +624,7 @@ export class ScanCommand {
       modifiedAt: fileStats.mtime.toISOString(),
       scanTimestamp,
       likelySimplificado,
+      likelyInterAgencia,
     };
   }

package/src/config/config.js CHANGED Viewed

@@ -37,10 +37,10 @@ class Config {
       const __dirname = path.dirname(__filename);
       const packageJsonPath = path.resolve(__dirname, '../../package.json');
       const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf-8'));
-      return packageJson.version || '1.0.22';
+      return packageJson.version || '1.0.24';
     } catch (error) {
       console.warn('⚠️ Could not read package.json version, using fallback');
-      return '1.0.22';
+      return '1.0.24';
     }
   }
@@ -294,6 +294,31 @@ class Config {
       .map((p) => p.trim())
       .filter(Boolean);
+    // Parse inter-agency CFDI filename patterns. Files whose basename matches
+    // any of these regex patterns are flagged at scan time (likelyInterAgencia)
+    // so the API forces them through detection and the factura_inter_agencia
+    // matcher can classify them. The push pipeline then excludes them (see
+    // NON_PUSHABLE_TYPES_SQL in arela-api). Comma-separated regex source list.
+    // Default: ^SICINGR — covers NORCOM's SICINGR70-NNNNNN(...).pdf/.XML files.
+    const defaultInterAgenciaPatterns = '^SICINGR';
+    const interAgenciaPatterns = (
+      process.env.SCAN_INTER_AGENCIA_PATTERNS || defaultInterAgenciaPatterns
+    )
+      .split(',')
+      .map((p) => p.trim())
+      .filter(Boolean)
+      .map((p) => {
+        try {
+          return new RegExp(p, 'i');
+        } catch (err) {
+          console.warn(
+            `⚠️ Invalid SCAN_INTER_AGENCIA_PATTERNS regex "${p}": ${err.message}`,
+          );
+          return null;
+        }
+      })
+      .filter(Boolean);
     // Generate table name if all components are available
     // Note: This is just for reference; actual table names are generated dynamically
     // in ScanCommand based on discovered directories and levels
@@ -312,6 +337,7 @@ class Config {
       basePathFull: basePathLabel, // Renamed for consistency
       tableName,
       excludePatterns,
+      interAgenciaPatterns,
       batchSize: parseInt(process.env.SCAN_BATCH_SIZE) || 2000,
       directoryLevel: parseInt(process.env.SCAN_DIRECTORY_LEVEL) || 0,
     };

package/src/document-type-shared.js CHANGED Viewed

@@ -1,10 +1,10 @@
 // Import all document type definitions
 import { dodaPdfDefinition } from './document-types/doda-pdf.js';
 import { dodaXmlDefinition } from './document-types/doda-xml.js';
+import { facturaInterAgenciaDefinition } from './document-types/factura-inter-agencia.js';
 import { facturasComerciales } from './document-types/facturas-comerciales.js';
+import { pedimentoCompletoXmlDefinition } from './document-types/pedimento-completo-xml.js';
 import { pedimentoCompletoDefinition } from './document-types/pedimento-completo.js';
-// TODO: enable XML pedimento detection — implementation ready in pedimento-completo-xml.js
-// import { pedimentoCompletoXmlDefinition } from './document-types/pedimento-completo-xml.js';
 import { pedimentoSimplificadoDefinition } from './document-types/pedimento-simplificado.js';
 import { proformaDefinition } from './document-types/proforma.js';
 import { supportDocumentDefinition } from './document-types/support-document.js';
@@ -45,14 +45,14 @@ export class DocumentTypeDefinition {
 const documentTypes = [
   pedimentoSimplificadoDefinition,
   pedimentoCompletoDefinition,
-  // TODO: enable XML pedimento detection — uncomment the next line and the
-  // matching import at the top of this file. All downstream code
-  // (composeArelaPath, arela-api SQL filters, IdentifyCommand counters)
-  // already accepts `pedimento_completo_xml`.
-  // pedimentoCompletoXmlDefinition,
+  pedimentoCompletoXmlDefinition,
   supportDocumentDefinition,
   dodaPdfDefinition,
   dodaXmlDefinition,
+  // factura_inter_agencia MUST be evaluated BEFORE facturasComerciales
+  // because a NORCOM↔PALCO CFDI would also match the generic commercial
+  // invoice matcher. First match wins (see extractDocumentFields).
+  facturaInterAgenciaDefinition,
   facturasComerciales,
   // Add more document types here as needed
 ];
@@ -114,6 +114,14 @@ export function extractDocumentFields(source, fileExtension, filePath) {
         ? docType.extractPedimentoYear(source, fields, filePath)
         : null;
+      // Ensure downstream code (composeArelaPath) sees `numPedimento` as a
+      // field. PDF matchers add it via an explicit extractor; XML matchers
+      // compose it externally via extractNumPedimento. Backfill so both paths
+      // expose the same shape.
+      if (pedimento && !fields.some((f) => f.name === 'numPedimento')) {
+        fields.push(new FieldResult('numPedimento', true, pedimento));
+      }
       return [resolvedType, fields, pedimento, year];
     }
   }

package/src/document-types/_pedimento-shared-extractors.js CHANGED Viewed

@@ -186,15 +186,34 @@ export const paymentDateExtractor = {
   field: 'paymentDate',
   extract: (source) => {
     const patterns = [
-      /FECHA\s+DE\s+PAGO:?\s*(\d{2}\/\d{2}\/\d{4})/i,
-      /FECHA\s+DE\s+PAGO:?\s*(\d{4}\/\d{2}\/\d{2})/i,
-      /2\s+PAGO:\s*(\d{2}\/\d{2}\/\d{4})/,
-      /(?:^|\n)\s*PAGO\s+(\d{2}\/\d{2}\/\d{4})/i,
-      /PRESENTACION:\s*(\d{2}\/\d{2}\/\d{4})/i,
+      /FECHA\s+DE\s+PAGO:?\s*(\d{2}\/\d{2}\/\d{4})/i, // 0: explicit label DD/MM/YYYY
+      /FECHA\s+DE\s+PAGO:?\s*(\d{4}\/\d{2}\/\d{2})/i, // 1: explicit label YYYY/MM/DD
+      /2\s+PAGO:\s*(\d{2}\/\d{2}\/\d{4})/, // 2: forma simplificada scheduled date ⚠️
+      /(?:^|\n)\s*PAGO\s+(\d{2}\/\d{2}\/\d{4})/i, // 3: PAGO at line start (original)
+      /(?<=\d)PAGO\s+(\d{2}\/\d{2}\/\d{4})/i, // 4: PAGO after digit (pdf-parse artifact)
+      /(\d{2}\/\d{2}\/\d{4})[ \t]+PAGO[ \t]*$/im, // 5: reversed layout — date before PAGO (FECHAS column)
+      // 6: forma simplificada — pdf-parse extracts table cells out of order, so the
+      // label "FECHA DE PAGO:" can appear on its own line and the value (along with
+      // other cells like línea de captura, pedimento, importe) follows several lines
+      // later. Take the FIRST dd/mm/yyyy after the label within a 400-char window.
+      // Safe because `isNoPagado` short-circuits documents without a real payment,
+      // so we won't grab the unrelated ENTRADA date from the "FECHAS:" block above.
+      /FECHA\s+DE\s+PAGO:[\s\S]{1,400}?(\d{2}\/\d{2}\/\d{4})/i,
+      /PRESENTACION:\s*(\d{2}\/\d{2}\/\d{4})/i, // 7: fallback
     ];
-    for (const re of patterns) {
-      const m = source.match(re);
-      if (m) return new FieldResult('paymentDate', true, m[1]);
+    // "*** NO PAGADO" is the explicit SAT marker that no payment has been
+    // certified. When present, the bank-certification block is physically
+    // absent, so any date matched by the fallback patterns (e.g.
+    // "2 PAGO:" with a scheduled date, or "PRESENTACION:") would be a false
+    // positive. Return null outright — the document is classified as proforma.
+    const isNoPagado = /\*{3}\s*NO\s+PAGADO/i.test(source);
+    if (isNoPagado) {
+      return new FieldResult('paymentDate', false, null);
+    }
+    for (const pattern of patterns) {
+      const m = source.match(pattern);
+      if (!m) continue;
+      return new FieldResult('paymentDate', true, m[1]);
     }
     return new FieldResult('paymentDate', false, null);
   },