@arela/uploader 1.0.22 → 1.0.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@arela/uploader",
3
- "version": "1.0.22",
3
+ "version": "1.0.23",
4
4
  "description": "CLI to upload files/directories to Arela",
5
5
  "bin": {
6
6
  "arela": "./src/index.js"
@@ -10,9 +10,7 @@ import { ConfigurationError } from '../errors/ErrorTypes.js';
10
10
  import FileDetectionService from '../file-detection.js';
11
11
 
12
12
  /**
13
- * Paid pedimento detected_type values. `pedimento_completo_xml` is included
14
- * even though the XML matcher is currently disabled in the registry so that
15
- * re-enabling it requires no changes here.
13
+ * Paid pedimento detected_type values.
16
14
  */
17
15
  const DETECTED_PEDIMENTO_TYPES = new Set([
18
16
  'pedimento_simplificado',
@@ -543,13 +541,15 @@ export class IdentifyCommand {
543
541
 
544
542
  // Check if the text contains any required pedimento marker. This must
545
543
  // stay aligned with the `match()` predicates in pedimento-simplificado.js
546
- // and pedimento-completo.js.
544
+ // and pedimento-completo.js (which accept both "DE" and "DEL" in the
545
+ // title, and treat the colon after "T. OPER" as optional).
547
546
  const text = result.text || '';
548
- const hasSimplificadoMarker = /FORMA SIMPLIFICADA DE PEDIMENTO/i.test(text);
547
+ const hasSimplificadoMarker =
548
+ /FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i.test(text);
549
549
  const hasCompletoMarkers =
550
550
  /NUM\.?\s*PEDIMENTO:/i.test(text) &&
551
551
  /CVE\.?\s*PEDIMENTO:/i.test(text) &&
552
- /T\.?\s*OPER:/i.test(text);
552
+ /T\.?\s*OPER:?/i.test(text);
553
553
 
554
554
  return !hasSimplificadoMarker && !hasCompletoMarkers;
555
555
  }
@@ -579,6 +579,9 @@ export class ScanCommand {
579
579
  * Normalize file record for database insertion
580
580
  * Stores paths with forward slashes for consistency but keeps them absolute
581
581
  * Sets likelySimplificado to true if file is a PDF and filename contains 'simp', 'pedim' or 'covefact'
582
+ * Sets likelyInterAgencia to true if filename matches an inter-agency CFDI
583
+ * pattern (e.g. SICINGR*), so the API forces these XML/PDF through detection
584
+ * even though they lack the 'simp/pedim/covefact' heuristic.
582
585
  * @private
583
586
  */
584
587
  #normalizeFileRecord(filePath, fileStats, basePath, scanTimestamp) {
@@ -600,6 +603,17 @@ export class ScanCommand {
600
603
  const likelySimplificado =
601
604
  fileExtension === 'pdf' && /(simp|pedim|covefact)/i.test(fileName);
602
605
 
606
+ // Flag inter-agency CFDIs by filename so detection picks them up.
607
+ // Patterns are configurable via SCAN_INTER_AGENCIA_PATTERNS env var
608
+ // (see config.js). Only meaningful for PDF and XML.
609
+ let likelyInterAgencia = false;
610
+ if (fileExtension === 'pdf' || fileExtension === 'xml') {
611
+ const patterns = appConfig.scan.interAgenciaPatterns;
612
+ if (patterns && patterns.length > 0) {
613
+ likelyInterAgencia = patterns.some((re) => re.test(fileName));
614
+ }
615
+ }
616
+
603
617
  return {
604
618
  fileName,
605
619
  fileExtension,
@@ -610,6 +624,7 @@ export class ScanCommand {
610
624
  modifiedAt: fileStats.mtime.toISOString(),
611
625
  scanTimestamp,
612
626
  likelySimplificado,
627
+ likelyInterAgencia,
613
628
  };
614
629
  }
615
630
 
@@ -37,10 +37,10 @@ class Config {
37
37
  const __dirname = path.dirname(__filename);
38
38
  const packageJsonPath = path.resolve(__dirname, '../../package.json');
39
39
  const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf-8'));
40
- return packageJson.version || '1.0.22';
40
+ return packageJson.version || '1.0.23';
41
41
  } catch (error) {
42
42
  console.warn('⚠️ Could not read package.json version, using fallback');
43
- return '1.0.22';
43
+ return '1.0.23';
44
44
  }
45
45
  }
46
46
 
@@ -294,6 +294,31 @@ class Config {
294
294
  .map((p) => p.trim())
295
295
  .filter(Boolean);
296
296
 
297
+ // Parse inter-agency CFDI filename patterns. Files whose basename matches
298
+ // any of these regex patterns are flagged at scan time (likelyInterAgencia)
299
+ // so the API forces them through detection and the factura_inter_agencia
300
+ // matcher can classify them. The push pipeline then excludes them (see
301
+ // NON_PUSHABLE_TYPES_SQL in arela-api). Comma-separated regex source list.
302
+ // Default: ^SICINGR — covers NORCOM's SICINGR70-NNNNNN(...).pdf/.XML files.
303
+ const defaultInterAgenciaPatterns = '^SICINGR';
304
+ const interAgenciaPatterns = (
305
+ process.env.SCAN_INTER_AGENCIA_PATTERNS || defaultInterAgenciaPatterns
306
+ )
307
+ .split(',')
308
+ .map((p) => p.trim())
309
+ .filter(Boolean)
310
+ .map((p) => {
311
+ try {
312
+ return new RegExp(p, 'i');
313
+ } catch (err) {
314
+ console.warn(
315
+ `⚠️ Invalid SCAN_INTER_AGENCIA_PATTERNS regex "${p}": ${err.message}`,
316
+ );
317
+ return null;
318
+ }
319
+ })
320
+ .filter(Boolean);
321
+
297
322
  // Generate table name if all components are available
298
323
  // Note: This is just for reference; actual table names are generated dynamically
299
324
  // in ScanCommand based on discovered directories and levels
@@ -312,6 +337,7 @@ class Config {
312
337
  basePathFull: basePathLabel, // Renamed for consistency
313
338
  tableName,
314
339
  excludePatterns,
340
+ interAgenciaPatterns,
315
341
  batchSize: parseInt(process.env.SCAN_BATCH_SIZE) || 2000,
316
342
  directoryLevel: parseInt(process.env.SCAN_DIRECTORY_LEVEL) || 0,
317
343
  };
@@ -1,10 +1,10 @@
1
1
  // Import all document type definitions
2
2
  import { dodaPdfDefinition } from './document-types/doda-pdf.js';
3
3
  import { dodaXmlDefinition } from './document-types/doda-xml.js';
4
+ import { facturaInterAgenciaDefinition } from './document-types/factura-inter-agencia.js';
4
5
  import { facturasComerciales } from './document-types/facturas-comerciales.js';
6
+ import { pedimentoCompletoXmlDefinition } from './document-types/pedimento-completo-xml.js';
5
7
  import { pedimentoCompletoDefinition } from './document-types/pedimento-completo.js';
6
- // TODO: enable XML pedimento detection — implementation ready in pedimento-completo-xml.js
7
- // import { pedimentoCompletoXmlDefinition } from './document-types/pedimento-completo-xml.js';
8
8
  import { pedimentoSimplificadoDefinition } from './document-types/pedimento-simplificado.js';
9
9
  import { proformaDefinition } from './document-types/proforma.js';
10
10
  import { supportDocumentDefinition } from './document-types/support-document.js';
@@ -45,14 +45,14 @@ export class DocumentTypeDefinition {
45
45
  const documentTypes = [
46
46
  pedimentoSimplificadoDefinition,
47
47
  pedimentoCompletoDefinition,
48
- // TODO: enable XML pedimento detection — uncomment the next line and the
49
- // matching import at the top of this file. All downstream code
50
- // (composeArelaPath, arela-api SQL filters, IdentifyCommand counters)
51
- // already accepts `pedimento_completo_xml`.
52
- // pedimentoCompletoXmlDefinition,
48
+ pedimentoCompletoXmlDefinition,
53
49
  supportDocumentDefinition,
54
50
  dodaPdfDefinition,
55
51
  dodaXmlDefinition,
52
+ // factura_inter_agencia MUST be evaluated BEFORE facturasComerciales
53
+ // because a NORCOM↔PALCO CFDI would also match the generic commercial
54
+ // invoice matcher. First match wins (see extractDocumentFields).
55
+ facturaInterAgenciaDefinition,
56
56
  facturasComerciales,
57
57
  // Add more document types here as needed
58
58
  ];
@@ -114,6 +114,14 @@ export function extractDocumentFields(source, fileExtension, filePath) {
114
114
  ? docType.extractPedimentoYear(source, fields, filePath)
115
115
  : null;
116
116
 
117
+ // Ensure downstream code (composeArelaPath) sees `numPedimento` as a
118
+ // field. PDF matchers add it via an explicit extractor; XML matchers
119
+ // compose it externally via extractNumPedimento. Backfill so both paths
120
+ // expose the same shape.
121
+ if (pedimento && !fields.some((f) => f.name === 'numPedimento')) {
122
+ fields.push(new FieldResult('numPedimento', true, pedimento));
123
+ }
124
+
117
125
  return [resolvedType, fields, pedimento, year];
118
126
  }
119
127
  }
@@ -186,15 +186,34 @@ export const paymentDateExtractor = {
186
186
  field: 'paymentDate',
187
187
  extract: (source) => {
188
188
  const patterns = [
189
- /FECHA\s+DE\s+PAGO:?\s*(\d{2}\/\d{2}\/\d{4})/i,
190
- /FECHA\s+DE\s+PAGO:?\s*(\d{4}\/\d{2}\/\d{2})/i,
191
- /2\s+PAGO:\s*(\d{2}\/\d{2}\/\d{4})/,
192
- /(?:^|\n)\s*PAGO\s+(\d{2}\/\d{2}\/\d{4})/i,
193
- /PRESENTACION:\s*(\d{2}\/\d{2}\/\d{4})/i,
189
+ /FECHA\s+DE\s+PAGO:?\s*(\d{2}\/\d{2}\/\d{4})/i, // 0: explicit label DD/MM/YYYY
190
+ /FECHA\s+DE\s+PAGO:?\s*(\d{4}\/\d{2}\/\d{2})/i, // 1: explicit label YYYY/MM/DD
191
+ /2\s+PAGO:\s*(\d{2}\/\d{2}\/\d{4})/, // 2: forma simplificada scheduled date ⚠️
192
+ /(?:^|\n)\s*PAGO\s+(\d{2}\/\d{2}\/\d{4})/i, // 3: PAGO at line start (original)
193
+ /(?<=\d)PAGO\s+(\d{2}\/\d{2}\/\d{4})/i, // 4: PAGO after digit (pdf-parse artifact)
194
+ /(\d{2}\/\d{2}\/\d{4})[ \t]+PAGO[ \t]*$/im, // 5: reversed layout — date before PAGO (FECHAS column)
195
+ // 6: forma simplificada — pdf-parse extracts table cells out of order, so the
196
+ // label "FECHA DE PAGO:" can appear on its own line and the value (along with
197
+ // other cells like línea de captura, pedimento, importe) follows several lines
198
+ // later. Take the FIRST dd/mm/yyyy after the label within a 400-char window.
199
+ // Safe because `isNoPagado` short-circuits documents without a real payment,
200
+ // so we won't grab the unrelated ENTRADA date from the "FECHAS:" block above.
201
+ /FECHA\s+DE\s+PAGO:[\s\S]{1,400}?(\d{2}\/\d{2}\/\d{4})/i,
202
+ /PRESENTACION:\s*(\d{2}\/\d{2}\/\d{4})/i, // 7: fallback
194
203
  ];
195
- for (const re of patterns) {
196
- const m = source.match(re);
197
- if (m) return new FieldResult('paymentDate', true, m[1]);
204
+ // "*** NO PAGADO" is the explicit SAT marker that no payment has been
205
+ // certified. When present, the bank-certification block is physically
206
+ // absent, so any date matched by the fallback patterns (e.g.
207
+ // "2 PAGO:" with a scheduled date, or "PRESENTACION:") would be a false
208
+ // positive. Return null outright — the document is classified as proforma.
209
+ const isNoPagado = /\*{3}\s*NO\s+PAGADO/i.test(source);
210
+ if (isNoPagado) {
211
+ return new FieldResult('paymentDate', false, null);
212
+ }
213
+ for (const pattern of patterns) {
214
+ const m = source.match(pattern);
215
+ if (!m) continue;
216
+ return new FieldResult('paymentDate', true, m[1]);
198
217
  }
199
218
  return new FieldResult('paymentDate', false, null);
200
219
  },
@@ -0,0 +1,186 @@
1
+ // NOTE: We intentionally do NOT import `FieldResult` from
2
+ // '../document-type-shared.js' to avoid a circular-import TDZ when this
3
+ // module is imported directly (e.g. from unit tests). `FieldResult` is a
4
+ // plain data-class with shape `{ name, found, value }`, so we construct
5
+ // equivalent plain objects locally.
6
+ const fieldResult = (name, found, value) => ({ name, found, value });
7
+
8
+ /**
9
+ * Factura Inter-Agencia Document Type Definition
10
+ *
11
+ * Detects CFDIs (XML or PDF) issued between customs broker agencies (e.g.,
12
+ * NORCOM ↔ PALCO). These files are dropped into a pedimento folder by the
13
+ * broker but they are NOT part of the customs electronic file (expediente
14
+ * aduanal) — they are inter-agency billing for broker services.
15
+ *
16
+ * Detection rules (ALL required):
17
+ * 1) CFDI markers present (either xml structure or PDF text representation)
18
+ * 2) Both emisor and receptor RFCs belong to the configured agency pair
19
+ * (NAA120215F20 = NORCOM, PCC1008161WA = PALCO) in any direction.
20
+ * 3) At least one concepto with ClaveProdServ 78141502 (Servicios de
21
+ * agentes aduaneros) — confirms the billing is for broker services.
22
+ *
23
+ * IMPORTANT: This matcher MUST be registered BEFORE `facturasComerciales`
24
+ * in document-type-shared.js — both would match a CFDI in a pedimento
25
+ * folder, but inter-agency invoices must take precedence so they are
26
+ * filtered out of the Arela push pipeline (see arela-api
27
+ * NON_PUSHABLE_TYPES_SQL).
28
+ *
29
+ * Currently scope-limited to NORCOM↔PALCO. To widen, move INTER_AGENCIA_RFCS
30
+ * to env config and require ≥2 distinct RFCs from the configured list.
31
+ */
32
+
33
+ /**
34
+ * RFCs of agencies whose mutual invoices should be excluded from the Arela
35
+ * push pipeline. Order is irrelevant — a match is any pair of distinct RFCs
36
+ * from this set appearing as emisor and receptor.
37
+ */
38
+ export const INTER_AGENCIA_RFCS = ['NAA120215F20', 'PCC1008161WA'];
39
+
40
+ const BROKER_SERVICE_CLAVE_PROD_SERV = '78141502';
41
+
42
+ const CFDI_XML_MARKERS = [
43
+ /cfdi:Comprobante/i,
44
+ /xmlns:cfdi/i,
45
+ /TipoDeComprobante/i,
46
+ ];
47
+
48
+ /**
49
+ * Detect that the source represents a CFDI — either as the original XML
50
+ * structure or as text extracted from a printed CFDI (PDF representation).
51
+ *
52
+ * PDF text loses XML tags, so we look for the human-readable equivalents
53
+ * commonly rendered by SAT-style invoice templates ("Folio Fiscal", "Sello
54
+ * Digital del CFDI", "Cadena Original ... Certificacion Digital del SAT").
55
+ */
56
+ function isCfdiContent(source) {
57
+ const xmlHits = CFDI_XML_MARKERS.filter((re) => re.test(source)).length;
58
+ if (xmlHits >= 2) return true;
59
+
60
+ const pdfMarkers = [
61
+ /folio\s*fiscal/i,
62
+ /sello\s*digital\s*del\s*cfdi/i,
63
+ /cadena\s*original.*certificaci[oó]n\s*digital\s*del\s*sat/i,
64
+ /representaci[oó]n\s*impresa\s*de\s*un\s*cfdi/i,
65
+ ];
66
+ return pdfMarkers.filter((re) => re.test(source)).length >= 2;
67
+ }
68
+
69
+ /**
70
+ * Return the subset of INTER_AGENCIA_RFCS that appear in `source`. Matching is
71
+ * case-insensitive and uses word boundaries so substrings inside larger tokens
72
+ * (cert/sello base64) don't produce false positives.
73
+ */
74
+ function findInterAgenciaRfcs(source) {
75
+ const found = new Set();
76
+ for (const rfc of INTER_AGENCIA_RFCS) {
77
+ const re = new RegExp(`\\b${rfc}\\b`, 'i');
78
+ if (re.test(source)) found.add(rfc.toUpperCase());
79
+ }
80
+ return [...found];
81
+ }
82
+
83
+ export const facturaInterAgenciaDefinition = {
84
+ type: 'factura_inter_agencia',
85
+ extensions: ['xml', 'pdf'],
86
+
87
+ match: (source) => {
88
+ if (!isCfdiContent(source)) return false;
89
+
90
+ // Need ≥2 distinct configured RFCs present anywhere in the text (emisor/receptor roles are not checked here)
91
+ const rfcsFound = findInterAgenciaRfcs(source);
92
+ if (rfcsFound.length < 2) return false;
93
+
94
+ // Confirm the invoice is for broker services (customs agent services)
95
+ if (!source.includes(BROKER_SERVICE_CLAVE_PROD_SERV)) return false;
96
+
97
+ return true;
98
+ },
99
+
100
+ // Pedimento extraction is optional / informational — these files are
101
+ // excluded from push, so arela_path is never composed. We still extract
102
+ // a pedimento number when present (from the "Referencias" / "Pedimento:"
103
+ // section of the printable CFDI) for auditability.
104
+ extractNumPedimento: (source, fields) => {
105
+ return fields?.find((f) => f.name === 'numPedimento')?.value ?? null;
106
+ },
107
+
108
+ extractPedimentoYear: (source, fields) => {
109
+ const numPedimento = fields?.find((f) => f.name === 'numPedimento')?.value;
110
+ if (numPedimento && numPedimento.length >= 2) {
111
+ const yy = parseInt(numPedimento.substring(0, 2), 10);
112
+ if (!isNaN(yy)) return yy < 50 ? yy + 2000 : yy + 1900;
113
+ }
114
+ return null;
115
+ },
116
+
117
+ extractors: [
118
+ {
119
+ field: 'rfcEmisor',
120
+ extract: (source) => {
121
+ // XML form: <cfdi:Emisor Rfc="..." />
122
+ const xmlMatch = source.match(
123
+ /<[^>]*Emisor[^>]*Rfc\s*=\s*["']([A-ZÑ&]{3,4}\d{6}[A-Z0-9]{3})["']/i,
124
+ );
125
+ if (xmlMatch) return fieldResult('rfcEmisor', true, xmlMatch[1]);
126
+
127
+ // PDF form: "Emisor" section followed by RFC label/value on later lines.
128
+ // We pick the first INTER_AGENCIA_RFCS entry (list order, not document order) found in the text.
129
+ const rfcs = findInterAgenciaRfcs(source);
130
+ if (rfcs.length > 0) return fieldResult('rfcEmisor', true, rfcs[0]);
131
+
132
+ return fieldResult('rfcEmisor', false, null);
133
+ },
134
+ },
135
+ {
136
+ field: 'rfcReceptor',
137
+ extract: (source) => {
138
+ const xmlMatch = source.match(
139
+ /<[^>]*Receptor[^>]*Rfc\s*=\s*["']([A-ZÑ&]{3,4}\d{6}[A-Z0-9]{3})["']/i,
140
+ );
141
+ if (xmlMatch) return fieldResult('rfcReceptor', true, xmlMatch[1]);
142
+
143
+ const rfcs = findInterAgenciaRfcs(source);
144
+ if (rfcs.length >= 2) {
145
+ return fieldResult('rfcReceptor', true, rfcs[1]);
146
+ }
147
+ return fieldResult('rfcReceptor', false, null);
148
+ },
149
+ },
150
+ {
151
+ field: 'folio',
152
+ extract: (source) => {
153
+ // CFDI Folio attribute
154
+ const xmlMatch = source.match(/\bFolio\s*=\s*["']([A-Z0-9-]+)["']/i);
155
+ if (xmlMatch) return fieldResult('folio', true, xmlMatch[1]);
156
+
157
+ // PDF: "Numero Folio 012749"
158
+ const pdfMatch = source.match(/Numero\s+Folio\s+([A-Z0-9-]+)/i);
159
+ if (pdfMatch) return fieldResult('folio', true, pdfMatch[1]);
160
+
161
+ return fieldResult('folio', false, null);
162
+ },
163
+ },
164
+ {
165
+ field: 'uuid',
166
+ extract: (source) => {
167
+ const uuidRe =
168
+ /[0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12}/i;
169
+ const m = source.match(uuidRe);
170
+ return fieldResult('uuid', !!m, m ? m[0].toUpperCase() : null);
171
+ },
172
+ },
173
+ {
174
+ field: 'numPedimento',
175
+ extract: (source) => {
176
+ // Printable CFDI "Pedimento: 3458 6000046 Fecha: ..." — recovers an
177
+ // 11-digit pedimento (no YY prefix). Useful for auditability only.
178
+ const m = source.match(/Pedimento:?\s*(\d{4})\s*(\d{7})/i);
179
+ if (m) {
180
+ return fieldResult('numPedimento', true, `${m[1]}${m[2]}`);
181
+ }
182
+ return fieldResult('numPedimento', false, null);
183
+ },
184
+ },
185
+ ],
186
+ };
@@ -1,10 +1,8 @@
1
1
  // VUCEM "consultarPedimentoCompleto" XML matcher.
2
2
  //
3
- // STATUS: implemented but NOT registered in `document-type-shared.js`. To
4
- // activate, uncomment the import + registration in that file. All downstream
5
- // code (composeArelaPath, arela-api propagation SQL, IdentifyCommand
6
- // counters) already includes `pedimento_completo_xml`, so re-enabling is a
7
- // single-line change.
3
+ // Registered in `document-type-shared.js`. Downstream code
4
+ // (composeArelaPath, arela-api propagation SQL, IdentifyCommand counters)
5
+ // also includes `pedimento_completo_xml`.
8
6
  //
9
7
  // Filename patterns recognized (try in order — patente extraction):
10
8
  // 1) VU_PATENTE_ADUANA_PEDIMENTO.xml → e.g. VU_3429_070_5016101.xml
@@ -47,6 +45,22 @@ function pad(value, length) {
47
45
  return String(value).padStart(length, '0');
48
46
  }
49
47
 
48
+ /**
49
+ * Convert a VUCEM `aduanaEntradaSalida.clave` (e.g. "70", "750", "40") to the
50
+ * 2-digit "sección aduanera" prefix used inside the 15-digit pedimento number.
51
+ *
52
+ * VUCEM strips leading zeros from the canonical 3-digit SAT aduana code,
53
+ * so `070` (Ciudad Juárez) arrives as `70`. The pedimento prefix is the
54
+ * first 2 digits of the 3-digit code:
55
+ * `70` → `070` → `07` (Cd. Juárez)
56
+ * `750` → `750` → `75` (Puebla)
57
+ * `40` → `040` → `04` (Lázaro Cárdenas)
58
+ */
59
+ function aduanaToSeccion(claveValue) {
60
+ if (claveValue == null) return null;
61
+ return pad(claveValue, 3).substring(0, 2);
62
+ }
63
+
50
64
  /**
51
65
  * Try the three known filename patterns and return {patente, aduana, pedimento}
52
66
  * with any subset of the fields populated. Returns null if no pattern matches.
@@ -102,12 +116,17 @@ function yyFromIsoDate(iso) {
102
116
  return m ? m[1].substring(2, 4) : null;
103
117
  }
104
118
 
105
- // Find <ns2:fechas> block with nested clave==2 and return its <ns2:fecha>.
106
- function findPaymentDate(source) {
119
+ // Find <ns2:fechas> block whose nested <clave> matches `claveValue` and
120
+ // return its <ns2:fecha>. Works for both shapes:
121
+ // <fechas><clave>N</clave><fecha>...</fecha></fechas>
122
+ // <fechas><fecha>...</fecha><tipo><clave>N</clave></tipo></fechas>
123
+ // (firstTag finds the FIRST <clave> in the block — both layouts expose only
124
+ // one clave per fechas entry.)
125
+ function findFechaByClave(source, claveValue) {
107
126
  const fechasBlocks = allTagBlocks(source, 'fechas');
108
127
  for (const block of fechasBlocks) {
109
128
  const clave = firstTag(block, 'clave');
110
- if (clave === '2') {
129
+ if (clave === claveValue) {
111
130
  const fecha = firstTag(block, 'fecha');
112
131
  if (fecha) return fecha;
113
132
  }
@@ -115,6 +134,18 @@ function findPaymentDate(source) {
115
134
  return null;
116
135
  }
117
136
 
137
+ // Fecha de pago de las contribuciones (tipo.clave == 2).
138
+ function findPaymentDate(source) {
139
+ return findFechaByClave(source, '2');
140
+ }
141
+
142
+ // Fecha de presentacion (tipo.clave == 5). This is the authoritative source
143
+ // for the pedimento's YY prefix — a pedimento opened in Dec-2025 but paid in
144
+ // Jan-2026 keeps the `25` prefix, matching what VUCEM stamps in the filename.
145
+ function findPresentationDate(source) {
146
+ return findFechaByClave(source, '5');
147
+ }
148
+
118
149
  // --------------------------- extractors ------------------------------------
119
150
 
120
151
  const rfcExtractor = {
@@ -152,7 +183,7 @@ const aduanaEntradaSalidaExtractor = {
152
183
  return new FieldResult(
153
184
  'aduanaEntradaSalida',
154
185
  !!clave,
155
- clave ? pad(clave, 2) : null,
186
+ aduanaToSeccion(clave),
156
187
  );
157
188
  },
158
189
  };
@@ -165,6 +196,14 @@ const paymentDateExtractor = {
165
196
  },
166
197
  };
167
198
 
199
+ const presentationDateExtractor = {
200
+ field: 'presentationDate',
201
+ extract: (source) => {
202
+ const fecha = findPresentationDate(source);
203
+ return new FieldResult('presentationDate', !!fecha, fecha);
204
+ },
205
+ };
206
+
168
207
  const fechaPagoRectificacionExtractor = {
169
208
  field: 'fechaPagoRectificacion',
170
209
  extract: (source) => {
@@ -257,8 +296,14 @@ export const pedimentoCompletoXmlDefinition = {
257
296
 
258
297
  /**
259
298
  * Compose the 15-digit pedimento number from XML body + filename.
260
- * YY: from rectification fechaPago if present, else from the clave==2
261
- * payment-date fecha; falls back to filename pattern 3.
299
+ * YY: priority order (most authoritative first):
300
+ * 1) Filename pattern 3 (`{15-digit}.xml`) VUCEM stamps the correct
301
+ * prefix at export time.
302
+ * 2) Fecha de presentacion (<fechas><clave>5) — the year the pedimento
303
+ * was opened. Authoritative for the YY prefix even when payment
304
+ * crosses calendar year (e.g. opened Dec-2025, paid Jan-2026 → YY=25).
305
+ * 3) Rectification fechaPago (only when no presentation date exists).
306
+ * 4) Payment date (last-resort fallback).
262
307
  * AA: first 2 digits of <aduanaEntradaSalida><clave> left-padded to 3 (see aduanaToSeccion).
263
308
  * PPPP: from the filename (any of the three patterns).
264
309
  * NNNNNNN: from <pedimento> padded to 7.
@@ -267,15 +312,19 @@ export const pedimentoCompletoXmlDefinition = {
267
312
  extractNumPedimento: (source, fields, filePath) => {
268
313
  const parts = parseFilenameParts(filePath);
269
314
 
315
+ const presentation = fields?.find(
316
+ (f) => f.name === 'presentationDate' && f.found,
317
+ )?.value;
270
318
  const rect = fields?.find(
271
319
  (f) => f.name === 'fechaPagoRectificacion' && f.found,
272
320
  )?.value;
273
321
  const pay = fields?.find((f) => f.name === 'paymentDate' && f.found)?.value;
274
322
 
275
323
  let yy =
324
+ (parts && parts.year) ||
325
+ yyFromIsoDate(presentation) ||
276
326
  yyFromIsoDate(rect) ||
277
327
  yyFromIsoDate(pay) ||
278
- (parts && parts.year) ||
279
328
  null;
280
329
 
281
330
  const aduanaField = fields?.find(
@@ -315,6 +364,7 @@ export const pedimentoCompletoXmlDefinition = {
315
364
  tipoOperacionExtractor,
316
365
  aduanaEntradaSalidaExtractor,
317
366
  paymentDateExtractor,
367
+ presentationDateExtractor,
318
368
  fechaPagoRectificacionExtractor,
319
369
  coveExtractor,
320
370
  numEDocumentoExtractor,
@@ -17,17 +17,19 @@ export const pedimentoCompletoDefinition = {
17
17
  type: 'pedimento_completo',
18
18
  extensions: ['pdf'],
19
19
  match: (source) => {
20
- // Hard exclude: "FORMA SIMPLIFICADA" is handled by pedimento_simplificado.
21
- if (/FORMA SIMPLIFICADA DE PEDIMENTO/i.test(source)) return false;
20
+ // Hard exclude: "FORMA SIMPLIFICADA [DE|DEL] PEDIMENTO" is handled by
21
+ // pedimento_simplificado.
22
+ if (/FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i.test(source)) return false;
22
23
 
23
24
  // Hard exclude: "AVISO CONSOLIDADO" shares the header trio but is a
24
25
  // different document type handled by aviso_consolidado.
25
26
  if (/AVISO\s+CONSOLIDADO/i.test(source)) return false;
26
27
 
28
+ // The colon after "T. OPER" is optional — see note in pedimento-simplificado.js.
27
29
  const hasHeaderFields =
28
30
  /NUM\.?\s*PEDIMENTO:/i.test(source) &&
29
31
  /CVE\.?\s*PEDIMENTO:/i.test(source) &&
30
- /T\.?\s*OPER:/i.test(source);
32
+ /T\.?\s*OPER:?/i.test(source);
31
33
  if (hasHeaderFields) {
32
34
  const hasCopyMarker =
33
35
  /ORIGINAL:\s*ADMINISTRACION GENERAL DE ADUANAS/i.test(source) ||
@@ -12,15 +12,18 @@ export const pedimentoSimplificadoDefinition = {
12
12
  if (/AVISO\s+CONSOLIDADO/i.test(source)) return false;
13
13
 
14
14
  // Fast path: the literal title appears on standard SIMP layouts.
15
- if (/FORMA SIMPLIFICADA DE PEDIMENTO/i.test(source)) return true;
15
+ // Some prevalidators print "FORMA SIMPLIFICADA DEL PEDIMENTO" (with DEL).
16
+ if (/FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i.test(source)) return true;
16
17
 
17
18
  // Some PDFs (single-page anchors) lack that title but still carry the
18
19
  // three pedimento header fields. Treat them as simplificado UNLESS they
19
20
  // have the multi-page copy markers that uniquely identify a completo.
21
+ // NOTE: the colon after "T. OPER" is optional — many printable layouts
22
+ // render OPER as a table-header label with the value in the next cell.
20
23
  const hasHeaderFields =
21
24
  /NUM\.?\s*PEDIMENTO:/i.test(source) &&
22
25
  /CVE\.?\s*PEDIMENTO:/i.test(source) &&
23
- /T\.?\s*OPER:/i.test(source);
26
+ /T\.?\s*OPER:?/i.test(source);
24
27
  if (!hasHeaderFields) return false;
25
28
 
26
29
  const hasCompletoCopyMarker =
@@ -16,9 +16,9 @@ export const proformaDefinition = {
16
16
  type: 'proforma',
17
17
  extensions: ['pdf'],
18
18
 
19
- // Same content marker as pedimento simplificado
19
+ // Same content marker as pedimento simplificado (accepts "DE" or "DEL").
20
20
  match: (source) => {
21
- return /FORMA SIMPLIFICADA DE PEDIMENTO/i.test(source);
21
+ return /FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i.test(source);
22
22
  },
23
23
 
24
24
  extractNumPedimento: pedimentoSimplificadoDefinition.extractNumPedimento,
@@ -4,9 +4,7 @@ import { PDFParse } from 'pdf-parse';
4
4
 
5
5
  import { extractDocumentFields } from './document-type-shared.js';
6
6
 
7
- // Document types that participate in arela_path composition. The XML type is
8
- // kept here even though its matcher is currently disabled — once re-enabled
9
- // in document-type-shared.js no further changes are needed here.
7
+ // Document types that participate in arela_path composition.
10
8
  const ARELA_PATH_TYPES = new Set([
11
9
  'pedimento_simplificado',
12
10
  'pedimento_completo',
@@ -0,0 +1,218 @@
1
+ /**
2
+ * Unit tests for the factura_inter_agencia matcher.
3
+ *
4
+ * Verifies that NORCOM↔PALCO CFDIs (XML and printable PDF text) are
5
+ * detected as `factura_inter_agencia`, and that ordinary CFDIs are NOT
6
+ * mis-classified.
7
+ */
8
+ import { describe, it, expect } from '@jest/globals';
9
+
10
+ import {
11
+ facturaInterAgenciaDefinition,
12
+ INTER_AGENCIA_RFCS,
13
+ } from '../../src/document-types/factura-inter-agencia.js';
14
+ import { extractDocumentFields } from '../../src/document-type-shared.js';
15
+
16
+ const NORCOM_RFC = 'NAA120215F20';
17
+ const PALCO_RFC = 'PCC1008161WA';
18
+
19
+ // Realistic CFDI 4.0 XML between NORCOM (emisor) and PALCO (receptor).
20
+ // Conceptos use ClaveProdServ 78141502 (servicios de agentes aduaneros).
21
+ const CFDI_XML_INTER_AGENCIA = `<?xml version="1.0" encoding="utf-8"?>
22
+ <cfdi:Comprobante xmlns:cfdi="http://www.sat.gob.mx/cfd/4" Version="4.0" Folio="012749"
23
+ TipoDeComprobante="I" SubTotal="3000.00" Total="3480.00" Moneda="MXN">
24
+ <cfdi:Emisor Rfc="${NORCOM_RFC}" Nombre="NORCOM AGENTES ADUANALES" RegimenFiscal="601"/>
25
+ <cfdi:Receptor Rfc="${PALCO_RFC}" Nombre="PALCO, CONSORCIO DE COMERCIO INTERNACIONAL"
26
+ DomicilioFiscalReceptor="32380" RegimenFiscalReceptor="601" UsoCFDI="G03"/>
27
+ <cfdi:Conceptos>
28
+ <cfdi:Concepto ClaveProdServ="78141502" NoIdentificacion="HONO" Cantidad="1.00"
29
+ ClaveUnidad="E48" Unidad="Unidad de servicio" Descripcion="HONORARIOS"
30
+ ValorUnitario="1300.00" Importe="1300.00" ObjetoImp="02"/>
31
+ <cfdi:Concepto ClaveProdServ="78141502" NoIdentificacion="VALID" Cantidad="1.00"
32
+ ClaveUnidad="E48" Unidad="Unidad de servicio" Descripcion="VALIDACION"
33
+ ValorUnitario="200.00" Importe="200.00" ObjetoImp="02"/>
34
+ </cfdi:Conceptos>
35
+ </cfdi:Comprobante>`;
36
+
37
+ // Same agencies but conceptos do NOT use 78141502 — should NOT match.
38
+ const CFDI_XML_INTER_AGENCIA_WRONG_CONCEPT = CFDI_XML_INTER_AGENCIA.replace(
39
+ /78141502/g,
40
+ '90121502',
41
+ );
42
+
43
+ // CFDI between unrelated taxpayers — should NOT match.
44
+ const CFDI_XML_REGULAR = `<?xml version="1.0" encoding="utf-8"?>
45
+ <cfdi:Comprobante xmlns:cfdi="http://www.sat.gob.mx/cfd/4" Version="4.0" Folio="000123"
46
+ TipoDeComprobante="I" SubTotal="100.00" Total="116.00">
47
+ <cfdi:Emisor Rfc="ACME010101AB1" Nombre="ACME COMERCIAL" RegimenFiscal="601"/>
48
+ <cfdi:Receptor Rfc="XYZ020202CD2" Nombre="CLIENTE FINAL"
49
+ DomicilioFiscalReceptor="00000" RegimenFiscalReceptor="601" UsoCFDI="G03"/>
50
+ <cfdi:Conceptos>
51
+ <cfdi:Concepto ClaveProdServ="78141502" NoIdentificacion="ITEM" Cantidad="1.00"
52
+ ClaveUnidad="E48" Unidad="Unidad de servicio" Descripcion="SERVICIO"
53
+ ValorUnitario="100.00" Importe="100.00" ObjetoImp="02"/>
54
+ </cfdi:Conceptos>
55
+ </cfdi:Comprobante>`;
56
+
57
+ // Text extracted from the printable PDF representation of a CFDI inter-agencia.
58
+ // Mirrors what pdf-parse returns for the sample SICINGR70-012749(...).pdf.
59
+ const CFDI_PDF_TEXT_INTER_AGENCIA = `NORCOM AGENTES ADUANALES S.C
60
+ Tipo de Comprobante: (I) Ingreso
61
+ Folio Fiscal 84FC9CE2-00D5-4843-B377-B463321F9FC6
62
+ Numero Folio 012749
63
+ Emisor
64
+ RFC ${NORCOM_RFC}
65
+ Razon Social NORCOM AGENTES ADUANALES
66
+ Receptor
67
+ RFC ${PALCO_RFC}
68
+ Razon Social PALCO, CONSORCIO DE COMERCIO INTERNACIONAL
69
+ Pedimento: 3458 6000046 Fecha: 17/02/2026 Tipo: EXP Clave: A1
70
+ Erogaciones
71
+ 78141502 HONO HONORARIOS 1,300.00
72
+ 78141502 SERCOM SERVICIOS COMPLEMENTARIOS 1,500.00
73
+ 78141502 VALID VALIDACION 200.00
74
+ Sello Digital del CFDI
75
+ c4oBJ8/zAol0zg1jVe4MK8...
76
+ Cadena Original del Complemento de Certificacion Digital del SAT
77
+ ||4.0|012749|...
78
+ Este documento es una representación impresa de un CFDI`;
79
+
80
+ describe('factura_inter_agencia matcher', () => {
81
+ describe('configured RFC set', () => {
82
+ it('includes NORCOM and PALCO RFCs', () => {
83
+ expect(INTER_AGENCIA_RFCS).toContain(NORCOM_RFC);
84
+ expect(INTER_AGENCIA_RFCS).toContain(PALCO_RFC);
85
+ });
86
+ });
87
+
88
+ describe('match()', () => {
89
+ it('matches a NORCOM→PALCO XML CFDI with broker-service conceptos', () => {
90
+ expect(facturaInterAgenciaDefinition.match(CFDI_XML_INTER_AGENCIA)).toBe(
91
+ true,
92
+ );
93
+ });
94
+
95
+ it('matches the PDF-text representation of the same CFDI', () => {
96
+ expect(
97
+ facturaInterAgenciaDefinition.match(CFDI_PDF_TEXT_INTER_AGENCIA),
98
+ ).toBe(true);
99
+ });
100
+
101
+ it('does NOT match when ClaveProdServ is not 78141502', () => {
102
+ expect(
103
+ facturaInterAgenciaDefinition.match(
104
+ CFDI_XML_INTER_AGENCIA_WRONG_CONCEPT,
105
+ ),
106
+ ).toBe(false);
107
+ });
108
+
109
+ it('does NOT match a CFDI between unrelated taxpayers', () => {
110
+ expect(facturaInterAgenciaDefinition.match(CFDI_XML_REGULAR)).toBe(false);
111
+ });
112
+
113
+ it('does NOT match arbitrary non-CFDI text containing the RFCs', () => {
114
+ const text = `Reporte interno
115
+ RFC emisor: ${NORCOM_RFC}
116
+ RFC cliente: ${PALCO_RFC}
117
+ Clave 78141502`;
118
+ // No CFDI markers → should not match.
119
+ expect(facturaInterAgenciaDefinition.match(text)).toBe(false);
120
+ });
121
+
122
+ it('does NOT match if only one of the configured RFCs is present', () => {
123
+ const text = CFDI_XML_INTER_AGENCIA.replace(PALCO_RFC, 'XYZ020202CD2');
124
+ expect(facturaInterAgenciaDefinition.match(text)).toBe(false);
125
+ });
126
+ });
127
+
128
+ describe('extractors', () => {
129
+ it('extracts emisor + receptor RFCs from XML', () => {
130
+ const rfcEmisor = facturaInterAgenciaDefinition.extractors
131
+ .find((e) => e.field === 'rfcEmisor')
132
+ .extract(CFDI_XML_INTER_AGENCIA);
133
+ const rfcReceptor = facturaInterAgenciaDefinition.extractors
134
+ .find((e) => e.field === 'rfcReceptor')
135
+ .extract(CFDI_XML_INTER_AGENCIA);
136
+
137
+ expect(rfcEmisor.found).toBe(true);
138
+ expect(rfcEmisor.value).toBe(NORCOM_RFC);
139
+ expect(rfcReceptor.found).toBe(true);
140
+ expect(rfcReceptor.value).toBe(PALCO_RFC);
141
+ });
142
+
143
+ it('extracts both RFCs from PDF text via fallback', () => {
144
+ const rfcEmisor = facturaInterAgenciaDefinition.extractors
145
+ .find((e) => e.field === 'rfcEmisor')
146
+ .extract(CFDI_PDF_TEXT_INTER_AGENCIA);
147
+ const rfcReceptor = facturaInterAgenciaDefinition.extractors
148
+ .find((e) => e.field === 'rfcReceptor')
149
+ .extract(CFDI_PDF_TEXT_INTER_AGENCIA);
150
+
151
+ expect(rfcEmisor.found).toBe(true);
152
+ expect(rfcReceptor.found).toBe(true);
153
+ // Values come back in order of first appearance in the document, so the pair is compared order-independently (sorted).
154
+ const found = [rfcEmisor.value, rfcReceptor.value].sort();
155
+ expect(found).toEqual([NORCOM_RFC, PALCO_RFC].sort());
156
+ });
157
+
158
+ it('extracts the UUID (folio fiscal) from both formats', () => {
159
+ const uuidExtractor = facturaInterAgenciaDefinition.extractors.find(
160
+ (e) => e.field === 'uuid',
161
+ );
162
+
163
+ const fromXml = uuidExtractor.extract(CFDI_XML_INTER_AGENCIA);
164
+ // XML sample has no UUID inside the comprobante body — that's fine.
165
+ expect(fromXml.found).toBe(false);
166
+
167
+ const fromPdf = uuidExtractor.extract(CFDI_PDF_TEXT_INTER_AGENCIA);
168
+ expect(fromPdf.found).toBe(true);
169
+ expect(fromPdf.value).toBe('84FC9CE2-00D5-4843-B377-B463321F9FC6');
170
+ });
171
+
172
+ it('extracts numPedimento from the printable PDF "Pedimento:" line', () => {
173
+ const numExtractor = facturaInterAgenciaDefinition.extractors.find(
174
+ (e) => e.field === 'numPedimento',
175
+ );
176
+ const result = numExtractor.extract(CFDI_PDF_TEXT_INTER_AGENCIA);
177
+ expect(result.found).toBe(true);
178
+ expect(result.value).toBe('34586000046');
179
+ });
180
+
181
+ it('extracts the CFDI folio from XML attribute', () => {
182
+ const folio = facturaInterAgenciaDefinition.extractors
183
+ .find((e) => e.field === 'folio')
184
+ .extract(CFDI_XML_INTER_AGENCIA);
185
+ expect(folio.found).toBe(true);
186
+ expect(folio.value).toBe('012749');
187
+ });
188
+ });
189
+
190
+ describe('registry order (factura_inter_agencia precedes facturas_comerciales)', () => {
191
+ it('resolves the inter-agency CFDI XML to factura_inter_agencia, not factura_comercial', () => {
192
+ const [detectedType] = extractDocumentFields(
193
+ CFDI_XML_INTER_AGENCIA,
194
+ 'xml',
195
+ '/tmp/SICINGR70-012749(PALCO).XML',
196
+ );
197
+ expect(detectedType).toBe('factura_inter_agencia');
198
+ });
199
+
200
+ it('resolves the inter-agency CFDI PDF text to factura_inter_agencia', () => {
201
+ const [detectedType] = extractDocumentFields(
202
+ CFDI_PDF_TEXT_INTER_AGENCIA,
203
+ 'pdf',
204
+ '/tmp/SICINGR70-012749(PALCO).pdf',
205
+ );
206
+ expect(detectedType).toBe('factura_inter_agencia');
207
+ });
208
+
209
+ it('falls through to factura_comercial for a regular CFDI', () => {
210
+ const [detectedType] = extractDocumentFields(
211
+ CFDI_XML_REGULAR,
212
+ 'xml',
213
+ '/tmp/regular-invoice.xml',
214
+ );
215
+ expect(detectedType).toBe('factura_comercial');
216
+ });
217
+ });
218
+ });
@@ -0,0 +1,271 @@
1
+ /**
2
+ * Regression tests for the pedimento_completo_xml matcher.
3
+ *
4
+ * Covers:
5
+ * 1) Basic detection + arela_path composition from a VUCEM
6
+ * `consultarPedimentoCompletoRespuesta` XML.
7
+ * 2) **YY truth source** — when the pedimento is opened in one year and
8
+ * paid in the next (e.g. presentation 2025-12, payment 2026-01), the
9
+ * 15-digit pedimento MUST keep the presentation year (`25...`), not the
10
+ * payment year (`26...`). This matches what VUCEM stamps in the
11
+ * filename and what the PDF matchers produce.
12
+ * 3) **Aduana padding** — VUCEM returns the aduana code without leading
13
+ * zeros (e.g. `70` for Ciudad Juárez instead of the canonical `070`).
14
+ * The 2-digit "sección aduanera" prefix used inside the 15-digit
15
+ * pedimento is the first 2 digits of the 3-digit form (`70` → `07`).
16
+ * 4) **numPedimento backfill** — the XML matcher composes numPedimento
17
+ * externally via `extractNumPedimento` rather than as a field
18
+ * extractor. `extractDocumentFields` must backfill it so that
19
+ * `composeArelaPath` can find it.
20
+ * 5) Resolution to `proforma_completo_xml` when no payment evidence
21
+ * exists in the body.
22
+ */
23
+ import { describe, it, expect } from '@jest/globals';
24
+
25
+ import { extractDocumentFields } from '../../src/document-type-shared.js';
26
+ import { composeArelaPath } from '../../src/file-detection.js';
27
+
28
+ // ---------------------------------------------------------------------------
29
+ // Test fixtures
30
+ // ---------------------------------------------------------------------------
31
+
32
+ /**
33
+ * Build a minimal VUCEM consultarPedimentoCompletoRespuesta XML.
34
+ * Only the tags the matcher actually reads are included.
35
+ */
36
+ function buildXml({
37
+ rfc = 'CEM090106MU3',
38
+ pedimento = '5063036',
39
+ claveDocumento = 'V1',
40
+ tipoOperacionDesc = 'Exportacion',
41
+ aduanaClave = '70',
42
+ presentationDate = '2025-12-01-06:00',
43
+ paymentDate = '2026-01-07-06:00',
44
+ rectFechaPago = null,
45
+ facturas = ['V1-FUJIKURA MEX-202512'],
46
+ edDocs = [],
47
+ } = {}) {
48
+ const fechas = [];
49
+ if (presentationDate) {
50
+ fechas.push(
51
+ `<ns2:fechas><ns2:fecha>${presentationDate}</ns2:fecha><ns2:tipo><ns2:clave>5</ns2:clave><ns2:descripcion>FECHA DE PRESENTACION</ns2:descripcion></ns2:tipo></ns2:fechas>`,
52
+ );
53
+ }
54
+ if (paymentDate) {
55
+ fechas.push(
56
+ `<ns2:fechas><ns2:fecha>${paymentDate}</ns2:fecha><ns2:tipo><ns2:clave>2</ns2:clave><ns2:descripcion>FECHA DE PAGO</ns2:descripcion></ns2:tipo></ns2:fechas>`,
57
+ );
58
+ }
59
+
60
+ const rect = rectFechaPago
61
+ ? `<ns2:rectificacion><ns2:fechaPago>${rectFechaPago}</ns2:fechaPago></ns2:rectificacion>`
62
+ : '';
63
+
64
+ const facturasXml = facturas
65
+ .map(
66
+ (num) =>
67
+ `<ns2:facturas><ns2:numero>${num}</ns2:numero></ns2:facturas>`,
68
+ )
69
+ .join('');
70
+
71
+ const identificadoresXml =
72
+ edDocs.length === 0
73
+ ? ''
74
+ : `<ns2:identificadores>${edDocs
75
+ .map(
76
+ (code) =>
77
+ `<ns2:identificadores><claveIdentificador><clave>ED</clave></claveIdentificador><complemento1>${code}</complemento1></ns2:identificadores>`,
78
+ )
79
+ .join('')}</ns2:identificadores>`;
80
+
81
+ return `<?xml version="1.0" encoding="UTF-8"?>
82
+ <S:Envelope xmlns:S="http://schemas.xmlsoap.org/soap/envelope/">
83
+ <S:Body>
84
+ <ns2:consultarPedimentoCompletoRespuesta xmlns:ns2="http://x">
85
+ <ns2:pedimento>
86
+ <ns2:pedimento>${pedimento}</ns2:pedimento>
87
+ <ns2:encabezado>
88
+ <ns2:claveDocumento><ns2:clave>${claveDocumento}</ns2:clave></ns2:claveDocumento>
89
+ <ns2:tipoOperacion><ns2:clave>2</ns2:clave><ns2:descripcion>${tipoOperacionDesc}</ns2:descripcion></ns2:tipoOperacion>
90
+ <ns2:aduanaEntradaSalida><ns2:clave>${aduanaClave}</ns2:clave></ns2:aduanaEntradaSalida>
91
+ </ns2:encabezado>
92
+ <ns2:importadorExportador>
93
+ <ns2:rfc>${rfc}</ns2:rfc>
94
+ ${fechas.join('\n')}
95
+ </ns2:importadorExportador>
96
+ ${rect}
97
+ ${facturasXml}
98
+ ${identificadoresXml}
99
+ </ns2:pedimento>
100
+ </ns2:consultarPedimentoCompletoRespuesta>
101
+ </S:Body>
102
+ </S:Envelope>`;
103
+ }
104
+
105
+ // ---------------------------------------------------------------------------
106
+ // Tests
107
+ // ---------------------------------------------------------------------------
108
+
109
+ describe('pedimento_completo_xml matcher', () => {
110
+ it('detects, extracts, and composes arela_path for a basic export pedimento', () => {
111
+ const xml = buildXml({
112
+ rfc: 'CEM090106MU3',
113
+ pedimento: '5063036',
114
+ aduanaClave: '70',
115
+ presentationDate: '2025-06-15-06:00',
116
+ paymentDate: '2025-06-20-06:00',
117
+ });
118
+ // 15-digit filename pattern: YY=25 AA=07 PPPP=3429 NNNNNNN=5063036
119
+ const filePath = '/x/2025/250734295063036_250734295063036.xml';
120
+
121
+ const [type, fields, ped, year] = extractDocumentFields(
122
+ xml,
123
+ 'xml',
124
+ filePath,
125
+ );
126
+
127
+ expect(type).toBe('pedimento_completo_xml');
128
+ expect(ped).toBe('250734295063036');
129
+ expect(year).toBe(2025);
130
+ expect(fields.find((f) => f.name === 'rfc')?.value).toBe('CEM090106MU3');
131
+ expect(fields.find((f) => f.name === 'aduanaEntradaSalida')?.value).toBe(
132
+ '07',
133
+ );
134
+ // Backfill check: numPedimento must be exposed as a field so
135
+ // composeArelaPath can find it.
136
+ expect(fields.find((f) => f.name === 'numPedimento')?.value).toBe(
137
+ '250734295063036',
138
+ );
139
+
140
+ const arela = composeArelaPath(type, fields, year, filePath);
141
+ expect(arela).toBe('CEM090106MU3/2025/3429/07/250734295063036/');
142
+ });
143
+
144
+ it('uses presentation date (not payment date) for YY when payment crosses calendar year', () => {
145
+ // Pedimento opened Dec 2025, paid Jan 2026 — the YY must be 25.
146
+ const xml = buildXml({
147
+ pedimento: '5063036',
148
+ aduanaClave: '70',
149
+ presentationDate: '2025-12-01-06:00',
150
+ paymentDate: '2026-01-07-06:00',
151
+ });
152
+ // Use the 3-part filename pattern (no YY in filename) so YY comes from XML body.
153
+ const filePath = '/x/070-3429-5063036.xml';
154
+
155
+ const [type, , ped, year] = extractDocumentFields(xml, 'xml', filePath);
156
+
157
+ expect(type).toBe('pedimento_completo_xml');
158
+ expect(ped).toBe('250734295063036');
159
+ expect(year).toBe(2025);
160
+ });
161
+
162
+ it('falls back to payment date YY when presentation date is missing', () => {
163
+ const xml = buildXml({
164
+ pedimento: '5063036',
165
+ aduanaClave: '70',
166
+ presentationDate: null, // No clave=5 block
167
+ paymentDate: '2026-01-07-06:00',
168
+ });
169
+ const filePath = '/x/070-3429-5063036.xml';
170
+
171
+ const [, , ped, year] = extractDocumentFields(xml, 'xml', filePath);
172
+
173
+ expect(ped).toBe('260734295063036');
174
+ expect(year).toBe(2026);
175
+ });
176
+
177
+ it('prefers filename YY over body fechas (VUCEM-stamped truth)', () => {
178
+ // Filename says YY=24 but body has presentation=2025. Filename wins.
179
+ const xml = buildXml({
180
+ pedimento: '5063036',
181
+ aduanaClave: '70',
182
+ presentationDate: '2025-12-01-06:00',
183
+ paymentDate: '2026-01-07-06:00',
184
+ });
185
+ const filePath = '/x/240734295063036_240734295063036.xml';
186
+
187
+ const [, , ped, year] = extractDocumentFields(xml, 'xml', filePath);
188
+
189
+ expect(ped).toBe('240734295063036');
190
+ expect(year).toBe(2024);
191
+ });
192
+
193
+ it('pads VUCEM aduana correctly: 70 -> 07, 750 -> 75, 40 -> 04', () => {
194
+ const cases = [
195
+ { aduanaClave: '70', expected: '07', // Cd. Juárez (3-digit canonical: 070)
196
+ filename: '/x/070-3429-5000001.xml' },
197
+ { aduanaClave: '750', expected: '75', // Puebla
198
+ filename: '/x/750-3429-5000002.xml' },
199
+ { aduanaClave: '40', expected: '04', // Lázaro Cárdenas (canonical: 040)
200
+ filename: '/x/040-3429-5000003.xml' },
201
+ ];
202
+
203
+ for (const c of cases) {
204
+ const xml = buildXml({
205
+ pedimento: c.filename.match(/-(\d{7})\.xml$/)[1],
206
+ aduanaClave: c.aduanaClave,
207
+ presentationDate: '2025-06-15-06:00',
208
+ paymentDate: '2025-06-20-06:00',
209
+ });
210
+ const [, fields, ped] = extractDocumentFields(xml, 'xml', c.filename);
211
+ expect(fields.find((f) => f.name === 'aduanaEntradaSalida')?.value).toBe(
212
+ c.expected,
213
+ );
214
+ // Positions 2-3 of the composed 15-digit pedimento must equal the
215
+ // aduana prefix.
216
+ expect(ped.substring(2, 4)).toBe(c.expected);
217
+ }
218
+ });
219
+
220
+ it('resolves to proforma_completo_xml when no payment evidence exists', () => {
221
+ const xml = buildXml({
222
+ pedimento: '5063036',
223
+ aduanaClave: '70',
224
+ presentationDate: '2025-12-01-06:00',
225
+ paymentDate: null, // No payment, no rectificacion
226
+ });
227
+ const filePath = '/x/070-3429-5063036.xml';
228
+
229
+ const [type] = extractDocumentFields(xml, 'xml', filePath);
230
+
231
+ expect(type).toBe('proforma_completo_xml');
232
+ });
233
+
234
+ it('extracts cove and rfc correctly', () => {
235
+ const xml = buildXml({
236
+ rfc: 'CEM090106MU3',
237
+ facturas: ['V1-FUJIKURA MEX-202512', 'INV-2'],
238
+ });
239
+ const filePath = '/x/250734295063036_250734295063036.xml';
240
+
241
+ const [, fields] = extractDocumentFields(xml, 'xml', filePath);
242
+
243
+ expect(fields.find((f) => f.name === 'rfc')?.value).toBe('CEM090106MU3');
244
+ expect(fields.find((f) => f.name === 'cove')?.value).toBe(
245
+ '[V1-FUJIKURA MEX-202512,INV-2]',
246
+ );
247
+ });
248
+
249
+ it('returns null arela_path when filename is unrecognized (no patente)', () => {
250
+ const xml = buildXml({
251
+ pedimento: '5063036',
252
+ aduanaClave: '70',
253
+ presentationDate: '2025-06-15-06:00',
254
+ paymentDate: '2025-06-20-06:00',
255
+ });
256
+ // Unrecognized filename — no patente derivable.
257
+ const filePath = '/x/random_name.xml';
258
+
259
+ const [type, fields, ped, year] = extractDocumentFields(
260
+ xml,
261
+ 'xml',
262
+ filePath,
263
+ );
264
+
265
+ expect(type).toBe('pedimento_completo_xml');
266
+ expect(ped).toBeNull();
267
+ expect(year).toBeNull();
268
+ // composeArelaPath returns null because patente is missing.
269
+ expect(composeArelaPath(type, fields, year, filePath)).toBeNull();
270
+ });
271
+ });
@@ -0,0 +1,185 @@
1
+ /**
2
+ * Regression tests for the pedimento_simplificado matcher.
3
+ *
4
+ * Covers PDF layouts where:
5
+ * - The header reads "FORMA SIMPLIFICADA DEL PEDIMENTO" (with DEL),
6
+ * not the canonical "DE PEDIMENTO".
7
+ * - The header trio prints "T. OPER" WITHOUT a trailing colon
8
+ * (the value sits in a separate table cell).
9
+ *
10
+ * Real-world example: REF NQR26-079, Aduana 640 (Querétaro), patente 3458.
11
+ * Before the fix covered by this test, detection fell through to
12
+ * `factura_comercial` because the word "FACTURA" appears in the
13
+ * "OBSERVACIONES" block.
14
+ */
15
+ import { describe, it, expect } from '@jest/globals';
16
+
17
+ // Importing only the dispatcher avoids circular-init issues caused by
18
+ // `_pedimento-shared-extractors.js` pulling FieldResult from document-type-shared.
19
+ import { extractDocumentFields } from '../../src/document-type-shared.js';
20
+
21
+ // REAL pdf-parse output from the NQR26-079 simplificado PDF
22
+ // (CSM9204097Q1, patente 3458, aduana 640).
23
+ // Captured verbatim with `PDFParse({data}).getText()` — pdf-parse extracts
24
+ // table cells out of visual order, so labels and values often live on
25
+ // different lines (see the FECHA DE PAGO block: label appears, then a few
26
+ // unrelated cells, then the date sits on its own line with the importe).
27
+ // This is exactly what the matchers and extractors see in production.
28
+ const SIMP_DEL_NQR26079_TEXT = `A1 CVE. PEDIMENTO: IMP T. OPER 26 64 3458 6000079 NUM. PEDIMENTO:
29
+ CERTIFICACIONES
30
+ ADUANA E/S:
31
+ DATOS DEL IMPORTADOR / EXPORTADOR
32
+ RFC: CURP:
33
+ CÓDIGO DE
34
+ ACEPTACIÓN
35
+ 640
36
+ CSM9204097Q1
37
+ FECHAS:
38
+ 17/03/2026
39
+ Ped. 6000079
40
+ CLAVE DE LA SECCION ADUANERA
41
+ DE DESPACHO:
42
+ QUERETARO, EL MARQUES Y
43
+ COLON, QUERETARO.
44
+ 640
45
+ DESTINO: 9 PESO BRUTO: 5.350
46
+ MARCAS,NUMEROS Y TOTAL DE BULTOS: 1
47
+ 04/03/2026
48
+ ENTRADA
49
+ PAGO
50
+ 3PW4CLHE
51
+ S/M S/N
52
+ CODIGO DE BARRAS
53
+ 0326 0132 XMP1 4914 6243 989
54
+ *** PAGO ELECTRONICO ***
55
+ DEPÓSITO REFERENCIADO - LÍNEA DE CAPTURA
56
+ PATENTE:
57
+ NOMBRE DE LA INSTITUCIÓN BANCARIA:
58
+ LÍNEA DE CAPTURA:
59
+ IMPORTE PAGADO:
60
+ NÚMERO DE OPERACIÓN BANCARIA:
61
+ NÚMERO DE TRANSACCIÓN SAT:
62
+ MEDIO DE PRESENTACIÓN:
63
+ MEDIO DE RECEPCIÓN/COBRO:
64
+ OTROS MEDIOS ELECTRÓNICOS (PAGO ELECTRÓNICO)
65
+ EFECTIVO (CARGO A CUENTA)
66
+ PEDIMENTO: ADUANA:
67
+ FECHA DE PAGO:
68
+ 0326 0132 XMP1 4914 6243
69
+ 6000079 640
70
+ 17/03/2026 $989
71
+ Banco Nacional de México, S.A.
72
+ 00000000703543
73
+ 3458
74
+ 40124170320261403012
75
+ NUMERO (GUIA/ORDEN EMBARQUE)/ID: 023-51315051 M 490453269837 H
76
+ NÚMERO DE ACUSE DE VALOR COVE268074HT1
77
+ NÚMERO DE E-DOCUMENT: 0438261DOG9W3 01702619TYEU7
78
+ OBSERVACIONES
79
+ FACTURA DE ACUERDO AL ARTÍCULO 36-A DE LA LEY ADUANERA VIGENTE Y A LA REGLA 3.1.
80
+ 8. DE LAS REGLAS
81
+ GENERALES DE COMERCIO EXTERIOR VIGENTES.
82
+ SE TRANSMITE PREVIAMENTE A VENTANILLA DIGITAL CONFORME A LA REGLA 1.9.18. DE LAS
83
+ REGLAS GENERALES DE
84
+ COMERCIO EXTERIOR VIGENTES.
85
+ SE EFECTÚA LA TRANSMISIÓN DIGITAL DE CONFORMIDAD A LA REGLA 3.1.17. Y 3.1.31. DE
86
+ LAS REGLAS GENERALES
87
+ DE COMERCIO EXTERIOR VIGENTES.
88
+ LA INFORMACIÓN CONTENIDA EN ESTE PEDIMENTO FUE SUMINISTRADA POR EL IMPORTADOR DE
89
+ CONFORMIDAD CON EL
90
+ ARTICULO 54 DE LA LEY ADUANERA EN VIGOR.
91
+ SE EXIME NOM-024-SCFI-2013 EN TERMINOS DEL NUMERAL 10, FRACC. X INCISO H, IMPORT
92
+ ACIÓN DEFINITIVA,
93
+ TRATÁNDOSE DE IMPORTADORES QUE CUENTEN CON UN PROSEC.
94
+ SE EXIME NOM-003-SCFI-2014 EN TERMINOS DEL NUMERAL 10, FRACC. X INCISO H, IMPORT
95
+ ACIÓN DEFINITIVA,
96
+ TRATÁNDOSE DE IMPORTADORES QUE CUENTEN CON UN PROSEC.
97
+ JOAQUIN GOMEZ ABAD
98
+ AGENTE ADUANAL, AGENCIA ADUANAL, APODERADO ADUANAL O DE ALMACEN
99
+ NOMBRE O RAZ. SOC.:
100
+ RFC: GAA1003111U6 GOAJ641219HDFMBQ09 CURP:
101
+ e.firma:
102
+ NUMERO DE SERIE DEL CERTIFICADO: 00001000000705949781
103
+ GOAJ641219QT5 RFC:
104
+ DECLARO BAJO PROTESTA DE DECIR VERDAD, EN LOS TERMINOS
105
+ DE LO DISPUESTO ARTICULO 81 DE LA LEY: PATENTE O
106
+ AUTORIZACIÓN: 3458 GOMEZ ABAD ASESORES EN COMERCIO EXTERIOR S.C.
107
+ FORMA SIMPLIFICADA DEL PEDIMENTO
108
+ SEGUNDA COPIA: IMPORTADOR EXPORTADOR DESTINO/ORIGEN: INTERIOR DEL PAÍS
109
+ REF: NQR26-079 Página 1 de 2
110
+
111
+ -- 1 of 2 --
112
+
113
+ FORMA SIMPLIFICADA DEL PEDIMENTO
114
+ SEGUNDA COPIA: IMPORTADOR EXPORTADOR DESTINO/ORIGEN: INTERIOR DEL PAÍS
115
+ REF: NQR26-079 Página 1 de 2
116
+ A1 CVE. PEDIMENTO: IMP T. OPER 26 64 3458 6000079 NUM. PEDIMENTO:
117
+ CURP:
118
+ RFC: CSM9204097Q1
119
+ ****** ****** ********** ********** FIN DE PEDIMENTO NUM. TOTAL DE PARTID
120
+ AS: CLAVE PREVALIDADOR: 010 1
121
+ ANEXO DEL PEDIMENTO
122
+ SEGUNDA COPIA: IMPORTADOR EXPORTADOR DESTINO/ORIGEN: INTERIOR DEL PAÍS
123
+ REF: NQR26-079 Página 2 de 2`;
124
+
125
+ describe('pedimento_simplificado matcher — DEL PEDIMENTO variant', () => {
126
+ it('dispatcher resolves NQR26-079 (DEL PEDIMENTO) as pedimento_simplificado', () => {
127
+ const [detectedType, , pedimento] = extractDocumentFields(
128
+ SIMP_DEL_NQR26079_TEXT,
129
+ 'pdf',
130
+ '/scans/CSM9204097Q1/NQR26-079.pdf',
131
+ );
132
+
133
+ // Regression: previously this resolved to `factura_comercial` because
134
+ // (1) the title regex demanded "DE PEDIMENTO" (this PDF says "DEL") and
135
+ // (2) the header trio required a colon after "T. OPER" (this PDF omits it).
136
+ expect(detectedType).toBe('pedimento_simplificado');
137
+ expect(pedimento).toBe('266434586000079');
138
+ });
139
+ });
140
+
141
+ describe('pedimento_simplificado matcher — header trio without colon after T. OPER', () => {
142
+ // Minimal text: the title is the canonical "DE PEDIMENTO", so the title
143
+ // fast path matches and the header-trio fallback is never reached here.
144
+ // The body still prints "T. OPER" without a trailing colon, the layout the
145
+ // trio must tolerate when OPER is a column header (value in next cell).
146
+ const FALLBACK_TEXT = `FORMA SIMPLIFICADA DE PEDIMENTO
147
+ NUM. PEDIMENTO: 22 07 3429 2002089 T. OPER IMP CVE. PEDIMENTO: A1
148
+ DATOS DEL IMPORTADOR
149
+ PATENTE: 3429 PEDIMENTO: 2002089 ADUANA: 070
150
+ FECHA DE PAGO: 01/02/2023`;
151
+
152
+ it('resolves via fast-path "FORMA SIMPLIFICADA DE PEDIMENTO" header', () => {
153
+ const [detectedType] = extractDocumentFields(
154
+ FALLBACK_TEXT,
155
+ 'pdf',
156
+ '/scans/SAMPLE/pedimento.pdf',
157
+ );
158
+ expect(detectedType).toBe('pedimento_simplificado');
159
+ });
160
+ });
161
+
162
+ describe('pedimento_simplificado matcher — title accepts both DE and DEL', () => {
163
+ // Same minimal body, only the title differs. Both variants are produced
164
+ // by different prevalidators / agencias in the wild, and BOTH must
165
+ // resolve to pedimento_simplificado.
166
+ const body = `
167
+ NUM. PEDIMENTO: 22 07 3429 2002089 T. OPER IMP CVE. PEDIMENTO: A1
168
+ DATOS DEL IMPORTADOR
169
+ PATENTE: 3429 PEDIMENTO: 2002089 ADUANA: 070
170
+ FECHA DE PAGO: 01/02/2023`;
171
+
172
+ it.each([
173
+ ['FORMA SIMPLIFICADA DE PEDIMENTO', 'pedimento_simplificado'],
174
+ ['FORMA SIMPLIFICADA DEL PEDIMENTO', 'pedimento_simplificado'],
175
+ ['forma simplificada de pedimento', 'pedimento_simplificado'], // case-insensitive
176
+ ['FORMA SIMPLIFICADA DEL PEDIMENTO', 'pedimento_simplificado'], // extra spaces
177
+ ])('title "%s" resolves to %s', (title, expected) => {
178
+ const [detectedType] = extractDocumentFields(
179
+ `${title}\n${body}`,
180
+ 'pdf',
181
+ '/scans/SAMPLE/pedimento.pdf',
182
+ );
183
+ expect(detectedType).toBe(expected);
184
+ });
185
+ });