@arela/uploader 1.0.22 → 1.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@arela/uploader",
3
- "version": "1.0.22",
3
+ "version": "1.0.24",
4
4
  "description": "CLI to upload files/directories to Arela",
5
5
  "bin": {
6
6
  "arela": "./src/index.js"
@@ -0,0 +1,243 @@
1
+ /**
2
+ * Scoring engine validation harness (PROTOTYPE).
3
+ *
4
+ * Runs the CURRENT first-match-wins engine (`extractDocumentFields`) and the new
5
+ * best-match scoring engine (`classifyDocument`) over the same corpus and prints
6
+ * a side-by-side comparison so we can confirm best-match reproduces (or
7
+ * improves on) the current behaviour before wiring it into the pipeline.
8
+ *
9
+ * Usage:
10
+ * node scripts/scoring-compare.js # built-in synthetic samples
11
+ * node scripts/scoring-compare.js <folder> # + real .pdf/.xml/.txt files
12
+ *
13
+ * The built-in samples include the `factura_inter_agencia` vs
14
+ * `factura_comercial` case, which the current engine only resolves via
15
+ * registration order — the harness shows best-match resolving it by score,
16
+ * independent of matcher order.
17
+ */
18
+ import fs from 'fs';
19
+ import path from 'path';
20
+
21
+ import { extractDocumentFields } from '../src/document-type-shared.js';
22
+ import FileDetectionService from '../src/file-detection.js';
23
+ import { classifyDocument, scoreAll } from '../src/scoring/scoring-engine.js';
24
+ import { scoringMatchers } from '../src/scoring/matchers-seed.js';
25
+
26
+ // --------------------------- synthetic corpus -------------------------------
27
+ // Compact, representative texts that trigger the relevant clues. Real pdf-parse
28
+ // output is messier — pass a folder to validate against production documents.
29
+ const SAMPLES = [
30
+ {
31
+ name: 'simplificado-paid',
32
+ extension: 'pdf',
33
+ expected: 'pedimento_simplificado',
34
+ text: `FORMA SIMPLIFICADA DEL PEDIMENTO
35
+ NUM. PEDIMENTO: 26 07 3429 6000079
36
+ CVE. PEDIMENTO: A1
37
+ T. OPER: IMP
38
+ RFC: CSM9204097Q1
39
+ FECHA DE PAGO: 04/03/2026
40
+ *** PAGO ELECTRONICO ***`,
41
+ },
42
+ {
43
+ name: 'simplificado-unpaid (proforma)',
44
+ extension: 'pdf',
45
+ expected: 'proforma',
46
+ text: `FORMA SIMPLIFICADA DE PEDIMENTO
47
+ NUM. PEDIMENTO: 26 07 3429 6000080
48
+ CVE. PEDIMENTO: A1
49
+ T. OPER: IMP
50
+ RFC: CSM9204097Q1
51
+ *** NO PAGADO ***`,
52
+ },
53
+ {
54
+ name: 'completo',
55
+ extension: 'pdf',
56
+ expected: 'pedimento_completo',
57
+ text: `NUM. PEDIMENTO: 26 07 3429 2002089
58
+ CVE. PEDIMENTO: A1
59
+ T. OPER: IMP
60
+ SEGUNDA COPIA TRANSPORTISTA
61
+ CERTIFICACIONES
62
+ CUADRO DE LIQUIDACION
63
+ *** PAGO ELECTRONICO ***
64
+ FECHA DE PAGO: 02/03/2026`,
65
+ },
66
+ {
67
+ name: 'completo-xml',
68
+ extension: 'xml',
69
+ filePath: '/tmp/260734296016642.xml',
70
+ expected: 'pedimento_completo_xml',
71
+ text: `<?xml version="1.0"?>
72
+ <ns2:consultarPedimentoCompletoRespuesta>
73
+ <ns2:pedimento>6016642</ns2:pedimento>
74
+ <ns2:aduanaEntradaSalida><ns2:clave>70</ns2:clave></ns2:aduanaEntradaSalida>
75
+ <ns2:fechas><ns2:clave>2</ns2:clave><ns2:fecha>2026-03-02-06:00</ns2:fecha></ns2:fechas>
76
+ <ns2:fechas><ns2:clave>5</ns2:clave><ns2:fecha>2026-02-20-06:00</ns2:fecha></ns2:fechas>
77
+ <ns2:rfc>CSM9204097Q1</ns2:rfc>
78
+ </ns2:consultarPedimentoCompletoRespuesta>`,
79
+ },
80
+ {
81
+ name: 'doda-pdf',
82
+ extension: 'pdf',
83
+ expected: 'doda_pdf',
84
+ text: `DOCUMENTO DE OPERACION PARA DESPACHO ADUANERO
85
+ DODA
86
+ VUCEM
87
+ ||070|3429|2|4009029|109335668|A231|
88
+ 2026-03-02`,
89
+ },
90
+ {
91
+ name: 'doda-xml',
92
+ extension: 'xml',
93
+ expected: 'doda_xml',
94
+ text: `<?xml version="1.0"?>
95
+ <documentoOperacion>
96
+ <numPedimento>260734292002089</numPedimento>
97
+ <patenteAduanal>3429</patenteAduanal>
98
+ <aduanaDespacho>07</aduanaDespacho>
99
+ </documentoOperacion>`,
100
+ },
101
+ {
102
+ name: 'inter-agencia (vs comercial)',
103
+ extension: 'xml',
104
+ expected: 'factura_inter_agencia',
105
+ text: `<cfdi:Comprobante xmlns:cfdi="..." TipoDeComprobante="I">
106
+ <cfdi:Emisor Rfc="NAA120215F20"/>
107
+ <cfdi:Receptor Rfc="PCC1008161WA"/>
108
+ <cfdi:Concepto ClaveProdServ="78141502" Descripcion="Servicios de agente aduanal"/>
109
+ </cfdi:Comprobante>`,
110
+ },
111
+ {
112
+ name: 'factura-comercial',
113
+ extension: 'xml',
114
+ expected: 'factura_comercial',
115
+ text: `<cfdi:Comprobante xmlns:cfdi="..." TipoDeComprobante="I">
116
+ <cfdi:Emisor Rfc="ABC010101AB1"/>
117
+ <cfdi:Receptor Rfc="XYZ020202CD2"/>
118
+ <tfd:TimbreFiscalDigital/>
119
+ pedimento 26 07 3429 6016477
120
+ </cfdi:Comprobante>`,
121
+ },
122
+ {
123
+ name: 'support-document',
124
+ extension: 'xml',
125
+ expected: 'support_document',
126
+ text: `<?xml version="1.0"?>
127
+ <soapenv:Envelope xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/">
128
+ <oxml:tipoOperacion>IMP</oxml:tipoOperacion>
129
+ <oxml:patenteAduanal>3429</oxml:patenteAduanal>
130
+ </soapenv:Envelope>`,
131
+ },
132
+ ];
133
+
134
+ // --------------------------- comparison -------------------------------------
135
+ function firstMatchType(source, extension, filePath) {
136
+ const [type] = extractDocumentFields(source, extension, filePath);
137
+ return type;
138
+ }
139
+
140
+ function bestMatchResult(source, extension, filePath) {
141
+ return classifyDocument(scoringMatchers, { source, extension, filePath });
142
+ }
143
+
144
+ function topCandidates(source, extension, filePath, n = 3) {
145
+ return scoreAll(scoringMatchers, {
146
+ source,
147
+ extension,
148
+ fileName: filePath ? path.basename(filePath) : '',
149
+ })
150
+ .slice(0, n)
151
+ .map((c) => `${c.documentType}:${c.score}`)
152
+ .join(', ');
153
+ }
154
+
155
+ function row(name, first, best, expected) {
156
+ const agree = first === best ? 'sí ' : 'NO ';
157
+ const vsExp = expected ? (best === expected ? 'ok ' : '⚠️ ') : ' ';
158
+ return (
159
+ `${name.padEnd(34)} first=${String(first).padEnd(24)} ` +
160
+ `best=${String(best).padEnd(24)} coinciden=${agree} esperado=${vsExp}`
161
+ );
162
+ }
163
+
164
+ async function run() {
165
+ const folder = process.argv[2];
166
+ let total = 0;
167
+ let disagreements = 0;
168
+
169
+ console.log('\n=== Muestras sintéticas ===');
170
+ for (const s of SAMPLES) {
171
+ const first = firstMatchType(s.text, s.extension, s.filePath);
172
+ const best = bestMatchResult(s.text, s.extension, s.filePath).detectedType;
173
+ total++;
174
+ if (first !== best) disagreements++;
175
+ console.log(row(s.name, first, best, s.expected));
176
+ }
177
+
178
+ // Order-independence demonstration for the inter-agencia/comercial case.
179
+ const ia = SAMPLES.find((s) => s.name.startsWith('inter-agencia'));
180
+ const reversed = [...scoringMatchers].reverse();
181
+ const normalWinner = classifyDocument(scoringMatchers, {
182
+ source: ia.text,
183
+ extension: ia.extension,
184
+ }).detectedType;
185
+ const reversedWinner = classifyDocument(reversed, {
186
+ source: ia.text,
187
+ extension: ia.extension,
188
+ }).detectedType;
189
+ console.log('\n=== Independencia de orden (inter-agencia) ===');
190
+ console.log(`candidatos (por score): ${topCandidates(ia.text, ia.extension)}`);
191
+ console.log(`seed normal -> ${normalWinner}`);
192
+ console.log(`seed invertido-> ${reversedWinner}`);
193
+ console.log(
194
+ `order-independent: ${normalWinner === reversedWinner ? 'sí ✅' : 'NO ❌'}`,
195
+ );
196
+
197
+ // Optional: real files from a folder.
198
+ if (folder) {
199
+ if (!fs.existsSync(folder)) {
200
+ console.error(`\nCarpeta no existe: ${folder}`);
201
+ } else {
202
+ console.log(`\n=== Archivos reales (${folder}) ===`);
203
+ const detection = new FileDetectionService();
204
+ const files = walk(folder).filter((f) =>
205
+ ['.pdf', '.xml', '.txt'].includes(path.extname(f).toLowerCase()),
206
+ );
207
+ for (const file of files) {
208
+ const ext = path.extname(file).toLowerCase().replace('.', '');
209
+ let text = '';
210
+ try {
211
+ text =
212
+ ext === 'pdf'
213
+ ? await detection.extractTextFromPDF(file)
214
+ : fs.readFileSync(file, 'utf8');
215
+ } catch (err) {
216
+ console.log(`${path.basename(file).padEnd(34)} ERROR: ${err.message}`);
217
+ continue;
218
+ }
219
+ const first = firstMatchType(text, ext, file);
220
+ const best = bestMatchResult(text, ext, file).detectedType;
221
+ total++;
222
+ if (first !== best) disagreements++;
223
+ console.log(row(path.basename(file), first, best, null));
224
+ }
225
+ }
226
+ }
227
+
228
+ console.log(
229
+ `\n=== Resumen: ${total} documentos, ${disagreements} divergencias first-vs-best ===\n`,
230
+ );
231
+ }
232
+
233
+ function walk(dir) {
234
+ const out = [];
235
+ for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
236
+ const full = path.join(dir, entry.name);
237
+ if (entry.isDirectory()) out.push(...walk(full));
238
+ else out.push(full);
239
+ }
240
+ return out;
241
+ }
242
+
243
+ run();
@@ -0,0 +1,96 @@
1
+ /**
2
+ * Phase 4 validation: runs the REAL runtime path the uploader now uses
3
+ * (DB-shape matchers -> adaptDbMatchers -> classifyDocument with rich extraction)
4
+ * against a corpus and compares it to the legacy engine (extractDocumentFields).
5
+ *
6
+ * Usage: node scripts/scoring-phase4-check.js <folder>
7
+ */
8
+ import fs from 'fs';
9
+ import path from 'path';
10
+
11
+ import { extractDocumentFields } from '../src/document-type-shared.js';
12
+ import FileDetectionService from '../src/file-detection.js';
13
+ import { adaptDbMatchers } from '../src/scoring/db-matcher-adapter.js';
14
+ import { scoringMatchers } from '../src/scoring/matchers-seed.js';
15
+ import { classifyDocument } from '../src/scoring/scoring-engine.js';
16
+
17
+ // Serialize the local seed to the shape the API `/resolved` endpoint returns,
18
+ // so we exercise the adapter exactly as in production.
19
+ function toDbShape(matchers) {
20
+ return matchers.map((m) => ({
21
+ documentType: m.documentType,
22
+ extensions: m.extensions,
23
+ minScore: m.minScore ?? null,
24
+ priority: m.priority ?? 0,
25
+ qualify: m.qualify ?? null,
26
+ clues: (m.clues || []).map((c) => ({
27
+ kind: c.kind,
28
+ pattern: c.pattern instanceof RegExp ? c.pattern.source : c.pattern,
29
+ flags: c.pattern instanceof RegExp ? c.pattern.flags : c.flags || '',
30
+ weight: c.weight ?? 1,
31
+ group: c.group ?? null,
32
+ required: !!c.required,
33
+ negative: !!c.negative,
34
+ })),
35
+ fieldExtractors: [], // rich extraction comes from the registry by documentType
36
+ }));
37
+ }
38
+
39
+ const adapted = adaptDbMatchers(toDbShape(scoringMatchers));
40
+
41
+ function walk(dir) {
42
+ const out = [];
43
+ for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
44
+ const full = path.join(dir, e.name);
45
+ if (e.isDirectory()) out.push(...walk(full));
46
+ else out.push(full);
47
+ }
48
+ return out;
49
+ }
50
+
51
+ async function run() {
52
+ const folder = process.argv[2];
53
+ if (!folder) {
54
+ console.error('Pass a folder: node scripts/scoring-phase4-check.js <folder>');
55
+ process.exit(1);
56
+ }
57
+ const detection = new FileDetectionService();
58
+ const files = walk(folder).filter((f) =>
59
+ ['.pdf', '.xml', '.txt'].includes(path.extname(f).toLowerCase()),
60
+ );
61
+
62
+ let total = 0;
63
+ let diverge = 0;
64
+ const patterns = {};
65
+
66
+ for (const file of files) {
67
+ const ext = path.extname(file).toLowerCase().replace('.', '');
68
+ let text = '';
69
+ try {
70
+ text =
71
+ ext === 'pdf'
72
+ ? await detection.extractTextFromPDF(file)
73
+ : fs.readFileSync(file, 'utf8');
74
+ } catch {
75
+ continue;
76
+ }
77
+ const legacy = extractDocumentFields(text, ext, file)[0];
78
+ const phase4 = classifyDocument(adapted, {
79
+ source: text,
80
+ extension: ext,
81
+ filePath: file,
82
+ }).detectedType;
83
+ total++;
84
+ if (legacy !== phase4) {
85
+ diverge++;
86
+ const key = `${legacy} -> ${phase4}`;
87
+ patterns[key] = (patterns[key] || 0) + 1;
88
+ console.log(`NO ${path.basename(file).padEnd(40)} ${key}`);
89
+ }
90
+ }
91
+
92
+ console.log(`\n=== Fase 4 vs legacy: ${total} docs, ${diverge} divergencias ===`);
93
+ for (const [k, n] of Object.entries(patterns)) console.log(` ${n}× ${k}`);
94
+ }
95
+
96
+ run();
@@ -8,11 +8,11 @@ import appConfig from '../config/config.js';
8
8
  import ErrorHandler from '../errors/ErrorHandler.js';
9
9
  import { ConfigurationError } from '../errors/ErrorTypes.js';
10
10
  import FileDetectionService from '../file-detection.js';
11
+ import { adaptDbMatchers } from '../scoring/db-matcher-adapter.js';
12
+ import { scoringMatchers } from '../scoring/matchers-seed.js';
11
13
 
12
14
  /**
13
- * Paid pedimento detected_type values. `pedimento_completo_xml` is included
14
- * even though the XML matcher is currently disabled in the registry so that
15
- * re-enabling it requires no changes here.
15
+ * Paid pedimento detected_type values.
16
16
  */
17
17
  const DETECTED_PEDIMENTO_TYPES = new Set([
18
18
  'pedimento_simplificado',
@@ -71,6 +71,32 @@ export class IdentifyCommand {
71
71
  );
72
72
  this.scanApiService = new ScanApiService(apiTarget);
73
73
 
74
+ // Load matchers for best-match classification (phase 4 hybrid). Prefer the
75
+ // DB-resolved set (this RFC + globals); fall back to the validated local
76
+ // seed; set DISABLE_SCORING_MATCHERS=true to force legacy first-match.
77
+ if (process.env.DISABLE_SCORING_MATCHERS === 'true') {
78
+ logger.info('🧩 Scoring matchers disabled — legacy detection');
79
+ } else {
80
+ let matchers = null;
81
+ try {
82
+ const rfc = process.env.MATCHER_RFC || null;
83
+ const dbMatchers = await this.scanApiService.getResolvedMatchers(rfc);
84
+ if (dbMatchers.length) {
85
+ matchers = adaptDbMatchers(dbMatchers);
86
+ logger.info(`🧩 Loaded ${matchers.length} matchers from API`);
87
+ }
88
+ } catch (err) {
89
+ logger.warn(`🧩 Could not load matchers from API: ${err.message}`);
90
+ }
91
+ if (!matchers) {
92
+ matchers = scoringMatchers;
93
+ logger.info(`🧩 Using local seed matchers (${matchers.length})`);
94
+ }
95
+ if (typeof this.detectionService.setMatchers === 'function') {
96
+ this.detectionService.setMatchers(matchers);
97
+ }
98
+ }
99
+
74
100
  const scanConfig = appConfig.getScanConfig();
75
101
  const batchSize = parseInt(options.batchSize) || 100;
76
102
 
@@ -543,13 +569,15 @@ export class IdentifyCommand {
543
569
 
544
570
  // Check if the text contains any required pedimento marker. This must
545
571
  // stay aligned with the `match()` predicates in pedimento-simplificado.js
546
- // and pedimento-completo.js.
572
+ // and pedimento-completo.js (which accept both "DE" and "DEL" in the
573
+ // title, and treat the colon after "T. OPER" as optional).
547
574
  const text = result.text || '';
548
- const hasSimplificadoMarker = /FORMA SIMPLIFICADA DE PEDIMENTO/i.test(text);
575
+ const hasSimplificadoMarker =
576
+ /FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i.test(text);
549
577
  const hasCompletoMarkers =
550
578
  /NUM\.?\s*PEDIMENTO:/i.test(text) &&
551
579
  /CVE\.?\s*PEDIMENTO:/i.test(text) &&
552
- /T\.?\s*OPER:/i.test(text);
580
+ /T\.?\s*OPER:?/i.test(text);
553
581
 
554
582
  return !hasSimplificadoMarker && !hasCompletoMarkers;
555
583
  }
@@ -579,6 +579,9 @@ export class ScanCommand {
579
579
  * Normalize file record for database insertion
580
580
  * Stores paths with forward slashes for consistency but keeps them absolute
581
581
  * Sets likelySimplificado to true if file is a PDF and filename contains 'simp'
582
+ * Sets likelyInterAgencia to true if filename matches an inter-agency CFDI
583
+ * pattern (e.g. SICINGR*), so the API forces these XML/PDF through detection
584
+ * even though they lack the 'simp/pedim/covefact' heuristic.
582
585
  * @private
583
586
  */
584
587
  #normalizeFileRecord(filePath, fileStats, basePath, scanTimestamp) {
@@ -600,6 +603,17 @@ export class ScanCommand {
600
603
  const likelySimplificado =
601
604
  fileExtension === 'pdf' && /(simp|pedim|covefact)/i.test(fileName);
602
605
 
606
+ // Flag inter-agency CFDIs by filename so detection picks them up.
607
+ // Patterns are configurable via SCAN_INTER_AGENCIA_PATTERNS env var
608
+ // (see config.js). Only meaningful for PDF and XML.
609
+ let likelyInterAgencia = false;
610
+ if (fileExtension === 'pdf' || fileExtension === 'xml') {
611
+ const patterns = appConfig.scan.interAgenciaPatterns;
612
+ if (patterns && patterns.length > 0) {
613
+ likelyInterAgencia = patterns.some((re) => re.test(fileName));
614
+ }
615
+ }
616
+
603
617
  return {
604
618
  fileName,
605
619
  fileExtension,
@@ -610,6 +624,7 @@ export class ScanCommand {
610
624
  modifiedAt: fileStats.mtime.toISOString(),
611
625
  scanTimestamp,
612
626
  likelySimplificado,
627
+ likelyInterAgencia,
613
628
  };
614
629
  }
615
630
 
@@ -37,10 +37,10 @@ class Config {
37
37
  const __dirname = path.dirname(__filename);
38
38
  const packageJsonPath = path.resolve(__dirname, '../../package.json');
39
39
  const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf-8'));
40
- return packageJson.version || '1.0.22';
40
+ return packageJson.version || '1.0.24';
41
41
  } catch (error) {
42
42
  console.warn('⚠️ Could not read package.json version, using fallback');
43
- return '1.0.22';
43
+ return '1.0.24';
44
44
  }
45
45
  }
46
46
 
@@ -294,6 +294,31 @@ class Config {
294
294
  .map((p) => p.trim())
295
295
  .filter(Boolean);
296
296
 
297
+ // Parse inter-agency CFDI filename patterns. Files whose basename matches
298
+ // any of these regex patterns are flagged at scan time (likelyInterAgencia)
299
+ // so the API forces them through detection and the factura_inter_agencia
300
+ // matcher can classify them. The push pipeline then excludes them (see
301
+ // NON_PUSHABLE_TYPES_SQL in arela-api). Comma-separated regex source list.
302
+ // Default: ^SICINGR — covers NORCOM's SICINGR70-NNNNNN(...).pdf/.XML files.
303
+ const defaultInterAgenciaPatterns = '^SICINGR';
304
+ const interAgenciaPatterns = (
305
+ process.env.SCAN_INTER_AGENCIA_PATTERNS || defaultInterAgenciaPatterns
306
+ )
307
+ .split(',')
308
+ .map((p) => p.trim())
309
+ .filter(Boolean)
310
+ .map((p) => {
311
+ try {
312
+ return new RegExp(p, 'i');
313
+ } catch (err) {
314
+ console.warn(
315
+ `⚠️ Invalid SCAN_INTER_AGENCIA_PATTERNS regex "${p}": ${err.message}`,
316
+ );
317
+ return null;
318
+ }
319
+ })
320
+ .filter(Boolean);
321
+
297
322
  // Generate table name if all components are available
298
323
  // Note: This is just for reference; actual table names are generated dynamically
299
324
  // in ScanCommand based on discovered directories and levels
@@ -312,6 +337,7 @@ class Config {
312
337
  basePathFull: basePathLabel, // Renamed for consistency
313
338
  tableName,
314
339
  excludePatterns,
340
+ interAgenciaPatterns,
315
341
  batchSize: parseInt(process.env.SCAN_BATCH_SIZE) || 2000,
316
342
  directoryLevel: parseInt(process.env.SCAN_DIRECTORY_LEVEL) || 0,
317
343
  };
@@ -1,10 +1,10 @@
1
1
  // Import all document type definitions
2
2
  import { dodaPdfDefinition } from './document-types/doda-pdf.js';
3
3
  import { dodaXmlDefinition } from './document-types/doda-xml.js';
4
+ import { facturaInterAgenciaDefinition } from './document-types/factura-inter-agencia.js';
4
5
  import { facturasComerciales } from './document-types/facturas-comerciales.js';
6
+ import { pedimentoCompletoXmlDefinition } from './document-types/pedimento-completo-xml.js';
5
7
  import { pedimentoCompletoDefinition } from './document-types/pedimento-completo.js';
6
- // TODO: enable XML pedimento detection — implementation ready in pedimento-completo-xml.js
7
- // import { pedimentoCompletoXmlDefinition } from './document-types/pedimento-completo-xml.js';
8
8
  import { pedimentoSimplificadoDefinition } from './document-types/pedimento-simplificado.js';
9
9
  import { proformaDefinition } from './document-types/proforma.js';
10
10
  import { supportDocumentDefinition } from './document-types/support-document.js';
@@ -45,14 +45,14 @@ export class DocumentTypeDefinition {
45
45
  const documentTypes = [
46
46
  pedimentoSimplificadoDefinition,
47
47
  pedimentoCompletoDefinition,
48
- // TODO: enable XML pedimento detection — uncomment the next line and the
49
- // matching import at the top of this file. All downstream code
50
- // (composeArelaPath, arela-api SQL filters, IdentifyCommand counters)
51
- // already accepts `pedimento_completo_xml`.
52
- // pedimentoCompletoXmlDefinition,
48
+ pedimentoCompletoXmlDefinition,
53
49
  supportDocumentDefinition,
54
50
  dodaPdfDefinition,
55
51
  dodaXmlDefinition,
52
+ // factura_inter_agencia MUST be evaluated BEFORE facturasComerciales
53
+ // because a NORCOM↔PALCO CFDI would also match the generic commercial
54
+ // invoice matcher. First match wins (see extractDocumentFields).
55
+ facturaInterAgenciaDefinition,
56
56
  facturasComerciales,
57
57
  // Add more document types here as needed
58
58
  ];
@@ -114,6 +114,14 @@ export function extractDocumentFields(source, fileExtension, filePath) {
114
114
  ? docType.extractPedimentoYear(source, fields, filePath)
115
115
  : null;
116
116
 
117
+ // Ensure downstream code (composeArelaPath) sees `numPedimento` as a
118
+ // field. PDF matchers add it via an explicit extractor; XML matchers
119
+ // compose it externally via extractNumPedimento. Backfill so both paths
120
+ // expose the same shape.
121
+ if (pedimento && !fields.some((f) => f.name === 'numPedimento')) {
122
+ fields.push(new FieldResult('numPedimento', true, pedimento));
123
+ }
124
+
117
125
  return [resolvedType, fields, pedimento, year];
118
126
  }
119
127
  }
@@ -186,15 +186,34 @@ export const paymentDateExtractor = {
186
186
  field: 'paymentDate',
187
187
  extract: (source) => {
188
188
  const patterns = [
189
- /FECHA\s+DE\s+PAGO:?\s*(\d{2}\/\d{2}\/\d{4})/i,
190
- /FECHA\s+DE\s+PAGO:?\s*(\d{4}\/\d{2}\/\d{2})/i,
191
- /2\s+PAGO:\s*(\d{2}\/\d{2}\/\d{4})/,
192
- /(?:^|\n)\s*PAGO\s+(\d{2}\/\d{2}\/\d{4})/i,
193
- /PRESENTACION:\s*(\d{2}\/\d{2}\/\d{4})/i,
189
+ /FECHA\s+DE\s+PAGO:?\s*(\d{2}\/\d{2}\/\d{4})/i, // 0: explicit label DD/MM/YYYY
190
+ /FECHA\s+DE\s+PAGO:?\s*(\d{4}\/\d{2}\/\d{2})/i, // 1: explicit label YYYY/MM/DD
191
+ /2\s+PAGO:\s*(\d{2}\/\d{2}\/\d{4})/, // 2: forma simplificada scheduled date ⚠️
192
+ /(?:^|\n)\s*PAGO\s+(\d{2}\/\d{2}\/\d{4})/i, // 3: PAGO at line start (original)
193
+ /(?<=\d)PAGO\s+(\d{2}\/\d{2}\/\d{4})/i, // 4: PAGO after digit (pdf-parse artifact)
194
+ /(\d{2}\/\d{2}\/\d{4})[ \t]+PAGO[ \t]*$/im, // 5: reversed layout — date before PAGO (FECHAS column)
195
+ // 6: forma simplificada — pdf-parse extracts table cells out of order, so the
196
+ // label "FECHA DE PAGO:" can appear on its own line and the value (along with
197
+ // other cells like línea de captura, pedimento, importe) follows several lines
198
+ // later. Take the FIRST dd/mm/yyyy after the label within a 400-char window.
199
+ // Safe because `isNoPagado` short-circuits documents without a real payment,
200
+ // so we won't grab the unrelated ENTRADA date from the "FECHAS:" block above.
201
+ /FECHA\s+DE\s+PAGO:[\s\S]{1,400}?(\d{2}\/\d{2}\/\d{4})/i,
202
+ /PRESENTACION:\s*(\d{2}\/\d{2}\/\d{4})/i, // 7: fallback
194
203
  ];
195
- for (const re of patterns) {
196
- const m = source.match(re);
197
- if (m) return new FieldResult('paymentDate', true, m[1]);
204
+ // "*** NO PAGADO" is the explicit SAT marker that no payment has been
205
+ // certified. When present, the bank-certification block is physically
206
+ // absent, so any date matched by the fallback patterns (e.g.
207
+ // "2 PAGO:" with a scheduled date, or "PRESENTACION:") would be a false
208
+ // positive. Return null outright — the document is classified as proforma.
209
+ const isNoPagado = /\*{3}\s*NO\s+PAGADO/i.test(source);
210
+ if (isNoPagado) {
211
+ return new FieldResult('paymentDate', false, null);
212
+ }
213
+ for (const pattern of patterns) {
214
+ const m = source.match(pattern);
215
+ if (!m) continue;
216
+ return new FieldResult('paymentDate', true, m[1]);
198
217
  }
199
218
  return new FieldResult('paymentDate', false, null);
200
219
  },