@arela/uploader 1.0.22 → 1.0.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/scoring-compare.js +243 -0
- package/scripts/scoring-phase4-check.js +96 -0
- package/src/commands/IdentifyCommand.js +34 -6
- package/src/commands/ScanCommand.js +15 -0
- package/src/config/config.js +28 -2
- package/src/document-type-shared.js +15 -7
- package/src/document-types/_pedimento-shared-extractors.js +27 -8
- package/src/document-types/factura-inter-agencia.js +186 -0
- package/src/document-types/pedimento-completo-xml.js +62 -12
- package/src/document-types/pedimento-completo.js +5 -3
- package/src/document-types/pedimento-simplificado.js +5 -2
- package/src/document-types/proforma.js +2 -2
- package/src/file-detection.js +30 -6
- package/src/scoring/db-matcher-adapter.js +98 -0
- package/src/scoring/matchers-seed.js +386 -0
- package/src/scoring/scoring-engine.js +218 -0
- package/src/services/ScanApiService.js +14 -0
- package/tests/unit/factura-inter-agencia.test.js +218 -0
- package/tests/unit/pedimento-completo-xml-matcher.test.js +271 -0
- package/tests/unit/pedimento-simplificado-matcher.test.js +185 -0
- package/tests/unit/scoring-engine.test.js +221 -0
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
// NOTE: We intentionally do NOT import `FieldResult` from
|
|
2
|
+
// '../document-type-shared.js' to avoid a circular-import TDZ when this
|
|
3
|
+
// module is imported directly (e.g. from unit tests). `FieldResult` is a
|
|
4
|
+
// plain data-class with shape `{ name, found, value }`, so we construct
|
|
5
|
+
// equivalent plain objects locally.
|
|
6
|
+
// Local stand-in for `FieldResult`: packs the three values into a plain
// `{ name, found, value }` record.
const fieldResult = (name, found, value) => {
  const record = { name, found, value };
  return record;
};
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* Factura Inter-Agencia Document Type Definition
|
|
10
|
+
*
|
|
11
|
+
* Detects CFDIs (XML or PDF) issued between customs broker agencies (e.g.,
|
|
12
|
+
* NORCOM ↔ PALCO). These files are dropped into a pedimento folder by the
|
|
13
|
+
* broker but they are NOT part of the customs electronic file (expediente
|
|
14
|
+
* aduanal) — they are inter-agency billing for broker services.
|
|
15
|
+
*
|
|
16
|
+
* Detection rules (ALL required):
|
|
17
|
+
* 1) CFDI markers present (either xml structure or PDF text representation)
|
|
18
|
+
* 2) Both emisor and receptor RFCs belong to the configured agency pair
|
|
19
|
+
* (NAA120215F20 = NORCOM, PCC1008161WA = PALCO) in any direction.
|
|
20
|
+
* 3) At least one concepto with ClaveProdServ 78141502 (Servicios de
|
|
21
|
+
* agentes aduaneros) — confirms the billing is for broker services.
|
|
22
|
+
*
|
|
23
|
+
* IMPORTANT: This matcher MUST be registered BEFORE `facturasComerciales`
|
|
24
|
+
* in document-type-shared.js — both would match a CFDI in a pedimento
|
|
25
|
+
* folder, but inter-agency invoices must take precedence so they are
|
|
26
|
+
* filtered out of the Arela push pipeline (see arela-api
|
|
27
|
+
* NON_PUSHABLE_TYPES_SQL).
|
|
28
|
+
*
|
|
29
|
+
* Currently scope-limited to NORCOM↔PALCO. To widen, move INTER_AGENCIA_RFCS
|
|
30
|
+
* to env config and require ≥2 distinct RFCs from the configured list.
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
/**
|
|
34
|
+
* RFCs of agencies whose mutual invoices should be excluded from the Arela
|
|
35
|
+
* push pipeline. Order is irrelevant — a match is any pair of distinct RFCs
|
|
36
|
+
* from this set appearing as emisor and receptor.
|
|
37
|
+
*/
|
|
38
|
+
export const INTER_AGENCIA_RFCS = ['NAA120215F20', 'PCC1008161WA'];
|
|
39
|
+
|
|
40
|
+
const BROKER_SERVICE_CLAVE_PROD_SERV = '78141502';
|
|
41
|
+
|
|
42
|
+
// Markers found in the raw CFDI XML structure.
const CFDI_XML_MARKERS = [
  /cfdi:Comprobante/i,
  /xmlns:cfdi/i,
  /TipoDeComprobante/i,
];

// Human-readable markers found in text extracted from a printed CFDI (PDF).
// The "cadena original ... certificacion digital del SAT" phrase may wrap
// across lines in extracted text, so the gap is matched with `[\s\S]*?`
// (`.` does not match line terminators).
const CFDI_PDF_MARKERS = [
  /folio\s*fiscal/i,
  /sello\s*digital\s*del\s*cfdi/i,
  /cadena\s*original[\s\S]*?certificaci[oó]n\s*digital\s*del\s*sat/i,
  /representaci[oó]n\s*impresa\s*de\s*un\s*cfdi/i,
];

/**
 * Detect that the source represents a CFDI — either as the original XML
 * structure or as text extracted from a printed CFDI (PDF representation).
 *
 * PDF text loses XML tags, so the human-readable equivalents ("Folio Fiscal",
 * "Sello Digital del CFDI", "Cadena Original ... Certificacion Digital del
 * SAT", "Representacion impresa de un CFDI") are checked instead.
 *
 * @param {string} source - Raw XML or extracted PDF text.
 * @returns {boolean} true when at least two XML markers, or at least two PDF
 *   markers, are present.
 */
function isCfdiContent(source) {
  const countHits = (markers) =>
    markers.filter((re) => re.test(source)).length;

  if (countHits(CFDI_XML_MARKERS) >= 2) return true;
  return countHits(CFDI_PDF_MARKERS) >= 2;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Return the configured inter-agency RFCs that appear in `source`, ordered by
 * the position of their first occurrence in the document (callers use the
 * first entry as the emisor guess and the second as the receptor guess).
 *
 * Matching is case-insensitive and uses word boundaries so substrings inside
 * larger tokens (cert/sello base64) don't produce false positives. Each RFC is
 * regex-escaped before being embedded in the pattern.
 *
 * @param {string} source - Raw XML or extracted PDF text.
 * @param {string[]} [rfcs=INTER_AGENCIA_RFCS] - RFCs to look for.
 * @returns {string[]} Upper-cased, de-duplicated RFCs in document order.
 */
function findInterAgenciaRfcs(source, rfcs = INTER_AGENCIA_RFCS) {
  const hits = [];
  for (const rfc of rfcs) {
    // Escape regex metacharacters so the RFC is always matched literally.
    const literal = rfc.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const m = new RegExp(`\\b${literal}\\b`, 'i').exec(source);
    if (m) hits.push({ rfc: rfc.toUpperCase(), index: m.index });
  }
  hits.sort((a, b) => a.index - b.index);
  return [...new Set(hits.map((hit) => hit.rfc))];
}
|
|
82
|
+
|
|
83
|
+
export const facturaInterAgenciaDefinition = {
  type: 'factura_inter_agencia',
  extensions: ['xml', 'pdf'],

  /**
   * A source matches when ALL of the following hold:
   *  1) it looks like a CFDI (XML structure or printed representation),
   *  2) at least two distinct configured agency RFCs are present,
   *  3) the broker-service ClaveProdServ appears in the text.
   *
   * @param {string} source - Raw XML or extracted PDF text.
   * @returns {boolean}
   */
  match: (source) => {
    if (!isCfdiContent(source)) return false;

    // Need >=2 distinct configured RFCs present (one as emisor, one as receptor)
    const rfcsFound = findInterAgenciaRfcs(source);
    if (rfcsFound.length < 2) return false;

    // Confirm the invoice is for broker services (customs agent services)
    return source.includes(BROKER_SERVICE_CLAVE_PROD_SERV);
  },

  // Pedimento extraction is optional / informational — these files are
  // excluded from push, so arela_path is never composed. We still extract
  // a pedimento number when present (from the "Referencias" / "Pedimento:"
  // section of the printable CFDI) for auditability.
  /**
   * @param {string} source - Unused; the value comes from the extracted fields.
   * @param {Array<{name: string, value: *}>} [fields] - Extracted field results.
   * @returns {string|null} The `numPedimento` field value, or null when absent.
   */
  extractNumPedimento: (source, fields) => {
    return fields?.find((f) => f.name === 'numPedimento')?.value ?? null;
  },

  /**
   * Derive the 4-digit year from the pedimento number.
   *
   * Only a full 15-digit pedimento starts with a YY prefix. The `numPedimento`
   * extractor of this definition recovers an 11-digit value (patente +
   * number, no YY prefix), whose leading digits belong to the patente — so no
   * year is derived from it.
   *
   * @param {string} source - Unused.
   * @param {Array<{name: string, value: *}>} [fields] - Extracted field results.
   * @returns {number|null} Year (YY < 50 maps to 20YY, otherwise 19YY) or null.
   */
  extractPedimentoYear: (source, fields) => {
    const numPedimento = fields?.find((f) => f.name === 'numPedimento')?.value;
    if (typeof numPedimento !== 'string' || !/^\d{15}$/.test(numPedimento)) {
      return null;
    }
    const yy = Number.parseInt(numPedimento.slice(0, 2), 10);
    return yy < 50 ? yy + 2000 : yy + 1900;
  },

  extractors: [
    {
      field: 'rfcEmisor',
      extract: (source) => {
        // XML form: <cfdi:Emisor Rfc="..." />
        const xmlMatch = source.match(
          /<[^>]*Emisor[^>]*Rfc\s*=\s*["']([A-ZÑ&]{3,4}\d{6}[A-Z0-9]{3})["']/i,
        );
        if (xmlMatch) return fieldResult('rfcEmisor', true, xmlMatch[1]);

        // PDF form: no tags to anchor on, so fall back to the first entry
        // returned by findInterAgenciaRfcs.
        const rfcs = findInterAgenciaRfcs(source);
        if (rfcs.length > 0) return fieldResult('rfcEmisor', true, rfcs[0]);

        return fieldResult('rfcEmisor', false, null);
      },
    },
    {
      field: 'rfcReceptor',
      extract: (source) => {
        // XML form: <cfdi:Receptor Rfc="..." />
        const xmlMatch = source.match(
          /<[^>]*Receptor[^>]*Rfc\s*=\s*["']([A-ZÑ&]{3,4}\d{6}[A-Z0-9]{3})["']/i,
        );
        if (xmlMatch) return fieldResult('rfcReceptor', true, xmlMatch[1]);

        // PDF form: second entry returned by findInterAgenciaRfcs.
        const rfcs = findInterAgenciaRfcs(source);
        if (rfcs.length >= 2) {
          return fieldResult('rfcReceptor', true, rfcs[1]);
        }
        return fieldResult('rfcReceptor', false, null);
      },
    },
    {
      field: 'folio',
      extract: (source) => {
        // CFDI Folio attribute
        const xmlMatch = source.match(/\bFolio\s*=\s*["']([A-Z0-9-]+)["']/i);
        if (xmlMatch) return fieldResult('folio', true, xmlMatch[1]);

        // PDF: "Numero Folio 012749"
        const pdfMatch = source.match(/Numero\s+Folio\s+([A-Z0-9-]+)/i);
        if (pdfMatch) return fieldResult('folio', true, pdfMatch[1]);

        return fieldResult('folio', false, null);
      },
    },
    {
      field: 'uuid',
      extract: (source) => {
        // Prefer the UUID attribute of the TimbreFiscalDigital stamp: an XML
        // CFDI may reference other invoices (CfdiRelacionado UUID="...")
        // before its own stamp, so "first UUID in the document" can be wrong.
        const stamped = source.match(
          /TimbreFiscalDigital\b[^>]*?\bUUID\s*=\s*["']([0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12})["']/i,
        );
        if (stamped) return fieldResult('uuid', true, stamped[1].toUpperCase());

        // Fallback (PDF text, or XML without a stamp): first UUID-shaped token.
        const m = source.match(
          /[0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12}/i,
        );
        return fieldResult('uuid', !!m, m ? m[0].toUpperCase() : null);
      },
    },
    {
      field: 'numPedimento',
      extract: (source) => {
        // Printable CFDI "Pedimento: 3458 6000046 Fecha: ..." — recovers an
        // 11-digit pedimento (no YY prefix). Useful for auditability only.
        const m = source.match(/Pedimento:?\s*(\d{4})\s*(\d{7})/i);
        if (m) {
          return fieldResult('numPedimento', true, `${m[1]}${m[2]}`);
        }
        return fieldResult('numPedimento', false, null);
      },
    },
  ],
};
|
|
@@ -1,10 +1,8 @@
|
|
|
1
1
|
// VUCEM "consultarPedimentoCompleto" XML matcher.
|
|
2
2
|
//
|
|
3
|
-
//
|
|
4
|
-
//
|
|
5
|
-
//
|
|
6
|
-
// counters) already includes `pedimento_completo_xml`, so re-enabling is a
|
|
7
|
-
// single-line change.
|
|
3
|
+
// Registered in `document-type-shared.js`. Downstream code
|
|
4
|
+
// (composeArelaPath, arela-api propagation SQL, IdentifyCommand counters)
|
|
5
|
+
// also includes `pedimento_completo_xml`.
|
|
8
6
|
//
|
|
9
7
|
// Filename patterns recognized (try in order — patente extraction):
|
|
10
8
|
// 1) VU_PATENTE_ADUANA_PEDIMENTO.xml → e.g. VU_3429_070_5016101.xml
|
|
@@ -47,6 +45,22 @@ function pad(value, length) {
|
|
|
47
45
|
return String(value).padStart(length, '0');
|
|
48
46
|
}
|
|
49
47
|
|
|
48
|
+
/**
 * Convert a VUCEM `aduanaEntradaSalida.clave` (e.g. "70", "750", "40") to the
 * 2-digit "sección aduanera" prefix used inside the 15-digit pedimento number.
 *
 * VUCEM strips leading zeros from the canonical 3-digit SAT aduana code,
 * so `070` (Ciudad Juárez) arrives as `70`. The pedimento prefix is the
 * first 2 digits of the 3-digit code:
 *   `70`  → `070` → `07` (Cd. Juárez)
 *   `750` → `750` → `75` (Puebla)
 *   `40`  → `040` → `04` (Lázaro Cárdenas)
 *
 * Surrounding whitespace is ignored; a missing or blank clave yields null
 * instead of a bogus "00" prefix.
 *
 * @param {string|number|null|undefined} claveValue - Raw clave value.
 * @returns {string|null} 2-character prefix, or null when no clave is given.
 */
function aduanaToSeccion(claveValue) {
  if (claveValue == null) return null;
  const clave = String(claveValue).trim();
  if (clave === '') return null;
  // Restore the leading zero(s) VUCEM strips, then keep the first two digits.
  return clave.padStart(3, '0').substring(0, 2);
}
|
|
63
|
+
|
|
50
64
|
/**
|
|
51
65
|
* Try the three known filename patterns and return {patente, aduana, pedimento}
|
|
52
66
|
* with any subset of the fields populated. Returns null if no pattern matches.
|
|
@@ -102,12 +116,17 @@ function yyFromIsoDate(iso) {
|
|
|
102
116
|
return m ? m[1].substring(2, 4) : null;
|
|
103
117
|
}
|
|
104
118
|
|
|
105
|
-
// Find <ns2:fechas> block
|
|
106
|
-
|
|
119
|
+
// Find <ns2:fechas> block whose nested <clave> matches `claveValue` and
|
|
120
|
+
// return its <ns2:fecha>. Works for both shapes:
|
|
121
|
+
// <fechas><clave>N</clave><fecha>...</fecha></fechas>
|
|
122
|
+
// <fechas><fecha>...</fecha><tipo><clave>N</clave></tipo></fechas>
|
|
123
|
+
// (firstTag finds the FIRST <clave> in the block — both layouts expose only
|
|
124
|
+
// one clave per fechas entry.)
|
|
125
|
+
function findFechaByClave(source, claveValue) {
|
|
107
126
|
const fechasBlocks = allTagBlocks(source, 'fechas');
|
|
108
127
|
for (const block of fechasBlocks) {
|
|
109
128
|
const clave = firstTag(block, 'clave');
|
|
110
|
-
if (clave ===
|
|
129
|
+
if (clave === claveValue) {
|
|
111
130
|
const fecha = firstTag(block, 'fecha');
|
|
112
131
|
if (fecha) return fecha;
|
|
113
132
|
}
|
|
@@ -115,6 +134,18 @@ function findPaymentDate(source) {
|
|
|
115
134
|
return null;
|
|
116
135
|
}
|
|
117
136
|
|
|
137
|
+
// Payment date of the contributions: the <fechas> entry whose clave is "2".
function findPaymentDate(source) {
  const paymentClave = '2';
  return findFechaByClave(source, paymentClave);
}
|
|
141
|
+
|
|
142
|
+
// Presentation date: the <fechas> entry whose clave is "5". Treated as the
// authoritative source for the pedimento's YY prefix — a pedimento opened in
// Dec-2025 but paid in Jan-2026 keeps the `25` prefix, matching what VUCEM
// stamps in the filename.
function findPresentationDate(source) {
  const presentationClave = '5';
  return findFechaByClave(source, presentationClave);
}
|
|
148
|
+
|
|
118
149
|
// --------------------------- extractors ------------------------------------
|
|
119
150
|
|
|
120
151
|
const rfcExtractor = {
|
|
@@ -152,7 +183,7 @@ const aduanaEntradaSalidaExtractor = {
|
|
|
152
183
|
return new FieldResult(
|
|
153
184
|
'aduanaEntradaSalida',
|
|
154
185
|
!!clave,
|
|
155
|
-
|
|
186
|
+
aduanaToSeccion(clave),
|
|
156
187
|
);
|
|
157
188
|
},
|
|
158
189
|
};
|
|
@@ -165,6 +196,14 @@ const paymentDateExtractor = {
|
|
|
165
196
|
},
|
|
166
197
|
};
|
|
167
198
|
|
|
199
|
+
// Field extractor exposing the presentation date as `presentationDate`;
// `found` is true only when a non-empty date was located.
const presentationDateExtractor = {
  field: 'presentationDate',
  extract(source) {
    const presentedOn = findPresentationDate(source);
    const wasFound = Boolean(presentedOn);
    return new FieldResult('presentationDate', wasFound, presentedOn);
  },
};
|
|
206
|
+
|
|
168
207
|
const fechaPagoRectificacionExtractor = {
|
|
169
208
|
field: 'fechaPagoRectificacion',
|
|
170
209
|
extract: (source) => {
|
|
@@ -257,8 +296,14 @@ export const pedimentoCompletoXmlDefinition = {
|
|
|
257
296
|
|
|
258
297
|
/**
|
|
259
298
|
* Compose the 15-digit pedimento number from XML body + filename.
|
|
260
|
-
* YY:
|
|
261
|
-
*
|
|
299
|
+
* YY: priority order (most authoritative first):
|
|
300
|
+
* 1) Filename pattern 3 (`{15-digit}.xml`) — VUCEM stamps the correct
|
|
301
|
+
* prefix at export time.
|
|
302
|
+
* 2) Fecha de presentacion (<fechas><clave>5) — the year the pedimento
|
|
303
|
+
* was opened. Authoritative for the YY prefix even when payment
|
|
304
|
+
* crosses calendar year (e.g. opened Dec-2025, paid Jan-2026 → YY=25).
|
|
305
|
+
* 3) Rectification fechaPago (only when no presentation date exists).
|
|
306
|
+
* 4) Payment date (last-resort fallback).
|
|
262
307
|
* AA: 2-digit sección aduanera from <aduanaEntradaSalida><clave> (left-padded to 3 digits, first 2 kept; see aduanaToSeccion).
|
|
263
308
|
* PPPP: from the filename (any of the three patterns).
|
|
264
309
|
* NNNNNNN: from <pedimento> padded to 7.
|
|
@@ -267,15 +312,19 @@ export const pedimentoCompletoXmlDefinition = {
|
|
|
267
312
|
extractNumPedimento: (source, fields, filePath) => {
|
|
268
313
|
const parts = parseFilenameParts(filePath);
|
|
269
314
|
|
|
315
|
+
const presentation = fields?.find(
|
|
316
|
+
(f) => f.name === 'presentationDate' && f.found,
|
|
317
|
+
)?.value;
|
|
270
318
|
const rect = fields?.find(
|
|
271
319
|
(f) => f.name === 'fechaPagoRectificacion' && f.found,
|
|
272
320
|
)?.value;
|
|
273
321
|
const pay = fields?.find((f) => f.name === 'paymentDate' && f.found)?.value;
|
|
274
322
|
|
|
275
323
|
let yy =
|
|
324
|
+
(parts && parts.year) ||
|
|
325
|
+
yyFromIsoDate(presentation) ||
|
|
276
326
|
yyFromIsoDate(rect) ||
|
|
277
327
|
yyFromIsoDate(pay) ||
|
|
278
|
-
(parts && parts.year) ||
|
|
279
328
|
null;
|
|
280
329
|
|
|
281
330
|
const aduanaField = fields?.find(
|
|
@@ -315,6 +364,7 @@ export const pedimentoCompletoXmlDefinition = {
|
|
|
315
364
|
tipoOperacionExtractor,
|
|
316
365
|
aduanaEntradaSalidaExtractor,
|
|
317
366
|
paymentDateExtractor,
|
|
367
|
+
presentationDateExtractor,
|
|
318
368
|
fechaPagoRectificacionExtractor,
|
|
319
369
|
coveExtractor,
|
|
320
370
|
numEDocumentoExtractor,
|
|
@@ -17,17 +17,19 @@ export const pedimentoCompletoDefinition = {
|
|
|
17
17
|
type: 'pedimento_completo',
|
|
18
18
|
extensions: ['pdf'],
|
|
19
19
|
match: (source) => {
|
|
20
|
-
// Hard exclude: "FORMA SIMPLIFICADA" is handled by
|
|
21
|
-
|
|
20
|
+
// Hard exclude: "FORMA SIMPLIFICADA [DE|DEL] PEDIMENTO" is handled by
|
|
21
|
+
// pedimento_simplificado.
|
|
22
|
+
if (/FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i.test(source)) return false;
|
|
22
23
|
|
|
23
24
|
// Hard exclude: "AVISO CONSOLIDADO" shares the header trio but is a
|
|
24
25
|
// different document type handled by aviso_consolidado.
|
|
25
26
|
if (/AVISO\s+CONSOLIDADO/i.test(source)) return false;
|
|
26
27
|
|
|
28
|
+
// The colon after "T. OPER" is optional — see note in pedimento-simplificado.js.
|
|
27
29
|
const hasHeaderFields =
|
|
28
30
|
/NUM\.?\s*PEDIMENTO:/i.test(source) &&
|
|
29
31
|
/CVE\.?\s*PEDIMENTO:/i.test(source) &&
|
|
30
|
-
/T\.?\s*OPER
|
|
32
|
+
/T\.?\s*OPER:?/i.test(source);
|
|
31
33
|
if (hasHeaderFields) {
|
|
32
34
|
const hasCopyMarker =
|
|
33
35
|
/ORIGINAL:\s*ADMINISTRACION GENERAL DE ADUANAS/i.test(source) ||
|
|
@@ -12,15 +12,18 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
12
12
|
if (/AVISO\s+CONSOLIDADO/i.test(source)) return false;
|
|
13
13
|
|
|
14
14
|
// Fast path: the literal title appears on standard SIMP layouts.
|
|
15
|
-
|
|
15
|
+
// Some prevalidators print "FORMA SIMPLIFICADA DEL PEDIMENTO" (with DEL).
|
|
16
|
+
if (/FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i.test(source)) return true;
|
|
16
17
|
|
|
17
18
|
// Some PDFs (single-page anchors) lack that title but still carry the
|
|
18
19
|
// three pedimento header fields. Treat them as simplificado UNLESS they
|
|
19
20
|
// have the multi-page copy markers that uniquely identify a completo.
|
|
21
|
+
// NOTE: the colon after "T. OPER" is optional — many printable layouts
|
|
22
|
+
// render OPER as a table-header label with the value in the next cell.
|
|
20
23
|
const hasHeaderFields =
|
|
21
24
|
/NUM\.?\s*PEDIMENTO:/i.test(source) &&
|
|
22
25
|
/CVE\.?\s*PEDIMENTO:/i.test(source) &&
|
|
23
|
-
/T\.?\s*OPER
|
|
26
|
+
/T\.?\s*OPER:?/i.test(source);
|
|
24
27
|
if (!hasHeaderFields) return false;
|
|
25
28
|
|
|
26
29
|
const hasCompletoCopyMarker =
|
|
@@ -16,9 +16,9 @@ export const proformaDefinition = {
|
|
|
16
16
|
type: 'proforma',
|
|
17
17
|
extensions: ['pdf'],
|
|
18
18
|
|
|
19
|
-
// Same content marker as pedimento simplificado
|
|
19
|
+
// Same content marker as pedimento simplificado (accepts "DE" or "DEL").
|
|
20
20
|
match: (source) => {
|
|
21
|
-
return /FORMA
|
|
21
|
+
return /FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i.test(source);
|
|
22
22
|
},
|
|
23
23
|
|
|
24
24
|
extractNumPedimento: pedimentoSimplificadoDefinition.extractNumPedimento,
|
package/src/file-detection.js
CHANGED
|
@@ -3,10 +3,9 @@ import path from 'path';
|
|
|
3
3
|
import { PDFParse } from 'pdf-parse';
|
|
4
4
|
|
|
5
5
|
import { extractDocumentFields } from './document-type-shared.js';
|
|
6
|
+
import { classifyDocument } from './scoring/scoring-engine.js';
|
|
6
7
|
|
|
7
|
-
// Document types that participate in arela_path composition.
|
|
8
|
-
// kept here even though its matcher is currently disabled — once re-enabled
|
|
9
|
-
// in document-type-shared.js no further changes are needed here.
|
|
8
|
+
// Document types that participate in arela_path composition.
|
|
10
9
|
const ARELA_PATH_TYPES = new Set([
|
|
11
10
|
'pedimento_simplificado',
|
|
12
11
|
'pedimento_completo',
|
|
@@ -86,6 +85,17 @@ function composeArelaPath(
|
|
|
86
85
|
* Detects document types and extracts metadata from files
|
|
87
86
|
*/
|
|
88
87
|
export class FileDetectionService {
|
|
88
|
+
constructor() {
|
|
89
|
+
// Best-match matchers (adapted from the API). When set, classification uses
|
|
90
|
+
// the scoring engine; otherwise it falls back to legacy first-match-wins.
|
|
91
|
+
this.matchers = null;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
/** Provide the resolved+adapted matcher set for scoring-based classification. */
|
|
95
|
+
setMatchers(matchers) {
|
|
96
|
+
this.matchers = matchers && matchers.length ? matchers : null;
|
|
97
|
+
}
|
|
98
|
+
|
|
89
99
|
/**
|
|
90
100
|
* Detect document type from a file
|
|
91
101
|
* @param {string} filePath - Path to the file to analyze
|
|
@@ -142,9 +152,23 @@ export class FileDetectionService {
|
|
|
142
152
|
};
|
|
143
153
|
}
|
|
144
154
|
|
|
145
|
-
// Extract document fields and detect type
|
|
146
|
-
|
|
147
|
-
|
|
155
|
+
// Extract document fields and detect type. Use the best-match scoring
|
|
156
|
+
// engine when matchers are configured; otherwise legacy first-match-wins.
|
|
157
|
+
let detectedType, fields, detectedPedimento, detectedPedimentoYear;
|
|
158
|
+
if (this.matchers) {
|
|
159
|
+
const r = classifyDocument(this.matchers, {
|
|
160
|
+
source: text,
|
|
161
|
+
extension: fileExtension,
|
|
162
|
+
filePath,
|
|
163
|
+
});
|
|
164
|
+
detectedType = r.detectedType;
|
|
165
|
+
fields = r.fields;
|
|
166
|
+
detectedPedimento = r.detectedPedimento;
|
|
167
|
+
detectedPedimentoYear = r.detectedPedimentoYear;
|
|
168
|
+
} else {
|
|
169
|
+
[detectedType, fields, detectedPedimento, detectedPedimentoYear] =
|
|
170
|
+
extractDocumentFields(text, fileExtension, filePath);
|
|
171
|
+
}
|
|
148
172
|
|
|
149
173
|
// Extract RFC from fields
|
|
150
174
|
const rfc = fields?.find((f) => f.name === 'rfc')?.value ?? null;
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Adapt DB matchers (from arela-api `GET /document-matcher/resolved`) into the
|
|
3
|
+
* shape the scoring engine consumes — the HYBRID model:
|
|
4
|
+
*
|
|
5
|
+
* - SELECTION comes from the DB matcher's clues / qualify (per-RFC + globals).
|
|
6
|
+
* - EXTRACTION uses the rich JS extractors keyed by `documentType` when one
|
|
7
|
+
* exists (resolveType, multi-pattern field extractors, pedimento composition);
|
|
8
|
+
* otherwise it falls back to building simple regex extractors from the DB
|
|
9
|
+
* matcher's `fieldExtractors`.
|
|
10
|
+
*
|
|
11
|
+
* This keeps per-client matching configurable from the UI while preserving the
|
|
12
|
+
* robust field extraction that already ships in the uploader.
|
|
13
|
+
*/
|
|
14
|
+
// IMPORTANT: load document-type-shared FIRST so it becomes the root of the
|
|
15
|
+
// shared<->definitions import cycle and fully evaluates before the individual
|
|
16
|
+
// definitions are referenced (otherwise: "Cannot access X before initialization").
|
|
17
|
+
import { FieldResult } from '../document-type-shared.js';
|
|
18
|
+
import { dodaPdfDefinition } from '../document-types/doda-pdf.js';
|
|
19
|
+
import { dodaXmlDefinition } from '../document-types/doda-xml.js';
|
|
20
|
+
import { facturaInterAgenciaDefinition } from '../document-types/factura-inter-agencia.js';
|
|
21
|
+
import { facturasComerciales } from '../document-types/facturas-comerciales.js';
|
|
22
|
+
import { pedimentoCompletoXmlDefinition } from '../document-types/pedimento-completo-xml.js';
|
|
23
|
+
import { pedimentoCompletoDefinition } from '../document-types/pedimento-completo.js';
|
|
24
|
+
import { pedimentoSimplificadoDefinition } from '../document-types/pedimento-simplificado.js';
|
|
25
|
+
import { supportDocumentDefinition } from '../document-types/support-document.js';
|
|
26
|
+
|
|
27
|
+
// Pick the extraction-related members of a JS document-type definition
// (everything the scoring engine needs after a matcher has been selected).
function extractionOf(def) {
  const {
    extractors,
    resolveType,
    extractNumPedimento,
    extractPedimentoYear,
  } = def;
  return { extractors, resolveType, extractNumPedimento, extractPedimentoYear };
}
|
|
36
|
+
|
|
37
|
+
// documentType -> rich extraction half of the matching JS definition.
//
// Created with a null prototype: the registry is indexed with `documentType`
// values coming from the API, and on a plain object a value such as
// "constructor" or "toString" would resolve to an inherited Object.prototype
// member instead of `undefined`.
const EXTRACTION_REGISTRY = {
  __proto__: null,
  pedimento_simplificado: extractionOf(pedimentoSimplificadoDefinition),
  pedimento_completo: extractionOf(pedimentoCompletoDefinition),
  pedimento_completo_xml: extractionOf(pedimentoCompletoXmlDefinition),
  doda_pdf: extractionOf(dodaPdfDefinition),
  doda_xml: extractionOf(dodaXmlDefinition),
  factura_inter_agencia: extractionOf(facturaInterAgenciaDefinition),
  factura_comercial: extractionOf(facturasComerciales),
  support_document: extractionOf(supportDocumentDefinition),
};
|
|
47
|
+
|
|
48
|
+
/**
 * Build a scoring-engine extractor from a DB fieldExtractor (regex + capture).
 *
 * The pattern is compiled once. The extracted value is capture group 1 when
 * it participated in the match, otherwise the whole match.
 *
 * `RegExp.prototype.exec` is used instead of `String.prototype.match` because
 * `match` with a `g` flag returns the list of whole matches and drops capture
 * groups, which would make index 1 the *second match* rather than group 1.
 *
 * Best-effort by design: a missing/empty pattern, an invalid pattern or flags,
 * or a non-string source all yield a "not found" result rather than throwing.
 *
 * @param {{field: string, extractor: string, flags?: string}} fe
 * @returns {{field: string, extract: (source: string) => FieldResult}}
 */
function regexExtractor(fe) {
  let compiled = null;
  if (fe.extractor != null && fe.extractor !== '') {
    try {
      compiled = new RegExp(fe.extractor, fe.flags || '');
    } catch {
      // Invalid pattern or flags configured in the DB: extractor never matches.
      compiled = null;
    }
  }

  return {
    field: fe.field,
    extract: (source) => {
      if (compiled === null || typeof source !== 'string') {
        return new FieldResult(fe.field, false, null);
      }
      // `g`/`y` regexes keep state between calls; always start from 0.
      compiled.lastIndex = 0;
      const m = compiled.exec(source);
      return new FieldResult(fe.field, !!m, m ? (m[1] ?? m[0]) : null);
    },
  };
}
|
|
62
|
+
|
|
63
|
+
/**
 * Convert resolved DB matchers into scoring-engine matchers.
 *
 * Selection data (extensions, minScore, priority, qualify, clues) is copied
 * from the DB matcher. Extraction comes from the rich JS definition
 * registered for `documentType` when one exists; otherwise simple regex
 * extractors are built from the DB matcher's `fieldExtractors`.
 *
 * The registry is consulted with an own-property check so an API-provided
 * `documentType` such as "constructor" cannot resolve to an inherited
 * Object.prototype member and silently drop the fallback extractors.
 *
 * @param {Array} dbMatchers - matchers from the API (with clues + fieldExtractors)
 * @returns {Array} scoring matchers
 */
export function adaptDbMatchers(dbMatchers) {
  return (dbMatchers || []).map((m) => {
    const hasRichExtraction = Object.prototype.hasOwnProperty.call(
      EXTRACTION_REGISTRY,
      m.documentType,
    );
    const extraction = hasRichExtraction
      ? EXTRACTION_REGISTRY[m.documentType]
      : {
          extractors: (m.fieldExtractors || []).map((fe) =>
            regexExtractor(fe),
          ),
        };

    return {
      documentType: m.documentType,
      // Accept either an array or a comma-separated string of extensions.
      extensions: Array.isArray(m.extensions)
        ? m.extensions
        : String(m.extensions || '')
            .split(',')
            .map((s) => s.trim())
            .filter(Boolean),
      minScore: m.minScore ?? undefined,
      priority: m.priority ?? 0,
      qualify: m.qualify ?? undefined,
      clues: (m.clues || []).map((c) => ({
        kind: c.kind,
        pattern: c.pattern,
        flags: c.flags || undefined,
        weight: c.weight ?? 1,
        group: c.group || undefined,
        required: !!c.required,
        negative: !!c.negative,
      })),
      ...extraction,
    };
  });
}
|