@arela/uploader 1.0.22 → 1.0.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/scoring-compare.js +243 -0
- package/scripts/scoring-phase4-check.js +96 -0
- package/src/commands/IdentifyCommand.js +34 -6
- package/src/commands/ScanCommand.js +15 -0
- package/src/config/config.js +28 -2
- package/src/document-type-shared.js +15 -7
- package/src/document-types/_pedimento-shared-extractors.js +27 -8
- package/src/document-types/factura-inter-agencia.js +186 -0
- package/src/document-types/pedimento-completo-xml.js +62 -12
- package/src/document-types/pedimento-completo.js +5 -3
- package/src/document-types/pedimento-simplificado.js +5 -2
- package/src/document-types/proforma.js +2 -2
- package/src/file-detection.js +30 -6
- package/src/scoring/db-matcher-adapter.js +98 -0
- package/src/scoring/matchers-seed.js +386 -0
- package/src/scoring/scoring-engine.js +218 -0
- package/src/services/ScanApiService.js +14 -0
- package/tests/unit/factura-inter-agencia.test.js +218 -0
- package/tests/unit/pedimento-completo-xml-matcher.test.js +271 -0
- package/tests/unit/pedimento-simplificado-matcher.test.js +185 -0
- package/tests/unit/scoring-engine.test.js +221 -0
|
@@ -0,0 +1,386 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Seed matchers for the scoring engine (PROTOTYPE).
|
|
3
|
+
*
|
|
4
|
+
* Each entry re-expresses the boolean `match()` of an existing
|
|
5
|
+
* `src/document-types/*.js` definition as a set of weighted **clues**, while
|
|
6
|
+
* REUSING that definition's `extractors` / `resolveType` /
|
|
7
|
+
* `extractNumPedimento` / `extractPedimentoYear` verbatim. Only the SELECTION
|
|
8
|
+
* logic is new — field extraction is unchanged, so a comparison against
|
|
9
|
+
* `extractDocumentFields` isolates the first-match-wins → best-match change.
|
|
10
|
+
*
|
|
11
|
+
* Clue → flag mapping used throughout:
|
|
12
|
+
* - strong positive signal → high `weight`
|
|
13
|
+
* - hard exclusion (return false in the original) → `negative: true`
|
|
14
|
+
* - mandatory signature → `required: true`
|
|
15
|
+
* - `minScore` is tuned so the weighted sum reproduces the original boolean
|
|
16
|
+
* on the existing test fixtures.
|
|
17
|
+
*
|
|
18
|
+
* NOTE: a few original conditions are compound (e.g. simplificado's
|
|
19
|
+
* "COVE: present AND PAGO absent", or inter-agencia's "≥2 distinct RFCs from a
|
|
20
|
+
* set"). Where a single clue cannot express the exact boolean, the closest
|
|
21
|
+
* faithful approximation is used and flagged inline — the comparison harness
|
|
22
|
+
* (`scripts/scoring-compare.js`) surfaces any divergence on a real corpus.
|
|
23
|
+
*/
|
|
24
|
+
import { dodaPdfDefinition } from '../document-types/doda-pdf.js';
|
|
25
|
+
import { dodaXmlDefinition } from '../document-types/doda-xml.js';
|
|
26
|
+
import {
|
|
27
|
+
INTER_AGENCIA_RFCS,
|
|
28
|
+
facturaInterAgenciaDefinition,
|
|
29
|
+
} from '../document-types/factura-inter-agencia.js';
|
|
30
|
+
import { facturasComerciales } from '../document-types/facturas-comerciales.js';
|
|
31
|
+
import { pedimentoCompletoXmlDefinition } from '../document-types/pedimento-completo-xml.js';
|
|
32
|
+
import { pedimentoCompletoDefinition } from '../document-types/pedimento-completo.js';
|
|
33
|
+
import { pedimentoSimplificadoDefinition } from '../document-types/pedimento-simplificado.js';
|
|
34
|
+
import { supportDocumentDefinition } from '../document-types/support-document.js';
|
|
35
|
+
|
|
36
|
+
/**
 * Project the extraction half of a document-type definition.
 *
 * Only the four extraction hooks are carried over, by reference, so the
 * original definition's own selection logic is left behind. A hook the
 * definition does not provide is still present on the result as `undefined`.
 *
 * @param {object} def - An existing document-type definition.
 * @returns {{ extractors: *, resolveType: *, extractNumPedimento: *, extractPedimentoYear: * }}
 */
function reuse(def) {
  const {
    extractors,
    resolveType,
    extractNumPedimento,
    extractPedimentoYear,
  } = def;
  return { extractors, resolveType, extractNumPedimento, extractPedimentoYear };
}
|
|
45
|
+
|
|
46
|
+
// Pedimento number as it appears in free text: digit groups of 2, 2, 4 and 7,
// each optionally separated by a single whitespace character. The pattern is
// unanchored, so it can also match inside a longer run of digits.
// NOTE(review): the groups presumably stand for year / customs office /
// patent / sequence number — confirm against the original doda_pdf matcher.
const PEDIMENTO_NUM = /\d{2}\s?\d{2}\s?\d{4}\s?\d{7}/;
|
|
47
|
+
|
|
48
|
+
// --- pedimento_simplificado --------------------------------------------------
// Original: AVISO/COVE excluded; title "FORMA SIMPLIFICADA DE[L] PEDIMENTO"
// short-circuits to true; otherwise the header trio (all three) qualifies.
// Copy markers are NOT made negative here — they let `pedimento_completo`
// outscore on completo layouts, which reproduces the title short-circuit.
//
// No `qualify` rules are declared, so the engine gates this matcher on the
// weighted sum of matched clues compared against `minScore`.
const simplificado = {
  documentType: 'pedimento_simplificado',
  extensions: ['pdf'],
  minScore: 3, // title(5) OR full header trio(1+1+1)
  priority: 1,
  ...reuse(pedimentoSimplificadoDefinition),
  clues: [
    // Hard exclusions: a hit on either pattern disqualifies the matcher.
    { kind: 'CONTENT_REGEX', pattern: /AVISO\s+CONSOLIDADO/i, negative: true },
    {
      kind: 'CONTENT_REGEX',
      pattern: /COMPROBANTE\s+DE\s+VALOR\s+ELECTR[ÓO]NICO/i,
      negative: true,
    },
    // Title: weight 5 clears `minScore` on its own.
    {
      kind: 'CONTENT_REGEX',
      pattern: /FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i,
      weight: 5,
    },
    // Header trio: weight 1 each, so without the title all three must match
    // to reach the threshold of 3.
    { kind: 'CONTENT_REGEX', pattern: /NUM\.?\s*PEDIMENTO:/i, weight: 1 },
    { kind: 'CONTENT_REGEX', pattern: /CVE\.?\s*PEDIMENTO:/i, weight: 1 },
    { kind: 'CONTENT_REGEX', pattern: /T\.?\s*OPER:?/i, weight: 1 },
  ],
};
|
|
76
|
+
|
|
77
|
+
// --- pedimento_completo ------------------------------------------------------
// Original: exclude FORMA SIMPLIFICADA & AVISO; (header trio AND ≥1 copy marker)
// OR clue-count heuristic (>25% of ~18 clues ≈ ≥5).
//
// No `qualify` rules are declared, so the gate is the flat weighted sum against
// `minScore`.
// NOTE(review): a flat sum is looser than the original compound condition —
// e.g. three copy markers (2+2+2) reach 5 with no header-trio clue at all.
// Confirm this is acceptable on the comparison corpus.
const completo = {
  documentType: 'pedimento_completo',
  extensions: ['pdf'],
  minScore: 5, // header trio(3) + 1 copy marker(2), or ≥5 fallback clues
  priority: 1,
  ...reuse(pedimentoCompletoDefinition),
  clues: [
    // Hard exclusions: a hit on either pattern disqualifies the matcher.
    {
      kind: 'CONTENT_REGEX',
      pattern: /FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i,
      negative: true,
    },
    { kind: 'CONTENT_REGEX', pattern: /AVISO\s+CONSOLIDADO/i, negative: true },
    // header trio
    { kind: 'CONTENT_REGEX', pattern: /NUM\.?\s*PEDIMENTO:/i, weight: 1 },
    { kind: 'CONTENT_REGEX', pattern: /CVE\.?\s*PEDIMENTO:/i, weight: 1 },
    { kind: 'CONTENT_REGEX', pattern: /T\.?\s*OPER:?/i, weight: 1 },
    // copy markers (long-form pedimento signatures)
    {
      kind: 'CONTENT_REGEX',
      pattern: /ORIGINAL:\s*ADMINISTRACION GENERAL DE ADUANAS/i,
      weight: 2,
    },
    { kind: 'CONTENT_REGEX', pattern: /SEGUNDA\s+COPIA/i, weight: 2 },
    { kind: 'CONTENT_REGEX', pattern: /TERCERA\s+COPIA/i, weight: 2 },
    {
      kind: 'CONTENT_REGEX',
      pattern: /COPIA\s+(SIMPLIFICAD[AO])?\s*TRANSPORTISTA/i,
      weight: 2,
    },
    { kind: 'CONTENT_REGEX', pattern: /DEFINITIVO/i, weight: 2 },
    { kind: 'CONTENT_REGEX', pattern: /ANEXO\s+DEL\s+PEDIMENTO/i, weight: 2 },
    {
      kind: 'CONTENT_REGEX',
      pattern: /\*+FIN\s+DE\s+PEDIMENTO\s*\*+/i,
      weight: 2,
    },
    // exotic-layout fallback clues (weight 1 each)
    { kind: 'CONTENT_REGEX', pattern: /CERTIFICACIONES/i, weight: 1 },
    { kind: 'CONTENT_REGEX', pattern: /CUADRO\s+DE\s+LIQUIDACION/i, weight: 1 },
    {
      kind: 'CONTENT_REGEX',
      pattern: /\*\*\*\s+PAGO\s+ELECTRONICO\s+\*\*\*/i,
      weight: 1,
    },
    { kind: 'CONTENT_REGEX', pattern: /MEDIOS\s+DE\s+TRANSPORTE/i, weight: 1 },
    {
      kind: 'CONTENT_REGEX',
      pattern: /DATOS\s+DEL\s+IMPORTADOR\/EXPORTADOR/i,
      weight: 1,
    },
    {
      kind: 'CONTENT_REGEX',
      pattern: /DATOS\s+DEL\s+PROVEEDOR\s+O\s+COMPRADOR/i,
      weight: 1,
    },
    { kind: 'CONTENT_REGEX', pattern: /LINEA\s+DE\s+CAPTURA:/i, weight: 1 },
    {
      kind: 'CONTENT_REGEX',
      pattern: /DECLARO\s+BAJO\s+PROTESTA\s+DE\s+DECIR\s+VERDAD/i,
      weight: 1,
    },
    {
      kind: 'CONTENT_REGEX',
      pattern: /PEDIMENTO\s+ELABORADO\s+DE\s+CONFORMIDAD/i,
      weight: 1,
    },
  ],
};
|
|
149
|
+
|
|
150
|
+
// --- pedimento_completo_xml --------------------------------------------------
// Original: single condition — the VUCEM response root tag.
//
// The single clue is `required`, so its absence disqualifies the matcher; when
// it matches, its weight (10) is the whole score.
// NOTE(review): the engine ranks by score, then fraction, and only then by
// `priority`, so `priority: 2` alone cannot make this matcher win. Dominance
// over other XML matchers depends on weight 10 exceeding their scores —
// confirm on XML files that also carry several doda_xml markers.
const completoXml = {
  documentType: 'pedimento_completo_xml',
  extensions: ['xml'],
  minScore: 1,
  priority: 2, // authoritative signal — must dominate doda_xml/support on XML
  ...reuse(pedimentoCompletoXmlDefinition),
  clues: [
    {
      kind: 'CONTENT_REGEX',
      pattern: /consultarPedimentoCompletoRespuesta/i,
      weight: 10,
      required: true,
    },
  ],
};
|
|
167
|
+
|
|
168
|
+
// --- doda_pdf ----------------------------------------------------------------
// Original: primary marker → true; OR (≥2 secondary + pedimento#);
// OR (doda-context + pedimento# + ≥1 secondary).
//
// Gated by `qualify` (any one rule passing is enough; within a rule every group
// must reach its minimum count). With `qualify` present the engine does not
// consult `minScore`; the weights below only affect ranking against other
// qualifying matchers.
const dodaPdf = {
  documentType: 'doda_pdf',
  extensions: ['pdf'],
  priority: 1,
  qualify: [
    { primary: 1 },
    // Only two `secondary` clues exist, so this rule needs both DODA and VUCEM.
    { secondary: 2, pedimento: 1 },
    { context: 1, pedimento: 1, secondary: 1 },
  ],
  ...reuse(dodaPdfDefinition),
  clues: [
    {
      kind: 'CONTENT_REGEX',
      pattern: /DOCUMENTO DE OPERACI[OÓ]N PARA DESPACHO ADUANERO/i,
      weight: 5,
      group: 'primary',
    },
    { kind: 'CONTENT_REGEX', pattern: /DODA/i, weight: 1, group: 'secondary' },
    { kind: 'CONTENT_REGEX', pattern: /VUCEM/i, weight: 1, group: 'secondary' },
    {
      kind: 'CONTENT_REGEX',
      pattern: PEDIMENTO_NUM,
      weight: 1,
      group: 'pedimento',
    },
    {
      kind: 'CONTENT_REGEX',
      pattern: /despacho aduanero|operaci[oó]n aduanera|validaci[oó]n/i,
      weight: 1,
      group: 'context',
    },
  ],
};
|
|
204
|
+
|
|
205
|
+
// --- doda_xml ----------------------------------------------------------------
// Original: ≥1 doda marker → true; OR (≥3 pedimento markers AND <?xml).
//
// Gated by `qualify`: either one clue of group `doda`, or three clues of group
// `pedimento` together with the `xml` declaration clue. Weights only affect
// ranking against other qualifying matchers.
const dodaXml = {
  documentType: 'doda_xml',
  extensions: ['xml'],
  priority: 1,
  qualify: [{ doda: 1 }, { pedimento: 3, xml: 1 }],
  ...reuse(dodaXmlDefinition),
  clues: [
    // doda markers: any single one qualifies the matcher.
    {
      kind: 'CONTENT_REGEX',
      pattern: /documentoOperacion/i,
      weight: 3,
      group: 'doda',
    },
    {
      kind: 'CONTENT_REGEX',
      pattern: /despachoAduanero/i,
      weight: 3,
      group: 'doda',
    },
    { kind: 'CONTENT_REGEX', pattern: /<doda\b/i, weight: 3, group: 'doda' },
    {
      kind: 'CONTENT_REGEX',
      pattern: /xmlns[^"]*doda/i,
      weight: 3,
      group: 'doda',
    },
    { kind: 'CONTENT_REGEX', pattern: /VUCEM/i, weight: 3, group: 'doda' },
    // pedimento markers: three of these four are needed on the second path.
    {
      kind: 'CONTENT_REGEX',
      pattern: /numPedimento/i,
      weight: 1,
      group: 'pedimento',
    },
    {
      kind: 'CONTENT_REGEX',
      pattern: /patenteAduanal/i,
      weight: 1,
      group: 'pedimento',
    },
    {
      kind: 'CONTENT_REGEX',
      pattern: /aduanaDespacho/i,
      weight: 1,
      group: 'pedimento',
    },
    {
      kind: 'CONTENT_REGEX',
      pattern: /tipoOperacion/i,
      weight: 1,
      group: 'pedimento',
    },
    // structural gate for the pedimento-markers path (weight 0 = gate only)
    { kind: 'CONTENT_REGEX', pattern: /<\?xml/i, weight: 0, group: 'xml' },
  ],
};
|
|
262
|
+
|
|
263
|
+
// --- factura_inter_agencia ---------------------------------------------------
// Original: isCfdiContent AND ≥2 distinct configured RFCs AND broker clave.
// The pair of agency RFCs and the broker-service clave are modelled as REQUIRED
// clues, which is what lets this win over `factura_comercial` purely by score —
// no registration-order dependency. (Scope-limited to the configured pair, same
// as the original; widening means making RFC presence a counting rule.)
//
// NOTE(review): one required clue is generated per entry of
// `INTER_AGENCIA_RFCS`, so if more than two RFCs are configured ALL of them
// must appear, not "at least two".
// NOTE(review): the CFDI markers below are not `required`, so the original's
// `isCfdiContent` condition is not enforced as a gate here.
const interAgencia = {
  documentType: 'factura_inter_agencia',
  extensions: ['xml', 'pdf'],
  minScore: 25, // both required RFCs (10+10) + required clave (5)
  priority: 3,
  ...reuse(facturaInterAgenciaDefinition),
  clues: [
    // NOTE(review): `rfc` is interpolated into the pattern unescaped — this
    // assumes RFC values contain no regex metacharacters; verify the source of
    // INTER_AGENCIA_RFCS.
    ...INTER_AGENCIA_RFCS.map((rfc) => ({
      kind: 'CONTENT_REGEX',
      pattern: new RegExp(`\\b${rfc}\\b`, 'i'),
      weight: 10,
      required: true,
    })),
    // BROKER_SERVICE_CLAVE_PROD_SERV (78141502 = servicios de agentes aduaneros)
    // The pattern has no boundaries, so it also matches inside longer digit runs.
    { kind: 'CONTENT_REGEX', pattern: /78141502/, weight: 5, required: true },
    // CFDI content markers (informational positive signal)
    { kind: 'CONTENT_REGEX', pattern: /cfdi:Comprobante/i, weight: 1 },
    { kind: 'CONTENT_REGEX', pattern: /xmlns:cfdi/i, weight: 1 },
    { kind: 'CONTENT_REGEX', pattern: /TipoDeComprobante/i, weight: 1 },
  ],
};
|
|
290
|
+
|
|
291
|
+
// --- factura_comercial -------------------------------------------------------
// Original: cfdiMatches≥2 OR (invoiceMatches≥1 AND customsMatches≥1).
// Faithfully expressed with clue groups + qualify (OR-of-ANDs) — a flat
// minScore could not enforce the "invoice AND customs" pairing and produced
// false positives on COVE acuses (customs keywords alone reaching the threshold).
//
// No clue declares a `weight`, so each matched clue adds the engine default of
// 1 to the ranking score.
const facturaComercial = {
  documentType: 'factura_comercial',
  extensions: ['pdf', 'xml'],
  priority: 0,
  qualify: [{ cfdi: 2 }, { invoice: 1, customs: 1 }],
  ...reuse(facturasComerciales),
  clues: [
    // CFDI markers: any two qualify the matcher.
    { kind: 'CONTENT_REGEX', pattern: /cfdi:Comprobante/i, group: 'cfdi' },
    { kind: 'CONTENT_REGEX', pattern: /xmlns:cfdi/i, group: 'cfdi' },
    { kind: 'CONTENT_REGEX', pattern: /TipoDeComprobante/i, group: 'cfdi' },
    { kind: 'CONTENT_REGEX', pattern: /timbreFiscalDigital/i, group: 'cfdi' },
    { kind: 'CONTENT_REGEX', pattern: /SelloSAT/i, group: 'cfdi' },
    // Invoice wording. The qualifier group in the first pattern is optional,
    // so that pattern matches any occurrence of "factura".
    {
      kind: 'CONTENT_REGEX',
      pattern: /factura\s*(comercial|de\s*venta|de\s*exportaci[oó]n)?/i,
      group: 'invoice',
    },
    {
      kind: 'CONTENT_REGEX',
      pattern: /commercial\s*invoice/i,
      group: 'invoice',
    },
    { kind: 'CONTENT_REGEX', pattern: /invoice\s*number/i, group: 'invoice' },
    {
      kind: 'CONTENT_REGEX',
      pattern: /n[uú]mero\s*de\s*factura/i,
      group: 'invoice',
    },
    // Customs wording: must accompany at least one invoice clue.
    { kind: 'CONTENT_REGEX', pattern: /pedimento/i, group: 'customs' },
    { kind: 'CONTENT_REGEX', pattern: /aduana/i, group: 'customs' },
    {
      kind: 'CONTENT_REGEX',
      pattern: /importaci[oó]n|exportaci[oó]n/i,
      group: 'customs',
    },
    {
      kind: 'CONTENT_REGEX',
      pattern: /despacho\s*aduanero/i,
      group: 'customs',
    },
    {
      kind: 'CONTENT_REGEX',
      pattern: /fracci[oó]n\s*arancelaria/i,
      group: 'customs',
    },
  ],
};
|
|
343
|
+
|
|
344
|
+
// --- support_document --------------------------------------------------------
// Original: soapFound≥2 OR customsFound≥2. Broad fallback → lowest priority so
// it only wins when no specific matcher qualifies.
//
// NOTE(review): the engine ranks by score, then fraction, and only then by
// `priority`. Every clue here carries the default weight of 1, so this matcher
// can score up to 7 and outrank a specific matcher that qualifies with a lower
// score; `priority: -1` only settles exact score/fraction ties. Confirm the
// "only wins when nothing else qualifies" intent on the comparison corpus.
const supportDocument = {
  documentType: 'support_document',
  extensions: ['xml', 'txt', 'json'],
  priority: -1,
  qualify: [{ soap: 2 }, { customs: 2 }],
  ...reuse(supportDocumentDefinition),
  clues: [
    // SOAP markers: any two qualify the matcher.
    { kind: 'CONTENT_REGEX', pattern: /soapenv:Envelope/i, group: 'soap' },
    { kind: 'CONTENT_REGEX', pattern: /xmlns:soapenv=/i, group: 'soap' },
    {
      kind: 'CONTENT_REGEX',
      pattern: /solicitarRecibirCoveServicio/i,
      group: 'soap',
    },
    { kind: 'CONTENT_REGEX', pattern: /tipoOperacion/i, group: 'soap' },
    { kind: 'CONTENT_REGEX', pattern: /patenteAduanal/i, group: 'soap' },
    // customs metadata fallback — original requires BOTH patterns present
    // (the group holds exactly two clues, so `customs: 2` means both matched)
    { kind: 'CONTENT_REGEX', pattern: /rfc/i, group: 'customs' },
    {
      kind: 'CONTENT_REGEX',
      pattern: /patente|aduana|customs|pedimento/i,
      group: 'customs',
    },
  ],
};
|
|
372
|
+
|
|
373
|
+
/**
 * Default/global seed set. Order is irrelevant — best-match selects the winner.
 * (This becomes the seed for DEFAULT matchers when the model moves to the DB.)
 *
 * The array holds the eight matcher objects defined in this module; each one
 * combines selection data (`clues`, plus `minScore` or `qualify`) with the
 * extraction hooks reused from the corresponding document-type definition.
 */
export const scoringMatchers = [
  simplificado,
  completo,
  completoXml,
  dodaPdf,
  dodaXml,
  interAgencia,
  facturaComercial,
  supportDocument,
];
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Scoring-based document classification engine (PROTOTYPE).
|
|
3
|
+
*
|
|
4
|
+
* Replaces the "first-match-wins" selection in `document-type-shared.js`
|
|
5
|
+
* (`extractDocumentFields`) with "best-match": every applicable matcher is
|
|
6
|
+
* scored by the weight of the clues it satisfies, and the highest score wins.
|
|
7
|
+
* This removes the order-dependent registration that the current registry
|
|
8
|
+
* relies on (e.g. `factura_inter_agencia` MUST be evaluated before
|
|
9
|
+
* `facturas_comerciales`) — precedence now lives in clue weights / `required`
|
|
10
|
+
* / `negative` flags, not in array order.
|
|
11
|
+
*
|
|
12
|
+
* Matcher shape (see `matchers-seed.js`):
|
|
13
|
+
* {
|
|
14
|
+
* documentType, extensions[], minScore, priority,
|
|
15
|
+
* clues: [{ kind, pattern, flags?, weight=1, group?, required=false, negative=false }],
|
|
16
|
+
* qualify?: [{ <group>: minCount, ... }, ...], // OR-of-ANDs gate
|
|
17
|
+
* extractors, resolveType?, extractNumPedimento?, extractPedimentoYear?
|
|
18
|
+
* }
|
|
19
|
+
*
|
|
20
|
+
* Two separable concerns:
|
|
21
|
+
* - QUALIFICATION (does this matcher apply at all?): `required`/`negative`
|
|
22
|
+
* clues plus an optional `qualify` rule set. `qualify` is a list of
|
|
23
|
+
* alternative rules (OR); a rule is a map of `group -> minimum matched
|
|
24
|
+
* clues` (AND across its entries). This expresses the grouped boolean gates
|
|
25
|
+
* of the original matchers, e.g. `(cfdi>=2) OR (invoice>=1 AND customs>=1)`.
|
|
26
|
+
* When `qualify` is absent, the gate falls back to `score >= minScore`.
|
|
27
|
+
* - RANKING (which qualifying matcher wins?): always the weighted sum of
|
|
28
|
+
* matched clues (`score`), tie-broken by fraction -> priority -> type.
|
|
29
|
+
*
|
|
30
|
+
* Selection only depends on clues/qualify. The winning matcher's `extractors` /
|
|
31
|
+
* `resolveType` / `extractNumPedimento` / `extractPedimentoYear` run AFTER
|
|
32
|
+
* selection with the same post-processing as `extractDocumentFields`, so a
|
|
33
|
+
* side-by-side comparison isolates the selection change.
|
|
34
|
+
*/
|
|
35
|
+
import path from 'path';
|
|
36
|
+
|
|
37
|
+
import { FieldResult } from '../document-type-shared.js';
|
|
38
|
+
|
|
39
|
+
/**
 * Resolve a clue's `pattern` into a RegExp that is safe to `.test()` right away.
 *
 * A RegExp instance is returned as-is (same object); a string pattern is
 * compiled with `clue.flags` (no flags when omitted).
 *
 * Clue objects are reused for every document that gets scored. A RegExp with
 * the `g` or `y` flag remembers `lastIndex` between `.test()` calls, so without
 * a rewind the same clue would alternately hit and miss on identical content.
 *
 * @param {{ pattern: RegExp|string, flags?: string }} clue
 * @returns {RegExp}
 * @throws {SyntaxError} when a string pattern or its flags are invalid.
 */
function toRegExp(clue) {
  const { pattern } = clue;
  if (pattern instanceof RegExp) {
    if (pattern.global || pattern.sticky) {
      // Start every evaluation from the beginning of the target string.
      pattern.lastIndex = 0;
    }
    return pattern;
  }
  return new RegExp(pattern, clue.flags ?? '');
}
|
|
44
|
+
|
|
45
|
+
/**
 * Pick the text a clue is evaluated against.
 *
 * @param {{ kind?: string }} clue - Only `kind` is inspected.
 * @param {{ fileName?: string|null, source?: string|null }} ctx
 * @returns {string} `ctx.fileName` for `FILENAME_REGEX` clues, `ctx.source` for
 *   every other kind; a missing (null/undefined) value becomes `''`.
 */
function clueTarget(clue, ctx) {
  if (clue.kind === 'FILENAME_REGEX') {
    return ctx.fileName ?? '';
  }
  return ctx.source ?? '';
}
|
|
51
|
+
|
|
52
|
+
/**
 * Score a single matcher against a document context.
 *
 * Clues are evaluated in declaration order. A matching `negative` clue, or a
 * non-matching `required` clue, stops evaluation and disqualifies the matcher.
 * Every other clue adds its `weight` (default 1) to `totalWeight`, and to the
 * score when it matches; matched clues with a `group` are also counted per
 * group for the `qualify` gate.
 *
 * Extension filtering is case-insensitive and ignores one leading dot on both
 * sides, so `pdf`, `.pdf` and `PDF` are equivalent. A matcher without
 * `extensions`, or a context without an extension, skips the filter.
 *
 * Qualification uses the `qualify` rules (OR across rules, AND across the
 * groups of one rule) when at least one rule is declared; otherwise, including
 * for an empty `qualify` array, it falls back to `score >= minScore`
 * (`minScore` defaults to 1).
 *
 * @param {object} matcher - `{ documentType, extensions?, clues?, qualify?,
 *   minScore?, priority? }`.
 * @param {object} ctx - `{ source?, extension?, fileName? }`.
 * @returns {null|object} `null` when the matcher does not apply (extension
 *   mismatch), `{ documentType, disqualified: true, reason }` when a
 *   `required`/`negative` clue rules it out, otherwise a scored result object
 *   `{ documentType, matcher, disqualified: false, score, totalWeight,
 *   fraction, priority, passed, matchedClues, groupCounts }`.
 */
export function scoreMatcher(matcher, ctx) {
  // Callers may pass `path.extname()` output (".pdf") or upper-case values.
  const normalizeExt = (value) =>
    String(value ?? '')
      .toLowerCase()
      .replace(/^\./, '');

  const ext = normalizeExt(ctx.extension);
  if (
    Array.isArray(matcher.extensions) &&
    matcher.extensions.length > 0 &&
    ext &&
    !matcher.extensions.some((allowed) => normalizeExt(allowed) === ext)
  ) {
    return null;
  }

  let matchedWeight = 0;
  let totalWeight = 0;
  const matchedClues = [];
  const groupCounts = {};

  for (const clue of matcher.clues ?? []) {
    const weight = clue.weight ?? 1;
    const hit = toRegExp(clue).test(clueTarget(clue, ctx));

    if (clue.negative) {
      if (hit) {
        return {
          documentType: matcher.documentType,
          disqualified: true,
          reason: `negative:${clue.pattern}`,
        };
      }
      // Negative clues never contribute to the score or the total weight.
      continue;
    }

    if (clue.required && !hit) {
      return {
        documentType: matcher.documentType,
        disqualified: true,
        reason: `required-missing:${clue.pattern}`,
      };
    }

    totalWeight += weight;
    if (hit) {
      matchedWeight += weight;
      matchedClues.push(clue);
      if (clue.group) {
        groupCounts[clue.group] = (groupCounts[clue.group] ?? 0) + 1;
      }
    }
  }

  // Qualification gate: `qualify` rules (OR-of-ANDs over group counts) when
  // any are declared, otherwise the weighted-score threshold. An empty rule
  // list is treated like a missing one; `[].some()` is always false and would
  // make the matcher impossible to satisfy.
  const rules = Array.isArray(matcher.qualify) ? matcher.qualify : [];
  const passed =
    rules.length > 0
      ? rules.some((rule) =>
          Object.entries(rule).every(
            ([group, min]) => (groupCounts[group] ?? 0) >= min,
          ),
        )
      : matchedWeight >= (matcher.minScore ?? 1);

  return {
    documentType: matcher.documentType,
    matcher,
    disqualified: false,
    score: matchedWeight,
    totalWeight,
    fraction: totalWeight > 0 ? matchedWeight / totalWeight : 0,
    priority: matcher.priority ?? 0,
    passed,
    matchedClues,
    groupCounts,
  };
}
|
|
130
|
+
|
|
131
|
+
/**
 * Score every matcher and return the qualifying candidates, best first.
 *
 * A matcher is dropped when `scoreMatcher` returns nothing for it, marks it as
 * disqualified, or reports that it did not pass its qualification gate.
 *
 * Ordering: higher score, then higher fraction, then higher priority, then
 * `documentType` compared as strings with `localeCompare`.
 *
 * @param {Iterable<object>} matchers
 * @param {object} ctx - Document context handed to `scoreMatcher` unchanged.
 * @returns {object[]} Scored results, sorted best-first.
 */
export function scoreAll(matchers, ctx) {
  // Each step only decides when it yields a non-zero difference.
  const rank = (left, right) => {
    const scoreGap = right.score - left.score;
    if (scoreGap) return scoreGap;

    const fractionGap = right.fraction - left.fraction;
    if (fractionGap) return fractionGap;

    const priorityGap = right.priority - left.priority;
    if (priorityGap) return priorityGap;

    return String(left.documentType).localeCompare(String(right.documentType));
  };

  const qualifying = [];
  for (const matcher of matchers) {
    const outcome = scoreMatcher(matcher, ctx);
    if (outcome && !outcome.disqualified && outcome.passed) {
      qualifying.push(outcome);
    }
  }
  return qualifying.sort(rank);
}
|
|
151
|
+
|
|
152
|
+
/**
 * Return the top-ranked qualifying candidate, or `null` when none qualifies.
 *
 * @param {Iterable<object>} matchers
 * @param {object} ctx - Document context handed to `scoreAll` unchanged.
 * @returns {object|null}
 */
export function selectBestMatch(matchers, ctx) {
  const [best] = scoreAll(matchers, ctx);
  return best ?? null;
}
|
|
155
|
+
|
|
156
|
+
/**
 * Full classification: rank the matchers, then run the WINNER's extractors /
 * resolveType / pedimento helpers. The post-selection steps are meant to mirror
 * `extractDocumentFields`, so a comparison isolates the selection change.
 *
 * Steps after selection, in order:
 *   1. run each extractor on `source`; an extractor that throws yields a
 *      not-found `FieldResult` for its field instead of aborting;
 *   2. resolve the type via `resolveType(fields)` when the matcher has one,
 *      otherwise use the matcher's `documentType`;
 *   3. call `extractNumPedimento` / `extractPedimentoYear` when present;
 *   4. append a `numPedimento` field when a pedimento was found and no field
 *      of that name exists yet.
 *
 * @param {Iterable<object>} matchers
 * @param {{ source: string, extension: string, filePath?: string }} input
 * @returns {{ detectedType, fields, detectedPedimento, detectedPedimentoYear,
 *            winner, candidates }} All detection values are `null` (and
 *   `fields` empty) when no matcher qualifies.
 */
export function classifyDocument(matchers, { source, extension, filePath }) {
  const fileName = filePath ? path.basename(filePath) : '';
  const candidates = scoreAll(matchers, { source, extension, fileName });
  const winner = candidates[0] ?? null;

  if (winner === null) {
    return {
      detectedType: null,
      fields: [],
      detectedPedimento: null,
      detectedPedimentoYear: null,
      winner,
      candidates,
    };
  }

  const { matcher: def } = winner;

  // Best-effort extraction: one failing extractor must not lose the others.
  const runExtractor = (extractor) => {
    try {
      return extractor.extract(source);
    } catch {
      return new FieldResult(extractor.field, false, null);
    }
  };
  const fields = [...(def.extractors ?? [])].map((extractor) =>
    runExtractor(extractor),
  );

  let detectedType = def.documentType;
  if (def.resolveType) {
    detectedType = def.resolveType(fields);
  }

  let detectedPedimento = null;
  if (def.extractNumPedimento) {
    detectedPedimento = def.extractNumPedimento(source, fields, filePath);
  }

  let detectedPedimentoYear = null;
  if (def.extractPedimentoYear) {
    detectedPedimentoYear = def.extractPedimentoYear(source, fields, filePath);
  }

  // Backfill numPedimento as a field (same as extractDocumentFields) so
  // downstream consumers (composeArelaPath) see a consistent shape.
  if (detectedPedimento) {
    const alreadyPresent = fields.some((field) => field.name === 'numPedimento');
    if (!alreadyPresent) {
      fields.push(new FieldResult('numPedimento', true, detectedPedimento));
    }
  }

  return {
    detectedType,
    fields,
    detectedPedimento,
    detectedPedimentoYear,
    winner,
    candidates,
  };
}
|
|
@@ -389,6 +389,20 @@ export class ScanApiService {
|
|
|
389
389
|
return result;
|
|
390
390
|
}
|
|
391
391
|
|
|
392
|
+
/**
|
|
393
|
+
* Fetch the resolved matcher set (this RFC's matchers + globals) for runtime
|
|
394
|
+
* classification. Returns an array of matchers with clues + fieldExtractors.
|
|
395
|
+
* @param {string|null} rfc - optional RFC to scope per-company matchers
|
|
396
|
+
*/
|
|
397
|
+
async getResolvedMatchers(rfc = null) {
|
|
398
|
+
const qs = rfc ? `?rfc=${encodeURIComponent(rfc)}` : '';
|
|
399
|
+
const result = await this.#request(
|
|
400
|
+
`/api/document-matcher/resolved${qs}`,
|
|
401
|
+
'GET',
|
|
402
|
+
);
|
|
403
|
+
return Array.isArray(result) ? result : [];
|
|
404
|
+
}
|
|
405
|
+
|
|
392
406
|
async fetchPdfsForDetection(
|
|
393
407
|
tableName,
|
|
394
408
|
offset = 0,
|