@arela/uploader 1.0.22 → 1.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,186 @@
1
+ // NOTE: We intentionally do NOT import `FieldResult` from
2
+ // '../document-type-shared.js' to avoid a circular-import TDZ when this
3
+ // module is imported directly (e.g. from unit tests). `FieldResult` is a
4
+ // plain data-class with shape `{ name, found, value }`, so we construct
5
+ // equivalent plain objects locally.
6
/**
 * Build a plain field-result record with the shape `{ name, found, value }`.
 *
 * @param {string} name - Field identifier.
 * @param {boolean} found - Whether the field was located in the source.
 * @param {*} value - Extracted value, or `null` when nothing was found.
 * @returns {{name: string, found: boolean, value: *}} The record.
 */
function fieldResult(name, found, value) {
  return { name, found, value };
}
7
+
8
+ /**
9
+ * Factura Inter-Agencia Document Type Definition
10
+ *
11
+ * Detects CFDIs (XML or PDF) issued between customs broker agencies (e.g.,
12
+ * NORCOM ↔ PALCO). These files are dropped into a pedimento folder by the
13
+ * broker but they are NOT part of the customs electronic file (expediente
14
+ * aduanal) — they are inter-agency billing for broker services.
15
+ *
16
+ * Detection rules (ALL required):
17
+ * 1) CFDI markers present (either xml structure or PDF text representation)
18
+ * 2) Both emisor and receptor RFCs belong to the configured agency pair
19
+ * (NAA120215F20 = NORCOM, PCC1008161WA = PALCO) in any direction.
20
+ * 3) At least one concepto with ClaveProdServ 78141502 (Servicios de
21
+ * agentes aduaneros) — confirms the billing is for broker services.
22
+ *
23
+ * IMPORTANT: This matcher MUST be registered BEFORE `facturasComerciales`
24
+ * in document-type-shared.js — both would match a CFDI in a pedimento
25
+ * folder, but inter-agency invoices must take precedence so they are
26
+ * filtered out of the Arela push pipeline (see arela-api
27
+ * NON_PUSHABLE_TYPES_SQL).
28
+ *
29
+ * Currently scope-limited to NORCOM↔PALCO. To widen, move INTER_AGENCIA_RFCS
30
+ * to env config and require ≥2 distinct RFCs from the configured list.
31
+ */
32
+
33
+ /**
34
+ * RFCs of agencies whose mutual invoices should be excluded from the Arela
35
+ * push pipeline. Order is irrelevant — a match is any pair of distinct RFCs
36
+ * from this set appearing as emisor and receptor.
37
+ */
38
+ export const INTER_AGENCIA_RFCS = ['NAA120215F20', 'PCC1008161WA'];
39
+
40
+ const BROKER_SERVICE_CLAVE_PROD_SERV = '78141502';
41
+
42
+ const CFDI_XML_MARKERS = [
43
+ /cfdi:Comprobante/i,
44
+ /xmlns:cfdi/i,
45
+ /TipoDeComprobante/i,
46
+ ];
47
+
48
/**
 * Human-readable labels rendered by printable CFDI templates. PDF text
 * extraction drops the XML tags, so these stand in for the XML markers.
 *
 * Extracted PDF text wraps long labels across lines, so every pattern must
 * tolerate line breaks: `\s*` covers the gaps between words and `[\s\S]*?`
 * (instead of `.*`, which stops at a newline) covers the free text between
 * "Cadena Original" and "Certificacion Digital del SAT".
 */
const CFDI_PDF_MARKERS = [
  /folio\s*fiscal/i,
  /sello\s*digital\s*del\s*cfdi/i,
  /cadena\s*original[\s\S]*?certificaci[oó]n\s*digital\s*del\s*sat/i,
  /representaci[oó]n\s*impresa\s*de\s*un\s*cfdi/i,
];

/**
 * Detect that the source represents a CFDI — either as the original XML
 * structure or as text extracted from a printed CFDI (PDF representation).
 *
 * At least two markers of the same family (XML or PDF) are required, so a
 * single stray label is not enough to classify a document as a CFDI.
 *
 * @param {string} source - Raw XML or text extracted from a PDF.
 * @returns {boolean} `true` when at least two XML markers or at least two
 *   PDF markers are present.
 */
function isCfdiContent(source) {
  const countHits = (markers) =>
    markers.filter((re) => re.test(source)).length;

  if (countHits(CFDI_XML_MARKERS) >= 2) return true;
  return countHits(CFDI_PDF_MARKERS) >= 2;
}
68
+
69
/**
 * Return the INTER_AGENCIA_RFCS entries that appear in `source`, upper-cased,
 * de-duplicated and ordered by the position of their FIRST occurrence in the
 * text (not by their position in the configured list).
 *
 * Document order matters to the PDF fallback of the emisor/receptor
 * extractors, which take the first and second entries of this result.
 * NOTE(review): that fallback presumes printable CFDIs list the emisor before
 * the receptor — confirm against sample PDFs.
 *
 * Matching is case-insensitive and uses word boundaries so substrings inside
 * larger tokens (cert/sello base64) don't produce false positives.
 *
 * @param {string} source - Raw XML or text extracted from a PDF.
 * @returns {string[]} Matching RFCs in order of first appearance.
 */
function findInterAgenciaRfcs(source) {
  const firstIndexByRfc = new Map();
  for (const rfc of INTER_AGENCIA_RFCS) {
    // A fresh, non-global regex: exec() always reports the first occurrence.
    const hit = new RegExp(`\\b${rfc}\\b`, 'i').exec(source);
    if (!hit) continue;

    const key = rfc.toUpperCase();
    const seenAt = firstIndexByRfc.get(key);
    if (seenAt === undefined || hit.index < seenAt) {
      firstIndexByRfc.set(key, hit.index);
    }
  }
  return [...firstIndexByRfc]
    .sort(([, a], [, b]) => a - b)
    .map(([rfc]) => rfc);
}
82
+
83
/**
 * Document-type definition for inter-agency CFDIs (XML or PDF).
 *
 * Shape: `type` and `extensions` identify the definition, `match` decides
 * whether a source belongs to this type, `extractors` pull individual fields
 * (each returns a `{ name, found, value }` record), and
 * `extractNumPedimento` / `extractPedimentoYear` derive pedimento data from
 * the already-extracted fields.
 */
export const facturaInterAgenciaDefinition = {
  type: 'factura_inter_agencia',
  extensions: ['xml', 'pdf'],

  /**
   * @param {string} source - Raw XML or text extracted from a PDF.
   * @returns {boolean} `true` only when all three detection rules hold.
   */
  match: (source) => {
    if (!isCfdiContent(source)) return false;

    // Need ≥2 distinct configured RFCs present (one as emisor, one as receptor)
    const rfcsFound = findInterAgenciaRfcs(source);
    if (rfcsFound.length < 2) return false;

    // Confirm the invoice is for broker services (customs agent services)
    if (!source.includes(BROKER_SERVICE_CLAVE_PROD_SERV)) return false;

    return true;
  },

  // Pedimento extraction is optional / informational — these files are
  // excluded from push, so arela_path is never composed. We still extract
  // a pedimento number when present (from the "Referencias" / "Pedimento:"
  // section of the printable CFDI) for auditability.
  /**
   * @param {string} source - Unused; kept for the shared signature.
   * @param {Array<{name: string, value: *}>} [fields] - Extracted fields.
   * @returns {string|null} The `numPedimento` field value, or `null`.
   */
  extractNumPedimento: (source, fields) => {
    return fields?.find((f) => f.name === 'numPedimento')?.value ?? null;
  },

  /**
   * Derive the 4-digit pedimento year from the `numPedimento` field.
   *
   * Only a 15-digit pedimento starts with a YY prefix. The `numPedimento`
   * extractor of this definition recovers an 11-digit value (4 + 7 digits,
   * no YY prefix), whose leading digits must NOT be read as a year — doing
   * so would turn "3458 6000046" into 2034. In that case the year is
   * unknown and `null` is returned.
   *
   * @param {string} source - Unused; kept for the shared signature.
   * @param {Array<{name: string, value: *}>} [fields] - Extracted fields.
   * @returns {number|null} Year (YY < 50 → 20YY, otherwise 19YY) or `null`.
   */
  extractPedimentoYear: (source, fields) => {
    const numPedimento = fields?.find((f) => f.name === 'numPedimento')?.value;
    if (typeof numPedimento !== 'string') return null;

    const digits = numPedimento.replace(/\s+/g, '');
    if (!/^\d{15}$/.test(digits)) return null;

    const yy = Number.parseInt(digits.slice(0, 2), 10);
    return yy < 50 ? yy + 2000 : yy + 1900;
  },

  extractors: [
    {
      field: 'rfcEmisor',
      extract: (source) => {
        // XML form: <cfdi:Emisor Rfc="..." />
        const xmlMatch = source.match(
          /<[^>]*Emisor[^>]*Rfc\s*=\s*["']([A-ZÑ&]{3,4}\d{6}[A-Z0-9]{3})["']/i,
        );
        if (xmlMatch) return fieldResult('rfcEmisor', true, xmlMatch[1]);

        // PDF form: no tags to anchor on, so fall back to the first RFC
        // reported by findInterAgenciaRfcs.
        const rfcs = findInterAgenciaRfcs(source);
        if (rfcs.length > 0) return fieldResult('rfcEmisor', true, rfcs[0]);

        return fieldResult('rfcEmisor', false, null);
      },
    },
    {
      field: 'rfcReceptor',
      extract: (source) => {
        // XML form: <cfdi:Receptor Rfc="..." />
        const xmlMatch = source.match(
          /<[^>]*Receptor[^>]*Rfc\s*=\s*["']([A-ZÑ&]{3,4}\d{6}[A-Z0-9]{3})["']/i,
        );
        if (xmlMatch) return fieldResult('rfcReceptor', true, xmlMatch[1]);

        // PDF form: the second RFC reported by findInterAgenciaRfcs.
        const rfcs = findInterAgenciaRfcs(source);
        if (rfcs.length >= 2) {
          return fieldResult('rfcReceptor', true, rfcs[1]);
        }
        return fieldResult('rfcReceptor', false, null);
      },
    },
    {
      field: 'folio',
      extract: (source) => {
        // CFDI Folio attribute
        const xmlMatch = source.match(/\bFolio\s*=\s*["']([A-Z0-9-]+)["']/i);
        if (xmlMatch) return fieldResult('folio', true, xmlMatch[1]);

        // PDF: "Numero Folio 012749"
        const pdfMatch = source.match(/Numero\s+Folio\s+([A-Z0-9-]+)/i);
        if (pdfMatch) return fieldResult('folio', true, pdfMatch[1]);

        return fieldResult('folio', false, null);
      },
    },
    {
      field: 'uuid',
      extract: (source) => {
        // First UUID-shaped token in the document, normalized to upper case.
        const uuidRe =
          /[0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12}/i;
        const m = source.match(uuidRe);
        return fieldResult('uuid', !!m, m ? m[0].toUpperCase() : null);
      },
    },
    {
      field: 'numPedimento',
      extract: (source) => {
        // Printable CFDI "Pedimento: 3458 6000046 Fecha: ..." — recovers an
        // 11-digit pedimento (no YY prefix). Useful for auditability only.
        const m = source.match(/Pedimento:?\s*(\d{4})\s*(\d{7})/i);
        if (m) {
          return fieldResult('numPedimento', true, `${m[1]}${m[2]}`);
        }
        return fieldResult('numPedimento', false, null);
      },
    },
  ],
};
@@ -1,10 +1,8 @@
1
1
  // VUCEM "consultarPedimentoCompleto" XML matcher.
2
2
  //
3
- // STATUS: implemented but NOT registered in `document-type-shared.js`. To
4
- // activate, uncomment the import + registration in that file. All downstream
5
- // code (composeArelaPath, arela-api propagation SQL, IdentifyCommand
6
- // counters) already includes `pedimento_completo_xml`, so re-enabling is a
7
- // single-line change.
3
+ // Registered in `document-type-shared.js`. Downstream code
4
+ // (composeArelaPath, arela-api propagation SQL, IdentifyCommand counters)
5
+ // also includes `pedimento_completo_xml`.
8
6
  //
9
7
  // Filename patterns recognized (try in order — patente extraction):
10
8
  // 1) VU_PATENTE_ADUANA_PEDIMENTO.xml → e.g. VU_3429_070_5016101.xml
@@ -47,6 +45,22 @@ function pad(value, length) {
47
45
  return String(value).padStart(length, '0');
48
46
  }
49
47
 
48
/**
 * Convert a VUCEM `aduanaEntradaSalida.clave` (e.g. "70", "750", "40") to the
 * 2-digit "sección aduanera" prefix used inside the 15-digit pedimento number.
 *
 * VUCEM strips leading zeros from the canonical 3-digit SAT aduana code,
 * so `070` (Ciudad Juárez) arrives as `70`. The pedimento prefix is the
 * first 2 digits of the 3-digit code:
 *   `70`  → `070` → `07`  (Cd. Juárez)
 *   `750` → `750` → `75`  (Puebla)
 *   `40`  → `040` → `04`  (Lázaro Cárdenas)
 *
 * A missing, empty or blank clave yields `null` rather than `"00"`: the
 * extractor that calls this reports `found` as `!!clave`, so an absent clave
 * must not produce a value.
 *
 * @param {string|number|null|undefined} claveValue - Clave as read from the XML.
 * @returns {string|null} Two-digit prefix, or `null` when there is no clave.
 */
function aduanaToSeccion(claveValue) {
  if (claveValue == null) return null;

  const clave = String(claveValue).trim();
  if (clave === '') return null;

  // Left-pad to the canonical 3-digit code, then keep its first two digits.
  return clave.padStart(3, '0').substring(0, 2);
}
63
+
50
64
  /**
51
65
  * Try the three known filename patterns and return {patente, aduana, pedimento}
52
66
  * with any subset of the fields populated. Returns null if no pattern matches.
@@ -102,12 +116,17 @@ function yyFromIsoDate(iso) {
102
116
  return m ? m[1].substring(2, 4) : null;
103
117
  }
104
118
 
105
- // Find <ns2:fechas> block with nested clave==2 and return its <ns2:fecha>.
106
- function findPaymentDate(source) {
119
+ // Find <ns2:fechas> block whose nested <clave> matches `claveValue` and
120
+ // return its <ns2:fecha>. Works for both shapes:
121
+ // <fechas><clave>N</clave><fecha>...</fecha></fechas>
122
+ // <fechas><fecha>...</fecha><tipo><clave>N</clave></tipo></fechas>
123
+ // (firstTag finds the FIRST <clave> in the block — both layouts expose only
124
+ // one clave per fechas entry.)
125
+ function findFechaByClave(source, claveValue) {
107
126
  const fechasBlocks = allTagBlocks(source, 'fechas');
108
127
  for (const block of fechasBlocks) {
109
128
  const clave = firstTag(block, 'clave');
110
- if (clave === '2') {
129
+ if (clave === claveValue) {
111
130
  const fecha = firstTag(block, 'fecha');
112
131
  if (fecha) return fecha;
113
132
  }
@@ -115,6 +134,18 @@ function findPaymentDate(source) {
115
134
  return null;
116
135
  }
117
136
 
137
+ // Fecha de pago de las contribuciones (tipo.clave == 2).
138
+ function findPaymentDate(source) {
139
+ return findFechaByClave(source, '2');
140
+ }
141
+
142
+ // Fecha de presentacion (tipo.clave == 5). This is the authoritative source
143
+ // for the pedimento's YY prefix — a pedimento opened in Dec-2025 but paid in
144
+ // Jan-2026 keeps the `25` prefix, matching what VUCEM stamps in the filename.
145
+ function findPresentationDate(source) {
146
+ return findFechaByClave(source, '5');
147
+ }
148
+
118
149
  // --------------------------- extractors ------------------------------------
119
150
 
120
151
  const rfcExtractor = {
@@ -152,7 +183,7 @@ const aduanaEntradaSalidaExtractor = {
152
183
  return new FieldResult(
153
184
  'aduanaEntradaSalida',
154
185
  !!clave,
155
- clave ? pad(clave, 2) : null,
186
+ aduanaToSeccion(clave),
156
187
  );
157
188
  },
158
189
  };
@@ -165,6 +196,14 @@ const paymentDateExtractor = {
165
196
  },
166
197
  };
167
198
 
199
+ const presentationDateExtractor = {
200
+ field: 'presentationDate',
201
+ extract: (source) => {
202
+ const fecha = findPresentationDate(source);
203
+ return new FieldResult('presentationDate', !!fecha, fecha);
204
+ },
205
+ };
206
+
168
207
  const fechaPagoRectificacionExtractor = {
169
208
  field: 'fechaPagoRectificacion',
170
209
  extract: (source) => {
@@ -257,8 +296,14 @@ export const pedimentoCompletoXmlDefinition = {
257
296
 
258
297
  /**
259
298
  * Compose the 15-digit pedimento number from XML body + filename.
260
- * YY: from rectification fechaPago if present, else from the clave==2
261
- * payment-date fecha; falls back to filename pattern 3.
299
+ * YY: priority order (most authoritative first):
300
+ * 1) Filename pattern 3 (`{15-digit}.xml`) — VUCEM stamps the correct
301
+ * prefix at export time.
302
+ * 2) Fecha de presentacion (<fechas><clave>5) — the year the pedimento
303
+ * was opened. Authoritative for the YY prefix even when payment
304
+ * crosses calendar year (e.g. opened Dec-2025, paid Jan-2026 → YY=25).
305
+ * 3) Rectification fechaPago (only when no presentation date exists).
306
+ * 4) Payment date (last-resort fallback).
262
307
  * AA: first 2 digits of <aduanaEntradaSalida><clave> left-padded to 3 (see aduanaToSeccion).
263
308
  * PPPP: from the filename (any of the three patterns).
264
309
  * NNNNNNN: from <pedimento> padded to 7.
@@ -267,15 +312,19 @@ export const pedimentoCompletoXmlDefinition = {
267
312
  extractNumPedimento: (source, fields, filePath) => {
268
313
  const parts = parseFilenameParts(filePath);
269
314
 
315
+ const presentation = fields?.find(
316
+ (f) => f.name === 'presentationDate' && f.found,
317
+ )?.value;
270
318
  const rect = fields?.find(
271
319
  (f) => f.name === 'fechaPagoRectificacion' && f.found,
272
320
  )?.value;
273
321
  const pay = fields?.find((f) => f.name === 'paymentDate' && f.found)?.value;
274
322
 
275
323
  let yy =
324
+ (parts && parts.year) ||
325
+ yyFromIsoDate(presentation) ||
276
326
  yyFromIsoDate(rect) ||
277
327
  yyFromIsoDate(pay) ||
278
- (parts && parts.year) ||
279
328
  null;
280
329
 
281
330
  const aduanaField = fields?.find(
@@ -315,6 +364,7 @@ export const pedimentoCompletoXmlDefinition = {
315
364
  tipoOperacionExtractor,
316
365
  aduanaEntradaSalidaExtractor,
317
366
  paymentDateExtractor,
367
+ presentationDateExtractor,
318
368
  fechaPagoRectificacionExtractor,
319
369
  coveExtractor,
320
370
  numEDocumentoExtractor,
@@ -17,17 +17,19 @@ export const pedimentoCompletoDefinition = {
17
17
  type: 'pedimento_completo',
18
18
  extensions: ['pdf'],
19
19
  match: (source) => {
20
- // Hard exclude: "FORMA SIMPLIFICADA" is handled by pedimento_simplificado.
21
- if (/FORMA SIMPLIFICADA DE PEDIMENTO/i.test(source)) return false;
20
+ // Hard exclude: "FORMA SIMPLIFICADA [DE|DEL] PEDIMENTO" is handled by
21
+ // pedimento_simplificado.
22
+ if (/FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i.test(source)) return false;
22
23
 
23
24
  // Hard exclude: "AVISO CONSOLIDADO" shares the header trio but is a
24
25
  // different document type handled by aviso_consolidado.
25
26
  if (/AVISO\s+CONSOLIDADO/i.test(source)) return false;
26
27
 
28
+ // The colon after "T. OPER" is optional — see note in pedimento-simplificado.js.
27
29
  const hasHeaderFields =
28
30
  /NUM\.?\s*PEDIMENTO:/i.test(source) &&
29
31
  /CVE\.?\s*PEDIMENTO:/i.test(source) &&
30
- /T\.?\s*OPER:/i.test(source);
32
+ /T\.?\s*OPER:?/i.test(source);
31
33
  if (hasHeaderFields) {
32
34
  const hasCopyMarker =
33
35
  /ORIGINAL:\s*ADMINISTRACION GENERAL DE ADUANAS/i.test(source) ||
@@ -12,15 +12,18 @@ export const pedimentoSimplificadoDefinition = {
12
12
  if (/AVISO\s+CONSOLIDADO/i.test(source)) return false;
13
13
 
14
14
  // Fast path: the literal title appears on standard SIMP layouts.
15
- if (/FORMA SIMPLIFICADA DE PEDIMENTO/i.test(source)) return true;
15
+ // Some prevalidators print "FORMA SIMPLIFICADA DEL PEDIMENTO" (with DEL).
16
+ if (/FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i.test(source)) return true;
16
17
 
17
18
  // Some PDFs (single-page anchors) lack that title but still carry the
18
19
  // three pedimento header fields. Treat them as simplificado UNLESS they
19
20
  // have the multi-page copy markers that uniquely identify a completo.
21
+ // NOTE: the colon after "T. OPER" is optional — many printable layouts
22
+ // render OPER as a table-header label with the value in the next cell.
20
23
  const hasHeaderFields =
21
24
  /NUM\.?\s*PEDIMENTO:/i.test(source) &&
22
25
  /CVE\.?\s*PEDIMENTO:/i.test(source) &&
23
- /T\.?\s*OPER:/i.test(source);
26
+ /T\.?\s*OPER:?/i.test(source);
24
27
  if (!hasHeaderFields) return false;
25
28
 
26
29
  const hasCompletoCopyMarker =
@@ -16,9 +16,9 @@ export const proformaDefinition = {
16
16
  type: 'proforma',
17
17
  extensions: ['pdf'],
18
18
 
19
- // Same content marker as pedimento simplificado
19
+ // Same content marker as pedimento simplificado (accepts "DE" or "DEL").
20
20
  match: (source) => {
21
- return /FORMA SIMPLIFICADA DE PEDIMENTO/i.test(source);
21
+ return /FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i.test(source);
22
22
  },
23
23
 
24
24
  extractNumPedimento: pedimentoSimplificadoDefinition.extractNumPedimento,
@@ -3,10 +3,9 @@ import path from 'path';
3
3
  import { PDFParse } from 'pdf-parse';
4
4
 
5
5
  import { extractDocumentFields } from './document-type-shared.js';
6
+ import { classifyDocument } from './scoring/scoring-engine.js';
6
7
 
7
- // Document types that participate in arela_path composition. The XML type is
8
- // kept here even though its matcher is currently disabled — once re-enabled
9
- // in document-type-shared.js no further changes are needed here.
8
+ // Document types that participate in arela_path composition.
10
9
  const ARELA_PATH_TYPES = new Set([
11
10
  'pedimento_simplificado',
12
11
  'pedimento_completo',
@@ -86,6 +85,17 @@ function composeArelaPath(
86
85
  * Detects document types and extracts metadata from files
87
86
  */
88
87
  export class FileDetectionService {
88
+ constructor() {
89
+ // Best-match matchers (adapted from the API). When set, classification uses
90
+ // the scoring engine; otherwise it falls back to legacy first-match-wins.
91
+ this.matchers = null;
92
+ }
93
+
94
+ /** Provide the resolved+adapted matcher set for scoring-based classification. */
95
+ setMatchers(matchers) {
96
+ this.matchers = matchers && matchers.length ? matchers : null;
97
+ }
98
+
89
99
  /**
90
100
  * Detect document type from a file
91
101
  * @param {string} filePath - Path to the file to analyze
@@ -142,9 +152,23 @@ export class FileDetectionService {
142
152
  };
143
153
  }
144
154
 
145
- // Extract document fields and detect type
146
- const [detectedType, fields, detectedPedimento, detectedPedimentoYear] =
147
- extractDocumentFields(text, fileExtension, filePath);
155
+ // Extract document fields and detect type. Use the best-match scoring
156
+ // engine when matchers are configured; otherwise legacy first-match-wins.
157
+ let detectedType, fields, detectedPedimento, detectedPedimentoYear;
158
+ if (this.matchers) {
159
+ const r = classifyDocument(this.matchers, {
160
+ source: text,
161
+ extension: fileExtension,
162
+ filePath,
163
+ });
164
+ detectedType = r.detectedType;
165
+ fields = r.fields;
166
+ detectedPedimento = r.detectedPedimento;
167
+ detectedPedimentoYear = r.detectedPedimentoYear;
168
+ } else {
169
+ [detectedType, fields, detectedPedimento, detectedPedimentoYear] =
170
+ extractDocumentFields(text, fileExtension, filePath);
171
+ }
148
172
 
149
173
  // Extract RFC from fields
150
174
  const rfc = fields?.find((f) => f.name === 'rfc')?.value ?? null;
@@ -0,0 +1,98 @@
1
+ /**
2
+ * Adapt DB matchers (from arela-api `GET /document-matcher/resolved`) into the
3
+ * shape the scoring engine consumes — the HYBRID model:
4
+ *
5
+ * - SELECTION comes from the DB matcher's clues / qualify (per-RFC + globals).
6
+ * - EXTRACTION uses the rich JS extractors keyed by `documentType` when one
7
+ * exists (resolveType, multi-pattern field extractors, pedimento composition);
8
+ * otherwise it falls back to building simple regex extractors from the DB
9
+ * matcher's `fieldExtractors`.
10
+ *
11
+ * This keeps per-client matching configurable from the UI while preserving the
12
+ * robust field extraction that already ships in the uploader.
13
+ */
14
+ // IMPORTANT: load document-type-shared FIRST so it becomes the root of the
15
+ // shared<->definitions import cycle and fully evaluates before the individual
16
+ // definitions are referenced (otherwise: "Cannot access X before initialization").
17
+ import { FieldResult } from '../document-type-shared.js';
18
+ import { dodaPdfDefinition } from '../document-types/doda-pdf.js';
19
+ import { dodaXmlDefinition } from '../document-types/doda-xml.js';
20
+ import { facturaInterAgenciaDefinition } from '../document-types/factura-inter-agencia.js';
21
+ import { facturasComerciales } from '../document-types/facturas-comerciales.js';
22
+ import { pedimentoCompletoXmlDefinition } from '../document-types/pedimento-completo-xml.js';
23
+ import { pedimentoCompletoDefinition } from '../document-types/pedimento-completo.js';
24
+ import { pedimentoSimplificadoDefinition } from '../document-types/pedimento-simplificado.js';
25
+ import { supportDocumentDefinition } from '../document-types/support-document.js';
26
+
27
/**
 * Pick the extraction half of a JS document-type definition: its field
 * extractors, type resolver and the two pedimento helpers. Members the
 * definition does not provide come through as `undefined`.
 *
 * @param {object} def - A document-type definition.
 * @returns {{extractors: *, resolveType: *, extractNumPedimento: *, extractPedimentoYear: *}}
 */
function extractionOf(def) {
  const {
    extractors,
    resolveType,
    extractNumPedimento,
    extractPedimentoYear,
  } = def;
  return { extractors, resolveType, extractNumPedimento, extractPedimentoYear };
}
36
+
37
+ const EXTRACTION_REGISTRY = {
38
+ pedimento_simplificado: extractionOf(pedimentoSimplificadoDefinition),
39
+ pedimento_completo: extractionOf(pedimentoCompletoDefinition),
40
+ pedimento_completo_xml: extractionOf(pedimentoCompletoXmlDefinition),
41
+ doda_pdf: extractionOf(dodaPdfDefinition),
42
+ doda_xml: extractionOf(dodaXmlDefinition),
43
+ factura_inter_agencia: extractionOf(facturaInterAgenciaDefinition),
44
+ factura_comercial: extractionOf(facturasComerciales),
45
+ support_document: extractionOf(supportDocumentDefinition),
46
+ };
47
+
48
/**
 * Build a scoring-engine extractor from a DB fieldExtractor (regex + capture).
 *
 * The pattern is compiled once, when the extractor is built. An invalid
 * pattern or a non-string source never throws: the extractor simply reports
 * the field as not found, so one bad DB row cannot break classification.
 *
 * The value is capture group 1 when the pattern defines one, otherwise the
 * whole match. `exec` is used instead of `String.prototype.match` because
 * `match` with a `g` flag returns every full match and NO capture groups —
 * index 1 would then be the second match, not group 1.
 *
 * @param {{field: string, extractor: string, flags?: string}} fe - DB field extractor.
 * @returns {{field: string, extract: function(string): FieldResult}}
 */
function regexExtractor(fe) {
  let compiled = null;
  try {
    compiled = new RegExp(fe.extractor, fe.flags || '');
  } catch {
    // Invalid pattern or flags coming from the DB: best-effort, report
    // "not found" at extraction time instead of crashing.
    compiled = null;
  }

  return {
    field: fe.field,
    extract: (source) => {
      if (!compiled || typeof source !== 'string') {
        return new FieldResult(fe.field, false, null);
      }
      // `g`/`y` regexes are stateful; always search from the start.
      compiled.lastIndex = 0;
      const m = compiled.exec(source);
      return new FieldResult(fe.field, !!m, m ? (m[1] ?? m[0]) : null);
    },
  };
}
62
+
63
/**
 * Convert resolved DB matchers into scoring-engine matchers.
 *
 * Selection data (extensions, minScore, priority, qualify, clues) is copied
 * from the DB matcher with defaults applied. Extraction comes from
 * EXTRACTION_REGISTRY when `documentType` is one of its OWN keys; otherwise
 * simple regex extractors are built from the matcher's `fieldExtractors`.
 *
 * The own-key check matters because `documentType` is external data: a plain
 * `registry[key]` lookup would resolve inherited members such as
 * `constructor` or `toString` and leave the matcher without extractors.
 *
 * @param {Array} dbMatchers - matchers from the API (with clues + fieldExtractors)
 * @returns {Array} scoring matchers (empty when `dbMatchers` is null/undefined)
 */
export function adaptDbMatchers(dbMatchers) {
  return (dbMatchers || []).map((m) => {
    const hasRichExtraction = Object.prototype.hasOwnProperty.call(
      EXTRACTION_REGISTRY,
      m.documentType,
    );
    const extraction = hasRichExtraction
      ? EXTRACTION_REGISTRY[m.documentType]
      : {
          extractors: (m.fieldExtractors || []).map((fe) =>
            regexExtractor(fe),
          ),
        };

    return {
      documentType: m.documentType,
      // Accept either an array or a comma-separated string of extensions.
      extensions: Array.isArray(m.extensions)
        ? m.extensions
        : String(m.extensions || '')
            .split(',')
            .map((s) => s.trim())
            .filter(Boolean),
      minScore: m.minScore ?? undefined,
      priority: m.priority ?? 0,
      qualify: m.qualify ?? undefined,
      clues: (m.clues || []).map((c) => ({
        kind: c.kind,
        pattern: c.pattern,
        flags: c.flags || undefined,
        weight: c.weight ?? 1,
        group: c.group || undefined,
        required: !!c.required,
        negative: !!c.negative,
      })),
      ...extraction,
    };
  });
}