@arela/uploader 1.0.22 → 1.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,386 @@
1
+ /**
2
+ * Seed matchers for the scoring engine (PROTOTYPE).
3
+ *
4
+ * Each entry re-expresses the boolean `match()` of an existing
5
+ * `src/document-types/*.js` definition as a set of weighted **clues**, while
6
+ * REUSING that definition's `extractors` / `resolveType` /
7
+ * `extractNumPedimento` / `extractPedimentoYear` verbatim. Only the SELECTION
8
+ * logic is new — field extraction is unchanged, so a comparison against
9
+ * `extractDocumentFields` isolates the first-match-wins → best-match change.
10
+ *
11
+ * Clue → flag mapping used throughout:
12
+ * - strong positive signal → high `weight`
13
+ * - hard exclusion (return false in the original) → `negative: true`
14
+ * - mandatory signature → `required: true`
15
+ * - `minScore` is tuned so the weighted sum reproduces the original boolean
16
+ * on the existing test fixtures.
17
+ *
18
+ * NOTE: a few original conditions are compound (e.g. simplificado's
19
+ * "COVE: present AND PAGO absent", or inter-agencia's "≥2 distinct RFCs from a
20
+ * set"). Where a single clue cannot express the exact boolean, the closest
21
+ * faithful approximation is used and flagged inline — the comparison harness
22
+ * (`scripts/scoring-compare.js`) surfaces any divergence on a real corpus.
23
+ */
24
+ import { dodaPdfDefinition } from '../document-types/doda-pdf.js';
25
+ import { dodaXmlDefinition } from '../document-types/doda-xml.js';
26
+ import {
27
+ INTER_AGENCIA_RFCS,
28
+ facturaInterAgenciaDefinition,
29
+ } from '../document-types/factura-inter-agencia.js';
30
+ import { facturasComerciales } from '../document-types/facturas-comerciales.js';
31
+ import { pedimentoCompletoXmlDefinition } from '../document-types/pedimento-completo-xml.js';
32
+ import { pedimentoCompletoDefinition } from '../document-types/pedimento-completo.js';
33
+ import { pedimentoSimplificadoDefinition } from '../document-types/pedimento-simplificado.js';
34
+ import { supportDocumentDefinition } from '../document-types/support-document.js';
35
+
36
/**
 * Copy the extraction-related members of an existing document-type definition
 * so a scoring matcher can reuse them unchanged.
 *
 * @param {object} def - Definition exposing `extractors`, `resolveType`,
 *   `extractNumPedimento` and `extractPedimentoYear` (any may be undefined).
 * @returns {object} New object holding exactly those four members.
 */
function reuse({
  extractors,
  resolveType,
  extractNumPedimento,
  extractPedimentoYear,
}) {
  return {
    extractors,
    resolveType,
    extractNumPedimento,
    extractPedimentoYear,
  };
}
45
+
46
// Digit groups of 2, 2, 4 and 7, each pair of groups optionally separated by a
// single whitespace character. Used as the `pedimento` clue of `doda_pdf`.
const PEDIMENTO_NUM = /\d{2}\s?\d{2}\s?\d{4}\s?\d{7}/;
47
+
48
// --- pedimento_simplificado --------------------------------------------------
// Mirrors the original boolean matcher:
//   - AVISO CONSOLIDADO / COVE text rules the type out (negative clues);
//   - the title "FORMA SIMPLIFICADA DE[L] PEDIMENTO" qualifies on its own;
//   - otherwise all three header labels must be present.
// Copy markers are intentionally NOT negative here: they add score to
// `pedimento_completo` instead, which lets it outscore this matcher on completo
// layouts and so reproduces the original title short-circuit.
const simplificado = {
  documentType: 'pedimento_simplificado',
  extensions: ['pdf'],
  minScore: 3, // title alone (5) or the complete header trio (1 + 1 + 1)
  priority: 1,
  ...reuse(pedimentoSimplificadoDefinition),
  clues: [
    // hard exclusions
    ...[
      /AVISO\s+CONSOLIDADO/i,
      /COMPROBANTE\s+DE\s+VALOR\s+ELECTR[ÓO]NICO/i,
    ].map((pattern) => ({ kind: 'CONTENT_REGEX', pattern, negative: true })),
    // document title
    {
      kind: 'CONTENT_REGEX',
      pattern: /FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i,
      weight: 5,
    },
    // header trio
    ...[
      /NUM\.?\s*PEDIMENTO:/i,
      /CVE\.?\s*PEDIMENTO:/i,
      /T\.?\s*OPER:?/i,
    ].map((pattern) => ({ kind: 'CONTENT_REGEX', pattern, weight: 1 })),
  ],
};
76
+
77
// --- pedimento_completo ------------------------------------------------------
// Mirrors the original boolean matcher: FORMA SIMPLIFICADA and AVISO are
// excluded; then either (header trio AND at least one copy marker) or the
// clue-count heuristic (>25% of ~18 clues, i.e. roughly 5 or more) qualifies.
const completo = {
  documentType: 'pedimento_completo',
  extensions: ['pdf'],
  minScore: 5, // header trio (3) + one copy marker (2), or >= 5 fallback clues
  priority: 1,
  ...reuse(pedimentoCompletoDefinition),
  clues: [
    // hard exclusions
    ...[
      /FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i,
      /AVISO\s+CONSOLIDADO/i,
    ].map((pattern) => ({ kind: 'CONTENT_REGEX', pattern, negative: true })),
    // header trio
    ...[
      /NUM\.?\s*PEDIMENTO:/i,
      /CVE\.?\s*PEDIMENTO:/i,
      /T\.?\s*OPER:?/i,
    ].map((pattern) => ({ kind: 'CONTENT_REGEX', pattern, weight: 1 })),
    // copy markers (long-form pedimento signatures)
    ...[
      /ORIGINAL:\s*ADMINISTRACION GENERAL DE ADUANAS/i,
      /SEGUNDA\s+COPIA/i,
      /TERCERA\s+COPIA/i,
      /COPIA\s+(SIMPLIFICAD[AO])?\s*TRANSPORTISTA/i,
      /DEFINITIVO/i,
      /ANEXO\s+DEL\s+PEDIMENTO/i,
      /\*+FIN\s+DE\s+PEDIMENTO\s*\*+/i,
    ].map((pattern) => ({ kind: 'CONTENT_REGEX', pattern, weight: 2 })),
    // exotic-layout fallback clues
    ...[
      /CERTIFICACIONES/i,
      /CUADRO\s+DE\s+LIQUIDACION/i,
      /\*\*\*\s+PAGO\s+ELECTRONICO\s+\*\*\*/i,
      /MEDIOS\s+DE\s+TRANSPORTE/i,
      /DATOS\s+DEL\s+IMPORTADOR\/EXPORTADOR/i,
      /DATOS\s+DEL\s+PROVEEDOR\s+O\s+COMPRADOR/i,
      /LINEA\s+DE\s+CAPTURA:/i,
      /DECLARO\s+BAJO\s+PROTESTA\s+DE\s+DECIR\s+VERDAD/i,
      /PEDIMENTO\s+ELABORADO\s+DE\s+CONFORMIDAD/i,
    ].map((pattern) => ({ kind: 'CONTENT_REGEX', pattern, weight: 1 })),
  ],
};
149
+
150
// --- pedimento_completo_xml --------------------------------------------------
// The original matcher has a single condition: the VUCEM response root tag.
// It is modelled as one required clue, so its absence disqualifies the matcher.
const completoXml = {
  documentType: 'pedimento_completo_xml',
  extensions: ['xml'],
  minScore: 1,
  // Authoritative signal: must dominate doda_xml / support_document on XML.
  priority: 2,
  ...reuse(pedimentoCompletoXmlDefinition),
  clues: [
    {
      kind: 'CONTENT_REGEX',
      pattern: /consultarPedimentoCompletoRespuesta/i,
      weight: 10,
      required: true,
    },
  ],
};
167
+
168
// --- doda_pdf ----------------------------------------------------------------
// Mirrors the original boolean matcher, expressed as three alternative
// `qualify` rules over clue groups:
//   1. the primary marker alone;
//   2. at least two secondary markers plus a pedimento number;
//   3. DODA context plus a pedimento number plus at least one secondary marker.
const dodaPdf = {
  documentType: 'doda_pdf',
  extensions: ['pdf'],
  priority: 1,
  qualify: [
    { primary: 1 },
    { secondary: 2, pedimento: 1 },
    { context: 1, pedimento: 1, secondary: 1 },
  ],
  ...reuse(dodaPdfDefinition),
  clues: [
    {
      kind: 'CONTENT_REGEX',
      pattern: /DOCUMENTO DE OPERACI[OÓ]N PARA DESPACHO ADUANERO/i,
      weight: 5,
      group: 'primary',
    },
    ...[/DODA/i, /VUCEM/i].map((pattern) => ({
      kind: 'CONTENT_REGEX',
      pattern,
      weight: 1,
      group: 'secondary',
    })),
    {
      kind: 'CONTENT_REGEX',
      pattern: PEDIMENTO_NUM,
      weight: 1,
      group: 'pedimento',
    },
    {
      kind: 'CONTENT_REGEX',
      pattern: /despacho aduanero|operaci[oó]n aduanera|validaci[oó]n/i,
      weight: 1,
      group: 'context',
    },
  ],
};
204
+
205
// --- doda_xml ----------------------------------------------------------------
// Mirrors the original boolean matcher: any single DODA marker qualifies, or
// at least three pedimento markers together with an XML declaration.
const dodaXml = {
  documentType: 'doda_xml',
  extensions: ['xml'],
  priority: 1,
  qualify: [{ doda: 1 }, { pedimento: 3, xml: 1 }],
  ...reuse(dodaXmlDefinition),
  clues: [
    // DODA markers
    ...[
      /documentoOperacion/i,
      /despachoAduanero/i,
      /<doda\b/i,
      /xmlns[^"]*doda/i,
      /VUCEM/i,
    ].map((pattern) => ({
      kind: 'CONTENT_REGEX',
      pattern,
      weight: 3,
      group: 'doda',
    })),
    // pedimento markers
    ...[
      /numPedimento/i,
      /patenteAduanal/i,
      /aduanaDespacho/i,
      /tipoOperacion/i,
    ].map((pattern) => ({
      kind: 'CONTENT_REGEX',
      pattern,
      weight: 1,
      group: 'pedimento',
    })),
    // Structural gate for the pedimento-markers rule: weight 0 means it only
    // counts toward the `xml` group and never adds to the score.
    { kind: 'CONTENT_REGEX', pattern: /<\?xml/i, weight: 0, group: 'xml' },
  ],
};
262
+
263
// --- factura_inter_agencia ---------------------------------------------------
// Original condition: CFDI content AND at least two distinct configured RFCs
// AND the broker-service clave. Here every configured agency RFC and the clave
// are REQUIRED clues, which is what lets this matcher beat `factura_comercial`
// on score alone, with no dependency on registration order. Scope is limited
// to the configured RFCs, same as the original; widening it would mean turning
// RFC presence into a counting rule.
// NOTE(review): the minScore comment assumes INTER_AGENCIA_RFCS holds exactly
// two entries — confirm against factura-inter-agencia.js.
const interAgencia = {
  documentType: 'factura_inter_agencia',
  extensions: ['xml', 'pdf'],
  minScore: 25, // both required RFCs (10 + 10) + required clave (5)
  priority: 3,
  ...reuse(facturaInterAgenciaDefinition),
  clues: [
    // one required clue per configured agency RFC (whole-word match)
    ...INTER_AGENCIA_RFCS.map((rfc) => {
      const pattern = new RegExp(`\\b${rfc}\\b`, 'i');
      return { kind: 'CONTENT_REGEX', pattern, weight: 10, required: true };
    }),
    // BROKER_SERVICE_CLAVE_PROD_SERV (78141502 = servicios de agentes aduaneros)
    { kind: 'CONTENT_REGEX', pattern: /78141502/, weight: 5, required: true },
    // CFDI content markers: optional, they only add score
    ...[/cfdi:Comprobante/i, /xmlns:cfdi/i, /TipoDeComprobante/i].map(
      (pattern) => ({ kind: 'CONTENT_REGEX', pattern, weight: 1 }),
    ),
  ],
};
290
+
291
// --- factura_comercial -------------------------------------------------------
// Original condition: cfdiMatches >= 2 OR (invoiceMatches >= 1 AND
// customsMatches >= 1). Clue groups plus `qualify` (OR-of-ANDs) express this
// exactly; a flat minScore could not enforce the "invoice AND customs" pairing
// and produced false positives on COVE acuses, where customs keywords alone
// reached the threshold.
const facturaComercial = {
  documentType: 'factura_comercial',
  extensions: ['pdf', 'xml'],
  priority: 0,
  qualify: [{ cfdi: 2 }, { invoice: 1, customs: 1 }],
  ...reuse(facturasComerciales),
  clues: [
    // CFDI markers
    ...[
      /cfdi:Comprobante/i,
      /xmlns:cfdi/i,
      /TipoDeComprobante/i,
      /timbreFiscalDigital/i,
      /SelloSAT/i,
    ].map((pattern) => ({ kind: 'CONTENT_REGEX', pattern, group: 'cfdi' })),
    // Invoice wording. The trailing group of the first pattern is optional, so
    // that pattern matches any occurrence of "factura".
    ...[
      /factura\s*(comercial|de\s*venta|de\s*exportaci[oó]n)?/i,
      /commercial\s*invoice/i,
      /invoice\s*number/i,
      /n[uú]mero\s*de\s*factura/i,
    ].map((pattern) => ({ kind: 'CONTENT_REGEX', pattern, group: 'invoice' })),
    // customs wording
    ...[
      /pedimento/i,
      /aduana/i,
      /importaci[oó]n|exportaci[oó]n/i,
      /despacho\s*aduanero/i,
      /fracci[oó]n\s*arancelaria/i,
    ].map((pattern) => ({ kind: 'CONTENT_REGEX', pattern, group: 'customs' })),
  ],
};
343
+
344
// --- support_document --------------------------------------------------------
// Original condition: soapFound >= 2 OR customsFound >= 2. This is the broad
// fallback, so it carries the lowest priority.
const supportDocument = {
  documentType: 'support_document',
  extensions: ['xml', 'txt', 'json'],
  priority: -1,
  qualify: [{ soap: 2 }, { customs: 2 }],
  ...reuse(supportDocumentDefinition),
  clues: [
    // SOAP / service markers
    ...[
      /soapenv:Envelope/i,
      /xmlns:soapenv=/i,
      /solicitarRecibirCoveServicio/i,
      /tipoOperacion/i,
      /patenteAduanal/i,
    ].map((pattern) => ({ kind: 'CONTENT_REGEX', pattern, group: 'soap' })),
    // Customs-metadata fallback: the group has exactly two clues and the rule
    // asks for two, so BOTH patterns must be present (as in the original).
    ...[/rfc/i, /patente|aduana|customs|pedimento/i].map((pattern) => ({
      kind: 'CONTENT_REGEX',
      pattern,
      group: 'customs',
    })),
  ],
};
372
+
373
/**
 * Default/global seed set of scoring matchers.
 *
 * Array position is not meant to drive selection: the scoring engine ranks the
 * qualifying candidates instead of taking the first match. This list is also
 * the intended seed for the DEFAULT matchers once the model moves to the DB.
 */
export const scoringMatchers = [
  // pedimento family
  simplificado,
  completo,
  completoXml,
  // DODA family
  dodaPdf,
  dodaXml,
  // invoices
  interAgencia,
  facturaComercial,
  // broad fallback
  supportDocument,
];
@@ -0,0 +1,218 @@
1
+ /**
2
+ * Scoring-based document classification engine (PROTOTYPE).
3
+ *
4
+ * Replaces the "first-match-wins" selection in `document-type-shared.js`
5
+ * (`extractDocumentFields`) with "best-match": every applicable matcher is
6
+ * scored by the weight of the clues it satisfies, and the highest score wins.
7
+ * This removes the order-dependent registration that the current registry
8
+ * relies on (e.g. `factura_inter_agencia` MUST be evaluated before
9
+ * `facturas_comerciales`) — precedence now lives in clue weights / `required`
10
+ * / `negative` flags, not in array order.
11
+ *
12
+ * Matcher shape (see `matchers-seed.js`):
13
+ * {
14
+ * documentType, extensions[], minScore, priority,
15
+ * clues: [{ kind, pattern, flags?, weight=1, group?, required=false, negative=false }],
16
+ * qualify?: [{ <group>: minCount, ... }, ...], // OR-of-ANDs gate
17
+ * extractors, resolveType?, extractNumPedimento?, extractPedimentoYear?
18
+ * }
19
+ *
20
+ * Two separable concerns:
21
+ * - QUALIFICATION (does this matcher apply at all?): `required`/`negative`
22
+ * clues plus an optional `qualify` rule set. `qualify` is a list of
23
+ * alternative rules (OR); a rule is a map of `group -> minimum matched
24
+ * clues` (AND across its entries). This expresses the grouped boolean gates
25
+ * of the original matchers, e.g. `(cfdi>=2) OR (invoice>=1 AND customs>=1)`.
26
+ * When `qualify` is absent, the gate falls back to `score >= minScore`.
27
+ * - RANKING (which qualifying matcher wins?): always the weighted sum of
28
+ * matched clues (`score`), tie-broken by fraction -> priority -> type.
29
+ *
30
+ * Selection only depends on clues/qualify. The winning matcher's `extractors` /
31
+ * `resolveType` / `extractNumPedimento` / `extractPedimentoYear` run AFTER
32
+ * selection with the same post-processing as `extractDocumentFields`, so a
33
+ * side-by-side comparison isolates the selection change.
34
+ */
35
+ import path from 'path';
36
+
37
+ import { FieldResult } from '../document-type-shared.js';
38
+
39
/**
 * Resolve a clue's `pattern` to a RegExp that is safe to `.test()` right away.
 *
 * @param {{ pattern: RegExp|string, flags?: string }} clue - Clue whose
 *   `pattern` is either a RegExp instance or a pattern string compiled with
 *   `clue.flags` (no flags when omitted).
 * @returns {RegExp} The clue's own RegExp instance, or a freshly compiled one.
 */
function toRegExp(clue) {
  const { pattern } = clue;
  if (pattern instanceof RegExp) {
    // The same instance is reused for every document. With the `g` or `y`
    // flag, `.test()` resumes from `lastIndex`, so a previous hit would make
    // the next document miss. Rewind so every evaluation starts at index 0.
    pattern.lastIndex = 0;
    return pattern;
  }
  return new RegExp(pattern, clue.flags ?? '');
}
44
+
45
/**
 * Pick the text a clue is evaluated against.
 *
 * @param {{ kind?: string }} clue - Clue being evaluated.
 * @param {{ fileName?: string, source?: string }} ctx - Document context.
 * @returns {string} `ctx.fileName` for `FILENAME_REGEX` clues, `ctx.source`
 *   for every other kind; an empty string when the chosen value is nullish.
 */
function clueTarget(clue, ctx) {
  if (clue.kind === 'FILENAME_REGEX') {
    return ctx.fileName ?? '';
  }
  return ctx.source ?? '';
}
51
+
52
/**
 * Score a single matcher against a document context.
 *
 * Extensions are compared case-insensitively and without a leading dot, so a
 * `path.extname()`-style value (".PDF") matches a matcher declaring "pdf".
 * A matcher with no `extensions`, or a context with no extension, skips the
 * extension gate entirely.
 *
 * @param {object} matcher - Matcher definition: `documentType`, optional
 *   `extensions`, `clues`, `qualify`, `minScore` (default 1) and `priority`
 *   (default 0).
 * @param {{ source?: string, fileName?: string, extension?: string }} ctx -
 *   Document context the clues are tested against.
 * @returns {null|object} `null` when the matcher does not apply (extension
 *   mismatch); `{ documentType, disqualified: true, reason }` when a `negative`
 *   clue hits or a `required` clue misses; otherwise a scored result with
 *   `score`, `totalWeight`, `fraction`, `priority`, `passed`, `matchedClues`
 *   and `groupCounts`.
 */
export function scoreMatcher(matcher, ctx) {
  const normalizeExt = (value) =>
    String(value ?? '')
      .trim()
      .toLowerCase()
      .replace(/^\./, '');

  const ext = normalizeExt(ctx.extension);
  const allowedExts = Array.isArray(matcher.extensions)
    ? matcher.extensions.map(normalizeExt)
    : [];
  if (allowedExts.length > 0 && ext && !allowedExts.includes(ext)) {
    return null;
  }

  let matchedWeight = 0;
  let totalWeight = 0;
  const matchedClues = [];
  const groupCounts = {};

  for (const clue of matcher.clues ?? []) {
    const weight = clue.weight ?? 1;
    const hit = toRegExp(clue).test(clueTarget(clue, ctx));

    // Negative clues never contribute weight: a hit vetoes the matcher,
    // a miss is simply ignored.
    if (clue.negative) {
      if (hit) {
        return {
          documentType: matcher.documentType,
          disqualified: true,
          reason: `negative:${clue.pattern}`,
        };
      }
      continue;
    }

    // A missing required clue vetoes the matcher regardless of other hits.
    if (clue.required && !hit) {
      return {
        documentType: matcher.documentType,
        disqualified: true,
        reason: `required-missing:${clue.pattern}`,
      };
    }

    totalWeight += weight;
    if (hit) {
      matchedWeight += weight;
      matchedClues.push(clue);
      if (clue.group) {
        groupCounts[clue.group] = (groupCounts[clue.group] ?? 0) + 1;
      }
    }
  }

  // Qualification gate: `qualify` rules (OR-of-ANDs over group hit counts)
  // when present, otherwise the weighted-score threshold.
  const passed = Array.isArray(matcher.qualify)
    ? matcher.qualify.some((rule) =>
        Object.entries(rule).every(
          ([group, min]) => (groupCounts[group] ?? 0) >= min,
        ),
      )
    : matchedWeight >= (matcher.minScore ?? 1);

  return {
    documentType: matcher.documentType,
    matcher,
    disqualified: false,
    score: matchedWeight,
    totalWeight,
    fraction: totalWeight > 0 ? matchedWeight / totalWeight : 0,
    priority: matcher.priority ?? 0,
    passed,
    matchedClues,
    groupCounts,
  };
}
130
+
131
/**
 * Score every matcher and return the qualifying candidates, best first.
 *
 * A result is kept only when the matcher applies, is not disqualified and
 * passed its qualification gate. Ranking: score desc, then fraction desc,
 * then priority desc, then `documentType` via `localeCompare`.
 *
 * @param {Iterable<object>} matchers - Matcher definitions to evaluate.
 * @param {object} ctx - Document context handed to `scoreMatcher`.
 * @returns {object[]} Sorted array of scored results (possibly empty).
 */
export function scoreAll(matchers, ctx) {
  const byRank = (a, b) => {
    const scoreDelta = b.score - a.score;
    if (scoreDelta) return scoreDelta;

    const fractionDelta = b.fraction - a.fraction;
    if (fractionDelta) return fractionDelta;

    const priorityDelta = b.priority - a.priority;
    if (priorityDelta) return priorityDelta;

    return String(a.documentType).localeCompare(String(b.documentType));
  };

  return [...matchers]
    .map((matcher) => scoreMatcher(matcher, ctx))
    .filter((result) => result && !result.disqualified && result.passed)
    .sort(byRank);
}
151
+
152
/**
 * Return the top-ranked qualifying candidate, or `null` when none qualifies.
 *
 * @param {Iterable<object>} matchers - Matcher definitions to evaluate.
 * @param {object} ctx - Document context handed to `scoreAll`.
 * @returns {object|null} Best scored result, or `null`.
 */
export function selectBestMatch(matchers, ctx) {
  const ranked = scoreAll(matchers, ctx);
  const best = ranked[0];
  return best ?? null;
}
155
+
156
/**
 * Full classification: rank the matchers, then run the WINNER's extractors,
 * `resolveType` and pedimento helpers. The post-selection steps are meant to
 * mirror `extractDocumentFields` so a comparison isolates the selection change.
 *
 * @param {Iterable<object>} matchers - Matcher definitions to evaluate.
 * @param {{ source: string, extension: string, filePath?: string }} doc -
 *   Document text, its extension and (optionally) its path; the base name of
 *   the path is what `FILENAME_REGEX` clues are tested against.
 * @returns {{ detectedType, fields, detectedPedimento, detectedPedimentoYear,
 *   winner, candidates }} All-null/empty values (plus `candidates`) when no
 *   matcher qualifies.
 */
export function classifyDocument(matchers, { source, extension, filePath }) {
  const fileName = filePath ? path.basename(filePath) : '';
  const candidates = scoreAll(matchers, { source, extension, fileName });
  const winner = candidates[0] ?? null;

  if (winner === null) {
    return {
      detectedType: null,
      fields: [],
      detectedPedimento: null,
      detectedPedimentoYear: null,
      winner: null,
      candidates,
    };
  }

  const def = winner.matcher;

  // One field per extractor; a throwing extractor is recorded as a failed
  // FieldResult rather than aborting the classification.
  const fields = Array.from(def.extractors ?? [], (extractor) => {
    try {
      return extractor.extract(source);
    } catch {
      return new FieldResult(extractor.field, false, null);
    }
  });

  let detectedType = def.documentType;
  if (def.resolveType) {
    detectedType = def.resolveType(fields);
  }

  let detectedPedimento = null;
  if (def.extractNumPedimento) {
    detectedPedimento = def.extractNumPedimento(source, fields, filePath);
  }

  let detectedPedimentoYear = null;
  if (def.extractPedimentoYear) {
    detectedPedimentoYear = def.extractPedimentoYear(source, fields, filePath);
  }

  // Backfill numPedimento as a field (same as extractDocumentFields) so
  // downstream consumers (composeArelaPath) see a consistent shape.
  const hasNumPedimento = fields.some((f) => f.name === 'numPedimento');
  if (detectedPedimento && !hasNumPedimento) {
    fields.push(new FieldResult('numPedimento', true, detectedPedimento));
  }

  return {
    detectedType,
    fields,
    detectedPedimento,
    detectedPedimentoYear,
    winner,
    candidates,
  };
}
@@ -389,6 +389,20 @@ export class ScanApiService {
389
389
  return result;
390
390
  }
391
391
 
392
+ /**
393
+ * Fetch the resolved matcher set (this RFC's matchers + globals) for runtime
394
+ * classification. Returns an array of matchers with clues + fieldExtractors.
395
+ * @param {string|null} rfc - optional RFC to scope per-company matchers
396
+ */
397
+ async getResolvedMatchers(rfc = null) {
398
+ const qs = rfc ? `?rfc=${encodeURIComponent(rfc)}` : '';
399
+ const result = await this.#request(
400
+ `/api/document-matcher/resolved${qs}`,
401
+ 'GET',
402
+ );
403
+ return Array.isArray(result) ? result : [];
404
+ }
405
+
392
406
  async fetchPdfsForDetection(
393
407
  tableName,
394
408
  offset = 0,