@arela/uploader 1.0.23 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/docs/AUTO_PROCESSING_PIPELINE.md +258 -0
  2. package/docs/COMPLETE_USAGE_GUIDE.md +1363 -0
  3. package/docs/DATABASESERVICE_IMPROVEMENTS.md +546 -0
  4. package/docs/PASO_2_TEST_RESULTS.md +298 -0
  5. package/docs/PASO_3_PLAN.md +385 -0
  6. package/docs/PHASE_1_FILE_DETECTION.md +366 -0
  7. package/docs/PHASE_2_API_INTEGRATION.md +426 -0
  8. package/docs/PHASE_3_DATABASE_MANAGEMENT.md +480 -0
  9. package/docs/PHASE_4_FILE_OPERATIONS.md +448 -0
  10. package/docs/PHASE_5_WATCH_MODE.md +450 -0
  11. package/docs/PHASE_6_SIGNAL_HANDLING.md +472 -0
  12. package/docs/PHASE_7_ADVANCED_FEATURES.md +560 -0
  13. package/docs/PLAN_WATCH_FEATURE.md +417 -0
  14. package/docs/README.md +480 -0
  15. package/docs/SCHEMA_ALIGNMENT_SUMMARY.md +301 -0
  16. package/docs/SMARTWATCH_DATABASE_REFACTORING.md +181 -0
  17. package/docs/SMART_WATCH_DATABASE_CHANGES.md +502 -0
  18. package/docs/TESTING_WATCH_MODE.md +212 -0
  19. package/docs/WATCHER_API_IMPLEMENTATION.md +520 -0
  20. package/docs/WATCHER_API_INTEGRATION.md +562 -0
  21. package/docs/WATCHER_SETUP_GUIDE.md +614 -0
  22. package/docs/WATCH_ARCHITECTURE.md +395 -0
  23. package/docs/WATCH_AUTO_PIPELINE.md +334 -0
  24. package/docs/WATCH_CONFIGURATION.md +267 -0
  25. package/docs/WATCH_USAGE_GUIDE.md +567 -0
  26. package/docs/commands.md +14 -0
  27. package/package.json +1 -1
  28. package/scripts/scoring-compare.js +243 -0
  29. package/scripts/scoring-phase4-check.js +96 -0
  30. package/src/commands/IdentifyCommand.js +36 -0
  31. package/src/config/config.js +2 -2
  32. package/src/file-detection.js +71 -4
  33. package/src/scoring/db-matcher-adapter.js +98 -0
  34. package/src/scoring/matchers-seed.js +386 -0
  35. package/src/scoring/scoring-engine.js +246 -0
  36. package/src/services/ScanApiService.js +14 -0
  37. package/tests/unit/scoring-engine.test.js +221 -0
  38. package/.vscode/settings.json +0 -1
  39. package/coverage/IdentifyCommand.js.html +0 -1462
  40. package/coverage/PropagateCommand.js.html +0 -1507
  41. package/coverage/PushCommand.js.html +0 -1504
  42. package/coverage/ScanCommand.js.html +0 -1654
  43. package/coverage/UploadCommand.js.html +0 -1846
  44. package/coverage/WatchCommand.js.html +0 -4111
  45. package/coverage/base.css +0 -224
  46. package/coverage/block-navigation.js +0 -87
  47. package/coverage/favicon.png +0 -0
  48. package/coverage/index.html +0 -191
  49. package/coverage/lcov-report/IdentifyCommand.js.html +0 -1462
  50. package/coverage/lcov-report/PropagateCommand.js.html +0 -1507
  51. package/coverage/lcov-report/PushCommand.js.html +0 -1504
  52. package/coverage/lcov-report/ScanCommand.js.html +0 -1654
  53. package/coverage/lcov-report/UploadCommand.js.html +0 -1846
  54. package/coverage/lcov-report/WatchCommand.js.html +0 -4111
  55. package/coverage/lcov-report/base.css +0 -224
  56. package/coverage/lcov-report/block-navigation.js +0 -87
  57. package/coverage/lcov-report/favicon.png +0 -0
  58. package/coverage/lcov-report/index.html +0 -191
  59. package/coverage/lcov-report/prettify.css +0 -1
  60. package/coverage/lcov-report/prettify.js +0 -2
  61. package/coverage/lcov-report/sort-arrow-sprite.png +0 -0
  62. package/coverage/lcov-report/sorter.js +0 -210
  63. package/coverage/lcov.info +0 -1937
  64. package/coverage/prettify.css +0 -1
  65. package/coverage/prettify.js +0 -2
  66. package/coverage/sort-arrow-sprite.png +0 -0
  67. package/coverage/sorter.js +0 -210
  68. package/docs/API_ENDPOINTS_FOR_DETECTION.md +0 -647
  69. package/docs/API_RETRY_MECHANISM.md +0 -338
  70. package/docs/ARELA_IDENTIFY_IMPLEMENTATION.md +0 -489
  71. package/docs/ARELA_IDENTIFY_QUICKREF.md +0 -186
  72. package/docs/ARELA_PROPAGATE_IMPLEMENTATION.md +0 -581
  73. package/docs/ARELA_PROPAGATE_QUICKREF.md +0 -272
  74. package/docs/ARELA_PUSH_IMPLEMENTATION.md +0 -577
  75. package/docs/ARELA_PUSH_QUICKREF.md +0 -322
  76. package/docs/ARELA_SCAN_IMPLEMENTATION.md +0 -373
  77. package/docs/ARELA_SCAN_QUICKREF.md +0 -139
  78. package/docs/CROSS_PLATFORM_PATH_HANDLING.md +0 -597
  79. package/docs/DETECTION_ATTEMPT_TRACKING.md +0 -414
  80. package/docs/MIGRATION_UPLOADER_TO_FILE_STATS.md +0 -1020
  81. package/docs/MULTI_LEVEL_DIRECTORY_SCANNING.md +0 -494
  82. package/docs/QUICK_REFERENCE_API_DETECTION.md +0 -264
  83. package/docs/REFACTORING_SUMMARY_DETECT_PEDIMENTOS.md +0 -200
  84. package/docs/STATS_COMMAND_SEQUENCE_DIAGRAM.md +0 -287
  85. package/docs/STATS_COMMAND_SIMPLE.md +0 -93
@@ -0,0 +1,386 @@
1
+ /**
2
+ * Seed matchers for the scoring engine (PROTOTYPE).
3
+ *
4
+ * Each entry re-expresses the boolean `match()` of an existing
5
+ * `src/document-types/*.js` definition as a set of weighted **clues**, while
6
+ * REUSING that definition's `extractors` / `resolveType` /
7
+ * `extractNumPedimento` / `extractPedimentoYear` verbatim. Only the SELECTION
8
+ * logic is new — field extraction is unchanged, so a comparison against
9
+ * `extractDocumentFields` isolates the first-match-wins → best-match change.
10
+ *
11
+ * Clue → flag mapping used throughout:
12
+ * - strong positive signal → high `weight`
13
+ * - hard exclusion (return false in the original) → `negative: true`
14
+ * - mandatory signature → `required: true`
15
+ * - `minScore` is tuned so the weighted sum reproduces the original boolean
16
+ * on the existing test fixtures.
17
+ *
18
+ * NOTE: a few original conditions are compound (e.g. simplificado's
19
+ * "COVE: present AND PAGO absent", or inter-agencia's "≥2 distinct RFCs from a
20
+ * set"). Where a single clue cannot express the exact boolean, the closest
21
+ * faithful approximation is used and flagged inline — the comparison harness
22
+ * (`scripts/scoring-compare.js`) surfaces any divergence on a real corpus.
23
+ */
24
+ import { dodaPdfDefinition } from '../document-types/doda-pdf.js';
25
+ import { dodaXmlDefinition } from '../document-types/doda-xml.js';
26
+ import {
27
+ INTER_AGENCIA_RFCS,
28
+ facturaInterAgenciaDefinition,
29
+ } from '../document-types/factura-inter-agencia.js';
30
+ import { facturasComerciales } from '../document-types/facturas-comerciales.js';
31
+ import { pedimentoCompletoXmlDefinition } from '../document-types/pedimento-completo-xml.js';
32
+ import { pedimentoCompletoDefinition } from '../document-types/pedimento-completo.js';
33
+ import { pedimentoSimplificadoDefinition } from '../document-types/pedimento-simplificado.js';
34
+ import { supportDocumentDefinition } from '../document-types/support-document.js';
35
+
36
// Copy only the extraction-side members of an existing document-type
// definition; its boolean `match()` is intentionally left behind because the
// seed matchers replace selection with weighted clues.
function reuse(def) {
  const {
    extractors,
    resolveType,
    extractNumPedimento,
    extractPedimentoYear,
  } = def;
  return {
    extractors,
    resolveType,
    extractNumPedimento,
    extractPedimentoYear,
  };
}
45
+
46
+ const PEDIMENTO_NUM = /\d{2}\s?\d{2}\s?\d{4}\s?\d{7}/; // unanchored: digit groups of 2, 2, 4 and 7, each optionally separated by one whitespace char; used by doda_pdf's 'pedimento' group
47
+
48
+ // --- pedimento_simplificado --------------------------------------------------
49
+ // Seed matcher for simplified pedimento PDFs. Original boolean: AVISO/COVE excluded; title "FORMA SIMPLIFICADA DE[L] PEDIMENTO"
50
+ // short-circuits to true; otherwise the header trio (all three) qualifies. No `qualify` list here, so the engine gates on the weighted sum >= minScore.
51
+ // Copy markers are NOT made negative here — they let `pedimento_completo`
52
+ // outscore on completo layouts, which reproduces the title short-circuit.
53
+ const simplificado = {
54
+ documentType: 'pedimento_simplificado',
55
+ extensions: ['pdf'],
56
+ minScore: 3, // title(5) OR full header trio(1+1+1); any two header fields alone (2) stay below the gate
57
+ priority: 1, // consulted by the engine only after score and fraction tie
58
+ ...reuse(pedimentoSimplificadoDefinition),
59
+ clues: [
60
+ { kind: 'CONTENT_REGEX', pattern: /AVISO\s+CONSOLIDADO/i, negative: true },
61
+ {
62
+ kind: 'CONTENT_REGEX',
63
+ pattern: /COMPROBANTE\s+DE\s+VALOR\s+ELECTR[ÓO]NICO/i,
64
+ negative: true, // NOTE(review): unconditional exclusion here; the file header describes the original as "COVE: present AND PAGO absent" — confirm the divergence is acceptable
65
+ },
66
+ {
67
+ kind: 'CONTENT_REGEX',
68
+ pattern: /FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i,
69
+ weight: 5, // title alone clears minScore
70
+ },
71
+ { kind: 'CONTENT_REGEX', pattern: /NUM\.?\s*PEDIMENTO:/i, weight: 1 },
72
+ { kind: 'CONTENT_REGEX', pattern: /CVE\.?\s*PEDIMENTO:/i, weight: 1 },
73
+ { kind: 'CONTENT_REGEX', pattern: /T\.?\s*OPER:?/i, weight: 1 },
74
+ ],
75
+ };
76
+
77
+ // --- pedimento_completo ------------------------------------------------------
78
+ // Seed matcher for long-form pedimento PDFs. Original: exclude FORMA SIMPLIFICADA & AVISO; (header trio AND ≥1 copy marker)
79
+ // OR clue-count heuristic (>25% of ~18 clues ≈ ≥5). Both branches are folded into one weighted threshold (no `qualify` list).
80
+ const completo = {
81
+ documentType: 'pedimento_completo',
82
+ extensions: ['pdf'],
83
+ minScore: 5, // header trio(3) + 1 copy marker(2), or ≥5 fallback clues. NOTE(review): the flat sum also admits e.g. three copy markers with no header field (6) — confirm against the comparison harness
84
+ priority: 1, // consulted by the engine only after score and fraction tie
85
+ ...reuse(pedimentoCompletoDefinition),
86
+ clues: [
87
+ {
88
+ kind: 'CONTENT_REGEX',
89
+ pattern: /FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i,
90
+ negative: true, // same title pattern that scores 5 for pedimento_simplificado
91
+ },
92
+ { kind: 'CONTENT_REGEX', pattern: /AVISO\s+CONSOLIDADO/i, negative: true },
93
+ // header trio (same three patterns as pedimento_simplificado, weight 1 each)
94
+ { kind: 'CONTENT_REGEX', pattern: /NUM\.?\s*PEDIMENTO:/i, weight: 1 },
95
+ { kind: 'CONTENT_REGEX', pattern: /CVE\.?\s*PEDIMENTO:/i, weight: 1 },
96
+ { kind: 'CONTENT_REGEX', pattern: /T\.?\s*OPER:?/i, weight: 1 },
97
+ // copy markers (long-form pedimento signatures), weight 2 each
98
+ {
99
+ kind: 'CONTENT_REGEX',
100
+ pattern: /ORIGINAL:\s*ADMINISTRACION GENERAL DE ADUANAS/i,
101
+ weight: 2,
102
+ },
103
+ { kind: 'CONTENT_REGEX', pattern: /SEGUNDA\s+COPIA/i, weight: 2 },
104
+ { kind: 'CONTENT_REGEX', pattern: /TERCERA\s+COPIA/i, weight: 2 },
105
+ {
106
+ kind: 'CONTENT_REGEX',
107
+ pattern: /COPIA\s+(SIMPLIFICAD[AO])?\s*TRANSPORTISTA/i,
108
+ weight: 2,
109
+ },
110
+ { kind: 'CONTENT_REGEX', pattern: /DEFINITIVO/i, weight: 2 },
111
+ { kind: 'CONTENT_REGEX', pattern: /ANEXO\s+DEL\s+PEDIMENTO/i, weight: 2 },
112
+ {
113
+ kind: 'CONTENT_REGEX',
114
+ pattern: /\*+FIN\s+DE\s+PEDIMENTO\s*\*+/i,
115
+ weight: 2,
116
+ },
117
+ // exotic-layout fallback clues (weight 1 each); nine of them, so five hits reach minScore on their own
118
+ { kind: 'CONTENT_REGEX', pattern: /CERTIFICACIONES/i, weight: 1 },
119
+ { kind: 'CONTENT_REGEX', pattern: /CUADRO\s+DE\s+LIQUIDACION/i, weight: 1 },
120
+ {
121
+ kind: 'CONTENT_REGEX',
122
+ pattern: /\*\*\*\s+PAGO\s+ELECTRONICO\s+\*\*\*/i,
123
+ weight: 1,
124
+ },
125
+ { kind: 'CONTENT_REGEX', pattern: /MEDIOS\s+DE\s+TRANSPORTE/i, weight: 1 },
126
+ {
127
+ kind: 'CONTENT_REGEX',
128
+ pattern: /DATOS\s+DEL\s+IMPORTADOR\/EXPORTADOR/i,
129
+ weight: 1,
130
+ },
131
+ {
132
+ kind: 'CONTENT_REGEX',
133
+ pattern: /DATOS\s+DEL\s+PROVEEDOR\s+O\s+COMPRADOR/i,
134
+ weight: 1,
135
+ },
136
+ { kind: 'CONTENT_REGEX', pattern: /LINEA\s+DE\s+CAPTURA:/i, weight: 1 },
137
+ {
138
+ kind: 'CONTENT_REGEX',
139
+ pattern: /DECLARO\s+BAJO\s+PROTESTA\s+DE\s+DECIR\s+VERDAD/i,
140
+ weight: 1,
141
+ },
142
+ {
143
+ kind: 'CONTENT_REGEX',
144
+ pattern: /PEDIMENTO\s+ELABORADO\s+DE\s+CONFORMIDAD/i,
145
+ weight: 1,
146
+ },
147
+ ],
148
+ };
149
+
150
+ // --- pedimento_completo_xml --------------------------------------------------
151
+ // Seed matcher for the XML form of a full pedimento. Original: single condition — the VUCEM response root tag.
152
+ const completoXml = {
153
+ documentType: 'pedimento_completo_xml',
154
+ extensions: ['xml'],
155
+ minScore: 1, // effectively redundant: the only clue is required, so any non-disqualified result already scores 10
156
+ priority: 2, // tie-breaker only (engine ranks score → fraction → priority). Dominance over support_document (max 7) rests on weight 10 below — NOTE(review): doda_xml can score up to 19, so several doda markers in the same XML would outrank this matcher
157
+ ...reuse(pedimentoCompletoXmlDefinition),
158
+ clues: [
159
+ {
160
+ kind: 'CONTENT_REGEX',
161
+ pattern: /consultarPedimentoCompletoRespuesta/i,
162
+ weight: 10,
163
+ required: true, // missing root tag disqualifies the matcher outright
164
+ },
165
+ ],
166
+ };
167
+
168
+ // --- doda_pdf ----------------------------------------------------------------
169
+ // Seed matcher for DODA PDFs. Original: primary marker → true; OR (≥2 secondary + pedimento#);
170
+ // OR (doda-context + pedimento# + ≥1 secondary). Gated by `qualify` (group hit counts), so no minScore; weights only rank it against other qualifying matchers.
171
+ const dodaPdf = {
172
+ documentType: 'doda_pdf',
173
+ extensions: ['pdf'],
174
+ priority: 1,
175
+ qualify: [
176
+ { primary: 1 }, // the full document title alone
177
+ { secondary: 2, pedimento: 1 }, // both secondary clues (DODA and VUCEM) plus a pedimento number
178
+ { context: 1, pedimento: 1, secondary: 1 }, // context phrase + pedimento number + either secondary clue
179
+ ],
180
+ ...reuse(dodaPdfDefinition),
181
+ clues: [
182
+ {
183
+ kind: 'CONTENT_REGEX',
184
+ pattern: /DOCUMENTO DE OPERACI[OÓ]N PARA DESPACHO ADUANERO/i,
185
+ weight: 5,
186
+ group: 'primary',
187
+ },
188
+ { kind: 'CONTENT_REGEX', pattern: /DODA/i, weight: 1, group: 'secondary' },
189
+ { kind: 'CONTENT_REGEX', pattern: /VUCEM/i, weight: 1, group: 'secondary' },
190
+ {
191
+ kind: 'CONTENT_REGEX',
192
+ pattern: PEDIMENTO_NUM,
193
+ weight: 1,
194
+ group: 'pedimento',
195
+ },
196
+ {
197
+ kind: 'CONTENT_REGEX',
198
+ pattern: /despacho aduanero|operaci[oó]n aduanera|validaci[oó]n/i,
199
+ weight: 1,
200
+ group: 'context', // a single clue: any one of the three phrases counts as the one context hit
201
+ },
202
+ ],
203
+ };
204
+
205
+ // --- doda_xml ----------------------------------------------------------------
206
+ // Seed matcher for DODA XML files. Original: ≥1 doda marker → true; OR (≥3 pedimento markers AND <?xml). Gated by `qualify`; weights only affect ranking.
207
+ const dodaXml = {
208
+ documentType: 'doda_xml',
209
+ extensions: ['xml'],
210
+ priority: 1,
211
+ qualify: [{ doda: 1 }, { pedimento: 3, xml: 1 }], // any one doda marker, OR three of the four pedimento markers plus the XML declaration
212
+ ...reuse(dodaXmlDefinition),
213
+ clues: [
214
+ {
215
+ kind: 'CONTENT_REGEX',
216
+ pattern: /documentoOperacion/i,
217
+ weight: 3,
218
+ group: 'doda',
219
+ },
220
+ {
221
+ kind: 'CONTENT_REGEX',
222
+ pattern: /despachoAduanero/i,
223
+ weight: 3,
224
+ group: 'doda',
225
+ },
226
+ { kind: 'CONTENT_REGEX', pattern: /<doda\b/i, weight: 3, group: 'doda' },
227
+ {
228
+ kind: 'CONTENT_REGEX',
229
+ pattern: /xmlns[^"]*doda/i,
230
+ weight: 3,
231
+ group: 'doda',
232
+ },
233
+ { kind: 'CONTENT_REGEX', pattern: /VUCEM/i, weight: 3, group: 'doda' },
234
+ {
235
+ kind: 'CONTENT_REGEX',
236
+ pattern: /numPedimento/i,
237
+ weight: 1,
238
+ group: 'pedimento',
239
+ },
240
+ {
241
+ kind: 'CONTENT_REGEX',
242
+ pattern: /patenteAduanal/i,
243
+ weight: 1,
244
+ group: 'pedimento',
245
+ },
246
+ {
247
+ kind: 'CONTENT_REGEX',
248
+ pattern: /aduanaDespacho/i,
249
+ weight: 1,
250
+ group: 'pedimento',
251
+ },
252
+ {
253
+ kind: 'CONTENT_REGEX',
254
+ pattern: /tipoOperacion/i,
255
+ weight: 1,
256
+ group: 'pedimento',
257
+ },
258
+ // structural gate for the pedimento-markers path (weight 0 = counted in its group, adds nothing to the score)
259
+ { kind: 'CONTENT_REGEX', pattern: /<\?xml/i, weight: 0, group: 'xml' },
260
+ ],
261
+ };
262
+
263
+ // --- factura_inter_agencia ---------------------------------------------------
264
+ // Seed matcher for invoices exchanged between customs agencies. Original: isCfdiContent AND ≥2 distinct configured RFCs AND broker clave.
265
+ // The pair of agency RFCs and the broker-service clave are modelled as REQUIRED
266
+ // clues, which is what lets this win over `factura_comercial` purely by score —
267
+ // no registration-order dependency. (Scope-limited to the configured pair, same
268
+ // as the original; widening means making RFC presence a counting rule.) NOTE(review): the CFDI markers below are plain weight-1 clues, so isCfdiContent is not enforced as a gate here.
269
+ const interAgencia = {
270
+ documentType: 'factura_inter_agencia',
271
+ extensions: ['xml', 'pdf'],
272
+ minScore: 25, // both required RFCs (10+10) + required clave (5) — assumes INTER_AGENCIA_RFCS has exactly two entries; TODO confirm
273
+ priority: 3, // tie-breaker only; the win over factura_comercial (max 14) comes from the required-clue weights
274
+ ...reuse(facturaInterAgenciaDefinition),
275
+ clues: [
276
+ ...INTER_AGENCIA_RFCS.map((rfc) => ({
277
+ kind: 'CONTENT_REGEX',
278
+ pattern: new RegExp(`\\b${rfc}\\b`, 'i'),
279
+ weight: 10, // rfc is interpolated into the pattern unescaped — assumes RFC values contain no regex metacharacters; TODO confirm
280
+ required: true, // every configured RFC becomes mandatory, not "any two of the set"
281
+ })),
282
+ // BROKER_SERVICE_CLAVE_PROD_SERV (78141502 = servicios de agentes aduaneros); matched as a bare digit run anywhere in the content
283
+ { kind: 'CONTENT_REGEX', pattern: /78141502/, weight: 5, required: true },
284
+ // CFDI content markers (informational positive signal; they raise the score but cannot disqualify)
285
+ { kind: 'CONTENT_REGEX', pattern: /cfdi:Comprobante/i, weight: 1 },
286
+ { kind: 'CONTENT_REGEX', pattern: /xmlns:cfdi/i, weight: 1 },
287
+ { kind: 'CONTENT_REGEX', pattern: /TipoDeComprobante/i, weight: 1 },
288
+ ],
289
+ };
290
+
291
+ // --- factura_comercial -------------------------------------------------------
292
+ // Seed matcher for commercial invoices (CFDI or free-text). Original: cfdiMatches≥2 OR (invoiceMatches≥1 AND customsMatches≥1).
293
+ // Faithfully expressed with clue groups + qualify (OR-of-ANDs) — a flat
294
+ // minScore could not enforce the "invoice AND customs" pairing and produced
295
+ // false positives on COVE acuses (customs keywords alone reaching the threshold). Clues carry no explicit weight, so each hit adds the engine default of 1 to the ranking score.
296
+ const facturaComercial = {
297
+ documentType: 'factura_comercial',
298
+ extensions: ['pdf', 'xml'],
299
+ priority: 0,
300
+ qualify: [{ cfdi: 2 }, { invoice: 1, customs: 1 }], // two CFDI markers, OR one invoice keyword together with one customs keyword
301
+ ...reuse(facturasComerciales),
302
+ clues: [
303
+ { kind: 'CONTENT_REGEX', pattern: /cfdi:Comprobante/i, group: 'cfdi' },
304
+ { kind: 'CONTENT_REGEX', pattern: /xmlns:cfdi/i, group: 'cfdi' },
305
+ { kind: 'CONTENT_REGEX', pattern: /TipoDeComprobante/i, group: 'cfdi' },
306
+ { kind: 'CONTENT_REGEX', pattern: /timbreFiscalDigital/i, group: 'cfdi' },
307
+ { kind: 'CONTENT_REGEX', pattern: /SelloSAT/i, group: 'cfdi' },
308
+ {
309
+ kind: 'CONTENT_REGEX',
310
+ pattern: /factura\s*(comercial|de\s*venta|de\s*exportaci[oó]n)?/i,
311
+ group: 'invoice', // the qualifier group in the pattern is optional, so any occurrence of "factura" counts as a hit
312
+ },
313
+ {
314
+ kind: 'CONTENT_REGEX',
315
+ pattern: /commercial\s*invoice/i,
316
+ group: 'invoice',
317
+ },
318
+ { kind: 'CONTENT_REGEX', pattern: /invoice\s*number/i, group: 'invoice' },
319
+ {
320
+ kind: 'CONTENT_REGEX',
321
+ pattern: /n[uú]mero\s*de\s*factura/i,
322
+ group: 'invoice',
323
+ },
324
+ { kind: 'CONTENT_REGEX', pattern: /pedimento/i, group: 'customs' },
325
+ { kind: 'CONTENT_REGEX', pattern: /aduana/i, group: 'customs' },
326
+ {
327
+ kind: 'CONTENT_REGEX',
328
+ pattern: /importaci[oó]n|exportaci[oó]n/i,
329
+ group: 'customs',
330
+ },
331
+ {
332
+ kind: 'CONTENT_REGEX',
333
+ pattern: /despacho\s*aduanero/i,
334
+ group: 'customs',
335
+ },
336
+ {
337
+ kind: 'CONTENT_REGEX',
338
+ pattern: /fracci[oó]n\s*arancelaria/i,
339
+ group: 'customs',
340
+ },
341
+ ],
342
+ };
343
+
344
+ // --- support_document --------------------------------------------------------
345
+ // Broad fallback matcher for SOAP/customs support files. Original: soapFound≥2 OR customsFound≥2. It carries the lowest priority, but
346
+ // NOTE(review): the engine consults priority only after score and fraction tie, so with up to 7 weight-1 hits this can still outrank a specific matcher that qualifies with a lower score (e.g. doda_xml on a single marker = 3) — confirm intended.
347
+ const supportDocument = {
348
+ documentType: 'support_document',
349
+ extensions: ['xml', 'txt', 'json'],
350
+ priority: -1,
351
+ qualify: [{ soap: 2 }, { customs: 2 }], // any two SOAP clues, OR both customs clues
352
+ ...reuse(supportDocumentDefinition),
353
+ clues: [
354
+ { kind: 'CONTENT_REGEX', pattern: /soapenv:Envelope/i, group: 'soap' },
355
+ { kind: 'CONTENT_REGEX', pattern: /xmlns:soapenv=/i, group: 'soap' },
356
+ {
357
+ kind: 'CONTENT_REGEX',
358
+ pattern: /solicitarRecibirCoveServicio/i,
359
+ group: 'soap',
360
+ },
361
+ { kind: 'CONTENT_REGEX', pattern: /tipoOperacion/i, group: 'soap' },
362
+ { kind: 'CONTENT_REGEX', pattern: /patenteAduanal/i, group: 'soap' },
363
+ // customs metadata fallback — original requires BOTH patterns present (the group has exactly two clues, so `customs: 2` means both)
364
+ { kind: 'CONTENT_REGEX', pattern: /rfc/i, group: 'customs' },
365
+ {
366
+ kind: 'CONTENT_REGEX',
367
+ pattern: /patente|aduana|customs|pedimento/i,
368
+ group: 'customs',
369
+ },
370
+ ],
371
+ };
372
+
373
+ /**
374
+ * Default/global seed set. Order is irrelevant — best-match selects the winner
+ * by score, then fraction, then priority, then documentType.
375
+ * (This becomes the seed for DEFAULT matchers when the model moves to the DB.)
376
+ */
377
+ export const scoringMatchers = [
378
+ simplificado,
379
+ completo,
380
+ completoXml,
381
+ dodaPdf,
382
+ dodaXml,
383
+ interAgencia,
384
+ facturaComercial,
385
+ supportDocument,
386
+ ];
@@ -0,0 +1,246 @@
1
+ /**
2
+ * Scoring-based document classification engine (PROTOTYPE).
3
+ *
4
+ * Replaces the "first-match-wins" selection in `document-type-shared.js`
5
+ * (`extractDocumentFields`) with "best-match": every applicable matcher is
6
+ * scored by the weight of the clues it satisfies, and the highest score wins.
7
+ * This removes the order-dependent registration that the current registry
8
+ * relies on (e.g. `factura_inter_agencia` MUST be evaluated before
9
+ * `facturas_comerciales`) — precedence now lives in clue weights / `required`
10
+ * / `negative` flags, not in array order.
11
+ *
12
+ * Matcher shape (see `matchers-seed.js`):
13
+ * {
14
+ * documentType, extensions[], minScore, priority,
15
+ * clues: [{ kind, pattern, flags?, weight=1, group?, required=false, negative=false }],
16
+ * qualify?: [{ <group>: minCount, ... }, ...], // OR-of-ANDs gate
17
+ * extractors, resolveType?, extractNumPedimento?, extractPedimentoYear?
18
+ * }
19
+ *
20
+ * Two separable concerns:
21
+ * - QUALIFICATION (does this matcher apply at all?): `required`/`negative`
22
+ * clues plus an optional `qualify` rule set. `qualify` is a list of
23
+ * alternative rules (OR); a rule is a map of `group -> minimum matched
24
+ * clues` (AND across its entries). This expresses the grouped boolean gates
25
+ * of the original matchers, e.g. `(cfdi>=2) OR (invoice>=1 AND customs>=1)`.
26
+ * When `qualify` is absent, the gate falls back to `score >= minScore`.
27
+ * - RANKING (which qualifying matcher wins?): always the weighted sum of
28
+ * matched clues (`score`), tie-broken by fraction -> priority -> type.
29
+ *
30
+ * Selection only depends on clues/qualify. The winning matcher's `extractors` /
31
+ * `resolveType` / `extractNumPedimento` / `extractPedimentoYear` run AFTER
32
+ * selection with the same post-processing as `extractDocumentFields`, so a
33
+ * side-by-side comparison isolates the selection change.
34
+ */
35
+ import path from 'path';
36
+
37
+ import { FieldResult } from '../document-type-shared.js';
38
+
39
// Compile cache for string patterns (from DB matchers): stable across every
// document in a run, so compile once instead of per document. Invalid patterns
// cache as null and are treated as a non-match (parity with the TS engine).
const regexCache = new Map();
const REGEX_CACHE_MAX = 5000;

/**
 * Resolve a clue's `pattern` into a RegExp that is safe to `.test()` repeatedly.
 *
 * - RegExp instances (local seed matchers) are returned as-is after resetting
 *   `lastIndex`.
 * - Other patterns are compiled with `clue.flags` minus `g`/`y` and memoised.
 * - A missing pattern (`null`/`undefined`) or one that fails to compile yields
 *   `null`, which callers treat as "clue did not hit".
 *
 * @param {{ pattern?: RegExp|string|null, flags?: string }} clue
 * @returns {RegExp|null} compiled expression, or null when the clue is unusable
 */
function toRegExp(clue) {
  const { pattern } = clue;

  if (pattern instanceof RegExp) {
    // Local-seed patterns are already compiled; reset lastIndex so a g/y flag
    // can't make repeated .test() calls stateful across documents.
    pattern.lastIndex = 0;
    return pattern;
  }

  // `new RegExp(undefined)` compiles to /(?:)/, which matches every input: a
  // clue with no pattern would silently satisfy `required` or trip `negative`
  // on every document. Treat it like any other invalid pattern instead.
  if (pattern == null) return null;

  const safeFlags = (clue.flags ?? '').replace(/[gy]/g, '');
  // Structured key: a plain "flags pattern" join is ambiguous when either part
  // contains a space, letting an invalid entry's cached null shadow a valid one.
  const key = JSON.stringify([safeFlags, String(pattern)]);

  let re = regexCache.get(key);
  if (re === undefined) {
    try {
      re = new RegExp(pattern, safeFlags);
    } catch {
      re = null;
    }
    // Crude bound: drop everything once the cap is reached rather than track LRU.
    if (regexCache.size >= REGEX_CACHE_MAX) regexCache.clear();
    regexCache.set(key, re);
  }
  return re;
}
66
+
67
// Upper bound on the text handed to any single regex. Genuine extracted
// document text sits well under this; the limit exists so that an allowed
// (quadratic) pattern cannot blow up on a megabyte-scale adversarial string.
const MATCH_INPUT_CAP = 1_000_000;

/**
 * Pick the string a clue is tested against: the file name for
 * FILENAME_REGEX clues, the document content for every other kind,
 * truncated to MATCH_INPUT_CAP characters.
 */
function clueTarget(clue, ctx) {
  let text;
  if (clue.kind === 'FILENAME_REGEX') {
    text = ctx.fileName ?? '';
  } else {
    text = ctx.source ?? '';
  }

  if (text.length > MATCH_INPUT_CAP) {
    return text.slice(0, MATCH_INPUT_CAP);
  }
  return text;
}
78
+
79
/**
 * Evaluate one matcher against a document context.
 *
 * @param {object} matcher - matcher definition (extensions, clues, qualify, minScore, priority)
 * @param {object} ctx - `{ source, extension, fileName }`
 * @returns {null} when the matcher's extension list excludes the document,
 *          `{ documentType, disqualified: true, reason }` when a `negative`
 *          clue hits or a `required` clue misses, otherwise the scored result
 *          (`passed` tells whether the qualification gate was cleared).
 */
export function scoreMatcher(matcher, ctx) {
  const ext = (ctx.extension ?? '').toLowerCase();
  const allowed = matcher.extensions;
  const isRestricted = Array.isArray(allowed) && allowed.length > 0;
  // An unknown (empty) extension never rules a matcher out.
  if (isRestricted && ext && !allowed.includes(ext)) {
    return null;
  }

  const matchedClues = [];
  const groupCounts = {};
  let earned = 0;
  let possible = 0;

  for (const clue of matcher.clues ?? []) {
    const weight = clue.weight ?? 1;
    const regex = toRegExp(clue);
    const hit = Boolean(regex) && regex.test(clueTarget(clue, ctx));

    if (clue.negative) {
      // Negative clues never contribute weight; a hit is a hard exclusion.
      if (!hit) continue;
      return {
        documentType: matcher.documentType,
        disqualified: true,
        reason: `negative:${clue.pattern}`,
      };
    }

    if (!hit && clue.required) {
      return {
        documentType: matcher.documentType,
        disqualified: true,
        reason: `required-missing:${clue.pattern}`,
      };
    }

    possible += weight;
    if (!hit) continue;

    earned += weight;
    matchedClues.push(clue);
    if (clue.group) {
      groupCounts[clue.group] = (groupCounts[clue.group] ?? 0) + 1;
    }
  }

  // Gate: any `qualify` rule whose every group minimum is met (OR-of-ANDs);
  // without a `qualify` list, the weighted sum must reach `minScore`.
  let passed;
  if (Array.isArray(matcher.qualify)) {
    const ruleHolds = (rule) =>
      Object.entries(rule).every(
        ([group, min]) => (groupCounts[group] ?? 0) >= min,
      );
    passed = matcher.qualify.some(ruleHolds);
  } else {
    passed = earned >= (matcher.minScore ?? 1);
  }

  return {
    documentType: matcher.documentType,
    matcher,
    disqualified: false,
    score: earned,
    totalWeight: possible,
    fraction: possible > 0 ? earned / possible : 0,
    priority: matcher.priority ?? 0,
    passed,
    matchedClues,
    groupCounts,
  };
}
158
+
159
/**
 * Score every matcher and return the ones that apply, are not disqualified
 * and cleared their gate — best candidate first.
 * Ranking: score desc → fraction desc → priority desc → documentType.
 */
export function scoreAll(matchers, ctx) {
  const byRank = (a, b) => {
    const scoreDelta = b.score - a.score;
    if (scoreDelta) return scoreDelta;

    const fractionDelta = b.fraction - a.fraction;
    if (fractionDelta) return fractionDelta;

    const priorityDelta = b.priority - a.priority;
    if (priorityDelta) return priorityDelta;

    return String(a.documentType).localeCompare(String(b.documentType));
  };

  return Array.from(matchers, (matcher) => scoreMatcher(matcher, ctx))
    .filter((result) => result && !result.disqualified && result.passed)
    .sort(byRank);
}
179
+
180
// Top-ranked qualifying candidate for the document, or null when none qualifies.
export function selectBestMatch(matchers, ctx) {
  const ranked = scoreAll(matchers, ctx);
  const best = ranked[0];
  return best ?? null;
}
183
+
184
/**
 * Classify a document end to end: rank the matchers, then run the winning
 * matcher's own extractors / resolveType / pedimento helpers on the source.
 * The post-selection steps mirror `extractDocumentFields`, so comparing the
 * two isolates the effect of the selection change.
 *
 * @returns {{ detectedType, fields, detectedPedimento, detectedPedimentoYear,
 *   winner, candidates }}
 */
export function classifyDocument(matchers, { source, extension, filePath }) {
  const fileName = filePath ? path.basename(filePath) : '';
  const candidates = scoreAll(matchers, { source, extension, fileName });
  const winner = candidates[0] ?? null;

  if (winner === null) {
    return {
      detectedType: null,
      fields: [],
      detectedPedimento: null,
      detectedPedimentoYear: null,
      winner: null,
      candidates,
    };
  }

  const def = winner.matcher;

  // A throwing extractor is recorded as a "not found" field, not a failure.
  const runExtractor = (extractor) => {
    try {
      return extractor.extract(source);
    } catch {
      return new FieldResult(extractor.field, false, null);
    }
  };

  const fields = [];
  for (const extractor of def.extractors ?? []) {
    fields.push(runExtractor(extractor));
  }

  let detectedType = def.documentType;
  if (def.resolveType) {
    detectedType = def.resolveType(fields);
  }

  let detectedPedimento = null;
  if (def.extractNumPedimento) {
    detectedPedimento = def.extractNumPedimento(source, fields, filePath);
  }

  let detectedPedimentoYear = null;
  if (def.extractPedimentoYear) {
    detectedPedimentoYear = def.extractPedimentoYear(source, fields, filePath);
  }

  // Backfill numPedimento as a field (same as extractDocumentFields) so
  // downstream consumers (composeArelaPath) see a consistent shape.
  if (detectedPedimento) {
    const alreadyListed = fields.some((f) => f.name === 'numPedimento');
    if (!alreadyListed) {
      fields.push(new FieldResult('numPedimento', true, detectedPedimento));
    }
  }

  return {
    detectedType,
    fields,
    detectedPedimento,
    detectedPedimentoYear,
    winner,
    candidates,
  };
}
@@ -389,6 +389,20 @@ export class ScanApiService {
389
389
  return result;
390
390
  }
391
391
 
392
+ /**
393
+ * Fetch the resolved matcher set (this RFC's matchers + globals) for runtime
394
+ * classification. Returns an array of matchers with clues + fieldExtractors.
395
+ * @param {string|null} rfc - optional RFC to scope per-company matchers
396
+ */
397
+ async getResolvedMatchers(rfc = null) {
398
+ const qs = rfc ? `?rfc=${encodeURIComponent(rfc)}` : '';
399
+ const result = await this.#request(
400
+ `/api/document-matcher/resolved${qs}`,
401
+ 'GET',
402
+ );
403
+ return Array.isArray(result) ? result : [];
404
+ }
405
+
392
406
  async fetchPdfsForDetection(
393
407
  tableName,
394
408
  offset = 0,