npm - @arela/uploader - Versions diffs - 1.0.23 → 1.1.0 - Mend

@arela/uploader 1.0.23 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85) hide show

package/docs/AUTO_PROCESSING_PIPELINE.md +258 -0
package/docs/COMPLETE_USAGE_GUIDE.md +1363 -0
package/docs/DATABASESERVICE_IMPROVEMENTS.md +546 -0
package/docs/PASO_2_TEST_RESULTS.md +298 -0
package/docs/PASO_3_PLAN.md +385 -0
package/docs/PHASE_1_FILE_DETECTION.md +366 -0
package/docs/PHASE_2_API_INTEGRATION.md +426 -0
package/docs/PHASE_3_DATABASE_MANAGEMENT.md +480 -0
package/docs/PHASE_4_FILE_OPERATIONS.md +448 -0
package/docs/PHASE_5_WATCH_MODE.md +450 -0
package/docs/PHASE_6_SIGNAL_HANDLING.md +472 -0
package/docs/PHASE_7_ADVANCED_FEATURES.md +560 -0
package/docs/PLAN_WATCH_FEATURE.md +417 -0
package/docs/README.md +480 -0
package/docs/SCHEMA_ALIGNMENT_SUMMARY.md +301 -0
package/docs/SMARTWATCH_DATABASE_REFACTORING.md +181 -0
package/docs/SMART_WATCH_DATABASE_CHANGES.md +502 -0
package/docs/TESTING_WATCH_MODE.md +212 -0
package/docs/WATCHER_API_IMPLEMENTATION.md +520 -0
package/docs/WATCHER_API_INTEGRATION.md +562 -0
package/docs/WATCHER_SETUP_GUIDE.md +614 -0
package/docs/WATCH_ARCHITECTURE.md +395 -0
package/docs/WATCH_AUTO_PIPELINE.md +334 -0
package/docs/WATCH_CONFIGURATION.md +267 -0
package/docs/WATCH_USAGE_GUIDE.md +567 -0
package/docs/commands.md +14 -0
package/package.json +1 -1
package/scripts/scoring-compare.js +243 -0
package/scripts/scoring-phase4-check.js +96 -0
package/src/commands/IdentifyCommand.js +36 -0
package/src/config/config.js +2 -2
package/src/file-detection.js +71 -4
package/src/scoring/db-matcher-adapter.js +98 -0
package/src/scoring/matchers-seed.js +386 -0
package/src/scoring/scoring-engine.js +246 -0
package/src/services/ScanApiService.js +14 -0
package/tests/unit/scoring-engine.test.js +221 -0
package/.vscode/settings.json +0 -1
package/coverage/IdentifyCommand.js.html +0 -1462
package/coverage/PropagateCommand.js.html +0 -1507
package/coverage/PushCommand.js.html +0 -1504
package/coverage/ScanCommand.js.html +0 -1654
package/coverage/UploadCommand.js.html +0 -1846
package/coverage/WatchCommand.js.html +0 -4111
package/coverage/base.css +0 -224
package/coverage/block-navigation.js +0 -87
package/coverage/favicon.png +0 -0
package/coverage/index.html +0 -191
package/coverage/lcov-report/IdentifyCommand.js.html +0 -1462
package/coverage/lcov-report/PropagateCommand.js.html +0 -1507
package/coverage/lcov-report/PushCommand.js.html +0 -1504
package/coverage/lcov-report/ScanCommand.js.html +0 -1654
package/coverage/lcov-report/UploadCommand.js.html +0 -1846
package/coverage/lcov-report/WatchCommand.js.html +0 -4111
package/coverage/lcov-report/base.css +0 -224
package/coverage/lcov-report/block-navigation.js +0 -87
package/coverage/lcov-report/favicon.png +0 -0
package/coverage/lcov-report/index.html +0 -191
package/coverage/lcov-report/prettify.css +0 -1
package/coverage/lcov-report/prettify.js +0 -2
package/coverage/lcov-report/sort-arrow-sprite.png +0 -0
package/coverage/lcov-report/sorter.js +0 -210
package/coverage/lcov.info +0 -1937
package/coverage/prettify.css +0 -1
package/coverage/prettify.js +0 -2
package/coverage/sort-arrow-sprite.png +0 -0
package/coverage/sorter.js +0 -210
package/docs/API_ENDPOINTS_FOR_DETECTION.md +0 -647
package/docs/API_RETRY_MECHANISM.md +0 -338
package/docs/ARELA_IDENTIFY_IMPLEMENTATION.md +0 -489
package/docs/ARELA_IDENTIFY_QUICKREF.md +0 -186
package/docs/ARELA_PROPAGATE_IMPLEMENTATION.md +0 -581
package/docs/ARELA_PROPAGATE_QUICKREF.md +0 -272
package/docs/ARELA_PUSH_IMPLEMENTATION.md +0 -577
package/docs/ARELA_PUSH_QUICKREF.md +0 -322
package/docs/ARELA_SCAN_IMPLEMENTATION.md +0 -373
package/docs/ARELA_SCAN_QUICKREF.md +0 -139
package/docs/CROSS_PLATFORM_PATH_HANDLING.md +0 -597
package/docs/DETECTION_ATTEMPT_TRACKING.md +0 -414
package/docs/MIGRATION_UPLOADER_TO_FILE_STATS.md +0 -1020
package/docs/MULTI_LEVEL_DIRECTORY_SCANNING.md +0 -494
package/docs/QUICK_REFERENCE_API_DETECTION.md +0 -264
package/docs/REFACTORING_SUMMARY_DETECT_PEDIMENTOS.md +0 -200
package/docs/STATS_COMMAND_SEQUENCE_DIAGRAM.md +0 -287
package/docs/STATS_COMMAND_SIMPLE.md +0 -93

package/src/scoring/matchers-seed.js ADDED Viewed

@@ -0,0 +1,386 @@
+/**
+ * Seed matchers for the scoring engine (PROTOTYPE).
+ *
+ * Each entry re-expresses the boolean `match()` of an existing
+ * `src/document-types/*.js` definition as a set of weighted **clues**, while
+ * REUSING that definition's `extractors` / `resolveType` /
+ * `extractNumPedimento` / `extractPedimentoYear` verbatim. Only the SELECTION
+ * logic is new — field extraction is unchanged, so a comparison against
+ * `extractDocumentFields` isolates the first-match-wins → best-match change.
+ *
+ * Clue → flag mapping used throughout:
+ *   - strong positive signal   → high `weight`
+ *   - hard exclusion (return false in the original) → `negative: true`
+ *   - mandatory signature       → `required: true`
+ *   - `minScore` is tuned so the weighted sum reproduces the original boolean
+ *     on the existing test fixtures.
+ *
+ * NOTE: a few original conditions are compound (e.g. simplificado's
+ * "COVE: present AND PAGO absent", or inter-agencia's "≥2 distinct RFCs from a
+ * set"). Where a single clue cannot express the exact boolean, the closest
+ * faithful approximation is used and flagged inline — the comparison harness
+ * (`scripts/scoring-compare.js`) surfaces any divergence on a real corpus.
+ */
+import { dodaPdfDefinition } from '../document-types/doda-pdf.js';
+import { dodaXmlDefinition } from '../document-types/doda-xml.js';
+import {
+  INTER_AGENCIA_RFCS,
+  facturaInterAgenciaDefinition,
+} from '../document-types/factura-inter-agencia.js';
+import { facturasComerciales } from '../document-types/facturas-comerciales.js';
+import { pedimentoCompletoXmlDefinition } from '../document-types/pedimento-completo-xml.js';
+import { pedimentoCompletoDefinition } from '../document-types/pedimento-completo.js';
+import { pedimentoSimplificadoDefinition } from '../document-types/pedimento-simplificado.js';
+import { supportDocumentDefinition } from '../document-types/support-document.js';
+/**
+ * Pull the extraction half of a definition (reused unchanged).
+ *
+ * Copies exactly four properties from `def` — `extractors`, `resolveType`,
+ * `extractNumPedimento`, `extractPedimentoYear` — by reference into a new
+ * object; nothing else on `def` is carried over. A property missing on `def`
+ * comes through as `undefined`.
+ *
+ * @param {object} def - Document-type definition to borrow extraction logic from.
+ * @returns {object} New object holding the four extraction properties, meant
+ *   to be spread into a matcher literal.
+ */
+function reuse(def) {
+  return {
+    extractors: def.extractors,
+    resolveType: def.resolveType,
+    extractNumPedimento: def.extractNumPedimento,
+    extractPedimentoYear: def.extractPedimentoYear,
+  };
+}
+const PEDIMENTO_NUM = /\d{2}\s?\d{2}\s?\d{4}\s?\d{7}/; // 15 digits in groups of 2-2-4-7, each gap optionally one whitespace character; unanchored, so it can also hit inside a longer digit run
+// --- pedimento_simplificado --------------------------------------------------
+// Original: AVISO/COVE excluded; title "FORMA SIMPLIFICADA DE[L] PEDIMENTO"
+// short-circuits to true; otherwise the header trio (all three) qualifies.
+// Copy markers are NOT made negative here — they let `pedimento_completo`
+// outscore on completo layouts, which reproduces the title short-circuit.
+const simplificado = {
+  documentType: 'pedimento_simplificado',
+  extensions: ['pdf'],
+  minScore: 3, // title(5) OR full header trio(1+1+1); no `qualify` here, so this threshold is the whole gate
+  priority: 1,
+  ...reuse(pedimentoSimplificadoDefinition), // extraction half only; selection is driven by `clues`
+  clues: [
+    { kind: 'CONTENT_REGEX', pattern: /AVISO\s+CONSOLIDADO/i, negative: true }, // hard exclusion: a hit disqualifies this matcher
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /COMPROBANTE\s+DE\s+VALOR\s+ELECTR[ÓO]NICO/i,
+      negative: true, // hard exclusion (COVE title)
+    },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i,
+      weight: 5, // the title alone clears minScore
+    },
+    { kind: 'CONTENT_REGEX', pattern: /NUM\.?\s*PEDIMENTO:/i, weight: 1 }, // header trio 1/3
+    { kind: 'CONTENT_REGEX', pattern: /CVE\.?\s*PEDIMENTO:/i, weight: 1 }, // header trio 2/3
+    { kind: 'CONTENT_REGEX', pattern: /T\.?\s*OPER:?/i, weight: 1 }, // header trio 3/3
+  ],
+};
+// --- pedimento_completo ------------------------------------------------------
+// Original: exclude FORMA SIMPLIFICADA & AVISO; (header trio AND ≥1 copy marker)
+// OR clue-count heuristic (>25% of ~18 clues ≈ ≥5).
+const completo = {
+  documentType: 'pedimento_completo',
+  extensions: ['pdf'],
+  minScore: 5, // header trio(3) + 1 copy marker(2), or ≥5 fallback clues. NOTE: this is a flat sum — any mix of positive clues totalling ≥5 passes (e.g. three copy markers without the header trio)
+  priority: 1,
+  ...reuse(pedimentoCompletoDefinition), // extraction half only; selection is driven by `clues`
+  clues: [
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i,
+      negative: true, // hard exclusion: the simplificado title disqualifies this matcher
+    },
+    { kind: 'CONTENT_REGEX', pattern: /AVISO\s+CONSOLIDADO/i, negative: true }, // hard exclusion
+    // header trio
+    { kind: 'CONTENT_REGEX', pattern: /NUM\.?\s*PEDIMENTO:/i, weight: 1 },
+    { kind: 'CONTENT_REGEX', pattern: /CVE\.?\s*PEDIMENTO:/i, weight: 1 },
+    { kind: 'CONTENT_REGEX', pattern: /T\.?\s*OPER:?/i, weight: 1 },
+    // copy markers (long-form pedimento signatures)
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /ORIGINAL:\s*ADMINISTRACION GENERAL DE ADUANAS/i,
+      weight: 2,
+    },
+    { kind: 'CONTENT_REGEX', pattern: /SEGUNDA\s+COPIA/i, weight: 2 },
+    { kind: 'CONTENT_REGEX', pattern: /TERCERA\s+COPIA/i, weight: 2 },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /COPIA\s+(SIMPLIFICAD[AO])?\s*TRANSPORTISTA/i,
+      weight: 2,
+    },
+    { kind: 'CONTENT_REGEX', pattern: /DEFINITIVO/i, weight: 2 },
+    { kind: 'CONTENT_REGEX', pattern: /ANEXO\s+DEL\s+PEDIMENTO/i, weight: 2 },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /\*+FIN\s+DE\s+PEDIMENTO\s*\*+/i,
+      weight: 2,
+    },
+    // exotic-layout fallback clues (weight 1 each)
+    { kind: 'CONTENT_REGEX', pattern: /CERTIFICACIONES/i, weight: 1 },
+    { kind: 'CONTENT_REGEX', pattern: /CUADRO\s+DE\s+LIQUIDACION/i, weight: 1 },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /\*\*\*\s+PAGO\s+ELECTRONICO\s+\*\*\*/i,
+      weight: 1,
+    },
+    { kind: 'CONTENT_REGEX', pattern: /MEDIOS\s+DE\s+TRANSPORTE/i, weight: 1 },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /DATOS\s+DEL\s+IMPORTADOR\/EXPORTADOR/i,
+      weight: 1,
+    },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /DATOS\s+DEL\s+PROVEEDOR\s+O\s+COMPRADOR/i,
+      weight: 1,
+    },
+    { kind: 'CONTENT_REGEX', pattern: /LINEA\s+DE\s+CAPTURA:/i, weight: 1 },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /DECLARO\s+BAJO\s+PROTESTA\s+DE\s+DECIR\s+VERDAD/i,
+      weight: 1,
+    },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /PEDIMENTO\s+ELABORADO\s+DE\s+CONFORMIDAD/i,
+      weight: 1,
+    },
+  ],
+};
+// --- pedimento_completo_xml --------------------------------------------------
+// Original: single condition — the VUCEM response root tag.
+const completoXml = {
+  documentType: 'pedimento_completo_xml',
+  extensions: ['xml'],
+  minScore: 1,
+  priority: 2, // authoritative signal — must dominate doda_xml/support on XML. NOTE(review): ranking sorts by score, then fraction, then priority, so priority alone only breaks ties — confirm the weight below is enough on real XML
+  ...reuse(pedimentoCompletoXmlDefinition), // extraction half only; selection is driven by `clues`
+  clues: [
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /consultarPedimentoCompletoRespuesta/i,
+      weight: 10, // the only clue, so a qualifying score is always exactly 10
+      required: true, // a miss disqualifies the matcher outright
+    },
+  ],
+};
+// --- doda_pdf ----------------------------------------------------------------
+// Original: primary marker → true; OR (≥2 secondary + pedimento#);
+// OR (doda-context + pedimento# + ≥1 secondary).
+const dodaPdf = {
+  documentType: 'doda_pdf',
+  extensions: ['pdf'],
+  priority: 1,
+  qualify: [ // gate = any one rule satisfied by per-group hit counts; the weights below only affect ranking
+    { primary: 1 }, // primary marker alone
+    { secondary: 2, pedimento: 1 }, // both secondary clues + a pedimento number
+    { context: 1, pedimento: 1, secondary: 1 }, // context phrase + pedimento number + one secondary clue
+  ],
+  ...reuse(dodaPdfDefinition), // extraction half only; selection is driven by `clues`/`qualify`
+  clues: [
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /DOCUMENTO DE OPERACI[OÓ]N PARA DESPACHO ADUANERO/i,
+      weight: 5,
+      group: 'primary',
+    },
+    { kind: 'CONTENT_REGEX', pattern: /DODA/i, weight: 1, group: 'secondary' },
+    { kind: 'CONTENT_REGEX', pattern: /VUCEM/i, weight: 1, group: 'secondary' },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: PEDIMENTO_NUM,
+      weight: 1,
+      group: 'pedimento',
+    },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /despacho aduanero|operaci[oó]n aduanera|validaci[oó]n/i,
+      weight: 1,
+      group: 'context', // one clue covering three alternative phrases, so the group count is at most 1
+    },
+  ],
+};
+// --- doda_xml ----------------------------------------------------------------
+// Original: ≥1 doda marker → true; OR (≥3 pedimento markers AND <?xml).
+const dodaXml = {
+  documentType: 'doda_xml',
+  extensions: ['xml'],
+  priority: 1,
+  qualify: [{ doda: 1 }, { pedimento: 3, xml: 1 }], // any one doda clue, OR at least 3 of the 4 pedimento clues plus the `<?xml` clue
+  ...reuse(dodaXmlDefinition), // extraction half only; selection is driven by `clues`/`qualify`
+  clues: [
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /documentoOperacion/i,
+      weight: 3,
+      group: 'doda',
+    },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /despachoAduanero/i,
+      weight: 3,
+      group: 'doda',
+    },
+    { kind: 'CONTENT_REGEX', pattern: /<doda\b/i, weight: 3, group: 'doda' },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /xmlns[^"]*doda/i,
+      weight: 3,
+      group: 'doda',
+    },
+    { kind: 'CONTENT_REGEX', pattern: /VUCEM/i, weight: 3, group: 'doda' },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /numPedimento/i,
+      weight: 1,
+      group: 'pedimento',
+    },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /patenteAduanal/i,
+      weight: 1,
+      group: 'pedimento',
+    },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /aduanaDespacho/i,
+      weight: 1,
+      group: 'pedimento',
+    },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /tipoOperacion/i,
+      weight: 1,
+      group: 'pedimento',
+    },
+    // structural gate for the pedimento-markers path (weight 0 = gate only)
+    { kind: 'CONTENT_REGEX', pattern: /<\?xml/i, weight: 0, group: 'xml' }, // unanchored: matches `<?xml` anywhere in the content, not only at the start
+  ],
+};
+// --- factura_inter_agencia ---------------------------------------------------
+// Original: isCfdiContent AND ≥2 distinct configured RFCs AND broker clave.
+// The pair of agency RFCs and the broker-service clave are modelled as REQUIRED
+// clues, which is what lets this win over `factura_comercial` purely by score —
+// no registration-order dependency. (Scope-limited to the configured pair, same
+// as the original; widening means making RFC presence a counting rule.)
+const interAgencia = {
+  documentType: 'factura_inter_agencia',
+  extensions: ['xml', 'pdf'],
+  minScore: 25, // both required RFCs (10+10) + required clave (5)
+  priority: 3,
+  ...reuse(facturaInterAgenciaDefinition), // extraction half only; selection is driven by `clues`
+  clues: [
+    ...INTER_AGENCIA_RFCS.map((rfc) => ({ // one required clue per configured RFC, so EVERY entry must appear. NOTE(review): the minScore arithmetic above assumes exactly two entries — confirm
+      kind: 'CONTENT_REGEX',
+      pattern: new RegExp(`\\b${rfc}\\b`, 'i'), // rfc is interpolated unescaped — assumes it contains no regex metacharacters (TODO confirm)
+      weight: 10,
+      required: true,
+    })),
+    // BROKER_SERVICE_CLAVE_PROD_SERV (78141502 = servicios de agentes aduaneros)
+    { kind: 'CONTENT_REGEX', pattern: /78141502/, weight: 5, required: true }, // plain substring match on the digits, no word boundaries
+    // CFDI content markers (informational positive signal)
+    { kind: 'CONTENT_REGEX', pattern: /cfdi:Comprobante/i, weight: 1 }, // not required: CFDI markers only raise the score
+    { kind: 'CONTENT_REGEX', pattern: /xmlns:cfdi/i, weight: 1 },
+    { kind: 'CONTENT_REGEX', pattern: /TipoDeComprobante/i, weight: 1 },
+  ],
+};
+// --- factura_comercial -------------------------------------------------------
+// Original: cfdiMatches≥2 OR (invoiceMatches≥1 AND customsMatches≥1).
+// Faithfully expressed with clue groups + qualify (OR-of-ANDs) — a flat
+// minScore could not enforce the "invoice AND customs" pairing and produced
+// false positives on COVE acuses (customs keywords alone reaching the threshold).
+const facturaComercial = {
+  documentType: 'factura_comercial',
+  extensions: ['pdf', 'xml'],
+  priority: 0,
+  qualify: [{ cfdi: 2 }, { invoice: 1, customs: 1 }], // at least two cfdi clues, OR one invoice clue together with one customs clue
+  ...reuse(facturasComerciales), // extraction half only; selection is driven by `clues`/`qualify`
+  clues: [ // no clue sets `weight`, so each hit contributes the engine default of 1 to the ranking score
+    { kind: 'CONTENT_REGEX', pattern: /cfdi:Comprobante/i, group: 'cfdi' },
+    { kind: 'CONTENT_REGEX', pattern: /xmlns:cfdi/i, group: 'cfdi' },
+    { kind: 'CONTENT_REGEX', pattern: /TipoDeComprobante/i, group: 'cfdi' },
+    { kind: 'CONTENT_REGEX', pattern: /timbreFiscalDigital/i, group: 'cfdi' },
+    { kind: 'CONTENT_REGEX', pattern: /SelloSAT/i, group: 'cfdi' },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /factura\s*(comercial|de\s*venta|de\s*exportaci[oó]n)?/i, // the trailing group is optional, so for .test() this hits on any occurrence of "factura"
+      group: 'invoice',
+    },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /commercial\s*invoice/i,
+      group: 'invoice',
+    },
+    { kind: 'CONTENT_REGEX', pattern: /invoice\s*number/i, group: 'invoice' },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /n[uú]mero\s*de\s*factura/i,
+      group: 'invoice',
+    },
+    { kind: 'CONTENT_REGEX', pattern: /pedimento/i, group: 'customs' },
+    { kind: 'CONTENT_REGEX', pattern: /aduana/i, group: 'customs' },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /importaci[oó]n|exportaci[oó]n/i,
+      group: 'customs',
+    },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /despacho\s*aduanero/i,
+      group: 'customs',
+    },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /fracci[oó]n\s*arancelaria/i,
+      group: 'customs',
+    },
+  ],
+};
+// --- support_document --------------------------------------------------------
+// Original: soapFound≥2 OR customsFound≥2. Broad fallback → lowest priority so
+// it only wins when no specific matcher qualifies.
+const supportDocument = {
+  documentType: 'support_document',
+  extensions: ['xml', 'txt', 'json'],
+  priority: -1, // NOTE(review): ranking sorts by score, then fraction, then priority — a low priority only loses ties and does not by itself guarantee losing to a specific matcher; confirm on the corpus
+  qualify: [{ soap: 2 }, { customs: 2 }], // at least 2 of the 5 soap clues, OR both customs clues
+  ...reuse(supportDocumentDefinition), // extraction half only; selection is driven by `clues`/`qualify`
+  clues: [ // no clue sets `weight`, so each hit contributes the engine default of 1 to the ranking score
+    { kind: 'CONTENT_REGEX', pattern: /soapenv:Envelope/i, group: 'soap' },
+    { kind: 'CONTENT_REGEX', pattern: /xmlns:soapenv=/i, group: 'soap' },
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /solicitarRecibirCoveServicio/i,
+      group: 'soap',
+    },
+    { kind: 'CONTENT_REGEX', pattern: /tipoOperacion/i, group: 'soap' },
+    { kind: 'CONTENT_REGEX', pattern: /patenteAduanal/i, group: 'soap' },
+    // customs metadata fallback — original requires BOTH patterns present
+    { kind: 'CONTENT_REGEX', pattern: /rfc/i, group: 'customs' }, // unanchored, case-insensitive: hits on "rfc" inside any longer token
+    {
+      kind: 'CONTENT_REGEX',
+      pattern: /patente|aduana|customs|pedimento/i,
+      group: 'customs',
+    },
+  ],
+};
+/**
+ * Default/global seed set. Order is irrelevant — best-match selects the winner.
+ * (This becomes the seed for DEFAULT matchers when the model moves to the DB.)
+ *
+ * Exported as a plain array holding the eight matcher objects defined in this
+ * module, one per `documentType`.
+ */
+export const scoringMatchers = [
+  simplificado,
+  completo,
+  completoXml,
+  dodaPdf,
+  dodaXml,
+  interAgencia,
+  facturaComercial,
+  supportDocument,
+];

package/src/scoring/scoring-engine.js ADDED Viewed

@@ -0,0 +1,246 @@
+/**
+ * Scoring-based document classification engine (PROTOTYPE).
+ *
+ * Replaces the "first-match-wins" selection in `document-type-shared.js`
+ * (`extractDocumentFields`) with "best-match": every applicable matcher is
+ * scored by the weight of the clues it satisfies, and the highest score wins.
+ * This removes the order-dependent registration that the current registry
+ * relies on (e.g. `factura_inter_agencia` MUST be evaluated before
+ * `facturas_comerciales`) — precedence now lives in clue weights / `required`
+ * / `negative` flags, not in array order.
+ *
+ * Matcher shape (see `matchers-seed.js`):
+ *   {
+ *     documentType, extensions[], minScore, priority,
+ *     clues: [{ kind, pattern, flags?, weight=1, group?, required=false, negative=false }],
+ *     qualify?: [{ <group>: minCount, ... }, ...],   // OR-of-ANDs gate
+ *     extractors, resolveType?, extractNumPedimento?, extractPedimentoYear?
+ *   }
+ *
+ * Two separable concerns:
+ *   - QUALIFICATION (does this matcher apply at all?): `required`/`negative`
+ *     clues plus an optional `qualify` rule set. `qualify` is a list of
+ *     alternative rules (OR); a rule is a map of `group -> minimum matched
+ *     clues` (AND across its entries). This expresses the grouped boolean gates
+ *     of the original matchers, e.g. `(cfdi>=2) OR (invoice>=1 AND customs>=1)`.
+ *     When `qualify` is absent, the gate falls back to `score >= minScore`.
+ *   - RANKING (which qualifying matcher wins?): always the weighted sum of
+ *     matched clues (`score`), tie-broken by fraction -> priority -> type.
+ *
+ * Selection only depends on clues/qualify. The winning matcher's `extractors` /
+ * `resolveType` / `extractNumPedimento` / `extractPedimentoYear` run AFTER
+ * selection with the same post-processing as `extractDocumentFields`, so a
+ * side-by-side comparison isolates the selection change.
+ */
+import path from 'path';
+import { FieldResult } from '../document-type-shared.js';
+// Compile cache for string patterns (from DB matchers): stable across every
+// document in a run, so compile once instead of per document. Invalid patterns
+// cache as null and are treated as a non-match (parity with the TS engine).
+const regexCache = new Map(); // key "<flags> <pattern>" → compiled RegExp, or null for a pattern that failed to compile
+const REGEX_CACHE_MAX = 5000; // once the cache holds this many entries it is cleared entirely before the next insert
+function toRegExp(clue) { // returns the RegExp to test for this clue, or null when its string pattern does not compile
+  if (clue.pattern instanceof RegExp) {
+    // Local-seed patterns are already compiled; reset lastIndex so a g/y flag
+    // can't make repeated .test() calls stateful across documents.
+    clue.pattern.lastIndex = 0;
+    return clue.pattern; // precompiled patterns bypass the cache and keep their own flags
+  }
+  const safeFlags = (clue.flags ?? '').replace(/[gy]/g, ''); // drop the stateful g/y flags; a missing `flags` becomes ''
+  const key = `${safeFlags} ${clue.pattern}`;
+  let re = regexCache.get(key);
+  if (re === undefined) { // undefined means "not cached"; a cached null (invalid pattern) is returned as-is and not recompiled
+    try {
+      re = new RegExp(clue.pattern, safeFlags);
+    } catch {
+      re = null; // invalid pattern or flags: remember the failure instead of throwing
+    }
+    if (regexCache.size >= REGEX_CACHE_MAX) regexCache.clear();
+    regexCache.set(key, re);
+  }
+  return re;
+}
+// Cap the text a single regex runs on. Real extracted document text is far
+// below this; the cap only bounds pathological/crafted inputs so an allowed
+// (quadratic) pattern can't blow up on a megabyte-scale adversarial string.
+const MATCH_INPUT_CAP = 1_000_000; // compared against `raw.length` below
+function clueTarget(clue, ctx) { // picks the string a clue is tested against and truncates it to the cap
+  // FILENAME_REGEX tests the file name; every other kind tests the content.
+  const raw =
+    clue.kind === 'FILENAME_REGEX' ? (ctx.fileName ?? '') : (ctx.source ?? ''); // a missing value falls back to the empty string
+  return raw.length > MATCH_INPUT_CAP ? raw.slice(0, MATCH_INPUT_CAP) : raw; // keeps only the leading part; text beyond the cap is never matched
+}
+/**
+ * Score a single matcher against a document context.
+ *
+ * Clues are evaluated in array order. A hit on a `negative` clue or a miss on
+ * a `required` clue returns immediately, so later clues are never tested.
+ * `negative` clues add nothing to `score` or `totalWeight`; a clue without a
+ * `weight` counts as 1. When `ctx.extension` is empty, or `matcher.extensions`
+ * is missing or empty, the extension filter is skipped. The extension is
+ * lower-cased before comparison but `matcher.extensions` entries are compared
+ * as written.
+ *
+ * @param {object} matcher - Matcher in the shape described in the file header.
+ * @param {object} ctx - Document context; this function reads `ctx.extension`
+ *   and passes `ctx` on to `clueTarget` (which reads `fileName` / `source`).
+ * @returns {null} when the matcher does not apply (extension mismatch),
+ *          `{ disqualified: true, reason }` when a `required`/`negative` clue
+ *          rules it out, otherwise a scored result object.
+ *          The scored result carries a boolean `passed` that may be `false`;
+ *          callers must check it before treating the result as a match.
+ */
+export function scoreMatcher(matcher, ctx) {
+  const ext = (ctx.extension ?? '').toLowerCase();
+  if (
+    Array.isArray(matcher.extensions) &&
+    matcher.extensions.length > 0 &&
+    ext &&
+    !matcher.extensions.includes(ext)
+  ) {
+    return null;
+  }
+  let matchedWeight = 0;
+  let totalWeight = 0;
+  const matchedClues = [];
+  const groupCounts = {};
+  for (const clue of matcher.clues ?? []) {
+    const weight = clue.weight ?? 1;
+    const re = toRegExp(clue);
+    const hit = re ? re.test(clueTarget(clue, ctx)) : false; // a null regex (invalid pattern) counts as a miss
+    if (clue.negative) {
+      if (hit) {
+        return {
+          documentType: matcher.documentType,
+          disqualified: true,
+          reason: `negative:${clue.pattern}`,
+        };
+      }
+      continue; // a negative clue that misses contributes nothing to either weight total
+    }
+    if (clue.required && !hit) {
+      return {
+        documentType: matcher.documentType,
+        disqualified: true,
+        reason: `required-missing:${clue.pattern}`,
+      };
+    }
+    totalWeight += weight; // denominator of `fraction`: all non-negative clues, hit or not
+    if (hit) {
+      matchedWeight += weight;
+      matchedClues.push(clue);
+      if (clue.group) {
+        groupCounts[clue.group] = (groupCounts[clue.group] ?? 0) + 1; // counts matched clues per group, independent of weight
+      }
+    }
+  }
+  // Qualification gate: `qualify` rules (OR-of-ANDs over group counts) when
+  // present, otherwise the weighted-score threshold.
+  const passed = Array.isArray(matcher.qualify)
+    ? matcher.qualify.some((rule) =>
+        Object.entries(rule).every(
+          ([group, min]) => (groupCounts[group] ?? 0) >= min,
+        ),
+      )
+    : matchedWeight >= (matcher.minScore ?? 1); // `minScore` is ignored whenever `qualify` is an array
+  return {
+    documentType: matcher.documentType,
+    matcher,
+    disqualified: false,
+    score: matchedWeight,
+    totalWeight,
+    fraction: totalWeight > 0 ? matchedWeight / totalWeight : 0,
+    priority: matcher.priority ?? 0,
+    passed,
+    matchedClues,
+    groupCounts,
+  };
+}
+/**
+ * All qualifying candidates, sorted best-first.
+ * Order: score desc → fraction desc → priority desc → documentType (stable).
+ *
+ * A matcher is kept only when `scoreMatcher` returns a result that is
+ * non-null, not `disqualified`, and has `passed` set. The final tie-break
+ * compares `documentType` as strings with `localeCompare`.
+ *
+ * @param {Iterable<object>} matchers - Matchers to evaluate.
+ * @param {object} ctx - Document context forwarded unchanged to `scoreMatcher`.
+ * @returns {object[]} Scored results, best first; empty when nothing qualifies.
+ */
+export function scoreAll(matchers, ctx) {
+  const candidates = [];
+  for (const matcher of matchers) {
+    const result = scoreMatcher(matcher, ctx);
+    if (!result || result.disqualified || !result.passed) continue;
+    candidates.push(result);
+  }
+  candidates.sort(
+    (a, b) =>
+      b.score - a.score ||
+      b.fraction - a.fraction ||
+      b.priority - a.priority ||
+      String(a.documentType).localeCompare(String(b.documentType)),
+  );
+  return candidates;
+}
+export function selectBestMatch(matchers, ctx) { // top-ranked result from scoreAll, or null when no matcher qualifies
+  return scoreAll(matchers, ctx)[0] ?? null;
+}
+/**
+ * Full classification: pick the best matcher, then run ITS extractors /
+ * resolveType / pedimento helpers. Post-selection logic mirrors
+ * `extractDocumentFields` so a comparison isolates the selection change.
+ *
+ * Selection runs on a context built from `source`, `extension` and the base
+ * name of `filePath` (empty string when `filePath` is falsy). Extractors and
+ * pedimento helpers receive the full `source`. An extractor that throws is
+ * recorded as a placeholder `FieldResult` instead of aborting classification.
+ * When nothing qualifies, `detectedType`, `detectedPedimento`,
+ * `detectedPedimentoYear` and `winner` are `null` and `fields` is empty.
+ *
+ * @param {Iterable<object>} matchers - Matchers to evaluate.
+ * @param {object} doc - Document to classify.
+ * @param {string} doc.source - Document content to match and extract from.
+ * @param {string} doc.extension - File extension used to filter matchers.
+ * @param {string} [doc.filePath] - Path used for the file name and passed to
+ *   the pedimento helpers.
+ * @returns {{ detectedType, fields, detectedPedimento, detectedPedimentoYear,
+ *             winner, candidates }}
+ */
+export function classifyDocument(matchers, { source, extension, filePath }) {
+  const ctx = {
+    source,
+    extension,
+    fileName: filePath ? path.basename(filePath) : '',
+  };
+  const candidates = scoreAll(matchers, ctx);
+  const winner = candidates[0] ?? null;
+  if (!winner) {
+    return {
+      detectedType: null,
+      fields: [],
+      detectedPedimento: null,
+      detectedPedimentoYear: null,
+      winner: null,
+      candidates,
+    };
+  }
+  const def = winner.matcher;
+  const fields = [];
+  for (const extractor of def.extractors ?? []) {
+    try {
+      fields.push(extractor.extract(source));
+    } catch {
+      fields.push(new FieldResult(extractor.field, false, null)); // best-effort: one failing extractor does not abort the others
+    }
+  }
+  const resolvedType = def.resolveType
+    ? def.resolveType(fields)
+    : def.documentType; // without a resolver the matcher's own documentType is reported
+  const pedimento = def.extractNumPedimento
+    ? def.extractNumPedimento(source, fields, filePath)
+    : null;
+  const year = def.extractPedimentoYear
+    ? def.extractPedimentoYear(source, fields, filePath)
+    : null;
+  // Backfill numPedimento as a field (same as extractDocumentFields) so
+  // downstream consumers (composeArelaPath) see a consistent shape.
+  if (pedimento && !fields.some((f) => f.name === 'numPedimento')) {
+    fields.push(new FieldResult('numPedimento', true, pedimento));
+  }
+  return {
+    detectedType: resolvedType,
+    fields,
+    detectedPedimento: pedimento,
+    detectedPedimentoYear: year,
+    winner,
+    candidates,
+  };
+}

package/src/services/ScanApiService.js CHANGED Viewed

@@ -389,6 +389,20 @@ export class ScanApiService {
     return result;
   }
+  /**
+   * Fetch the resolved matcher set (this RFC's matchers + globals) for runtime
+   * classification. Returns an array of matchers with clues + fieldExtractors.
+   *
+   * Calls `this.#request` with method `'GET'` on
+   * `/api/document-matcher/resolved`, appending `?rfc=<value>` (URI-encoded)
+   * only when `rfc` is truthy. A non-array result is replaced by an empty
+   * array; errors thrown by the request are not caught here.
+   * @param {string|null} rfc - optional RFC to scope per-company matchers
+   * @returns {Promise<Array>} the result when it is an array, otherwise `[]`
+   */
+  async getResolvedMatchers(rfc = null) {
+    const qs = rfc ? `?rfc=${encodeURIComponent(rfc)}` : '';
+    const result = await this.#request(
+      `/api/document-matcher/resolved${qs}`,
+      'GET',
+    );
+    return Array.isArray(result) ? result : [];
+  }
   async fetchPdfsForDetection(
     tableName,
     offset = 0,