npm - @arela/uploader - Versions diffs - 1.0.23 → 1.0.24 - Mend

@arela/uploader 1.0.23 → 1.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

package/package.json +1 -1
package/scripts/scoring-compare.js +243 -0
package/scripts/scoring-phase4-check.js +96 -0
package/src/commands/IdentifyCommand.js +28 -0
package/src/config/config.js +2 -2
package/src/file-detection.js +29 -3
package/src/scoring/db-matcher-adapter.js +98 -0
package/src/scoring/matchers-seed.js +386 -0
package/src/scoring/scoring-engine.js +218 -0
package/src/services/ScanApiService.js +14 -0
package/tests/unit/scoring-engine.test.js +221 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@arela/uploader",
-  "version": "1.0.23",
+  "version": "1.0.24",
   "description": "CLI to upload files/directories to Arela",
   "bin": {
     "arela": "./src/index.js"

package/scripts/scoring-compare.js ADDED Viewed

@@ -0,0 +1,243 @@
+/**
+ * Scoring engine validation harness (PROTOTYPE).
+ *
+ * Runs the CURRENT first-match-wins engine (`extractDocumentFields`) and the new
+ * best-match scoring engine (`classifyDocument`) over the same corpus and prints
+ * a side-by-side comparison so we can confirm best-match reproduces (or
+ * improves on) the current behaviour before wiring it into the pipeline.
+ *
+ * Usage:
+ *   node scripts/scoring-compare.js                # built-in synthetic samples
+ *   node scripts/scoring-compare.js <folder>       # + real .pdf/.xml/.txt files
+ *
+ * The built-in samples include the `factura_inter_agencia` vs
+ * `factura_comercial` case, which the current engine only resolves via
+ * registration order — the harness shows best-match resolving it by score,
+ * independent of matcher order.
+ */
+import fs from 'fs';
+import path from 'path';
+import { extractDocumentFields } from '../src/document-type-shared.js';
+import FileDetectionService from '../src/file-detection.js';
+import { classifyDocument, scoreAll } from '../src/scoring/scoring-engine.js';
+import { scoringMatchers } from '../src/scoring/matchers-seed.js';
+// --------------------------- synthetic corpus -------------------------------
+// Compact, representative texts that trigger the relevant clues. Real pdf-parse
+// output is messier — pass a folder to validate against production documents.
+//
+// Each sample carries:
+//   name      - label printed in the first column of the comparison table
+//   extension - extension handed to both engines ('pdf' or 'xml' here)
+//   expected  - document type the best-match result is checked against
+//   filePath  - optional path handed to both engines (only 'completo-xml' sets one)
+//   text      - document content handed to both engines as the source
+const SAMPLES = [
+  {
+    name: 'simplificado-paid',
+    extension: 'pdf',
+    expected: 'pedimento_simplificado',
+    text: `FORMA SIMPLIFICADA DEL PEDIMENTO
+NUM. PEDIMENTO: 26 07 3429 6000079
+CVE. PEDIMENTO: A1
+T. OPER: IMP
+RFC: CSM9204097Q1
+FECHA DE PAGO: 04/03/2026
+*** PAGO ELECTRONICO ***`,
+  },
+  {
+    name: 'simplificado-unpaid (proforma)',
+    extension: 'pdf',
+    expected: 'proforma',
+    text: `FORMA SIMPLIFICADA DE PEDIMENTO
+NUM. PEDIMENTO: 26 07 3429 6000080
+CVE. PEDIMENTO: A1
+T. OPER: IMP
+RFC: CSM9204097Q1
+*** NO PAGADO ***`,
+  },
+  {
+    name: 'completo',
+    extension: 'pdf',
+    expected: 'pedimento_completo',
+    text: `NUM. PEDIMENTO: 26 07 3429 2002089
+CVE. PEDIMENTO: A1
+T. OPER: IMP
+SEGUNDA COPIA TRANSPORTISTA
+CERTIFICACIONES
+CUADRO DE LIQUIDACION
+*** PAGO ELECTRONICO ***
+FECHA DE PAGO: 02/03/2026`,
+  },
+  {
+    name: 'completo-xml',
+    extension: 'xml',
+    filePath: '/tmp/260734296016642.xml',
+    expected: 'pedimento_completo_xml',
+    text: `<?xml version="1.0"?>
+<ns2:consultarPedimentoCompletoRespuesta>
+  <ns2:pedimento>6016642</ns2:pedimento>
+  <ns2:aduanaEntradaSalida><ns2:clave>70</ns2:clave></ns2:aduanaEntradaSalida>
+  <ns2:fechas><ns2:clave>2</ns2:clave><ns2:fecha>2026-03-02-06:00</ns2:fecha></ns2:fechas>
+  <ns2:fechas><ns2:clave>5</ns2:clave><ns2:fecha>2026-02-20-06:00</ns2:fecha></ns2:fechas>
+  <ns2:rfc>CSM9204097Q1</ns2:rfc>
+</ns2:consultarPedimentoCompletoRespuesta>`,
+  },
+  {
+    name: 'doda-pdf',
+    extension: 'pdf',
+    expected: 'doda_pdf',
+    text: `DOCUMENTO DE OPERACION PARA DESPACHO ADUANERO
+DODA
+VUCEM
+||070|3429|2|4009029|109335668|A231|
+2026-03-02`,
+  },
+  {
+    name: 'doda-xml',
+    extension: 'xml',
+    expected: 'doda_xml',
+    text: `<?xml version="1.0"?>
+<documentoOperacion>
+  <numPedimento>260734292002089</numPedimento>
+  <patenteAduanal>3429</patenteAduanal>
+  <aduanaDespacho>07</aduanaDespacho>
+</documentoOperacion>`,
+  },
+  {
+    name: 'inter-agencia (vs comercial)',
+    extension: 'xml',
+    expected: 'factura_inter_agencia',
+    text: `<cfdi:Comprobante xmlns:cfdi="..." TipoDeComprobante="I">
+  <cfdi:Emisor Rfc="NAA120215F20"/>
+  <cfdi:Receptor Rfc="PCC1008161WA"/>
+  <cfdi:Concepto ClaveProdServ="78141502" Descripcion="Servicios de agente aduanal"/>
+</cfdi:Comprobante>`,
+  },
+  {
+    name: 'factura-comercial',
+    extension: 'xml',
+    expected: 'factura_comercial',
+    text: `<cfdi:Comprobante xmlns:cfdi="..." TipoDeComprobante="I">
+  <cfdi:Emisor Rfc="ABC010101AB1"/>
+  <cfdi:Receptor Rfc="XYZ020202CD2"/>
+  <tfd:TimbreFiscalDigital/>
+  pedimento 26 07 3429 6016477
+</cfdi:Comprobante>`,
+  },
+  {
+    name: 'support-document',
+    extension: 'xml',
+    expected: 'support_document',
+    text: `<?xml version="1.0"?>
+<soapenv:Envelope xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/">
+  <oxml:tipoOperacion>IMP</oxml:tipoOperacion>
+  <oxml:patenteAduanal>3429</oxml:patenteAduanal>
+</soapenv:Envelope>`,
+  },
+];
+// --------------------------- comparison -------------------------------------
+/**
+ * Document type reported by the legacy first-match-wins engine.
+ * @param {string} source - document text
+ * @param {string} extension - file extension without the dot
+ * @param {string} [filePath] - path of the document, when there is one
+ * @returns {*} first element of the tuple returned by extractDocumentFields
+ */
+function firstMatchType(source, extension, filePath) {
+  const legacyTuple = extractDocumentFields(source, extension, filePath);
+  return legacyTuple[0];
+}
+/**
+ * Classifies one document with the best-match scoring engine, using the local
+ * seed matchers.
+ * @param {string} source - document text
+ * @param {string} extension - file extension without the dot
+ * @param {string} [filePath] - path of the document, when there is one
+ * @returns {object} whatever classifyDocument returns; callers read `detectedType`
+ */
+function bestMatchResult(source, extension, filePath) {
+  const documentInput = { source, extension, filePath };
+  return classifyDocument(scoringMatchers, documentInput);
+}
+/**
+ * Formats the first `n` entries returned by scoreAll as
+ * "documentType:score, documentType:score, ...".
+ *
+ * NOTE(review): this passes `fileName` (a basename) while the classifyDocument
+ * calls in this script pass `filePath` — confirm which key scoreAll expects.
+ *
+ * @param {string} source - document text
+ * @param {string} extension - file extension without the dot
+ * @param {string} [filePath] - path of the document; only its basename is used
+ * @param {number} [n=3] - how many candidates to include
+ * @returns {string} comma-separated candidate list
+ */
+function topCandidates(source, extension, filePath, n = 3) {
+  const fileName = filePath ? path.basename(filePath) : '';
+  const scored = scoreAll(scoringMatchers, { source, extension, fileName });
+  const labels = [];
+  for (const candidate of scored.slice(0, n)) {
+    labels.push(`${candidate.documentType}:${candidate.score}`);
+  }
+  return labels.join(', ');
+}
+/**
+ * Builds one line of the comparison table.
+ * @param {string} name - label, padded to 34 characters
+ * @param {*} first - type from the first-match engine (stringified, padded to 24)
+ * @param {*} best - type from the best-match engine (stringified, padded to 24)
+ * @param {*} expected - expected type; a falsy value leaves the last column blank
+ * @returns {string} formatted table row
+ */
+function row(name, first, best, expected) {
+  let agree = 'NO ';
+  if (first === best) {
+    agree = 'sí ';
+  }
+  let vsExp = '   ';
+  if (expected) {
+    vsExp = best === expected ? 'ok ' : '⚠️ ';
+  }
+  const columns = [
+    name.padEnd(34),
+    `first=${String(first).padEnd(24)}`,
+    `best=${String(best).padEnd(24)}`,
+    `coinciden=${agree}`,
+    `esperado=${vsExp}`,
+  ];
+  return columns.join(' ');
+}
+/**
+ * Entry point of the harness. Prints, in order:
+ *   1. one comparison row per built-in sample (first-match vs best-match),
+ *   2. the order-independence check for the inter-agencia sample,
+ *   3. when a folder is given as the first CLI argument, one row per
+ *      .pdf/.xml/.txt file found under it,
+ *   4. a summary with the number of documents and first-vs-best divergences.
+ */
+async function run() {
+  const folder = process.argv[2];
+  // Counters shared by the synthetic samples and the optional real files.
+  let total = 0;
+  let disagreements = 0;
+  console.log('\n=== Muestras sintéticas ===');
+  for (const s of SAMPLES) {
+    const first = firstMatchType(s.text, s.extension, s.filePath);
+    const best = bestMatchResult(s.text, s.extension, s.filePath).detectedType;
+    total++;
+    if (first !== best) disagreements++;
+    console.log(row(s.name, first, best, s.expected));
+  }
+  // Order-independence demonstration for the inter-agencia/comercial case.
+  const ia = SAMPLES.find((s) => s.name.startsWith('inter-agencia'));
+  // Copy before reversing so the imported seed array itself is left untouched.
+  const reversed = [...scoringMatchers].reverse();
+  const normalWinner = classifyDocument(scoringMatchers, {
+    source: ia.text,
+    extension: ia.extension,
+  }).detectedType;
+  const reversedWinner = classifyDocument(reversed, {
+    source: ia.text,
+    extension: ia.extension,
+  }).detectedType;
+  console.log('\n=== Independencia de orden (inter-agencia) ===');
+  console.log(`candidatos (por score): ${topCandidates(ia.text, ia.extension)}`);
+  console.log(`seed normal   -> ${normalWinner}`);
+  console.log(`seed invertido-> ${reversedWinner}`);
+  console.log(
+    `order-independent: ${normalWinner === reversedWinner ? 'sí ✅' : 'NO ❌'}`,
+  );
+  // Optional: real files from a folder.
+  if (folder) {
+    if (!fs.existsSync(folder)) {
+      // A missing folder is reported, but the summary below is still printed.
+      console.error(`\nCarpeta no existe: ${folder}`);
+    } else {
+      console.log(`\n=== Archivos reales (${folder}) ===`);
+      const detection = new FileDetectionService();
+      const files = walk(folder).filter((f) =>
+        ['.pdf', '.xml', '.txt'].includes(path.extname(f).toLowerCase()),
+      );
+      for (const file of files) {
+        // Extension without the dot, lower-cased, as both engines receive it.
+        const ext = path.extname(file).toLowerCase().replace('.', '');
+        let text = '';
+        try {
+          // PDFs go through the detection service; everything else is read as UTF-8.
+          text =
+            ext === 'pdf'
+              ? await detection.extractTextFromPDF(file)
+              : fs.readFileSync(file, 'utf8');
+        } catch (err) {
+          // Unreadable files are reported and left out of the totals.
+          console.log(`${path.basename(file).padEnd(34)} ERROR: ${err.message}`);
+          continue;
+        }
+        const first = firstMatchType(text, ext, file);
+        const best = bestMatchResult(text, ext, file).detectedType;
+        total++;
+        if (first !== best) disagreements++;
+        // Real files have no expected type, so the last column stays blank.
+        console.log(row(path.basename(file), first, best, null));
+      }
+    }
+  }
+  console.log(
+    `\n=== Resumen: ${total} documentos, ${disagreements} divergencias first-vs-best ===\n`,
+  );
+}
+/**
+ * Recursively lists every non-directory entry under `dir`.
+ * @param {string} dir - directory to scan
+ * @returns {string[]} paths built by joining `dir` with each entry name
+ */
+function walk(dir) {
+  const entries = fs.readdirSync(dir, { withFileTypes: true });
+  return entries.flatMap((entry) => {
+    const full = path.join(dir, entry.name);
+    return entry.isDirectory() ? walk(full) : [full];
+  });
+}
+// Handle a rejected run() explicitly instead of leaving the promise floating:
+// print the error and signal the failure through the exit code.
+run().catch((err) => {
+  console.error(err);
+  process.exitCode = 1;
+});

package/scripts/scoring-phase4-check.js ADDED Viewed

@@ -0,0 +1,96 @@
+/**
+ * Phase 4 validation: runs the REAL runtime path the uploader now uses
+ * (DB-shape matchers -> adaptDbMatchers -> classifyDocument with rich extraction)
+ * against a corpus and compares it to the legacy engine (extractDocumentFields).
+ *
+ * Usage: node scripts/scoring-phase4-check.js <folder>
+ */
+import fs from 'fs';
+import path from 'path';
+import { extractDocumentFields } from '../src/document-type-shared.js';
+import FileDetectionService from '../src/file-detection.js';
+import { adaptDbMatchers } from '../src/scoring/db-matcher-adapter.js';
+import { scoringMatchers } from '../src/scoring/matchers-seed.js';
+import { classifyDocument } from '../src/scoring/scoring-engine.js';
+/**
+ * Serializes the local seed matchers into the shape returned by the API
+ * `/resolved` endpoint, so the adapter is exercised exactly as in production.
+ * RegExp patterns are split into `pattern` (source) and `flags`; string
+ * patterns are kept and paired with the clue's own `flags` (or '').
+ * @param {Array} matchers - seed matchers
+ * @returns {Array} matchers in DB shape, always with empty `fieldExtractors`
+ */
+function toDbShape(matchers) {
+  const serializeClue = (clue) => {
+    const isRegex = clue.pattern instanceof RegExp;
+    return {
+      kind: clue.kind,
+      pattern: isRegex ? clue.pattern.source : clue.pattern,
+      flags: isRegex ? clue.pattern.flags : clue.flags || '',
+      weight: clue.weight ?? 1,
+      group: clue.group ?? null,
+      required: Boolean(clue.required),
+      negative: Boolean(clue.negative),
+    };
+  };
+  return matchers.map((matcher) => {
+    const seedClues = matcher.clues || [];
+    return {
+      documentType: matcher.documentType,
+      extensions: matcher.extensions,
+      minScore: matcher.minScore ?? null,
+      priority: matcher.priority ?? 0,
+      qualify: matcher.qualify ?? null,
+      clues: seedClues.map(serializeClue),
+      fieldExtractors: [], // rich extraction comes from the registry by documentType
+    };
+  });
+}
+// Seed matchers round-tripped through the DB shape and then the adapter; this
+// is the matcher set every file below is classified with.
+const adapted = adaptDbMatchers(toDbShape(scoringMatchers));
+/**
+ * Collects the path of every non-directory entry below `dir`, descending into
+ * sub-directories as they are encountered.
+ * @param {string} dir - directory to scan
+ * @returns {string[]} collected paths
+ */
+function walk(dir) {
+  const collected = [];
+  const visit = (current) => {
+    const entries = fs.readdirSync(current, { withFileTypes: true });
+    for (const entry of entries) {
+      const target = path.join(current, entry.name);
+      if (entry.isDirectory()) {
+        visit(target);
+      } else {
+        collected.push(target);
+      }
+    }
+  };
+  visit(dir);
+  return collected;
+}
+/**
+ * Classifies every .pdf/.xml/.txt file under the folder given as the first CLI
+ * argument with both the legacy engine and the adapted best-match matchers,
+ * printing each divergence and a final summary grouped by "legacy -> phase4".
+ *
+ * Exits with code 1 when no folder is passed or when it is not an existing
+ * directory. Files whose text cannot be read are reported and left out of the
+ * totals instead of being dropped silently.
+ */
+async function run() {
+  const folder = process.argv[2];
+  if (!folder) {
+    console.error('Pass a folder: node scripts/scoring-phase4-check.js <folder>');
+    process.exit(1);
+  }
+  // Fail with a clear message rather than an ENOENT/ENOTDIR from readdirSync.
+  if (!fs.existsSync(folder) || !fs.statSync(folder).isDirectory()) {
+    console.error(`Folder does not exist or is not a directory: ${folder}`);
+    process.exit(1);
+  }
+  const detection = new FileDetectionService();
+  const files = walk(folder).filter((f) =>
+    ['.pdf', '.xml', '.txt'].includes(path.extname(f).toLowerCase()),
+  );
+  let total = 0;
+  let diverge = 0;
+  let skipped = 0;
+  // "legacy -> phase4" pair -> number of files showing that divergence.
+  const patterns = {};
+  for (const file of files) {
+    // Extension without the dot, lower-cased, as both engines receive it.
+    const ext = path.extname(file).toLowerCase().replace('.', '');
+    let text = '';
+    try {
+      text =
+        ext === 'pdf'
+          ? await detection.extractTextFromPDF(file)
+          : fs.readFileSync(file, 'utf8');
+    } catch (err) {
+      // Keep going, but make the gap in the corpus visible.
+      skipped++;
+      console.log(`ERR ${path.basename(file).padEnd(40)} ${err.message}`);
+      continue;
+    }
+    const legacy = extractDocumentFields(text, ext, file)[0];
+    const phase4 = classifyDocument(adapted, {
+      source: text,
+      extension: ext,
+      filePath: file,
+    }).detectedType;
+    total++;
+    if (legacy !== phase4) {
+      diverge++;
+      const key = `${legacy} -> ${phase4}`;
+      patterns[key] = (patterns[key] || 0) + 1;
+      console.log(`NO  ${path.basename(file).padEnd(40)} ${key}`);
+    }
+  }
+  console.log(`\n=== Fase 4 vs legacy: ${total} docs, ${diverge} divergencias ===`);
+  for (const [k, n] of Object.entries(patterns)) console.log(`  ${n}×  ${k}`);
+  if (skipped > 0) console.log(`  ${skipped} file(s) skipped (unreadable)`);
+}
+// Handle a rejected run() explicitly instead of leaving the promise floating:
+// print the error and signal the failure through the exit code.
+run().catch((err) => {
+  console.error(err);
+  process.exitCode = 1;
+});

package/src/commands/IdentifyCommand.js CHANGED Viewed

@@ -8,6 +8,8 @@ import appConfig from '../config/config.js';
 import ErrorHandler from '../errors/ErrorHandler.js';
 import { ConfigurationError } from '../errors/ErrorTypes.js';
 import FileDetectionService from '../file-detection.js';
+import { adaptDbMatchers } from '../scoring/db-matcher-adapter.js';
+import { scoringMatchers } from '../scoring/matchers-seed.js';
 /**
  * Paid pedimento detected_type values.
@@ -69,6 +71,32 @@ export class IdentifyCommand {
       );
       this.scanApiService = new ScanApiService(apiTarget);
+      // Load matchers for best-match classification (phase 4 hybrid). Prefer the
+      // DB-resolved set (this RFC + globals); fall back to the validated local
+      // seed; set DISABLE_SCORING_MATCHERS=true to force legacy first-match.
+      if (process.env.DISABLE_SCORING_MATCHERS === 'true') {
+        logger.info('🧩 Scoring matchers disabled — legacy detection');
+      } else {
+        let matchers = null;
+        try {
+          const rfc = process.env.MATCHER_RFC || null;
+          const dbMatchers = await this.scanApiService.getResolvedMatchers(rfc);
+          if (dbMatchers.length) {
+            matchers = adaptDbMatchers(dbMatchers);
+            logger.info(`🧩 Loaded ${matchers.length} matchers from API`);
+          }
+        } catch (err) {
+          logger.warn(`🧩 Could not load matchers from API: ${err.message}`);
+        }
+        if (!matchers) {
+          matchers = scoringMatchers;
+          logger.info(`🧩 Using local seed matchers (${matchers.length})`);
+        }
+        if (typeof this.detectionService.setMatchers === 'function') {
+          this.detectionService.setMatchers(matchers);
+        }
+      }
       const scanConfig = appConfig.getScanConfig();
       const batchSize = parseInt(options.batchSize) || 100;

package/src/config/config.js CHANGED Viewed

@@ -37,10 +37,10 @@ class Config {
       const __dirname = path.dirname(__filename);
       const packageJsonPath = path.resolve(__dirname, '../../package.json');
       const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf-8'));
-      return packageJson.version || '1.0.23';
+      return packageJson.version || '1.0.24';
     } catch (error) {
       console.warn('⚠️ Could not read package.json version, using fallback');
-      return '1.0.23';
+      return '1.0.24';
     }
   }

package/src/file-detection.js CHANGED Viewed

@@ -3,6 +3,7 @@ import path from 'path';
 import { PDFParse } from 'pdf-parse';
 import { extractDocumentFields } from './document-type-shared.js';
+import { classifyDocument } from './scoring/scoring-engine.js';
 // Document types that participate in arela_path composition.
 const ARELA_PATH_TYPES = new Set([
@@ -84,6 +85,17 @@ function composeArelaPath(
  * Detects document types and extracts metadata from files
  */
 export class FileDetectionService {
+  /**
+   * Creates a detection service with no scoring matchers configured.
+   */
+  constructor() {
+    // Best-match matchers (adapted from the API). When set, classification uses
+    // the scoring engine; otherwise it falls back to legacy first-match-wins.
+    this.matchers = null;
+  }
+  /**
+   * Installs the matcher set used for scoring-based classification.
+   * A missing or empty set is stored as `null`.
+   * @param {Array|null|undefined} matchers - resolved and adapted matchers
+   */
+  setMatchers(matchers) {
+    if (matchers && matchers.length) {
+      this.matchers = matchers;
+    } else {
+      this.matchers = null;
+    }
+  }
   /**
    * Detect document type from a file
    * @param {string} filePath - Path to the file to analyze
@@ -140,9 +152,23 @@ export class FileDetectionService {
         };
       }
-      // Extract document fields and detect type
-      const [detectedType, fields, detectedPedimento, detectedPedimentoYear] =
-        extractDocumentFields(text, fileExtension, filePath);
+      // Extract document fields and detect type. Use the best-match scoring
+      // engine when matchers are configured; otherwise legacy first-match-wins.
+      let detectedType, fields, detectedPedimento, detectedPedimentoYear;
+      if (this.matchers) {
+        const r = classifyDocument(this.matchers, {
+          source: text,
+          extension: fileExtension,
+          filePath,
+        });
+        detectedType = r.detectedType;
+        fields = r.fields;
+        detectedPedimento = r.detectedPedimento;
+        detectedPedimentoYear = r.detectedPedimentoYear;
+      } else {
+        [detectedType, fields, detectedPedimento, detectedPedimentoYear] =
+          extractDocumentFields(text, fileExtension, filePath);
+      }
       // Extract RFC from fields
       const rfc = fields?.find((f) => f.name === 'rfc')?.value ?? null;

package/src/scoring/db-matcher-adapter.js ADDED Viewed

@@ -0,0 +1,98 @@
+/**
+ * Adapt DB matchers (from arela-api `GET /document-matcher/resolved`) into the
+ * shape the scoring engine consumes — the HYBRID model:
+ *
+ *   - SELECTION comes from the DB matcher's clues / qualify (per-RFC + globals).
+ *   - EXTRACTION uses the rich JS extractors keyed by `documentType` when one
+ *     exists (resolveType, multi-pattern field extractors, pedimento composition);
+ *     otherwise it falls back to building simple regex extractors from the DB
+ *     matcher's `fieldExtractors`.
+ *
+ * This keeps per-client matching configurable from the UI while preserving the
+ * robust field extraction that already ships in the uploader.
+ */
+// IMPORTANT: load document-type-shared FIRST so it becomes the root of the
+// shared<->definitions import cycle and fully evaluates before the individual
+// definitions are referenced (otherwise: "Cannot access X before initialization").
+import { FieldResult } from '../document-type-shared.js';
+import { dodaPdfDefinition } from '../document-types/doda-pdf.js';
+import { dodaXmlDefinition } from '../document-types/doda-xml.js';
+import { facturaInterAgenciaDefinition } from '../document-types/factura-inter-agencia.js';
+import { facturasComerciales } from '../document-types/facturas-comerciales.js';
+import { pedimentoCompletoXmlDefinition } from '../document-types/pedimento-completo-xml.js';
+import { pedimentoCompletoDefinition } from '../document-types/pedimento-completo.js';
+import { pedimentoSimplificadoDefinition } from '../document-types/pedimento-simplificado.js';
+import { supportDocumentDefinition } from '../document-types/support-document.js';
+/**
+ * Picks the extraction-related members out of a full JS document definition.
+ * @param {object} def - document type definition
+ * @returns {object} `extractors`, `resolveType`, `extractNumPedimento` and
+ *   `extractPedimentoYear` exactly as found on `def` (undefined when absent)
+ */
+function extractionOf(def) {
+  const { extractors, resolveType, extractNumPedimento, extractPedimentoYear } =
+    def;
+  return { extractors, resolveType, extractNumPedimento, extractPedimentoYear };
+}
+// Lookup table: documentType -> extraction members (see extractionOf) of the
+// JS definition shipped for that type. adaptDbMatchers consults it to decide
+// whether a DB matcher gets these extractors or ones built from its own
+// fieldExtractors.
+const EXTRACTION_REGISTRY = {
+  pedimento_simplificado: extractionOf(pedimentoSimplificadoDefinition),
+  pedimento_completo: extractionOf(pedimentoCompletoDefinition),
+  pedimento_completo_xml: extractionOf(pedimentoCompletoXmlDefinition),
+  doda_pdf: extractionOf(dodaPdfDefinition),
+  doda_xml: extractionOf(dodaXmlDefinition),
+  factura_inter_agencia: extractionOf(facturaInterAgenciaDefinition),
+  factura_comercial: extractionOf(facturasComerciales),
+  support_document: extractionOf(supportDocumentDefinition),
+};
+/**
+ * Build a scoring-engine extractor from a DB fieldExtractor (regex + capture).
+ *
+ * The extracted value is capture group 1 when the pattern has one, otherwise
+ * the whole match. An invalid pattern/flags combination or a non-string source
+ * yields a "not found" result instead of throwing.
+ *
+ * @param {{field: string, extractor: string, flags?: string}} fe - DB field extractor
+ * @returns {{field: string, extract: function(string): FieldResult}}
+ */
+function regexExtractor(fe) {
+  // Compile once per extractor rather than on every document.
+  let regex = null;
+  try {
+    // Drop `g`: only the first match is wanted, and with `g` String#match
+    // returns the list of full matches, so index 1 would be the SECOND match
+    // rather than capture group 1.
+    regex = new RegExp(fe.extractor, (fe.flags || '').replace(/g/g, ''));
+  } catch {
+    regex = null; // invalid pattern or flags: this extractor never matches
+  }
+  return {
+    field: fe.field,
+    extract: (source) => {
+      if (!regex || typeof source !== 'string') {
+        return new FieldResult(fe.field, false, null);
+      }
+      // A sticky (`y`) regex keeps lastIndex between calls; always start at 0.
+      regex.lastIndex = 0;
+      const m = regex.exec(source);
+      return new FieldResult(fe.field, !!m, m ? (m[1] ?? m[0]) : null);
+    },
+  };
+}
+/**
+ * Convert resolved DB matchers into scoring-engine matchers.
+ *
+ * Selection data (extensions, minScore, priority, qualify, clues) always comes
+ * from the DB matcher. Extraction comes from EXTRACTION_REGISTRY when the
+ * matcher's documentType is registered there; otherwise simple regex
+ * extractors are built from the DB matcher's `fieldExtractors`.
+ *
+ * @param {Array} dbMatchers - matchers from the API (with clues + fieldExtractors)
+ * @returns {Array} scoring matchers
+ */
+export function adaptDbMatchers(dbMatchers) {
+  return (dbMatchers || []).map((m) => {
+    // Own-property lookup only: a documentType such as "toString" or
+    // "constructor" must not resolve to an inherited Object.prototype member,
+    // which would silently discard the matcher's DB fieldExtractors.
+    const rich = Object.prototype.hasOwnProperty.call(
+      EXTRACTION_REGISTRY,
+      m.documentType,
+    )
+      ? EXTRACTION_REGISTRY[m.documentType]
+      : null;
+    const extraction = rich
+      ? rich
+      : { extractors: (m.fieldExtractors || []).map(regexExtractor) };
+    return {
+      documentType: m.documentType,
+      // Accept either an array or a comma-separated string of extensions.
+      extensions: Array.isArray(m.extensions)
+        ? m.extensions
+        : String(m.extensions || '')
+            .split(',')
+            .map((s) => s.trim())
+            .filter(Boolean),
+      minScore: m.minScore ?? undefined,
+      priority: m.priority ?? 0,
+      qualify: m.qualify ?? undefined,
+      clues: (m.clues || []).map((c) => ({
+        kind: c.kind,
+        pattern: c.pattern,
+        flags: c.flags || undefined,
+        weight: c.weight ?? 1,
+        group: c.group || undefined,
+        required: !!c.required,
+        negative: !!c.negative,
+      })),
+      ...extraction,
+    };
+  });
+}