@arela/uploader 1.0.22 → 1.0.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/scoring-compare.js +243 -0
- package/scripts/scoring-phase4-check.js +96 -0
- package/src/commands/IdentifyCommand.js +34 -6
- package/src/commands/ScanCommand.js +15 -0
- package/src/config/config.js +28 -2
- package/src/document-type-shared.js +15 -7
- package/src/document-types/_pedimento-shared-extractors.js +27 -8
- package/src/document-types/factura-inter-agencia.js +186 -0
- package/src/document-types/pedimento-completo-xml.js +62 -12
- package/src/document-types/pedimento-completo.js +5 -3
- package/src/document-types/pedimento-simplificado.js +5 -2
- package/src/document-types/proforma.js +2 -2
- package/src/file-detection.js +30 -6
- package/src/scoring/db-matcher-adapter.js +98 -0
- package/src/scoring/matchers-seed.js +386 -0
- package/src/scoring/scoring-engine.js +218 -0
- package/src/services/ScanApiService.js +14 -0
- package/tests/unit/factura-inter-agencia.test.js +218 -0
- package/tests/unit/pedimento-completo-xml-matcher.test.js +271 -0
- package/tests/unit/pedimento-simplificado-matcher.test.js +185 -0
- package/tests/unit/scoring-engine.test.js +221 -0
package/package.json
CHANGED
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Scoring engine validation harness (PROTOTYPE).
|
|
3
|
+
*
|
|
4
|
+
* Runs the CURRENT first-match-wins engine (`extractDocumentFields`) and the new
|
|
5
|
+
* best-match scoring engine (`classifyDocument`) over the same corpus and prints
|
|
6
|
+
* a side-by-side comparison so we can confirm best-match reproduces (or
|
|
7
|
+
* improves on) the current behaviour before wiring it into the pipeline.
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* node scripts/scoring-compare.js # built-in synthetic samples
|
|
11
|
+
* node scripts/scoring-compare.js <folder> # + real .pdf/.xml/.txt files
|
|
12
|
+
*
|
|
13
|
+
* The built-in samples include the `factura_inter_agencia` vs
|
|
14
|
+
* `factura_comercial` case, which the current engine only resolves via
|
|
15
|
+
* registration order — the harness shows best-match resolving it by score,
|
|
16
|
+
* independent of matcher order.
|
|
17
|
+
*/
|
|
18
|
+
import fs from 'fs';
|
|
19
|
+
import path from 'path';
|
|
20
|
+
|
|
21
|
+
import { extractDocumentFields } from '../src/document-type-shared.js';
|
|
22
|
+
import FileDetectionService from '../src/file-detection.js';
|
|
23
|
+
import { classifyDocument, scoreAll } from '../src/scoring/scoring-engine.js';
|
|
24
|
+
import { scoringMatchers } from '../src/scoring/matchers-seed.js';
|
|
25
|
+
|
|
26
|
+
// --------------------------- synthetic corpus -------------------------------
|
|
27
|
+
// Compact, representative texts that trigger the relevant clues. Real pdf-parse
|
|
28
|
+
// output is messier — pass a folder to validate against production documents.
|
|
29
|
+
const SAMPLES = [
|
|
30
|
+
{
|
|
31
|
+
name: 'simplificado-paid',
|
|
32
|
+
extension: 'pdf',
|
|
33
|
+
expected: 'pedimento_simplificado',
|
|
34
|
+
text: `FORMA SIMPLIFICADA DEL PEDIMENTO
|
|
35
|
+
NUM. PEDIMENTO: 26 07 3429 6000079
|
|
36
|
+
CVE. PEDIMENTO: A1
|
|
37
|
+
T. OPER: IMP
|
|
38
|
+
RFC: CSM9204097Q1
|
|
39
|
+
FECHA DE PAGO: 04/03/2026
|
|
40
|
+
*** PAGO ELECTRONICO ***`,
|
|
41
|
+
},
|
|
42
|
+
{
|
|
43
|
+
name: 'simplificado-unpaid (proforma)',
|
|
44
|
+
extension: 'pdf',
|
|
45
|
+
expected: 'proforma',
|
|
46
|
+
text: `FORMA SIMPLIFICADA DE PEDIMENTO
|
|
47
|
+
NUM. PEDIMENTO: 26 07 3429 6000080
|
|
48
|
+
CVE. PEDIMENTO: A1
|
|
49
|
+
T. OPER: IMP
|
|
50
|
+
RFC: CSM9204097Q1
|
|
51
|
+
*** NO PAGADO ***`,
|
|
52
|
+
},
|
|
53
|
+
{
|
|
54
|
+
name: 'completo',
|
|
55
|
+
extension: 'pdf',
|
|
56
|
+
expected: 'pedimento_completo',
|
|
57
|
+
text: `NUM. PEDIMENTO: 26 07 3429 2002089
|
|
58
|
+
CVE. PEDIMENTO: A1
|
|
59
|
+
T. OPER: IMP
|
|
60
|
+
SEGUNDA COPIA TRANSPORTISTA
|
|
61
|
+
CERTIFICACIONES
|
|
62
|
+
CUADRO DE LIQUIDACION
|
|
63
|
+
*** PAGO ELECTRONICO ***
|
|
64
|
+
FECHA DE PAGO: 02/03/2026`,
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
name: 'completo-xml',
|
|
68
|
+
extension: 'xml',
|
|
69
|
+
filePath: '/tmp/260734296016642.xml',
|
|
70
|
+
expected: 'pedimento_completo_xml',
|
|
71
|
+
text: `<?xml version="1.0"?>
|
|
72
|
+
<ns2:consultarPedimentoCompletoRespuesta>
|
|
73
|
+
<ns2:pedimento>6016642</ns2:pedimento>
|
|
74
|
+
<ns2:aduanaEntradaSalida><ns2:clave>70</ns2:clave></ns2:aduanaEntradaSalida>
|
|
75
|
+
<ns2:fechas><ns2:clave>2</ns2:clave><ns2:fecha>2026-03-02-06:00</ns2:fecha></ns2:fechas>
|
|
76
|
+
<ns2:fechas><ns2:clave>5</ns2:clave><ns2:fecha>2026-02-20-06:00</ns2:fecha></ns2:fechas>
|
|
77
|
+
<ns2:rfc>CSM9204097Q1</ns2:rfc>
|
|
78
|
+
</ns2:consultarPedimentoCompletoRespuesta>`,
|
|
79
|
+
},
|
|
80
|
+
{
|
|
81
|
+
name: 'doda-pdf',
|
|
82
|
+
extension: 'pdf',
|
|
83
|
+
expected: 'doda_pdf',
|
|
84
|
+
text: `DOCUMENTO DE OPERACION PARA DESPACHO ADUANERO
|
|
85
|
+
DODA
|
|
86
|
+
VUCEM
|
|
87
|
+
||070|3429|2|4009029|109335668|A231|
|
|
88
|
+
2026-03-02`,
|
|
89
|
+
},
|
|
90
|
+
{
|
|
91
|
+
name: 'doda-xml',
|
|
92
|
+
extension: 'xml',
|
|
93
|
+
expected: 'doda_xml',
|
|
94
|
+
text: `<?xml version="1.0"?>
|
|
95
|
+
<documentoOperacion>
|
|
96
|
+
<numPedimento>260734292002089</numPedimento>
|
|
97
|
+
<patenteAduanal>3429</patenteAduanal>
|
|
98
|
+
<aduanaDespacho>07</aduanaDespacho>
|
|
99
|
+
</documentoOperacion>`,
|
|
100
|
+
},
|
|
101
|
+
{
|
|
102
|
+
name: 'inter-agencia (vs comercial)',
|
|
103
|
+
extension: 'xml',
|
|
104
|
+
expected: 'factura_inter_agencia',
|
|
105
|
+
text: `<cfdi:Comprobante xmlns:cfdi="..." TipoDeComprobante="I">
|
|
106
|
+
<cfdi:Emisor Rfc="NAA120215F20"/>
|
|
107
|
+
<cfdi:Receptor Rfc="PCC1008161WA"/>
|
|
108
|
+
<cfdi:Concepto ClaveProdServ="78141502" Descripcion="Servicios de agente aduanal"/>
|
|
109
|
+
</cfdi:Comprobante>`,
|
|
110
|
+
},
|
|
111
|
+
{
|
|
112
|
+
name: 'factura-comercial',
|
|
113
|
+
extension: 'xml',
|
|
114
|
+
expected: 'factura_comercial',
|
|
115
|
+
text: `<cfdi:Comprobante xmlns:cfdi="..." TipoDeComprobante="I">
|
|
116
|
+
<cfdi:Emisor Rfc="ABC010101AB1"/>
|
|
117
|
+
<cfdi:Receptor Rfc="XYZ020202CD2"/>
|
|
118
|
+
<tfd:TimbreFiscalDigital/>
|
|
119
|
+
pedimento 26 07 3429 6016477
|
|
120
|
+
</cfdi:Comprobante>`,
|
|
121
|
+
},
|
|
122
|
+
{
|
|
123
|
+
name: 'support-document',
|
|
124
|
+
extension: 'xml',
|
|
125
|
+
expected: 'support_document',
|
|
126
|
+
text: `<?xml version="1.0"?>
|
|
127
|
+
<soapenv:Envelope xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/">
|
|
128
|
+
<oxml:tipoOperacion>IMP</oxml:tipoOperacion>
|
|
129
|
+
<oxml:patenteAduanal>3429</oxml:patenteAduanal>
|
|
130
|
+
</soapenv:Envelope>`,
|
|
131
|
+
},
|
|
132
|
+
];
|
|
133
|
+
|
|
134
|
+
// --------------------------- comparison -------------------------------------
|
|
135
|
+
/**
 * Classify a document with the legacy first-match-wins engine.
 *
 * @param {string} source - Text of the document.
 * @param {string} extension - File extension without the leading dot.
 * @param {string} [filePath] - Path forwarded unchanged to the legacy engine.
 * @returns {*} The first element of the array returned by
 *   `extractDocumentFields` (the detected type).
 */
function firstMatchType(source, extension, filePath) {
  const legacyResult = extractDocumentFields(source, extension, filePath);
  return legacyResult[0];
}
|
|
139
|
+
|
|
140
|
+
/**
 * Classify a document with the best-match scoring engine, using the local
 * seed matchers.
 *
 * @param {string} source - Text of the document.
 * @param {string} extension - File extension without the leading dot.
 * @param {string} [filePath] - Path forwarded to the engine in its context.
 * @returns {*} Whatever `classifyDocument` returns; callers in this script
 *   read its `detectedType` property.
 */
function bestMatchResult(source, extension, filePath) {
  const context = { source, extension, filePath };
  return classifyDocument(scoringMatchers, context);
}
|
|
143
|
+
|
|
144
|
+
/**
 * Render the first `n` entries returned by `scoreAll` as a
 * "documentType:score" list for display.
 *
 * NOTE(review): the ordering comes entirely from `scoreAll`; this function
 * only truncates. Presumably the list is already sorted by score — confirm
 * against the scoring engine.
 *
 * @param {string} source - Text of the document.
 * @param {string} extension - File extension without the leading dot.
 * @param {string} [filePath] - Optional path; only its basename is passed on.
 * @param {number} [n=3] - Maximum number of candidates to include.
 * @returns {string} Comma-separated "type:score" pairs.
 */
function topCandidates(source, extension, filePath, n = 3) {
  const fileName = filePath ? path.basename(filePath) : '';
  const ranked = scoreAll(scoringMatchers, { source, extension, fileName });

  const labels = [];
  for (const candidate of ranked.slice(0, n)) {
    labels.push(`${candidate.documentType}:${candidate.score}`);
  }
  return labels.join(', ');
}
|
|
154
|
+
|
|
155
|
+
/**
 * Format one line of the comparison table.
 *
 * @param {string} name - Sample or file label (padded to 34 columns).
 * @param {*} first - Type reported by the first-match engine.
 * @param {*} best - Type reported by the best-match engine.
 * @param {string|null} [expected] - Expected type; when falsy the
 *   "esperado" cell is left blank.
 * @returns {string} The formatted table row.
 */
function row(name, first, best, expected) {
  let agree;
  if (first === best) {
    agree = 'sí ';
  } else {
    agree = 'NO ';
  }

  // Only judge the best-match result when an expectation was supplied.
  let vsExp = ' ';
  if (expected) {
    vsExp = best === expected ? 'ok ' : '⚠️ ';
  }

  const cells = [
    name.padEnd(34),
    `first=${String(first).padEnd(24)}`,
    `best=${String(best).padEnd(24)}`,
    `coinciden=${agree}`,
    `esperado=${vsExp}`,
  ];
  return cells.join(' ');
}
|
|
163
|
+
|
|
164
|
+
/**
 * Entry point of the comparison harness.
 *
 * 1. Classifies every built-in sample with both engines and prints one row
 *    per sample.
 * 2. Classifies the inter-agencia sample with the seed matchers in normal and
 *    reversed order and reports whether the winner is the same.
 * 3. When a folder is given as the first CLI argument, repeats the comparison
 *    for every .pdf/.xml/.txt file found under it.
 * 4. Prints the total number of documents and first-vs-best disagreements.
 *
 * An argument that is not a readable directory is reported and sets a
 * non-zero exit code; the summary for the synthetic samples is still printed.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const folder = process.argv[2];
  let total = 0;
  let disagreements = 0;

  console.log('\n=== Muestras sintéticas ===');
  for (const s of SAMPLES) {
    const first = firstMatchType(s.text, s.extension, s.filePath);
    const best = bestMatchResult(s.text, s.extension, s.filePath).detectedType;
    total++;
    if (first !== best) disagreements++;
    console.log(row(s.name, first, best, s.expected));
  }

  // Order-independence demonstration for the inter-agencia/comercial case.
  const ia = SAMPLES.find((s) => s.name.startsWith('inter-agencia'));
  // Copy before reversing so the shared seed array is not mutated.
  const reversed = [...scoringMatchers].reverse();
  const normalWinner = classifyDocument(scoringMatchers, {
    source: ia.text,
    extension: ia.extension,
  }).detectedType;
  const reversedWinner = classifyDocument(reversed, {
    source: ia.text,
    extension: ia.extension,
  }).detectedType;
  console.log('\n=== Independencia de orden (inter-agencia) ===');
  console.log(`candidatos (por score): ${topCandidates(ia.text, ia.extension)}`);
  console.log(`seed normal -> ${normalWinner}`);
  console.log(`seed invertido-> ${reversedWinner}`);
  console.log(
    `order-independent: ${normalWinner === reversedWinner ? 'sí ✅' : 'NO ❌'}`,
  );

  // Optional: real files from a folder.
  if (folder) {
    // statSync rather than existsSync: a path that exists but is a regular
    // file would otherwise reach walk() and crash with ENOTDIR.
    let isDirectory = false;
    try {
      isDirectory = fs.statSync(folder).isDirectory();
    } catch {
      // Missing or inaccessible path — reported just below.
    }

    if (!isDirectory) {
      console.error(`\nCarpeta no existe: ${folder}`);
      // Signal the bad argument to callers without aborting the summary.
      process.exitCode = 1;
    } else {
      console.log(`\n=== Archivos reales (${folder}) ===`);
      const detection = new FileDetectionService();
      const files = walk(folder).filter((f) =>
        ['.pdf', '.xml', '.txt'].includes(path.extname(f).toLowerCase()),
      );
      for (const file of files) {
        const ext = path.extname(file).toLowerCase().replace('.', '');
        let text = '';
        try {
          text =
            ext === 'pdf'
              ? await detection.extractTextFromPDF(file)
              : fs.readFileSync(file, 'utf8');
        } catch (err) {
          // Unreadable files are reported and left out of the totals.
          console.log(`${path.basename(file).padEnd(34)} ERROR: ${err.message}`);
          continue;
        }
        const first = firstMatchType(text, ext, file);
        const best = bestMatchResult(text, ext, file).detectedType;
        total++;
        if (first !== best) disagreements++;
        console.log(row(path.basename(file), first, best, null));
      }
    }
  }

  console.log(
    `\n=== Resumen: ${total} documentos, ${disagreements} divergencias first-vs-best ===\n`,
  );
}
|
|
232
|
+
|
|
233
|
+
/**
 * Recursively list every non-directory entry under `dir`.
 *
 * Entries are appended to a single shared array instead of spreading each
 * subtree into `push(...)`, because spreading a very large array as call
 * arguments can throw a RangeError.
 *
 * @param {string} dir - Directory to traverse.
 * @returns {string[]} Paths (joined onto `dir`) in depth-first readdir order.
 * @throws {Error} Whatever `fs.readdirSync` throws for an unreadable directory.
 */
function walk(dir) {
  const out = [];
  const visit = (current) => {
    for (const entry of fs.readdirSync(current, { withFileTypes: true })) {
      const full = path.join(current, entry.name);
      if (entry.isDirectory()) visit(full);
      else out.push(full);
    }
  };
  visit(dir);
  return out;
}
|
|
242
|
+
|
|
243
|
+
// Script entry point: start the comparison as soon as the module is loaded.
run();
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Phase 4 validation: runs the REAL runtime path the uploader now uses
|
|
3
|
+
* (DB-shape matchers -> adaptDbMatchers -> classifyDocument with rich extraction)
|
|
4
|
+
* against a corpus and compares it to the legacy engine (extractDocumentFields).
|
|
5
|
+
*
|
|
6
|
+
* Usage: node scripts/scoring-phase4-check.js <folder>
|
|
7
|
+
*/
|
|
8
|
+
import fs from 'fs';
|
|
9
|
+
import path from 'path';
|
|
10
|
+
|
|
11
|
+
import { extractDocumentFields } from '../src/document-type-shared.js';
|
|
12
|
+
import FileDetectionService from '../src/file-detection.js';
|
|
13
|
+
import { adaptDbMatchers } from '../src/scoring/db-matcher-adapter.js';
|
|
14
|
+
import { scoringMatchers } from '../src/scoring/matchers-seed.js';
|
|
15
|
+
import { classifyDocument } from '../src/scoring/scoring-engine.js';
|
|
16
|
+
|
|
17
|
+
// Serialize the local seed to the shape the API `/resolved` endpoint returns,
|
|
18
|
+
// so we exercise the adapter exactly as in production.
|
|
19
|
+
/**
 * Convert in-memory matchers into plain serialisable records.
 *
 * RegExp clue patterns are split into `pattern` (source) and `flags`; string
 * patterns keep their own `flags` (or ''). Missing optional values are
 * normalised: `minScore`/`qualify`/`group` to null, `priority` to 0, `weight`
 * to 1, and `required`/`negative` to booleans.
 *
 * @param {Array<object>} matchers - Matchers with `documentType`,
 *   `extensions` and optional `minScore`, `priority`, `qualify`, `clues`.
 * @returns {Array<object>} One record per matcher, each with an empty
 *   `fieldExtractors` list.
 */
function toDbShape(matchers) {
  const serializeClue = (clue) => {
    const isRegex = clue.pattern instanceof RegExp;
    return {
      kind: clue.kind,
      pattern: isRegex ? clue.pattern.source : clue.pattern,
      flags: isRegex ? clue.pattern.flags : clue.flags || '',
      weight: clue.weight ?? 1,
      group: clue.group ?? null,
      required: Boolean(clue.required),
      negative: Boolean(clue.negative),
    };
  };

  return matchers.map((matcher) => {
    const { documentType, extensions, minScore, priority, qualify, clues } =
      matcher;
    return {
      documentType,
      extensions,
      minScore: minScore ?? null,
      priority: priority ?? 0,
      qualify: qualify ?? null,
      clues: (clues || []).map((clue) => serializeClue(clue)),
      fieldExtractors: [], // rich extraction comes from the registry by documentType
    };
  });
}
|
|
38
|
+
|
|
39
|
+
// Built once at module load: the seed matchers are serialised with toDbShape
// and passed through adaptDbMatchers; run() reuses the result for every file.
const adapted = adaptDbMatchers(toDbShape(scoringMatchers));
|
|
40
|
+
|
|
41
|
+
/**
 * Recursively list every non-directory entry under `dir`.
 *
 * Entries are appended to a single shared array instead of spreading each
 * subtree into `push(...)`, because spreading a very large array as call
 * arguments can throw a RangeError.
 *
 * @param {string} dir - Directory to traverse.
 * @returns {string[]} Paths (joined onto `dir`) in depth-first readdir order.
 * @throws {Error} Whatever `fs.readdirSync` throws for an unreadable directory.
 */
function walk(dir) {
  const out = [];
  const visit = (current) => {
    for (const e of fs.readdirSync(current, { withFileTypes: true })) {
      const full = path.join(current, e.name);
      if (e.isDirectory()) visit(full);
      else out.push(full);
    }
  };
  visit(dir);
  return out;
}
|
|
50
|
+
|
|
51
|
+
/**
 * Entry point of the phase 4 check.
 *
 * Walks the folder given as first CLI argument, classifies every
 * .pdf/.xml/.txt file with both the legacy engine and the adapted matchers,
 * prints each divergence, and finishes with the totals and a count per
 * "legacy -> phase4" pattern. Exits with status 1 when no folder is given.
 *
 * Files whose text cannot be read are reported on stderr and left out of the
 * totals, so a shrinking corpus is visible instead of silent.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const folder = process.argv[2];
  if (!folder) {
    console.error('Pass a folder: node scripts/scoring-phase4-check.js <folder>');
    process.exit(1);
  }
  const detection = new FileDetectionService();
  const files = walk(folder).filter((f) =>
    ['.pdf', '.xml', '.txt'].includes(path.extname(f).toLowerCase()),
  );

  let total = 0;
  let diverge = 0;
  let skipped = 0;
  // Divergence pattern ("legacy -> phase4") -> number of occurrences.
  const patterns = new Map();

  for (const file of files) {
    const ext = path.extname(file).toLowerCase().replace('.', '');
    let text = '';
    try {
      text =
        ext === 'pdf'
          ? await detection.extractTextFromPDF(file)
          : fs.readFileSync(file, 'utf8');
    } catch (err) {
      // Report instead of swallowing: the file is excluded from the totals.
      skipped++;
      console.error(`SKIP ${path.basename(file).padEnd(40)} ${err.message}`);
      continue;
    }
    const legacy = extractDocumentFields(text, ext, file)[0];
    const phase4 = classifyDocument(adapted, {
      source: text,
      extension: ext,
      filePath: file,
    }).detectedType;
    total++;
    if (legacy !== phase4) {
      diverge++;
      const key = `${legacy} -> ${phase4}`;
      patterns.set(key, (patterns.get(key) || 0) + 1);
      console.log(`NO ${path.basename(file).padEnd(40)} ${key}`);
    }
  }

  console.log(`\n=== Fase 4 vs legacy: ${total} docs, ${diverge} divergencias ===`);
  for (const [k, n] of patterns) console.log(` ${n}× ${k}`);
  if (skipped > 0) {
    console.error(`${skipped} file(s) could not be read and were skipped`);
  }
}
|
|
95
|
+
|
|
96
|
+
// Script entry point: start the check as soon as the module is loaded.
run();
|
|
@@ -8,11 +8,11 @@ import appConfig from '../config/config.js';
|
|
|
8
8
|
import ErrorHandler from '../errors/ErrorHandler.js';
|
|
9
9
|
import { ConfigurationError } from '../errors/ErrorTypes.js';
|
|
10
10
|
import FileDetectionService from '../file-detection.js';
|
|
11
|
+
import { adaptDbMatchers } from '../scoring/db-matcher-adapter.js';
|
|
12
|
+
import { scoringMatchers } from '../scoring/matchers-seed.js';
|
|
11
13
|
|
|
12
14
|
/**
|
|
13
|
-
* Paid pedimento detected_type values.
|
|
14
|
-
* even though the XML matcher is currently disabled in the registry so that
|
|
15
|
-
* re-enabling it requires no changes here.
|
|
15
|
+
* Paid pedimento detected_type values.
|
|
16
16
|
*/
|
|
17
17
|
const DETECTED_PEDIMENTO_TYPES = new Set([
|
|
18
18
|
'pedimento_simplificado',
|
|
@@ -71,6 +71,32 @@ export class IdentifyCommand {
|
|
|
71
71
|
);
|
|
72
72
|
this.scanApiService = new ScanApiService(apiTarget);
|
|
73
73
|
|
|
74
|
+
// Load matchers for best-match classification (phase 4 hybrid). Prefer the
|
|
75
|
+
// DB-resolved set (this RFC + globals); fall back to the validated local
|
|
76
|
+
// seed; set DISABLE_SCORING_MATCHERS=true to force legacy first-match.
|
|
77
|
+
if (process.env.DISABLE_SCORING_MATCHERS === 'true') {
|
|
78
|
+
logger.info('🧩 Scoring matchers disabled — legacy detection');
|
|
79
|
+
} else {
|
|
80
|
+
let matchers = null;
|
|
81
|
+
try {
|
|
82
|
+
const rfc = process.env.MATCHER_RFC || null;
|
|
83
|
+
const dbMatchers = await this.scanApiService.getResolvedMatchers(rfc);
|
|
84
|
+
if (dbMatchers.length) {
|
|
85
|
+
matchers = adaptDbMatchers(dbMatchers);
|
|
86
|
+
logger.info(`🧩 Loaded ${matchers.length} matchers from API`);
|
|
87
|
+
}
|
|
88
|
+
} catch (err) {
|
|
89
|
+
logger.warn(`🧩 Could not load matchers from API: ${err.message}`);
|
|
90
|
+
}
|
|
91
|
+
if (!matchers) {
|
|
92
|
+
matchers = scoringMatchers;
|
|
93
|
+
logger.info(`🧩 Using local seed matchers (${matchers.length})`);
|
|
94
|
+
}
|
|
95
|
+
if (typeof this.detectionService.setMatchers === 'function') {
|
|
96
|
+
this.detectionService.setMatchers(matchers);
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
|
|
74
100
|
const scanConfig = appConfig.getScanConfig();
|
|
75
101
|
const batchSize = parseInt(options.batchSize) || 100;
|
|
76
102
|
|
|
@@ -543,13 +569,15 @@ export class IdentifyCommand {
|
|
|
543
569
|
|
|
544
570
|
// Check if the text contains any required pedimento marker. This must
|
|
545
571
|
// stay aligned with the `match()` predicates in pedimento-simplificado.js
|
|
546
|
-
// and pedimento-completo.js
|
|
572
|
+
// and pedimento-completo.js (which accept both "DE" and "DEL" in the
|
|
573
|
+
// title, and treat the colon after "T. OPER" as optional).
|
|
547
574
|
const text = result.text || '';
|
|
548
|
-
const hasSimplificadoMarker =
|
|
575
|
+
const hasSimplificadoMarker =
|
|
576
|
+
/FORMA\s+SIMPLIFICADA\s+DEL?\s+PEDIMENTO/i.test(text);
|
|
549
577
|
const hasCompletoMarkers =
|
|
550
578
|
/NUM\.?\s*PEDIMENTO:/i.test(text) &&
|
|
551
579
|
/CVE\.?\s*PEDIMENTO:/i.test(text) &&
|
|
552
|
-
/T\.?\s*OPER
|
|
580
|
+
/T\.?\s*OPER:?/i.test(text);
|
|
553
581
|
|
|
554
582
|
return !hasSimplificadoMarker && !hasCompletoMarkers;
|
|
555
583
|
}
|
|
@@ -579,6 +579,9 @@ export class ScanCommand {
|
|
|
579
579
|
* Normalize file record for database insertion
|
|
580
580
|
* Stores paths with forward slashes for consistency but keeps them absolute
|
|
581
581
|
* Sets likelySimplificado to true if file is a PDF and filename contains 'simp'
|
|
582
|
+
* Sets likelyInterAgencia to true if filename matches an inter-agency CFDI
|
|
583
|
+
* pattern (e.g. SICINGR*), so the API forces these XML/PDF through detection
|
|
584
|
+
* even though they lack the 'simp/pedim/covefact' heuristic.
|
|
582
585
|
* @private
|
|
583
586
|
*/
|
|
584
587
|
#normalizeFileRecord(filePath, fileStats, basePath, scanTimestamp) {
|
|
@@ -600,6 +603,17 @@ export class ScanCommand {
|
|
|
600
603
|
const likelySimplificado =
|
|
601
604
|
fileExtension === 'pdf' && /(simp|pedim|covefact)/i.test(fileName);
|
|
602
605
|
|
|
606
|
+
// Flag inter-agency CFDIs by filename so detection picks them up.
|
|
607
|
+
// Patterns are configurable via SCAN_INTER_AGENCIA_PATTERNS env var
|
|
608
|
+
// (see config.js). Only meaningful for PDF and XML.
|
|
609
|
+
let likelyInterAgencia = false;
|
|
610
|
+
if (fileExtension === 'pdf' || fileExtension === 'xml') {
|
|
611
|
+
const patterns = appConfig.scan.interAgenciaPatterns;
|
|
612
|
+
if (patterns && patterns.length > 0) {
|
|
613
|
+
likelyInterAgencia = patterns.some((re) => re.test(fileName));
|
|
614
|
+
}
|
|
615
|
+
}
|
|
616
|
+
|
|
603
617
|
return {
|
|
604
618
|
fileName,
|
|
605
619
|
fileExtension,
|
|
@@ -610,6 +624,7 @@ export class ScanCommand {
|
|
|
610
624
|
modifiedAt: fileStats.mtime.toISOString(),
|
|
611
625
|
scanTimestamp,
|
|
612
626
|
likelySimplificado,
|
|
627
|
+
likelyInterAgencia,
|
|
613
628
|
};
|
|
614
629
|
}
|
|
615
630
|
|
package/src/config/config.js
CHANGED
|
@@ -37,10 +37,10 @@ class Config {
|
|
|
37
37
|
const __dirname = path.dirname(__filename);
|
|
38
38
|
const packageJsonPath = path.resolve(__dirname, '../../package.json');
|
|
39
39
|
const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf-8'));
|
|
40
|
-
return packageJson.version || '1.0.
|
|
40
|
+
return packageJson.version || '1.0.24';
|
|
41
41
|
} catch (error) {
|
|
42
42
|
console.warn('⚠️ Could not read package.json version, using fallback');
|
|
43
|
-
return '1.0.
|
|
43
|
+
return '1.0.24';
|
|
44
44
|
}
|
|
45
45
|
}
|
|
46
46
|
|
|
@@ -294,6 +294,31 @@ class Config {
|
|
|
294
294
|
.map((p) => p.trim())
|
|
295
295
|
.filter(Boolean);
|
|
296
296
|
|
|
297
|
+
// Parse inter-agency CFDI filename patterns. Files whose basename matches
|
|
298
|
+
// any of these regex patterns are flagged at scan time (likelyInterAgencia)
|
|
299
|
+
// so the API forces them through detection and the factura_inter_agencia
|
|
300
|
+
// matcher can classify them. The push pipeline then excludes them (see
|
|
301
|
+
// NON_PUSHABLE_TYPES_SQL in arela-api). Comma-separated regex source list.
|
|
302
|
+
// Default: ^SICINGR — covers NORCOM's SICINGR70-NNNNNN(...).pdf/.XML files.
|
|
303
|
+
const defaultInterAgenciaPatterns = '^SICINGR';
|
|
304
|
+
const interAgenciaPatterns = (
|
|
305
|
+
process.env.SCAN_INTER_AGENCIA_PATTERNS || defaultInterAgenciaPatterns
|
|
306
|
+
)
|
|
307
|
+
.split(',')
|
|
308
|
+
.map((p) => p.trim())
|
|
309
|
+
.filter(Boolean)
|
|
310
|
+
.map((p) => {
|
|
311
|
+
try {
|
|
312
|
+
return new RegExp(p, 'i');
|
|
313
|
+
} catch (err) {
|
|
314
|
+
console.warn(
|
|
315
|
+
`⚠️ Invalid SCAN_INTER_AGENCIA_PATTERNS regex "${p}": ${err.message}`,
|
|
316
|
+
);
|
|
317
|
+
return null;
|
|
318
|
+
}
|
|
319
|
+
})
|
|
320
|
+
.filter(Boolean);
|
|
321
|
+
|
|
297
322
|
// Generate table name if all components are available
|
|
298
323
|
// Note: This is just for reference; actual table names are generated dynamically
|
|
299
324
|
// in ScanCommand based on discovered directories and levels
|
|
@@ -312,6 +337,7 @@ class Config {
|
|
|
312
337
|
basePathFull: basePathLabel, // Renamed for consistency
|
|
313
338
|
tableName,
|
|
314
339
|
excludePatterns,
|
|
340
|
+
interAgenciaPatterns,
|
|
315
341
|
batchSize: parseInt(process.env.SCAN_BATCH_SIZE) || 2000,
|
|
316
342
|
directoryLevel: parseInt(process.env.SCAN_DIRECTORY_LEVEL) || 0,
|
|
317
343
|
};
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
// Import all document type definitions
|
|
2
2
|
import { dodaPdfDefinition } from './document-types/doda-pdf.js';
|
|
3
3
|
import { dodaXmlDefinition } from './document-types/doda-xml.js';
|
|
4
|
+
import { facturaInterAgenciaDefinition } from './document-types/factura-inter-agencia.js';
|
|
4
5
|
import { facturasComerciales } from './document-types/facturas-comerciales.js';
|
|
6
|
+
import { pedimentoCompletoXmlDefinition } from './document-types/pedimento-completo-xml.js';
|
|
5
7
|
import { pedimentoCompletoDefinition } from './document-types/pedimento-completo.js';
|
|
6
|
-
// TODO: enable XML pedimento detection — implementation ready in pedimento-completo-xml.js
|
|
7
|
-
// import { pedimentoCompletoXmlDefinition } from './document-types/pedimento-completo-xml.js';
|
|
8
8
|
import { pedimentoSimplificadoDefinition } from './document-types/pedimento-simplificado.js';
|
|
9
9
|
import { proformaDefinition } from './document-types/proforma.js';
|
|
10
10
|
import { supportDocumentDefinition } from './document-types/support-document.js';
|
|
@@ -45,14 +45,14 @@ export class DocumentTypeDefinition {
|
|
|
45
45
|
const documentTypes = [
|
|
46
46
|
pedimentoSimplificadoDefinition,
|
|
47
47
|
pedimentoCompletoDefinition,
|
|
48
|
-
|
|
49
|
-
// matching import at the top of this file. All downstream code
|
|
50
|
-
// (composeArelaPath, arela-api SQL filters, IdentifyCommand counters)
|
|
51
|
-
// already accepts `pedimento_completo_xml`.
|
|
52
|
-
// pedimentoCompletoXmlDefinition,
|
|
48
|
+
pedimentoCompletoXmlDefinition,
|
|
53
49
|
supportDocumentDefinition,
|
|
54
50
|
dodaPdfDefinition,
|
|
55
51
|
dodaXmlDefinition,
|
|
52
|
+
// factura_inter_agencia MUST be evaluated BEFORE facturasComerciales
|
|
53
|
+
// because a NORCOM↔PALCO CFDI would also match the generic commercial
|
|
54
|
+
// invoice matcher. First match wins (see extractDocumentFields).
|
|
55
|
+
facturaInterAgenciaDefinition,
|
|
56
56
|
facturasComerciales,
|
|
57
57
|
// Add more document types here as needed
|
|
58
58
|
];
|
|
@@ -114,6 +114,14 @@ export function extractDocumentFields(source, fileExtension, filePath) {
|
|
|
114
114
|
? docType.extractPedimentoYear(source, fields, filePath)
|
|
115
115
|
: null;
|
|
116
116
|
|
|
117
|
+
// Ensure downstream code (composeArelaPath) sees `numPedimento` as a
|
|
118
|
+
// field. PDF matchers add it via an explicit extractor; XML matchers
|
|
119
|
+
// compose it externally via extractNumPedimento. Backfill so both paths
|
|
120
|
+
// expose the same shape.
|
|
121
|
+
if (pedimento && !fields.some((f) => f.name === 'numPedimento')) {
|
|
122
|
+
fields.push(new FieldResult('numPedimento', true, pedimento));
|
|
123
|
+
}
|
|
124
|
+
|
|
117
125
|
return [resolvedType, fields, pedimento, year];
|
|
118
126
|
}
|
|
119
127
|
}
|
|
@@ -186,15 +186,34 @@ export const paymentDateExtractor = {
|
|
|
186
186
|
field: 'paymentDate',
|
|
187
187
|
extract: (source) => {
|
|
188
188
|
const patterns = [
|
|
189
|
-
/FECHA\s+DE\s+PAGO:?\s*(\d{2}\/\d{2}\/\d{4})/i,
|
|
190
|
-
/FECHA\s+DE\s+PAGO:?\s*(\d{4}\/\d{2}\/\d{2})/i,
|
|
191
|
-
/2\s+PAGO:\s*(\d{2}\/\d{2}\/\d{4})/,
|
|
192
|
-
/(?:^|\n)\s*PAGO\s+(\d{2}\/\d{2}\/\d{4})/i,
|
|
193
|
-
/
|
|
189
|
+
/FECHA\s+DE\s+PAGO:?\s*(\d{2}\/\d{2}\/\d{4})/i, // 0: explicit label DD/MM/YYYY
|
|
190
|
+
/FECHA\s+DE\s+PAGO:?\s*(\d{4}\/\d{2}\/\d{2})/i, // 1: explicit label YYYY/MM/DD
|
|
191
|
+
/2\s+PAGO:\s*(\d{2}\/\d{2}\/\d{4})/, // 2: forma simplificada scheduled date ⚠️
|
|
192
|
+
/(?:^|\n)\s*PAGO\s+(\d{2}\/\d{2}\/\d{4})/i, // 3: PAGO at line start (original)
|
|
193
|
+
/(?<=\d)PAGO\s+(\d{2}\/\d{2}\/\d{4})/i, // 4: PAGO after digit (pdf-parse artifact)
|
|
194
|
+
/(\d{2}\/\d{2}\/\d{4})[ \t]+PAGO[ \t]*$/im, // 5: reversed layout — date before PAGO (FECHAS column)
|
|
195
|
+
// 6: forma simplificada — pdf-parse extracts table cells out of order, so the
|
|
196
|
+
// label "FECHA DE PAGO:" can appear on its own line and the value (along with
|
|
197
|
+
// other cells like línea de captura, pedimento, importe) follows several lines
|
|
198
|
+
// later. Take the FIRST dd/mm/yyyy after the label within a 400-char window.
|
|
199
|
+
// Safe because `isNoPagado` short-circuits documents without a real payment,
|
|
200
|
+
// so we won't grab the unrelated ENTRADA date from the "FECHAS:" block above.
|
|
201
|
+
/FECHA\s+DE\s+PAGO:[\s\S]{1,400}?(\d{2}\/\d{2}\/\d{4})/i,
|
|
202
|
+
/PRESENTACION:\s*(\d{2}\/\d{2}\/\d{4})/i, // 7: fallback
|
|
194
203
|
];
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
204
|
+
// "*** NO PAGADO" is the explicit SAT marker that no payment has been
|
|
205
|
+
// certified. When present, the bank-certification block is physically
|
|
206
|
+
// absent, so any date matched by the fallback patterns (e.g.
|
|
207
|
+
// "2 PAGO:" with a scheduled date, or "PRESENTACION:") would be a false
|
|
208
|
+
// positive. Return null outright — the document is classified as proforma.
|
|
209
|
+
const isNoPagado = /\*{3}\s*NO\s+PAGADO/i.test(source);
|
|
210
|
+
if (isNoPagado) {
|
|
211
|
+
return new FieldResult('paymentDate', false, null);
|
|
212
|
+
}
|
|
213
|
+
for (const pattern of patterns) {
|
|
214
|
+
const m = source.match(pattern);
|
|
215
|
+
if (!m) continue;
|
|
216
|
+
return new FieldResult('paymentDate', true, m[1]);
|
|
198
217
|
}
|
|
199
218
|
return new FieldResult('paymentDate', false, null);
|
|
200
219
|
},
|