@arela/uploader 1.0.22 → 1.0.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/scoring-compare.js +243 -0
- package/scripts/scoring-phase4-check.js +96 -0
- package/src/commands/IdentifyCommand.js +34 -6
- package/src/commands/ScanCommand.js +15 -0
- package/src/config/config.js +28 -2
- package/src/document-type-shared.js +15 -7
- package/src/document-types/_pedimento-shared-extractors.js +27 -8
- package/src/document-types/factura-inter-agencia.js +186 -0
- package/src/document-types/pedimento-completo-xml.js +62 -12
- package/src/document-types/pedimento-completo.js +5 -3
- package/src/document-types/pedimento-simplificado.js +5 -2
- package/src/document-types/proforma.js +2 -2
- package/src/file-detection.js +30 -6
- package/src/scoring/db-matcher-adapter.js +98 -0
- package/src/scoring/matchers-seed.js +386 -0
- package/src/scoring/scoring-engine.js +218 -0
- package/src/services/ScanApiService.js +14 -0
- package/tests/unit/factura-inter-agencia.test.js +218 -0
- package/tests/unit/pedimento-completo-xml-matcher.test.js +271 -0
- package/tests/unit/pedimento-simplificado-matcher.test.js +185 -0
- package/tests/unit/scoring-engine.test.js +221 -0
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for the best-match scoring engine (PROTOTYPE).
|
|
3
|
+
*
|
|
4
|
+
* Two layers:
|
|
5
|
+
* 1. Engine mechanics (pure): extension filter, required/negative
|
|
6
|
+
* disqualification, weighted scoring, threshold, tie-breaking.
|
|
7
|
+
* 2. Faithfulness on representative documents: the seed matchers select the
|
|
8
|
+
* same document type best-match would, including the key
|
|
9
|
+
* `factura_inter_agencia` vs `factura_comercial` case resolved by SCORE
|
|
10
|
+
* (not registration order).
|
|
11
|
+
*/
|
|
12
|
+
import { describe, it, expect } from '@jest/globals';
|
|
13
|
+
|
|
14
|
+
import {
|
|
15
|
+
classifyDocument,
|
|
16
|
+
scoreMatcher,
|
|
17
|
+
selectBestMatch,
|
|
18
|
+
} from '../../src/scoring/scoring-engine.js';
|
|
19
|
+
import { scoringMatchers } from '../../src/scoring/matchers-seed.js';
|
|
20
|
+
|
|
21
|
+
// ----------------------------- engine mechanics -----------------------------
|
|
22
|
+
/**
 * Engine mechanics of `scoreMatcher`, exercised on a tiny synthetic matcher:
 * extension gating, weight summation, the `minScore` threshold, and
 * disqualification through a negative hit or a missing required clue.
 */
describe('scoreMatcher (mechanics)', () => {
  // Every clue in this suite is a CONTENT_REGEX; `extra` carries weight/flags.
  const contentClue = (pattern, extra) => ({
    kind: 'CONTENT_REGEX',
    pattern,
    ...extra,
  });

  // Evaluation context for a PDF whose text is `source`.
  const pdfDoc = (source) => ({ source, extension: 'pdf' });

  // Baseline matcher: two weighted clues (2 + 1) and a minScore of 2.
  const base = {
    documentType: 't',
    extensions: ['pdf'],
    minScore: 2,
    clues: [
      contentClue(/alpha/i, { weight: 2 }),
      contentClue(/beta/i, { weight: 1 }),
    ],
  };

  // Baseline matcher with its clue list replaced.
  const withClues = (clues) => ({ ...base, clues });

  it('returns null when the extension does not apply', () => {
    const outcome = scoreMatcher(base, { source: 'alpha', extension: 'xml' });
    expect(outcome).toBeNull();
  });

  it('sums the weights of matched clues', () => {
    const outcome = scoreMatcher(base, pdfDoc('alpha beta'));
    expect(outcome.score).toBe(3);
    expect(outcome.totalWeight).toBe(3);
    expect(outcome.fraction).toBeCloseTo(1);
    expect(outcome.passed).toBe(true);
  });

  it('does not pass below minScore', () => {
    // Only the weight-1 clue matches, which is below the minScore of 2.
    const outcome = scoreMatcher(base, pdfDoc('beta'));
    expect(outcome.score).toBe(1);
    expect(outcome.passed).toBe(false);
  });

  it('disqualifies on a negative clue hit', () => {
    const matcher = withClues([
      contentClue(/alpha/i, { weight: 2 }),
      contentClue(/excluded/i, { negative: true }),
    ]);
    const outcome = scoreMatcher(matcher, pdfDoc('alpha excluded'));
    expect(outcome.disqualified).toBe(true);
  });

  it('disqualifies when a required clue is missing', () => {
    const matcher = withClues([
      contentClue(/alpha/i, { weight: 2 }),
      contentClue(/mandatory/i, { required: true, weight: 1 }),
    ]);
    // The text matches /alpha/ but never the required /mandatory/ clue.
    const outcome = scoreMatcher(matcher, pdfDoc('alpha'));
    expect(outcome.disqualified).toBe(true);
  });
});
|
|
75
|
+
|
|
76
|
+
/**
 * Ordering behaviour of `selectBestMatch`: the winner is chosen by score,
 * and `priority` decides between matchers that score the same.
 */
describe('selectBestMatch (ordering)', () => {
  const ctx = { source: 'alpha beta gamma', extension: 'pdf' };

  // PDF matcher with minScore 1 and one weight-1 CONTENT_REGEX clue per pattern.
  const unitWeightMatcher = (documentType, patterns) => ({
    documentType,
    extensions: ['pdf'],
    minScore: 1,
    clues: patterns.map((pattern) => ({
      kind: 'CONTENT_REGEX',
      pattern,
      weight: 1,
    })),
  });

  const low = unitWeightMatcher('low', [/alpha/i]);
  const high = unitWeightMatcher('high', [/alpha/i, /beta/i, /gamma/i]);

  it('picks the highest score regardless of array order', () => {
    const lowFirst = selectBestMatch([low, high], ctx);
    expect(lowFirst.documentType).toBe('high');

    const highFirst = selectBestMatch([high, low], ctx);
    expect(highFirst.documentType).toBe('high');
  });

  it('breaks score ties by priority', () => {
    const a = {
      documentType: 'a',
      extensions: ['pdf'],
      minScore: 1,
      priority: 0,
      clues: [{ kind: 'CONTENT_REGEX', pattern: /alpha/i, weight: 2 }],
    };
    // Same clues as `a` (shared by the spread); only name and priority differ.
    const b = { ...a, documentType: 'b', priority: 5 };

    const chosen = selectBestMatch([a, b], ctx);
    expect(chosen.documentType).toBe('b');
  });
});
|
|
112
|
+
|
|
113
|
+
/**
 * `qualify` gate: a list of rules where one fully satisfied rule is enough
 * (OR) and every group count inside a rule must be met (AND).
 */
describe('qualify rules (group gate, OR-of-ANDs)', () => {
  // Clue tagged with the group that the qualify rules count against.
  const groupedClue = (pattern, group) => ({
    kind: 'CONTENT_REGEX',
    pattern,
    group,
  });

  // mirrors factura_comercial: (cfdi>=2) OR (invoice>=1 AND customs>=1)
  const m = {
    documentType: 'q',
    extensions: ['xml'],
    qualify: [{ cfdi: 2 }, { invoice: 1, customs: 1 }],
    clues: [
      groupedClue(/cfdiA/i, 'cfdi'),
      groupedClue(/cfdiB/i, 'cfdi'),
      groupedClue(/factura/i, 'invoice'),
      groupedClue(/aduana/i, 'customs'),
    ],
  };

  // Scores `m` against XML text and reports only the pass/fail verdict.
  const passes = (source) =>
    scoreMatcher(m, { source, extension: 'xml' }).passed;

  it('qualifies when a single rule is fully satisfied (cfdi>=2)', () => {
    expect(passes('cfdiA cfdiB')).toBe(true);
  });

  it('qualifies on the AND rule (invoice AND customs)', () => {
    expect(passes('factura aduana')).toBe(true);
  });

  it('does NOT qualify on a partial AND rule (customs alone)', () => {
    // This is the COVE-acuse false-positive case the flat model produced.
    expect(passes('aduana pedimento aduana')).toBe(false);
  });

  it('does NOT qualify with only one cfdi marker', () => {
    expect(passes('cfdiA')).toBe(false);
  });
});
|
|
145
|
+
|
|
146
|
+
// --------------------------- faithfulness on docs ---------------------------
|
|
147
|
+
/**
 * Representative document fixtures for the faithfulness suites below.
 * Each entry pairs a file `extension` with the raw text `source` that is
 * handed to `classifyDocument` together with the seed matchers.
 */
const DOCS = {
|
|
148
|
+
// PDF text with a "FECHA DE PAGO" line; the suite below expects
// 'pedimento_simplificado'.
simplificadoPaid: {
|
|
149
|
+
extension: 'pdf',
|
|
150
|
+
source: `FORMA SIMPLIFICADA DEL PEDIMENTO
|
|
151
|
+
NUM. PEDIMENTO: 26 07 3429 6000079
|
|
152
|
+
CVE. PEDIMENTO: A1
|
|
153
|
+
T. OPER: IMP
|
|
154
|
+
RFC: CSM9204097Q1
|
|
155
|
+
FECHA DE PAGO: 04/03/2026`,
|
|
156
|
+
},
|
|
157
|
+
// Same layout but ends with "*** NO PAGADO ***"; the suite below expects
// 'proforma'. NOTE(review): the header reads "DE PEDIMENTO" here versus
// "DEL PEDIMENTO" in the paid fixture — confirm the difference is deliberate.
simplificadoUnpaid: {
|
|
158
|
+
extension: 'pdf',
|
|
159
|
+
source: `FORMA SIMPLIFICADA DE PEDIMENTO
|
|
160
|
+
NUM. PEDIMENTO: 26 07 3429 6000080
|
|
161
|
+
CVE. PEDIMENTO: A1
|
|
162
|
+
T. OPER: IMP
|
|
163
|
+
RFC: CSM9204097Q1
|
|
164
|
+
*** NO PAGADO ***`,
|
|
165
|
+
},
|
|
166
|
+
// PDF text without the "FORMA SIMPLIFICADA" header; the suite below expects
// 'pedimento_completo'.
completo: {
|
|
167
|
+
extension: 'pdf',
|
|
168
|
+
source: `NUM. PEDIMENTO: 26 07 3429 2002089
|
|
169
|
+
CVE. PEDIMENTO: A1
|
|
170
|
+
T. OPER: IMP
|
|
171
|
+
SEGUNDA COPIA TRANSPORTISTA
|
|
172
|
+
CUADRO DE LIQUIDACION
|
|
173
|
+
*** PAGO ELECTRONICO ***
|
|
174
|
+
FECHA DE PAGO: 02/03/2026`,
|
|
175
|
+
},
|
|
176
|
+
// CFDI XML snippet; the precedence suite below expects
// 'factura_inter_agencia' to win over 'factura_comercial'.
interAgencia: {
|
|
177
|
+
extension: 'xml',
|
|
178
|
+
source: `<cfdi:Comprobante xmlns:cfdi="..." TipoDeComprobante="I">
|
|
179
|
+
<cfdi:Emisor Rfc="NAA120215F20"/>
|
|
180
|
+
<cfdi:Receptor Rfc="PCC1008161WA"/>
|
|
181
|
+
<cfdi:Concepto ClaveProdServ="78141502"/>
|
|
182
|
+
</cfdi:Comprobante>`,
|
|
183
|
+
},
|
|
184
|
+
};
|
|
185
|
+
|
|
186
|
+
/**
 * Faithfulness checks: the seed matchers, run through `classifyDocument`,
 * must report the expected type for each representative pedimento fixture.
 */
describe('seed matchers select the correct document type', () => {
  // Classifies one fixture with the seed matchers and returns the type only.
  const detectedTypeOf = (doc) =>
    classifyDocument(scoringMatchers, doc).detectedType;

  it('classifies a paid simplificado', () => {
    expect(detectedTypeOf(DOCS.simplificadoPaid)).toBe('pedimento_simplificado');
  });

  it('resolves an unpaid simplificado to proforma (resolveType reused)', () => {
    expect(detectedTypeOf(DOCS.simplificadoUnpaid)).toBe('proforma');
  });

  it('separates completo from simplificado without registration order', () => {
    expect(detectedTypeOf(DOCS.completo)).toBe('pedimento_completo');
  });
});
|
|
202
|
+
|
|
203
|
+
/**
 * The inter-agencia CFDI fixture must classify as `factura_inter_agencia`,
 * both with the seed matchers as shipped and with the array reversed.
 */
describe('inter-agencia precedence is by score, not order', () => {
  const EXPECTED_TYPE = 'factura_inter_agencia';

  it('wins over factura_comercial with the default seed', () => {
    const result = classifyDocument(scoringMatchers, DOCS.interAgencia);
    expect(result.detectedType).toBe(EXPECTED_TYPE);

    // factura_comercial also qualifies but scores far lower
    const isComercial = (candidate) =>
      candidate.documentType === 'factura_comercial';
    const comercial = result.candidates.find(isComercial);
    const winner = result.candidates[0];

    expect(winner.documentType).toBe(EXPECTED_TYPE);
    // NOTE(review): the score comparison is skipped when factura_comercial is
    // absent from `candidates`, so this check can pass without comparing
    // anything — confirm whether the candidate must always be present.
    if (comercial) {
      expect(winner.score).toBeGreaterThan(comercial.score);
    }
  });

  it('still wins when the matcher array is reversed (order-independent)', () => {
    // Reverse a copy so the shared seed array is left untouched.
    const reversed = Array.from(scoringMatchers).reverse();
    const result = classifyDocument(reversed, DOCS.interAgencia);
    expect(result.detectedType).toBe(EXPECTED_TYPE);
  });
});
|