@arela/uploader 1.0.19 → 1.0.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -1
- package/src/commands/GDriveSyncCommand.js +475 -0
- package/src/commands/IdentifyCommand.js +41 -16
- package/src/commands/ScanCommand.js +6 -3
- package/src/config/config.js +88 -2
- package/src/document-type-shared.js +13 -3
- package/src/document-types/_pedimento-shared-extractors.js +226 -0
- package/src/document-types/pedimento-completo-xml.js +322 -0
- package/src/document-types/pedimento-completo.js +68 -0
- package/src/document-types/pedimento-simplificado.js +7 -286
- package/src/file-detection.js +43 -5
- package/src/index.js +27 -0
- package/src/services/DatabaseService.js +3 -1
- package/src/services/GoogleDriveService.js +217 -0
- package/src/services/LoggingService.js +1 -1
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import {
|
|
2
|
+
pedimentoYearFromFields,
|
|
3
|
+
sharedPedimentoExtractors,
|
|
4
|
+
} from './_pedimento-shared-extractors.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Document-type definition for the long-form "Pedimento Completo" PDF — the
 * 7-page DEF / SEGUNDA / TERCERA copy, including the "CoveFact" variant.
 * It is kept separate from `pedimento_simplificado`, whose documents carry
 * the "FORMA SIMPLIFICADA DE PEDIMENTO" header.
 *
 * A source text matches when all of the following hold:
 *   1. it does not contain "FORMA SIMPLIFICADA DE PEDIMENTO";
 *   2. it contains the three header labels `NUM. PEDIMENTO:`,
 *      `CVE.PEDIMENTO:` and `T.OPER:`;
 *   3. it contains at least one of the printed copy markers.
 */
export const pedimentoCompletoDefinition = {
  type: 'pedimento_completo',
  extensions: ['pdf'],

  /**
   * Decide whether the extracted text looks like a long-form pedimento.
   *
   * @param {string} source - Text extracted from the document.
   * @returns {boolean} true when the text satisfies the three rules above.
   */
  match: (source) => {
    // A simplified-form header rules this matcher out regardless of anything else.
    if (/FORMA SIMPLIFICADA DE PEDIMENTO/i.test(source)) {
      return false;
    }

    // Every one of these labels must be present.
    const requiredHeaders = [
      /NUM\.?\s*PEDIMENTO:/i,
      /CVE\.?\s*PEDIMENTO:/i,
      /T\.?\s*OPER:/i,
    ];
    if (!requiredHeaders.every((pattern) => pattern.test(source))) {
      return false;
    }

    // Any single copy marker is sufficient.
    const copyMarkers = [
      /ORIGINAL:\s*ADMINISTRACION GENERAL DE ADUANAS/i,
      /SEGUNDA COPIA/i,
      /TERCERA COPIA/i,
      /COPIA\s+(SIMPLIFICAD[AO])?\s*TRANSPORTISTA/i,
      /DEFINITIVO/i,
    ];
    return copyMarkers.some((pattern) => pattern.test(source));
  },

  /**
   * Pick the final document type once fields have been extracted.
   *
   * When `clavePedimento` is 'R1' the payment evidence is a found
   * `fechaPagoRectificacion`; otherwise it is a found `paymentDate`.
   * Without payment evidence the document is a 'proforma_completo'.
   *
   * @param {Array<{name: string, found: boolean, value: *}>} fields
   * @returns {string} 'pedimento_completo' or 'proforma_completo'
   */
  resolveType: (fields) => {
    // Value of the first field with the given name, or null. When
    // `mustBeFound` is set, fields whose `found` flag is falsy are skipped.
    const valueOf = (name, mustBeFound) =>
      fields?.find((f) => f.name === name && (!mustBeFound || f.found))
        ?.value ?? null;

    const isRectification = valueOf('clavePedimento', false) === 'R1';
    const paymentEvidence = isRectification
      ? valueOf('fechaPagoRectificacion', true)
      : valueOf('paymentDate', true);

    if (paymentEvidence) {
      return 'pedimento_completo';
    }
    return 'proforma_completo';
  },

  /**
   * Return the value of the extracted `numPedimento` field, or null when the
   * field is absent. `source` is accepted but not used.
   */
  extractNumPedimento: (source, fields) => {
    const numPedimentoField = fields?.find(
      ({ name }) => name === 'numPedimento',
    );
    return numPedimentoField?.value ?? null;
  },

  /** Delegate year resolution to the shared helper; `source` is not used. */
  extractPedimentoYear: (source, fields) => {
    return pedimentoYearFromFields(fields);
  },

  extractors: sharedPedimentoExtractors,
};
|
|
@@ -1,4 +1,7 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import {
|
|
2
|
+
pedimentoYearFromFields,
|
|
3
|
+
sharedPedimentoExtractors,
|
|
4
|
+
} from './_pedimento-shared-extractors.js';
|
|
2
5
|
|
|
3
6
|
export const pedimentoSimplificadoDefinition = {
|
|
4
7
|
type: 'pedimento_simplificado',
|
|
@@ -14,7 +17,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
14
17
|
* - Otherwise: must have paymentDate
|
|
15
18
|
* If no payment evidence is found, it's a "proforma".
|
|
16
19
|
*
|
|
17
|
-
* @param {FieldResult[]} fields
|
|
20
|
+
* @param {import('../document-type-shared.js').FieldResult[]} fields
|
|
18
21
|
* @returns {string} - 'pedimento_simplificado' or 'proforma'
|
|
19
22
|
*/
|
|
20
23
|
resolveType: (fields) => {
|
|
@@ -27,297 +30,15 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
27
30
|
?.value ?? null;
|
|
28
31
|
|
|
29
32
|
if (clavePedimento === 'R1') {
|
|
30
|
-
// Rectification pedimentos require fechaPagoRectificacion
|
|
31
33
|
return fechaPagoRectificacion ? 'pedimento_simplificado' : 'proforma';
|
|
32
34
|
}
|
|
33
|
-
|
|
34
|
-
// Regular pedimentos require paymentDate
|
|
35
35
|
return paymentDate ? 'pedimento_simplificado' : 'proforma';
|
|
36
36
|
},
|
|
37
37
|
|
|
38
38
|
extractNumPedimento: (source, fields) => {
|
|
39
39
|
return fields?.find((f) => f.name === 'numPedimento')?.value ?? null;
|
|
40
40
|
},
|
|
41
|
-
extractPedimentoYear: (source, fields) =>
|
|
42
|
-
const numPedimento = fields?.find((f) => f.name === 'numPedimento')?.value;
|
|
43
|
-
if (!numPedimento) {
|
|
44
|
-
return null;
|
|
45
|
-
}
|
|
46
|
-
const year = parseInt(numPedimento.substring(0, 2), 10);
|
|
47
|
-
return year < 50 ? year + 2000 : year + 1900;
|
|
48
|
-
},
|
|
49
|
-
extractors: [
|
|
50
|
-
// 1) Número de Pedimento (15 dígitos)
|
|
51
|
-
{
|
|
52
|
-
field: 'numPedimento',
|
|
53
|
-
extract: (source) => {
|
|
54
|
-
const match = source.match(/\d{2}\s?\d{2}\s?\d{4}\s?\d{7}/);
|
|
55
|
-
return new FieldResult(
|
|
56
|
-
'numPedimento',
|
|
57
|
-
!!match,
|
|
58
|
-
match ? match[0].replace(/\s/g, '') : null,
|
|
59
|
-
);
|
|
60
|
-
},
|
|
61
|
-
},
|
|
62
|
-
|
|
63
|
-
// 2) Tipo de Operación: los 3 caracteres justo después del número
|
|
64
|
-
{
|
|
65
|
-
field: 'tipoOperacion',
|
|
66
|
-
extract: (source) => {
|
|
67
|
-
// Look for the pedimento number pattern followed by operation type
|
|
68
|
-
// Pattern matches: "22 07 3429 2002089 EXP RT"
|
|
69
|
-
const match = source.match(
|
|
70
|
-
/\d{2}\s+\d{2}\s+\d{4}\s+\d{7}\s+([A-Z]{3})/,
|
|
71
|
-
);
|
|
72
|
-
return new FieldResult(
|
|
73
|
-
'tipoOperacion',
|
|
74
|
-
!!match,
|
|
75
|
-
match ? match[1] : null,
|
|
76
|
-
);
|
|
77
|
-
},
|
|
78
|
-
},
|
|
79
|
-
|
|
80
|
-
// 3) Clave de Pedimento: los 2 caracteres justo después de la operación
|
|
81
|
-
{
|
|
82
|
-
field: 'clavePedimento',
|
|
83
|
-
extract: (source) => {
|
|
84
|
-
// Look for the pedimento number pattern followed by operation type and then the key
|
|
85
|
-
// Pattern matches: "22 07 3429 2002089 EXP RT" to capture "RT"
|
|
86
|
-
const match = source.match(
|
|
87
|
-
/\d{2}\s+\d{2}\s+\d{4}\s+\d{7}\s+[A-Z]{3}\s+([A-Z0-9]{2})/,
|
|
88
|
-
);
|
|
89
|
-
return new FieldResult(
|
|
90
|
-
'clavePedimento',
|
|
91
|
-
!!match,
|
|
92
|
-
match ? match[1] : null,
|
|
93
|
-
);
|
|
94
|
-
},
|
|
95
|
-
},
|
|
96
|
-
|
|
97
|
-
// 4) Aduana E/S: Extract the 3-digit aduana code that appears at the end of the peso bruto line
|
|
98
|
-
{
|
|
99
|
-
field: 'aduanaEntradaSalida',
|
|
100
|
-
extract: (source) => {
|
|
101
|
-
// Look for the peso bruto line format: number followed by decimal amount followed by 3-digit aduana code
|
|
102
|
-
// Pattern matches formats like: "7 1.100 071" or "7 19,834.260 071" to capture "071"
|
|
103
|
-
// Uses multiline flag to match line boundaries precisely
|
|
104
|
-
const match = source.match(/^\s*\d+\s+[\d,.]+\s+(\d{3})\s*$/m);
|
|
105
|
-
return new FieldResult(
|
|
106
|
-
'aduanaEntradaSalida',
|
|
107
|
-
!!match,
|
|
108
|
-
match ? match[1] : null,
|
|
109
|
-
);
|
|
110
|
-
},
|
|
111
|
-
},
|
|
112
|
-
|
|
113
|
-
// 5) RFC: línea con 12-13 caracteres alfanuméricos
|
|
114
|
-
{
|
|
115
|
-
field: 'rfc',
|
|
116
|
-
extract: (source) => {
|
|
117
|
-
const match = source.match(/\n\s*([A-Z0-9]{12,13})\s*\n/);
|
|
118
|
-
return new FieldResult('rfc', !!match, match ? match[1] : null);
|
|
119
|
-
},
|
|
120
|
-
},
|
|
121
|
-
|
|
122
|
-
// 6) Código de Aceptación: línea con 8 caracteres alfanuméricos justo después del RFC
|
|
123
|
-
{
|
|
124
|
-
field: 'codigoAceptacion',
|
|
125
|
-
extract: (source) => {
|
|
126
|
-
// 1) split into trimmed, non-empty lines
|
|
127
|
-
const lines = source
|
|
128
|
-
.split(/\r?\n/)
|
|
129
|
-
.map((l) => l.trim())
|
|
130
|
-
.filter((l) => l.length > 0);
|
|
131
|
-
|
|
132
|
-
// 2) find the index of an RFC line (12–13 alnum chars)
|
|
133
|
-
const rfcIndex = lines.findIndex((l) => /^[A-Z0-9]{12,13}$/.test(l));
|
|
134
|
-
let code = null;
|
|
135
|
-
|
|
136
|
-
// 3) if next line exists and is exactly 8 alnum chars, that's the code
|
|
137
|
-
if (rfcIndex >= 0 && /^[A-Z0-9]{8}$/.test(lines[rfcIndex + 1] || '')) {
|
|
138
|
-
code = lines[rfcIndex + 1];
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
return new FieldResult('codigoAceptacion', code !== null, code);
|
|
142
|
-
},
|
|
143
|
-
},
|
|
144
|
-
|
|
145
|
-
// 7) Num. E-Document: exactamente 13 caracteres tras la etiqueta (puede haber múltiples líneas)
|
|
146
|
-
// {
|
|
147
|
-
// field: 'numEDocumento',
|
|
148
|
-
// extract: (source) => {
|
|
149
|
-
// // Split into lines and find all lines containing NUM. E-DOCUMENT
|
|
150
|
-
// const lines = source.split(/\r?\n/);
|
|
151
|
-
// const edocLines = lines.filter((line) =>
|
|
152
|
-
// /NUM\.?\s*E-DOCUMENT/i.test(line),
|
|
153
|
-
// );
|
|
154
|
-
|
|
155
|
-
// if (edocLines.length === 0) {
|
|
156
|
-
// return new FieldResult('numEDocumento', false, null);
|
|
157
|
-
// }
|
|
158
|
-
|
|
159
|
-
// // Extract all 13-character alphanumeric codes from all NUM. E-DOCUMENT lines
|
|
160
|
-
// const extractedCodes = [];
|
|
161
|
-
// edocLines.forEach((line) => {
|
|
162
|
-
// const afterEdoc = line.replace(/.*NUM\.?\s*E-DOCUMENT\s*/i, '');
|
|
163
|
-
// const codes = afterEdoc.match(/[A-Z0-9]{13}/g) || [];
|
|
164
|
-
// extractedCodes.push(...codes);
|
|
165
|
-
// });
|
|
166
|
-
|
|
167
|
-
// if (extractedCodes.length === 0) {
|
|
168
|
-
// return new FieldResult('numEDocumento', false, null);
|
|
169
|
-
// }
|
|
170
|
-
|
|
171
|
-
// // Remove duplicates using Set
|
|
172
|
-
// const uniqueCodes = [...new Set(extractedCodes)];
|
|
173
|
-
// const formattedValue = `[${uniqueCodes.join(',')}]`;
|
|
174
|
-
// return new FieldResult('numEDocumento', true, formattedValue);
|
|
175
|
-
// },
|
|
176
|
-
// },
|
|
177
|
-
|
|
178
|
-
// 8) Fecha de Pago: Look for various payment date patterns
|
|
179
|
-
{
|
|
180
|
-
field: 'paymentDate',
|
|
181
|
-
extract: (source) => {
|
|
182
|
-
// Try multiple patterns for payment dates
|
|
183
|
-
let match = source.match(/2\s+PAGO:\s*(\d{2}\/\d{2}\/\d{4})/);
|
|
184
|
-
if (!match) {
|
|
185
|
-
match = source.match(/FECHA DE PAGO:\s*(\d{4}\/\d{2}\/\d{2})/);
|
|
186
|
-
}
|
|
187
|
-
if (!match) {
|
|
188
|
-
match = source.match(/PRESENTACION:\s*(\d{2}\/\d{2}\/\d{4})/);
|
|
189
|
-
}
|
|
190
|
-
return new FieldResult('paymentDate', !!match, match ? match[1] : null);
|
|
191
|
-
},
|
|
192
|
-
},
|
|
193
|
-
|
|
194
|
-
// 9) COVE: NUMERO DE ACUSE DE VALOR (puede haber múltiples líneas)
|
|
195
|
-
// {
|
|
196
|
-
// field: 'cove',
|
|
197
|
-
// extract: (source) => {
|
|
198
|
-
// // Split into lines and find all lines containing NUMERO DE ACUSE DE VALOR or COVE
|
|
199
|
-
// const lines = source.split(/\r?\n/);
|
|
200
|
-
// const coveLines = lines.filter(
|
|
201
|
-
// (line) =>
|
|
202
|
-
// /COVE/i.test(line) || /NUMERO DE ACUSE DE VALOR/i.test(line),
|
|
203
|
-
// );
|
|
204
|
-
|
|
205
|
-
// if (coveLines.length === 0) {
|
|
206
|
-
// return new FieldResult('cove', false, null);
|
|
207
|
-
// }
|
|
208
|
-
|
|
209
|
-
// // Extract all COVE values from all matching lines
|
|
210
|
-
// const coveValues = [];
|
|
211
|
-
// coveLines.forEach((line) => {
|
|
212
|
-
// const coveMatches = line.match(/COVE([A-Z0-9]+)/gi) || [];
|
|
213
|
-
// // Extract just the alphanumeric parts (remove the "COVE" prefix)
|
|
214
|
-
// const codes = coveMatches.map((match) => match);
|
|
215
|
-
// coveValues.push(...codes);
|
|
216
|
-
// });
|
|
217
|
-
|
|
218
|
-
// if (coveValues.length === 0) {
|
|
219
|
-
// return new FieldResult('cove', false, null);
|
|
220
|
-
// }
|
|
221
|
-
|
|
222
|
-
// // Remove duplicates using Set
|
|
223
|
-
// const uniqueCoveValues = [...new Set(coveValues)];
|
|
224
|
-
// const formattedValue = `[${uniqueCoveValues.join(',')}]`;
|
|
225
|
-
// return new FieldResult('cove', true, formattedValue);
|
|
226
|
-
// },
|
|
227
|
-
// },
|
|
228
|
-
|
|
229
|
-
// 10) Peso Bruto: Extract weight value
|
|
230
|
-
// {
|
|
231
|
-
// field: 'pesoBruto',
|
|
232
|
-
// extract: (source) => {
|
|
233
|
-
// // Look for the peso bruto value with decimal format
|
|
234
|
-
// const match = source.match(/(\d+\.\d+)\d{3}/);
|
|
235
|
-
// return new FieldResult('pesoBruto', !!match, match ? match[1] : null);
|
|
236
|
-
// },
|
|
237
|
-
// },
|
|
238
|
-
|
|
239
|
-
// 11) Patente: Extract patent number
|
|
240
|
-
{
|
|
241
|
-
field: 'patente',
|
|
242
|
-
extract: (source) => {
|
|
243
|
-
// Look for the PATENTE: PEDIMENTO: ADUANA: header line
|
|
244
|
-
// Then find the corresponding data line with three numbers
|
|
245
|
-
const lines = source.split(/\r?\n/);
|
|
246
|
-
const patenteHeaderIndex = lines.findIndex((line) =>
|
|
247
|
-
/PATENTE:.*PEDIMENTO:.*ADUANA:/i.test(line),
|
|
248
|
-
);
|
|
249
|
-
|
|
250
|
-
if (patenteHeaderIndex >= 0) {
|
|
251
|
-
// Look for the data line after the header (format: "3429 2002089 07")
|
|
252
|
-
for (let i = patenteHeaderIndex + 1; i < lines.length; i++) {
|
|
253
|
-
const line = lines[i].trim();
|
|
254
|
-
if (/^\d+\s+\d+\s+\d+$/.test(line)) {
|
|
255
|
-
const parts = line.split(/\s+/);
|
|
256
|
-
return new FieldResult('patente', true, parts[0]); // First number is the PATENTE
|
|
257
|
-
}
|
|
258
|
-
}
|
|
259
|
-
}
|
|
260
|
-
|
|
261
|
-
return new FieldResult('patente', false, null);
|
|
262
|
-
},
|
|
263
|
-
},
|
|
264
|
-
|
|
265
|
-
// 12) Numero de Operacion Bancaria
|
|
266
|
-
// {
|
|
267
|
-
// field: 'numeroOperacionBancaria',
|
|
268
|
-
// extract: (source) => {
|
|
269
|
-
// const match = source.match(
|
|
270
|
-
// /NUMERO DE OPERACION BANCARIA:\s*([A-Z0-9]+)/i,
|
|
271
|
-
// );
|
|
272
|
-
// return new FieldResult(
|
|
273
|
-
// 'numeroOperacionBancaria',
|
|
274
|
-
// !!match,
|
|
275
|
-
// match ? match[1] : null,
|
|
276
|
-
// );
|
|
277
|
-
// },
|
|
278
|
-
// },
|
|
279
|
-
|
|
280
|
-
// 13) Numero de Transaccion SAT
|
|
281
|
-
// {
|
|
282
|
-
// field: 'numeroTransaccionSAT',
|
|
283
|
-
// extract: (source) => {
|
|
284
|
-
// const match = source.match(/NUMERO DE TRANSACCION SAT:\s*([A-Z0-9]+)/i);
|
|
285
|
-
// return new FieldResult(
|
|
286
|
-
// 'numeroTransaccionSAT',
|
|
287
|
-
// !!match,
|
|
288
|
-
// match ? match[1] : null,
|
|
289
|
-
// );
|
|
290
|
-
// },
|
|
291
|
-
// },
|
|
292
|
-
|
|
293
|
-
// 14) Fecha de Pago Rectificación
|
|
294
|
-
{
|
|
295
|
-
field: 'fechaPagoRectificacion',
|
|
296
|
-
extract: (source) => {
|
|
297
|
-
// Look for the RECTIFICACION section header
|
|
298
|
-
const rectSectionMatch = source.match(
|
|
299
|
-
/RECTIFICACION[\s\S]{0,500}?(\d{2}\/\d{2}\/\d{4})/i,
|
|
300
|
-
);
|
|
301
|
-
|
|
302
|
-
if (rectSectionMatch) {
|
|
303
|
-
return new FieldResult(
|
|
304
|
-
'fechaPagoRectificacion',
|
|
305
|
-
true,
|
|
306
|
-
rectSectionMatch[1],
|
|
307
|
-
);
|
|
308
|
-
}
|
|
309
|
-
|
|
310
|
-
// Fallback: look for any date after FECHA PAGO RECT
|
|
311
|
-
const fechaMatch = source.match(
|
|
312
|
-
/FECHA PAGO RECT[\s\S]{0,500}?(\d{2}\/\d{2}\/\d{4})/i,
|
|
313
|
-
);
|
|
314
|
-
|
|
315
|
-
if (fechaMatch) {
|
|
316
|
-
return new FieldResult('fechaPagoRectificacion', true, fechaMatch[1]);
|
|
317
|
-
}
|
|
41
|
+
extractPedimentoYear: (source, fields) => pedimentoYearFromFields(fields),
|
|
318
42
|
|
|
319
|
-
|
|
320
|
-
},
|
|
321
|
-
},
|
|
322
|
-
],
|
|
43
|
+
extractors: sharedPedimentoExtractors,
|
|
323
44
|
};
|
package/src/file-detection.js
CHANGED
|
@@ -4,6 +4,35 @@ import { PDFParse } from 'pdf-parse';
|
|
|
4
4
|
|
|
5
5
|
import { extractDocumentFields } from './document-type-shared.js';
|
|
6
6
|
|
|
7
|
+
// Detected document types for which an arela_path is composed. The XML type
// is listed even though its matcher is currently disabled, so that turning it
// back on in document-type-shared.js needs no change in this file.
const ARELA_PATH_TYPES = new Set()
  .add('pedimento_simplificado')
  .add('pedimento_completo')
  .add('pedimento_completo_xml');
|
|
15
|
+
|
|
16
|
+
/**
 * Derive the patente from the file name of a `pedimento_completo_xml` file.
 * For that type the patente is not taken from the XML body, so the file name
 * is the only source.
 *
 * Only the base name of `filePath` is inspected. Recognized layouts
 * (case-insensitive; the patente is the captured 4-digit group):
 *   - `VU_<patente>_<3 digits>_<7 digits>.xml`
 *   - `<3 digits>-<patente>-<7 digits>.xml`
 *   - `<4 digits><patente><7 digits>.xml`, optionally with a
 *     `_<15 digits>` suffix before the extension
 *
 * @param {string} filePath - Path (or bare file name) of the XML file.
 * @returns {string|null} The 4-digit patente, or null when `filePath` is
 *   empty, not a string, or its base name matches none of the layouts.
 */
function patenteFromXmlFilename(filePath) {
  // path.basename throws on non-string input; treat anything that cannot be
  // a file path the same way as a missing one.
  if (typeof filePath !== 'string' || filePath === '') return null;

  const fileName = path.basename(filePath);

  // Tried in order; the first layout that matches wins.
  const layouts = [
    /^VU_(\d{4})_\d{3}_\d{7}\.xml$/i,
    /^\d{3}-(\d{4})-\d{7}\.xml$/i,
    /^\d{4}(\d{4})\d{7}(?:_\d{15})?\.xml$/i,
  ];

  for (const layout of layouts) {
    const found = layout.exec(fileName);
    if (found) return found[1];
  }

  return null;
}
|
|
35
|
+
|
|
7
36
|
/**
|
|
8
37
|
* Compose arela_path from extracted pedimento fields
|
|
9
38
|
* Format: RFC/Year/Patente/Aduana/Pedimento/
|
|
@@ -15,16 +44,21 @@ function composeArelaPath(
|
|
|
15
44
|
detectedPedimentoYear,
|
|
16
45
|
filePath,
|
|
17
46
|
) {
|
|
18
|
-
if (detectedType
|
|
47
|
+
if (!ARELA_PATH_TYPES.has(detectedType)) {
|
|
19
48
|
return null;
|
|
20
49
|
}
|
|
21
50
|
|
|
22
51
|
const rfc = fields?.find((f) => f.name === 'rfc')?.value;
|
|
23
|
-
|
|
52
|
+
let patente = fields?.find((f) => f.name === 'patente')?.value;
|
|
24
53
|
const aduana = fields?.find((f) => f.name === 'aduanaEntradaSalida')?.value;
|
|
25
54
|
const pedimento = fields?.find((f) => f.name === 'numPedimento')?.value;
|
|
26
55
|
const year = detectedPedimentoYear;
|
|
27
56
|
|
|
57
|
+
// XML matcher does not extract patente from the body — derive from filename.
|
|
58
|
+
if (!patente && detectedType === 'pedimento_completo_xml') {
|
|
59
|
+
patente = patenteFromXmlFilename(filePath);
|
|
60
|
+
}
|
|
61
|
+
|
|
28
62
|
// All components are required for a valid arela_path
|
|
29
63
|
if (!rfc || !year || !patente || !aduana || !pedimento) {
|
|
30
64
|
console.log('⚠️ Missing required fields for arela_path composition:', {
|
|
@@ -155,12 +189,12 @@ export class FileDetectionService {
|
|
|
155
189
|
* @returns {Promise<string>} - Extracted text
|
|
156
190
|
*/
|
|
157
191
|
async extractTextFromPDF(filePath) {
|
|
192
|
+
let parser;
|
|
158
193
|
try {
|
|
159
194
|
const dataBuffer = fs.readFileSync(filePath);
|
|
160
|
-
// Convert Buffer to Uint8Array as required by pdf-parse
|
|
161
195
|
const uint8Array = new Uint8Array(dataBuffer);
|
|
162
|
-
|
|
163
|
-
const result = await
|
|
196
|
+
parser = new PDFParse({ data: uint8Array });
|
|
197
|
+
const result = await parser.getText();
|
|
164
198
|
return result.text;
|
|
165
199
|
} catch (error) {
|
|
166
200
|
console.error(
|
|
@@ -168,6 +202,10 @@ export class FileDetectionService {
|
|
|
168
202
|
error.message,
|
|
169
203
|
);
|
|
170
204
|
throw new Error(`Failed to extract text from PDF: ${error.message}`);
|
|
205
|
+
} finally {
|
|
206
|
+
if (parser) {
|
|
207
|
+
await parser.destroy();
|
|
208
|
+
}
|
|
171
209
|
}
|
|
172
210
|
}
|
|
173
211
|
|
package/src/index.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { Command } from 'commander';
|
|
3
3
|
|
|
4
|
+
import gdriveSyncCommand from './commands/GDriveSyncCommand.js';
|
|
4
5
|
import identifyCommand from './commands/IdentifyCommand.js';
|
|
5
6
|
import pollWorkerCommand from './commands/PollWorkerCommand.js';
|
|
6
7
|
import PropagateCommand from './commands/PropagateCommand.js';
|
|
@@ -458,6 +459,32 @@ class ArelaUploaderCLI {
|
|
|
458
459
|
// END OF NEW SIMPLIFIED COMMANDS
|
|
459
460
|
// ============================================================================
|
|
460
461
|
|
|
462
|
+
// GDrive sync command - mirror a Google Drive folder to local before scan
|
|
463
|
+
this.program
|
|
464
|
+
.command('gdrive-sync')
|
|
465
|
+
.description(
|
|
466
|
+
'☁️ Mirror a Google Drive folder to local filesystem (pre-scan source)',
|
|
467
|
+
)
|
|
468
|
+
.option(
|
|
469
|
+
'--root-folder <id>',
|
|
470
|
+
'Drive folder ID to sync (overrides GDRIVE_ROOT_FOLDER_ID)',
|
|
471
|
+
)
|
|
472
|
+
.option(
|
|
473
|
+
'--dest <path>',
|
|
474
|
+
'Local mirror destination (overrides GDRIVE_LOCAL_MIRROR_PATH)',
|
|
475
|
+
)
|
|
476
|
+
.option('--full', 'Ignore state file and re-verify all files')
|
|
477
|
+
.option('--dry-run', 'List/plan only, no downloads or writes')
|
|
478
|
+
.action(async (options) => {
|
|
479
|
+
try {
|
|
480
|
+
await gdriveSyncCommand.execute(options);
|
|
481
|
+
} catch (error) {
|
|
482
|
+
this.errorHandler.handleFatalError(error, {
|
|
483
|
+
command: 'gdrive-sync',
|
|
484
|
+
});
|
|
485
|
+
}
|
|
486
|
+
});
|
|
487
|
+
|
|
461
488
|
// Watch command
|
|
462
489
|
this.program
|
|
463
490
|
.command('watch')
|
|
@@ -144,7 +144,9 @@ export class DatabaseService {
|
|
|
144
144
|
rfc: null,
|
|
145
145
|
message: null,
|
|
146
146
|
file_extension: fileExtension,
|
|
147
|
-
|
|
147
|
+
// Flag any PDF whose filename hints at a pedimento (simplificado,
|
|
148
|
+
// completo, or CoveFact). Column name preserved; semantics broadened.
|
|
149
|
+
is_like_simplificado: /(simp|pedim|covefact)/i.test(filename),
|
|
148
150
|
year: null,
|
|
149
151
|
created_at: new Date().toISOString(),
|
|
150
152
|
updated_at: new Date().toISOString(),
|