@arela/uploader 1.0.19 → 1.0.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,68 @@
1
+ import {
2
+ pedimentoYearFromFields,
3
+ sharedPedimentoExtractors,
4
+ } from './_pedimento-shared-extractors.js';
5
+
6
/**
 * Document-type definition for the long-form "Pedimento Completo" PDF — the
 * 7-page DEF / SEGUNDA / TERCERA copy, including the "CoveFact" variant.
 * It is kept separate from `pedimento_simplificado`, whose documents carry
 * the "FORMA SIMPLIFICADA DE PEDIMENTO" header.
 *
 * A document is accepted when all of the following hold:
 *   1. it does NOT contain the "FORMA SIMPLIFICADA DE PEDIMENTO" header;
 *   2. it contains every long-form header label
 *      (`NUM. PEDIMENTO:`, `CVE.PEDIMENTO:`, `T.OPER:`);
 *   3. it contains at least one printed copy marker.
 */
export const pedimentoCompletoDefinition = {
  type: 'pedimento_completo',
  extensions: ['pdf'],

  /**
   * Decide whether the extracted text looks like a long-form pedimento.
   *
   * @param {string} source - Text extracted from the document.
   * @returns {boolean} true when the text passes the three checks above.
   */
  match(source) {
    const appearsInSource = (pattern) => pattern.test(source);

    // Anything that declares itself a simplified form is handled elsewhere.
    if (appearsInSource(/FORMA SIMPLIFICADA DE PEDIMENTO/i)) {
      return false;
    }

    // Structural labels: all of them must be present.
    const headerLabels = [
      /NUM\.?\s*PEDIMENTO:/i,
      /CVE\.?\s*PEDIMENTO:/i,
      /T\.?\s*OPER:/i,
    ];
    if (!headerLabels.every(appearsInSource)) {
      return false;
    }

    // Printed copy markers: any single one is enough.
    const copyMarkers = [
      /ORIGINAL:\s*ADMINISTRACION GENERAL DE ADUANAS/i,
      /SEGUNDA COPIA/i,
      /TERCERA COPIA/i,
      /COPIA\s+(SIMPLIFICAD[AO])?\s*TRANSPORTISTA/i,
      /DEFINITIVO/i,
    ];
    return copyMarkers.some(appearsInSource);
  },

  /**
   * Pick the final document type once the fields have been extracted.
   *
   * Payment evidence decides the outcome:
   *   - clavePedimento `R1` (rectification) looks at `fechaPagoRectificacion`;
   *   - every other clave looks at `paymentDate`.
   * Without that evidence the document is reported as `proforma_completo`.
   *
   * @param {Array<{name: string, found: boolean, value: *}>} fields
   *   Extracted fields (may be null/undefined).
   * @returns {string} 'pedimento_completo' or 'proforma_completo'.
   */
  resolveType(fields) {
    // Value of the first field with the given name, or null. When
    // `mustBeFound` is set, only fields flagged as found are considered.
    const lookup = (fieldName, mustBeFound) => {
      const hit = fields?.find(
        (f) => f.name === fieldName && (!mustBeFound || f.found),
      );
      return hit?.value ?? null;
    };

    const clave = lookup('clavePedimento', false);
    const paidOn = lookup('paymentDate', true);
    const rectificationPaidOn = lookup('fechaPagoRectificacion', true);

    const paymentEvidence = clave === 'R1' ? rectificationPaidOn : paidOn;
    return paymentEvidence ? 'pedimento_completo' : 'proforma_completo';
  },

  /**
   * Read the pedimento number out of the already-extracted fields.
   *
   * @param {string} source - Document text (not used here).
   * @param {Array<{name: string, value: *}>} fields - Extracted fields.
   * @returns {*} Value of the `numPedimento` field, or null when absent.
   */
  extractNumPedimento(source, fields) {
    const numPedimentoField = fields?.find(
      ({ name }) => name === 'numPedimento',
    );
    return numPedimentoField?.value ?? null;
  },

  /**
   * Delegate year resolution to the helper shared by the pedimento types.
   *
   * @param {string} source - Document text (not used here).
   * @param {Array} fields - Extracted fields, passed through to the helper.
   * @returns {*} Whatever `pedimentoYearFromFields` returns for `fields`.
   */
  extractPedimentoYear(source, fields) {
    return pedimentoYearFromFields(fields);
  },

  // Field extractors are shared with the other pedimento definitions.
  extractors: sharedPedimentoExtractors,
};
@@ -1,4 +1,7 @@
1
- import { FieldResult } from '../document-type-shared.js';
1
+ import {
2
+ pedimentoYearFromFields,
3
+ sharedPedimentoExtractors,
4
+ } from './_pedimento-shared-extractors.js';
2
5
 
3
6
  export const pedimentoSimplificadoDefinition = {
4
7
  type: 'pedimento_simplificado',
@@ -14,7 +17,7 @@ export const pedimentoSimplificadoDefinition = {
14
17
  * - Otherwise: must have paymentDate
15
18
  * If no payment evidence is found, it's a "proforma".
16
19
  *
17
- * @param {FieldResult[]} fields - Extracted fields
20
+ * @param {import('../document-type-shared.js').FieldResult[]} fields
18
21
  * @returns {string} - 'pedimento_simplificado' or 'proforma'
19
22
  */
20
23
  resolveType: (fields) => {
@@ -27,297 +30,15 @@ export const pedimentoSimplificadoDefinition = {
27
30
  ?.value ?? null;
28
31
 
29
32
  if (clavePedimento === 'R1') {
30
- // Rectification pedimentos require fechaPagoRectificacion
31
33
  return fechaPagoRectificacion ? 'pedimento_simplificado' : 'proforma';
32
34
  }
33
-
34
- // Regular pedimentos require paymentDate
35
35
  return paymentDate ? 'pedimento_simplificado' : 'proforma';
36
36
  },
37
37
 
38
38
  extractNumPedimento: (source, fields) => {
39
39
  return fields?.find((f) => f.name === 'numPedimento')?.value ?? null;
40
40
  },
41
- extractPedimentoYear: (source, fields) => {
42
- const numPedimento = fields?.find((f) => f.name === 'numPedimento')?.value;
43
- if (!numPedimento) {
44
- return null;
45
- }
46
- const year = parseInt(numPedimento.substring(0, 2), 10);
47
- return year < 50 ? year + 2000 : year + 1900;
48
- },
49
- extractors: [
50
- // 1) Número de Pedimento (15 dígitos)
51
- {
52
- field: 'numPedimento',
53
- extract: (source) => {
54
- const match = source.match(/\d{2}\s?\d{2}\s?\d{4}\s?\d{7}/);
55
- return new FieldResult(
56
- 'numPedimento',
57
- !!match,
58
- match ? match[0].replace(/\s/g, '') : null,
59
- );
60
- },
61
- },
62
-
63
- // 2) Tipo de Operación: los 3 caracteres justo después del número
64
- {
65
- field: 'tipoOperacion',
66
- extract: (source) => {
67
- // Look for the pedimento number pattern followed by operation type
68
- // Pattern matches: "22 07 3429 2002089 EXP RT"
69
- const match = source.match(
70
- /\d{2}\s+\d{2}\s+\d{4}\s+\d{7}\s+([A-Z]{3})/,
71
- );
72
- return new FieldResult(
73
- 'tipoOperacion',
74
- !!match,
75
- match ? match[1] : null,
76
- );
77
- },
78
- },
79
-
80
- // 3) Clave de Pedimento: los 2 caracteres justo después de la operación
81
- {
82
- field: 'clavePedimento',
83
- extract: (source) => {
84
- // Look for the pedimento number pattern followed by operation type and then the key
85
- // Pattern matches: "22 07 3429 2002089 EXP RT" to capture "RT"
86
- const match = source.match(
87
- /\d{2}\s+\d{2}\s+\d{4}\s+\d{7}\s+[A-Z]{3}\s+([A-Z0-9]{2})/,
88
- );
89
- return new FieldResult(
90
- 'clavePedimento',
91
- !!match,
92
- match ? match[1] : null,
93
- );
94
- },
95
- },
96
-
97
- // 4) Aduana E/S: Extract the 3-digit aduana code that appears at the end of the peso bruto line
98
- {
99
- field: 'aduanaEntradaSalida',
100
- extract: (source) => {
101
- // Look for the peso bruto line format: number followed by decimal amount followed by 3-digit aduana code
102
- // Pattern matches formats like: "7 1.100 071" or "7 19,834.260 071" to capture "071"
103
- // Uses multiline flag to match line boundaries precisely
104
- const match = source.match(/^\s*\d+\s+[\d,.]+\s+(\d{3})\s*$/m);
105
- return new FieldResult(
106
- 'aduanaEntradaSalida',
107
- !!match,
108
- match ? match[1] : null,
109
- );
110
- },
111
- },
112
-
113
- // 5) RFC: línea con 12-13 caracteres alfanuméricos
114
- {
115
- field: 'rfc',
116
- extract: (source) => {
117
- const match = source.match(/\n\s*([A-Z0-9]{12,13})\s*\n/);
118
- return new FieldResult('rfc', !!match, match ? match[1] : null);
119
- },
120
- },
121
-
122
- // 6) Código de Aceptación: línea con 8 caracteres alfanuméricos justo después del RFC
123
- {
124
- field: 'codigoAceptacion',
125
- extract: (source) => {
126
- // 1) split into trimmed, non-empty lines
127
- const lines = source
128
- .split(/\r?\n/)
129
- .map((l) => l.trim())
130
- .filter((l) => l.length > 0);
131
-
132
- // 2) find the index of an RFC line (12–13 alnum chars)
133
- const rfcIndex = lines.findIndex((l) => /^[A-Z0-9]{12,13}$/.test(l));
134
- let code = null;
135
-
136
- // 3) if next line exists and is exactly 8 alnum chars, that's the code
137
- if (rfcIndex >= 0 && /^[A-Z0-9]{8}$/.test(lines[rfcIndex + 1] || '')) {
138
- code = lines[rfcIndex + 1];
139
- }
140
-
141
- return new FieldResult('codigoAceptacion', code !== null, code);
142
- },
143
- },
144
-
145
- // 7) Num. E-Document: exactamente 13 caracteres tras la etiqueta (puede haber múltiples líneas)
146
- // {
147
- // field: 'numEDocumento',
148
- // extract: (source) => {
149
- // // Split into lines and find all lines containing NUM. E-DOCUMENT
150
- // const lines = source.split(/\r?\n/);
151
- // const edocLines = lines.filter((line) =>
152
- // /NUM\.?\s*E-DOCUMENT/i.test(line),
153
- // );
154
-
155
- // if (edocLines.length === 0) {
156
- // return new FieldResult('numEDocumento', false, null);
157
- // }
158
-
159
- // // Extract all 13-character alphanumeric codes from all NUM. E-DOCUMENT lines
160
- // const extractedCodes = [];
161
- // edocLines.forEach((line) => {
162
- // const afterEdoc = line.replace(/.*NUM\.?\s*E-DOCUMENT\s*/i, '');
163
- // const codes = afterEdoc.match(/[A-Z0-9]{13}/g) || [];
164
- // extractedCodes.push(...codes);
165
- // });
166
-
167
- // if (extractedCodes.length === 0) {
168
- // return new FieldResult('numEDocumento', false, null);
169
- // }
170
-
171
- // // Remove duplicates using Set
172
- // const uniqueCodes = [...new Set(extractedCodes)];
173
- // const formattedValue = `[${uniqueCodes.join(',')}]`;
174
- // return new FieldResult('numEDocumento', true, formattedValue);
175
- // },
176
- // },
177
-
178
- // 8) Fecha de Pago: Look for various payment date patterns
179
- {
180
- field: 'paymentDate',
181
- extract: (source) => {
182
- // Try multiple patterns for payment dates
183
- let match = source.match(/2\s+PAGO:\s*(\d{2}\/\d{2}\/\d{4})/);
184
- if (!match) {
185
- match = source.match(/FECHA DE PAGO:\s*(\d{4}\/\d{2}\/\d{2})/);
186
- }
187
- if (!match) {
188
- match = source.match(/PRESENTACION:\s*(\d{2}\/\d{2}\/\d{4})/);
189
- }
190
- return new FieldResult('paymentDate', !!match, match ? match[1] : null);
191
- },
192
- },
193
-
194
- // 9) COVE: NUMERO DE ACUSE DE VALOR (puede haber múltiples líneas)
195
- // {
196
- // field: 'cove',
197
- // extract: (source) => {
198
- // // Split into lines and find all lines containing NUMERO DE ACUSE DE VALOR or COVE
199
- // const lines = source.split(/\r?\n/);
200
- // const coveLines = lines.filter(
201
- // (line) =>
202
- // /COVE/i.test(line) || /NUMERO DE ACUSE DE VALOR/i.test(line),
203
- // );
204
-
205
- // if (coveLines.length === 0) {
206
- // return new FieldResult('cove', false, null);
207
- // }
208
-
209
- // // Extract all COVE values from all matching lines
210
- // const coveValues = [];
211
- // coveLines.forEach((line) => {
212
- // const coveMatches = line.match(/COVE([A-Z0-9]+)/gi) || [];
213
- // // Extract just the alphanumeric parts (remove the "COVE" prefix)
214
- // const codes = coveMatches.map((match) => match);
215
- // coveValues.push(...codes);
216
- // });
217
-
218
- // if (coveValues.length === 0) {
219
- // return new FieldResult('cove', false, null);
220
- // }
221
-
222
- // // Remove duplicates using Set
223
- // const uniqueCoveValues = [...new Set(coveValues)];
224
- // const formattedValue = `[${uniqueCoveValues.join(',')}]`;
225
- // return new FieldResult('cove', true, formattedValue);
226
- // },
227
- // },
228
-
229
- // 10) Peso Bruto: Extract weight value
230
- // {
231
- // field: 'pesoBruto',
232
- // extract: (source) => {
233
- // // Look for the peso bruto value with decimal format
234
- // const match = source.match(/(\d+\.\d+)\d{3}/);
235
- // return new FieldResult('pesoBruto', !!match, match ? match[1] : null);
236
- // },
237
- // },
238
-
239
- // 11) Patente: Extract patent number
240
- {
241
- field: 'patente',
242
- extract: (source) => {
243
- // Look for the PATENTE: PEDIMENTO: ADUANA: header line
244
- // Then find the corresponding data line with three numbers
245
- const lines = source.split(/\r?\n/);
246
- const patenteHeaderIndex = lines.findIndex((line) =>
247
- /PATENTE:.*PEDIMENTO:.*ADUANA:/i.test(line),
248
- );
249
-
250
- if (patenteHeaderIndex >= 0) {
251
- // Look for the data line after the header (format: "3429 2002089 07")
252
- for (let i = patenteHeaderIndex + 1; i < lines.length; i++) {
253
- const line = lines[i].trim();
254
- if (/^\d+\s+\d+\s+\d+$/.test(line)) {
255
- const parts = line.split(/\s+/);
256
- return new FieldResult('patente', true, parts[0]); // First number is the PATENTE
257
- }
258
- }
259
- }
260
-
261
- return new FieldResult('patente', false, null);
262
- },
263
- },
264
-
265
- // 12) Numero de Operacion Bancaria
266
- // {
267
- // field: 'numeroOperacionBancaria',
268
- // extract: (source) => {
269
- // const match = source.match(
270
- // /NUMERO DE OPERACION BANCARIA:\s*([A-Z0-9]+)/i,
271
- // );
272
- // return new FieldResult(
273
- // 'numeroOperacionBancaria',
274
- // !!match,
275
- // match ? match[1] : null,
276
- // );
277
- // },
278
- // },
279
-
280
- // 13) Numero de Transaccion SAT
281
- // {
282
- // field: 'numeroTransaccionSAT',
283
- // extract: (source) => {
284
- // const match = source.match(/NUMERO DE TRANSACCION SAT:\s*([A-Z0-9]+)/i);
285
- // return new FieldResult(
286
- // 'numeroTransaccionSAT',
287
- // !!match,
288
- // match ? match[1] : null,
289
- // );
290
- // },
291
- // },
292
-
293
- // 14) Fecha de Pago Rectificación
294
- {
295
- field: 'fechaPagoRectificacion',
296
- extract: (source) => {
297
- // Look for the RECTIFICACION section header
298
- const rectSectionMatch = source.match(
299
- /RECTIFICACION[\s\S]{0,500}?(\d{2}\/\d{2}\/\d{4})/i,
300
- );
301
-
302
- if (rectSectionMatch) {
303
- return new FieldResult(
304
- 'fechaPagoRectificacion',
305
- true,
306
- rectSectionMatch[1],
307
- );
308
- }
309
-
310
- // Fallback: look for any date after FECHA PAGO RECT
311
- const fechaMatch = source.match(
312
- /FECHA PAGO RECT[\s\S]{0,500}?(\d{2}\/\d{2}\/\d{4})/i,
313
- );
314
-
315
- if (fechaMatch) {
316
- return new FieldResult('fechaPagoRectificacion', true, fechaMatch[1]);
317
- }
41
+ extractPedimentoYear: (source, fields) => pedimentoYearFromFields(fields),
318
42
 
319
- return new FieldResult('fechaPagoRectificacion', false, null);
320
- },
321
- },
322
- ],
43
+ extractors: sharedPedimentoExtractors,
323
44
  };
@@ -4,6 +4,35 @@ import { PDFParse } from 'pdf-parse';
4
4
 
5
5
  import { extractDocumentFields } from './document-type-shared.js';
6
6
 
7
+ // Detected document types for which composeArelaPath builds an arela_path;
8
+ // any other detected type makes it return null. The XML type is listed even
9
+ // though its matcher is currently disabled, so re-enabling it in document-type-shared.js needs no change here.
10
+ const ARELA_PATH_TYPES = new Set([
11
+ 'pedimento_simplificado',
12
+ 'pedimento_completo',
13
+ 'pedimento_completo_xml',
14
+ ]);
15
+
16
/**
 * Derive the patente for a `pedimento_completo_xml` document from its file
 * name. The patente is not present in the XML body, so the file name is the
 * only source.
 *
 * Recognized file-name layouts (case-insensitive; the whole name must match),
 * tried in this order:
 *   1. `VU_<patente: 4 digits>_<3 digits>_<7 digits>.xml`
 *   2. `<3 digits>-<patente: 4 digits>-<7 digits>.xml`
 *   3. `<4 digits><patente: 4 digits><7 digits>[_<15 digits>].xml`
 *
 * @param {string|null|undefined} filePath - Path (or bare name) of the XML file.
 * @returns {string|null} The 4-digit patente, or null when `filePath` is not
 *   a non-empty string or its base name matches none of the layouts.
 */
function patenteFromXmlFilename(filePath) {
  // path.basename throws a TypeError for non-string input; treat any such
  // value the same as a missing path instead of crashing the caller.
  if (typeof filePath !== 'string' || filePath === '') return null;

  const fileName = path.basename(filePath);

  // Each pattern captures the patente as group 1.
  const layouts = [
    /^VU_(\d{4})_\d{3}_\d{7}\.xml$/i,
    /^\d{3}-(\d{4})-\d{7}\.xml$/i,
    /^\d{4}(\d{4})\d{7}(?:_\d{15})?\.xml$/i,
  ];

  for (const layout of layouts) {
    const m = layout.exec(fileName);
    if (m) return m[1];
  }

  return null;
}
35
+
7
36
  /**
8
37
  * Compose arela_path from extracted pedimento fields
9
38
  * Format: RFC/Year/Patente/Aduana/Pedimento/
@@ -15,16 +44,21 @@ function composeArelaPath(
15
44
  detectedPedimentoYear,
16
45
  filePath,
17
46
  ) {
18
- if (detectedType !== 'pedimento_simplificado') {
47
+ if (!ARELA_PATH_TYPES.has(detectedType)) {
19
48
  return null;
20
49
  }
21
50
 
22
51
  const rfc = fields?.find((f) => f.name === 'rfc')?.value;
23
- const patente = fields?.find((f) => f.name === 'patente')?.value;
52
+ let patente = fields?.find((f) => f.name === 'patente')?.value;
24
53
  const aduana = fields?.find((f) => f.name === 'aduanaEntradaSalida')?.value;
25
54
  const pedimento = fields?.find((f) => f.name === 'numPedimento')?.value;
26
55
  const year = detectedPedimentoYear;
27
56
 
57
+ // XML matcher does not extract patente from the body — derive from filename.
58
+ if (!patente && detectedType === 'pedimento_completo_xml') {
59
+ patente = patenteFromXmlFilename(filePath);
60
+ }
61
+
28
62
  // All components are required for a valid arela_path
29
63
  if (!rfc || !year || !patente || !aduana || !pedimento) {
30
64
  console.log('⚠️ Missing required fields for arela_path composition:', {
@@ -155,12 +189,12 @@ export class FileDetectionService {
155
189
  * @returns {Promise<string>} - Extracted text
156
190
  */
157
191
  async extractTextFromPDF(filePath) {
192
+ let parser;
158
193
  try {
159
194
  const dataBuffer = fs.readFileSync(filePath);
160
- // Convert Buffer to Uint8Array as required by pdf-parse
161
195
  const uint8Array = new Uint8Array(dataBuffer);
162
- const pdfParse = new PDFParse(uint8Array);
163
- const result = await pdfParse.getText();
196
+ parser = new PDFParse({ data: uint8Array });
197
+ const result = await parser.getText();
164
198
  return result.text;
165
199
  } catch (error) {
166
200
  console.error(
@@ -168,6 +202,10 @@ export class FileDetectionService {
168
202
  error.message,
169
203
  );
170
204
  throw new Error(`Failed to extract text from PDF: ${error.message}`);
205
+ } finally {
206
+ if (parser) {
207
+ await parser.destroy();
208
+ }
171
209
  }
172
210
  }
173
211
 
package/src/index.js CHANGED
@@ -1,6 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
  import { Command } from 'commander';
3
3
 
4
+ import gdriveSyncCommand from './commands/GDriveSyncCommand.js';
4
5
  import identifyCommand from './commands/IdentifyCommand.js';
5
6
  import pollWorkerCommand from './commands/PollWorkerCommand.js';
6
7
  import PropagateCommand from './commands/PropagateCommand.js';
@@ -458,6 +459,32 @@ class ArelaUploaderCLI {
458
459
  // END OF NEW SIMPLIFIED COMMANDS
459
460
  // ============================================================================
460
461
 
462
+ // GDrive sync command - mirror a Google Drive folder to local before scan
463
+ this.program
464
+ .command('gdrive-sync')
465
+ .description(
466
+ '☁️ Mirror a Google Drive folder to local filesystem (pre-scan source)',
467
+ )
468
+ .option(
469
+ '--root-folder <id>',
470
+ 'Drive folder ID to sync (overrides GDRIVE_ROOT_FOLDER_ID)',
471
+ )
472
+ .option(
473
+ '--dest <path>',
474
+ 'Local mirror destination (overrides GDRIVE_LOCAL_MIRROR_PATH)',
475
+ )
476
+ .option('--full', 'Ignore state file and re-verify all files')
477
+ .option('--dry-run', 'List/plan only, no downloads or writes')
478
+ .action(async (options) => {
479
+ try {
480
+ await gdriveSyncCommand.execute(options);
481
+ } catch (error) {
482
+ this.errorHandler.handleFatalError(error, {
483
+ command: 'gdrive-sync',
484
+ });
485
+ }
486
+ });
487
+
461
488
  // Watch command
462
489
  this.program
463
490
  .command('watch')
@@ -144,7 +144,9 @@ export class DatabaseService {
144
144
  rfc: null,
145
145
  message: null,
146
146
  file_extension: fileExtension,
147
- is_like_simplificado: filename.toLowerCase().includes('simp'),
147
+ // Flag any PDF whose filename hints at a pedimento (simplificado,
148
+ // completo, or CoveFact). Column name preserved; semantics broadened.
149
+ is_like_simplificado: /(simp|pedim|covefact)/i.test(filename),
148
150
  year: null,
149
151
  created_at: new Date().toISOString(),
150
152
  updated_at: new Date().toISOString(),