@arela/uploader 0.0.12 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,289 @@
1
+ import { FieldResult } from '../document-type-shared.js';
2
+
3
/**
 * Detection rule and field extractors for the "pedimento_simplificado"
 * document type (text extracted from a PDF).
 *
 * Shape:
 *  - `type` / `extensions`: identifier and accepted file extensions.
 *  - `match(source)`: returns true when the text looks like this document.
 *  - `extractNumPedimento(source, fields)`: reads the already-extracted
 *    `numPedimento` field value, or null.
 *  - `extractPedimentoYear(source, fields)`: derives a 4-digit year from the
 *    first two characters of `numPedimento`, or null.
 *  - `extractors`: list of `{ field, extract(source) }`; every `extract`
 *    returns `new FieldResult(fieldName, found, value)`.
 */
export const pedimentoSimplificadoDefinition = {
  type: 'pedimento_simplificado',
  extensions: ['pdf'],

  /**
   * @param {string} source - Extracted document text.
   * @returns {boolean} True when more than half of the clue patterns match.
   */
  match: (source) => {
    const clues = [/FORMA SIMPLIFICADA DE PEDIMENTO/i];
    const found = clues.filter((clue) => clue.test(source));
    return found.length > clues.length / 2;
  },

  /**
   * @param {string} source - Unused; kept for the shared extractor signature.
   * @param {Array<{name: string, value: *}>} [fields] - Extracted fields.
   * @returns {*} Value of the `numPedimento` field, or null when absent.
   */
  extractNumPedimento: (source, fields) => {
    return fields?.find((f) => f.name === 'numPedimento')?.value ?? null;
  },

  /**
   * @param {string} source - Unused; kept for the shared extractor signature.
   * @param {Array<{name: string, value: *}>} [fields] - Extracted fields.
   * @returns {number|null} Four-digit year, or null when the pedimento number
   *   is missing or does not start with digits.
   */
  extractPedimentoYear: (source, fields) => {
    const numPedimento = fields?.find((f) => f.name === 'numPedimento')?.value;
    if (!numPedimento) {
      return null;
    }
    const year = Number.parseInt(String(numPedimento).substring(0, 2), 10);
    // A non-numeric prefix used to leak NaN to callers; report "unknown" instead.
    if (Number.isNaN(year)) {
      return null;
    }
    // Two-digit year pivot: 00-49 -> 20xx, 50-99 -> 19xx.
    return year < 50 ? year + 2000 : year + 1900;
  },

  extractors: [
    // 1) Pedimento number (15 digits, optionally space-separated as 2-2-4-7)
    {
      field: 'numPedimento',
      extract: (source) => {
        const match = source.match(/\d{2}\s?\d{2}\s?\d{4}\s?\d{7}/);
        return new FieldResult(
          'numPedimento',
          !!match,
          match ? match[0].replace(/\s/g, '') : null
        );
      },
    },

    // 2) Operation type: the 3 letters right after the pedimento number
    {
      field: 'tipoOperacion',
      extract: (source) => {
        // Pattern matches: "22 07 3429 2002089 EXP RT" and captures "EXP"
        const match = source.match(
          /\d{2}\s+\d{2}\s+\d{4}\s+\d{7}\s+([A-Z]{3})/,
        );
        return new FieldResult(
          'tipoOperacion',
          !!match,
          match ? match[1] : null
        );
      },
    },

    // 3) Pedimento key: the 2 characters right after the operation type
    {
      field: 'clavePedimento',
      extract: (source) => {
        // Pattern matches: "22 07 3429 2002089 EXP RT" and captures "RT"
        const match = source.match(
          /\d{2}\s+\d{2}\s+\d{4}\s+\d{7}\s+[A-Z]{3}\s+([A-Z0-9]{2})/,
        );
        return new FieldResult(
          'clavePedimento',
          !!match,
          match ? match[1] : null
        );
      },
    },

    // 4) Entry/exit customs office: 3-digit code ending the gross-weight line
    {
      field: 'aduanaEntradaSalida',
      extract: (source) => {
        // Line format: integer, decimal amount, 3-digit code,
        // e.g. "7 1.100 071" or "7 19,834.260 071" -> "071".
        // The multiline flag anchors the pattern to a single line.
        const match = source.match(/^\s*\d+\s+[\d,.]+\s+(\d{3})\s*$/m);
        return new FieldResult(
          'aduanaEntradaSalida',
          !!match,
          match ? match[1] : null
        );
      },
    },

    // 5) RFC: a line made of 12-13 upper-case alphanumeric characters
    {
      field: 'rfc',
      extract: (source) => {
        const match = source.match(/\n\s*([A-Z0-9]{12,13})\s*\n/);
        return new FieldResult(
          'rfc',
          !!match,
          match ? match[1] : null
        );
      },
    },

    // 6) Acceptance code: an 8-character alphanumeric line right after the RFC
    {
      field: 'codigoAceptacion',
      extract: (source) => {
        // Work on trimmed, non-empty lines so blank lines do not break adjacency.
        const lines = source
          .split(/\r?\n/)
          .map((l) => l.trim())
          .filter((l) => l.length > 0);

        // First line that looks like an RFC (12-13 alphanumeric characters).
        const rfcIndex = lines.findIndex((l) =>
          /^[A-Z0-9]{12,13}$/.test(l),
        );
        let code = null;

        // The code is the next line, only if it is exactly 8 alphanumerics.
        if (rfcIndex >= 0 && /^[A-Z0-9]{8}$/.test(lines[rfcIndex + 1] || '')) {
          code = lines[rfcIndex + 1];
        }

        return new FieldResult(
          'codigoAceptacion',
          code !== null,
          code
        );
      },
    },

    // 7) E-document numbers: 13-character codes after the label (may span several lines)
    {
      field: 'numEDocumento',
      extract: (source) => {
        const edocLines = source
          .split(/\r?\n/)
          .filter((line) => /NUM\.?\s*E-DOCUMENT/i.test(line));

        if (edocLines.length === 0) {
          return new FieldResult('numEDocumento', false, null);
        }

        // Collect every 13-character code that follows the label on each line.
        const extractedCodes = edocLines.flatMap((line) => {
          const afterEdoc = line.replace(/.*NUM\.?\s*E-DOCUMENT\s*/i, '');
          return afterEdoc.match(/[A-Z0-9]{13}/g) || [];
        });

        if (extractedCodes.length === 0) {
          return new FieldResult('numEDocumento', false, null);
        }

        // De-duplicate while keeping first-seen order.
        const uniqueCodes = [...new Set(extractedCodes)];
        const formattedValue = `[${uniqueCodes.join(',')}]`;
        return new FieldResult('numEDocumento', true, formattedValue);
      },
    },

    // 8) Payment date: tries several label/date layouts in order
    {
      field: 'paymentDate',
      extract: (source) => {
        let match = source.match(/2\s+PAGO:\s*(\d{2}\/\d{2}\/\d{4})/);
        if (!match) {
          match = source.match(/FECHA DE PAGO:\s*(\d{4}\/\d{2}\/\d{2})/);
        }
        if (!match) {
          match = source.match(/PRESENTACION:\s*(\d{2}\/\d{2}\/\d{4})/);
        }
        return new FieldResult(
          'paymentDate',
          !!match,
          match ? match[1] : null
        );
      },
    },

    // 9) COVE: value-acknowledgement numbers (may span several lines)
    {
      field: 'cove',
      extract: (source) => {
        const coveLines = source
          .split(/\r?\n/)
          .filter(
            (line) =>
              /COVE/i.test(line) || /NUMERO DE ACUSE DE VALOR/i.test(line),
          );

        if (coveLines.length === 0) {
          return new FieldResult('cove', false, null);
        }

        // With the /g flag, match() yields the full matches, so every value
        // keeps its "COVE" prefix (e.g. "COVE214ABC12").
        const coveValues = coveLines.flatMap(
          (line) => line.match(/COVE([A-Z0-9]+)/gi) || [],
        );

        if (coveValues.length === 0) {
          return new FieldResult('cove', false, null);
        }

        // De-duplicate while keeping first-seen order.
        const uniqueCoveValues = [...new Set(coveValues)];
        const formattedValue = `[${uniqueCoveValues.join(',')}]`;
        return new FieldResult('cove', true, formattedValue);
      },
    },

    // 10) Gross weight: decimal number immediately followed by 3 more digits
    {
      field: 'pesoBruto',
      extract: (source) => {
        // NOTE(review): this takes the first decimal in the whole text that has
        // at least 4 digits after the point and drops the last 3 of them
        // (e.g. "1.100071" -> "1.100"). It does not match the spaced
        // "7 1.100 071" layout the aduana extractor expects — confirm against
        // real extracted text before changing.
        const match = source.match(/(\d+\.\d+)\d{3}/);
        return new FieldResult(
          'pesoBruto',
          !!match,
          match ? match[1] : null
        );
      },
    },

    // 11) Patente: first number on the data line below the header line
    {
      field: 'patente',
      extract: (source) => {
        // Find the "PATENTE: PEDIMENTO: ADUANA:" header line, then the first
        // following line made of exactly three numbers.
        const lines = source.split(/\r?\n/);
        const patenteHeaderIndex = lines.findIndex((line) =>
          /PATENTE:.*PEDIMENTO:.*ADUANA:/i.test(line),
        );

        if (patenteHeaderIndex >= 0) {
          // Data line format: "3429 2002089 07"
          for (let i = patenteHeaderIndex + 1; i < lines.length; i++) {
            const line = lines[i].trim();
            if (/^\d+\s+\d+\s+\d+$/.test(line)) {
              const parts = line.split(/\s+/);
              return new FieldResult('patente', true, parts[0]); // first number is the patente
            }
          }
        }

        return new FieldResult('patente', false, null);
      },
    },

    // 12) Bank operation number
    {
      field: 'numeroOperacionBancaria',
      extract: (source) => {
        const match = source.match(
          /NUMERO DE OPERACION BANCARIA:\s*([A-Z0-9]+)/i,
        );
        return new FieldResult(
          'numeroOperacionBancaria',
          !!match,
          match ? match[1] : null
        );
      },
    },

    // 13) SAT transaction number
    {
      field: 'numeroTransaccionSAT',
      extract: (source) => {
        const match = source.match(/NUMERO DE TRANSACCION SAT:\s*([A-Z0-9]+)/i);
        return new FieldResult(
          'numeroTransaccionSAT',
          !!match,
          match ? match[1] : null
        );
      },
    },
  ],
};
@@ -0,0 +1,194 @@
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+ import { getTextExtractor } from 'office-text-extractor';
4
+ import { extractDocumentFields } from './document-type-shared.js';
5
+
6
+ const extractor = getTextExtractor();
7
+
8
/**
 * Build the arela_path folder string for a detected pedimento document.
 *
 * Layout: `<rfc>/<year>/<patente>/<aduana>/<pedimento>/` (folders only, no
 * file name), e.g. PED781129JT6/2023/3429/07/3019796/.
 *
 * @param {string|null} detectedType - Detected document type.
 * @param {Array<{name: string, value: *}>} [fields] - Extracted fields.
 * @param {number|null} detectedPedimentoYear - Year used as the second segment.
 * @param {string} [filePath] - Not used by this function.
 * @returns {string|null} The composed path, or null when the type is not
 *   'pedimento_simplificado' or any of the five components is missing.
 */
function composeArelaPath(detectedType, fields, detectedPedimentoYear, filePath) {
  // Only simplified pedimentos get an arela_path.
  if (detectedType !== 'pedimento_simplificado') {
    return null;
  }

  const valueOf = (fieldName) => fields?.find((f) => f.name === fieldName)?.value;

  const rfc = valueOf('rfc');
  const patente = valueOf('patente');
  const aduana = valueOf('aduanaEntradaSalida');
  const pedimento = valueOf('numPedimento');
  const year = detectedPedimentoYear;

  // Every segment is mandatory; report which ones are present and bail out.
  const anyMissing = [rfc, year, patente, aduana, pedimento].some((part) => !part);
  if (anyMissing) {
    console.log('⚠️ Missing required fields for arela_path composition:', {
      rfc: !!rfc,
      year: !!year,
      patente: !!patente,
      aduana: !!aduana,
      pedimento: !!pedimento
    });
    return null;
  }

  // Left-pad the customs code to at least two characters ("7" -> "07").
  const aduanaFormatted = aduana.toString().padStart(2, '0');

  // Trailing slash: the result names a folder, not a file.
  const segments = [rfc, year, patente, aduanaFormatted, pedimento];
  const arelaPath = `${segments.join('/')}/`;

  console.log(`✅ Composed arela_path: ${arelaPath}`);
  return arelaPath;
}
45
+
46
/** File extensions (lower-case, without the dot) that detection can read. */
const SUPPORTED_EXTENSIONS = ['pdf', 'txt', 'xml'];

/**
 * Lower-cased extension of a path without its leading dot ('' when none).
 * @param {string} filePath
 * @returns {string}
 */
function getFileExtension(filePath) {
  return path.extname(filePath).toLowerCase().replace('.', '');
}

/**
 * Result object returned by detectFile when nothing could be detected.
 * @param {string} error - Human-readable reason.
 * @returns {{detectedType: null, fields: Array, detectedPedimento: null, detectedPedimentoYear: null, arelaPath: null, text: string, error: string}}
 */
function buildFailureResult(error) {
  return {
    detectedType: null,
    fields: [],
    detectedPedimento: null,
    detectedPedimentoYear: null,
    arelaPath: null,
    text: '',
    error
  };
}

/**
 * File Detection Service
 * Detects document types and extracts metadata from files
 */
export class FileDetectionService {

  /**
   * Detect document type from a file.
   *
   * Never rejects: unsupported extensions, empty text and any thrown error
   * are reported through the `error` property of the result.
   *
   * @param {string} filePath - Path to the file to analyze
   * @returns {Promise<{detectedType: string|null, fields: Array, detectedPedimento: string|null, detectedPedimentoYear: number|null, arelaPath: string|null, text: string, error: string|null}>}
   */
  async detectFile(filePath) {
    try {
      const fileExtension = getFileExtension(filePath);
      const fileName = path.basename(filePath);

      console.log(`🔍 Analyzing file: ${fileName} (${fileExtension})`);

      if (!SUPPORTED_EXTENSIONS.includes(fileExtension)) {
        console.log(`⚠️  Unsupported file type: ${fileExtension}`);
        return buildFailureResult(`Unsupported file type: ${fileExtension}`);
      }

      // PDFs go through the text extractor; txt/xml are read as UTF-8 text.
      const text = fileExtension === 'pdf'
        ? await this.extractTextFromPDF(filePath)
        : fs.readFileSync(filePath, 'utf8');

      if (!text || text.trim().length === 0) {
        console.log('⚠️  No text extracted from file');
        return buildFailureResult('No text could be extracted from file');
      }

      // Extract document fields and detect type
      const [detectedType, fields, detectedPedimento, detectedPedimentoYear] =
        extractDocumentFields(text, fileExtension, filePath);

      // Compose arela_path for pedimento_simplificado documents
      const arelaPath = composeArelaPath(detectedType, fields, detectedPedimentoYear, filePath);

      return {
        detectedType,
        fields,
        detectedPedimento,
        detectedPedimentoYear,
        arelaPath,
        text,
        error: null
      };

    } catch (error) {
      console.error(`❌ Error detecting file ${filePath}:`, error.message);
      return buildFailureResult(error.message);
    }
  }

  /**
   * Extract text from PDF file
   * @param {string} filePath - Path to PDF file
   * @returns {Promise<string>} - Extracted text
   * @throws {Error} "Failed to extract text from PDF: ..." with the original
   *   error attached as `cause`.
   */
  async extractTextFromPDF(filePath) {
    try {
      const buffer = fs.readFileSync(filePath);
      // NOTE(review): a Buffer is passed together with type 'file'; kept as-is —
      // confirm against the office-text-extractor docs whether 'buffer' is intended.
      const text = await extractor.extractText({
        input: buffer,
        type: 'file'
      });
      return text;
    } catch (error) {
      console.error(`Error extracting text from PDF ${filePath}:`, error.message);
      // Keep the original error reachable for callers that inspect `cause`.
      throw new Error(`Failed to extract text from PDF: ${error.message}`, { cause: error });
    }
  }

  /**
   * Run detection on several files, one after another.
   * @param {Array<string>} filePaths - Array of file paths to analyze
   * @returns {Promise<Array>} - One entry per input path, in input order:
   *   the detectFile result plus the `filePath` it belongs to
   */
  async detectFiles(filePaths) {
    const results = [];

    for (const filePath of filePaths) {
      const result = await this.detectFile(filePath);
      results.push({
        filePath,
        ...result
      });
    }

    return results;
  }

  /**
   * Check if file type is supported for detection
   * @param {string} filePath - Path to file
   * @returns {boolean} - True if file type is supported
   */
  isSupportedFileType(filePath) {
    return SUPPORTED_EXTENSIONS.includes(getFileExtension(filePath));
  }

  /**
   * Filter files to only include supported types
   * @param {Array<string>} filePaths - Array of file paths
   * @returns {Array<string>} - Filtered array of supported file paths
   */
  filterSupportedFiles(filePaths) {
    return filePaths.filter((filePath) => this.isSupportedFileType(filePath));
  }
}
192
+
193
+ export default FileDetectionService;
194
+ export { composeArelaPath };