@arela/uploader 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/document-type-shared.js +22 -8
- package/src/document-types/pedimento-simplificado.js +11 -29
- package/src/file-detection.js +43 -28
- package/src/index.js +375 -199
- package/OPTIMIZATION_SUMMARY.md +0 -154
- package/PERFORMANCE_OPTIMIZATIONS.md +0 -270
package/package.json
CHANGED
package/src/document-type-shared.js
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
// Import all document type definitions
|
|
2
|
+
import { pedimentoSimplificadoDefinition } from './document-types/pedimento-simplificado.js';
|
|
3
|
+
|
|
1
4
|
// Document type definitions and extraction utilities
|
|
2
5
|
// Ported from TypeScript to JavaScript for Node.js
|
|
3
6
|
|
|
@@ -10,7 +13,14 @@ export class FieldResult {
|
|
|
10
13
|
}
|
|
11
14
|
|
|
12
15
|
export class DocumentTypeDefinition {
|
|
13
|
-
constructor(
|
|
16
|
+
constructor(
|
|
17
|
+
type,
|
|
18
|
+
extensions,
|
|
19
|
+
match,
|
|
20
|
+
extractors,
|
|
21
|
+
extractNumPedimento,
|
|
22
|
+
extractPedimentoYear,
|
|
23
|
+
) {
|
|
14
24
|
this.type = type;
|
|
15
25
|
this.extensions = extensions;
|
|
16
26
|
this.match = match;
|
|
@@ -20,9 +30,6 @@ export class DocumentTypeDefinition {
|
|
|
20
30
|
}
|
|
21
31
|
}
|
|
22
32
|
|
|
23
|
-
// Import all document type definitions
|
|
24
|
-
import { pedimentoSimplificadoDefinition } from './document-types/pedimento-simplificado.js';
|
|
25
|
-
|
|
26
33
|
// Registry of all document types
|
|
27
34
|
const documentTypes = [
|
|
28
35
|
pedimentoSimplificadoDefinition,
|
|
@@ -44,14 +51,17 @@ export function extractDocumentFields(source, fileExtension, filePath) {
|
|
|
44
51
|
// Try to match against each document type
|
|
45
52
|
for (const docType of documentTypes) {
|
|
46
53
|
// Check if file extension matches
|
|
47
|
-
if (
|
|
54
|
+
if (
|
|
55
|
+
fileExtension &&
|
|
56
|
+
!docType.extensions.includes(fileExtension.toLowerCase())
|
|
57
|
+
) {
|
|
48
58
|
continue;
|
|
49
59
|
}
|
|
50
60
|
|
|
51
61
|
// Test if content matches this document type
|
|
52
62
|
if (docType.match(source)) {
|
|
53
63
|
console.log(`✅ Matched document type: ${docType.type}`);
|
|
54
|
-
|
|
64
|
+
|
|
55
65
|
// Extract all fields
|
|
56
66
|
const fields = [];
|
|
57
67
|
for (const extractor of docType.extractors) {
|
|
@@ -68,8 +78,12 @@ export function extractDocumentFields(source, fileExtension, filePath) {
|
|
|
68
78
|
}
|
|
69
79
|
|
|
70
80
|
// Extract pedimento number and year
|
|
71
|
-
const pedimento = docType.extractNumPedimento
|
|
72
|
-
|
|
81
|
+
const pedimento = docType.extractNumPedimento
|
|
82
|
+
? docType.extractNumPedimento(source, fields)
|
|
83
|
+
: null;
|
|
84
|
+
const year = docType.extractPedimentoYear
|
|
85
|
+
? docType.extractPedimentoYear(source, fields)
|
|
86
|
+
: null;
|
|
73
87
|
|
|
74
88
|
return [docType.type, fields, pedimento, year];
|
|
75
89
|
}
|
|
package/src/document-types/pedimento-simplificado.js
CHANGED
|
@@ -33,7 +33,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
33
33
|
return new FieldResult(
|
|
34
34
|
'numPedimento',
|
|
35
35
|
!!match,
|
|
36
|
-
match ? match[0].replace(/\s/g, '') : null
|
|
36
|
+
match ? match[0].replace(/\s/g, '') : null,
|
|
37
37
|
);
|
|
38
38
|
},
|
|
39
39
|
},
|
|
@@ -50,7 +50,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
50
50
|
return new FieldResult(
|
|
51
51
|
'tipoOperacion',
|
|
52
52
|
!!match,
|
|
53
|
-
match ? match[1] : null
|
|
53
|
+
match ? match[1] : null,
|
|
54
54
|
);
|
|
55
55
|
},
|
|
56
56
|
},
|
|
@@ -67,7 +67,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
67
67
|
return new FieldResult(
|
|
68
68
|
'clavePedimento',
|
|
69
69
|
!!match,
|
|
70
|
-
match ? match[1] : null
|
|
70
|
+
match ? match[1] : null,
|
|
71
71
|
);
|
|
72
72
|
},
|
|
73
73
|
},
|
|
@@ -83,7 +83,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
83
83
|
return new FieldResult(
|
|
84
84
|
'aduanaEntradaSalida',
|
|
85
85
|
!!match,
|
|
86
|
-
match ? match[1] : null
|
|
86
|
+
match ? match[1] : null,
|
|
87
87
|
);
|
|
88
88
|
},
|
|
89
89
|
},
|
|
@@ -93,11 +93,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
93
93
|
field: 'rfc',
|
|
94
94
|
extract: (source) => {
|
|
95
95
|
const match = source.match(/\n\s*([A-Z0-9]{12,13})\s*\n/);
|
|
96
|
-
return new FieldResult(
|
|
97
|
-
'rfc',
|
|
98
|
-
!!match,
|
|
99
|
-
match ? match[1] : null
|
|
100
|
-
);
|
|
96
|
+
return new FieldResult('rfc', !!match, match ? match[1] : null);
|
|
101
97
|
},
|
|
102
98
|
},
|
|
103
99
|
|
|
@@ -112,9 +108,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
112
108
|
.filter((l) => l.length > 0);
|
|
113
109
|
|
|
114
110
|
// 2) find the index of an RFC line (12–13 alnum chars)
|
|
115
|
-
const rfcIndex = lines.findIndex((l) =>
|
|
116
|
-
/^[A-Z0-9]{12,13}$/.test(l),
|
|
117
|
-
);
|
|
111
|
+
const rfcIndex = lines.findIndex((l) => /^[A-Z0-9]{12,13}$/.test(l));
|
|
118
112
|
let code = null;
|
|
119
113
|
|
|
120
114
|
// 3) if next line exists and is exactly 8 alnum chars, that's the code
|
|
@@ -122,11 +116,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
122
116
|
code = lines[rfcIndex + 1];
|
|
123
117
|
}
|
|
124
118
|
|
|
125
|
-
return new FieldResult(
|
|
126
|
-
'codigoAceptacion',
|
|
127
|
-
code !== null,
|
|
128
|
-
code
|
|
129
|
-
);
|
|
119
|
+
return new FieldResult('codigoAceptacion', code !== null, code);
|
|
130
120
|
},
|
|
131
121
|
},
|
|
132
122
|
|
|
@@ -175,11 +165,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
175
165
|
if (!match) {
|
|
176
166
|
match = source.match(/PRESENTACION:\s*(\d{2}\/\d{2}\/\d{4})/);
|
|
177
167
|
}
|
|
178
|
-
return new FieldResult(
|
|
179
|
-
'paymentDate',
|
|
180
|
-
!!match,
|
|
181
|
-
match ? match[1] : null
|
|
182
|
-
);
|
|
168
|
+
return new FieldResult('paymentDate', !!match, match ? match[1] : null);
|
|
183
169
|
},
|
|
184
170
|
},
|
|
185
171
|
|
|
@@ -224,11 +210,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
224
210
|
extract: (source) => {
|
|
225
211
|
// Look for the peso bruto value with decimal format
|
|
226
212
|
const match = source.match(/(\d+\.\d+)\d{3}/);
|
|
227
|
-
return new FieldResult(
|
|
228
|
-
'pesoBruto',
|
|
229
|
-
!!match,
|
|
230
|
-
match ? match[1] : null
|
|
231
|
-
);
|
|
213
|
+
return new FieldResult('pesoBruto', !!match, match ? match[1] : null);
|
|
232
214
|
},
|
|
233
215
|
},
|
|
234
216
|
|
|
@@ -268,7 +250,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
268
250
|
return new FieldResult(
|
|
269
251
|
'numeroOperacionBancaria',
|
|
270
252
|
!!match,
|
|
271
|
-
match ? match[1] : null
|
|
253
|
+
match ? match[1] : null,
|
|
272
254
|
);
|
|
273
255
|
},
|
|
274
256
|
},
|
|
@@ -281,7 +263,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
281
263
|
return new FieldResult(
|
|
282
264
|
'numeroTransaccionSAT',
|
|
283
265
|
!!match,
|
|
284
|
-
match ? match[1] : null
|
|
266
|
+
match ? match[1] : null,
|
|
285
267
|
);
|
|
286
268
|
},
|
|
287
269
|
},
|
package/src/file-detection.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import fs from 'fs';
|
|
2
|
-
import path from 'path';
|
|
3
2
|
import { getTextExtractor } from 'office-text-extractor';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
|
|
4
5
|
import { extractDocumentFields } from './document-type-shared.js';
|
|
5
6
|
|
|
6
7
|
const extractor = getTextExtractor();
|
|
@@ -10,15 +11,20 @@ const extractor = getTextExtractor();
|
|
|
10
11
|
* Format: RFC/Year/Patente/Aduana/Pedimento/
|
|
11
12
|
* Example: PED781129JT6/2023/3429/07/3019796/
|
|
12
13
|
*/
|
|
13
|
-
function composeArelaPath(detectedType, fields, detectedPedimentoYear, filePath) {
|
|
14
|
+
function composeArelaPath(
|
|
15
|
+
detectedType,
|
|
16
|
+
fields,
|
|
17
|
+
detectedPedimentoYear,
|
|
18
|
+
filePath,
|
|
19
|
+
) {
|
|
14
20
|
if (detectedType !== 'pedimento_simplificado') {
|
|
15
21
|
return null;
|
|
16
22
|
}
|
|
17
23
|
|
|
18
|
-
const rfc = fields?.find(f => f.name === 'rfc')?.value;
|
|
19
|
-
const patente = fields?.find(f => f.name === 'patente')?.value;
|
|
20
|
-
const aduana = fields?.find(f => f.name === 'aduanaEntradaSalida')?.value;
|
|
21
|
-
const pedimento = fields?.find(f => f.name === 'numPedimento')?.value;
|
|
24
|
+
const rfc = fields?.find((f) => f.name === 'rfc')?.value;
|
|
25
|
+
const patente = fields?.find((f) => f.name === 'patente')?.value;
|
|
26
|
+
const aduana = fields?.find((f) => f.name === 'aduanaEntradaSalida')?.value;
|
|
27
|
+
const pedimento = fields?.find((f) => f.name === 'numPedimento')?.value;
|
|
22
28
|
const year = detectedPedimentoYear;
|
|
23
29
|
|
|
24
30
|
// All components are required for a valid arela_path
|
|
@@ -28,17 +34,17 @@ function composeArelaPath(detectedType, fields, detectedPedimentoYear, filePath)
|
|
|
28
34
|
year: !!year,
|
|
29
35
|
patente: !!patente,
|
|
30
36
|
aduana: !!aduana,
|
|
31
|
-
pedimento: !!pedimento
|
|
37
|
+
pedimento: !!pedimento,
|
|
32
38
|
});
|
|
33
39
|
return null;
|
|
34
40
|
}
|
|
35
41
|
|
|
36
42
|
// Ensure aduana is padded to 2 digits if needed (07 instead of 7)
|
|
37
43
|
const aduanaFormatted = aduana.toString().padStart(2, '0');
|
|
38
|
-
|
|
44
|
+
|
|
39
45
|
// arela_path should be the folder structure only, without filename
|
|
40
46
|
const arelaPath = `${rfc}/${year}/${patente}/${aduanaFormatted}/${pedimento}/`;
|
|
41
|
-
|
|
47
|
+
|
|
42
48
|
console.log(`✅ Composed arela_path: ${arelaPath}`);
|
|
43
49
|
return arelaPath;
|
|
44
50
|
}
|
|
@@ -48,7 +54,6 @@ function composeArelaPath(detectedType, fields, detectedPedimentoYear, filePath)
|
|
|
48
54
|
* Detects document types and extracts metadata from files
|
|
49
55
|
*/
|
|
50
56
|
export class FileDetectionService {
|
|
51
|
-
|
|
52
57
|
/**
|
|
53
58
|
* Detect document type from a file
|
|
54
59
|
* @param {string} filePath - Path to the file to analyze
|
|
@@ -56,13 +61,16 @@ export class FileDetectionService {
|
|
|
56
61
|
*/
|
|
57
62
|
async detectFile(filePath) {
|
|
58
63
|
try {
|
|
59
|
-
const fileExtension = path
|
|
64
|
+
const fileExtension = path
|
|
65
|
+
.extname(filePath)
|
|
66
|
+
.toLowerCase()
|
|
67
|
+
.replace('.', '');
|
|
60
68
|
const fileName = path.basename(filePath);
|
|
61
|
-
|
|
69
|
+
|
|
62
70
|
console.log(`🔍 Analyzing file: ${fileName} (${fileExtension})`);
|
|
63
71
|
|
|
64
72
|
let text = '';
|
|
65
|
-
|
|
73
|
+
|
|
66
74
|
// Extract text based on file type
|
|
67
75
|
switch (fileExtension) {
|
|
68
76
|
case 'pdf':
|
|
@@ -83,7 +91,7 @@ export class FileDetectionService {
|
|
|
83
91
|
detectedPedimentoYear: null,
|
|
84
92
|
arelaPath: null,
|
|
85
93
|
text: '',
|
|
86
|
-
error: `Unsupported file type: ${fileExtension}`
|
|
94
|
+
error: `Unsupported file type: ${fileExtension}`,
|
|
87
95
|
};
|
|
88
96
|
}
|
|
89
97
|
|
|
@@ -96,16 +104,21 @@ export class FileDetectionService {
|
|
|
96
104
|
detectedPedimentoYear: null,
|
|
97
105
|
arelaPath: null,
|
|
98
106
|
text: '',
|
|
99
|
-
error: 'No text could be extracted from file'
|
|
107
|
+
error: 'No text could be extracted from file',
|
|
100
108
|
};
|
|
101
109
|
}
|
|
102
110
|
|
|
103
111
|
// Extract document fields and detect type
|
|
104
|
-
const [detectedType, fields, detectedPedimento, detectedPedimentoYear] =
|
|
112
|
+
const [detectedType, fields, detectedPedimento, detectedPedimentoYear] =
|
|
105
113
|
extractDocumentFields(text, fileExtension, filePath);
|
|
106
114
|
|
|
107
115
|
// Compose arela_path for pedimento_simplificado documents
|
|
108
|
-
const arelaPath = composeArelaPath(
|
|
116
|
+
const arelaPath = composeArelaPath(
|
|
117
|
+
detectedType,
|
|
118
|
+
fields,
|
|
119
|
+
detectedPedimentoYear,
|
|
120
|
+
filePath,
|
|
121
|
+
);
|
|
109
122
|
|
|
110
123
|
return {
|
|
111
124
|
detectedType,
|
|
@@ -114,9 +127,8 @@ export class FileDetectionService {
|
|
|
114
127
|
detectedPedimentoYear,
|
|
115
128
|
arelaPath,
|
|
116
129
|
text,
|
|
117
|
-
error: null
|
|
130
|
+
error: null,
|
|
118
131
|
};
|
|
119
|
-
|
|
120
132
|
} catch (error) {
|
|
121
133
|
console.error(`❌ Error detecting file ${filePath}:`, error.message);
|
|
122
134
|
return {
|
|
@@ -126,7 +138,7 @@ export class FileDetectionService {
|
|
|
126
138
|
detectedPedimentoYear: null,
|
|
127
139
|
arelaPath: null,
|
|
128
140
|
text: '',
|
|
129
|
-
error: error.message
|
|
141
|
+
error: error.message,
|
|
130
142
|
};
|
|
131
143
|
}
|
|
132
144
|
}
|
|
@@ -139,13 +151,16 @@ export class FileDetectionService {
|
|
|
139
151
|
async extractTextFromPDF(filePath) {
|
|
140
152
|
try {
|
|
141
153
|
const buffer = fs.readFileSync(filePath);
|
|
142
|
-
const text = await extractor.extractText({
|
|
143
|
-
input: buffer,
|
|
144
|
-
type: 'file'
|
|
154
|
+
const text = await extractor.extractText({
|
|
155
|
+
input: buffer,
|
|
156
|
+
type: 'file',
|
|
145
157
|
});
|
|
146
158
|
return text;
|
|
147
159
|
} catch (error) {
|
|
148
|
-
console.error(
|
|
160
|
+
console.error(
|
|
161
|
+
`Error extracting text from PDF ${filePath}:`,
|
|
162
|
+
error.message,
|
|
163
|
+
);
|
|
149
164
|
throw new Error(`Failed to extract text from PDF: ${error.message}`);
|
|
150
165
|
}
|
|
151
166
|
}
|
|
@@ -157,15 +172,15 @@ export class FileDetectionService {
|
|
|
157
172
|
*/
|
|
158
173
|
async detectFiles(filePaths) {
|
|
159
174
|
const results = [];
|
|
160
|
-
|
|
175
|
+
|
|
161
176
|
for (const filePath of filePaths) {
|
|
162
177
|
const result = await this.detectFile(filePath);
|
|
163
178
|
results.push({
|
|
164
179
|
filePath,
|
|
165
|
-
...result
|
|
180
|
+
...result,
|
|
166
181
|
});
|
|
167
182
|
}
|
|
168
|
-
|
|
183
|
+
|
|
169
184
|
return results;
|
|
170
185
|
}
|
|
171
186
|
|
|
@@ -186,7 +201,7 @@ export class FileDetectionService {
|
|
|
186
201
|
* @returns {Array<string>} - Filtered array of supported file paths
|
|
187
202
|
*/
|
|
188
203
|
filterSupportedFiles(filePaths) {
|
|
189
|
-
return filePaths.filter(filePath => this.isSupportedFileType(filePath));
|
|
204
|
+
return filePaths.filter((filePath) => this.isSupportedFileType(filePath));
|
|
190
205
|
}
|
|
191
206
|
}
|
|
192
207
|
|