@arela/uploader 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +97 -7
- package/commands.md +6 -0
- package/package.json +1 -1
- package/src/document-type-shared.js +22 -8
- package/src/document-types/pedimento-simplificado.js +11 -29
- package/src/file-detection.js +44 -29
- package/src/index.js +821 -225
package/README.md
CHANGED
|
@@ -2,6 +2,71 @@
|
|
|
2
2
|
|
|
3
3
|
CLI tool to upload files and directories to Arela API or Supabase Storage with automatic file processing, detection, and organization.
|
|
4
4
|
|
|
5
|
+
## 🚀 OPTIMIZED 4-PHASE WORKFLOW
|
|
6
|
+
|
|
7
|
+
**New in v0.2.0**: The tool now supports an optimized 4-phase workflow designed for maximum performance when processing large file collections:
|
|
8
|
+
|
|
9
|
+
### Phase 1: Filesystem Stats Collection 📊
|
|
10
|
+
```bash
|
|
11
|
+
arela --stats-only
|
|
12
|
+
```
|
|
13
|
+
- ⚡ **ULTRA FAST**: Only reads filesystem metadata (no file content)
|
|
14
|
+
- 📈 **Bulk database operations**: Processes 1000+ files per batch
|
|
15
|
+
- 🔄 **Upsert optimization**: Handles duplicates efficiently
|
|
16
|
+
- 💾 **Minimal memory usage**: No file content loading
|
|
17
|
+
|
|
18
|
+
### Phase 2: PDF Detection 🔍
|
|
19
|
+
```bash
|
|
20
|
+
arela --detect-pdfs
|
|
21
|
+
```
|
|
22
|
+
- 🎯 **Targeted processing**: Only processes PDF files from database
|
|
23
|
+
- 📄 **Pedimento-simplificado detection**: Extracts RFC, pedimento numbers, and metadata
|
|
24
|
+
- 🔄 **Batched processing**: Handles large datasets efficiently
|
|
25
|
+
- 📊 **Progress tracking**: Real-time detection statistics
|
|
26
|
+
|
|
27
|
+
### Phase 3: Path Propagation 📁
|
|
28
|
+
```bash
|
|
29
|
+
arela --propagate-arela-path
|
|
30
|
+
```
|
|
31
|
+
- 🎯 **Smart path copying**: Propagates arela_path from pedimento documents to related files
|
|
32
|
+
- 📦 **Batch updates**: Processes files in groups for optimal database performance
|
|
33
|
+
- 🔗 **Relationship mapping**: Links supporting documents to their pedimento
|
|
34
|
+
|
|
35
|
+
### Phase 4: RFC-based Upload 🚀
|
|
36
|
+
```bash
|
|
37
|
+
arela --upload-by-rfc
|
|
38
|
+
```
|
|
39
|
+
- 🎯 **Targeted uploads**: Only uploads files for specified RFCs
|
|
40
|
+
- 📋 **Supporting documents**: Includes all related files, not just pedimentos
|
|
41
|
+
- 🏗️ **Structure preservation**: Maintains proper folder hierarchy
|
|
42
|
+
|
|
43
|
+
### Combined Workflow 🎯
|
|
44
|
+
```bash
|
|
45
|
+
# Run all 4 phases in sequence (recommended)
|
|
46
|
+
arela --run-all-phases
|
|
47
|
+
|
|
48
|
+
# Or run phases individually for more control
|
|
49
|
+
arela --stats-only # Phase 1: Collect filesystem stats
|
|
50
|
+
arela --detect-pdfs # Phase 2: Detect pedimento documents
|
|
51
|
+
arela --propagate-arela-path # Phase 3: Propagate paths to related files
|
|
52
|
+
arela --upload-by-rfc # Phase 4: Upload by RFC
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Performance Benefits
|
|
56
|
+
|
|
57
|
+
**Before optimization** (single phase with detection):
|
|
58
|
+
- 🐌 Read every file for detection
|
|
59
|
+
- 💾 High memory usage
|
|
60
|
+
- 🔄 Slow database operations
|
|
61
|
+
- ❌ Process unsupported files
|
|
62
|
+
|
|
63
|
+
**After optimization** (4-phase approach):
|
|
64
|
+
- ⚡ **10x faster**: Phase 1 only reads filesystem metadata
|
|
65
|
+
- 📊 **Bulk operations**: Database inserts up to 1000 records per batch
|
|
66
|
+
- 🎯 **Targeted processing**: Phase 2 only processes PDFs needing detection
|
|
67
|
+
- 💾 **Memory efficient**: No unnecessary file content loading
|
|
68
|
+
- 🔄 **Optimized I/O**: Separates filesystem, database, and network operations
|
|
69
|
+
|
|
5
70
|
## Features
|
|
6
71
|
|
|
7
72
|
- 📁 Upload entire directories or individual files
|
|
@@ -18,6 +83,7 @@ CLI tool to upload files and directories to Arela API or Supabase Storage with a
|
|
|
18
83
|
- 🔧 **Performance optimizations with caching**
|
|
19
84
|
- 📋 **Upload files by specific RFC values**
|
|
20
85
|
- 🔍 **Propagate arela_path from pedimento documents to related files**
|
|
86
|
+
- ⚡ **4-Phase optimized workflow for maximum performance**
|
|
21
87
|
|
|
22
88
|
## Installation
|
|
23
89
|
|
|
@@ -27,7 +93,22 @@ npm install -g @arela/uploader
|
|
|
27
93
|
|
|
28
94
|
## Usage
|
|
29
95
|
|
|
30
|
-
###
|
|
96
|
+
### 🚀 Optimized 4-Phase Workflow (Recommended)
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
# Run all phases automatically (most efficient)
|
|
100
|
+
arela --run-all-phases --batch-size 20
|
|
101
|
+
|
|
102
|
+
# Or run phases individually for fine-grained control
|
|
103
|
+
arela --stats-only # Phase 1: Filesystem stats only
|
|
104
|
+
arela --detect-pdfs --batch-size 10 # Phase 2: PDF detection
|
|
105
|
+
arela --propagate-arela-path # Phase 3: Path propagation
|
|
106
|
+
arela --upload-by-rfc --batch-size 5 # Phase 4: RFC-based upload
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Traditional Single-Phase Upload (Legacy)
|
|
110
|
+
|
|
111
|
+
#### Basic Upload with Auto-Processing (API Mode)
|
|
31
112
|
```bash
|
|
32
113
|
arela --batch-size 10 -c 5
|
|
33
114
|
```
|
|
@@ -88,10 +169,21 @@ arela --client-path "/client/documents" --batch-size 10 -c 5
|
|
|
88
169
|
|
|
89
170
|
### Options
|
|
90
171
|
|
|
91
|
-
|
|
92
|
-
-
|
|
172
|
+
#### Phase Control
|
|
173
|
+
- `--stats-only`: **Phase 1** - Only collect filesystem stats (no file reading)
|
|
174
|
+
- `--detect-pdfs`: **Phase 2** - Process PDF files for pedimento-simplificado detection
|
|
175
|
+
- `--propagate-arela-path`: **Phase 3** - Propagate arela_path from pedimento records to related files
|
|
176
|
+
- `--upload-by-rfc`: **Phase 4** - Upload files based on RFC values from the `UPLOAD_RFCS` environment variable
|
|
177
|
+
- `--run-all-phases`: **All Phases** - Run complete optimized workflow
|
|
178
|
+
|
|
179
|
+
#### Performance & Configuration
|
|
93
180
|
- `-c, --concurrency <number>`: Files per batch for processing (default: 10)
|
|
94
181
|
- `--batch-size <number>`: API batch size (default: 10)
|
|
182
|
+
- `--show-stats`: Show detailed processing statistics
|
|
183
|
+
|
|
184
|
+
#### Upload Configuration
|
|
185
|
+
- `-p, --prefix <prefix>`: Prefix path in bucket (default: "")
|
|
186
|
+
- `-b, --bucket <bucket>`: Bucket name override
|
|
95
187
|
- `--force-supabase`: Force direct Supabase upload (skip API)
|
|
96
188
|
- `--no-auto-detect`: Disable automatic file detection (API mode only)
|
|
97
189
|
- `--no-auto-organize`: Disable automatic file organization (API mode only)
|
|
@@ -99,11 +191,9 @@ arela --client-path "/client/documents" --batch-size 10 -c 5
|
|
|
99
191
|
- `--folder-structure <structure>`: **Custom folder structure** (e.g., "2024/4023260" or "cliente1/pedimentos")
|
|
100
192
|
- `--auto-detect-structure`: **Automatically detect year/pedimento from file paths**
|
|
101
193
|
- `--client-path <path>`: Client path for metadata tracking
|
|
102
|
-
|
|
194
|
+
|
|
195
|
+
#### Legacy Options
|
|
103
196
|
- `--no-detect`: Disable document type detection in stats-only mode
|
|
104
|
-
- `--propagate-arela-path`: Propagate arela_path from pedimento_simplificado records to related files
|
|
105
|
-
- `--upload-by-rfc`: Upload files to Arela API based on RFC values from UPLOAD_RFCS environment variable
|
|
106
|
-
- `--show-stats`: Show detailed processing statistics
|
|
107
197
|
- `-v, --version`: Display version number
|
|
108
198
|
- `-h, --help`: Display help information
|
|
109
199
|
|
package/commands.md
ADDED
package/package.json
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
// Import all document type definitions
|
|
2
|
+
import { pedimentoSimplificadoDefinition } from './document-types/pedimento-simplificado.js';
|
|
3
|
+
|
|
1
4
|
// Document type definitions and extraction utilities
|
|
2
5
|
// Ported from TypeScript to JavaScript for Node.js
|
|
3
6
|
|
|
@@ -10,7 +13,14 @@ export class FieldResult {
|
|
|
10
13
|
}
|
|
11
14
|
|
|
12
15
|
export class DocumentTypeDefinition {
|
|
13
|
-
constructor(
|
|
16
|
+
constructor(
|
|
17
|
+
type,
|
|
18
|
+
extensions,
|
|
19
|
+
match,
|
|
20
|
+
extractors,
|
|
21
|
+
extractNumPedimento,
|
|
22
|
+
extractPedimentoYear,
|
|
23
|
+
) {
|
|
14
24
|
this.type = type;
|
|
15
25
|
this.extensions = extensions;
|
|
16
26
|
this.match = match;
|
|
@@ -20,9 +30,6 @@ export class DocumentTypeDefinition {
|
|
|
20
30
|
}
|
|
21
31
|
}
|
|
22
32
|
|
|
23
|
-
// Import all document type definitions
|
|
24
|
-
import { pedimentoSimplificadoDefinition } from './document-types/pedimento-simplificado.js';
|
|
25
|
-
|
|
26
33
|
// Registry of all document types
|
|
27
34
|
const documentTypes = [
|
|
28
35
|
pedimentoSimplificadoDefinition,
|
|
@@ -44,14 +51,17 @@ export function extractDocumentFields(source, fileExtension, filePath) {
|
|
|
44
51
|
// Try to match against each document type
|
|
45
52
|
for (const docType of documentTypes) {
|
|
46
53
|
// Check if file extension matches
|
|
47
|
-
if (
|
|
54
|
+
if (
|
|
55
|
+
fileExtension &&
|
|
56
|
+
!docType.extensions.includes(fileExtension.toLowerCase())
|
|
57
|
+
) {
|
|
48
58
|
continue;
|
|
49
59
|
}
|
|
50
60
|
|
|
51
61
|
// Test if content matches this document type
|
|
52
62
|
if (docType.match(source)) {
|
|
53
63
|
console.log(`✅ Matched document type: ${docType.type}`);
|
|
54
|
-
|
|
64
|
+
|
|
55
65
|
// Extract all fields
|
|
56
66
|
const fields = [];
|
|
57
67
|
for (const extractor of docType.extractors) {
|
|
@@ -68,8 +78,12 @@ export function extractDocumentFields(source, fileExtension, filePath) {
|
|
|
68
78
|
}
|
|
69
79
|
|
|
70
80
|
// Extract pedimento number and year
|
|
71
|
-
const pedimento = docType.extractNumPedimento
|
|
72
|
-
|
|
81
|
+
const pedimento = docType.extractNumPedimento
|
|
82
|
+
? docType.extractNumPedimento(source, fields)
|
|
83
|
+
: null;
|
|
84
|
+
const year = docType.extractPedimentoYear
|
|
85
|
+
? docType.extractPedimentoYear(source, fields)
|
|
86
|
+
: null;
|
|
73
87
|
|
|
74
88
|
return [docType.type, fields, pedimento, year];
|
|
75
89
|
}
|
|
@@ -33,7 +33,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
33
33
|
return new FieldResult(
|
|
34
34
|
'numPedimento',
|
|
35
35
|
!!match,
|
|
36
|
-
match ? match[0].replace(/\s/g, '') : null
|
|
36
|
+
match ? match[0].replace(/\s/g, '') : null,
|
|
37
37
|
);
|
|
38
38
|
},
|
|
39
39
|
},
|
|
@@ -50,7 +50,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
50
50
|
return new FieldResult(
|
|
51
51
|
'tipoOperacion',
|
|
52
52
|
!!match,
|
|
53
|
-
match ? match[1] : null
|
|
53
|
+
match ? match[1] : null,
|
|
54
54
|
);
|
|
55
55
|
},
|
|
56
56
|
},
|
|
@@ -67,7 +67,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
67
67
|
return new FieldResult(
|
|
68
68
|
'clavePedimento',
|
|
69
69
|
!!match,
|
|
70
|
-
match ? match[1] : null
|
|
70
|
+
match ? match[1] : null,
|
|
71
71
|
);
|
|
72
72
|
},
|
|
73
73
|
},
|
|
@@ -83,7 +83,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
83
83
|
return new FieldResult(
|
|
84
84
|
'aduanaEntradaSalida',
|
|
85
85
|
!!match,
|
|
86
|
-
match ? match[1] : null
|
|
86
|
+
match ? match[1] : null,
|
|
87
87
|
);
|
|
88
88
|
},
|
|
89
89
|
},
|
|
@@ -93,11 +93,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
93
93
|
field: 'rfc',
|
|
94
94
|
extract: (source) => {
|
|
95
95
|
const match = source.match(/\n\s*([A-Z0-9]{12,13})\s*\n/);
|
|
96
|
-
return new FieldResult(
|
|
97
|
-
'rfc',
|
|
98
|
-
!!match,
|
|
99
|
-
match ? match[1] : null
|
|
100
|
-
);
|
|
96
|
+
return new FieldResult('rfc', !!match, match ? match[1] : null);
|
|
101
97
|
},
|
|
102
98
|
},
|
|
103
99
|
|
|
@@ -112,9 +108,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
112
108
|
.filter((l) => l.length > 0);
|
|
113
109
|
|
|
114
110
|
// 2) find the index of an RFC line (12–13 alnum chars)
|
|
115
|
-
const rfcIndex = lines.findIndex((l) =>
|
|
116
|
-
/^[A-Z0-9]{12,13}$/.test(l),
|
|
117
|
-
);
|
|
111
|
+
const rfcIndex = lines.findIndex((l) => /^[A-Z0-9]{12,13}$/.test(l));
|
|
118
112
|
let code = null;
|
|
119
113
|
|
|
120
114
|
// 3) if next line exists and is exactly 8 alnum chars, that's the code
|
|
@@ -122,11 +116,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
122
116
|
code = lines[rfcIndex + 1];
|
|
123
117
|
}
|
|
124
118
|
|
|
125
|
-
return new FieldResult(
|
|
126
|
-
'codigoAceptacion',
|
|
127
|
-
code !== null,
|
|
128
|
-
code
|
|
129
|
-
);
|
|
119
|
+
return new FieldResult('codigoAceptacion', code !== null, code);
|
|
130
120
|
},
|
|
131
121
|
},
|
|
132
122
|
|
|
@@ -175,11 +165,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
175
165
|
if (!match) {
|
|
176
166
|
match = source.match(/PRESENTACION:\s*(\d{2}\/\d{2}\/\d{4})/);
|
|
177
167
|
}
|
|
178
|
-
return new FieldResult(
|
|
179
|
-
'paymentDate',
|
|
180
|
-
!!match,
|
|
181
|
-
match ? match[1] : null
|
|
182
|
-
);
|
|
168
|
+
return new FieldResult('paymentDate', !!match, match ? match[1] : null);
|
|
183
169
|
},
|
|
184
170
|
},
|
|
185
171
|
|
|
@@ -224,11 +210,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
224
210
|
extract: (source) => {
|
|
225
211
|
// Look for the peso bruto value with decimal format
|
|
226
212
|
const match = source.match(/(\d+\.\d+)\d{3}/);
|
|
227
|
-
return new FieldResult(
|
|
228
|
-
'pesoBruto',
|
|
229
|
-
!!match,
|
|
230
|
-
match ? match[1] : null
|
|
231
|
-
);
|
|
213
|
+
return new FieldResult('pesoBruto', !!match, match ? match[1] : null);
|
|
232
214
|
},
|
|
233
215
|
},
|
|
234
216
|
|
|
@@ -268,7 +250,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
268
250
|
return new FieldResult(
|
|
269
251
|
'numeroOperacionBancaria',
|
|
270
252
|
!!match,
|
|
271
|
-
match ? match[1] : null
|
|
253
|
+
match ? match[1] : null,
|
|
272
254
|
);
|
|
273
255
|
},
|
|
274
256
|
},
|
|
@@ -281,7 +263,7 @@ export const pedimentoSimplificadoDefinition = {
|
|
|
281
263
|
return new FieldResult(
|
|
282
264
|
'numeroTransaccionSAT',
|
|
283
265
|
!!match,
|
|
284
|
-
match ? match[1] : null
|
|
266
|
+
match ? match[1] : null,
|
|
285
267
|
);
|
|
286
268
|
},
|
|
287
269
|
},
|
package/src/file-detection.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import fs from 'fs';
|
|
2
|
-
import path from 'path';
|
|
3
2
|
import { getTextExtractor } from 'office-text-extractor';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
|
|
4
5
|
import { extractDocumentFields } from './document-type-shared.js';
|
|
5
6
|
|
|
6
7
|
const extractor = getTextExtractor();
|
|
@@ -10,15 +11,20 @@ const extractor = getTextExtractor();
|
|
|
10
11
|
* Format: RFC/Year/Patente/Aduana/Pedimento/
|
|
11
12
|
* Example: PED781129JT6/2023/3429/07/3019796/
|
|
12
13
|
*/
|
|
13
|
-
function composeArelaPath(
|
|
14
|
+
function composeArelaPath(
|
|
15
|
+
detectedType,
|
|
16
|
+
fields,
|
|
17
|
+
detectedPedimentoYear,
|
|
18
|
+
filePath,
|
|
19
|
+
) {
|
|
14
20
|
if (detectedType !== 'pedimento_simplificado') {
|
|
15
21
|
return null;
|
|
16
22
|
}
|
|
17
23
|
|
|
18
|
-
const rfc = fields?.find(f => f.name === 'rfc')?.value;
|
|
19
|
-
const patente = fields?.find(f => f.name === 'patente')?.value;
|
|
20
|
-
const aduana = fields?.find(f => f.name === 'aduanaEntradaSalida')?.value;
|
|
21
|
-
const pedimento = fields?.find(f => f.name === 'numPedimento')?.value;
|
|
24
|
+
const rfc = fields?.find((f) => f.name === 'rfc')?.value;
|
|
25
|
+
const patente = fields?.find((f) => f.name === 'patente')?.value;
|
|
26
|
+
const aduana = fields?.find((f) => f.name === 'aduanaEntradaSalida')?.value;
|
|
27
|
+
const pedimento = fields?.find((f) => f.name === 'numPedimento')?.value;
|
|
22
28
|
const year = detectedPedimentoYear;
|
|
23
29
|
|
|
24
30
|
// All components are required for a valid arela_path
|
|
@@ -28,17 +34,17 @@ function composeArelaPath(detectedType, fields, detectedPedimentoYear, filePath)
|
|
|
28
34
|
year: !!year,
|
|
29
35
|
patente: !!patente,
|
|
30
36
|
aduana: !!aduana,
|
|
31
|
-
pedimento: !!pedimento
|
|
37
|
+
pedimento: !!pedimento,
|
|
32
38
|
});
|
|
33
39
|
return null;
|
|
34
40
|
}
|
|
35
41
|
|
|
36
42
|
// Ensure aduana is padded to 2 digits if needed (07 instead of 7)
|
|
37
43
|
const aduanaFormatted = aduana.toString().padStart(2, '0');
|
|
38
|
-
|
|
44
|
+
|
|
39
45
|
// arela_path should be the folder structure only, without filename
|
|
40
46
|
const arelaPath = `${rfc}/${year}/${patente}/${aduanaFormatted}/${pedimento}/`;
|
|
41
|
-
|
|
47
|
+
|
|
42
48
|
console.log(`✅ Composed arela_path: ${arelaPath}`);
|
|
43
49
|
return arelaPath;
|
|
44
50
|
}
|
|
@@ -48,7 +54,6 @@ function composeArelaPath(detectedType, fields, detectedPedimentoYear, filePath)
|
|
|
48
54
|
* Detects document types and extracts metadata from files
|
|
49
55
|
*/
|
|
50
56
|
export class FileDetectionService {
|
|
51
|
-
|
|
52
57
|
/**
|
|
53
58
|
* Detect document type from a file
|
|
54
59
|
* @param {string} filePath - Path to the file to analyze
|
|
@@ -56,13 +61,16 @@ export class FileDetectionService {
|
|
|
56
61
|
*/
|
|
57
62
|
async detectFile(filePath) {
|
|
58
63
|
try {
|
|
59
|
-
const fileExtension = path
|
|
64
|
+
const fileExtension = path
|
|
65
|
+
.extname(filePath)
|
|
66
|
+
.toLowerCase()
|
|
67
|
+
.replace('.', '');
|
|
60
68
|
const fileName = path.basename(filePath);
|
|
61
|
-
|
|
69
|
+
|
|
62
70
|
console.log(`🔍 Analyzing file: ${fileName} (${fileExtension})`);
|
|
63
71
|
|
|
64
72
|
let text = '';
|
|
65
|
-
|
|
73
|
+
|
|
66
74
|
// Extract text based on file type
|
|
67
75
|
switch (fileExtension) {
|
|
68
76
|
case 'pdf':
|
|
@@ -83,7 +91,7 @@ export class FileDetectionService {
|
|
|
83
91
|
detectedPedimentoYear: null,
|
|
84
92
|
arelaPath: null,
|
|
85
93
|
text: '',
|
|
86
|
-
error: `Unsupported file type: ${fileExtension}
|
|
94
|
+
error: `Unsupported file type: ${fileExtension}`,
|
|
87
95
|
};
|
|
88
96
|
}
|
|
89
97
|
|
|
@@ -96,16 +104,21 @@ export class FileDetectionService {
|
|
|
96
104
|
detectedPedimentoYear: null,
|
|
97
105
|
arelaPath: null,
|
|
98
106
|
text: '',
|
|
99
|
-
error: 'No text could be extracted from file'
|
|
107
|
+
error: 'No text could be extracted from file',
|
|
100
108
|
};
|
|
101
109
|
}
|
|
102
110
|
|
|
103
111
|
// Extract document fields and detect type
|
|
104
|
-
const [detectedType, fields, detectedPedimento, detectedPedimentoYear] =
|
|
112
|
+
const [detectedType, fields, detectedPedimento, detectedPedimentoYear] =
|
|
105
113
|
extractDocumentFields(text, fileExtension, filePath);
|
|
106
114
|
|
|
107
115
|
// Compose arela_path for pedimento_simplificado documents
|
|
108
|
-
const arelaPath = composeArelaPath(
|
|
116
|
+
const arelaPath = composeArelaPath(
|
|
117
|
+
detectedType,
|
|
118
|
+
fields,
|
|
119
|
+
detectedPedimentoYear,
|
|
120
|
+
filePath,
|
|
121
|
+
);
|
|
109
122
|
|
|
110
123
|
return {
|
|
111
124
|
detectedType,
|
|
@@ -114,9 +127,8 @@ export class FileDetectionService {
|
|
|
114
127
|
detectedPedimentoYear,
|
|
115
128
|
arelaPath,
|
|
116
129
|
text,
|
|
117
|
-
error: null
|
|
130
|
+
error: null,
|
|
118
131
|
};
|
|
119
|
-
|
|
120
132
|
} catch (error) {
|
|
121
133
|
console.error(`❌ Error detecting file ${filePath}:`, error.message);
|
|
122
134
|
return {
|
|
@@ -126,7 +138,7 @@ export class FileDetectionService {
|
|
|
126
138
|
detectedPedimentoYear: null,
|
|
127
139
|
arelaPath: null,
|
|
128
140
|
text: '',
|
|
129
|
-
error: error.message
|
|
141
|
+
error: error.message,
|
|
130
142
|
};
|
|
131
143
|
}
|
|
132
144
|
}
|
|
@@ -139,13 +151,16 @@ export class FileDetectionService {
|
|
|
139
151
|
async extractTextFromPDF(filePath) {
|
|
140
152
|
try {
|
|
141
153
|
const buffer = fs.readFileSync(filePath);
|
|
142
|
-
const text = await extractor.extractText({
|
|
143
|
-
input: buffer,
|
|
144
|
-
type: 'file'
|
|
154
|
+
const text = await extractor.extractText({
|
|
155
|
+
input: buffer,
|
|
156
|
+
type: 'file',
|
|
145
157
|
});
|
|
146
158
|
return text;
|
|
147
159
|
} catch (error) {
|
|
148
|
-
console.error(
|
|
160
|
+
console.error(
|
|
161
|
+
`Error extracting text from PDF ${filePath}:`,
|
|
162
|
+
error.message,
|
|
163
|
+
);
|
|
149
164
|
throw new Error(`Failed to extract text from PDF: ${error.message}`);
|
|
150
165
|
}
|
|
151
166
|
}
|
|
@@ -157,15 +172,15 @@ export class FileDetectionService {
|
|
|
157
172
|
*/
|
|
158
173
|
async detectFiles(filePaths) {
|
|
159
174
|
const results = [];
|
|
160
|
-
|
|
175
|
+
|
|
161
176
|
for (const filePath of filePaths) {
|
|
162
177
|
const result = await this.detectFile(filePath);
|
|
163
178
|
results.push({
|
|
164
179
|
filePath,
|
|
165
|
-
...result
|
|
180
|
+
...result,
|
|
166
181
|
});
|
|
167
182
|
}
|
|
168
|
-
|
|
183
|
+
|
|
169
184
|
return results;
|
|
170
185
|
}
|
|
171
186
|
|
|
@@ -176,7 +191,7 @@ export class FileDetectionService {
|
|
|
176
191
|
*/
|
|
177
192
|
isSupportedFileType(filePath) {
|
|
178
193
|
const fileExtension = path.extname(filePath).toLowerCase().replace('.', '');
|
|
179
|
-
const supportedExtensions = ['pdf'
|
|
194
|
+
const supportedExtensions = ['pdf'];
|
|
180
195
|
return supportedExtensions.includes(fileExtension);
|
|
181
196
|
}
|
|
182
197
|
|
|
@@ -186,7 +201,7 @@ export class FileDetectionService {
|
|
|
186
201
|
* @returns {Array<string>} - Filtered array of supported file paths
|
|
187
202
|
*/
|
|
188
203
|
filterSupportedFiles(filePaths) {
|
|
189
|
-
return filePaths.filter(filePath => this.isSupportedFileType(filePath));
|
|
204
|
+
return filePaths.filter((filePath) => this.isSupportedFileType(filePath));
|
|
190
205
|
}
|
|
191
206
|
}
|
|
192
207
|
|