@arela/uploader 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.template +20 -0
- package/README.md +322 -24
- package/arela-upload.log +0 -0
- package/package.json +12 -9
- package/src/document-type-shared.js +80 -0
- package/src/document-types/pedimento-simplificado.js +289 -0
- package/src/file-detection.js +194 -0
- package/src/index.js +1353 -593
package/.env.template
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Test environment configuration for arela-uploader
|
|
2
|
+
# Copy this to .env and update with your actual values
|
|
3
|
+
|
|
4
|
+
# Supabase Configuration
|
|
5
|
+
SUPABASE_URL=https://your-project.supabase.co
|
|
6
|
+
SUPABASE_KEY=your-supabase-anon-key
|
|
7
|
+
SUPABASE_BUCKET=your-bucket-name
|
|
8
|
+
|
|
9
|
+
# Arela API Configuration
|
|
10
|
+
ARELA_API_URL=https://your-arela-api-url.com
|
|
11
|
+
ARELA_API_TOKEN=your-api-token
|
|
12
|
+
|
|
13
|
+
# Upload Configuration
|
|
14
|
+
UPLOAD_BASE_PATH=/Users/your-username/documents
|
|
15
|
+
UPLOAD_SOURCES=folder1|folder2|folder3
|
|
16
|
+
|
|
17
|
+
# RFC Upload Configuration
|
|
18
|
+
# Pipe-separated list of RFCs to upload files for
|
|
19
|
+
# Example: MMJ0810145N1|ABC1234567XY|DEF9876543ZZ
|
|
20
|
+
UPLOAD_RFCS=RFC1|RFC2|RFC3
|
package/README.md
CHANGED
|
@@ -1,15 +1,23 @@
|
|
|
1
1
|
# arela-uploader
|
|
2
2
|
|
|
3
|
-
CLI tool to upload files and directories to Supabase Storage with automatic file
|
|
3
|
+
CLI tool to upload files and directories to Arela API or Supabase Storage with automatic file processing, detection, and organization.
|
|
4
4
|
|
|
5
5
|
## Features
|
|
6
6
|
|
|
7
7
|
- 📁 Upload entire directories or individual files
|
|
8
|
+
- 🤖 **Automatic file detection and organization** (API mode)
|
|
9
|
+
- 🗂️ **Smart year/pedimento auto-detection from file paths**
|
|
10
|
+
- 🏗️ **Custom folder structure support**
|
|
8
11
|
- 🔄 Automatic file renaming to handle problematic characters
|
|
9
12
|
- 📝 Comprehensive logging (local and remote)
|
|
10
13
|
- ⚡ Retry mechanism for failed uploads
|
|
11
14
|
- 🎯 Skip duplicate files automatically
|
|
12
15
|
- 📊 Progress bars and detailed summaries
|
|
16
|
+
- 📂 **Preserve directory structure with auto-organization**
|
|
17
|
+
- 🚀 **Batch processing with configurable concurrency**
|
|
18
|
+
- 🔧 **Performance optimizations with caching**
|
|
19
|
+
- 📋 **Upload files by specific RFC values**
|
|
20
|
+
- 🔍 **Propagate arela_path from pedimento documents to related files**
|
|
13
21
|
|
|
14
22
|
## Installation
|
|
15
23
|
|
|
@@ -19,51 +27,274 @@ npm install -g @arela/uploader
|
|
|
19
27
|
|
|
20
28
|
## Usage
|
|
21
29
|
|
|
22
|
-
### Basic Upload
|
|
30
|
+
### Basic Upload with Auto-Processing (API Mode)
|
|
23
31
|
```bash
|
|
24
|
-
arela -
|
|
32
|
+
arela --batch-size 10 -c 5
|
|
25
33
|
```
|
|
26
34
|
|
|
27
|
-
### Upload with
|
|
28
|
-
|
|
35
|
+
### Upload with Auto-Detection of Year/Pedimento
|
|
36
|
+
```bash
|
|
37
|
+
arela --auto-detect-structure --batch-size 10 -c 5
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Upload with Custom Folder Structure
|
|
41
|
+
```bash
|
|
42
|
+
arela --folder-structure "2024/4023260" --batch-size 10 -c 5
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Upload with Directory Structure Preservation
|
|
46
|
+
```bash
|
|
47
|
+
arela --batch-size 10 -c 5 --preserve-structure
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Upload to Supabase Directly (Skip API)
|
|
51
|
+
```bash
|
|
52
|
+
arela --force-supabase -p "my-folder"
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Upload Files by Specific RFC Values
|
|
56
|
+
```bash
|
|
57
|
+
# Upload all files associated with specific RFCs
|
|
58
|
+
arela --upload-by-rfc --batch-size 5
|
|
59
|
+
|
|
60
|
+
# Upload RFC files with custom folder prefix
|
|
61
|
+
arela --upload-by-rfc --folder-structure "palco" --batch-size 5
|
|
62
|
+
|
|
63
|
+
# Upload RFC files with nested folder structure
|
|
64
|
+
arela --upload-by-rfc --folder-structure "2024/client1/pedimentos" --batch-size 5
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Propagate Arela Path from Pedimentos to Related Files
|
|
68
|
+
```bash
|
|
69
|
+
# Copy arela_path from pedimento_simplificado records to related files
|
|
70
|
+
arela --propagate-arela-path
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Stats-Only Mode (No Upload)
|
|
74
|
+
```bash
|
|
75
|
+
# Only process file stats and insert to database, don't upload
|
|
76
|
+
arela --stats-only --folder-structure "2023/3019796"
|
|
77
|
+
```
|
|
29
78
|
|
|
79
|
+
### Upload with Performance Statistics
|
|
30
80
|
```bash
|
|
31
|
-
|
|
32
|
-
|
|
81
|
+
arela --batch-size 10 -c 5 --show-stats
|
|
82
|
+
```
|
|
33
83
|
|
|
34
|
-
|
|
35
|
-
|
|
84
|
+
### Upload with Client Path Tracking
|
|
85
|
+
```bash
|
|
86
|
+
arela --client-path "/client/documents" --batch-size 10 -c 5
|
|
36
87
|
```
|
|
37
88
|
|
|
38
89
|
### Options
|
|
39
90
|
|
|
40
91
|
- `-p, --prefix <prefix>`: Prefix path in bucket (default: "")
|
|
41
|
-
- `-
|
|
42
|
-
-
|
|
43
|
-
-
|
|
92
|
+
- `-b, --bucket <bucket>`: Bucket name override
|
|
93
|
+
- `-c, --concurrency <number>`: Files per batch for processing (default: 10)
|
|
94
|
+
- `--batch-size <number>`: API batch size (default: 10)
|
|
95
|
+
- `--force-supabase`: Force direct Supabase upload (skip API)
|
|
96
|
+
- `--no-auto-detect`: Disable automatic file detection (API mode only)
|
|
97
|
+
- `--no-auto-organize`: Disable automatic file organization (API mode only)
|
|
98
|
+
- `--preserve-structure`: **Preserve original directory structure when using auto-organize**
|
|
99
|
+
- `--folder-structure <structure>`: **Custom folder structure** (e.g., "2024/4023260" or "cliente1/pedimentos")
|
|
100
|
+
- `--auto-detect-structure`: **Automatically detect year/pedimento from file paths**
|
|
101
|
+
- `--client-path <path>`: Client path for metadata tracking
|
|
102
|
+
- `--stats-only`: Only read file stats and insert to uploader table, skip file upload
|
|
103
|
+
- `--no-detect`: Disable document type detection in stats-only mode
|
|
104
|
+
- `--propagate-arela-path`: Propagate arela_path from pedimento_simplificado records to related files
|
|
105
|
+
- `--upload-by-rfc`: Upload files to Arela API based on RFC values from UPLOAD_RFCS environment variable
|
|
106
|
+
- `--show-stats`: Show detailed processing statistics
|
|
44
107
|
- `-v, --version`: Display version number
|
|
108
|
+
- `-h, --help`: Display help information
|
|
45
109
|
|
|
46
110
|
## Environment Variables
|
|
47
111
|
|
|
48
112
|
Create a `.env` file in your project root:
|
|
49
113
|
|
|
50
114
|
```env
|
|
115
|
+
# For API Mode (recommended)
|
|
116
|
+
ARELA_API_URL=http://localhost:3010
|
|
117
|
+
ARELA_API_TOKEN=your_api_token
|
|
118
|
+
|
|
119
|
+
# For Direct Supabase Mode (fallback)
|
|
51
120
|
SUPABASE_URL=your_supabase_url
|
|
52
121
|
SUPABASE_KEY=your_supabase_anon_key
|
|
53
122
|
SUPABASE_BUCKET=your_bucket_name
|
|
123
|
+
|
|
124
|
+
# Required for both modes
|
|
54
125
|
UPLOAD_BASE_PATH=/path/to/your/files
|
|
55
126
|
UPLOAD_SOURCES=folder1|folder2|file.pdf
|
|
127
|
+
|
|
128
|
+
# RFC-based Upload Configuration
|
|
129
|
+
# Pipe-separated list of RFCs to upload files for
|
|
130
|
+
UPLOAD_RFCS=MMJ0810145N1|ABC1234567XY|DEF9876543ZZ
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
**Environment Variable Details:**
|
|
134
|
+
|
|
135
|
+
- `ARELA_API_URL`: Base URL for the Arela API service
|
|
136
|
+
- `ARELA_API_TOKEN`: Authentication token for API access
|
|
137
|
+
- `SUPABASE_URL`: Your Supabase project URL
|
|
138
|
+
- `SUPABASE_KEY`: Supabase anonymous key for direct uploads
|
|
139
|
+
- `SUPABASE_BUCKET`: Target bucket name in Supabase Storage
|
|
140
|
+
- `UPLOAD_BASE_PATH`: Root directory containing files to upload
|
|
141
|
+
- `UPLOAD_SOURCES`: Pipe-separated list of folders/files to process
|
|
142
|
+
- `UPLOAD_RFCS`: Pipe-separated list of RFC values for targeted uploads
|
|
143
|
+
|
|
144
|
+
## RFC-Based File Upload
|
|
145
|
+
|
|
146
|
+
The `--upload-by-rfc` feature allows you to upload files to the Arela API based on specific RFC values. This is useful when you want to upload only files associated with certain companies or entities.
|
|
147
|
+
|
|
148
|
+
### How it works:
|
|
149
|
+
|
|
150
|
+
1. **Configure RFCs**: Set the `UPLOAD_RFCS` environment variable with pipe-separated RFC values
|
|
151
|
+
2. **Query Database**: The tool searches the Supabase database for files matching the specified RFCs
|
|
152
|
+
3. **Include Supporting Documents**: Finds all files sharing the same `arela_path` as the RFC matches (not just the pedimento files)
|
|
153
|
+
4. **Apply Folder Structure**: Optionally applies custom folder prefix using `--folder-structure`
|
|
154
|
+
5. **Group and Upload**: Files are grouped by their final destination path and uploaded with proper structure
|
|
155
|
+
|
|
156
|
+
### Folder Structure Options:
|
|
157
|
+
|
|
158
|
+
**Default Behavior** (no `--folder-structure`):
|
|
159
|
+
- Uses original `arela_path`: `CAD890407NK7/2023/3429/070/230734293000421/`
|
|
160
|
+
|
|
161
|
+
**With Custom Prefix** (`--folder-structure "palco"`):
|
|
162
|
+
- Results in: `palco/CAD890407NK7/2023/3429/070/230734293000421/`
|
|
163
|
+
|
|
164
|
+
**With Nested Prefix** (`--folder-structure "2024/client1/pedimentos"`):
|
|
165
|
+
- Results in: `2024/client1/pedimentos/CAD890407NK7/2023/3429/070/230734293000421/`
|
|
166
|
+
|
|
167
|
+
### Prerequisites:
|
|
168
|
+
|
|
169
|
+
- Files must have been previously processed (have entries in the `uploader` table)
|
|
170
|
+
- Files must have `rfc` field populated (from document detection)
|
|
171
|
+
- Files must have `arela_path` populated (from pedimento processing)
|
|
172
|
+
- Original files must still exist at their `original_path` locations
|
|
173
|
+
|
|
174
|
+
### Example:
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
# Set RFCs in environment
|
|
178
|
+
export UPLOAD_RFCS="MMJ0810145N1|ABC1234567XY|DEF9876543ZZ"
|
|
179
|
+
|
|
180
|
+
# Upload files for these RFCs (original folder structure)
|
|
181
|
+
arela --upload-by-rfc --batch-size 5 --show-stats
|
|
182
|
+
|
|
183
|
+
# Upload with custom folder prefix
|
|
184
|
+
arela --upload-by-rfc --folder-structure "palco" --batch-size 10
|
|
185
|
+
|
|
186
|
+
# Upload with nested organization
|
|
187
|
+
arela --upload-by-rfc --folder-structure "2024/Q1/processed" --batch-size 15
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
The tool will:
|
|
191
|
+
- Find all database records matching the specified RFCs
|
|
192
|
+
- Include ALL supporting documents that share the same `arela_path`
|
|
193
|
+
- Apply the optional folder structure prefix if specified
|
|
194
|
+
- Group files by their final destination folder structure
|
|
195
|
+
- Upload each group maintaining the correct Arela folder hierarchy
|
|
196
|
+
- Provide detailed progress and summary statistics
|
|
197
|
+
- Handle large datasets with automatic pagination (no 1000-file limit)
|
|
198
|
+
|
|
199
|
+
## File Processing Modes
|
|
200
|
+
|
|
201
|
+
### API Mode (Default)
|
|
202
|
+
When `ARELA_API_URL` and `ARELA_API_TOKEN` are configured:
|
|
203
|
+
- ✅ Automatic file detection and classification
|
|
204
|
+
- ✅ Intelligent file organization
|
|
205
|
+
- ✅ **Smart year/pedimento auto-detection from paths**
|
|
206
|
+
- ✅ **Custom folder structure support**
|
|
207
|
+
- ✅ Batch processing with progress tracking
|
|
208
|
+
- ✅ Advanced error handling and retry logic
|
|
209
|
+
- ✅ **Performance optimizations with file sanitization caching**
|
|
210
|
+
|
|
211
|
+
### Auto-Detection Features
|
|
212
|
+
The tool can automatically detect year and pedimento numbers from file paths using multiple patterns:
|
|
213
|
+
|
|
214
|
+
**Pattern 1: Direct Structure**
|
|
215
|
+
```
|
|
216
|
+
/path/to/2024/4023260/file.pdf
|
|
217
|
+
/path/to/pedimentos/2024/4023260/file.pdf
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
**Pattern 2: Named Patterns**
|
|
221
|
+
```
|
|
222
|
+
/path/to/docs/año2024/ped4023260/file.pdf
|
|
223
|
+
/path/to/files/year2024/pedimento4023260/file.pdf
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
**Pattern 3: Loose Detection**
|
|
227
|
+
- Year: Any 4-digit number starting with "202" (2020-2029)
|
|
228
|
+
- Pedimento: Any 4-8 consecutive digits in path
|
|
229
|
+
|
|
230
|
+
Use `--auto-detect-structure` to enable automatic detection:
|
|
231
|
+
```bash
|
|
232
|
+
arela --auto-detect-structure --batch-size 10
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
### Custom Folder Structure
|
|
236
|
+
Specify a custom organization pattern:
|
|
237
|
+
```bash
|
|
238
|
+
# Static structure
|
|
239
|
+
arela --folder-structure "2024/4023260" --batch-size 10
|
|
240
|
+
|
|
241
|
+
# Client-based structure
|
|
242
|
+
arela --folder-structure "cliente1/pedimentos" --batch-size 10
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
### Directory Structure Preservation
|
|
246
|
+
Use `--preserve-structure` to maintain your original folder structure even with auto-organization:
|
|
247
|
+
|
|
248
|
+
```bash
|
|
249
|
+
# Without --preserve-structure
|
|
250
|
+
# Files organized by API: bucket/filename.pdf
|
|
251
|
+
|
|
252
|
+
# With --preserve-structure
|
|
253
|
+
# Files keep structure: bucket/2024/4023260/filename.pdf
|
|
254
|
+
arela --preserve-structure --batch-size 10
|
|
56
255
|
```
|
|
57
256
|
|
|
58
|
-
|
|
257
|
+
### Supabase Direct Mode (Fallback)
|
|
258
|
+
When the API is unavailable or `--force-supabase` is used:
|
|
259
|
+
- ✅ Direct upload to Supabase Storage
|
|
260
|
+
- ✅ File sanitization and renaming
|
|
261
|
+
- ✅ Basic progress tracking
|
|
262
|
+
- ✅ **Optimized sanitization with pre-compiled regex patterns**
|
|
263
|
+
- ✅ **Performance caching for file name sanitization**
|
|
59
264
|
|
|
60
|
-
|
|
265
|
+
## Performance Features
|
|
61
266
|
|
|
62
|
-
|
|
63
|
-
-
|
|
64
|
-
-
|
|
65
|
-
-
|
|
66
|
-
|
|
267
|
+
### Database Pagination
|
|
268
|
+
- **No Upload Limits**: Handles datasets larger than 1000 files through automatic pagination
|
|
269
|
+
- **Efficient Querying**: Uses Supabase `.range()` method to fetch data in batches
|
|
270
|
+
- **Memory Optimization**: Processes large datasets without memory overflow
|
|
271
|
+
|
|
272
|
+
### File Processing
|
|
273
|
+
- **Pre-compiled Regex**: Sanitization patterns are compiled once for optimal performance
|
|
274
|
+
- **Caching System**: File name sanitization results are cached to avoid re-processing
|
|
275
|
+
- **Batch Processing**: Configurable batch sizes for optimal upload throughput
|
|
276
|
+
|
|
277
|
+
### RFC Upload Optimizations
|
|
278
|
+
- **Smart Querying**: Three-step query process to efficiently find related files
|
|
279
|
+
- **Supporting Document Inclusion**: Automatically includes all related documents, not just pedimentos
|
|
280
|
+
- **Path Concatenation**: Efficiently combines custom folder structures with arela_paths
|
|
281
|
+
|
|
282
|
+
## File Sanitization
|
|
283
|
+
|
|
284
|
+
The tool automatically handles problematic characters using advanced sanitization:
|
|
285
|
+
|
|
286
|
+
**Character Replacements:**
|
|
287
|
+
- **Accents**: á→a, é→e, í→i, ó→o, ú→u, ñ→n, ç→c
|
|
288
|
+
- **Korean characters**: 멕→meok, 시→si, 코→ko, 용→yong, others→kr
|
|
289
|
+
- **Special symbols**: &→and, {}[]~^|"<>?*: →-
|
|
290
|
+
- **Email symbols**: @→(removed), spaces→-
|
|
291
|
+
- **Multiple dashes**: collapsed to single dash
|
|
292
|
+
- **Leading/trailing**: dashes and dots removed
|
|
293
|
+
|
|
294
|
+
**Performance Features:**
|
|
295
|
+
- Pre-compiled regex patterns for faster processing
|
|
296
|
+
- Sanitization result caching to avoid re-processing
|
|
297
|
+
- Unicode normalization (NFD) for consistent handling
|
|
67
298
|
|
|
68
299
|
### Examples
|
|
69
300
|
|
|
@@ -73,12 +304,79 @@ The tool automatically handles problematic characters by:
|
|
|
73
304
|
| `File{with}brackets.pdf` | `File-with-brackets.pdf` |
|
|
74
305
|
| `Document ^& symbols.pdf` | `Document-and-symbols.pdf` |
|
|
75
306
|
| `CI & PL-20221212(멕시코용).xls` | `CI-and-PL-20221212.xls` |
|
|
307
|
+
| `impresora@nereprint.com_file.xml` | `impresoranereprint.com_file.xml` |
|
|
308
|
+
| `07-3429-3000430 HC.pdf` | `07-3429-3000430-HC.pdf` |
|
|
309
|
+
| `FACTURA IN 3000430.pdf` | `FACTURA-IN-3000430.pdf` |
|
|
76
310
|
|
|
77
|
-
## Logging
|
|
311
|
+
## Logging and Monitoring
|
|
78
312
|
|
|
79
|
-
The tool maintains logs both locally
|
|
313
|
+
The tool maintains comprehensive logs both locally and remotely:
|
|
80
314
|
|
|
81
|
-
|
|
82
|
-
-
|
|
315
|
+
**Local Logging (`arela-upload.log`):**
|
|
316
|
+
- Upload status (SUCCESS/ERROR/SKIPPED/SANITIZED)
|
|
317
|
+
- File paths and sanitization changes
|
|
83
318
|
- Error messages and timestamps
|
|
84
|
-
- Rename operations
|
|
319
|
+
- Rename operations with before/after names
|
|
320
|
+
- Processing statistics and performance metrics
|
|
321
|
+
|
|
322
|
+
**Log Entry Examples:**
|
|
323
|
+
```
|
|
324
|
+
[2025-09-04T01:17:00.141Z] SUCCESS: /Users/.../file.xml -> 2023/2003180/file.xml
|
|
325
|
+
[2025-09-04T01:17:00.822Z] SANITIZED: file name.pdf → file-name.pdf
|
|
326
|
+
[2025-09-04T01:17:00.856Z] SKIPPED: /Users/.../duplicate.pdf (already exists)
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
**Remote Logging:**
|
|
330
|
+
- Integration with Supabase database for centralized logging
|
|
331
|
+
- Upload tracking and audit trails
|
|
332
|
+
- Error reporting and monitoring
|
|
333
|
+
|
|
334
|
+
## Performance Features
|
|
335
|
+
|
|
336
|
+
**Version 0.2.0 introduces several performance optimizations:**
|
|
337
|
+
|
|
338
|
+
- **Pre-compiled Regex Patterns**: Sanitization patterns are compiled once and reused
|
|
339
|
+
- **Sanitization Caching**: File name sanitization results are cached to avoid reprocessing
|
|
340
|
+
- **Batch Processing**: Configurable batch sizes for optimal API usage
|
|
341
|
+
- **Concurrent Processing**: Adjustable concurrency levels for file processing
|
|
342
|
+
- **Smart Skip Logic**: Efficiently skips already processed files using log analysis
|
|
343
|
+
- **Memory Optimization**: Large file outputs are truncated to prevent memory issues
|
|
344
|
+
|
|
345
|
+
## Version History
|
|
346
|
+
|
|
347
|
+
**v0.2.0** - Latest Release
|
|
348
|
+
- ✨ Added smart year/pedimento auto-detection from file paths
|
|
349
|
+
- ✨ Custom folder structure support with `--folder-structure` option
|
|
350
|
+
- ✨ Client path tracking with `--client-path` option
|
|
351
|
+
- ✨ Performance optimizations with regex pre-compilation
|
|
352
|
+
- ✨ Sanitization result caching for improved speed
|
|
353
|
+
- ✨ Enhanced file sanitization with Korean character support
|
|
354
|
+
- ✨ Improved email character handling in file names
|
|
355
|
+
- ✨ Better error handling and logging
|
|
356
|
+
- 📝 Comprehensive logging with SANITIZED status
|
|
357
|
+
- 🔧 Memory optimization for large file processing
|
|
358
|
+
|
|
359
|
+
## Troubleshooting
|
|
360
|
+
|
|
361
|
+
**Connection Issues:**
|
|
362
|
+
- Verify `ARELA_API_URL` and `ARELA_API_TOKEN` are correct
|
|
363
|
+
- Check network connectivity to the API endpoint
|
|
364
|
+
- The tool will automatically fall back to Supabase direct mode if the API is unavailable
|
|
365
|
+
|
|
366
|
+
**Performance Issues:**
|
|
367
|
+
- Adjust `--batch-size` for optimal API performance (default: 10)
|
|
368
|
+
- Modify `--concurrency` to control parallel processing (default: 10)
|
|
369
|
+
- Use `--show-stats` to monitor sanitization cache performance
|
|
370
|
+
|
|
371
|
+
**File Issues:**
|
|
372
|
+
- Check file permissions in `UPLOAD_BASE_PATH`
|
|
373
|
+
- Verify `UPLOAD_SOURCES` paths exist and are accessible
|
|
374
|
+
- Review `arela-upload.log` for detailed error information
|
|
375
|
+
|
|
376
|
+
## Contributing
|
|
377
|
+
|
|
378
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
379
|
+
|
|
380
|
+
## License
|
|
381
|
+
|
|
382
|
+
ISC License - see LICENSE file for details.
|
package/arela-upload.log
ADDED
|
File without changes
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@arela/uploader",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"description": "CLI to upload files/directories to Arela",
|
|
5
5
|
"bin": {
|
|
6
6
|
"arela": "./src/index.js"
|
|
@@ -28,15 +28,18 @@
|
|
|
28
28
|
},
|
|
29
29
|
"homepage": "https://github.com/inspiraCode/arela-uploader#readme",
|
|
30
30
|
"dependencies": {
|
|
31
|
-
"@supabase/supabase-js": "
|
|
32
|
-
"cli-progress": "
|
|
33
|
-
"commander": "
|
|
34
|
-
"dotenv": "
|
|
35
|
-
"
|
|
36
|
-
"
|
|
31
|
+
"@supabase/supabase-js": "2.49.4",
|
|
32
|
+
"cli-progress": "3.12.0",
|
|
33
|
+
"commander": "13.1.0",
|
|
34
|
+
"dotenv": "16.5.0",
|
|
35
|
+
"form-data": "4.0.4",
|
|
36
|
+
"globby": "14.1.0",
|
|
37
|
+
"mime-types": "3.0.1",
|
|
38
|
+
"node-fetch": "3.3.2",
|
|
39
|
+
"office-text-extractor": "3.0.3"
|
|
37
40
|
},
|
|
38
41
|
"devDependencies": {
|
|
39
|
-
"@trivago/prettier-plugin-sort-imports": "
|
|
40
|
-
"prettier": "
|
|
42
|
+
"@trivago/prettier-plugin-sort-imports": "5.2.2",
|
|
43
|
+
"prettier": "3.5.3"
|
|
41
44
|
}
|
|
42
45
|
}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
// Document type definitions and extraction utilities
|
|
2
|
+
// Ported from TypeScript to JavaScript for Node.js
|
|
3
|
+
|
|
4
|
+
/**
 * Outcome of running one field extractor against a document's text.
 *
 * Instances are plain records with three own properties, assigned in the
 * order `name`, `found`, `value`.
 */
export class FieldResult {
  /**
   * @param {*} name - Identifier of the field; stored unchanged on `this.name`.
   * @param {*} found - Whether the field was located; stored unchanged on
   *   `this.found`.
   * @param {*} value - Extracted value; stored unchanged on `this.value`.
   */
  constructor(name, found, value) {
    // Shorthand object keeps the same property order as three assignments.
    Object.assign(this, { name, found, value });
  }
}
|
|
11
|
+
|
|
12
|
+
/**
 * Describes one recognizable document type: how to identify it and how to
 * pull data out of it.
 *
 * The constructor only stores its arguments; nothing is validated or called.
 */
export class DocumentTypeDefinition {
  /**
   * @param {*} type - Label of the document type; stored on `this.type`.
   * @param {*} extensions - File extensions this type applies to; stored on
   *   `this.extensions`.
   * @param {*} match - Predicate deciding whether a text is of this type;
   *   stored on `this.match`.
   * @param {*} extractors - Field extractors for this type; stored on
   *   `this.extractors`.
   * @param {*} extractNumPedimento - Callback yielding the pedimento number;
   *   stored on `this.extractNumPedimento`.
   * @param {*} extractPedimentoYear - Callback yielding the pedimento year;
   *   stored on `this.extractPedimentoYear`.
   */
  constructor(type, extensions, match, extractors, extractNumPedimento, extractPedimentoYear) {
    // Property order mirrors the parameter order.
    Object.assign(this, {
      type,
      extensions,
      match,
      extractors,
      extractNumPedimento,
      extractPedimentoYear,
    });
  }
}
|
|
22
|
+
|
|
23
|
+
// Import all document type definitions
|
|
24
|
+
import { pedimentoSimplificadoDefinition } from './document-types/pedimento-simplificado.js';
|
|
25
|
+
|
|
26
|
+
// Registry of all document types
|
|
27
|
+
// Ordered list of every known document type definition.
// Add more document types here as needed.
const documentTypes = [pedimentoSimplificadoDefinition];
|
|
31
|
+
|
|
32
|
+
/**
 * Detect the document type of a text and extract its fields.
 *
 * Registered document types are tried in registry order. A type is skipped
 * when a file extension is given and is not listed in the type's
 * `extensions`; otherwise the first type whose `match(source)` is truthy is
 * used, all of its extractors are run, and the function returns.
 *
 * @param {string} source - The text content to analyze.
 * @param {string} [fileExtension] - File extension used to pre-filter document
 *   types. It is lower-cased before being looked up in each type's
 *   `extensions`. When falsy, no extension filtering is applied.
 * @param {string} [filePath] - File path for context. Currently unused.
 * @returns {[string|null, FieldResult[], string|null, number|null]}
 *   `[detectedType, fields, pedimento, year]`. `fields` holds exactly one
 *   entry per extractor of the matched type. The result is
 *   `[null, [], null, null]` when `source` is empty or not a string, or when
 *   no document type matches.
 */
export function extractDocumentFields(source, fileExtension, filePath) {
  if (!source || typeof source !== 'string') {
    return [null, [], null, null];
  }

  // Try to match against each document type
  for (const docType of documentTypes) {
    // Check if file extension matches
    if (fileExtension && !docType.extensions.includes(fileExtension.toLowerCase())) {
      continue;
    }

    // Test if content matches this document type
    if (!docType.match(source)) {
      continue;
    }

    console.log(`✅ Matched document type: ${docType.type}`);

    // Extract all fields
    const fields = [];
    for (const extractor of docType.extractors) {
      // Decide on the entry first and push once afterwards. Pushing before
      // inspecting the result could leave a bad value in `fields` and then
      // add a second fallback entry when the inspection threw.
      let entry = null;
      try {
        const result = extractor.extract(source);
        if (result == null) {
          // Routed through the catch below so it is reported like any other
          // extractor failure.
          throw new TypeError('extractor returned no result');
        }
        if (result.found) {
          console.log(`   - ${result.name}: ${result.value}`);
        }
        entry = result;
      } catch (error) {
        console.error(`Error extracting field ${extractor.field}:`, error);
      }

      if (entry === null) {
        entry = new FieldResult(extractor.field, false, null);
      }
      fields.push(entry);
    }

    // Extract pedimento number and year
    const pedimento = docType.extractNumPedimento ? docType.extractNumPedimento(source, fields) : null;
    const year = docType.extractPedimentoYear ? docType.extractPedimentoYear(source, fields) : null;

    return [docType.type, fields, pedimento, year];
  }

  console.log('❓ No document type matched');
  return [null, [], null, null];
}
|