@arela/uploader 0.1.0 → 0.2.0

package/.env.template ADDED
@@ -0,0 +1,20 @@
+ # Test environment configuration for arela-uploader
+ # Copy this to .env and update with your actual values
+
+ # Supabase Configuration
+ SUPABASE_URL=https://your-project.supabase.co
+ SUPABASE_KEY=your-supabase-anon-key
+ SUPABASE_BUCKET=your-bucket-name
+
+ # Arela API Configuration
+ ARELA_API_URL=https://your-arela-api-url.com
+ ARELA_API_TOKEN=your-api-token
+
+ # Upload Configuration
+ UPLOAD_BASE_PATH=/Users/your-username/documents
+ UPLOAD_SOURCES=folder1|folder2|folder3
+
+ # RFC Upload Configuration
+ # Pipe-separated list of RFCs to upload files for
+ # Example: MMJ0810145N1|ABC1234567XY|DEF9876543ZZ
+ UPLOAD_RFCS=RFC1|RFC2|RFC3
package/README.md CHANGED
@@ -1,15 +1,23 @@
  # arela-uploader

- CLI tool to upload files and directories to Supabase Storage with automatic file renaming and sanitization.
+ CLI tool to upload files and directories to the Arela API or Supabase Storage with automatic file processing, detection, and organization.

  ## Features

  - 📁 Upload entire directories or individual files
+ - 🤖 **Automatic file detection and organization** (API mode)
+ - 🗂️ **Smart year/pedimento auto-detection from file paths**
+ - 🏗️ **Custom folder structure support**
  - 🔄 Automatic file renaming to handle problematic characters
  - 📝 Comprehensive logging (local and remote)
  - ⚡ Retry mechanism for failed uploads
  - 🎯 Skip duplicate files automatically
  - 📊 Progress bars and detailed summaries
+ - 📂 **Preserve directory structure with auto-organization**
+ - 🚀 **Batch processing with configurable concurrency**
+ - 🔧 **Performance optimizations with caching**
+ - 📋 **Upload files by specific RFC values**
+ - 🔍 **Propagate arela_path from pedimento documents to related files**

  ## Installation

@@ -19,51 +27,274 @@ npm install -g @arela/uploader

  ## Usage

- ### Basic Upload
+ ### Basic Upload with Auto-Processing (API Mode)
  ```bash
- arela -p "my-folder"
+ arela --batch-size 10 -c 5
  ```

- ### Upload with File Renaming
- For files with accents, special characters, or problematic names:
+ ### Upload with Auto-Detection of Year/Pedimento
+ ```bash
+ arela --auto-detect-structure --batch-size 10 -c 5
+ ```
+
+ ### Upload with Custom Folder Structure
+ ```bash
+ arela --folder-structure "2024/4023260" --batch-size 10 -c 5
+ ```
+
+ ### Upload with Directory Structure Preservation
+ ```bash
+ arela --batch-size 10 -c 5 --preserve-structure
+ ```
+
+ ### Upload to Supabase Directly (Skip API)
+ ```bash
+ arela --force-supabase -p "my-folder"
+ ```
+
+ ### Upload Files by Specific RFC Values
+ ```bash
+ # Upload all files associated with specific RFCs
+ arela --upload-by-rfc --batch-size 5
+
+ # Upload RFC files with custom folder prefix
+ arela --upload-by-rfc --folder-structure "palco" --batch-size 5
+
+ # Upload RFC files with nested folder structure
+ arela --upload-by-rfc --folder-structure "2024/client1/pedimentos" --batch-size 5
+ ```
+
+ ### Propagate Arela Path from Pedimentos to Related Files
+ ```bash
+ # Copy arela_path from pedimento_simplificado records to related files
+ arela --propagate-arela-path
+ ```
+
+ ### Stats-Only Mode (No Upload)
+ ```bash
+ # Only process file stats and insert them into the database, don't upload
+ arela --stats-only --folder-structure "2023/3019796"
+ ```

+ ### Upload with Performance Statistics
  ```bash
- # Preview what files would be renamed (dry run)
- arela --rename-files --dry-run
+ arela --batch-size 10 -c 5 --show-stats
+ ```

- # Actually rename and upload files
- arela --rename-files -p "documents"
+ ### Upload with Client Path Tracking
+ ```bash
+ arela --client-path "/client/documents" --batch-size 10 -c 5
  ```

  ### Options

  - `-p, --prefix <prefix>`: Prefix path in bucket (default: "")
- - `-r, --rename-files`: Rename files with problematic characters before uploading
- - `--dry-run`: Show what files would be renamed without actually renaming them
- - `-h, --help`: Display help information
+ - `-b, --bucket <bucket>`: Bucket name override
+ - `-c, --concurrency <number>`: Number of files processed concurrently (default: 10)
+ - `--batch-size <number>`: API batch size (default: 10)
+ - `--force-supabase`: Force direct Supabase upload (skip API)
+ - `--no-auto-detect`: Disable automatic file detection (API mode only)
+ - `--no-auto-organize`: Disable automatic file organization (API mode only)
+ - `--preserve-structure`: **Preserve original directory structure when using auto-organize**
+ - `--folder-structure <structure>`: **Custom folder structure** (e.g., "2024/4023260" or "cliente1/pedimentos")
+ - `--auto-detect-structure`: **Automatically detect year/pedimento from file paths**
+ - `--client-path <path>`: Client path for metadata tracking
+ - `--stats-only`: Only read file stats and insert them into the `uploader` table, skipping the upload
+ - `--no-detect`: Disable document type detection in stats-only mode
+ - `--propagate-arela-path`: Propagate arela_path from pedimento_simplificado records to related files
+ - `--upload-by-rfc`: Upload files to the Arela API based on RFC values from the UPLOAD_RFCS environment variable
+ - `--show-stats`: Show detailed processing statistics
  - `-v, --version`: Display version number
+ - `-h, --help`: Display help information

  ## Environment Variables

  Create a `.env` file in your project root:

  ```env
+ # For API Mode (recommended)
+ ARELA_API_URL=http://localhost:3010
+ ARELA_API_TOKEN=your_api_token
+
+ # For Direct Supabase Mode (fallback)
  SUPABASE_URL=your_supabase_url
  SUPABASE_KEY=your_supabase_anon_key
  SUPABASE_BUCKET=your_bucket_name
+
+ # Required for both modes
  UPLOAD_BASE_PATH=/path/to/your/files
  UPLOAD_SOURCES=folder1|folder2|file.pdf
+
+ # RFC-based Upload Configuration
+ # Pipe-separated list of RFCs to upload files for
+ UPLOAD_RFCS=MMJ0810145N1|ABC1234567XY|DEF9876543ZZ
+ ```
+
+ **Environment Variable Details:**
+
+ - `ARELA_API_URL`: Base URL for the Arela API service
+ - `ARELA_API_TOKEN`: Authentication token for API access
+ - `SUPABASE_URL`: Your Supabase project URL
+ - `SUPABASE_KEY`: Supabase anonymous key for direct uploads
+ - `SUPABASE_BUCKET`: Target bucket name in Supabase Storage
+ - `UPLOAD_BASE_PATH`: Root directory containing files to upload
+ - `UPLOAD_SOURCES`: Pipe-separated list of folders/files to process
+ - `UPLOAD_RFCS`: Pipe-separated list of RFC values for targeted uploads
+
+ ## RFC-Based File Upload
+
+ The `--upload-by-rfc` feature allows you to upload files to the Arela API based on specific RFC values. This is useful when you want to upload only files associated with certain companies or entities.
+
+ ### How it works:
+
+ 1. **Configure RFCs**: Set the `UPLOAD_RFCS` environment variable with pipe-separated RFC values
+ 2. **Query Database**: The tool searches the Supabase database for files matching the specified RFCs (see the query sketch below)
+ 3. **Include Supporting Documents**: Finds all files sharing the same `arela_path` as the RFC matches (not just the pedimento files)
+ 4. **Apply Folder Structure**: Optionally applies a custom folder prefix using `--folder-structure`
+ 5. **Group and Upload**: Files are grouped by their final destination path and uploaded with the proper structure
+
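+ A minimal sketch of steps 2 and 3 using `@supabase/supabase-js` (the table and column names `uploader`, `rfc`, `arela_path`, and `original_path` come from the Prerequisites below; the CLI's actual query code may differ):
+
+ ```javascript
+ import { createClient } from '@supabase/supabase-js';
+
+ const supabase = createClient(process.env.SUPABASE_URL, process.env.SUPABASE_KEY);
+ const rfcs = (process.env.UPLOAD_RFCS || '').split('|').filter(Boolean);
+
+ // Step 2: find uploader rows whose detected RFC is in the configured list
+ const { data: rfcMatches, error } = await supabase
+   .from('uploader')
+   .select('arela_path')
+   .in('rfc', rfcs);
+ if (error) throw error;
+
+ // Step 3: fetch every row that shares one of those arela_path values,
+ // so supporting documents come along with the pedimento files
+ const paths = [...new Set(rfcMatches.map((row) => row.arela_path))];
+ const { data: relatedFiles } = await supabase
+   .from('uploader')
+   .select('original_path, arela_path')
+   .in('arela_path', paths);
+ ```
+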
+ ### Folder Structure Options:
+
+ **Default Behavior** (no `--folder-structure`):
+ - Uses original `arela_path`: `CAD890407NK7/2023/3429/070/230734293000421/`
+
+ **With Custom Prefix** (`--folder-structure "palco"`):
+ - Results in: `palco/CAD890407NK7/2023/3429/070/230734293000421/`
+
+ **With Nested Prefix** (`--folder-structure "2024/client1/pedimentos"`):
+ - Results in: `2024/client1/pedimentos/CAD890407NK7/2023/3429/070/230734293000421/`
+
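+ Put differently, the optional prefix is simply joined in front of the stored `arela_path`. A rough illustration (not the CLI's actual implementation):
+
+ ```javascript
+ // Prepend an optional --folder-structure prefix to a stored arela_path
+ function destinationPath(arelaPath, folderStructure) {
+   if (!folderStructure) return arelaPath;
+   // Trim stray slashes so the join never produces "//"
+   return `${folderStructure.replace(/\/+$/, '')}/${arelaPath.replace(/^\/+/, '')}`;
+ }
+
+ destinationPath('CAD890407NK7/2023/3429/070/230734293000421/', 'palco');
+ // => 'palco/CAD890407NK7/2023/3429/070/230734293000421/'
+ ```
+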
+ ### Prerequisites:
+
+ - Files must have been previously processed (have entries in the `uploader` table)
+ - Files must have `rfc` field populated (from document detection)
+ - Files must have `arela_path` populated (from pedimento processing)
+ - Original files must still exist at their `original_path` locations
+
+ ### Example:
+
+ ```bash
+ # Set RFCs in environment
+ export UPLOAD_RFCS="MMJ0810145N1|ABC1234567XY|DEF9876543ZZ"
+
+ # Upload files for these RFCs (original folder structure)
+ arela --upload-by-rfc --batch-size 5 --show-stats
+
+ # Upload with custom folder prefix
+ arela --upload-by-rfc --folder-structure "palco" --batch-size 10
+
+ # Upload with nested organization
+ arela --upload-by-rfc --folder-structure "2024/Q1/processed" --batch-size 15
+ ```
+
+ The tool will:
+ - Find all database records matching the specified RFCs
+ - Include ALL supporting documents that share the same `arela_path`
+ - Apply the optional folder structure prefix if specified
+ - Group files by their final destination folder structure
+ - Upload each group maintaining the correct Arela folder hierarchy
+ - Provide detailed progress and summary statistics
+ - Handle large datasets with automatic pagination (no 1000-file limit)
+
+ ## File Processing Modes
+
+ ### API Mode (Default)
+ When `ARELA_API_URL` and `ARELA_API_TOKEN` are configured:
+ - ✅ Automatic file detection and classification
+ - ✅ Intelligent file organization
+ - ✅ **Smart year/pedimento auto-detection from paths**
+ - ✅ **Custom folder structure support**
+ - ✅ Batch processing with progress tracking
+ - ✅ Advanced error handling and retry logic
+ - ✅ **Performance optimizations with file sanitization caching**
+
+ ### Auto-Detection Features
+ The tool can automatically detect year and pedimento numbers from file paths using multiple patterns:
+
+ **Pattern 1: Direct Structure**
+ ```
+ /path/to/2024/4023260/file.pdf
+ /path/to/pedimentos/2024/4023260/file.pdf
+ ```
+
+ **Pattern 2: Named Patterns**
+ ```
+ /path/to/docs/año2024/ped4023260/file.pdf
+ /path/to/files/year2024/pedimento4023260/file.pdf
+ ```
+
+ **Pattern 3: Loose Detection**
+ - Year: Any 4-digit number starting with "202" (2020-2029)
+ - Pedimento: Any 4-8 consecutive digits in path
+
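+ A simplified sketch of the loose-detection rules above (the shipped detector also handles the direct and named patterns, and its exact regexes may differ):
+
+ ```javascript
+ // Loose detection: a 4-digit year starting with "202" plus a 4-8 digit pedimento
+ const YEAR_RE = /\b(202\d)\b/;
+ const PEDIMENTO_RE = /\b(\d{4,8})\b/;
+
+ function detectStructure(filePath) {
+   const yearMatch = filePath.match(YEAR_RE);
+   // Take the first digit run that is not the year itself
+   const pedimentoMatch = filePath
+     .split(/[\\/]/)
+     .map((part) => part.match(PEDIMENTO_RE))
+     .find((m) => m && (!yearMatch || m[1] !== yearMatch[1]));
+   return {
+     year: yearMatch ? yearMatch[1] : null,
+     pedimento: pedimentoMatch ? pedimentoMatch[1] : null,
+   };
+ }
+
+ detectStructure('/path/to/2024/4023260/file.pdf');
+ // => { year: '2024', pedimento: '4023260' }
+ ```
+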
+ Use `--auto-detect-structure` to enable automatic detection:
+ ```bash
+ arela --auto-detect-structure --batch-size 10
+ ```
+
+ ### Custom Folder Structure
+ Specify a custom organization pattern:
+ ```bash
+ # Static structure
+ arela --folder-structure "2024/4023260" --batch-size 10
+
+ # Client-based structure
+ arela --folder-structure "cliente1/pedimentos" --batch-size 10
+ ```
+
+ ### Directory Structure Preservation
+ Use `--preserve-structure` to maintain your original folder structure even with auto-organization:
+
+ ```bash
+ # Without --preserve-structure
+ # Files organized by API: bucket/filename.pdf
+
+ # With --preserve-structure
+ # Files keep structure: bucket/2024/4023260/filename.pdf
+ arela --preserve-structure --batch-size 10
  ```

- ## File Renaming
+ ### Supabase Direct Mode (Fallback)
+ When the API is unavailable or `--force-supabase` is used:
+ - ✅ Direct upload to Supabase Storage
+ - ✅ File sanitization and renaming
+ - ✅ Basic progress tracking
+ - ✅ **Optimized sanitization with pre-compiled regex patterns**
+ - ✅ **Performance caching for file name sanitization**

- The tool automatically handles problematic characters by:
+ ## Performance Features

- - Removing accents (á → a, ñ → n, etc.)
- - Replacing special characters with safe alternatives
- - Converting spaces to dashes
- - Removing or replacing symbols like `{}[]~^`|"<>?*:`
- - Handling Korean characters and other Unicode symbols
+ ### Database Pagination
+ - **No Upload Limits**: Handles datasets larger than 1000 files through automatic pagination
+ - **Efficient Querying**: Uses the Supabase `.range()` method to fetch data in batches
+ - **Memory Optimization**: Processes large datasets without memory overflow
+
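+ For example, the default 1000-row page size can be worked around by looping with `.range()`; a sketch of the approach (again assuming the `uploader` table):
+
+ ```javascript
+ // Fetch all matching rows in pages of 1000 using supabase-js .range()
+ async function fetchAllRows(supabase, pageSize = 1000) {
+   const rows = [];
+   for (let from = 0; ; from += pageSize) {
+     const { data, error } = await supabase
+       .from('uploader')
+       .select('*')
+       .range(from, from + pageSize - 1); // inclusive bounds
+     if (error) throw error;
+     rows.push(...data);
+     if (data.length < pageSize) break; // last page reached
+   }
+   return rows;
+ }
+ ```
+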
+ ### File Processing
+ - **Pre-compiled Regex**: Sanitization patterns are compiled once for optimal performance
+ - **Caching System**: File name sanitization results are cached to avoid re-processing
+ - **Batch Processing**: Configurable batch sizes for optimal upload throughput
+
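+ The batching itself can be pictured as slicing the file list into chunks and uploading each chunk in parallel; a simplified sketch, where `uploadFile` stands in for whatever call actually performs the upload:
+
+ ```javascript
+ // Process files in chunks of `batchSize`, uploading each chunk concurrently
+ async function uploadInBatches(files, batchSize, uploadFile) {
+   for (let i = 0; i < files.length; i += batchSize) {
+     const batch = files.slice(i, i + batchSize);
+     // allSettled keeps one failed upload from aborting the whole batch
+     const results = await Promise.allSettled(batch.map((file) => uploadFile(file)));
+     const failed = results.filter((r) => r.status === 'rejected').length;
+     console.log(`Batch ${i / batchSize + 1}: ${batch.length - failed} ok, ${failed} failed`);
+   }
+ }
+ ```
+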
+ ### RFC Upload Optimizations
+ - **Smart Querying**: Three-step query process to efficiently find related files
+ - **Supporting Document Inclusion**: Automatically includes all related documents, not just pedimentos
+ - **Path Concatenation**: Efficiently combines custom folder structures with arela_paths
+
+ ## File Sanitization
+
+ The tool automatically handles problematic characters using advanced sanitization (sketched below):
+
+ **Character Replacements:**
+ - **Accents**: á→a, é→e, í→i, ó→o, ú→u, ñ→n, ç→c
+ - **Korean characters**: 멕→meok, 시→si, 코→ko, 용→yong, others→kr
+ - **Special symbols**: &→and, {}[]~^|"<>?*: →-
+ - **Email symbols**: @→(removed), spaces→-
+ - **Multiple dashes**: collapsed to single dash
+ - **Leading/trailing**: dashes and dots removed
+
+ **Performance Features:**
+ - Pre-compiled regex patterns for faster processing
+ - Sanitization result caching to avoid re-processing
+ - Unicode normalization (NFD) for consistent handling
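+
+ A condensed sketch of this pipeline (the shipped implementation covers more cases, including the Korean character mappings above, but this reproduces the simpler rows of the Examples table below):
+
+ ```javascript
+ // Pre-compiled patterns, built once and reused for every file name
+ const DIACRITICS_RE = /[\u0300-\u036f]/g; // combining marks left over after NFD
+ const SPECIAL_RE = /[{}\[\]~^|"<>?*:]/g;  // problematic symbols become dashes
+ const AT_RE = /@/g;                       // email-style @ is dropped
+ const SPACES_RE = /\s+/g;
+ const MULTI_DASH_RE = /-+/g;
+ const TRIM_RE = /^[-.]+|[-.]+$/g;
+
+ const sanitizeCache = new Map();          // cache results to avoid re-processing
+
+ function sanitizeFileName(name) {
+   if (sanitizeCache.has(name)) return sanitizeCache.get(name);
+   const clean = name
+     .normalize('NFD')                     // split accents from base letters
+     .replace(DIACRITICS_RE, '')           // á→a, ñ→n, ç→c, ...
+     .replace(/&/g, 'and')                 // & → and
+     .replace(AT_RE, '')                   // drop @
+     .replace(SPECIAL_RE, '-')             // symbols → dash
+     .replace(SPACES_RE, '-')              // spaces → dashes
+     .replace(MULTI_DASH_RE, '-')          // collapse runs of dashes
+     .replace(TRIM_RE, '');                // strip leading/trailing dashes and dots
+   sanitizeCache.set(name, clean);
+   return clean;
+ }
+ ```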

  ### Examples

@@ -73,12 +304,79 @@ The tool automatically handles problematic characters by:
  | `File{with}brackets.pdf` | `File-with-brackets.pdf` |
  | `Document ^& symbols.pdf` | `Document-and-symbols.pdf` |
  | `CI & PL-20221212(멕시코용).xls` | `CI-and-PL-20221212.xls` |
+ | `impresora@nereprint.com_file.xml` | `impresoranereprint.com_file.xml` |
+ | `07-3429-3000430 HC.pdf` | `07-3429-3000430-HC.pdf` |
+ | `FACTURA IN 3000430.pdf` | `FACTURA-IN-3000430.pdf` |

- ## Logging
+ ## Logging and Monitoring

- The tool maintains logs both locally (`upload.log`) and remotely in your Supabase database. Logs include:
+ The tool maintains comprehensive logs both locally and remotely:

- - Upload status (success/error/skipped)
- - File paths and sanitization changes
+ **Local Logging (`arela-upload.log`):**
+ - Upload status (SUCCESS/ERROR/SKIPPED/SANITIZED)
+ - File paths and sanitization changes
  - Error messages and timestamps
- - Rename operations
+ - Rename operations with before/after names
+ - Processing statistics and performance metrics
+
+ **Log Entry Examples:**
+ ```
+ [2025-09-04T01:17:00.141Z] SUCCESS: /Users/.../file.xml -> 2023/2003180/file.xml
+ [2025-09-04T01:17:00.822Z] SANITIZED: file name.pdf → file-name.pdf
+ [2025-09-04T01:17:00.856Z] SKIPPED: /Users/.../duplicate.pdf (already exists)
+ ```
+
+ **Remote Logging:**
+ - Integration with Supabase database for centralized logging
+ - Upload tracking and audit trails
+ - Error reporting and monitoring
+
+ ## Performance Features
+
+ **Version 0.2.0 introduces several performance optimizations:**
+
+ - **Pre-compiled Regex Patterns**: Sanitization patterns are compiled once and reused
+ - **Sanitization Caching**: File name sanitization results are cached to avoid reprocessing
+ - **Batch Processing**: Configurable batch sizes for optimal API usage
+ - **Concurrent Processing**: Adjustable concurrency levels for file processing
+ - **Smart Skip Logic**: Efficiently skips already processed files using log analysis
+ - **Memory Optimization**: Large file outputs are truncated to prevent memory issues
+
+ ## Version History
+
+ **v0.2.0** - Latest Release
+ - ✨ Added smart year/pedimento auto-detection from file paths
+ - ✨ Custom folder structure support with `--folder-structure` option
+ - ✨ Client path tracking with `--client-path` option
+ - ✨ Performance optimizations with regex pre-compilation
+ - ✨ Sanitization result caching for improved speed
+ - ✨ Enhanced file sanitization with Korean character support
+ - ✨ Improved email character handling in file names
+ - ✨ Better error handling and logging
+ - 📝 Comprehensive logging with SANITIZED status
+ - 🔧 Memory optimization for large file processing
+
+ ## Troubleshooting
+
+ **Connection Issues:**
+ - Verify `ARELA_API_URL` and `ARELA_API_TOKEN` are correct
+ - Check network connectivity to the API endpoint
+ - The tool will automatically fall back to Supabase direct mode if the API is unavailable
+
+ **Performance Issues:**
+ - Adjust `--batch-size` for optimal API performance (default: 10)
+ - Modify `--concurrency` to control parallel processing (default: 10)
+ - Use `--show-stats` to monitor sanitization cache performance
+
+ **File Issues:**
+ - Check file permissions in `UPLOAD_BASE_PATH`
+ - Verify `UPLOAD_SOURCES` paths exist and are accessible
+ - Review `arela-upload.log` for detailed error information
+
+ ## Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request.
+
+ ## License
+
+ ISC License - see LICENSE file for details.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@arela/uploader",
-   "version": "0.1.0",
+   "version": "0.2.0",
    "description": "CLI to upload files/directories to Arela",
    "bin": {
      "arela": "./src/index.js"
@@ -28,15 +28,18 @@
    },
    "homepage": "https://github.com/inspiraCode/arela-uploader#readme",
    "dependencies": {
-     "@supabase/supabase-js": "^2.49.4",
-     "cli-progress": "^3.12.0",
-     "commander": "^13.1.0",
-     "dotenv": "^16.5.0",
-     "globby": "^14.1.0",
-     "mime-types": "^3.0.1"
+     "@supabase/supabase-js": "2.49.4",
+     "cli-progress": "3.12.0",
+     "commander": "13.1.0",
+     "dotenv": "16.5.0",
+     "form-data": "4.0.4",
+     "globby": "14.1.0",
+     "mime-types": "3.0.1",
+     "node-fetch": "3.3.2",
+     "office-text-extractor": "3.0.3"
    },
    "devDependencies": {
-     "@trivago/prettier-plugin-sort-imports": "^5.2.2",
-     "prettier": "^3.5.3"
+     "@trivago/prettier-plugin-sort-imports": "5.2.2",
+     "prettier": "3.5.3"
    }
  }
@@ -0,0 +1,80 @@
+ // Document type definitions and extraction utilities
+ // Ported from TypeScript to JavaScript for Node.js
+
+ export class FieldResult {
+   constructor(name, found, value) {
+     this.name = name;
+     this.found = found;
+     this.value = value;
+   }
+ }
+
+ export class DocumentTypeDefinition {
+   constructor(type, extensions, match, extractors, extractNumPedimento, extractPedimentoYear) {
+     this.type = type;
+     this.extensions = extensions;
+     this.match = match;
+     this.extractors = extractors;
+     this.extractNumPedimento = extractNumPedimento;
+     this.extractPedimentoYear = extractPedimentoYear;
+   }
+ }
+
+ // Import all document type definitions
+ import { pedimentoSimplificadoDefinition } from './document-types/pedimento-simplificado.js';
+
+ // Registry of all document types
+ const documentTypes = [
+   pedimentoSimplificadoDefinition,
+   // Add more document types here as needed
+ ];
+
+ /**
+  * Extract document fields from text content
+  * @param {string} source - The text content to analyze
+  * @param {string} fileExtension - File extension for context
+  * @param {string} filePath - File path for context
+  * @returns {[string|null, FieldResult[], string|null, number|null]} - [detectedType, fields, pedimento, year]
+  */
+ export function extractDocumentFields(source, fileExtension, filePath) {
+   if (!source || typeof source !== 'string') {
+     return [null, [], null, null];
+   }
+
+   // Try to match against each document type
+   for (const docType of documentTypes) {
+     // Check if file extension matches
+     if (fileExtension && !docType.extensions.includes(fileExtension.toLowerCase())) {
+       continue;
+     }
+
+     // Test if content matches this document type
+     if (docType.match(source)) {
+       console.log(`✅ Matched document type: ${docType.type}`);
+
+       // Extract all fields
+       const fields = [];
+       for (const extractor of docType.extractors) {
+         try {
+           const result = extractor.extract(source);
+           fields.push(result);
+           if (result.found) {
+             console.log(` - ${result.name}: ${result.value}`);
+           }
+         } catch (error) {
+           console.error(`Error extracting field ${extractor.field}:`, error);
+           fields.push(new FieldResult(extractor.field, false, null));
+         }
+       }
+
+       // Extract pedimento number and year
+       const pedimento = docType.extractNumPedimento ? docType.extractNumPedimento(source, fields) : null;
+       const year = docType.extractPedimentoYear ? docType.extractPedimentoYear(source, fields) : null;
+
+       return [docType.type, fields, pedimento, year];
+     }
+   }
+
+   console.log('❓ No document type matched');
+   return [null, [], null, null];
+ }