@arela/uploader 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,15 +1,89 @@
1
1
  # arela-uploader
2
2
 
3
- CLI tool to upload files and directories to Supabase Storage with automatic file renaming and sanitization.
3
+ CLI tool to upload files and directories to the Arela API or Supabase Storage with automatic file processing, detection, and organization.
4
+
5
+ ## 🚀 OPTIMIZED 4-PHASE WORKFLOW
6
+
7
+ **New in v0.2.0**: The tool now supports an optimized 4-phase workflow designed for maximum performance when processing large file collections:
8
+
9
+ ### Phase 1: Filesystem Stats Collection 📊
10
+ ```bash
11
+ arela --stats-only
12
+ ```
13
+ - ⚡ **ULTRA FAST**: Only reads filesystem metadata (no file content)
14
+ - 📈 **Bulk database operations**: Processes 1000+ files per batch
15
+ - 🔄 **Upsert optimization**: Handles duplicates efficiently (see the sketch below)
16
+ - 💾 **Minimal memory usage**: No file content loading
17
+
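+ A rough sketch of what Phase 1 does, assuming the `uploader` table described later in this README (column names other than `original_path` are illustrative):
+
+ ```js
+ // Sketch only: metadata collection plus batched upserts; not the exact implementation.
+ import { stat } from 'node:fs/promises';
+ import { globby } from 'globby';
+ import { createClient } from '@supabase/supabase-js';
+
+ const supabase = createClient(process.env.SUPABASE_URL, process.env.SUPABASE_KEY);
+
+ async function collectStats(baseDir) {
+   const paths = await globby(`${baseDir}/**/*`);
+   const rows = await Promise.all(
+     paths.map(async (p) => {
+       const s = await stat(p); // filesystem metadata only; file content is never read
+       return { original_path: p, size_bytes: s.size, modified_at: s.mtime.toISOString() };
+     })
+   );
+   // Upsert in batches of 1000 so duplicates are handled in a single round trip
+   for (let i = 0; i < rows.length; i += 1000) {
+     const { error } = await supabase
+       .from('uploader')
+       .upsert(rows.slice(i, i + 1000), { onConflict: 'original_path' });
+     if (error) throw error;
+   }
+ }
+
+ collectStats(process.env.UPLOAD_BASE_PATH).catch(console.error);
+ ```
+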
18
+ ### Phase 2: PDF Detection 🔍
19
+ ```bash
20
+ arela --detect-pdfs
21
+ ```
22
+ - 🎯 **Targeted processing**: Only processes PDF files from database
23
+ - 📄 **Pedimento-simplificado detection**: Extracts RFC, pedimento numbers, and metadata (see the usage sketch below)
24
+ - 🔄 **Batched processing**: Handles large datasets efficiently
25
+ - 📊 **Progress tracking**: Real-time detection statistics
26
+
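+ For reference, the detection helper shipped in this release (shown at the end of this diff) can be called roughly like this; the import path and extension format are assumptions:
+
+ ```js
+ // Hedged usage sketch; adjust the import path to wherever the module lives in the package.
+ import { extractDocumentFields } from './document-types.js';
+
+ const text = '...text extracted from a PDF...';
+ const [type, fields, pedimento, year] = extractDocumentFields(text, 'pdf', '/docs/2023/3019796/file.pdf');
+ if (type) {
+   console.log(`Detected ${type}`, { pedimento, year, fields });
+ }
+ ```
+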
27
+ ### Phase 3: Path Propagation 📁
28
+ ```bash
29
+ arela --propagate-arela-path
30
+ ```
31
+ - 🎯 **Smart path copying**: Propagates arela_path from pedimento documents to related files
32
+ - 📦 **Batch updates**: Processes files in groups for optimal database performance
33
+ - 🔗 **Relationship mapping**: Links supporting documents to their pedimento
34
+
35
+ ### Phase 4: RFC-based Upload 🚀
36
+ ```bash
37
+ arela --upload-by-rfc
38
+ ```
39
+ - 🎯 **Targeted uploads**: Only uploads files for specified RFCs
40
+ - 📋 **Supporting documents**: Includes all related files, not just pedimentos
41
+ - 🏗️ **Structure preservation**: Maintains proper folder hierarchy
42
+
43
+ ### Combined Workflow 🎯
44
+ ```bash
45
+ # Run all 4 phases in sequence (recommended)
46
+ arela --run-all-phases
47
+
48
+ # Or run phases individually for more control
49
+ arela --stats-only # Phase 1: Collect filesystem stats
50
+ arela --detect-pdfs # Phase 2: Detect pedimento documents
51
+ arela --propagate-arela-path # Phase 3: Propagate paths to related files
52
+ arela --upload-by-rfc # Phase 4: Upload by RFC
53
+ ```
54
+
55
+ ### Performance Benefits
56
+
57
+ **Before optimization** (single phase with detection):
58
+ - 🐌 Read every file for detection
59
+ - 💾 High memory usage
60
+ - 🔄 Slow database operations
61
+ - ❌ Processed unsupported files unnecessarily
62
+
63
+ **After optimization** (4-phase approach):
64
+ - ⚡ **10x faster**: Phase 1 only reads filesystem metadata
65
+ - 📊 **Bulk operations**: Database inserts up to 1000 records per batch
66
+ - 🎯 **Targeted processing**: Phase 2 only processes PDFs needing detection
67
+ - 💾 **Memory efficient**: No unnecessary file content loading
68
+ - 🔄 **Optimized I/O**: Separates filesystem, database, and network operations
4
69
 
5
70
  ## Features
6
71
 
7
72
  - 📁 Upload entire directories or individual files
73
+ - 🤖 **Automatic file detection and organization** (API mode)
74
+ - 🗂️ **Smart year/pedimento auto-detection from file paths**
75
+ - 🏗️ **Custom folder structure support**
8
76
  - 🔄 Automatic file renaming to handle problematic characters
9
77
  - 📝 Comprehensive logging (local and remote)
10
78
  - ⚡ Retry mechanism for failed uploads
11
79
  - 🎯 Skip duplicate files automatically
12
80
  - 📊 Progress bars and detailed summaries
81
+ - 📂 **Preserve directory structure with auto-organization**
82
+ - 🚀 **Batch processing with configurable concurrency**
83
+ - 🔧 **Performance optimizations with caching**
84
+ - 📋 **Upload files by specific RFC values**
85
+ - 🔍 **Propagate arela_path from pedimento documents to related files**
86
+ - ⚡ **4-Phase optimized workflow for maximum performance**
13
87
 
14
88
  ## Installation
15
89
 
@@ -19,51 +93,298 @@ npm install -g @arela/uploader
19
93
 
20
94
  ## Usage
21
95
 
22
- ### Basic Upload
96
+ ### 🚀 Optimized 4-Phase Workflow (Recommended)
97
+
98
+ ```bash
99
+ # Run all phases automatically (most efficient)
100
+ arela --run-all-phases --batch-size 20
101
+
102
+ # Or run phases individually for fine-grained control
103
+ arela --stats-only # Phase 1: Filesystem stats only
104
+ arela --detect-pdfs --batch-size 10 # Phase 2: PDF detection
105
+ arela --propagate-arela-path # Phase 3: Path propagation
106
+ arela --upload-by-rfc --batch-size 5 # Phase 4: RFC-based upload
107
+ ```
108
+
109
+ ### Traditional Single-Phase Upload (Legacy)
110
+
111
+ #### Basic Upload with Auto-Processing (API Mode)
112
+ ```bash
113
+ arela --batch-size 10 -c 5
114
+ ```
115
+
116
+ #### Upload with Auto-Detection of Year/Pedimento
117
+ ```bash
118
+ arela --auto-detect-structure --batch-size 10 -c 5
119
+ ```
120
+
121
+ #### Upload with Custom Folder Structure
122
+ ```bash
123
+ arela --folder-structure "2024/4023260" --batch-size 10 -c 5
124
+ ```
125
+
126
+ #### Upload with Directory Structure Preservation
127
+ ```bash
128
+ arela --batch-size 10 -c 5 --preserve-structure
129
+ ```
130
+
131
+ #### Upload to Supabase Directly (Skip API)
23
132
  ```bash
24
- arela -p "my-folder"
133
+ arela --force-supabase -p "my-folder"
25
134
  ```
26
135
 
27
- ### Upload with File Renaming
28
- For files with accents, special characters, or problematic names:
136
+ ### Upload Files by Specific RFC Values
137
+ ```bash
138
+ # Upload all files associated with specific RFCs
139
+ arela --upload-by-rfc --batch-size 5
140
+
141
+ # Upload RFC files with custom folder prefix
142
+ arela --upload-by-rfc --folder-structure "palco" --batch-size 5
143
+
144
+ # Upload RFC files with nested folder structure
145
+ arela --upload-by-rfc --folder-structure "2024/client1/pedimentos" --batch-size 5
146
+ ```
29
147
 
148
+ ### Propagate Arela Path from Pedimentos to Related Files
30
149
  ```bash
31
- # Preview what files would be renamed (dry run)
32
- arela --rename-files --dry-run
150
+ # Copy arela_path from pedimento_simplificado records to related files
151
+ arela --propagate-arela-path
152
+ ```
33
153
 
34
- # Actually rename and upload files
35
- arela --rename-files -p "documents"
154
+ ### Stats-Only Mode (No Upload)
155
+ ```bash
156
+ # Only process file stats and insert to database, don't upload
157
+ arela --stats-only --folder-structure "2023/3019796"
158
+ ```
159
+
160
+ ### Upload with Performance Statistics
161
+ ```bash
162
+ arela --batch-size 10 -c 5 --show-stats
163
+ ```
164
+
165
+ ### Upload with Client Path Tracking
166
+ ```bash
167
+ arela --client-path "/client/documents" --batch-size 10 -c 5
36
168
  ```
37
169
 
38
170
  ### Options
39
171
 
172
+ #### Phase Control
173
+ - `--stats-only`: **Phase 1** - Only collect filesystem stats (no file reading)
174
+ - `--detect-pdfs`: **Phase 2** - Process PDF files for pedimento-simplificado detection
175
+ - `--propagate-arela-path`: **Phase 3** - Propagate arela_path from pedimento records to related files
176
+ - `--upload-by-rfc`: **Phase 4** - Upload files based on RFC values from UPLOAD_RFCS
177
+ - `--run-all-phases`: **All Phases** - Run complete optimized workflow
178
+
179
+ #### Performance & Configuration
180
+ - `-c, --concurrency <number>`: Number of files processed in parallel (default: 10)
181
+ - `--batch-size <number>`: Number of files sent per API batch (default: 10)
182
+ - `--show-stats`: Show detailed processing statistics
183
+
184
+ #### Upload Configuration
40
185
  - `-p, --prefix <prefix>`: Prefix path in bucket (default: "")
41
- - `-r, --rename-files`: Rename files with problematic characters before uploading
42
- - `--dry-run`: Show what files would be renamed without actually renaming them
43
- - `-h, --help`: Display help information
186
+ - `-b, --bucket <bucket>`: Bucket name override
187
+ - `--force-supabase`: Force direct Supabase upload (skip API)
188
+ - `--no-auto-detect`: Disable automatic file detection (API mode only)
189
+ - `--no-auto-organize`: Disable automatic file organization (API mode only)
190
+ - `--preserve-structure`: **Preserve original directory structure when using auto-organize**
191
+ - `--folder-structure <structure>`: **Custom folder structure** (e.g., "2024/4023260" or "cliente1/pedimentos")
192
+ - `--auto-detect-structure`: **Automatically detect year/pedimento from file paths**
193
+ - `--client-path <path>`: Client path for metadata tracking
194
+
195
+ #### Legacy Options
196
+ - `--no-detect`: Disable document type detection in stats-only mode
44
197
  - `-v, --version`: Display version number
198
+ - `-h, --help`: Display help information
45
199
 
46
200
  ## Environment Variables
47
201
 
48
202
  Create a `.env` file in your project root:
49
203
 
50
204
  ```env
205
+ # For API Mode (recommended)
206
+ ARELA_API_URL=http://localhost:3010
207
+ ARELA_API_TOKEN=your_api_token
208
+
209
+ # For Direct Supabase Mode (fallback)
51
210
  SUPABASE_URL=your_supabase_url
52
211
  SUPABASE_KEY=your_supabase_anon_key
53
212
  SUPABASE_BUCKET=your_bucket_name
213
+
214
+ # Required for both modes
54
215
  UPLOAD_BASE_PATH=/path/to/your/files
55
216
  UPLOAD_SOURCES=folder1|folder2|file.pdf
217
+
218
+ # RFC-based Upload Configuration
219
+ # Pipe-separated list of RFCs to upload files for
220
+ UPLOAD_RFCS=MMJ0810145N1|ABC1234567XY|DEF9876543ZZ
221
+ ```
222
+
223
+ **Environment Variable Details:**
224
+
225
+ - `ARELA_API_URL`: Base URL for the Arela API service
226
+ - `ARELA_API_TOKEN`: Authentication token for API access
227
+ - `SUPABASE_URL`: Your Supabase project URL
228
+ - `SUPABASE_KEY`: Supabase anonymous key for direct uploads
229
+ - `SUPABASE_BUCKET`: Target bucket name in Supabase Storage
230
+ - `UPLOAD_BASE_PATH`: Root directory containing files to upload
231
+ - `UPLOAD_SOURCES`: Pipe-separated list of folders/files to process
232
+ - `UPLOAD_RFCS`: Pipe-separated list of RFC values for targeted uploads
233
+
234
+ ## RFC-Based File Upload
235
+
236
+ The `--upload-by-rfc` feature allows you to upload files to the Arela API based on specific RFC values. This is useful when you want to upload only files associated with certain companies or entities.
237
+
238
+ ### How it works:
239
+
240
+ 1. **Configure RFCs**: Set the `UPLOAD_RFCS` environment variable with pipe-separated RFC values
241
+ 2. **Query Database**: The tool searches the Supabase database for files matching the specified RFCs
242
+ 3. **Include Supporting Documents**: Finds all files sharing the same `arela_path` as the RFC matches (not just the pedimento files)
243
+ 4. **Apply Folder Structure**: Optionally applies custom folder prefix using `--folder-structure`
244
+ 5. **Group and Upload**: Files are grouped by their final destination path and uploaded with proper structure
245
+
246
+ ### Folder Structure Options:
247
+
248
+ **Default Behavior** (no `--folder-structure`):
249
+ - Uses original `arela_path`: `CAD890407NK7/2023/3429/070/230734293000421/`
250
+
251
+ **With Custom Prefix** (`--folder-structure "palco"`):
252
+ - Results in: `palco/CAD890407NK7/2023/3429/070/230734293000421/`
253
+
254
+ **With Nested Prefix** (`--folder-structure "2024/client1/pedimentos"`):
255
+ - Results in: `2024/client1/pedimentos/CAD890407NK7/2023/3429/070/230734293000421/`
256
+
257
+ ### Prerequisites:
258
+
259
+ - Files must have been previously processed (have entries in the `uploader` table)
260
+ - Files must have `rfc` field populated (from document detection)
261
+ - Files must have `arela_path` populated (from pedimento processing)
262
+ - Original files must still exist at their `original_path` locations
263
+
264
+ ### Example:
265
+
266
+ ```bash
267
+ # Set RFCs in environment
268
+ export UPLOAD_RFCS="MMJ0810145N1|ABC1234567XY|DEF9876543ZZ"
269
+
270
+ # Upload files for these RFCs (original folder structure)
271
+ arela --upload-by-rfc --batch-size 5 --show-stats
272
+
273
+ # Upload with custom folder prefix
274
+ arela --upload-by-rfc --folder-structure "palco" --batch-size 10
275
+
276
+ # Upload with nested organization
277
+ arela --upload-by-rfc --folder-structure "2024/Q1/processed" --batch-size 15
278
+ ```
279
+
280
+ The tool will:
281
+ - Find all database records matching the specified RFCs
282
+ - Include ALL supporting documents that share the same `arela_path`
283
+ - Apply the optional folder structure prefix if specified
284
+ - Group files by their final destination folder structure
285
+ - Upload each group maintaining the correct Arela folder hierarchy
286
+ - Provide detailed progress and summary statistics
287
+ - Handle large datasets with automatic pagination (no 1000-file limit)
288
+
289
+ ## File Processing Modes
290
+
291
+ ### API Mode (Default)
292
+ When `ARELA_API_URL` and `ARELA_API_TOKEN` are configured:
293
+ - ✅ Automatic file detection and classification
294
+ - ✅ Intelligent file organization
295
+ - ✅ **Smart year/pedimento auto-detection from paths**
296
+ - ✅ **Custom folder structure support**
297
+ - ✅ Batch processing with progress tracking
298
+ - ✅ Advanced error handling and retry logic
299
+ - ✅ **Performance optimizations with file sanitization caching**
300
+
301
+ ### Auto-Detection Features
302
+ The tool can automatically detect year and pedimento numbers from file paths using multiple patterns:
303
+
304
+ **Pattern 1: Direct Structure**
305
+ ```
306
+ /path/to/2024/4023260/file.pdf
307
+ /path/to/pedimentos/2024/4023260/file.pdf
308
+ ```
309
+
310
+ **Pattern 2: Named Patterns**
311
+ ```
312
+ /path/to/docs/año2024/ped4023260/file.pdf
313
+ /path/to/files/year2024/pedimento4023260/file.pdf
314
+ ```
315
+
316
+ **Pattern 3: Loose Detection**
317
+ - Year: Any 4-digit number starting with "202" (2020-2029)
318
+ - Pedimento: Any 4-8 consecutive digits in the path (sketched below)
319
+
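+ A minimal sketch of the loose patterns above (the tool's actual expressions may differ):
+
+ ```js
+ // Hedged sketch of loose year/pedimento detection from a file path.
+ const YEAR_RE = /\b202\d\b/; // any 4-digit year in 2020-2029
+
+ function looseDetect(filePath) {
+   const year = filePath.match(YEAR_RE)?.[0] ?? null;
+   // first run of 4-8 digits that is not the detected year
+   const pedimento =
+     [...filePath.matchAll(/\d{4,8}/g)].map((m) => m[0]).find((d) => d !== year) ?? null;
+   return { year, pedimento };
+ }
+
+ // looseDetect('/path/to/2024/4023260/file.pdf') → { year: '2024', pedimento: '4023260' }
+ ```
+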
320
+ Use `--auto-detect-structure` to enable automatic detection:
321
+ ```bash
322
+ arela --auto-detect-structure --batch-size 10
323
+ ```
324
+
325
+ ### Custom Folder Structure
326
+ Specify a custom organization pattern:
327
+ ```bash
328
+ # Static structure
329
+ arela --folder-structure "2024/4023260" --batch-size 10
330
+
331
+ # Client-based structure
332
+ arela --folder-structure "cliente1/pedimentos" --batch-size 10
333
+ ```
334
+
335
+ ### Directory Structure Preservation
336
+ Use `--preserve-structure` to maintain your original folder structure even with auto-organization:
337
+
338
+ ```bash
339
+ # Without --preserve-structure
340
+ # Files organized by API: bucket/filename.pdf
341
+
342
+ # With --preserve-structure
343
+ # Files keep structure: bucket/2024/4023260/filename.pdf
344
+ arela --preserve-structure --batch-size 10
56
345
  ```
57
346
 
58
- ## File Renaming
347
+ ### Supabase Direct Mode (Fallback)
348
+ When API is unavailable or `--force-supabase` is used:
349
+ - ✅ Direct upload to Supabase Storage
350
+ - ✅ File sanitization and renaming
351
+ - ✅ Basic progress tracking
352
+ - ✅ **Optimized sanitization with pre-compiled regex patterns**
353
+ - ✅ **Performance caching for file name sanitization**
59
354
 
60
- The tool automatically handles problematic characters by:
355
+ ## Performance Features
61
356
 
62
- - Removing accents (á → a, ñ → n, etc.)
63
- - Replacing special characters with safe alternatives
64
- - Converting spaces to dashes
65
- - Removing or replacing symbols like `{}[]~^`|"<>?*:`
66
- - Handling Korean characters and other Unicode symbols
357
+ ### Database Pagination
358
+ - **No Upload Limits**: Handles datasets larger than 1000 files through automatic pagination
359
+ - **Efficient Querying**: Uses Supabase `.range()` method to fetch data in batches
360
+ - **Memory Optimization**: Processes large datasets without memory overflow
361
+
362
+ ### File Processing
363
+ - **Pre-compiled Regex**: Sanitization patterns are compiled once for optimal performance
364
+ - **Caching System**: File name sanitization results are cached to avoid re-processing
365
+ - **Batch Processing**: Configurable batch sizes for optimal upload throughput
366
+
367
+ ### RFC Upload Optimizations
368
+ - **Smart Querying**: Three-step query process to efficiently find related files
369
+ - **Supporting Document Inclusion**: Automatically includes all related documents, not just pedimentos
370
+ - **Path Concatenation**: Efficiently combines custom folder structures with arela_paths
371
+
372
+ ## File Sanitization
373
+
374
+ The tool automatically handles problematic characters using advanced sanitization:
375
+
376
+ **Character Replacements:**
377
+ - **Accents**: á→a, é→e, í→i, ó→o, ú→u, ñ→n, ç→c
378
+ - **Korean characters**: 멕→meok, 시→si, 코→ko, 용→yong, others→kr
379
+ - **Special symbols**: &→and, {}[]~^|"<>?*: →-
380
+ - **Email symbols**: @→(removed), spaces→-
381
+ - **Multiple dashes**: collapsed to single dash
382
+ - **Leading/trailing**: dashes and dots removed
383
+
384
+ **Performance Features:**
385
+ - Pre-compiled regex patterns for faster processing
386
+ - Sanitization result caching to avoid re-processing (see the sketch below)
387
+ - Unicode normalization (NFD) for consistent handling
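+
+ A simplified sketch of the cached, pre-compiled sanitization described above (the exact replacement rules and names are illustrative):
+
+ ```js
+ // Hedged sketch: pre-compiled patterns plus a result cache; Korean handling omitted here.
+ const ACCENTS = /[\u0300-\u036f]/g;   // combining marks left behind by NFD
+ const UNSAFE = /[{}\[\]~^|"<>?*:]/g;  // unsafe symbols
+
+ const cache = new Map();
+
+ function sanitizeFileName(name) {
+   if (cache.has(name)) return cache.get(name); // cached result, no re-processing
+   const clean = name
+     .normalize('NFD')               // split accents from base letters
+     .replace(ACCENTS, '')           // á → a, ñ → n, ...
+     .replace(/&/g, 'and')           // & → and
+     .replace(/@/g, '')              // drop email symbols
+     .replace(UNSAFE, '-')           // unsafe symbols → -
+     .replace(/\s+/g, '-')           // spaces → -
+     .replace(/-{2,}/g, '-')         // collapse multiple dashes
+     .replace(/^[-.]+|[-.]+$/g, ''); // trim leading/trailing dashes and dots
+   cache.set(name, clean);
+   return clean;
+ }
+
+ // sanitizeFileName('Document ^& symbols.pdf') → 'Document-and-symbols.pdf'
+ ```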
67
388
 
68
389
  ### Examples
69
390
 
@@ -73,12 +394,79 @@ The tool automatically handles problematic characters by:
73
394
  | `File{with}brackets.pdf` | `File-with-brackets.pdf` |
74
395
  | `Document ^& symbols.pdf` | `Document-and-symbols.pdf` |
75
396
  | `CI & PL-20221212(멕시코용).xls` | `CI-and-PL-20221212.xls` |
397
+ | `impresora@nereprint.com_file.xml` | `impresoranereprint.com_file.xml` |
398
+ | `07-3429-3000430 HC.pdf` | `07-3429-3000430-HC.pdf` |
399
+ | `FACTURA IN 3000430.pdf` | `FACTURA-IN-3000430.pdf` |
76
400
 
77
- ## Logging
401
+ ## Logging and Monitoring
78
402
 
79
- The tool maintains logs both locally (`upload.log`) and remotely in your Supabase database. Logs include:
403
+ The tool maintains comprehensive logs both locally and remotely:
80
404
 
81
- - Upload status (success/error/skipped)
82
- - File paths and sanitization changes
405
+ **Local Logging (`arela-upload.log`):**
406
+ - Upload status (SUCCESS/ERROR/SKIPPED/SANITIZED)
407
+ - File paths and sanitization changes
83
408
  - Error messages and timestamps
84
- - Rename operations
409
+ - Rename operations with before/after names
410
+ - Processing statistics and performance metrics
411
+
412
+ **Log Entry Examples:**
413
+ ```
414
+ [2025-09-04T01:17:00.141Z] SUCCESS: /Users/.../file.xml -> 2023/2003180/file.xml
415
+ [2025-09-04T01:17:00.822Z] SANITIZED: file name.pdf → file-name.pdf
416
+ [2025-09-04T01:17:00.856Z] SKIPPED: /Users/.../duplicate.pdf (already exists)
417
+ ```
418
+
419
+ **Remote Logging:**
420
+ - Integration with Supabase database for centralized logging
421
+ - Upload tracking and audit trails
422
+ - Error reporting and monitoring
423
+
424
+ ## Performance Optimizations
425
+
426
+ **Version 0.2.0 introduces several performance optimizations:**
427
+
428
+ - **Pre-compiled Regex Patterns**: Sanitization patterns are compiled once and reused
429
+ - **Sanitization Caching**: File name sanitization results are cached to avoid reprocessing
430
+ - **Batch Processing**: Configurable batch sizes for optimal API usage
431
+ - **Concurrent Processing**: Adjustable concurrency levels for file processing
432
+ - **Smart Skip Logic**: Efficiently skips already processed files using log analysis
433
+ - **Memory Optimization**: Large file outputs are truncated to prevent memory issues
434
+
435
+ ## Version History
436
+
437
+ **v0.2.0** - Latest Release
438
+ - ✨ Added smart year/pedimento auto-detection from file paths
439
+ - ✨ Custom folder structure support with `--folder-structure` option
440
+ - ✨ Client path tracking with `--client-path` option
441
+ - ✨ Performance optimizations with regex pre-compilation
442
+ - ✨ Sanitization result caching for improved speed
443
+ - ✨ Enhanced file sanitization with Korean character support
444
+ - ✨ Improved email character handling in file names
445
+ - ✨ Better error handling and logging
446
+ - 📝 Comprehensive logging with SANITIZED status
447
+ - 🔧 Memory optimization for large file processing
448
+
449
+ ## Troubleshooting
450
+
451
+ **Connection Issues:**
452
+ - Verify `ARELA_API_URL` and `ARELA_API_TOKEN` are correct
453
+ - Check network connectivity to the API endpoint
454
+ - The tool will automatically fall back to Supabase direct mode if the API is unavailable
455
+
456
+ **Performance Issues:**
457
+ - Adjust `--batch-size` for optimal API performance (default: 10)
458
+ - Modify `--concurrency` to control parallel processing (default: 10)
459
+ - Use `--show-stats` to monitor sanitization cache performance
460
+
461
+ **File Issues:**
462
+ - Check file permissions in `UPLOAD_BASE_PATH`
463
+ - Verify `UPLOAD_SOURCES` paths exist and are accessible
464
+ - Review `arela-upload.log` for detailed error information
465
+
466
+ ## Contributing
467
+
468
+ Contributions are welcome! Please feel free to submit a Pull Request.
469
+
470
+ ## License
471
+
472
+ ISC License - see LICENSE file for details.
File without changes
package/commands.md ADDED
@@ -0,0 +1,6 @@
1
+ node src/index.js --stats-only
2
+ node src/index.js --detect-pdfs
3
+ node src/index.js --propagate-arela-path
4
+ node src/index.js --upload-by-rfc --folder-structure palco
5
+
6
+ UPLOAD_RFCS="RFC1|RFC2" node src/index.js --upload-by-rfc --folder-structure target-folder
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@arela/uploader",
3
- "version": "0.1.0",
3
+ "version": "0.2.1",
4
4
  "description": "CLI to upload files/directories to Arela",
5
5
  "bin": {
6
6
  "arela": "./src/index.js"
@@ -28,15 +28,18 @@
28
28
  },
29
29
  "homepage": "https://github.com/inspiraCode/arela-uploader#readme",
30
30
  "dependencies": {
31
- "@supabase/supabase-js": "^2.49.4",
32
- "cli-progress": "^3.12.0",
33
- "commander": "^13.1.0",
34
- "dotenv": "^16.5.0",
35
- "globby": "^14.1.0",
36
- "mime-types": "^3.0.1"
31
+ "@supabase/supabase-js": "2.49.4",
32
+ "cli-progress": "3.12.0",
33
+ "commander": "13.1.0",
34
+ "dotenv": "16.5.0",
35
+ "form-data": "4.0.4",
36
+ "globby": "14.1.0",
37
+ "mime-types": "3.0.1",
38
+ "node-fetch": "3.3.2",
39
+ "office-text-extractor": "3.0.3"
37
40
  },
38
41
  "devDependencies": {
39
- "@trivago/prettier-plugin-sort-imports": "^5.2.2",
40
- "prettier": "^3.5.3"
42
+ "@trivago/prettier-plugin-sort-imports": "5.2.2",
43
+ "prettier": "3.5.3"
41
44
  }
42
45
  }
@@ -0,0 +1,80 @@
1
+ // Document type definitions and extraction utilities
2
+ // Ported from TypeScript to JavaScript for Node.js
3
+
4
+ export class FieldResult {
5
+ constructor(name, found, value) {
6
+ this.name = name;
7
+ this.found = found;
8
+ this.value = value;
9
+ }
10
+ }
11
+
12
+ export class DocumentTypeDefinition {
13
+ constructor(type, extensions, match, extractors, extractNumPedimento, extractPedimentoYear) {
14
+ this.type = type;
15
+ this.extensions = extensions;
16
+ this.match = match;
17
+ this.extractors = extractors;
18
+ this.extractNumPedimento = extractNumPedimento;
19
+ this.extractPedimentoYear = extractPedimentoYear;
20
+ }
21
+ }
22
+
23
+ // Import all document type definitions
24
+ import { pedimentoSimplificadoDefinition } from './document-types/pedimento-simplificado.js';
25
+
26
+ // Registry of all document types
27
+ const documentTypes = [
28
+ pedimentoSimplificadoDefinition,
29
+ // Add more document types here as needed
30
+ ];
31
+
32
+ /**
33
+ * Extract document fields from text content
34
+ * @param {string} source - The text content to analyze
35
+ * @param {string} fileExtension - File extension for context
36
+ * @param {string} filePath - File path for context
37
+ * @returns {[string|null, FieldResult[], string|null, number|null]} - [detectedType, fields, pedimento, year]
38
+ */
39
+ export function extractDocumentFields(source, fileExtension, filePath) {
40
+ if (!source || typeof source !== 'string') {
41
+ return [null, [], null, null];
42
+ }
43
+
44
+ // Try to match against each document type
45
+ for (const docType of documentTypes) {
46
+ // Check if file extension matches
47
+ if (fileExtension && !docType.extensions.includes(fileExtension.toLowerCase())) {
48
+ continue;
49
+ }
50
+
51
+ // Test if content matches this document type
52
+ if (docType.match(source)) {
53
+ console.log(`✅ Matched document type: ${docType.type}`);
54
+
55
+ // Extract all fields
56
+ const fields = [];
57
+ for (const extractor of docType.extractors) {
58
+ try {
59
+ const result = extractor.extract(source);
60
+ fields.push(result);
61
+ if (result.found) {
62
+ console.log(` - ${result.name}: ${result.value}`);
63
+ }
64
+ } catch (error) {
65
+ console.error(`Error extracting field ${extractor.field}:`, error);
66
+ fields.push(new FieldResult(extractor.field, false, null));
67
+ }
68
+ }
69
+
70
+ // Extract pedimento number and year
71
+ const pedimento = docType.extractNumPedimento ? docType.extractNumPedimento(source, fields) : null;
72
+ const year = docType.extractPedimentoYear ? docType.extractPedimentoYear(source, fields) : null;
73
+
74
+ return [docType.type, fields, pedimento, year];
75
+ }
76
+ }
77
+
78
+ console.log('❓ No document type matched');
79
+ return [null, [], null, null];
80
+ }