@arela/uploader 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.env.template CHANGED
@@ -19,6 +19,64 @@ UPLOAD_BASE_PATH=/path/to/your/upload/base
19
19
  UPLOAD_SOURCES=folder1|folder2|folder3
20
20
  UPLOAD_RFCS=rfc1|rfc2|rfc3
21
21
 
22
+ # =============================================================================
23
+ # SCAN CONFIGURATION (for arela scan command)
24
+ # =============================================================================
25
+
26
+ # Company identifier for this CLI instance (required)
27
+ # Use a short, descriptive slug for your company/agency/client
28
+ # Examples: "acme_corp", "cliente_123", "agencia_xyz"
29
+ ARELA_COMPANY_SLUG=
30
+
31
+ # Server identifier (required)
32
+ # Use a unique ID for each server/NAS where arela-cli is installed
33
+ # Examples: "nas01", "server-mx", "storage-01"
34
+ ARELA_SERVER_ID=
35
+
36
+ # Base path label (optional, auto-derived from UPLOAD_BASE_PATH if not set)
37
+ # Short label describing the base path being scanned
38
+ # Examples: "data", "documents", "archive"
39
+ ARELA_BASE_PATH_LABEL=
40
+
41
+ # System file patterns to exclude from scan (comma-separated)
42
+ # These files will be filtered before uploading stats to reduce payload
43
+ SCAN_EXCLUDE_PATTERNS=.DS_Store,Thumbs.db,desktop.ini,__pycache__,.pyc,.tmp,.swp,$RECYCLE.BIN,System Volume Information,~$*
44
+
45
+ # Batch size for scan operations (default: 2000 records per API call)
46
+ SCAN_BATCH_SIZE=2000
47
+
48
+ # Directory depth level for creating separate tables (default: 0)
49
+ # 0 = single table for entire base path
50
+ # 1 = one table per first-level subdirectory
51
+ # 2 = one table per second-level subdirectory, etc.
52
+ # Example: with level=1 and base=/data, creates tables for /data/folder1, /data/folder2, etc.
53
+ SCAN_DIRECTORY_LEVEL=0
54
+
55
+ # =============================================================================
56
+ # PUSH CONFIGURATION (for arela push command)
57
+ # =============================================================================
58
+
59
+ # Filter files to upload by RFC (pipe-separated, optional)
60
+ # If not set, all files with arela_path will be uploaded
61
+ # Examples: "RFC123456ABC|RFC789012DEF"
62
+ PUSH_RFCS=
63
+
64
+ # Filter files to upload by year (pipe-separated, optional)
65
+ # If not set, all files with arela_path will be uploaded
66
+ # Examples: "2023|2024|2025"
67
+ PUSH_YEARS=
68
+
69
+ # Batch size for fetching files from database (default: 100)
70
+ PUSH_BATCH_SIZE=100
71
+
72
+ # Concurrent upload batch size (default: 10)
73
+ # Number of files to upload simultaneously
74
+ PUSH_UPLOAD_BATCH_SIZE=10
75
+
76
+ # Storage bucket for uploaded files (optional, defaults to SUPABASE_BUCKET)
77
+ # Examples: "archivos", "documents", "storage"
78
+ PUSH_BUCKET=arela
79
+
22
80
  # =============================================================================
23
81
  # PERFORMANCE OPTIMIZATION FOR MULTIPLE API REPLICAS
24
82
  # =============================================================================
@@ -30,6 +88,18 @@ MAX_API_CONNECTIONS=10
30
88
  # API Connection Timeout (milliseconds)
31
89
  API_CONNECTION_TIMEOUT=60000
32
90
 
91
+ # API Retry Configuration
92
+ # Maximum number of retry attempts for failed API requests
93
+ API_MAX_RETRIES=3
94
+
95
+ # Enable exponential backoff for retries (true/false)
96
+ # When true, retry delays increase: 1s, 2s, 4s, 8s, 16s
97
+ # When false, uses fixed delay (API_RETRY_DELAY)
98
+ API_RETRY_EXPONENTIAL_BACKOFF=true
99
+
100
+ # Fixed retry delay in milliseconds (only used if exponential backoff is disabled)
101
+ API_RETRY_DELAY=1000
102
+
33
103
  # Batch Processing Configuration
34
104
  # Files processed concurrently per batch (should be >= MAX_API_CONNECTIONS for best performance)
35
105
  BATCH_SIZE=100
@@ -0,0 +1,338 @@
1
+ # API Retry Mechanism
2
+
3
+ ## Overview
4
+
5
+ The `arela scan` and `arela identify` commands now include robust retry logic with exponential backoff for all API requests. This ensures resilience against transient network issues, temporary server overload, and rate limiting.
6
+
7
+ ## Features
8
+
9
+ ### 1. **Automatic Retry on Transient Errors**
10
+
11
+ Retries are automatically triggered for:
12
+ - **Network errors**: Connection reset, timeout, refused, DNS failures
13
+ - **HTTP 429**: Too Many Requests (rate limiting)
14
+ - **HTTP 5xx**: Server errors (500, 502, 503, 504)
15
+
16
+ ### 2. **Exponential Backoff (Default)**
17
+
18
+ When enabled (default), retry delays increase exponentially:
19
+ - Attempt 1: ~1 second
20
+ - Attempt 2: ~2 seconds
21
+ - Attempt 3: ~4 seconds
22
+ - Attempt 4: ~8 seconds
23
+ - Attempt 5: ~16 seconds (max)
24
+
25
+ ### 3. **Jitter to Prevent Thundering Herd**
26
+
27
+ Each retry delay includes ±20% random jitter to prevent multiple clients from retrying simultaneously, which could overwhelm the server.
28
+
29
+ ### 4. **Smart Error Detection**
30
+
31
+ The system distinguishes between:
32
+ - **Retryable errors**: Network issues, server errors, rate limits
33
+ - **Non-retryable errors**: Client errors (400, 401, 403, 404, 409), validation errors
34
+
35
+ Non-retryable errors fail immediately without wasting time on useless retries.
36
+
37
+ ## Configuration
38
+
39
+ ### Environment Variables
40
+
41
+ Add to `.env` file:
42
+
43
+ ```bash
44
+ # Maximum number of retry attempts (default: 3)
45
+ API_MAX_RETRIES=3
46
+
47
+ # Use exponential backoff (default: true)
48
+ # When true: 1s → 2s → 4s → 8s → 16s
49
+ # When false: uses fixed delay
50
+ API_RETRY_EXPONENTIAL_BACKOFF=true
51
+
52
+ # Fixed retry delay in milliseconds (only used if exponential backoff is disabled)
53
+ API_RETRY_DELAY=1000
54
+ ```
55
+
56
+ ### Configuration Examples
57
+
58
+ #### High Reliability (Recommended)
59
+
60
+ ```bash
61
+ API_MAX_RETRIES=5
62
+ API_RETRY_EXPONENTIAL_BACKOFF=true
63
+ ```
64
+
65
+ **Use case**: Production environments with unreliable network or high load
66
+
67
+ #### Fast Failure (Testing/Development)
68
+
69
+ ```bash
70
+ API_MAX_RETRIES=1
71
+ API_RETRY_EXPONENTIAL_BACKOFF=false
72
+ API_RETRY_DELAY=500
73
+ ```
74
+
75
+ **Use case**: Local development where you want quick feedback
76
+
77
+ #### Aggressive Retry (High-Volume Processing)
78
+
79
+ ```bash
80
+ API_MAX_RETRIES=7
81
+ API_RETRY_EXPONENTIAL_BACKOFF=true
82
+ ```
83
+
84
+ **Use case**: Large batch operations where you can't afford to lose progress
85
+
86
+ ## Retry Behavior
87
+
88
+ ### Retryable Scenarios
89
+
90
+ | Error Type | Example | Retry Behavior |
91
+ |------------|---------|----------------|
92
+ | Network timeout | `ETIMEDOUT` | ✅ Retry with backoff |
93
+ | Connection reset | `ECONNRESET` | ✅ Retry with backoff |
94
+ | Connection refused | `ECONNREFUSED` | ✅ Retry with backoff |
95
+ | DNS failure | `ENOTFOUND`, `EAI_AGAIN` | ✅ Retry with backoff |
96
+ | Rate limiting | HTTP 429 | ✅ Retry with backoff |
97
+ | Server error | HTTP 5xx | ✅ Retry with backoff |
98
+
99
+ ### Non-Retryable Scenarios
100
+
101
+ | Error Type | Example | Retry Behavior |
102
+ |------------|---------|----------------|
103
+ | Unauthorized | HTTP 401 | ❌ Fail immediately |
104
+ | Forbidden | HTTP 403 | ❌ Fail immediately |
105
+ | Not found | HTTP 404 | ❌ Fail immediately |
106
+ | Bad request | HTTP 400 | ❌ Fail immediately |
107
+ | Conflict | HTTP 409 | ❌ Fail immediately |
108
+
109
+ ## Logging
110
+
111
+ ### Retry Warnings
112
+
113
+ When a retry is triggered, you'll see warnings like:
114
+
115
+ ```
116
+ ⚠️ API request failed (attempt 1/4): Connection timeout. Retrying in 1234ms...
117
+ ⚠️ API request failed (attempt 2/4): HTTP 503 Service Unavailable. Retrying in 2456ms...
118
+ ```
119
+
120
+ ### Retry Success
121
+
122
+ When a retry succeeds, you'll see:
123
+
124
+ ```
125
+ ℹ️ API request succeeded on attempt 3/4
126
+ ```
127
+
128
+ ### Final Failure
129
+
130
+ If all retries fail, you'll see:
131
+
132
+ ```
133
+ ❌ API request failed after 4 attempt(s): Connection timeout
134
+ ```
135
+
136
+ ## Performance Impact
137
+
138
+ ### With Default Settings (3 retries, exponential backoff)
139
+
140
+ **Best case** (no failures): No overhead
141
+
142
+ **Worst case** (all retries fail):
143
+ - Total retry time: ~1s + 2s + 4s = ~7 seconds
144
+ - Total attempts: 4 (1 initial + 3 retries)
145
+
146
+ ### Optimization Tips
147
+
148
+ 1. **For stable networks**: Reduce `API_MAX_RETRIES` to 1-2 (below the default of 3)
149
+ 2. **For unstable networks**: Increase to 5-7
150
+ 3. **For rate-limited APIs**: Keep exponential backoff enabled
151
+ 4. **For fast development**: Set `API_MAX_RETRIES` to 0 (disables retries) or 1 for quick feedback
152
+
153
+ ## Integration with Commands
154
+
155
+ ### arela scan
156
+
157
+ All API operations during scan now have retry logic:
158
+ - Instance registration (`POST /api/uploader/scan/register`)
159
+ - Batch insert (`POST /api/uploader/scan/batch-insert`)
160
+ - Scan completion (`PATCH /api/uploader/scan/complete`)
161
+
162
+ ### arela identify
163
+
164
+ All API operations during identify now have retry logic:
165
+ - Fetch detection stats (`GET /api/uploader/scan/detection-stats`)
166
+ - Fetch PDFs for detection (`GET /api/uploader/scan/pdfs-for-detection`)
167
+ - Batch update detection (`PATCH /api/uploader/scan/batch-update-detection`)
168
+
169
+ ## Comparison with DatabaseService
170
+
171
+ | Feature | DatabaseService (Supabase) | ScanApiService (HTTP) |
172
+ |---------|---------------------------|----------------------|
173
+ | Retry Logic | ✅ Yes | ✅ Yes |
174
+ | Max Retries | 3 (hardcoded) | 3 (configurable) |
175
+ | Backoff Strategy | Exponential | Exponential or Fixed |
176
+ | Jitter | No | ✅ Yes (±20%) |
177
+ | Error Detection | Generic | HTTP-specific |
178
+ | Configurable | No | ✅ Yes via .env |
179
+
180
+ ## Best Practices
181
+
182
+ ### 1. **Enable in Production**
183
+
184
+ Always use retry logic in production:
185
+
186
+ ```bash
187
+ API_MAX_RETRIES=3
188
+ API_RETRY_EXPONENTIAL_BACKOFF=true
189
+ ```
190
+
191
+ ### 2. **Monitor Retry Rates**
192
+
193
+ Track retry warnings in logs to detect:
194
+ - Network instability
195
+ - Server overload
196
+ - API rate limiting
197
+
198
+ ### 3. **Adjust for Your Environment**
199
+
200
+ - **Cloud/remote**: Higher retries (5-7)
201
+ - **Local/LAN**: Lower retries (1-3)
202
+ - **Rate-limited APIs**: Exponential backoff
203
+
204
+ ### 4. **Use Jitter**
205
+
206
+ Always keep jitter enabled (built-in) to prevent retry storms.
207
+
208
+ ### 5. **Set Connection Timeout**
209
+
210
+ Combine retries with appropriate timeout:
211
+
212
+ ```bash
213
+ API_CONNECTION_TIMEOUT=30000 # 30 seconds
214
+ API_MAX_RETRIES=3
215
+ ```
216
+
217
+ This ensures all retries complete within a reasonable total time.
218
+
219
+ ## Troubleshooting
220
+
221
+ ### Too Many Retries
222
+
223
+ **Symptom**: Commands take too long due to retries
224
+
225
+ **Solution**: Reduce `API_MAX_RETRIES` or disable exponential backoff
226
+
227
+ ```bash
228
+ API_MAX_RETRIES=1
229
+ ```
230
+
231
+ ### Not Enough Retries
232
+
233
+ **Symptom**: Commands fail due to transient errors
234
+
235
+ **Solution**: Increase `API_MAX_RETRIES`
236
+
237
+ ```bash
238
+ API_MAX_RETRIES=5
239
+ ```
240
+
241
+ ### Rate Limiting Issues
242
+
243
+ **Symptom**: Many HTTP 429 errors
244
+
245
+ **Solution**: Ensure exponential backoff is enabled and increase retries
246
+
247
+ ```bash
248
+ API_MAX_RETRIES=5
249
+ API_RETRY_EXPONENTIAL_BACKOFF=true
250
+ ```
251
+
252
+ ### Network Timeout Issues
253
+
254
+ **Symptom**: `ETIMEDOUT` errors
255
+
256
+ **Solution**: Increase connection timeout and retries
257
+
258
+ ```bash
259
+ API_CONNECTION_TIMEOUT=60000 # 60 seconds
260
+ API_MAX_RETRIES=5
261
+ ```
262
+
263
+ ## Implementation Details
264
+
265
+ ### Code Location
266
+
267
+ - **Service**: `arela-uploader/src/services/ScanApiService.js`
268
+ - **Methods**:
269
+ - `#isRetryableError()` - Determines if error should trigger retry
270
+ - `#calculateBackoff()` - Calculates delay between retries
271
+ - `#request()` - Main request method with retry loop
272
+
273
+ ### Retry Loop Logic
274
+
275
+ ```javascript
276
+ for (let attempt = 1; attempt <= maxRetries + 1; attempt++) {
277
+ try {
278
+ // Make request
279
+ const response = await fetch(url, options);
280
+
281
+ // Check if response is ok
282
+ if (!response.ok) {
283
+ const error = new Error(`${response.status} ${response.statusText}`);
284
+
285
+ // Retry if error is retryable
286
+ if (isRetryable(error, response) && attempt <= maxRetries) {
287
+ await sleep(calculateBackoff(attempt));
288
+ continue;
289
+ }
290
+
291
+ throw error;
292
+ }
293
+
294
+ // Success
295
+ return await response.json();
296
+
297
+ } catch (error) {
298
+ // Handle network errors
299
+ if (isRetryable(error) && attempt <= maxRetries) {
300
+ await sleep(calculateBackoff(attempt));
301
+ continue;
302
+ }
303
+
304
+ throw error;
305
+ }
306
+ }
307
+ ```
308
+
309
+ ### Backoff Calculation
310
+
311
+ ```javascript
312
+ function calculateBackoff(attempt) {
313
+ if (exponentialBackoff) {
314
+ // 1s, 2s, 4s, 8s, 16s (max)
315
+ const delay = Math.min(1000 * Math.pow(2, attempt - 1), 16000);
316
+
317
+ // Add jitter (±20%)
318
+ const jitter = delay * 0.2 * (Math.random() * 2 - 1);
319
+ return delay + jitter;
320
+ } else {
321
+ // Fixed delay with jitter
322
+ return fixedDelay + (fixedDelay * 0.2 * (Math.random() * 2 - 1));
323
+ }
324
+ }
325
+ ```
326
+
327
+ ## Future Enhancements
328
+
329
+ Potential improvements:
330
+ 1. **Circuit breaker pattern**: Stop retrying after N consecutive failures
331
+ 2. **Adaptive backoff**: Adjust delays based on error patterns
332
+ 3. **Retry budget**: Limit total retry time per operation
333
+ 4. **Metrics collection**: Track retry rates and success rates
334
+ 5. **Per-endpoint configuration**: Different retry settings for different endpoints
335
+
336
+ ## Conclusion
337
+
338
+ The retry mechanism provides robust error handling for the `arela scan` and `arela identify` commands, ensuring operations can recover from transient failures without manual intervention. Proper configuration and monitoring ensure optimal performance and reliability.