@arela/uploader 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.template +70 -0
- package/docs/API_RETRY_MECHANISM.md +338 -0
- package/docs/ARELA_IDENTIFY_IMPLEMENTATION.md +489 -0
- package/docs/ARELA_IDENTIFY_QUICKREF.md +186 -0
- package/docs/ARELA_PROPAGATE_IMPLEMENTATION.md +581 -0
- package/docs/ARELA_PROPAGATE_QUICKREF.md +272 -0
- package/docs/ARELA_PUSH_IMPLEMENTATION.md +577 -0
- package/docs/ARELA_PUSH_QUICKREF.md +322 -0
- package/docs/ARELA_SCAN_IMPLEMENTATION.md +373 -0
- package/docs/ARELA_SCAN_QUICKREF.md +139 -0
- package/docs/DETECTION_ATTEMPT_TRACKING.md +414 -0
- package/docs/MIGRATION_UPLOADER_TO_FILE_STATS.md +1020 -0
- package/docs/MULTI_LEVEL_DIRECTORY_SCANNING.md +494 -0
- package/docs/STATS_COMMAND_SEQUENCE_DIAGRAM.md +287 -0
- package/docs/STATS_COMMAND_SIMPLE.md +93 -0
- package/package.json +4 -2
- package/src/commands/IdentifyCommand.js +486 -0
- package/src/commands/PropagateCommand.js +474 -0
- package/src/commands/PushCommand.js +473 -0
- package/src/commands/ScanCommand.js +516 -0
- package/src/config/config.js +177 -7
- package/src/file-detection.js +9 -10
- package/src/index.js +150 -0
- package/src/services/DatabaseService.js +2 -2
- package/src/services/ScanApiService.js +646 -0
- package/src/services/upload/ApiUploadService.js +12 -0
package/.env.template
CHANGED
|
@@ -19,6 +19,64 @@ UPLOAD_BASE_PATH=/path/to/your/upload/base
|
|
|
19
19
|
UPLOAD_SOURCES=folder1|folder2|folder3
|
|
20
20
|
UPLOAD_RFCS=rfc1|rfc2|rfc3
|
|
21
21
|
|
|
22
|
+
# =============================================================================
|
|
23
|
+
# SCAN CONFIGURATION (for arela scan command)
|
|
24
|
+
# =============================================================================
|
|
25
|
+
|
|
26
|
+
# Company identifier for this CLI instance (required)
|
|
27
|
+
# Use a short, descriptive slug for your company/agency/client
|
|
28
|
+
# Examples: "acme_corp", "cliente_123", "agencia_xyz"
|
|
29
|
+
ARELA_COMPANY_SLUG=
|
|
30
|
+
|
|
31
|
+
# Server identifier (required)
|
|
32
|
+
# Use a unique ID for each server/NAS where arela-cli is installed
|
|
33
|
+
# Examples: "nas01", "server-mx", "storage-01"
|
|
34
|
+
ARELA_SERVER_ID=
|
|
35
|
+
|
|
36
|
+
# Base path label (optional, auto-derived from UPLOAD_BASE_PATH if not set)
|
|
37
|
+
# Short label describing the base path being scanned
|
|
38
|
+
# Examples: "data", "documents", "archive"
|
|
39
|
+
ARELA_BASE_PATH_LABEL=
|
|
40
|
+
|
|
41
|
+
# System file patterns to exclude from scan (comma-separated)
|
|
42
|
+
# These files will be filtered before uploading stats to reduce payload
|
|
43
|
+
SCAN_EXCLUDE_PATTERNS=.DS_Store,Thumbs.db,desktop.ini,__pycache__,.pyc,.tmp,.swp,$RECYCLE.BIN,System Volume Information,~$*
|
|
44
|
+
|
|
45
|
+
# Batch size for scan operations (default: 2000 records per API call)
|
|
46
|
+
SCAN_BATCH_SIZE=2000
|
|
47
|
+
|
|
48
|
+
# Directory depth level for creating separate tables (default: 0)
|
|
49
|
+
# 0 = single table for entire base path
|
|
50
|
+
# 1 = one table per first-level subdirectory
|
|
51
|
+
# 2 = one table per second-level subdirectory, etc.
|
|
52
|
+
# Example: with level=1 and base=/data, creates tables for /data/folder1, /data/folder2, etc.
|
|
53
|
+
SCAN_DIRECTORY_LEVEL=0
|
|
54
|
+
|
|
55
|
+
# =============================================================================
|
|
56
|
+
# PUSH CONFIGURATION (for arela push command)
|
|
57
|
+
# =============================================================================
|
|
58
|
+
|
|
59
|
+
# Filter files to upload by RFC (pipe-separated, optional)
|
|
60
|
+
# If not set, all files with arela_path will be uploaded
|
|
61
|
+
# Examples: "RFC123456ABC|RFC789012DEF"
|
|
62
|
+
PUSH_RFCS=
|
|
63
|
+
|
|
64
|
+
# Filter files to upload by year (pipe-separated, optional)
|
|
65
|
+
# If not set, all files with arela_path will be uploaded
|
|
66
|
+
# Examples: "2023|2024|2025"
|
|
67
|
+
PUSH_YEARS=
|
|
68
|
+
|
|
69
|
+
# Batch size for fetching files from database (default: 100)
|
|
70
|
+
PUSH_BATCH_SIZE=100
|
|
71
|
+
|
|
72
|
+
# Concurrent upload batch size (default: 10)
|
|
73
|
+
# Number of files to upload simultaneously
|
|
74
|
+
PUSH_UPLOAD_BATCH_SIZE=10
|
|
75
|
+
|
|
76
|
+
# Storage bucket for uploaded files (optional, defaults to SUPABASE_BUCKET)
|
|
77
|
+
# Examples: "archivos", "documents", "storage"
|
|
78
|
+
PUSH_BUCKET=arela
|
|
79
|
+
|
|
22
80
|
# =============================================================================
|
|
23
81
|
# PERFORMANCE OPTIMIZATION FOR MULTIPLE API REPLICAS
|
|
24
82
|
# =============================================================================
|
|
@@ -30,6 +88,18 @@ MAX_API_CONNECTIONS=10
|
|
|
30
88
|
# API Connection Timeout (milliseconds)
|
|
31
89
|
API_CONNECTION_TIMEOUT=60000
|
|
32
90
|
|
|
91
|
+
# API Retry Configuration
|
|
92
|
+
# Maximum number of retry attempts for failed API requests
|
|
93
|
+
API_MAX_RETRIES=3
|
|
94
|
+
|
|
95
|
+
# Enable exponential backoff for retries (true/false)
|
|
96
|
+
# When true, retry delays double on each retry: 1s, 2s, 4s, 8s, 16s (capped at 16s)
|
|
97
|
+
# When false, uses fixed delay (API_RETRY_DELAY)
|
|
98
|
+
API_RETRY_EXPONENTIAL_BACKOFF=true
|
|
99
|
+
|
|
100
|
+
# Fixed retry delay in milliseconds (only used if exponential backoff is disabled)
|
|
101
|
+
API_RETRY_DELAY=1000
|
|
102
|
+
|
|
33
103
|
# Batch Processing Configuration
|
|
34
104
|
# Files processed concurrently per batch (should be >= MAX_API_CONNECTIONS for best performance)
|
|
35
105
|
BATCH_SIZE=100
|
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
# API Retry Mechanism
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
The `arela scan` and `arela identify` commands now include robust retry logic with exponential backoff for all API requests. This ensures resilience against transient network issues, temporary server overload, and rate limiting.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
### 1. **Automatic Retry on Transient Errors**
|
|
10
|
+
|
|
11
|
+
Retries are automatically triggered for:
|
|
12
|
+
- **Network errors**: Connection reset, timeout, refused, DNS failures
|
|
13
|
+
- **HTTP 429**: Too Many Requests (rate limiting)
|
|
14
|
+
- **HTTP 5xx**: Server errors (500, 502, 503, 504)
|
|
15
|
+
|
|
16
|
+
### 2. **Exponential Backoff (Default)**
|
|
17
|
+
|
|
18
|
+
When enabled (default), the wait after each failed attempt doubles, up to a 16-second cap. The delay applied after each failed attempt is:
|
|
19
|
+
- Attempt 1: ~1 second
|
|
20
|
+
- Attempt 2: ~2 seconds
|
|
21
|
+
- Attempt 3: ~4 seconds
|
|
22
|
+
- Attempt 4: ~8 seconds
|
|
23
|
+
- Attempt 5: ~16 seconds (max)
|
|
24
|
+
|
|
25
|
+
### 3. **Jitter to Prevent Thundering Herd**
|
|
26
|
+
|
|
27
|
+
Each retry delay includes ±20% random jitter to prevent multiple clients from retrying simultaneously, which could overwhelm the server.
|
|
28
|
+
|
|
29
|
+
### 4. **Smart Error Detection**
|
|
30
|
+
|
|
31
|
+
The system distinguishes between:
|
|
32
|
+
- **Retryable errors**: Network issues, server errors, rate limits
|
|
33
|
+
- **Non-retryable errors**: Client errors (400, 401, 403, 404, 409), validation errors
|
|
34
|
+
|
|
35
|
+
Non-retryable errors fail immediately without wasting time on useless retries.
|
|
36
|
+
|
|
37
|
+
## Configuration
|
|
38
|
+
|
|
39
|
+
### Environment Variables
|
|
40
|
+
|
|
41
|
+
Add to `.env` file:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
# Maximum number of retry attempts (default: 3)
|
|
45
|
+
API_MAX_RETRIES=3
|
|
46
|
+
|
|
47
|
+
# Use exponential backoff (default: true)
|
|
48
|
+
# When true: 1s → 2s → 4s → 8s → 16s (capped at 16s)
|
|
49
|
+
# When false: uses fixed delay
|
|
50
|
+
API_RETRY_EXPONENTIAL_BACKOFF=true
|
|
51
|
+
|
|
52
|
+
# Fixed retry delay in milliseconds (only used if exponential backoff is disabled)
|
|
53
|
+
API_RETRY_DELAY=1000
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Configuration Examples
|
|
57
|
+
|
|
58
|
+
#### High Reliability (Recommended)
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
API_MAX_RETRIES=5
|
|
62
|
+
API_RETRY_EXPONENTIAL_BACKOFF=true
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
**Use case**: Production environments with unreliable network or high load
|
|
66
|
+
|
|
67
|
+
#### Fast Failure (Testing/Development)
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
API_MAX_RETRIES=1
|
|
71
|
+
API_RETRY_EXPONENTIAL_BACKOFF=false
|
|
72
|
+
API_RETRY_DELAY=500
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
**Use case**: Local development where you want quick feedback
|
|
76
|
+
|
|
77
|
+
#### Aggressive Retry (High-Volume Processing)
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
API_MAX_RETRIES=7
|
|
81
|
+
API_RETRY_EXPONENTIAL_BACKOFF=true
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
**Use case**: Large batch operations where you can't afford to lose progress
|
|
85
|
+
|
|
86
|
+
## Retry Behavior
|
|
87
|
+
|
|
88
|
+
### Retryable Scenarios
|
|
89
|
+
|
|
90
|
+
| Error Type | Example | Retry Behavior |
|
|
91
|
+
|------------|---------|----------------|
|
|
92
|
+
| Network timeout | `ETIMEDOUT` | ✅ Retry with backoff |
|
|
93
|
+
| Connection reset | `ECONNRESET` | ✅ Retry with backoff |
|
|
94
|
+
| Connection refused | `ECONNREFUSED` | ✅ Retry with backoff |
|
|
95
|
+
| DNS failure | `ENOTFOUND`, `EAI_AGAIN` | ✅ Retry with backoff |
|
|
96
|
+
| Rate limiting | HTTP 429 | ✅ Retry with backoff |
|
|
97
|
+
| Server error | HTTP 5xx | ✅ Retry with backoff |
|
|
98
|
+
|
|
99
|
+
### Non-Retryable Scenarios
|
|
100
|
+
|
|
101
|
+
| Error Type | Example | Retry Behavior |
|
|
102
|
+
|------------|---------|----------------|
|
|
103
|
+
| Unauthorized | HTTP 401 | ❌ Fail immediately |
|
|
104
|
+
| Forbidden | HTTP 403 | ❌ Fail immediately |
|
|
105
|
+
| Not found | HTTP 404 | ❌ Fail immediately |
|
|
106
|
+
| Bad request | HTTP 400 | ❌ Fail immediately |
|
|
107
|
+
| Conflict | HTTP 409 | ❌ Fail immediately |
|
|
108
|
+
|
|
109
|
+
## Logging
|
|
110
|
+
|
|
111
|
+
### Retry Warnings
|
|
112
|
+
|
|
113
|
+
When a retry is triggered, you'll see warnings like:
|
|
114
|
+
|
|
115
|
+
```
|
|
116
|
+
⚠️ API request failed (attempt 1/4): Connection timeout. Retrying in 1134ms...
|
|
117
|
+
⚠️ API request failed (attempt 2/4): HTTP 503 Service Unavailable. Retrying in 2256ms...
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Retry Success
|
|
121
|
+
|
|
122
|
+
When a retry succeeds, you'll see:
|
|
123
|
+
|
|
124
|
+
```
|
|
125
|
+
ℹ️ API request succeeded on attempt 3/4
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Final Failure
|
|
129
|
+
|
|
130
|
+
If all retries fail, you'll see:
|
|
131
|
+
|
|
132
|
+
```
|
|
133
|
+
❌ API request failed after 4 attempt(s): Connection timeout
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Performance Impact
|
|
137
|
+
|
|
138
|
+
### With Default Settings (3 retries, exponential backoff)
|
|
139
|
+
|
|
140
|
+
**Best case** (no failures): No overhead
|
|
141
|
+
|
|
142
|
+
**Worst case** (all retries fail):
|
|
143
|
+
- Total backoff delay: ~1s + 2s + 4s = ~7 seconds (not counting the time spent on the failed requests themselves)
|
|
144
|
+
- Total attempts: 4 (1 initial + 3 retries)
|
|
145
|
+
|
|
146
|
+
### Optimization Tips
|
|
147
|
+
|
|
148
|
+
1. **For stable networks**: Reduce `API_MAX_RETRIES` to 1-2 (the default is 3)
|
|
149
|
+
2. **For unstable networks**: Increase to 5-7
|
|
150
|
+
3. **For rate-limited APIs**: Keep exponential backoff enabled
|
|
151
|
+
4. **For fast development**: Disable retries or set to 1
|
|
152
|
+
|
|
153
|
+
## Integration with Commands
|
|
154
|
+
|
|
155
|
+
### arela scan
|
|
156
|
+
|
|
157
|
+
All API operations during scan now have retry logic:
|
|
158
|
+
- Instance registration (`POST /api/uploader/scan/register`)
|
|
159
|
+
- Batch insert (`POST /api/uploader/scan/batch-insert`)
|
|
160
|
+
- Scan completion (`PATCH /api/uploader/scan/complete`)
|
|
161
|
+
|
|
162
|
+
### arela identify
|
|
163
|
+
|
|
164
|
+
All API operations during identify now have retry logic:
|
|
165
|
+
- Fetch detection stats (`GET /api/uploader/scan/detection-stats`)
|
|
166
|
+
- Fetch PDFs for detection (`GET /api/uploader/scan/pdfs-for-detection`)
|
|
167
|
+
- Batch update detection (`PATCH /api/uploader/scan/batch-update-detection`)
|
|
168
|
+
|
|
169
|
+
## Comparison with DatabaseService
|
|
170
|
+
|
|
171
|
+
| Feature | DatabaseService (Supabase) | ScanApiService (HTTP) |
|
|
172
|
+
|---------|---------------------------|----------------------|
|
|
173
|
+
| Retry Logic | ✅ Yes | ✅ Yes |
|
|
174
|
+
| Max Retries | 3 (hardcoded) | 3 (configurable) |
|
|
175
|
+
| Backoff Strategy | Exponential | Exponential or Fixed |
|
|
176
|
+
| Jitter | No | ✅ Yes (±20%) |
|
|
177
|
+
| Error Detection | Generic | HTTP-specific |
|
|
178
|
+
| Configurable | No | ✅ Yes via .env |
|
|
179
|
+
|
|
180
|
+
## Best Practices
|
|
181
|
+
|
|
182
|
+
### 1. **Enable in Production**
|
|
183
|
+
|
|
184
|
+
Always use retry logic in production:
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
API_MAX_RETRIES=3
|
|
188
|
+
API_RETRY_EXPONENTIAL_BACKOFF=true
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### 2. **Monitor Retry Rates**
|
|
192
|
+
|
|
193
|
+
Track retry warnings in logs to detect:
|
|
194
|
+
- Network instability
|
|
195
|
+
- Server overload
|
|
196
|
+
- API rate limiting
|
|
197
|
+
|
|
198
|
+
### 3. **Adjust for Your Environment**
|
|
199
|
+
|
|
200
|
+
- **Cloud/remote**: Higher retries (5-7)
|
|
201
|
+
- **Local/LAN**: Lower retries (1-3)
|
|
202
|
+
- **Rate-limited APIs**: Exponential backoff
|
|
203
|
+
|
|
204
|
+
### 4. **Use Jitter**
|
|
205
|
+
|
|
206
|
+
Jitter is built in and always applied to every retry delay, so no configuration is needed to prevent retry storms.
|
|
207
|
+
|
|
208
|
+
### 5. **Set Connection Timeout**
|
|
209
|
+
|
|
210
|
+
Combine retries with appropriate timeout:
|
|
211
|
+
|
|
212
|
+
```bash
|
|
213
|
+
API_CONNECTION_TIMEOUT=30000 # 30 seconds
|
|
214
|
+
API_MAX_RETRIES=3
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
This ensures that retries complete within a reasonable amount of time.
|
|
218
|
+
|
|
219
|
+
## Troubleshooting
|
|
220
|
+
|
|
221
|
+
### Too Many Retries
|
|
222
|
+
|
|
223
|
+
**Symptom**: Commands take too long due to retries
|
|
224
|
+
|
|
225
|
+
**Solution**: Reduce `API_MAX_RETRIES` or disable exponential backoff
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
API_MAX_RETRIES=1
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
### Not Enough Retries
|
|
232
|
+
|
|
233
|
+
**Symptom**: Commands fail due to transient errors
|
|
234
|
+
|
|
235
|
+
**Solution**: Increase `API_MAX_RETRIES`
|
|
236
|
+
|
|
237
|
+
```bash
|
|
238
|
+
API_MAX_RETRIES=5
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
### Rate Limiting Issues
|
|
242
|
+
|
|
243
|
+
**Symptom**: Many HTTP 429 errors
|
|
244
|
+
|
|
245
|
+
**Solution**: Ensure exponential backoff is enabled and increase retries
|
|
246
|
+
|
|
247
|
+
```bash
|
|
248
|
+
API_MAX_RETRIES=5
|
|
249
|
+
API_RETRY_EXPONENTIAL_BACKOFF=true
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
### Network Timeout Issues
|
|
253
|
+
|
|
254
|
+
**Symptom**: `ETIMEDOUT` errors
|
|
255
|
+
|
|
256
|
+
**Solution**: Increase connection timeout and retries
|
|
257
|
+
|
|
258
|
+
```bash
|
|
259
|
+
API_CONNECTION_TIMEOUT=60000 # 60 seconds
|
|
260
|
+
API_MAX_RETRIES=5
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
## Implementation Details
|
|
264
|
+
|
|
265
|
+
### Code Location
|
|
266
|
+
|
|
267
|
+
- **Service**: `arela-uploader/src/services/ScanApiService.js`
|
|
268
|
+
- **Methods**:
|
|
269
|
+
- `#isRetryableError()` - Determines if error should trigger retry
|
|
270
|
+
- `#calculateBackoff()` - Calculates delay between retries
|
|
271
|
+
- `#request()` - Main request method with retry loop
|
|
272
|
+
|
|
273
|
+
### Retry Loop Logic
|
|
274
|
+
|
|
275
|
+
```javascript
|
|
276
|
+
for (let attempt = 1; attempt <= maxRetries + 1; attempt++) {
|
|
277
|
+
try {
|
|
278
|
+
// Make request
|
|
279
|
+
const response = await fetch(url, options);
|
|
280
|
+
|
|
281
|
+
// Check if response is ok
|
|
282
|
+
if (!response.ok) {
|
|
283
|
+
const error = new Error(`${response.status} ${response.statusText}`);
|
|
284
|
+
|
|
285
|
+
// Retry if error is retryable
|
|
286
|
+
if (isRetryable(error, response) && attempt <= maxRetries) {
|
|
287
|
+
await sleep(calculateBackoff(attempt));
|
|
288
|
+
continue;
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
throw error;
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
// Success
|
|
295
|
+
return await response.json();
|
|
296
|
+
|
|
297
|
+
} catch (error) {
|
|
298
|
+
// Handle network errors
|
|
299
|
+
if (isRetryable(error) && attempt <= maxRetries) {
|
|
300
|
+
await sleep(calculateBackoff(attempt));
|
|
301
|
+
continue;
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
throw error;
|
|
305
|
+
}
|
|
306
|
+
}
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
### Backoff Calculation
|
|
310
|
+
|
|
311
|
+
```javascript
|
|
312
|
+
function calculateBackoff(attempt) {
|
|
313
|
+
if (exponentialBackoff) {
|
|
314
|
+
// 1s, 2s, 4s, 8s, 16s (max)
|
|
315
|
+
const delay = Math.min(1000 * Math.pow(2, attempt - 1), 16000);
|
|
316
|
+
|
|
317
|
+
// Add jitter (±20%)
|
|
318
|
+
const jitter = delay * 0.2 * (Math.random() * 2 - 1);
|
|
319
|
+
return delay + jitter;
|
|
320
|
+
} else {
|
|
321
|
+
// Fixed delay with jitter
|
|
322
|
+
return fixedDelay + (fixedDelay * 0.2 * (Math.random() * 2 - 1));
|
|
323
|
+
}
|
|
324
|
+
}
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
## Future Enhancements
|
|
328
|
+
|
|
329
|
+
Potential improvements:
|
|
330
|
+
1. **Circuit breaker pattern**: Stop retrying after N consecutive failures
|
|
331
|
+
2. **Adaptive backoff**: Adjust delays based on error patterns
|
|
332
|
+
3. **Retry budget**: Limit total retry time per operation
|
|
333
|
+
4. **Metrics collection**: Track retry rates and success rates
|
|
334
|
+
5. **Per-endpoint configuration**: Different retry settings for different endpoints
|
|
335
|
+
|
|
336
|
+
## Conclusion
|
|
337
|
+
|
|
338
|
+
The retry mechanism provides robust error handling for the `arela scan` and `arela identify` commands, ensuring operations can recover from transient failures without manual intervention. Proper configuration and monitoring ensure optimal performance and reliability.
|