@aj-archipelago/cortex 1.3.51 → 1.3.53
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/helper-apps/cortex-file-handler/{.env.test.azure → .env.test.azure.sample} +2 -1
- package/helper-apps/cortex-file-handler/{.env.test.gcs → .env.test.gcs.sample} +2 -1
- package/helper-apps/cortex-file-handler/{.env.test → .env.test.sample} +2 -1
- package/helper-apps/cortex-file-handler/Dockerfile +1 -1
- package/helper-apps/cortex-file-handler/INTERFACE.md +178 -0
- package/helper-apps/cortex-file-handler/package.json +4 -3
- package/helper-apps/cortex-file-handler/scripts/test-azure.sh +3 -0
- package/helper-apps/cortex-file-handler/{blobHandler.js → src/blobHandler.js} +167 -99
- package/helper-apps/cortex-file-handler/{fileChunker.js → src/fileChunker.js} +11 -24
- package/helper-apps/cortex-file-handler/{index.js → src/index.js} +236 -256
- package/helper-apps/cortex-file-handler/{services → src/services}/ConversionService.js +39 -18
- package/helper-apps/cortex-file-handler/{services → src/services}/FileConversionService.js +7 -3
- package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +177 -0
- package/helper-apps/cortex-file-handler/src/services/storage/GCSStorageProvider.js +258 -0
- package/helper-apps/cortex-file-handler/src/services/storage/LocalStorageProvider.js +182 -0
- package/helper-apps/cortex-file-handler/src/services/storage/StorageFactory.js +86 -0
- package/helper-apps/cortex-file-handler/src/services/storage/StorageProvider.js +53 -0
- package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +259 -0
- package/helper-apps/cortex-file-handler/{start.js → src/start.js} +1 -1
- package/helper-apps/cortex-file-handler/src/utils/filenameUtils.js +28 -0
- package/helper-apps/cortex-file-handler/tests/FileConversionService.test.js +1 -1
- package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +4 -4
- package/helper-apps/cortex-file-handler/tests/conversionResilience.test.js +152 -0
- package/helper-apps/cortex-file-handler/tests/fileChunker.test.js +2 -28
- package/helper-apps/cortex-file-handler/tests/fileUpload.test.js +134 -23
- package/helper-apps/cortex-file-handler/tests/getOperations.test.js +307 -0
- package/helper-apps/cortex-file-handler/tests/postOperations.test.js +291 -0
- package/helper-apps/cortex-file-handler/tests/start.test.js +50 -14
- package/helper-apps/cortex-file-handler/tests/storage/AzureStorageProvider.test.js +120 -0
- package/helper-apps/cortex-file-handler/tests/storage/GCSStorageProvider.test.js +193 -0
- package/helper-apps/cortex-file-handler/tests/storage/LocalStorageProvider.test.js +148 -0
- package/helper-apps/cortex-file-handler/tests/storage/StorageFactory.test.js +100 -0
- package/helper-apps/cortex-file-handler/tests/storage/StorageService.test.js +113 -0
- package/helper-apps/cortex-file-handler/tests/testUtils.helper.js +73 -19
- package/lib/entityConstants.js +17 -2
- package/package.json +1 -1
- /package/helper-apps/cortex-file-handler/{constants.js → src/constants.js} +0 -0
- /package/helper-apps/cortex-file-handler/{docHelper.js → src/docHelper.js} +0 -0
- /package/helper-apps/cortex-file-handler/{helper.js → src/helper.js} +0 -0
- /package/helper-apps/cortex-file-handler/{localFileHandler.js → src/localFileHandler.js} +0 -0
- /package/helper-apps/cortex-file-handler/{redis.js → src/redis.js} +0 -0
`package/helper-apps/cortex-file-handler/.env.test.azure.sample`:

```diff
@@ -3,4 +3,5 @@ REDIS_CONNECTION_STRING=redis://default:redispw@localhost:32768
 AZURE_STORAGE_CONNECTION_STRING=UseDevelopmentStorage=true
 AZURE_STORAGE_CONTAINER_NAME=test-container
 NODE_ENV=test
-PORT=7072 # Different port for testing
+PORT=7072 # Different port for testing
+MARKITDOWN_CONVERT_URL= #cortex-markitdown url
```
`package/helper-apps/cortex-file-handler/.env.test.gcs.sample`:

```diff
@@ -6,4 +6,5 @@ GCS_BUCKETNAME=cortextempfiles
 AZURE_STORAGE_CONNECTION_STRING=UseDevelopmentStorage=true
 AZURE_STORAGE_CONTAINER_NAME=test-container
 NODE_ENV=test
-PORT=7072 # Different port for testing
+PORT=7072 # Different port for testing
+MARKITDOWN_CONVERT_URL= #cortex-markitdown url
```
`package/helper-apps/cortex-file-handler/.env.test.sample`:

```diff
@@ -4,4 +4,5 @@ REDIS_CONNECTION_STRING=redis://default:redispw@localhost:32768
 AZURE_STORAGE_CONTAINER_NAME=test-container
 #GCP_SERVICE_ACCOUNT_KEY={"type":"service_account","project_id":"test-project"}
 NODE_ENV=test
-PORT=7072 # Different port for testing
+PORT=7072 # Different port for testing
+MARKITDOWN_CONVERT_URL= #cortex-markitdown url
```
`package/helper-apps/cortex-file-handler/INTERFACE.md` (new file):

```diff
@@ -0,0 +1,178 @@
+# Cortex File Handler Interface Documentation
+
+## Overview
+The Cortex File Handler is a service that processes files through various operations including uploading, downloading, chunking, and document processing. It supports multiple storage backends (Azure Blob Storage, Google Cloud Storage, and Local File System).
+
+## Request Methods
+
+### POST
+- **Purpose**: Upload a file
+- **Content-Type**: `multipart/form-data`
+- **Parameters**:
+  - `hash` (optional): Unique identifier for the file
+  - `requestId` (required): Unique identifier for the request
+  - File content must be included in the form data
+- **Behavior**:
+  - Uploads file to primary storage (Azure or Local)
+  - If GCS is configured, also uploads to GCS
+  - If hash is provided, stores file metadata in Redis
+  - Returns upload result with file URLs
+- **Response**: Object containing:
+  - `url`: Primary storage URL
+  - `gcs`: GCS URL (if GCS is configured)
+  - `hash`: Hash value (if provided)
+  - `message`: Success message
+  - `filename`: Original filename
+- **Note**: The `save` parameter is not supported in POST requests. To convert and save a document as text, use GET with the `save` parameter.
+
+### GET
+- **Purpose**: Process or retrieve files
+- **Parameters** (can be in query string or request body):
+  - `uri` (required if not using fetch/load/restore): URL of the file to process
+    - Requires `requestId` parameter
+    - No Redis caching
+    - Direct processing based on file type
+  - `requestId` (required with `uri`): Unique identifier for the request
+  - `save` (optional): If true, saves document as text file
+    - When true, converts document to text and saves to primary storage only (Azure or Local)
+    - Does not save to GCS
+    - Original document is deleted from storage after text conversion
+  - `hash` (optional): Unique identifier for the file
+  - `checkHash` (optional): Check if hash exists
+  - `clearHash` (optional): Remove hash from storage
+  - `fetch`/`load`/`restore` (optional): URL to fetch remote file (these are aliases - any of the three parameters will trigger the same remote file processing behavior)
+    - Does not require `requestId`
+    - Uses Redis caching
+    - Downloads and validates file first
+    - Ensures correct file extension
+    - Truncates long filenames
+- **Behavior**:
+  - For documents (PDF, DOC, etc.):
+    - If `save=true`:
+      - Converts document to text
+      - Saves text file to primary storage (Azure or Local)
+      - Deletes original document from storage
+      - Does not save to GCS
+      - Returns object with primary storage URL
+    - If `save=false`:
+      - Converts document to text
+      - Returns array of text chunks
+      - Does not persist any files
+  - For media files:
+    - Splits into chunks
+    - Uploads chunks to primary storage and GCS (if configured)
+    - Returns chunk information with offsets
+  - For remote files (`fetch`/`load`/`restore`):
+    - Downloads file from URL
+    - Processes based on file type
+    - Returns processed result
+    - Caches result in Redis using URL as key
+    - Updates Redis timestamp on subsequent requests
+    - Truncates filenames longer than 200 characters
+    - Ensures correct file extension based on content type
+
+### DELETE
+- **Purpose**: Remove files from storage
+- **Parameters** (can be in query string or request body):
+  - `requestId` (required): Unique identifier for the request
+- **Behavior**:
+  - Deletes file from primary storage (Azure or Local)
+  - Deletes file from GCS if configured
+  - Returns deletion result
+- **Response**: Array of deleted file URLs
+
+## Storage Configuration
+- **Azure**: Enabled if `AZURE_STORAGE_CONNECTION_STRING` is set
+- **GCS**: Enabled if `GCP_SERVICE_ACCOUNT_KEY_BASE64` or `GCP_SERVICE_ACCOUNT_KEY` is set
+- **Local**: Used as fallback if Azure is not configured
+
+## Response Format
+- **Success**:
+  - Status: 200
+  - Body: Varies by operation (see specific methods above)
+- **Error**:
+  - Status: 400/404/500
+  - Body: Error message string
+
+## Progress Tracking
+- Progress updates are published to Redis for each operation
+- Progress includes:
+  - `progress`: Completion percentage (0-1)
+  - `completedCount`: Number of completed steps
+  - `totalCount`: Total number of steps
+  - `numberOfChunks`: Number of chunks (for media files)
+  - `data`: Additional operation data
+- Progress updates are published to Redis channel associated with `requestId`
+
+## File Types
+- **Documents**: Processed based on `DOC_EXTENSIONS` list
+  - Supported extensions:
+    - Text: .txt, .json, .csv, .md, .xml, .js, .html, .css
+    - Office: .doc, .docx, .xls, .xlsx
+  - Document processing limitations:
+    - PDFs: Does not support scanned, encrypted, or password-protected PDFs
+    - Requires OCR for PDFs without embedded fonts
+  - Text chunking:
+    - Maximum chunk size: 10,000 characters
+    - Chunks are split at sentence boundaries when possible
+    - Returns array of text chunks
+- **Media**: All other file types, processed through chunking
+  - Chunked into smaller pieces for processing
+  - Each chunk is stored separately
+  - Media chunking behavior:
+    - Default chunk duration: 500 seconds
+    - Chunks are processed in parallel (3 at a time)
+    - Audio is converted to MP3 format (128kbps)
+    - Uses 4MB read buffer for file processing
+  - Supported media types:
+    - Images: .jpg, .jpeg, .png, .webp, .heic, .heif, .pdf
+    - Video: .mp4, .mpeg, .mov, .avi, .flv, .mpg, .webm, .wmv, .3gp
+    - Audio: .wav, .mp3, .aac, .ogg, .flac, .m4a
+  - File download behavior:
+    - 30 second timeout for downloads
+    - Supports streaming downloads
+    - Handles URL encoding/decoding
+    - Truncates filenames longer than 200 characters
+
+## Storage Behavior
+- **Primary Storage** (Azure or Local):
+  - Files are stored with UUID-based names
+  - Organized by requestId folders
+  - Azure: Uses SAS tokens for access
+  - Local: Served via HTTP on configured port
+- **GCS** (if configured):
+  - Files stored with gs:// protocol URLs
+  - Same folder structure as primary storage
+  - Only used for media file chunks
+- **Redis**:
+  - Stores file metadata and URLs
+  - Used for caching remote file results
+  - Tracks file access timestamps
+  - Used for progress tracking
+
+## Cleanup
+- Automatic cleanup of inactive files
+- Removes files from:
+  - Primary storage (Azure/Local)
+  - GCS (if configured)
+  - Redis file store map
+- Cleanup is triggered on each request but only runs if not already in progress
+- Temporary files are cleaned up:
+  - After 1 hour of inactivity
+  - After successful processing
+  - On error conditions
+
+## Error Handling
+- **400 Bad Request**:
+  - Missing required parameters
+  - Invalid or inaccessible URL
+  - Unsupported file type
+- **404 Not Found**:
+  - File or hash not found
+  - File not found in storage
+- **500 Internal Server Error**:
+  - Processing errors
+  - Storage errors
+  - Document conversion errors
+  - PDF processing errors (scanned, encrypted, password-protected)
+- All errors include descriptive message in response body
```
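For orientation, here is a minimal client sketch of the documented POST upload followed by a GET `save=true` conversion. The handler path and host below are placeholders (only the test port 7072 appears in the env samples above); the field names (`requestId`, the file part) and query parameters (`uri`, `save`) come from INTERFACE.md.

```js
// Sketch only: the endpoint URL is hypothetical, not taken from this diff.
// Assumes Node 18+ (global fetch/FormData/Blob).
const HANDLER = 'http://localhost:7072/api/CortexFileHandler'; // placeholder URL

async function uploadThenConvert(bytes, filename, requestId) {
  // POST: multipart upload with the documented requestId field
  const form = new FormData();
  form.append('requestId', requestId);
  form.append('file', new Blob([bytes]), filename);
  const uploaded = await (await fetch(HANDLER, { method: 'POST', body: form })).json();
  // uploaded.url is the primary-storage URL; uploaded.gcs is set only when GCS is configured

  // GET with save=true: convert the stored document to text and persist it
  const qs = new URLSearchParams({ uri: uploaded.url, requestId, save: 'true' });
  return (await fetch(`${HANDLER}?${qs}`)).json();
}
```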
`package/helper-apps/cortex-file-handler/package.json`:

```diff
@@ -1,11 +1,12 @@
 {
   "name": "@aj-archipelago/cortex-file-handler",
-  "version": "
+  "version": "2.0.02",
   "description": "File handling service for Cortex - handles file uploads, media chunking, and document processing",
   "type": "module",
+  "main": "src/index.js",
   "scripts": {
-    "start": "node start.js",
-    "dev": "node -r dotenv/config start.js",
+    "start": "node src/start.js",
+    "dev": "node -r dotenv/config src/start.js",
     "test": "DOTENV_CONFIG_PATH=.env.test NODE_ENV=test node -r dotenv/config node_modules/ava/entrypoints/cli.mjs",
     "test:azure": "DOTENV_CONFIG_PATH=.env.test.azure NODE_ENV=test ./scripts/test-azure.sh",
     "test:watch": "DOTENV_CONFIG_PATH=.env.test NODE_ENV=test node -r dotenv/config node_modules/ava/entrypoints/cli.mjs --watch",
```
`package/helper-apps/cortex-file-handler/src/blobHandler.js`:

```diff
@@ -23,6 +23,7 @@ import { CONVERTED_EXTENSIONS } from './constants.js';
 import mime from 'mime-types';
 
 import os from 'os';
+import { sanitizeFilename } from './utils/filenameUtils.js';
 
 import { FileConversionService } from './services/FileConversionService.js';
 
```
```diff
@@ -81,8 +82,15 @@ function ensureUnencodedGcsUrl(url) {
   }
   // Split into bucket and path parts
   const [bucket, ...pathParts] = url.replace('gs://', '').split('/');
-  // Reconstruct URL with decoded path parts
-  return `gs://${bucket}/${pathParts.map(part =>
+  // Reconstruct URL with decoded path parts, handling invalid characters
+  return `gs://${bucket}/${pathParts.map(part => {
+    try {
+      return decodeURIComponent(part);
+    } catch (error) {
+      // If decoding fails, sanitize the filename by removing invalid characters
+      return part.replace(/[^\w\-\.]/g, '_');
+    }
+  }).join('/')}`;
 }
 
 async function gcsUrlExists(url, defaultReturn = false) {
```
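The try/catch added here matters because `decodeURIComponent` throws a `URIError` on malformed percent-sequences, which would otherwise abort the whole URL reconstruction. A quick illustration of both branches:

```js
// decodeURIComponent succeeds on well-formed escapes...
decodeURIComponent('report%20final.pdf'); // => 'report final.pdf'

// ...but throws on malformed ones, e.g. a lone UTF-8 lead byte:
// decodeURIComponent('report%E0.pdf'); // URIError: URI malformed

// The catch branch falls back to the same character class used elsewhere:
'report%E0.pdf'.replace(/[^\w\-\.]/g, '_'); // => 'report_E0.pdf'
```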
```diff
@@ -348,14 +356,11 @@ function uploadBlob(
           return;
         }
         tempFilePath = path.join(tempDir, safeFilename);
-        console.log('Temp dir:', tempDir, 'Original filename:', filename, 'Safe filename:', safeFilename, 'Temp file path:', tempFilePath);
-        console.log('About to create write stream for:', tempFilePath);
         try {
           diskWriteStream = fs.createWriteStream(tempFilePath, {
             highWaterMark: 1024 * 1024,
             autoClose: true,
           });
-          console.log('Write stream created successfully for:', tempFilePath);
         } catch (err) {
           console.error('Error creating write stream:', err, 'Temp dir exists:', fs.existsSync(tempDir));
           errorOccurred = true;
```
```diff
@@ -476,7 +481,58 @@ function uploadBlob(
           }
         }
 
-        //
+        // After original uploads, handle optional conversion
+        const conversionService = new FileConversionService(context, !saveToLocal);
+
+        if (conversionService.needsConversion(safeFilename)) {
+          try {
+            context.log('Starting file conversion (busboy)...');
+
+            // Ensure we have a local copy of the file for conversion
+            let localPathForConversion = tempFilePath;
+
+            if (!localPathForConversion) {
+              // No temp file was written (saveToLocal === false). Download from primary URL.
+              const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'convert-'));
+              localPathForConversion = path.join(tmpDir, safeFilename);
+              await conversionService._downloadFile(result.url, localPathForConversion);
+            } else {
+              // Wait until disk write completes to guarantee full file is present
+              if (diskWritePromise) {
+                await diskWritePromise;
+              }
+            }
+
+            // Perform the conversion
+            const conversion = await conversionService.convertFile(localPathForConversion, result.url);
+            context.log('File conversion completed (busboy):', conversion);
+
+            if (conversion.converted) {
+              context.log('Saving converted file (busboy)...');
+              // Save converted file to primary storage
+              const convertedSaveResult = await conversionService._saveConvertedFile(conversion.convertedPath, requestId);
+
+              // Optionally save to GCS
+              let convertedGcsUrl;
+              if (conversionService._isGCSConfigured()) {
+                convertedGcsUrl = await conversionService._uploadChunkToGCS(conversion.convertedPath, requestId);
+              }
+
+              // Attach to response body
+              result.converted = {
+                url: convertedSaveResult.url,
+                gcs: convertedGcsUrl,
+              };
+              context.log('Conversion process (busboy) completed successfully');
+            }
+          } catch (convErr) {
+            console.error('Error converting file (busboy):', convErr);
+            context.log('Error during conversion (busboy):', convErr.message);
+            // Continue without failing the upload
+          }
+        }
+
+        // Respond after conversion (if any)
         context.res = { status: 200, body: result };
         resolve(result);
       } catch (err) {
```
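With this block in place, a successful POST response for a convertible upload carries a `converted` object next to the original fields. An illustrative shape (all values are placeholders; the converted file's name and extension depend on the conversion service):

```js
// Illustrative only - field names come from the code above, values are placeholders.
const exampleResponse = {
  url: 'https://<account>.blob.core.windows.net/<container>/<requestId>/report.docx?<sas>',
  filename: 'report.docx',
  converted: {
    url: 'https://<account>.blob.core.windows.net/<container>/<requestId>/report.md?<sas>',
    gcs: 'gs://cortextempfiles/<requestId>/report.md', // undefined unless GCS is configured
  },
};
```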
```diff
@@ -552,23 +608,24 @@ function uploadBlob(
 async function saveToLocalStorage(context, requestId, encodedFilename, file) {
   const localPath = join(publicFolder, requestId);
   fs.mkdirSync(localPath, { recursive: true });
-
-
+
+  // Sanitize filename by removing invalid characters
+  const sanitizedFilename = sanitizeFilename(encodedFilename);
+  const destinationPath = `${localPath}/${sanitizedFilename}`;
+
   await pipeline(file, fs.createWriteStream(destinationPath));
-  return `http://${ipAddress}:${port}/files/${requestId}/${
+  return `http://${ipAddress}:${port}/files/${requestId}/${sanitizedFilename}`;
 }
 
 // Helper function to handle Azure blob storage
 async function saveToAzureStorage(context, encodedFilename, file) {
   const { containerClient } = await getBlobClient();
   const contentType = mime.lookup(encodedFilename);
-
-  //
-  let blobName = encodedFilename;
-
-
-  }
-
+
+  // Create a safe blob name that is URI-encoded once (no double encoding)
+  let blobName = sanitizeFilename(encodedFilename);
+  blobName = encodeURIComponent(blobName);
+
   const options = {
     blobHTTPHeaders: contentType ? { blobContentType: contentType } : {},
     maxConcurrency: 50,
```
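Both helpers now route filenames through `sanitizeFilename` from the new `src/utils/filenameUtils.js` (+28 lines, body not shown in this diff). A minimal hypothetical sketch consistent with how it is called here, reusing the character class from the `ensureUnencodedGcsUrl` fallback above:

```js
// Hypothetical sketch only - the real src/utils/filenameUtils.js is not shown
// in this diff. It mirrors the [^\w\-\.] fallback used in ensureUnencodedGcsUrl.
export function sanitizeFilename(filename) {
  if (!filename) return filename;
  // Keep word chars, hyphens, and dots; replace everything else with '_'
  return String(filename).replace(/[^\w\-\.]/g, '_');
}
```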
```diff
@@ -583,31 +640,29 @@ async function saveToAzureStorage(context, encodedFilename, file) {
 }
 
 // Helper function to upload a file to Google Cloud Storage
-async function uploadToGCS(context, file, 
-  const 
+async function uploadToGCS(context, file, filename) {
+  const objectName = sanitizeFilename(filename);
+  const gcsFile = gcs.bucket(GCS_BUCKETNAME).file(objectName);
   const writeStream = gcsFile.createWriteStream({
     resumable: true,
     validation: false,
     metadata: {
-      contentType: mime.lookup(
+      contentType: mime.lookup(objectName) || 'application/octet-stream',
     },
     chunkSize: 8 * 1024 * 1024,
     numRetries: 3,
     retryDelay: 1000,
   });
-  context.log(`Uploading to GCS... ${
+  context.log(`Uploading to GCS... ${objectName}`);
   await pipeline(file, writeStream);
-
-  const gcsUrl = `gs://${GCS_BUCKETNAME}/${encodedFilename}`;
-  return gcsUrl;
+  return `gs://${GCS_BUCKETNAME}/${objectName}`;
 }
 
-//
+// Wrapper that checks if GCS is configured
 async function saveToGoogleStorage(context, encodedFilename, file) {
   if (!gcs) {
     throw new Error('Google Cloud Storage is not initialized');
   }
-
   return uploadToGCS(context, file, encodedFilename);
 }
 
```
```diff
@@ -816,7 +871,6 @@ async function cleanup(context, urls = null) {
           cleanedURLs.push(blob.name);
         } catch (error) {
           if (error.statusCode !== 404) {
-            // Ignore "not found" errors
             context.log(`Error cleaning blob ${blob.name}:`, error);
           }
         }
```
```diff
@@ -832,7 +886,6 @@ async function cleanup(context, urls = null) {
           cleanedURLs.push(blobName);
         } catch (error) {
           if (error.statusCode !== 404) {
-            // Ignore "not found" errors
             context.log(`Error cleaning blob ${url}:`, error);
           }
         }
```
```diff
@@ -842,13 +895,14 @@ async function cleanup(context, urls = null) {
 }
 
 async function cleanupGCS(urls = null) {
+  if (!gcs) return [];
   const bucket = gcs.bucket(GCS_BUCKETNAME);
   const directories = new Set();
   const cleanedURLs = [];
 
   if (!urls) {
     const daysN = 30;
-    const 
+    const threshold = Date.now() - daysN * 24 * 60 * 60 * 1000;
     const [files] = await bucket.getFiles();
 
     for (const file of files) {
```
```diff
@@ -856,33 +910,27 @@ async function cleanupGCS(urls = null) {
       const directoryPath = path.dirname(file.name);
       directories.add(directoryPath);
       if (metadata.updated) {
-        const updatedTime = new Date(metadata.updated);
-        if (updatedTime 
-          console.log(`Cleaning file: ${file.name}`);
+        const updatedTime = new Date(metadata.updated).getTime();
+        if (updatedTime < threshold) {
           await file.delete();
           cleanedURLs.push(file.name);
         }
       }
     }
   } else {
-
-
-
-
-
-
-
-      cleanedURLs.push(url);
-    }
-  } catch (error) {
-    console.error(`Error cleaning up files: ${error}`);
+    for (const url of urls) {
+      const filePath = url.split('/').slice(3).join('/');
+      const file = bucket.file(filePath);
+      const directoryPath = path.dirname(file.name);
+      directories.add(directoryPath);
+      await file.delete();
+      cleanedURLs.push(url);
     }
   }
 
   for (const directory of directories) {
     const [files] = await bucket.getFiles({ prefix: directory });
     if (files.length === 0) {
-      console.log(`Deleting empty directory: ${directory}`);
       await bucket.deleteFiles({ prefix: directory });
     }
   }
```
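In the rewritten `urls` branch, the object path is derived by dropping the `gs://<bucket>/` prefix. For example, with the bucket name from the env samples:

```js
// How filePath is derived from a gs:// URL in the branch above:
const url = 'gs://cortextempfiles/req-123/chunk_0.mp3';
// url.split('/') => ['gs:', '', 'cortextempfiles', 'req-123', 'chunk_0.mp3']
const filePath = url.split('/').slice(3).join('/'); // => 'req-123/chunk_0.mp3'
```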
```diff
@@ -891,47 +939,85 @@ async function cleanupGCS(urls = null) {
 }
 
 async function deleteGCS(blobName) {
-  if (!blobName) 
-  
+  if (!blobName) {
+    console.log('[deleteGCS] No blobName provided, skipping GCS deletion');
+    return;
+  }
 
-
-
-
+  if (!gcs) {
+    console.log('[deleteGCS] GCS not initialized, skipping deletion');
+    return;
+  }
 
+  try {
     if (process.env.STORAGE_EMULATOR_HOST) {
-
-
-
-
-
-
-
-
-
-
-
-
+      console.log(`[deleteGCS] Using emulator at ${process.env.STORAGE_EMULATOR_HOST}`);
+      console.log(`[deleteGCS] Attempting to delete files with prefix: ${blobName}`);
+
+      // List files first
+      const listUrl = `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${GCS_BUCKETNAME}/o?prefix=${blobName}`;
+      console.log(`[deleteGCS] Listing files with URL: ${listUrl}`);
+
+      const listResponse = await axios.get(listUrl, {
+        validateStatus: (status) => true,
+      });
+      console.log(`[deleteGCS] List response status: ${listResponse.status}`);
+      console.log(`[deleteGCS] List response data: ${JSON.stringify(listResponse.data)}`);
+
+      if (listResponse.status === 200 && listResponse.data.items) {
+        console.log(`[deleteGCS] Found ${listResponse.data.items.length} items to delete`);
+
+        // Delete each file
+        for (const item of listResponse.data.items) {
+          const deleteUrl = `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${GCS_BUCKETNAME}/o/${encodeURIComponent(item.name)}`;
+          console.log(`[deleteGCS] Deleting file: ${item.name}`);
+          console.log(`[deleteGCS] Delete URL: ${deleteUrl}`);
+
+          const deleteResponse = await axios.delete(deleteUrl, {
+            validateStatus: (status) => true,
+            headers: {
+              'Content-Type': 'application/json',
+            },
+          });
+          console.log(`[deleteGCS] Delete response status: ${deleteResponse.status}`);
+          console.log(`[deleteGCS] Delete response data: ${JSON.stringify(deleteResponse.data)}`);
         }
+        console.log('[deleteGCS] All files deleted successfully');
+      } else {
+        console.log('[deleteGCS] No files found to delete');
       }
     } else {
-
+      console.log('[deleteGCS] Using real GCS');
+      const bucket = gcs.bucket(GCS_BUCKETNAME);
       const [files] = await bucket.getFiles({ prefix: blobName });
-
-
-
+      console.log(`[deleteGCS] Found ${files.length} files to delete`);
+
+      if (files.length > 0) {
+        await Promise.all(files.map((file) => file.delete()));
+        console.log('[deleteGCS] All files deleted successfully');
+      } else {
+        console.log('[deleteGCS] No files found to delete');
       }
     }
-
-    if (deletedFiles.length > 0) {
-      console.log(`Cleaned GCS files: ${deletedFiles.join(', ')}`);
-    }
-    return deletedFiles;
   } catch (error) {
-
-
-
+    // If we get a 404 error, it means the file is already gone, which is fine
+    if (error.response?.status === 404 || error.code === 404) {
+      console.log('[deleteGCS] File not found in GCS (404) - this is expected if file was already deleted');
+      return;
     }
-
+    console.error('[deleteGCS] Error during deletion:', error);
+    console.error('[deleteGCS] Error details:', {
+      message: error.message,
+      code: error.code,
+      errors: error.errors,
+      response: error.response ? {
+        status: error.response.status,
+        statusText: error.response.statusText,
+        data: error.response.data,
+        headers: error.response.headers,
+      } : null,
+    });
+    // Don't throw the error - we want to continue with cleanup even if GCS deletion fails
   }
 }
 
```
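One detail worth noting in the emulator branch: `validateStatus: (status) => true` tells axios to resolve the promise for every HTTP status, so a missing object shows up as a logged 404 response rather than a thrown error. A standalone sketch of that option (the URL below is a placeholder, not taken from this diff):

```js
import axios from 'axios';

// validateStatus decides which statuses resolve rather than reject; always
// returning true means deleteGCS inspects/logs a 404 instead of catching it.
const res = await axios.delete('http://localhost:4443/storage/v1/b/b1/o/obj1', {
  validateStatus: () => true, // never reject based on HTTP status
});
console.log(res.status); // e.g. 404 if the object is already gone
```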
```diff
@@ -939,37 +1025,19 @@ async function deleteGCS(blobName) {
 async function ensureGCSUpload(context, existingFile) {
   if (!existingFile.gcs && gcs) {
     context.log('GCS file was missing - uploading.');
-
-
-
-    }
-    // Download the file from Azure/local storage
-    const response = await axios({
-      method: 'get',
-      url: existingFile.url,
-      responseType: 'stream',
-    });
-    // Upload the file stream to GCS
-    existingFile.gcs = await uploadToGCS(
-      context,
-      response.data,
-      encodedFilename,
-    );
+    const fileName = sanitizeFilename(path.basename(existingFile.url.split('?')[0]));
+    const response = await axios({ method: 'get', url: existingFile.url, responseType: 'stream' });
+    existingFile.gcs = await uploadToGCS(context, response.data, fileName);
   }
   return existingFile;
 }
 
-// Helper function to upload a chunk to GCS
 async function uploadChunkToGCS(chunkPath, requestId) {
   if (!gcs) return null;
-
-
-
-  }
-  const gcsFileName = `${requestId}/${baseName}`;
-  await gcs.bucket(GCS_BUCKETNAME).upload(chunkPath, {
-    destination: gcsFileName,
-  });
+  const dirName = requestId || uuidv4();
+  const baseName = sanitizeFilename(path.basename(chunkPath));
+  const gcsFileName = `${dirName}/${baseName}`;
+  await gcs.bucket(GCS_BUCKETNAME).upload(chunkPath, { destination: gcsFileName });
   return `gs://${GCS_BUCKETNAME}/${gcsFileName}`;
 }
 
```
@@ -985,4 +1053,4 @@ export {
|
|
|
985
1053
|
gcs,
|
|
986
1054
|
uploadChunkToGCS,
|
|
987
1055
|
downloadFromGCS,
|
|
988
|
-
};
|
|
1056
|
+
};
|