@aj-archipelago/cortex 1.3.51 → 1.3.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/helper-apps/cortex-file-handler/{.env.test.azure → .env.test.azure.sample} +2 -1
  2. package/helper-apps/cortex-file-handler/{.env.test.gcs → .env.test.gcs.sample} +2 -1
  3. package/helper-apps/cortex-file-handler/{.env.test → .env.test.sample} +2 -1
  4. package/helper-apps/cortex-file-handler/Dockerfile +1 -1
  5. package/helper-apps/cortex-file-handler/INTERFACE.md +178 -0
  6. package/helper-apps/cortex-file-handler/package.json +4 -3
  7. package/helper-apps/cortex-file-handler/scripts/test-azure.sh +3 -0
  8. package/helper-apps/cortex-file-handler/{blobHandler.js → src/blobHandler.js} +167 -99
  9. package/helper-apps/cortex-file-handler/{fileChunker.js → src/fileChunker.js} +11 -24
  10. package/helper-apps/cortex-file-handler/{index.js → src/index.js} +236 -256
  11. package/helper-apps/cortex-file-handler/{services → src/services}/ConversionService.js +39 -18
  12. package/helper-apps/cortex-file-handler/{services → src/services}/FileConversionService.js +7 -3
  13. package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +177 -0
  14. package/helper-apps/cortex-file-handler/src/services/storage/GCSStorageProvider.js +258 -0
  15. package/helper-apps/cortex-file-handler/src/services/storage/LocalStorageProvider.js +182 -0
  16. package/helper-apps/cortex-file-handler/src/services/storage/StorageFactory.js +86 -0
  17. package/helper-apps/cortex-file-handler/src/services/storage/StorageProvider.js +53 -0
  18. package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +259 -0
  19. package/helper-apps/cortex-file-handler/{start.js → src/start.js} +1 -1
  20. package/helper-apps/cortex-file-handler/src/utils/filenameUtils.js +28 -0
  21. package/helper-apps/cortex-file-handler/tests/FileConversionService.test.js +1 -1
  22. package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +4 -4
  23. package/helper-apps/cortex-file-handler/tests/conversionResilience.test.js +152 -0
  24. package/helper-apps/cortex-file-handler/tests/fileChunker.test.js +2 -28
  25. package/helper-apps/cortex-file-handler/tests/fileUpload.test.js +134 -23
  26. package/helper-apps/cortex-file-handler/tests/getOperations.test.js +307 -0
  27. package/helper-apps/cortex-file-handler/tests/postOperations.test.js +291 -0
  28. package/helper-apps/cortex-file-handler/tests/start.test.js +50 -14
  29. package/helper-apps/cortex-file-handler/tests/storage/AzureStorageProvider.test.js +120 -0
  30. package/helper-apps/cortex-file-handler/tests/storage/GCSStorageProvider.test.js +193 -0
  31. package/helper-apps/cortex-file-handler/tests/storage/LocalStorageProvider.test.js +148 -0
  32. package/helper-apps/cortex-file-handler/tests/storage/StorageFactory.test.js +100 -0
  33. package/helper-apps/cortex-file-handler/tests/storage/StorageService.test.js +113 -0
  34. package/helper-apps/cortex-file-handler/tests/testUtils.helper.js +73 -19
  35. package/lib/entityConstants.js +1 -1
  36. package/package.json +1 -1
  37. package/helper-apps/cortex-file-handler/{constants.js → src/constants.js} +0 -0
  38. package/helper-apps/cortex-file-handler/{docHelper.js → src/docHelper.js} +0 -0
  39. package/helper-apps/cortex-file-handler/{helper.js → src/helper.js} +0 -0
  40. package/helper-apps/cortex-file-handler/{localFileHandler.js → src/localFileHandler.js} +0 -0
  41. package/helper-apps/cortex-file-handler/{redis.js → src/redis.js} +0 -0
package/helper-apps/cortex-file-handler/{.env.test.azure → .env.test.azure.sample}
@@ -3,4 +3,5 @@ REDIS_CONNECTION_STRING=redis://default:redispw@localhost:32768
  AZURE_STORAGE_CONNECTION_STRING=UseDevelopmentStorage=true
  AZURE_STORAGE_CONTAINER_NAME=test-container
  NODE_ENV=test
- PORT=7072 # Different port for testing
+ PORT=7072 # Different port for testing
+ MARKITDOWN_CONVERT_URL= #cortex-markitdown url
package/helper-apps/cortex-file-handler/{.env.test.gcs → .env.test.gcs.sample}
@@ -6,4 +6,5 @@ GCS_BUCKETNAME=cortextempfiles
  AZURE_STORAGE_CONNECTION_STRING=UseDevelopmentStorage=true
  AZURE_STORAGE_CONTAINER_NAME=test-container
  NODE_ENV=test
- PORT=7072 # Different port for testing
+ PORT=7072 # Different port for testing
+ MARKITDOWN_CONVERT_URL= #cortex-markitdown url
package/helper-apps/cortex-file-handler/{.env.test → .env.test.sample}
@@ -4,4 +4,5 @@ REDIS_CONNECTION_STRING=redis://default:redispw@localhost:32768
  AZURE_STORAGE_CONTAINER_NAME=test-container
  #GCP_SERVICE_ACCOUNT_KEY={"type":"service_account","project_id":"test-project"}
  NODE_ENV=test
- PORT=7072 # Different port for testing
+ PORT=7072 # Different port for testing
+ MARKITDOWN_CONVERT_URL= #cortex-markitdown url
package/helper-apps/cortex-file-handler/Dockerfile
@@ -16,4 +16,4 @@ EXPOSE 7071

  # RUN npm run build

- CMD [ "node", "start.js" ]
+ CMD [ "npm", "start" ]
package/helper-apps/cortex-file-handler/INTERFACE.md
@@ -0,0 +1,178 @@
+ # Cortex File Handler Interface Documentation
+
+ ## Overview
+ The Cortex File Handler is a service that processes files through various operations including uploading, downloading, chunking, and document processing. It supports multiple storage backends (Azure Blob Storage, Google Cloud Storage, and Local File System).
+
+ ## Request Methods
+
+ ### POST
+ - **Purpose**: Upload a file
+ - **Content-Type**: `multipart/form-data`
+ - **Parameters**:
+   - `hash` (optional): Unique identifier for the file
+   - `requestId` (required): Unique identifier for the request
+   - File content must be included in the form data
+ - **Behavior**:
+   - Uploads file to primary storage (Azure or Local)
+   - If GCS is configured, also uploads to GCS
+   - If hash is provided, stores file metadata in Redis
+   - Returns upload result with file URLs
+ - **Response**: Object containing:
+   - `url`: Primary storage URL
+   - `gcs`: GCS URL (if GCS is configured)
+   - `hash`: Hash value (if provided)
+   - `message`: Success message
+   - `filename`: Original filename
+ - **Note**: The `save` parameter is not supported in POST requests. To convert and save a document as text, use GET with the `save` parameter.
+
+ ### GET
+ - **Purpose**: Process or retrieve files
+ - **Parameters** (can be in query string or request body):
+   - `uri` (required if not using fetch/load/restore): URL of the file to process
+     - Requires `requestId` parameter
+     - No Redis caching
+     - Direct processing based on file type
+   - `requestId` (required with `uri`): Unique identifier for the request
+   - `save` (optional): If true, saves document as text file
+     - When true, converts document to text and saves to primary storage only (Azure or Local)
+     - Does not save to GCS
+     - Original document is deleted from storage after text conversion
+   - `hash` (optional): Unique identifier for the file
+   - `checkHash` (optional): Check if hash exists
+   - `clearHash` (optional): Remove hash from storage
+   - `fetch`/`load`/`restore` (optional): URL to fetch remote file (these are aliases - any of the three parameters will trigger the same remote file processing behavior)
+     - Does not require `requestId`
+     - Uses Redis caching
+     - Downloads and validates file first
+     - Ensures correct file extension
+     - Truncates long filenames
+ - **Behavior**:
+   - For documents (PDF, DOC, etc.):
+     - If `save=true`:
+       - Converts document to text
+       - Saves text file to primary storage (Azure or Local)
+       - Deletes original document from storage
+       - Does not save to GCS
+       - Returns object with primary storage URL
+     - If `save=false`:
+       - Converts document to text
+       - Returns array of text chunks
+       - Does not persist any files
+   - For media files:
+     - Splits into chunks
+     - Uploads chunks to primary storage and GCS (if configured)
+     - Returns chunk information with offsets
+   - For remote files (`fetch`/`load`/`restore`):
+     - Downloads file from URL
+     - Processes based on file type
+     - Returns processed result
+     - Caches result in Redis using URL as key
+     - Updates Redis timestamp on subsequent requests
+     - Truncates filenames longer than 200 characters
+     - Ensures correct file extension based on content type
+
+ ### DELETE
+ - **Purpose**: Remove files from storage
+ - **Parameters** (can be in query string or request body):
+   - `requestId` (required): Unique identifier for the request
+ - **Behavior**:
+   - Deletes file from primary storage (Azure or Local)
+   - Deletes file from GCS if configured
+   - Returns deletion result
+ - **Response**: Array of deleted file URLs
+
+ ## Storage Configuration
+ - **Azure**: Enabled if `AZURE_STORAGE_CONNECTION_STRING` is set
+ - **GCS**: Enabled if `GCP_SERVICE_ACCOUNT_KEY_BASE64` or `GCP_SERVICE_ACCOUNT_KEY` is set
+ - **Local**: Used as fallback if Azure is not configured
+
+ ## Response Format
+ - **Success**:
+   - Status: 200
+   - Body: Varies by operation (see specific methods above)
+ - **Error**:
+   - Status: 400/404/500
+   - Body: Error message string
+
+ ## Progress Tracking
+ - Progress updates are published to Redis for each operation
+ - Progress includes:
+   - `progress`: Completion percentage (0-1)
+   - `completedCount`: Number of completed steps
+   - `totalCount`: Total number of steps
+   - `numberOfChunks`: Number of chunks (for media files)
+   - `data`: Additional operation data
+ - Progress updates are published to Redis channel associated with `requestId`
+
+ ## File Types
+ - **Documents**: Processed based on `DOC_EXTENSIONS` list
+   - Supported extensions:
+     - Text: .txt, .json, .csv, .md, .xml, .js, .html, .css
+     - Office: .doc, .docx, .xls, .xlsx
+   - Document processing limitations:
+     - PDFs: Does not support scanned, encrypted, or password-protected PDFs
+     - Requires OCR for PDFs without embedded fonts
+   - Text chunking:
+     - Maximum chunk size: 10,000 characters
+     - Chunks are split at sentence boundaries when possible
+     - Returns array of text chunks
+ - **Media**: All other file types, processed through chunking
+   - Chunked into smaller pieces for processing
+   - Each chunk is stored separately
+   - Media chunking behavior:
+     - Default chunk duration: 500 seconds
+     - Chunks are processed in parallel (3 at a time)
+     - Audio is converted to MP3 format (128kbps)
+     - Uses 4MB read buffer for file processing
+   - Supported media types:
+     - Images: .jpg, .jpeg, .png, .webp, .heic, .heif, .pdf
+     - Video: .mp4, .mpeg, .mov, .avi, .flv, .mpg, .webm, .wmv, .3gp
+     - Audio: .wav, .mp3, .aac, .ogg, .flac, .m4a
+   - File download behavior:
+     - 30 second timeout for downloads
+     - Supports streaming downloads
+     - Handles URL encoding/decoding
+     - Truncates filenames longer than 200 characters
+
+ ## Storage Behavior
+ - **Primary Storage** (Azure or Local):
+   - Files are stored with UUID-based names
+   - Organized by requestId folders
+   - Azure: Uses SAS tokens for access
+   - Local: Served via HTTP on configured port
+ - **GCS** (if configured):
+   - Files stored with gs:// protocol URLs
+   - Same folder structure as primary storage
+   - Only used for media file chunks
+ - **Redis**:
+   - Stores file metadata and URLs
+   - Used for caching remote file results
+   - Tracks file access timestamps
+   - Used for progress tracking
+
+ ## Cleanup
+ - Automatic cleanup of inactive files
+ - Removes files from:
+   - Primary storage (Azure/Local)
+   - GCS (if configured)
+   - Redis file store map
+ - Cleanup is triggered on each request but only runs if not already in progress
+ - Temporary files are cleaned up:
+   - After 1 hour of inactivity
+   - After successful processing
+   - On error conditions
+
+ ## Error Handling
+ - **400 Bad Request**:
+   - Missing required parameters
+   - Invalid or inaccessible URL
+   - Unsupported file type
+ - **404 Not Found**:
+   - File or hash not found
+   - File not found in storage
+ - **500 Internal Server Error**:
+   - Processing errors
+   - Storage errors
+   - Document conversion errors
+   - PDF processing errors (scanned, encrypted, password-protected)
+ All errors include descriptive message in response body
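For orientation, here is a minimal client-side sketch of the interface documented above. It is illustrative only: the endpoint URL, request IDs, hash, file paths, and the multipart field name `file` are placeholders and assumptions, and it assumes `axios` and `form-data` are available in the calling project.

```js
import fs from 'fs';
import axios from 'axios';
import FormData from 'form-data';

// Placeholder endpoint - substitute your deployment's file-handler URL.
const handlerUrl = 'http://localhost:7071/api/CortexFileHandler';

// POST: multipart upload with a required requestId and an optional hash.
const form = new FormData();
form.append('requestId', 'req-123');
form.append('hash', 'doc-abc'); // optional; metadata is kept in Redis under this key
form.append('file', fs.createReadStream('./report.docx')); // field name assumed
const { data: uploaded } = await axios.post(handlerUrl, form, {
  headers: form.getHeaders(),
});
// uploaded resembles { url, gcs?, hash?, message, filename }

// GET with uri + save=true: convert the document to text and persist only the text file.
const { data: saved } = await axios.get(handlerUrl, {
  params: { uri: uploaded.url, requestId: 'req-123', save: true },
});

// GET with fetch (aliases: load, restore): download, process, and cache a remote file.
const { data: fetched } = await axios.get(handlerUrl, {
  params: { fetch: 'https://example.com/interview.mp3' },
});

// DELETE: remove everything stored under a requestId from primary storage and GCS.
await axios.delete(handlerUrl, { params: { requestId: 'req-123' } });
```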
package/helper-apps/cortex-file-handler/package.json
@@ -1,11 +1,12 @@
  {
    "name": "@aj-archipelago/cortex-file-handler",
-   "version": "1.1.01",
+   "version": "2.0.02",
    "description": "File handling service for Cortex - handles file uploads, media chunking, and document processing",
    "type": "module",
+   "main": "src/index.js",
    "scripts": {
-     "start": "node start.js",
-     "dev": "node -r dotenv/config start.js",
+     "start": "node src/start.js",
+     "dev": "node -r dotenv/config src/start.js",
      "test": "DOTENV_CONFIG_PATH=.env.test NODE_ENV=test node -r dotenv/config node_modules/ava/entrypoints/cli.mjs",
      "test:azure": "DOTENV_CONFIG_PATH=.env.test.azure NODE_ENV=test ./scripts/test-azure.sh",
      "test:watch": "DOTENV_CONFIG_PATH=.env.test NODE_ENV=test node -r dotenv/config node_modules/ava/entrypoints/cli.mjs --watch",
package/helper-apps/cortex-file-handler/scripts/test-azure.sh
@@ -27,6 +27,9 @@ TEST_RESULT=$?
  echo "Cleaning up..."
  kill $AZURITE_PID

+ # Wait for Azurite to finish cleanup
+ sleep 2
+
  # Clean up Azurite directory
  rm -rf $AZURITE_DIR

package/helper-apps/cortex-file-handler/{blobHandler.js → src/blobHandler.js}
@@ -23,6 +23,7 @@ import { CONVERTED_EXTENSIONS } from './constants.js';
  import mime from 'mime-types';

  import os from 'os';
+ import { sanitizeFilename } from './utils/filenameUtils.js';

  import { FileConversionService } from './services/FileConversionService.js';

@@ -81,8 +82,15 @@ function ensureUnencodedGcsUrl(url) {
  }
  // Split into bucket and path parts
  const [bucket, ...pathParts] = url.replace('gs://', '').split('/');
- // Reconstruct URL with decoded path parts
- return `gs://${bucket}/${pathParts.map(part => decodeURIComponent(part)).join('/')}`;
+ // Reconstruct URL with decoded path parts, handling invalid characters
+ return `gs://${bucket}/${pathParts.map(part => {
+   try {
+     return decodeURIComponent(part);
+   } catch (error) {
+     // If decoding fails, sanitize the filename by removing invalid characters
+     return part.replace(/[^\w\-\.]/g, '_');
+   }
+ }).join('/')}`;
  }

  async function gcsUrlExists(url, defaultReturn = false) {
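The try/catch added above exists because `decodeURIComponent` throws a `URIError` on malformed percent-encoding. A self-contained sketch of the same fallback, with illustrative inputs:

```js
// Decode one GCS path segment, falling back to the same sanitization
// the diff uses when the segment is not valid percent-encoding.
function decodeGcsPart(part) {
  try {
    return decodeURIComponent(part);
  } catch {
    // Anything outside word characters, dots, and dashes becomes '_'.
    return part.replace(/[^\w\-\.]/g, '_');
  }
}

console.log(decodeGcsPart('my%20file.txt')); // "my file.txt" - valid encoding decodes normally
console.log(decodeGcsPart('report%ZZ.pdf')); // "report_ZZ.pdf" - '%ZZ' would throw, so '%' is replaced
```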
@@ -348,14 +356,11 @@ function uploadBlob(
      return;
    }
    tempFilePath = path.join(tempDir, safeFilename);
-   console.log('Temp dir:', tempDir, 'Original filename:', filename, 'Safe filename:', safeFilename, 'Temp file path:', tempFilePath);
-   console.log('About to create write stream for:', tempFilePath);
    try {
      diskWriteStream = fs.createWriteStream(tempFilePath, {
        highWaterMark: 1024 * 1024,
        autoClose: true,
      });
-     console.log('Write stream created successfully for:', tempFilePath);
    } catch (err) {
      console.error('Error creating write stream:', err, 'Temp dir exists:', fs.existsSync(tempDir));
      errorOccurred = true;
@@ -476,7 +481,58 @@ function uploadBlob(
    }
  }

- // Respond as soon as cloud uploads are done
+ // After original uploads, handle optional conversion
+ const conversionService = new FileConversionService(context, !saveToLocal);
+
+ if (conversionService.needsConversion(safeFilename)) {
+   try {
+     context.log('Starting file conversion (busboy)...');
+
+     // Ensure we have a local copy of the file for conversion
+     let localPathForConversion = tempFilePath;
+
+     if (!localPathForConversion) {
+       // No temp file was written (saveToLocal === false). Download from primary URL.
+       const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'convert-'));
+       localPathForConversion = path.join(tmpDir, safeFilename);
+       await conversionService._downloadFile(result.url, localPathForConversion);
+     } else {
+       // Wait until disk write completes to guarantee full file is present
+       if (diskWritePromise) {
+         await diskWritePromise;
+       }
+     }
+
+     // Perform the conversion
+     const conversion = await conversionService.convertFile(localPathForConversion, result.url);
+     context.log('File conversion completed (busboy):', conversion);
+
+     if (conversion.converted) {
+       context.log('Saving converted file (busboy)...');
+       // Save converted file to primary storage
+       const convertedSaveResult = await conversionService._saveConvertedFile(conversion.convertedPath, requestId);
+
+       // Optionally save to GCS
+       let convertedGcsUrl;
+       if (conversionService._isGCSConfigured()) {
+         convertedGcsUrl = await conversionService._uploadChunkToGCS(conversion.convertedPath, requestId);
+       }
+
+       // Attach to response body
+       result.converted = {
+         url: convertedSaveResult.url,
+         gcs: convertedGcsUrl,
+       };
+       context.log('Conversion process (busboy) completed successfully');
+     }
+   } catch (convErr) {
+     console.error('Error converting file (busboy):', convErr);
+     context.log('Error during conversion (busboy):', convErr.message);
+     // Continue without failing the upload
+   }
+ }
+
+ // Respond after conversion (if any)
  context.res = { status: 200, body: result };
  resolve(result);
  } catch (err) {
@@ -552,23 +608,24 @@ function uploadBlob(
  async function saveToLocalStorage(context, requestId, encodedFilename, file) {
    const localPath = join(publicFolder, requestId);
    fs.mkdirSync(localPath, { recursive: true });
-   const destinationPath = `${localPath}/${encodedFilename}`;
-   context.log(`Saving to local storage... ${destinationPath}`);
+
+   // Sanitize filename by removing invalid characters
+   const sanitizedFilename = sanitizeFilename(encodedFilename);
+   const destinationPath = `${localPath}/${sanitizedFilename}`;
+
    await pipeline(file, fs.createWriteStream(destinationPath));
-   return `http://${ipAddress}:${port}/files/${requestId}/${encodedFilename}`;
+   return `http://${ipAddress}:${port}/files/${requestId}/${sanitizedFilename}`;
  }

  // Helper function to handle Azure blob storage
  async function saveToAzureStorage(context, encodedFilename, file) {
    const { containerClient } = await getBlobClient();
    const contentType = mime.lookup(encodedFilename);
-
-   // Decode the filename if it's already encoded to prevent double-encoding
-   let blobName = encodedFilename;
-   if (isEncoded(blobName)) {
-     blobName = decodeURIComponent(blobName);
-   }
-
+
+   // Create a safe blob name that is URI-encoded once (no double encoding)
+   let blobName = sanitizeFilename(encodedFilename);
+   blobName = encodeURIComponent(blobName);
+
    const options = {
      blobHTTPHeaders: contentType ? { blobContentType: contentType } : {},
      maxConcurrency: 50,
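Both storage helpers now delegate filename cleanup to `sanitizeFilename` from the new `src/utils/filenameUtils.js`, whose body does not appear in this diff. A minimal sketch of what such a helper might look like, assuming it applies the same character policy as the GCS fallback above; this is a hypothetical illustration, not the actual implementation:

```js
// Hypothetical sketch only - the real helper lives in src/utils/filenameUtils.js
// and is not shown in this diff.
export function sanitizeFilename(filename) {
  let name = filename;
  try {
    // Decode once so an already-encoded name is not encoded a second time later.
    name = decodeURIComponent(filename);
  } catch {
    // Malformed encoding: fall through and sanitize the raw value.
  }
  // Replace anything outside word characters, dots, and dashes with '_'.
  return name.replace(/[^\w\-\.]/g, '_');
}
```

Whatever the real helper does, the intent visible in the diff is that `saveToAzureStorage` then calls `encodeURIComponent` exactly once on the sanitized name, which is what the "no double encoding" comment refers to.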
@@ -583,31 +640,29 @@ async function saveToAzureStorage(context, encodedFilename, file) {
  }

  // Helper function to upload a file to Google Cloud Storage
- async function uploadToGCS(context, file, encodedFilename) {
-   const gcsFile = gcs.bucket(GCS_BUCKETNAME).file(encodedFilename);
+ async function uploadToGCS(context, file, filename) {
+   const objectName = sanitizeFilename(filename);
+   const gcsFile = gcs.bucket(GCS_BUCKETNAME).file(objectName);
    const writeStream = gcsFile.createWriteStream({
      resumable: true,
      validation: false,
      metadata: {
-       contentType: mime.lookup(encodedFilename) || 'application/octet-stream',
+       contentType: mime.lookup(objectName) || 'application/octet-stream',
      },
      chunkSize: 8 * 1024 * 1024,
      numRetries: 3,
      retryDelay: 1000,
    });
-   context.log(`Uploading to GCS... ${encodedFilename}`);
+   context.log(`Uploading to GCS... ${objectName}`);
    await pipeline(file, writeStream);
-   // Never encode GCS URLs
-   const gcsUrl = `gs://${GCS_BUCKETNAME}/${encodedFilename}`;
-   return gcsUrl;
+   return `gs://${GCS_BUCKETNAME}/${objectName}`;
  }

- // Helper function to handle Google Cloud Storage
+ // Wrapper that checks if GCS is configured
  async function saveToGoogleStorage(context, encodedFilename, file) {
    if (!gcs) {
      throw new Error('Google Cloud Storage is not initialized');
    }
-
    return uploadToGCS(context, file, encodedFilename);
  }

@@ -816,7 +871,6 @@ async function cleanup(context, urls = null) {
        cleanedURLs.push(blob.name);
      } catch (error) {
        if (error.statusCode !== 404) {
-         // Ignore "not found" errors
          context.log(`Error cleaning blob ${blob.name}:`, error);
        }
      }
@@ -832,7 +886,6 @@ async function cleanup(context, urls = null) {
        cleanedURLs.push(blobName);
      } catch (error) {
        if (error.statusCode !== 404) {
-         // Ignore "not found" errors
          context.log(`Error cleaning blob ${url}:`, error);
        }
      }
@@ -842,13 +895,14 @@ async function cleanup(context, urls = null) {
  }

  async function cleanupGCS(urls = null) {
+   if (!gcs) return [];
    const bucket = gcs.bucket(GCS_BUCKETNAME);
    const directories = new Set();
    const cleanedURLs = [];

    if (!urls) {
      const daysN = 30;
-     const thirtyDaysAgo = new Date(Date.now() - daysN * 24 * 60 * 60 * 1000);
+     const threshold = Date.now() - daysN * 24 * 60 * 60 * 1000;
      const [files] = await bucket.getFiles();

      for (const file of files) {
@@ -856,33 +910,27 @@ async function cleanupGCS(urls = null) {
        const directoryPath = path.dirname(file.name);
        directories.add(directoryPath);
        if (metadata.updated) {
-         const updatedTime = new Date(metadata.updated);
-         if (updatedTime.getTime() < thirtyDaysAgo.getTime()) {
-           console.log(`Cleaning file: ${file.name}`);
+         const updatedTime = new Date(metadata.updated).getTime();
+         if (updatedTime < threshold) {
            await file.delete();
            cleanedURLs.push(file.name);
          }
        }
      }
    } else {
-     try {
-       for (const url of urls) {
-         const filename = path.join(url.split('/').slice(3).join('/'));
-         const file = bucket.file(filename);
-         const directoryPath = path.dirname(file.name);
-         directories.add(directoryPath);
-         await file.delete();
-         cleanedURLs.push(url);
-       }
-     } catch (error) {
-       console.error(`Error cleaning up files: ${error}`);
+     for (const url of urls) {
+       const filePath = url.split('/').slice(3).join('/');
+       const file = bucket.file(filePath);
+       const directoryPath = path.dirname(file.name);
+       directories.add(directoryPath);
+       await file.delete();
+       cleanedURLs.push(url);
      }
    }

    for (const directory of directories) {
      const [files] = await bucket.getFiles({ prefix: directory });
      if (files.length === 0) {
-       console.log(`Deleting empty directory: ${directory}`);
        await bucket.deleteFiles({ prefix: directory });
      }
    }
@@ -891,47 +939,85 @@ async function cleanupGCS(urls = null) {
  }

  async function deleteGCS(blobName) {
-   if (!blobName) throw new Error('Missing blobName parameter');
-   if (!gcs) throw new Error('Google Cloud Storage is not initialized');
+   if (!blobName) {
+     console.log('[deleteGCS] No blobName provided, skipping GCS deletion');
+     return;
+   }

-   try {
-     const bucket = gcs.bucket(GCS_BUCKETNAME);
-     const deletedFiles = [];
+   if (!gcs) {
+     console.log('[deleteGCS] GCS not initialized, skipping deletion');
+     return;
+   }

+   try {
      if (process.env.STORAGE_EMULATOR_HOST) {
-       // For fake GCS server, use HTTP API directly
-       const response = await axios.get(
-         `http://localhost:4443/storage/v1/b/${GCS_BUCKETNAME}/o`,
-         { params: { prefix: blobName } },
-       );
-       if (response.data.items) {
-         for (const item of response.data.items) {
-           await axios.delete(
-             `http://localhost:4443/storage/v1/b/${GCS_BUCKETNAME}/o/${encodeURIComponent(item.name)}`,
-             { validateStatus: (status) => status === 200 || status === 404 },
-           );
-           deletedFiles.push(item.name);
+       console.log(`[deleteGCS] Using emulator at ${process.env.STORAGE_EMULATOR_HOST}`);
+       console.log(`[deleteGCS] Attempting to delete files with prefix: ${blobName}`);
+
+       // List files first
+       const listUrl = `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${GCS_BUCKETNAME}/o?prefix=${blobName}`;
+       console.log(`[deleteGCS] Listing files with URL: ${listUrl}`);
+
+       const listResponse = await axios.get(listUrl, {
+         validateStatus: (status) => true,
+       });
+       console.log(`[deleteGCS] List response status: ${listResponse.status}`);
+       console.log(`[deleteGCS] List response data: ${JSON.stringify(listResponse.data)}`);
+
+       if (listResponse.status === 200 && listResponse.data.items) {
+         console.log(`[deleteGCS] Found ${listResponse.data.items.length} items to delete`);
+
+         // Delete each file
+         for (const item of listResponse.data.items) {
+           const deleteUrl = `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${GCS_BUCKETNAME}/o/${encodeURIComponent(item.name)}`;
+           console.log(`[deleteGCS] Deleting file: ${item.name}`);
+           console.log(`[deleteGCS] Delete URL: ${deleteUrl}`);
+
+           const deleteResponse = await axios.delete(deleteUrl, {
+             validateStatus: (status) => true,
+             headers: {
+               'Content-Type': 'application/json',
+             },
+           });
+           console.log(`[deleteGCS] Delete response status: ${deleteResponse.status}`);
+           console.log(`[deleteGCS] Delete response data: ${JSON.stringify(deleteResponse.data)}`);
          }
+         console.log('[deleteGCS] All files deleted successfully');
+       } else {
+         console.log('[deleteGCS] No files found to delete');
        }
      } else {
-       // For real GCS, use the SDK
+       console.log('[deleteGCS] Using real GCS');
+       const bucket = gcs.bucket(GCS_BUCKETNAME);
        const [files] = await bucket.getFiles({ prefix: blobName });
-       for (const file of files) {
-         await file.delete();
-         deletedFiles.push(file.name);
+       console.log(`[deleteGCS] Found ${files.length} files to delete`);
+
+       if (files.length > 0) {
+         await Promise.all(files.map((file) => file.delete()));
+         console.log('[deleteGCS] All files deleted successfully');
+       } else {
+         console.log('[deleteGCS] No files found to delete');
        }
      }
-
-     if (deletedFiles.length > 0) {
-       console.log(`Cleaned GCS files: ${deletedFiles.join(', ')}`);
-     }
-     return deletedFiles;
    } catch (error) {
-     if (error.code !== 404) {
-       console.error(`Error in deleteGCS: ${error}`);
-       throw error;
+     // If we get a 404 error, it means the file is already gone, which is fine
+     if (error.response?.status === 404 || error.code === 404) {
+       console.log('[deleteGCS] File not found in GCS (404) - this is expected if file was already deleted');
+       return;
      }
-     return [];
+     console.error('[deleteGCS] Error during deletion:', error);
+     console.error('[deleteGCS] Error details:', {
+       message: error.message,
+       code: error.code,
+       errors: error.errors,
+       response: error.response ? {
+         status: error.response.status,
+         statusText: error.response.statusText,
+         data: error.response.data,
+         headers: error.response.headers,
+       } : null,
+     });
+     // Don't throw the error - we want to continue with cleanup even if GCS deletion fails
    }
  }

@@ -939,37 +1025,19 @@ async function deleteGCS(blobName) {
  async function ensureGCSUpload(context, existingFile) {
    if (!existingFile.gcs && gcs) {
      context.log('GCS file was missing - uploading.');
-     let encodedFilename = path.basename(existingFile.url.split('?')[0]);
-     if (!isEncoded(encodedFilename)) {
-       encodedFilename = encodeURIComponent(encodedFilename);
-     }
-     // Download the file from Azure/local storage
-     const response = await axios({
-       method: 'get',
-       url: existingFile.url,
-       responseType: 'stream',
-     });
-     // Upload the file stream to GCS
-     existingFile.gcs = await uploadToGCS(
-       context,
-       response.data,
-       encodedFilename,
-     );
+     const fileName = sanitizeFilename(path.basename(existingFile.url.split('?')[0]));
+     const response = await axios({ method: 'get', url: existingFile.url, responseType: 'stream' });
+     existingFile.gcs = await uploadToGCS(context, response.data, fileName);
    }
    return existingFile;
  }

- // Helper function to upload a chunk to GCS
  async function uploadChunkToGCS(chunkPath, requestId) {
    if (!gcs) return null;
-   let baseName = path.basename(chunkPath);
-   if (!isEncoded(baseName)) {
-     baseName = encodeURIComponent(baseName);
-   }
-   const gcsFileName = `${requestId}/${baseName}`;
-   await gcs.bucket(GCS_BUCKETNAME).upload(chunkPath, {
-     destination: gcsFileName,
-   });
+   const dirName = requestId || uuidv4();
+   const baseName = sanitizeFilename(path.basename(chunkPath));
+   const gcsFileName = `${dirName}/${baseName}`;
+   await gcs.bucket(GCS_BUCKETNAME).upload(chunkPath, { destination: gcsFileName });
    return `gs://${GCS_BUCKETNAME}/${gcsFileName}`;
  }

@@ -985,4 +1053,4 @@ export {
    gcs,
    uploadChunkToGCS,
    downloadFromGCS,
- };
+ };