@aj-archipelago/cortex 1.3.50 → 1.3.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. package/config.js +1 -1
  2. package/helper-apps/cortex-browser/Dockerfile +19 -31
  3. package/helper-apps/cortex-browser/function_app.py +708 -181
  4. package/helper-apps/cortex-browser/requirements.txt +4 -4
  5. package/helper-apps/cortex-file-handler/{.env.test.azure → .env.test.azure.sample} +2 -1
  6. package/helper-apps/cortex-file-handler/{.env.test.gcs → .env.test.gcs.sample} +2 -1
  7. package/helper-apps/cortex-file-handler/{.env.test → .env.test.sample} +2 -1
  8. package/helper-apps/cortex-file-handler/Dockerfile +1 -1
  9. package/helper-apps/cortex-file-handler/INTERFACE.md +178 -0
  10. package/helper-apps/cortex-file-handler/function.json +2 -6
  11. package/helper-apps/cortex-file-handler/package-lock.json +6065 -5964
  12. package/helper-apps/cortex-file-handler/package.json +11 -6
  13. package/helper-apps/cortex-file-handler/scripts/setup-azure-container.js +12 -9
  14. package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +21 -18
  15. package/helper-apps/cortex-file-handler/scripts/test-azure.sh +4 -1
  16. package/helper-apps/cortex-file-handler/scripts/test-gcs.sh +1 -1
  17. package/helper-apps/cortex-file-handler/src/blobHandler.js +1056 -0
  18. package/helper-apps/cortex-file-handler/{constants.js → src/constants.js} +64 -48
  19. package/helper-apps/cortex-file-handler/src/docHelper.js +37 -0
  20. package/helper-apps/cortex-file-handler/{fileChunker.js → src/fileChunker.js} +97 -65
  21. package/helper-apps/cortex-file-handler/{helper.js → src/helper.js} +34 -25
  22. package/helper-apps/cortex-file-handler/src/index.js +608 -0
  23. package/helper-apps/cortex-file-handler/src/localFileHandler.js +107 -0
  24. package/helper-apps/cortex-file-handler/{redis.js → src/redis.js} +23 -17
  25. package/helper-apps/cortex-file-handler/src/services/ConversionService.js +309 -0
  26. package/helper-apps/cortex-file-handler/src/services/FileConversionService.js +57 -0
  27. package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +177 -0
  28. package/helper-apps/cortex-file-handler/src/services/storage/GCSStorageProvider.js +258 -0
  29. package/helper-apps/cortex-file-handler/src/services/storage/LocalStorageProvider.js +182 -0
  30. package/helper-apps/cortex-file-handler/src/services/storage/StorageFactory.js +86 -0
  31. package/helper-apps/cortex-file-handler/src/services/storage/StorageProvider.js +53 -0
  32. package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +259 -0
  33. package/helper-apps/cortex-file-handler/src/start.js +88 -0
  34. package/helper-apps/cortex-file-handler/src/utils/filenameUtils.js +28 -0
  35. package/helper-apps/cortex-file-handler/tests/FileConversionService.test.js +144 -0
  36. package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +90 -66
  37. package/helper-apps/cortex-file-handler/tests/conversionResilience.test.js +152 -0
  38. package/helper-apps/cortex-file-handler/tests/fileChunker.test.js +105 -108
  39. package/helper-apps/cortex-file-handler/tests/fileUpload.test.js +462 -0
  40. package/helper-apps/cortex-file-handler/tests/files/DOCX_TestPage.docx +0 -0
  41. package/helper-apps/cortex-file-handler/tests/files/tests-example.xls +0 -0
  42. package/helper-apps/cortex-file-handler/tests/getOperations.test.js +307 -0
  43. package/helper-apps/cortex-file-handler/tests/postOperations.test.js +291 -0
  44. package/helper-apps/cortex-file-handler/tests/start.test.js +984 -647
  45. package/helper-apps/cortex-file-handler/tests/storage/AzureStorageProvider.test.js +120 -0
  46. package/helper-apps/cortex-file-handler/tests/storage/GCSStorageProvider.test.js +193 -0
  47. package/helper-apps/cortex-file-handler/tests/storage/LocalStorageProvider.test.js +148 -0
  48. package/helper-apps/cortex-file-handler/tests/storage/StorageFactory.test.js +100 -0
  49. package/helper-apps/cortex-file-handler/tests/storage/StorageService.test.js +113 -0
  50. package/helper-apps/cortex-file-handler/tests/testUtils.helper.js +85 -0
  51. package/helper-apps/cortex-markitdown/.funcignore +1 -0
  52. package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/__init__.py +64 -0
  53. package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/function.json +21 -0
  54. package/helper-apps/cortex-markitdown/README.md +94 -0
  55. package/helper-apps/cortex-markitdown/host.json +15 -0
  56. package/helper-apps/cortex-markitdown/requirements.txt +2 -0
  57. package/lib/entityConstants.js +1 -1
  58. package/lib/requestExecutor.js +44 -36
  59. package/package.json +1 -1
  60. package/pathways/system/entity/tools/sys_tool_readfile.js +24 -2
  61. package/server/plugins/openAiWhisperPlugin.js +59 -87
  62. package/helper-apps/cortex-file-handler/blobHandler.js +0 -567
  63. package/helper-apps/cortex-file-handler/docHelper.js +0 -144
  64. package/helper-apps/cortex-file-handler/index.js +0 -440
  65. package/helper-apps/cortex-file-handler/localFileHandler.js +0 -108
  66. package/helper-apps/cortex-file-handler/start.js +0 -63
  67. package/helper-apps/cortex-file-handler/tests/docHelper.test.js +0 -148
@@ -0,0 +1,1056 @@
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+ import { join } from 'path';
4
+ import { PassThrough } from 'stream';
5
+ import { pipeline as _pipeline } from 'stream';
6
+ import { promisify } from 'util';
7
+
8
+ import {
9
+ generateBlobSASQueryParameters,
10
+ StorageSharedKeyCredential,
11
+ BlobServiceClient,
12
+ } from '@azure/storage-blob';
13
+ import { Storage } from '@google-cloud/storage';
14
+ import axios from 'axios';
15
+ import Busboy from 'busboy';
16
+ import { v4 as uuidv4 } from 'uuid';
17
+ const pipeline = promisify(_pipeline);
18
+
19
+ import { publicFolder, port, ipAddress } from './start.js';
20
+ import { CONVERTED_EXTENSIONS } from './constants.js';
21
+
22
+ // eslint-disable-next-line import/no-extraneous-dependencies
23
+ import mime from 'mime-types';
24
+
25
+ import os from 'os';
26
+ import { sanitizeFilename } from './utils/filenameUtils.js';
27
+
28
+ import { FileConversionService } from './services/FileConversionService.js';
29
+
30
/**
 * Returns true when `str` is canonical base64, i.e. it survives a
 * decode/re-encode round trip unchanged. `atob()` throws on input that is
 * not valid base64; that is caught and reported as `false`.
 * @param {string} str - candidate base64 string
 * @returns {boolean} true if `str` is valid, canonical base64
 */
function isBase64(str) {
  try {
    // Strict equality: the round trip must reproduce the input exactly.
    return btoa(atob(str)) === str;
  } catch (err) {
    return false;
  }
}
37
+
38
// Lifetime (in days) of SAS tokens issued for Azure blobs; env-overridable.
const { SAS_TOKEN_LIFE_DAYS = 30 } = process.env;

// The GCS service-account key may be supplied base64-encoded or as raw JSON;
// fall back to '{}' so the JSON.parse below never throws at module load.
const GCP_SERVICE_ACCOUNT_KEY =
  process.env.GCP_SERVICE_ACCOUNT_KEY_BASE64 ||
  process.env.GCP_SERVICE_ACCOUNT_KEY ||
  '{}';
const GCP_SERVICE_ACCOUNT = isBase64(GCP_SERVICE_ACCOUNT_KEY)
  ? JSON.parse(Buffer.from(GCP_SERVICE_ACCOUNT_KEY, 'base64').toString())
  : JSON.parse(GCP_SERVICE_ACCOUNT_KEY);
const { project_id: GCP_PROJECT_ID } = GCP_SERVICE_ACCOUNT;

// Module-wide GCS client. Stays undefined when credentials are absent or
// invalid; every GCS helper below guards on this.
let gcs;
if (!GCP_PROJECT_ID) {
  // GCP_SERVICE_ACCOUNT is always a (possibly empty) parsed object, so the
  // presence of project_id is the only meaningful credential test here; the
  // former `|| !GCP_SERVICE_ACCOUNT` condition could never be true.
  console.warn(
    'No Google Cloud Storage credentials provided - GCS will not be used',
  );
} else {
  try {
    gcs = new Storage({
      projectId: GCP_PROJECT_ID,
      credentials: GCP_SERVICE_ACCOUNT,
    });
  } catch (error) {
    console.error(
      'Google Cloud Storage credentials are invalid - GCS will not be used: ',
      error,
    );
  }
}
68
+
69
// Azure blob container used for uploads; overridable via environment.
export const AZURE_STORAGE_CONTAINER_NAME =
  process.env.AZURE_STORAGE_CONTAINER_NAME || 'whispertempfiles';
// Default GCS bucket used for uploads; overridable via environment.
export const GCS_BUCKETNAME = process.env.GCS_BUCKETNAME || 'cortextempfiles';
72
+
73
// Returns true when the string already contains at least one percent-encoded
// byte (e.g. "%20"), meaning it appears to be URI-encoded.
function isEncoded(str) {
  const percentEscape = /%[0-9A-Fa-f]{2}/;
  return percentEscape.test(str);
}
77
+
78
/**
 * Ensures a gs:// URL carries a decoded (unencoded) object path.
 * Non-GCS URLs, null and undefined pass through untouched. Path segments
 * that cannot be percent-decoded are sanitized by replacing every
 * non-word character (other than '-' and '.') with '_'.
 * @param {string} url - candidate gs:// URL
 * @returns {string} the URL with its path segments decoded
 */
function ensureUnencodedGcsUrl(url) {
  if (!url || !url.startsWith('gs://')) {
    return url;
  }
  // First segment is the bucket; the rest form the object path.
  const [bucket, ...segments] = url.replace('gs://', '').split('/');
  const decodedSegments = segments.map((segment) => {
    try {
      return decodeURIComponent(segment);
    } catch (error) {
      // Undecodable segment (e.g. stray '%'): scrub invalid characters.
      return segment.replace(/[^\w\-\.]/g, '_');
    }
  });
  return `gs://${bucket}/${decodedSegments.join('/')}`;
}
95
+
96
/**
 * Checks whether a gs:// URL refers to an existing object.
 * Returns `defaultReturn` when the URL is missing or no GCS client is
 * configured, and false on any lookup error. When STORAGE_EMULATOR_HOST is
 * set, the check goes through the emulator's JSON API instead of the client.
 * @param {string} url - gs://bucket/object URL to check
 * @param {boolean} [defaultReturn=false] - value when the check is impossible
 * @returns {Promise<boolean>}
 */
async function gcsUrlExists(url, defaultReturn = false) {
  try {
    if (!url || !gcs) {
      return defaultReturn; // Cannot check return
    }

    // Work on the decoded form of the URL.
    const unencodedUrl = ensureUnencodedGcsUrl(url);
    const [bucketName, ...objectParts] = unencodedUrl
      .replace('gs://', '')
      .split('/');
    const fileName = objectParts.join('/');

    if (process.env.STORAGE_EMULATOR_HOST) {
      try {
        const objectUrl = `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${bucketName}/o/${encodeURIComponent(fileName)}`;
        const response = await axios.get(objectUrl, {
          validateStatus: (status) => status === 200 || status === 404,
        });
        return response.status === 200;
      } catch (error) {
        console.error('Error checking emulator file:', error);
        return false;
      }
    }

    // Real GCS: ask the client library.
    const [exists] = await gcs.bucket(bucketName).file(fileName).exists();
    return exists;
  } catch (error) {
    console.error('Error checking if GCS URL exists:', error);
    return false;
  }
}
132
+
133
/**
 * Downloads a file from Google Cloud Storage to a local file
 * @param {string} gcsUrl - The GCS URL in format gs://bucket-name/file-path
 * @param {string} destinationPath - The local path where the file should be saved
 * @returns {Promise<void>}
 */
async function downloadFromGCS(gcsUrl, destinationPath) {
  if (!gcsUrl || !gcs) {
    throw new Error('Invalid GCS URL or GCS client not initialized');
  }

  const [bucketName, ...objectParts] = gcsUrl.replace('gs://', '').split('/');
  const fileName = objectParts.join('/');

  if (process.env.STORAGE_EMULATOR_HOST) {
    // Emulator: fetch the object bytes over HTTP and stream them to disk.
    const mediaUrl = `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${bucketName}/o/${encodeURIComponent(fileName)}?alt=media`;
    const response = await axios({
      method: 'GET',
      url: mediaUrl,
      responseType: 'stream'
    });

    const writer = fs.createWriteStream(destinationPath);
    await new Promise((resolve, reject) => {
      response.data.pipe(writer);
      writer.on('finish', resolve);
      writer.on('error', reject);
    });
    return;
  }

  // Real GCS: the client library handles the download directly.
  await gcs
    .bucket(bucketName)
    .file(fileName)
    .download({ destination: destinationPath });
}
170
+
171
/**
 * Builds an Azure BlobServiceClient / ContainerClient pair from the
 * AZURE_STORAGE_CONNECTION_STRING environment variable, ensuring the
 * service has a defaultServiceVersion set before returning.
 * @returns {Promise<{blobServiceClient: object, containerClient: object}>}
 * @throws {Error} when the connection string is not configured
 */
export const getBlobClient = async () => {
  const connectionString = process.env.AZURE_STORAGE_CONNECTION_STRING;
  const containerName = AZURE_STORAGE_CONTAINER_NAME;
  // AZURE_STORAGE_CONTAINER_NAME always has a default, so only the
  // connection string can actually be missing (the old `|| !containerName`
  // branch was unreachable).
  if (!connectionString) {
    throw new Error(
      'Missing Azure Storage connection string or container name environment variable',
    );
  }

  const blobServiceClient =
    BlobServiceClient.fromConnectionString(connectionString);

  // Some SAS scenarios require an explicit default service version; set one
  // if the account does not have it yet.
  const serviceProperties = await blobServiceClient.getProperties();
  if (!serviceProperties.defaultServiceVersion) {
    serviceProperties.defaultServiceVersion = '2020-02-10';
    await blobServiceClient.setProperties(serviceProperties);
  }

  const containerClient = blobServiceClient.getContainerClient(containerName);

  return { blobServiceClient, containerClient };
};
193
+
194
/**
 * Uploads a single local file to Azure blob storage under
 * `<requestId>/<uuid>_<basename>` and returns a SAS-protected URL.
 * @param {string} chunkPath - local path of the file to upload
 * @param {string} requestId - request id used as the blob name prefix
 * @returns {Promise<{url: string, blobName: string}>}
 */
async function saveFileToBlob(chunkPath, requestId) {
  const { containerClient } = await getBlobClient();

  // Derive the blob name from the file name: drop any query string and
  // URI-encode exactly once (skip if it already looks encoded).
  let baseName = path.basename(chunkPath).split('?')[0];
  if (!isEncoded(baseName)) {
    baseName = encodeURIComponent(baseName);
  }
  const blobName = `${requestId}/${uuidv4()}_${baseName}`;

  // Stream the file up rather than buffering it in memory.
  const blockBlobClient = containerClient.getBlockBlobClient(blobName);
  await blockBlobClient.uploadStream(fs.createReadStream(chunkPath));

  // A read-only SAS token makes the blob fetchable by callers.
  const sasToken = generateSASToken(containerClient, blobName);

  return {
    url: `${blockBlobClient.url}?${sasToken}`,
    blobName: blobName
  };
}
222
+
223
/**
 * Generates a read-only SAS token for a blob, signed with the container
 * client's shared-key credential.
 * @param {object} containerClient - client whose credential signs the token
 * @param {string} blobName - blob the token grants read access to
 * @param {number} [expiryTimeSeconds] - token lifetime in seconds;
 *   defaults to SAS_TOKEN_LIFE_DAYS days
 * @returns {string} the SAS token query string (without a leading '?')
 */
const generateSASToken = (
  containerClient,
  blobName,
  // SAS_TOKEN_LIFE_DAYS comes from the environment; always parse with an
  // explicit radix.
  expiryTimeSeconds = Number.parseInt(SAS_TOKEN_LIFE_DAYS, 10) * 24 * 60 * 60,
) => {
  const { accountName, accountKey } = containerClient.credential;
  const sharedKeyCredential = new StorageSharedKeyCredential(
    accountName,
    accountKey,
  );

  const sasOptions = {
    containerName: containerClient.containerName,
    blobName: blobName,
    permissions: 'r', // Read permission
    startsOn: new Date(),
    expiresOn: new Date(new Date().valueOf() + expiryTimeSeconds * 1000),
  };

  const sasToken = generateBlobSASQueryParameters(
    sasOptions,
    sharedKeyCredential,
  ).toString();
  return sasToken;
};
248
+
249
/**
 * Deletes every blob belonging to a request: blobs named `<requestId>_...`
 * (flat layout) or stored under the `<requestId>/` prefix (folder layout).
 * @param {string} requestId - request whose blobs should be removed
 * @returns {Promise<string[]>} names of the blobs that were deleted
 * @throws {Error} when requestId is missing
 */
async function deleteBlob(requestId) {
  if (!requestId) throw new Error('Missing requestId parameter');
  const { containerClient } = await getBlobClient();

  const flatPrefix = `${requestId}_`;
  const folderPrefix = `${requestId}/`;
  const deleted = [];

  // Walk every blob in the container and remove the ones owned by this request.
  for await (const blob of containerClient.listBlobsFlat()) {
    const ownedByRequest =
      blob.name.startsWith(flatPrefix) || blob.name.startsWith(folderPrefix);
    if (!ownedByRequest) continue;

    await containerClient.getBlockBlobClient(blob.name).delete();
    console.log(`Cleaned blob: ${blob.name}`);
    deleted.push(blob.name);
  }

  return deleted;
}
275
+
276
/**
 * Handles an upload request and stores the payload in the configured storage
 * backends (Azure blob storage or the local public folder as primary, plus
 * GCS when a client is configured).
 *
 * Two modes:
 *  - `filePath` given: the file is already on disk and handling is delegated
 *    to `uploadFile` (no multipart parsing).
 *  - otherwise: the request is parsed as multipart/form-data with Busboy;
 *    `requestId` and `hash` may arrive as form fields ahead of the file part.
 *
 * @param {object} context - function context (context.log, context.res)
 * @param {object} req - incoming HTTP request (readable stream with headers)
 * @param {boolean} [saveToLocal=false] - save primary copy to the local
 *   public folder instead of Azure (GCS upload still happens when configured)
 * @param {string|null} [filePath=null] - path of an already-local file
 * @param {string|null} [hash=null] - optional content hash echoed in result
 * @returns {Promise<object>} result ({ message, filename, url, gcs?, hash?, converted? })
 */
function uploadBlob(
  context,
  req,
  saveToLocal = false,
  filePath = null,
  hash = null,
) {
  return new Promise((resolve, reject) => {
    // Async IIFE so we can use await while controlling resolve/reject manually.
    (async () => {
      try {
        let requestId = uuidv4();
        const body = {};

        // If filePath is given, we are dealing with local file and not form-data
        if (filePath) {
          const file = fs.createReadStream(filePath);
          const filename = path.basename(filePath);
          try {
            const result = await uploadFile(
              context,
              requestId,
              body,
              saveToLocal,
              file,
              filename,
              resolve,
              hash,
            );
            // NOTE(review): uploadFile also calls resolve() itself; this
            // second resolve is a no-op on an already-settled promise.
            resolve(result);
          } catch (error) {
            // NOTE(review): the original error is discarded here — only a
            // generic 500 reaches the caller. Consider attaching `cause`.
            const err = new Error('Error processing file upload.');
            err.status = 500;
            throw err;
          }
        } else {
          // Otherwise, continue working with form-data
          const busboy = Busboy({ headers: req.headers });
          let hasFile = false;
          let errorOccurred = false;

          // Form fields may carry an explicit requestId/hash for this upload.
          busboy.on('field', (fieldname, value) => {
            if (fieldname === 'requestId') {
              requestId = value;
            } else if (fieldname === 'hash') {
              hash = value;
            }
          });

          busboy.on('file', async (fieldname, file, info) => {
            if (errorOccurred) return;
            hasFile = true;

            // Validate file
            if (!info.filename || info.filename.trim() === '') {
              errorOccurred = true;
              const err = new Error('Invalid file: missing filename');
              err.status = 400;
              reject(err);
              return;
            }

            // Prepare for streaming to cloud destinations
            const filename = info.filename;
            const safeFilename = path.basename(filename); // Sanitize filename
            const uploadName = `${requestId || uuidv4()}_${safeFilename}`;
            // Tee streams: one per cloud destination that will consume them.
            const azureStream = !saveToLocal ? new PassThrough() : null;
            const gcsStream = gcs ? new PassThrough() : null;
            let diskWriteStream, tempDir, tempFilePath;
            let diskWritePromise;
            // NOTE(review): diskWriteError and cloudUploadError are assigned
            // but never read afterwards — TODO confirm whether intentional.
            let diskWriteError = null;
            let cloudUploadError = null;

            // Start local disk write in parallel (non-blocking for response)
            if (saveToLocal) {
              try {
                tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'upload-'));
              } catch (err) {
                console.error('Error creating tempDir:', err);
                errorOccurred = true;
                reject(err);
                return;
              }
              tempFilePath = path.join(tempDir, safeFilename);
              try {
                diskWriteStream = fs.createWriteStream(tempFilePath, {
                  highWaterMark: 1024 * 1024,
                  autoClose: true,
                });
              } catch (err) {
                console.error('Error creating write stream:', err, 'Temp dir exists:', fs.existsSync(tempDir));
                errorOccurred = true;
                reject(err);
                return;
              }
              diskWriteStream.on('error', (err) => {
                console.error('Disk write stream error:', err);
              });
              diskWriteStream.on('close', () => {
                console.log('Disk write stream closed for:', tempFilePath);
              });
              // Resolves when the temp file is fully flushed to disk.
              diskWritePromise = new Promise((res, rej) => {
                diskWriteStream.on('finish', res);
                diskWriteStream.on('error', (err) => {
                  diskWriteError = err;
                  rej(err);
                });
              });
            }

            // Pipe incoming file to all destinations
            let receivedAnyData = false;
            file.on('data', () => { receivedAnyData = true; });
            if (azureStream) file.pipe(azureStream);
            if (gcsStream) file.pipe(gcsStream);
            if (diskWriteStream) file.pipe(diskWriteStream);

            // Listen for end event to check for empty file
            file.on('end', async () => {
              if (!receivedAnyData) {
                errorOccurred = true;
                // Abort all streams
                if (azureStream) azureStream.destroy();
                if (gcsStream) gcsStream.destroy();
                if (diskWriteStream) diskWriteStream.destroy();
                const err = new Error('Invalid file: file is empty');
                err.status = 400;
                reject(err);
              }
            });

            // Start cloud uploads immediately
            let azurePromise;
            if (!saveToLocal) {
              azurePromise = saveToAzureStorage(context, uploadName, azureStream)
                .catch(async (err) => {
                  cloudUploadError = err;
                  // Fallback: try from disk if available
                  if (diskWritePromise) {
                    await diskWritePromise;
                    const diskStream = fs.createReadStream(tempFilePath, {
                      highWaterMark: 1024 * 1024,
                      autoClose: true,
                    });
                    return saveToAzureStorage(context, uploadName, diskStream);
                  }
                  throw err;
                });
            }
            let gcsPromise;
            if (gcsStream) {
              // Same stream-first, disk-fallback strategy for GCS.
              gcsPromise = saveToGoogleStorage(context, uploadName, gcsStream)
                .catch(async (err) => {
                  cloudUploadError = err;
                  if (diskWritePromise) {
                    await diskWritePromise;
                    const diskStream = fs.createReadStream(tempFilePath, {
                      highWaterMark: 1024 * 1024,
                      autoClose: true,
                    });
                    return saveToGoogleStorage(context, uploadName, diskStream);
                  }
                  throw err;
                });
            }

            // Wait for cloud uploads to finish
            try {
              const results = await Promise.all([
                azurePromise ? azurePromise.then((url) => ({ url, type: 'primary' })) : null,
                (!azurePromise && saveToLocal)
                  ? Promise.resolve({ url: null, type: 'primary-local' }) // placeholder for local, url handled later
                  : null,
                gcsPromise ? gcsPromise.then((gcs) => ({ gcs, type: 'gcs' })) : null,
              ].filter(Boolean));

              // Fold the per-destination results into one response object.
              const result = {
                message: `File '${uploadName}' uploaded successfully.`,
                filename: uploadName,
                ...results.reduce((acc, result) => {
                  if (result.type === 'primary') acc.url = result.url;
                  if (result.type === 'gcs') acc.gcs = ensureUnencodedGcsUrl(result.gcs);
                  return acc;
                }, {}),
              };
              if (hash) result.hash = hash;

              // If saving locally, wait for disk write to finish and then move to public folder
              if (saveToLocal) {
                try {
                  if (diskWritePromise) {
                    await diskWritePromise; // ensure file fully written
                  }
                  const localUrl = await saveToLocalStorage(
                    context,
                    requestId,
                    uploadName,
                    fs.createReadStream(tempFilePath, {
                      highWaterMark: 1024 * 1024,
                      autoClose: true,
                    }),
                  );
                  result.url = localUrl;
                } catch (err) {
                  console.error('Error saving to local storage:', err);
                  throw err;
                }
              }

              // After original uploads, handle optional conversion
              const conversionService = new FileConversionService(context, !saveToLocal);

              if (conversionService.needsConversion(safeFilename)) {
                try {
                  context.log('Starting file conversion (busboy)...');

                  // Ensure we have a local copy of the file for conversion
                  let localPathForConversion = tempFilePath;

                  if (!localPathForConversion) {
                    // No temp file was written (saveToLocal === false). Download from primary URL.
                    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'convert-'));
                    localPathForConversion = path.join(tmpDir, safeFilename);
                    await conversionService._downloadFile(result.url, localPathForConversion);
                  } else {
                    // Wait until disk write completes to guarantee full file is present
                    if (diskWritePromise) {
                      await diskWritePromise;
                    }
                  }

                  // Perform the conversion
                  const conversion = await conversionService.convertFile(localPathForConversion, result.url);
                  context.log('File conversion completed (busboy):', conversion);

                  if (conversion.converted) {
                    context.log('Saving converted file (busboy)...');
                    // Save converted file to primary storage
                    const convertedSaveResult = await conversionService._saveConvertedFile(conversion.convertedPath, requestId);

                    // Optionally save to GCS
                    let convertedGcsUrl;
                    if (conversionService._isGCSConfigured()) {
                      convertedGcsUrl = await conversionService._uploadChunkToGCS(conversion.convertedPath, requestId);
                    }

                    // Attach to response body
                    result.converted = {
                      url: convertedSaveResult.url,
                      gcs: convertedGcsUrl,
                    };
                    context.log('Conversion process (busboy) completed successfully');
                  }
                } catch (convErr) {
                  // Conversion is best-effort: log and keep the upload result.
                  console.error('Error converting file (busboy):', convErr);
                  context.log('Error during conversion (busboy):', convErr.message);
                  // Continue without failing the upload
                }
              }

              // Respond after conversion (if any)
              context.res = { status: 200, body: result };
              resolve(result);
            } catch (err) {
              errorOccurred = true;
              reject(err);
            } finally {
              // Clean up temp file if written
              if (tempDir) {
                fs.rmSync(tempDir, { recursive: true, force: true });
              }
            }
          });

          // NOTE(review): every busboy parse error is collapsed into
          // 'No file provided in request' — the underlying cause is lost.
          busboy.on('error', (error) => {
            if (errorOccurred) return;
            errorOccurred = true;
            const err = new Error('No file provided in request');
            err.status = 400;
            reject(err);
          });

          busboy.on('finish', () => {
            if (errorOccurred) return;
            if (!hasFile) {
              errorOccurred = true;
              const err = new Error('No file provided in request');
              err.status = 400;
              reject(err);
            }
          });

          // Handle errors from piping the request
          req.on('error', (error) => {
            if (errorOccurred) return;
            errorOccurred = true;
            // Only log unexpected errors
            if (error.message !== 'No file provided in request') {
              context.log('Error in request stream:', error);
            }
            const err = new Error('No file provided in request');
            err.status = 400;
            reject(err);
          });

          try {
            req.pipe(busboy);
          } catch (error) {
            if (errorOccurred) return;
            errorOccurred = true;
            // Only log unexpected errors
            if (error.message !== 'No file provided in request') {
              context.log('Error piping request to busboy:', error);
            }
            const err = new Error('No file provided in request');
            err.status = 400;
            reject(err);
          }
        }
      } catch (error) {
        // Only log unexpected errors
        if (error.message !== 'No file provided in request') {
          context.log('Error processing file upload:', error);
        }
        const err = new Error(error.message || 'Error processing file upload.');
        err.status = error.status || 500;
        reject(err);
      }
    })();
  });
}
606
+
607
// Helper function to handle local file storage: streams `file` into
// <publicFolder>/<requestId>/<sanitized name> and returns its public URL.
async function saveToLocalStorage(context, requestId, encodedFilename, file) {
  const requestFolder = join(publicFolder, requestId);
  fs.mkdirSync(requestFolder, { recursive: true });

  // Strip invalid characters before using the name on the filesystem.
  const sanitizedFilename = sanitizeFilename(encodedFilename);
  const destinationPath = `${requestFolder}/${sanitizedFilename}`;

  await pipeline(file, fs.createWriteStream(destinationPath));
  return `http://${ipAddress}:${port}/files/${requestId}/${sanitizedFilename}`;
}
619
+
620
// Helper function to handle Azure blob storage: streams `file` into a blob
// named after the sanitized, URI-encoded filename and returns a SAS URL.
async function saveToAzureStorage(context, encodedFilename, file) {
  const { containerClient } = await getBlobClient();
  const contentType = mime.lookup(encodedFilename);

  // Sanitize first, then URI-encode exactly once (no double encoding).
  const blobName = encodeURIComponent(sanitizeFilename(encodedFilename));

  const uploadOptions = {
    blobHTTPHeaders: contentType ? { blobContentType: contentType } : {},
    maxConcurrency: 50,
    blockSize: 8 * 1024 * 1024,
  };

  const blockBlobClient = containerClient.getBlockBlobClient(blobName);
  context.log(`Uploading to Azure... ${blobName}`);
  await blockBlobClient.uploadStream(file, undefined, undefined, uploadOptions);

  const sasToken = generateSASToken(containerClient, blobName);
  return `${blockBlobClient.url}?${sasToken}`;
}
641
+
642
// Helper function to upload a file to Google Cloud Storage: streams `file`
// into the configured bucket and returns the object's gs:// URL.
async function uploadToGCS(context, file, filename) {
  const objectName = sanitizeFilename(filename);
  const destination = gcs.bucket(GCS_BUCKETNAME).file(objectName);

  const writeStream = destination.createWriteStream({
    resumable: true,
    validation: false,
    metadata: {
      contentType: mime.lookup(objectName) || 'application/octet-stream',
    },
    chunkSize: 8 * 1024 * 1024,
    numRetries: 3,
    retryDelay: 1000,
  });

  context.log(`Uploading to GCS... ${objectName}`);
  await pipeline(file, writeStream);
  return `gs://${GCS_BUCKETNAME}/${objectName}`;
}
660
+
661
// Wrapper that checks if GCS is configured before delegating to uploadToGCS.
// Note the argument order changes here: (context, filename, file) callers
// map onto uploadToGCS(context, file, filename).
async function saveToGoogleStorage(context, encodedFilename, file) {
  if (!gcs) {
    throw new Error('Google Cloud Storage is not initialized');
  }
  return uploadToGCS(context, file, encodedFilename);
}
668
+
669
/**
 * Buffers an incoming file stream to a temp file, then uploads it in
 * parallel to the primary store (local folder or Azure) and, when
 * configured, to GCS; finally runs an optional format conversion.
 *
 * @param {object} context - function context (context.log, context.res)
 * @param {string} requestId - id used to namespace the stored file
 * @param {object} body - request body; body.url (if set) is cleaned up on failure
 * @param {boolean} saveToLocal - save primary copy locally instead of Azure
 * @param {stream.Readable} file - incoming file stream
 * @param {string} filename - original file name
 * @param {Function} resolve - promise resolver from the caller (uploadBlob)
 * @param {string|null} [hash=null] - optional content hash echoed in result
 * @returns {Promise<void>} resolves via `resolve(result)`; also sets context.res
 */
async function uploadFile(
  context,
  requestId,
  body,
  saveToLocal,
  file,
  filename,
  resolve,
  hash = null,
) {
  try {
    if (!file) {
      context.res = {
        status: 400,
        body: 'No file provided in request',
      };
      resolve(context.res);
      return;
    }

    const ext = path.extname(filename).toLowerCase();
    context.log(`Processing file with extension: ${ext}`);
    let uploadPath = null;
    let uploadName = null;
    let tempDir = null;

    // Create temp directory for file operations
    tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'upload-'));
    const tempOriginal = path.join(tempDir, filename);
    context.log(`Created temp directory: ${tempDir}`);

    // Optimize initial write with larger buffer
    const writeStream = fs.createWriteStream(tempOriginal, {
      highWaterMark: 1024 * 1024, // 1MB chunks for initial write
      autoClose: true,
    });

    // Use pipeline with error handling
    context.log('Writing file to temp location...');
    await pipeline(file, writeStream);
    context.log('File written to temp location successfully');

    uploadPath = tempOriginal;
    // FIX: the upload name must interpolate the actual filename; the previous
    // code emitted the literal text "$(unknown)" instead of the file's name.
    // This matches the multipart path's `${requestId || uuidv4()}_<name>` shape.
    uploadName = `${requestId || uuidv4()}_${filename}`;
    context.log(`Prepared upload name: ${uploadName}`);

    // Create optimized read streams with larger buffers for storage uploads
    const createOptimizedReadStream = (path) => fs.createReadStream(path, {
      highWaterMark: 1024 * 1024, // 1MB chunks for storage uploads
      autoClose: true,
    });

    // Upload original in parallel with optimized streams
    const storagePromises = [];
    context.log('Starting primary storage upload...');
    const primaryPromise = saveToLocal
      ? saveToLocalStorage(
          context,
          requestId,
          uploadName,
          createOptimizedReadStream(uploadPath),
        )
      : saveToAzureStorage(
          context,
          uploadName,
          createOptimizedReadStream(uploadPath),
        );
    storagePromises.push(
      primaryPromise.then((url) => {
        context.log('Primary storage upload completed');
        return { url, type: 'primary' };
      }),
    );

    if (gcs) {
      context.log('Starting GCS upload...');
      storagePromises.push(
        saveToGoogleStorage(
          context,
          uploadName,
          createOptimizedReadStream(uploadPath),
        ).then((gcsUrl) => {
          context.log('GCS upload completed');
          return {
            gcs: gcsUrl,
            type: 'gcs',
          };
        }),
      );
    }

    // Wait for original uploads to complete
    context.log('Waiting for all storage uploads to complete...');
    const results = await Promise.all(storagePromises);
    // Fold the per-destination results into one response object.
    const result = {
      message: `File '${uploadName}' ${saveToLocal ? 'saved to folder' : 'uploaded'} successfully.`,
      filename: uploadName,
      ...results.reduce((acc, result) => {
        if (result.type === 'primary') acc.url = result.url;
        if (result.type === 'gcs') acc.gcs = ensureUnencodedGcsUrl(result.gcs);
        return acc;
      }, {}),
    };

    if (hash) {
      result.hash = hash;
    }

    // Initialize conversion service
    const conversionService = new FileConversionService(context, !saveToLocal);

    // Check if file needs conversion and handle it
    if (conversionService.needsConversion(filename)) {
      try {
        context.log('Starting file conversion...');
        // Convert the file
        const conversion = await conversionService.convertFile(uploadPath, result.url);
        context.log('File conversion completed:', conversion);

        if (conversion.converted) {
          context.log('Saving converted file...');
          // Save converted file
          const convertedSaveResult = await conversionService._saveConvertedFile(conversion.convertedPath, requestId);
          context.log('Converted file saved to primary storage');

          // If GCS is configured, also save to GCS
          let convertedGcsUrl;
          if (conversionService._isGCSConfigured()) {
            context.log('Saving converted file to GCS...');
            convertedGcsUrl = await conversionService._uploadChunkToGCS(conversion.convertedPath, requestId);
            context.log('Converted file saved to GCS');
          }

          // Add converted file info to result
          result.converted = {
            url: convertedSaveResult.url,
            gcs: convertedGcsUrl
          };
          context.log('Conversion process completed successfully');
        }
      } catch (error) {
        // Conversion is best-effort: log and keep the upload result.
        console.error('Error converting file:', error);
        context.log('Error during conversion:', error.message);
        // Don't fail the upload if conversion fails
      }
    }

    context.res = {
      status: 200,
      body: result,
    };

    // Clean up temp files
    context.log('Cleaning up temporary files...');
    if (tempDir) {
      fs.rmSync(tempDir, { recursive: true, force: true });
      context.log('Temporary files cleaned up');
    }

    context.log('Upload process completed successfully');
    resolve(result);
  } catch (error) {
    context.log('Error in upload process:', error);
    // Best-effort removal of any partially uploaded artifact.
    if (body.url) {
      try {
        await cleanup(context, [body.url]);
      } catch (cleanupError) {
        context.log('Error during cleanup after failure:', cleanupError);
      }
    }
    throw error;
  }
}
842
+
843
// Helper to convert a stream to a buffer
// Fully drains a readable stream and returns its contents as one Buffer.
// Rejects if the stream emits an error before completing.
async function streamToBuffer(stream) {
  // Collect every chunk via async iteration, then join them once at the end.
  const collected = [];
  for await (const piece of stream) {
    collected.push(piece);
  }
  return Buffer.concat(collected);
}
852
+
853
// Function to delete files that haven't been used in more than a month,
// or — when `urls` is supplied — delete exactly those blobs.
//
// @param {object} context - Azure Functions context (used for logging).
// @param {string[]|null} urls - Optional list of full blob URLs to delete;
//   when null, every blob last modified over one month ago is removed.
// @returns {Promise<string[]>} names of the blobs that were deleted.
async function cleanup(context, urls = null) {
  const { containerClient } = await getBlobClient();
  const cleanedURLs = [];

  if (!urls) {
    // Age-based sweep: delete anything not modified in the last month.
    const xMonthAgo = new Date();
    xMonthAgo.setMonth(xMonthAgo.getMonth() - 1);

    const blobs = containerClient.listBlobsFlat();

    for await (const blob of blobs) {
      const lastModified = blob.properties.lastModified;
      if (lastModified < xMonthAgo) {
        try {
          const blockBlobClient = containerClient.getBlockBlobClient(blob.name);
          await blockBlobClient.delete();
          context.log(`Cleaned blob: ${blob.name}`);
          cleanedURLs.push(blob.name);
        } catch (error) {
          // 404 means the blob is already gone — nothing to report.
          if (error.statusCode !== 404) {
            context.log(`Error cleaning blob ${blob.name}:`, error);
          }
        }
      }
    }
  } else {
    for (const url of urls) {
      try {
        // Derive the blob name from the full URL: drop the container prefix,
        // any SAS query string, and the leading slash. Previously the leading
        // "/" (and any "?sig=..." suffix) stayed in the name, so the delete
        // targeted a nonexistent blob and its 404 was silently swallowed.
        const blobName = url
          .replace(containerClient.url, '')
          .split('?')[0]
          .replace(/^\/+/, '');
        const blockBlobClient = containerClient.getBlockBlobClient(blobName);
        await blockBlobClient.delete();
        context.log(`Cleaned blob: ${blobName}`);
        cleanedURLs.push(blobName);
      } catch (error) {
        // 404 means the blob is already gone — nothing to report.
        if (error.statusCode !== 404) {
          context.log(`Error cleaning blob ${url}:`, error);
        }
      }
    }
  }
  return cleanedURLs;
}
896
+
897
// GCS counterpart of cleanup(): either delete the given object URLs, or
// sweep every object not updated within the last 30 days.
//
// @param {string[]|null} urls - gs://bucket/path style URLs to delete;
//   when null, an age-based sweep runs instead.
// @returns {Promise<string[]>} deleted object names (sweep) or URLs (list).
async function cleanupGCS(urls = null) {
  if (!gcs) return [];
  const bucket = gcs.bucket(GCS_BUCKETNAME);
  const directories = new Set();
  const cleanedURLs = [];

  if (!urls) {
    const daysN = 30;
    const threshold = Date.now() - daysN * 24 * 60 * 60 * 1000;
    const [files] = await bucket.getFiles();

    for (const file of files) {
      const [metadata] = await file.getMetadata();
      const directoryPath = path.dirname(file.name);
      directories.add(directoryPath);
      if (metadata.updated) {
        const updatedTime = new Date(metadata.updated).getTime();
        if (updatedTime < threshold) {
          await file.delete();
          cleanedURLs.push(file.name);
        }
      }
    }
  } else {
    for (const url of urls) {
      // gs://bucket/a/b/c -> a/b/c (skip the scheme and bucket segments).
      const filePath = url.split('/').slice(3).join('/');
      const file = bucket.file(filePath);
      directories.add(path.dirname(file.name));
      try {
        await file.delete();
        cleanedURLs.push(url);
      } catch (error) {
        // Tolerate already-deleted objects, matching the Azure cleanup();
        // previously any 404 here rejected and aborted the remaining deletes.
        if (error.code !== 404) {
          throw error;
        }
      }
    }
  }

  for (const directory of directories) {
    // NOTE(review): GCS has no real directories, so deleteFiles on a prefix
    // with zero files is effectively a no-op; kept for parity with the
    // original flow — confirm whether this step is still needed.
    const [files] = await bucket.getFiles({ prefix: directory });
    if (files.length === 0) {
      await bucket.deleteFiles({ prefix: directory });
    }
  }

  return cleanedURLs;
}
940
+
941
// Delete every GCS object whose name starts with `blobName`.
// Two code paths: the storage emulator (raw REST calls via axios — presumably
// because the client library does not target the emulator here; TODO confirm)
// and real GCS (client library). Errors never propagate: 404 means the object
// is already gone, anything else is logged so the caller's wider cleanup can
// continue.
//
// @param {string} blobName - object-name prefix to delete; no-op when falsy.
async function deleteGCS(blobName) {
  if (!blobName) {
    console.log('[deleteGCS] No blobName provided, skipping GCS deletion');
    return;
  }

  if (!gcs) {
    console.log('[deleteGCS] GCS not initialized, skipping deletion');
    return;
  }

  try {
    if (process.env.STORAGE_EMULATOR_HOST) {
      console.log(`[deleteGCS] Using emulator at ${process.env.STORAGE_EMULATOR_HOST}`);
      console.log(`[deleteGCS] Attempting to delete files with prefix: ${blobName}`);

      // List files first. The prefix must be URI-encoded: previously it was
      // interpolated raw, so names containing '&', '#', '?', '+' or spaces
      // produced a malformed query and listed the wrong (or no) objects.
      const listUrl = `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${GCS_BUCKETNAME}/o?prefix=${encodeURIComponent(blobName)}`;
      console.log(`[deleteGCS] Listing files with URL: ${listUrl}`);

      const listResponse = await axios.get(listUrl, {
        validateStatus: () => true, // never throw on HTTP status; inspect below
      });
      console.log(`[deleteGCS] List response status: ${listResponse.status}`);
      console.log(`[deleteGCS] List response data: ${JSON.stringify(listResponse.data)}`);

      if (listResponse.status === 200 && listResponse.data.items) {
        console.log(`[deleteGCS] Found ${listResponse.data.items.length} items to delete`);

        // Delete each listed object individually via the JSON API.
        for (const item of listResponse.data.items) {
          const deleteUrl = `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${GCS_BUCKETNAME}/o/${encodeURIComponent(item.name)}`;
          console.log(`[deleteGCS] Deleting file: ${item.name}`);
          console.log(`[deleteGCS] Delete URL: ${deleteUrl}`);

          const deleteResponse = await axios.delete(deleteUrl, {
            validateStatus: () => true,
            headers: {
              'Content-Type': 'application/json',
            },
          });
          console.log(`[deleteGCS] Delete response status: ${deleteResponse.status}`);
          console.log(`[deleteGCS] Delete response data: ${JSON.stringify(deleteResponse.data)}`);
        }
        console.log('[deleteGCS] All files deleted successfully');
      } else {
        console.log('[deleteGCS] No files found to delete');
      }
    } else {
      console.log('[deleteGCS] Using real GCS');
      const bucket = gcs.bucket(GCS_BUCKETNAME);
      const [files] = await bucket.getFiles({ prefix: blobName });
      console.log(`[deleteGCS] Found ${files.length} files to delete`);

      if (files.length > 0) {
        await Promise.all(files.map((file) => file.delete()));
        console.log('[deleteGCS] All files deleted successfully');
      } else {
        console.log('[deleteGCS] No files found to delete');
      }
    }
  } catch (error) {
    // If we get a 404 error, it means the file is already gone, which is fine
    if (error.response?.status === 404 || error.code === 404) {
      console.log('[deleteGCS] File not found in GCS (404) - this is expected if file was already deleted');
      return;
    }
    console.error('[deleteGCS] Error during deletion:', error);
    console.error('[deleteGCS] Error details:', {
      message: error.message,
      code: error.code,
      errors: error.errors,
      response: error.response ? {
        status: error.response.status,
        statusText: error.response.statusText,
        data: error.response.data,
        headers: error.response.headers,
      } : null,
    });
    // Don't throw the error - we want to continue with cleanup even if GCS deletion fails
  }
}
1023
+
1024
// Helper function to ensure GCS upload for existing files.
// If the record lacks a GCS copy and GCS is configured, streams the file
// from its primary URL into GCS and records the resulting location on the
// record (mutates and returns `existingFile`).
async function ensureGCSUpload(context, existingFile) {
  // Nothing to do when a GCS copy already exists or GCS is not configured.
  if (existingFile.gcs || !gcs) {
    return existingFile;
  }

  context.log('GCS file was missing - uploading.');
  const [urlWithoutQuery] = existingFile.url.split('?');
  const fileName = sanitizeFilename(path.basename(urlWithoutQuery));
  const response = await axios({ method: 'get', url: existingFile.url, responseType: 'stream' });
  existingFile.gcs = await uploadToGCS(context, response.data, fileName);
  return existingFile;
}
1034
+
1035
// Upload a single on-disk chunk to GCS, namespaced under the request id
// (or a fresh uuid when none is given) so parallel uploads never collide.
// Returns the gs:// URL of the uploaded object, or null when GCS is off.
async function uploadChunkToGCS(chunkPath, requestId) {
  if (!gcs) {
    return null;
  }
  const folder = requestId || uuidv4();
  const destination = `${folder}/${sanitizeFilename(path.basename(chunkPath))}`;
  await gcs.bucket(GCS_BUCKETNAME).upload(chunkPath, { destination });
  return `gs://${GCS_BUCKETNAME}/${destination}`;
}
1043
+
1044
// Public surface of this storage module: Azure blob save/delete/upload,
// month-based cleanup for both backends, GCS existence checks and chunk
// upload/download helpers, plus the shared `gcs` client instance.
export {
  saveFileToBlob,
  deleteBlob,
  deleteGCS,
  uploadBlob,
  cleanup,
  cleanupGCS,
  gcsUrlExists,
  ensureGCSUpload,
  gcs,
  uploadChunkToGCS,
  downloadFromGCS,
};