@aj-archipelago/cortex 1.3.49 → 1.3.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/config.js +1 -1
  2. package/helper-apps/cortex-browser/Dockerfile +19 -31
  3. package/helper-apps/cortex-browser/function_app.py +708 -181
  4. package/helper-apps/cortex-browser/requirements.txt +4 -4
  5. package/helper-apps/cortex-file-handler/blobHandler.js +850 -429
  6. package/helper-apps/cortex-file-handler/constants.js +64 -48
  7. package/helper-apps/cortex-file-handler/docHelper.js +7 -114
  8. package/helper-apps/cortex-file-handler/fileChunker.js +96 -51
  9. package/helper-apps/cortex-file-handler/function.json +2 -6
  10. package/helper-apps/cortex-file-handler/helper.js +34 -25
  11. package/helper-apps/cortex-file-handler/index.js +324 -136
  12. package/helper-apps/cortex-file-handler/localFileHandler.js +56 -57
  13. package/helper-apps/cortex-file-handler/package-lock.json +6065 -5964
  14. package/helper-apps/cortex-file-handler/package.json +8 -4
  15. package/helper-apps/cortex-file-handler/redis.js +23 -17
  16. package/helper-apps/cortex-file-handler/scripts/setup-azure-container.js +12 -9
  17. package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +21 -18
  18. package/helper-apps/cortex-file-handler/scripts/test-azure.sh +1 -1
  19. package/helper-apps/cortex-file-handler/scripts/test-gcs.sh +1 -1
  20. package/helper-apps/cortex-file-handler/services/ConversionService.js +288 -0
  21. package/helper-apps/cortex-file-handler/services/FileConversionService.js +53 -0
  22. package/helper-apps/cortex-file-handler/start.js +63 -38
  23. package/helper-apps/cortex-file-handler/tests/FileConversionService.test.js +144 -0
  24. package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +88 -64
  25. package/helper-apps/cortex-file-handler/tests/fileChunker.test.js +114 -91
  26. package/helper-apps/cortex-file-handler/tests/fileUpload.test.js +351 -0
  27. package/helper-apps/cortex-file-handler/tests/files/DOCX_TestPage.docx +0 -0
  28. package/helper-apps/cortex-file-handler/tests/files/tests-example.xls +0 -0
  29. package/helper-apps/cortex-file-handler/tests/start.test.js +943 -642
  30. package/helper-apps/cortex-file-handler/tests/testUtils.helper.js +31 -0
  31. package/helper-apps/cortex-markitdown/.funcignore +1 -0
  32. package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/__init__.py +64 -0
  33. package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/function.json +21 -0
  34. package/helper-apps/cortex-markitdown/README.md +94 -0
  35. package/helper-apps/cortex-markitdown/host.json +15 -0
  36. package/helper-apps/cortex-markitdown/requirements.txt +2 -0
  37. package/lib/requestExecutor.js +44 -36
  38. package/package.json +1 -1
  39. package/pathways/system/entity/tools/sys_tool_cognitive_search.js +1 -1
  40. package/pathways/system/entity/tools/sys_tool_readfile.js +24 -2
  41. package/server/plugins/openAiWhisperPlugin.js +59 -87
  42. package/helper-apps/cortex-file-handler/tests/docHelper.test.js +0 -148
@@ -1,68 +1,99 @@
1
- import fs from "fs";
2
- import path from "path";
3
- import { generateBlobSASQueryParameters, StorageSharedKeyCredential, BlobServiceClient } from "@azure/storage-blob";
4
- import { v4 as uuidv4 } from "uuid";
5
- import Busboy from "busboy";
6
- import { PassThrough } from "stream";
7
- import { pipeline as _pipeline } from "stream";
8
- import { promisify } from "util";
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+ import { join } from 'path';
4
+ import { PassThrough } from 'stream';
5
+ import { pipeline as _pipeline } from 'stream';
6
+ import { promisify } from 'util';
7
+
8
+ import {
9
+ generateBlobSASQueryParameters,
10
+ StorageSharedKeyCredential,
11
+ BlobServiceClient,
12
+ } from '@azure/storage-blob';
13
+ import { Storage } from '@google-cloud/storage';
14
+ import axios from 'axios';
15
+ import Busboy from 'busboy';
16
+ import { v4 as uuidv4 } from 'uuid';
9
17
  const pipeline = promisify(_pipeline);
10
- import { join } from "path";
11
- import { Storage } from "@google-cloud/storage";
12
- import axios from "axios";
13
- import { publicFolder, port, ipAddress } from "./start.js";
18
+
19
+ import { publicFolder, port, ipAddress } from './start.js';
20
+ import { CONVERTED_EXTENSIONS } from './constants.js';
21
+
14
22
  // eslint-disable-next-line import/no-extraneous-dependencies
15
- import mime from "mime-types";
23
+ import mime from 'mime-types';
24
+
25
+ import os from 'os';
26
+
27
+ import { FileConversionService } from './services/FileConversionService.js';
16
28
 
17
29
  function isBase64(str) {
18
- try {
19
- return btoa(atob(str)) == str;
20
- } catch (err) {
21
- return false;
22
- }
30
+ try {
31
+ return btoa(atob(str)) == str;
32
+ } catch (err) {
33
+ return false;
34
+ }
23
35
  }
24
36
 
25
37
  const { SAS_TOKEN_LIFE_DAYS = 30 } = process.env;
26
38
  const GCP_SERVICE_ACCOUNT_KEY =
27
39
  process.env.GCP_SERVICE_ACCOUNT_KEY_BASE64 ||
28
40
  process.env.GCP_SERVICE_ACCOUNT_KEY ||
29
- "{}";
41
+ '{}';
30
42
  const GCP_SERVICE_ACCOUNT = isBase64(GCP_SERVICE_ACCOUNT_KEY)
31
- ? JSON.parse(Buffer.from(GCP_SERVICE_ACCOUNT_KEY, "base64").toString())
32
- : JSON.parse(GCP_SERVICE_ACCOUNT_KEY);
43
+ ? JSON.parse(Buffer.from(GCP_SERVICE_ACCOUNT_KEY, 'base64').toString())
44
+ : JSON.parse(GCP_SERVICE_ACCOUNT_KEY);
33
45
  const { project_id: GCP_PROJECT_ID } = GCP_SERVICE_ACCOUNT;
34
46
 
35
47
  let gcs;
36
48
  if (!GCP_PROJECT_ID || !GCP_SERVICE_ACCOUNT) {
37
- console.warn(
38
- "No Google Cloud Storage credentials provided - GCS will not be used"
39
- );
49
+ console.warn(
50
+ 'No Google Cloud Storage credentials provided - GCS will not be used',
51
+ );
40
52
  } else {
41
- try {
42
- gcs = new Storage({
43
- projectId: GCP_PROJECT_ID,
44
- credentials: GCP_SERVICE_ACCOUNT,
45
- });
53
+ try {
54
+ gcs = new Storage({
55
+ projectId: GCP_PROJECT_ID,
56
+ credentials: GCP_SERVICE_ACCOUNT,
57
+ });
46
58
 
47
59
  // Rest of your Google Cloud operations using gcs object
48
- } catch (error) {
49
- console.error(
50
- "Google Cloud Storage credentials are invalid - GCS will not be used: ",
51
- error
52
- );
53
- }
60
+ } catch (error) {
61
+ console.error(
62
+ 'Google Cloud Storage credentials are invalid - GCS will not be used: ',
63
+ error,
64
+ );
65
+ }
54
66
  }
55
67
 
56
- export const AZURE_STORAGE_CONTAINER_NAME = process.env.AZURE_STORAGE_CONTAINER_NAME || "whispertempfiles";
57
- export const GCS_BUCKETNAME = process.env.GCS_BUCKETNAME || "cortextempfiles";
68
+ export const AZURE_STORAGE_CONTAINER_NAME =
69
+ process.env.AZURE_STORAGE_CONTAINER_NAME || 'whispertempfiles';
70
+ export const GCS_BUCKETNAME = process.env.GCS_BUCKETNAME || 'cortextempfiles';
71
+
72
+ function isEncoded(str) {
73
+ // Checks for any percent-encoded sequence
74
+ return /%[0-9A-Fa-f]{2}/.test(str);
75
+ }
76
+
77
+ // Helper function to ensure GCS URLs are never encoded
78
+ function ensureUnencodedGcsUrl(url) {
79
+ if (!url || !url.startsWith('gs://')) {
80
+ return url;
81
+ }
82
+ // Split into bucket and path parts
83
+ const [bucket, ...pathParts] = url.replace('gs://', '').split('/');
84
+ // Reconstruct URL with decoded path parts
85
+ return `gs://${bucket}/${pathParts.map(part => decodeURIComponent(part)).join('/')}`;
86
+ }
58
87
 
59
88
  async function gcsUrlExists(url, defaultReturn = false) {
60
89
  try {
61
- if(!url || !gcs) {
90
+ if (!url || !gcs) {
62
91
  return defaultReturn; // Cannot check return
63
92
  }
64
93
 
65
- const urlParts = url.replace('gs://', '').split('/');
94
+ // Ensure URL is not encoded
95
+ const unencodedUrl = ensureUnencodedGcsUrl(url);
96
+ const urlParts = unencodedUrl.replace('gs://', '').split('/');
66
97
  const bucketName = urlParts[0];
67
98
  const fileName = urlParts.slice(1).join('/');
68
99
 
@@ -70,7 +101,7 @@ async function gcsUrlExists(url, defaultReturn = false) {
70
101
  try {
71
102
  const response = await axios.get(
72
103
  `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${bucketName}/o/${encodeURIComponent(fileName)}`,
73
- { validateStatus: status => status === 200 || status === 404 }
104
+ { validateStatus: (status) => status === 200 || status === 404 },
74
105
  );
75
106
  return response.status === 200;
76
107
  } catch (error) {
@@ -83,7 +114,7 @@ async function gcsUrlExists(url, defaultReturn = false) {
83
114
  const file = bucket.file(fileName);
84
115
 
85
116
  const [exists] = await file.exists();
86
-
117
+
87
118
  return exists;
88
119
  } catch (error) {
89
120
  console.error('Error checking if GCS URL exists:', error);
@@ -91,477 +122,867 @@ async function gcsUrlExists(url, defaultReturn = false) {
91
122
  }
92
123
  }
93
124
 
125
+ /**
126
+ * Downloads a file from Google Cloud Storage to a local file
127
+ * @param {string} gcsUrl - The GCS URL in format gs://bucket-name/file-path
128
+ * @param {string} destinationPath - The local path where the file should be saved
129
+ * @returns {Promise<void>}
130
+ */
131
+ async function downloadFromGCS(gcsUrl, destinationPath) {
132
+ if (!gcsUrl || !gcs) {
133
+ throw new Error('Invalid GCS URL or GCS client not initialized');
134
+ }
135
+
136
+ const urlParts = gcsUrl.replace('gs://', '').split('/');
137
+ const bucketName = urlParts[0];
138
+ const fileName = urlParts.slice(1).join('/');
139
+
140
+ if (process.env.STORAGE_EMULATOR_HOST) {
141
+ // Use axios to download from emulator
142
+ const response = await axios({
143
+ method: 'GET',
144
+ url: `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${bucketName}/o/${encodeURIComponent(fileName)}?alt=media`,
145
+ responseType: 'stream'
146
+ });
147
+
148
+ // Write the response to file
149
+ const writer = fs.createWriteStream(destinationPath);
150
+ await new Promise((resolve, reject) => {
151
+ response.data.pipe(writer);
152
+ writer.on('finish', resolve);
153
+ writer.on('error', reject);
154
+ });
155
+ } else {
156
+ // Use GCS client for real GCS
157
+ const bucket = gcs.bucket(bucketName);
158
+ const file = bucket.file(fileName);
159
+ await file.download({ destination: destinationPath });
160
+ }
161
+ }
162
+
94
163
  export const getBlobClient = async () => {
95
- const connectionString = process.env.AZURE_STORAGE_CONNECTION_STRING;
96
- const containerName = AZURE_STORAGE_CONTAINER_NAME;
97
- if (!connectionString || !containerName) {
98
- throw new Error(
99
- "Missing Azure Storage connection string or container name environment variable"
100
- );
101
- }
164
+ const connectionString = process.env.AZURE_STORAGE_CONNECTION_STRING;
165
+ const containerName = AZURE_STORAGE_CONTAINER_NAME;
166
+ if (!connectionString || !containerName) {
167
+ throw new Error(
168
+ 'Missing Azure Storage connection string or container name environment variable',
169
+ );
170
+ }
102
171
 
103
- const blobServiceClient = BlobServiceClient.fromConnectionString(connectionString);
172
+ const blobServiceClient =
173
+ BlobServiceClient.fromConnectionString(connectionString);
104
174
 
105
- const serviceProperties = await blobServiceClient.getProperties();
106
- if(!serviceProperties.defaultServiceVersion) {
107
- serviceProperties.defaultServiceVersion = '2020-02-10';
108
- await blobServiceClient.setProperties(serviceProperties);
109
- }
175
+ const serviceProperties = await blobServiceClient.getProperties();
176
+ if (!serviceProperties.defaultServiceVersion) {
177
+ serviceProperties.defaultServiceVersion = '2020-02-10';
178
+ await blobServiceClient.setProperties(serviceProperties);
179
+ }
110
180
 
111
- const containerClient = blobServiceClient.getContainerClient(containerName);
181
+ const containerClient = blobServiceClient.getContainerClient(containerName);
112
182
 
113
- return { blobServiceClient, containerClient };
183
+ return { blobServiceClient, containerClient };
114
184
  };
115
185
 
116
186
  async function saveFileToBlob(chunkPath, requestId) {
117
- const { containerClient } = await getBlobClient();
118
- // Use the filename with a UUID as the blob name
119
- const blobName = `${requestId}/${uuidv4()}_${encodeURIComponent(path.basename(chunkPath))}`;
120
- const sasToken = generateSASToken(containerClient, blobName);
121
-
122
- // Create a read stream for the chunk file
123
- const fileStream = fs.createReadStream(chunkPath);
187
+ const { containerClient } = await getBlobClient();
188
+ // Use the filename with a UUID as the blob name
189
+ let baseName = path.basename(chunkPath);
190
+ // Remove any query parameters from the filename
191
+ baseName = baseName.split('?')[0];
192
+ // Only encode if not already encoded
193
+ if (!isEncoded(baseName)) {
194
+ baseName = encodeURIComponent(baseName);
195
+ }
196
+ const blobName = `${requestId}/${uuidv4()}_${baseName}`;
197
+
198
+ // Create a read stream for the chunk file
199
+ const fileStream = fs.createReadStream(chunkPath);
124
200
 
125
- // Upload the chunk to Azure Blob Storage using the stream
126
- const blockBlobClient = containerClient.getBlockBlobClient(blobName);
127
- await blockBlobClient.uploadStream(fileStream);
201
+ // Upload the chunk to Azure Blob Storage using the stream
202
+ const blockBlobClient = containerClient.getBlockBlobClient(blobName);
203
+ await blockBlobClient.uploadStream(fileStream);
128
204
 
129
- // Return the full URI of the uploaded blob
130
- const blobUrl = `${blockBlobClient.url}?${sasToken}`;
131
- return blobUrl;
205
+ // Generate SAS token after successful upload
206
+ const sasToken = generateSASToken(containerClient, blobName);
207
+
208
+ // Return an object with the URL property
209
+ return {
210
+ url: `${blockBlobClient.url}?${sasToken}`,
211
+ blobName: blobName
212
+ };
132
213
  }
133
214
 
134
- const generateSASToken = (containerClient, blobName, expiryTimeSeconds =
135
- parseInt(SAS_TOKEN_LIFE_DAYS) * 24 * 60 * 60
215
+ const generateSASToken = (
216
+ containerClient,
217
+ blobName,
218
+ expiryTimeSeconds = parseInt(SAS_TOKEN_LIFE_DAYS) * 24 * 60 * 60,
136
219
  ) => {
137
- const { accountName, accountKey } = containerClient.credential;
138
- const sharedKeyCredential = new StorageSharedKeyCredential(accountName, accountKey);
139
-
140
- const sasOptions = {
141
- containerName: containerClient.containerName,
142
- blobName: blobName,
143
- permissions: "r", // Read permission
144
- startsOn: new Date(),
145
- expiresOn: new Date(new Date().valueOf() + expiryTimeSeconds * 1000)
146
- };
147
-
148
- const sasToken = generateBlobSASQueryParameters(sasOptions, sharedKeyCredential).toString();
149
- return sasToken;
220
+ const { accountName, accountKey } = containerClient.credential;
221
+ const sharedKeyCredential = new StorageSharedKeyCredential(
222
+ accountName,
223
+ accountKey,
224
+ );
225
+
226
+ const sasOptions = {
227
+ containerName: containerClient.containerName,
228
+ blobName: blobName,
229
+ permissions: 'r', // Read permission
230
+ startsOn: new Date(),
231
+ expiresOn: new Date(new Date().valueOf() + expiryTimeSeconds * 1000),
232
+ };
233
+
234
+ const sasToken = generateBlobSASQueryParameters(
235
+ sasOptions,
236
+ sharedKeyCredential,
237
+ ).toString();
238
+ return sasToken;
150
239
  };
151
240
 
152
241
  //deletes blob that has the requestId
153
242
  async function deleteBlob(requestId) {
154
- if (!requestId) throw new Error("Missing requestId parameter");
155
- const { containerClient } = await getBlobClient();
156
- // List all blobs in the container
157
- const blobs = containerClient.listBlobsFlat();
158
-
159
- const result = [];
160
- // Iterate through the blobs
161
- for await (const blob of blobs) {
243
+ if (!requestId) throw new Error('Missing requestId parameter');
244
+ const { containerClient } = await getBlobClient();
245
+ // List all blobs in the container
246
+ const blobs = containerClient.listBlobsFlat();
247
+
248
+ const result = [];
249
+ // Iterate through the blobs
250
+ for await (const blob of blobs) {
162
251
  // Check if the blob name starts with requestId_ (flat structure)
163
252
  // or is inside a folder named requestId/ (folder structure)
164
- if (blob.name.startsWith(`${requestId}_`) || blob.name.startsWith(`${requestId}/`)) {
165
- // Delete the matching blob
166
- const blockBlobClient = containerClient.getBlockBlobClient(blob.name);
167
- await blockBlobClient.delete();
168
- console.log(`Cleaned blob: ${blob.name}`);
169
- result.push(blob.name);
253
+ if (
254
+ blob.name.startsWith(`${requestId}_`) ||
255
+ blob.name.startsWith(`${requestId}/`)
256
+ ) {
257
+ // Delete the matching blob
258
+ const blockBlobClient = containerClient.getBlockBlobClient(blob.name);
259
+ await blockBlobClient.delete();
260
+ console.log(`Cleaned blob: ${blob.name}`);
261
+ result.push(blob.name);
262
+ }
170
263
  }
171
- }
172
264
 
173
- return result;
265
+ return result;
174
266
  }
175
267
 
176
- function uploadBlob(context, req, saveToLocal = false, filePath=null, hash=null) {
177
- return new Promise((resolve, reject) => {
178
- (async () => {
179
- try {
180
- let requestId = uuidv4();
181
- let body = {};
182
-
183
- // If filePath is given, we are dealing with local file and not form-data
184
- if (filePath) {
185
- const file = fs.createReadStream(filePath);
186
- const filename = path.basename(filePath);
187
- try {
188
- const result = await uploadFile(context, requestId, body, saveToLocal, file, filename, resolve, hash);
189
- resolve(result);
190
- } catch (error) {
191
- const err = new Error("Error processing file upload.");
192
- err.status = 500;
193
- throw err;
194
- }
195
- } else {
196
- // Otherwise, continue working with form-data
197
- const busboy = Busboy({ headers: req.headers });
198
- let hasFile = false;
199
- let errorOccurred = false;
200
-
201
- busboy.on("field", (fieldname, value) => {
202
- if (fieldname === "requestId") {
203
- requestId = value;
204
- }
205
- });
206
-
207
- busboy.on("file", async (fieldname, file, filename) => {
208
- if (errorOccurred) return;
209
- hasFile = true;
210
- uploadFile(context, requestId, body, saveToLocal, file, filename?.filename || filename, resolve, hash).catch(_error => {
211
- if (errorOccurred) return;
212
- errorOccurred = true;
213
- const err = new Error("Error processing file upload.");
214
- err.status = 500;
215
- reject(err);
216
- });
217
- });
218
-
219
- busboy.on("error", (_error) => {
220
- if (errorOccurred) return;
221
- errorOccurred = true;
222
- const err = new Error("No file provided in request");
223
- err.status = 400;
224
- reject(err);
225
- });
226
-
227
- busboy.on("finish", () => {
228
- if (errorOccurred) return;
229
- if (!hasFile) {
230
- errorOccurred = true;
231
- const err = new Error("No file provided in request");
232
- err.status = 400;
233
- reject(err);
234
- }
235
- });
236
-
237
- // Handle errors from piping the request
238
- req.on('error', (error) => {
239
- if (errorOccurred) return;
240
- errorOccurred = true;
241
- // Only log unexpected errors
242
- if (error.message !== "No file provided in request") {
243
- context.log("Error in request stream:", error);
244
- }
245
- const err = new Error("No file provided in request");
246
- err.status = 400;
247
- reject(err);
248
- });
249
-
250
- try {
251
- req.pipe(busboy);
252
- } catch (error) {
253
- if (errorOccurred) return;
254
- errorOccurred = true;
255
- // Only log unexpected errors
256
- if (error.message !== "No file provided in request") {
257
- context.log("Error piping request to busboy:", error);
268
+ function uploadBlob(
269
+ context,
270
+ req,
271
+ saveToLocal = false,
272
+ filePath = null,
273
+ hash = null,
274
+ ) {
275
+ return new Promise((resolve, reject) => {
276
+ (async () => {
277
+ try {
278
+ let requestId = uuidv4();
279
+ const body = {};
280
+
281
+ // If filePath is given, we are dealing with local file and not form-data
282
+ if (filePath) {
283
+ const file = fs.createReadStream(filePath);
284
+ const filename = path.basename(filePath);
285
+ try {
286
+ const result = await uploadFile(
287
+ context,
288
+ requestId,
289
+ body,
290
+ saveToLocal,
291
+ file,
292
+ filename,
293
+ resolve,
294
+ hash,
295
+ );
296
+ resolve(result);
297
+ } catch (error) {
298
+ const err = new Error('Error processing file upload.');
299
+ err.status = 500;
300
+ throw err;
301
+ }
302
+ } else {
303
+ // Otherwise, continue working with form-data
304
+ const busboy = Busboy({ headers: req.headers });
305
+ let hasFile = false;
306
+ let errorOccurred = false;
307
+
308
+ busboy.on('field', (fieldname, value) => {
309
+ if (fieldname === 'requestId') {
310
+ requestId = value;
311
+ } else if (fieldname === 'hash') {
312
+ hash = value;
313
+ }
314
+ });
315
+
316
+ busboy.on('file', async (fieldname, file, info) => {
317
+ if (errorOccurred) return;
318
+ hasFile = true;
319
+
320
+ // Validate file
321
+ if (!info.filename || info.filename.trim() === '') {
322
+ errorOccurred = true;
323
+ const err = new Error('Invalid file: missing filename');
324
+ err.status = 400;
325
+ reject(err);
326
+ return;
327
+ }
328
+
329
+ // Prepare for streaming to cloud destinations
330
+ const filename = info.filename;
331
+ const safeFilename = path.basename(filename); // Sanitize filename
332
+ const uploadName = `${requestId || uuidv4()}_${safeFilename}`;
333
+ const azureStream = !saveToLocal ? new PassThrough() : null;
334
+ const gcsStream = gcs ? new PassThrough() : null;
335
+ let diskWriteStream, tempDir, tempFilePath;
336
+ let diskWritePromise;
337
+ let diskWriteError = null;
338
+ let cloudUploadError = null;
339
+
340
+ // Start local disk write in parallel (non-blocking for response)
341
+ if (saveToLocal) {
342
+ try {
343
+ tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'upload-'));
344
+ } catch (err) {
345
+ console.error('Error creating tempDir:', err);
346
+ errorOccurred = true;
347
+ reject(err);
348
+ return;
349
+ }
350
+ tempFilePath = path.join(tempDir, safeFilename);
351
+ console.log('Temp dir:', tempDir, 'Original filename:', filename, 'Safe filename:', safeFilename, 'Temp file path:', tempFilePath);
352
+ console.log('About to create write stream for:', tempFilePath);
353
+ try {
354
+ diskWriteStream = fs.createWriteStream(tempFilePath, {
355
+ highWaterMark: 1024 * 1024,
356
+ autoClose: true,
357
+ });
358
+ console.log('Write stream created successfully for:', tempFilePath);
359
+ } catch (err) {
360
+ console.error('Error creating write stream:', err, 'Temp dir exists:', fs.existsSync(tempDir));
361
+ errorOccurred = true;
362
+ reject(err);
363
+ return;
364
+ }
365
+ diskWriteStream.on('error', (err) => {
366
+ console.error('Disk write stream error:', err);
367
+ });
368
+ diskWriteStream.on('close', () => {
369
+ console.log('Disk write stream closed for:', tempFilePath);
370
+ });
371
+ diskWritePromise = new Promise((res, rej) => {
372
+ diskWriteStream.on('finish', res);
373
+ diskWriteStream.on('error', (err) => {
374
+ diskWriteError = err;
375
+ rej(err);
376
+ });
377
+ });
378
+ }
379
+
380
+ // Pipe incoming file to all destinations
381
+ let receivedAnyData = false;
382
+ file.on('data', () => { receivedAnyData = true; });
383
+ if (azureStream) file.pipe(azureStream);
384
+ if (gcsStream) file.pipe(gcsStream);
385
+ if (diskWriteStream) file.pipe(diskWriteStream);
386
+
387
+ // Listen for end event to check for empty file
388
+ file.on('end', async () => {
389
+ if (!receivedAnyData) {
390
+ errorOccurred = true;
391
+ // Abort all streams
392
+ if (azureStream) azureStream.destroy();
393
+ if (gcsStream) gcsStream.destroy();
394
+ if (diskWriteStream) diskWriteStream.destroy();
395
+ const err = new Error('Invalid file: file is empty');
396
+ err.status = 400;
397
+ reject(err);
398
+ }
399
+ });
400
+
401
+ // Start cloud uploads immediately
402
+ let azurePromise;
403
+ if (!saveToLocal) {
404
+ azurePromise = saveToAzureStorage(context, uploadName, azureStream)
405
+ .catch(async (err) => {
406
+ cloudUploadError = err;
407
+ // Fallback: try from disk if available
408
+ if (diskWritePromise) {
409
+ await diskWritePromise;
410
+ const diskStream = fs.createReadStream(tempFilePath, {
411
+ highWaterMark: 1024 * 1024,
412
+ autoClose: true,
413
+ });
414
+ return saveToAzureStorage(context, uploadName, diskStream);
415
+ }
416
+ throw err;
417
+ });
418
+ }
419
+ let gcsPromise;
420
+ if (gcsStream) {
421
+ gcsPromise = saveToGoogleStorage(context, uploadName, gcsStream)
422
+ .catch(async (err) => {
423
+ cloudUploadError = err;
424
+ if (diskWritePromise) {
425
+ await diskWritePromise;
426
+ const diskStream = fs.createReadStream(tempFilePath, {
427
+ highWaterMark: 1024 * 1024,
428
+ autoClose: true,
429
+ });
430
+ return saveToGoogleStorage(context, uploadName, diskStream);
431
+ }
432
+ throw err;
433
+ });
434
+ }
435
+
436
+ // Wait for cloud uploads to finish
437
+ try {
438
+ const results = await Promise.all([
439
+ azurePromise ? azurePromise.then((url) => ({ url, type: 'primary' })) : null,
440
+ (!azurePromise && saveToLocal)
441
+ ? Promise.resolve({ url: null, type: 'primary-local' }) // placeholder for local, url handled later
442
+ : null,
443
+ gcsPromise ? gcsPromise.then((gcs) => ({ gcs, type: 'gcs' })) : null,
444
+ ].filter(Boolean));
445
+
446
+ const result = {
447
+ message: `File '${uploadName}' uploaded successfully.`,
448
+ filename: uploadName,
449
+ ...results.reduce((acc, result) => {
450
+ if (result.type === 'primary') acc.url = result.url;
451
+ if (result.type === 'gcs') acc.gcs = ensureUnencodedGcsUrl(result.gcs);
452
+ return acc;
453
+ }, {}),
454
+ };
455
+ if (hash) result.hash = hash;
456
+
457
+ // If saving locally, wait for disk write to finish and then move to public folder
458
+ if (saveToLocal) {
459
+ try {
460
+ if (diskWritePromise) {
461
+ await diskWritePromise; // ensure file fully written
462
+ }
463
+ const localUrl = await saveToLocalStorage(
464
+ context,
465
+ requestId,
466
+ uploadName,
467
+ fs.createReadStream(tempFilePath, {
468
+ highWaterMark: 1024 * 1024,
469
+ autoClose: true,
470
+ }),
471
+ );
472
+ result.url = localUrl;
473
+ } catch (err) {
474
+ console.error('Error saving to local storage:', err);
475
+ throw err;
476
+ }
477
+ }
478
+
479
+ // Respond as soon as cloud uploads are done
480
+ context.res = { status: 200, body: result };
481
+ resolve(result);
482
+ } catch (err) {
483
+ errorOccurred = true;
484
+ reject(err);
485
+ } finally {
486
+ // Clean up temp file if written
487
+ if (tempDir) {
488
+ fs.rmSync(tempDir, { recursive: true, force: true });
489
+ }
490
+ }
491
+ });
492
+
493
+ busboy.on('error', (error) => {
494
+ if (errorOccurred) return;
495
+ errorOccurred = true;
496
+ const err = new Error('No file provided in request');
497
+ err.status = 400;
498
+ reject(err);
499
+ });
500
+
501
+ busboy.on('finish', () => {
502
+ if (errorOccurred) return;
503
+ if (!hasFile) {
504
+ errorOccurred = true;
505
+ const err = new Error('No file provided in request');
506
+ err.status = 400;
507
+ reject(err);
508
+ }
509
+ });
510
+
511
+ // Handle errors from piping the request
512
+ req.on('error', (error) => {
513
+ if (errorOccurred) return;
514
+ errorOccurred = true;
515
+ // Only log unexpected errors
516
+ if (error.message !== 'No file provided in request') {
517
+ context.log('Error in request stream:', error);
518
+ }
519
+ const err = new Error('No file provided in request');
520
+ err.status = 400;
521
+ reject(err);
522
+ });
523
+
524
+ try {
525
+ req.pipe(busboy);
526
+ } catch (error) {
527
+ if (errorOccurred) return;
528
+ errorOccurred = true;
529
+ // Only log unexpected errors
530
+ if (error.message !== 'No file provided in request') {
531
+ context.log('Error piping request to busboy:', error);
532
+ }
533
+ const err = new Error('No file provided in request');
534
+ err.status = 400;
535
+ reject(err);
536
+ }
537
+ }
538
+ } catch (error) {
539
+ // Only log unexpected errors
540
+ if (error.message !== 'No file provided in request') {
541
+ context.log('Error processing file upload:', error);
542
+ }
543
+ const err = new Error(error.message || 'Error processing file upload.');
544
+ err.status = error.status || 500;
545
+ reject(err);
258
546
  }
259
- const err = new Error("No file provided in request");
260
- err.status = 400;
261
- reject(err);
262
- }
263
- }
264
- } catch (error) {
265
- // Only log unexpected errors
266
- if (error.message !== "No file provided in request") {
267
- context.log("Error processing file upload:", error);
268
- }
269
- const err = new Error(error.message || "Error processing file upload.");
270
- err.status = error.status || 500;
271
- reject(err);
272
- }
273
- })();
274
- });
547
+ })();
548
+ });
275
549
  }
276
550
 
277
551
  // Helper function to handle local file storage
278
552
  async function saveToLocalStorage(context, requestId, encodedFilename, file) {
279
- const localPath = join(publicFolder, requestId);
280
- fs.mkdirSync(localPath, { recursive: true });
281
- const destinationPath = `${localPath}/${encodedFilename}`;
282
- context.log(`Saving to local storage... ${destinationPath}`);
283
- await pipeline(file, fs.createWriteStream(destinationPath));
284
- return `http://${ipAddress}:${port}/files/${requestId}/${encodedFilename}`;
553
+ const localPath = join(publicFolder, requestId);
554
+ fs.mkdirSync(localPath, { recursive: true });
555
+ const destinationPath = `${localPath}/${encodedFilename}`;
556
+ context.log(`Saving to local storage... ${destinationPath}`);
557
+ await pipeline(file, fs.createWriteStream(destinationPath));
558
+ return `http://${ipAddress}:${port}/files/${requestId}/${encodedFilename}`;
285
559
  }
286
560
 
287
561
  // Helper function to handle Azure blob storage
288
562
  async function saveToAzureStorage(context, encodedFilename, file) {
289
- const { containerClient } = await getBlobClient();
290
- const contentType = mime.lookup(encodedFilename);
291
- const options = contentType ? { blobHTTPHeaders: { blobContentType: contentType } } : {};
292
-
293
- const blockBlobClient = containerClient.getBlockBlobClient(encodedFilename);
294
-
295
- context.log(`Uploading to Azure... ${encodedFilename}`);
296
- await blockBlobClient.uploadStream(file, undefined, undefined, options);
297
- const sasToken = generateSASToken(containerClient, encodedFilename);
298
- return `${blockBlobClient.url}?${sasToken}`;
563
+ const { containerClient } = await getBlobClient();
564
+ const contentType = mime.lookup(encodedFilename);
565
+
566
+ // Decode the filename if it's already encoded to prevent double-encoding
567
+ let blobName = encodedFilename;
568
+ if (isEncoded(blobName)) {
569
+ blobName = decodeURIComponent(blobName);
570
+ }
571
+
572
+ const options = {
573
+ blobHTTPHeaders: contentType ? { blobContentType: contentType } : {},
574
+ maxConcurrency: 50,
575
+ blockSize: 8 * 1024 * 1024,
576
+ };
577
+
578
+ const blockBlobClient = containerClient.getBlockBlobClient(blobName);
579
+ context.log(`Uploading to Azure... ${blobName}`);
580
+ await blockBlobClient.uploadStream(file, undefined, undefined, options);
581
+ const sasToken = generateSASToken(containerClient, blobName);
582
+ return `${blockBlobClient.url}?${sasToken}`;
299
583
  }
300
584
 
301
585
// Helper function to upload a file to Google Cloud Storage
async function uploadToGCS(context, file, encodedFilename) {
  const destination = gcs
    .bucket(GCS_BUCKETNAME)
    .file(encodedFilename)
    .createWriteStream({
      resumable: true,
      validation: false,
      metadata: {
        // Fall back to a generic binary type when mime lookup fails.
        contentType: mime.lookup(encodedFilename) || 'application/octet-stream',
      },
      chunkSize: 8 * 1024 * 1024, // 8MB upload chunks
      numRetries: 3,
      retryDelay: 1000,
    });

  context.log(`Uploading to GCS... ${encodedFilename}`);
  await pipeline(file, destination);

  // Never encode GCS URLs
  return `gs://${GCS_BUCKETNAME}/${encodedFilename}`;
}

// Helper function to handle Google Cloud Storage
// Thin guard around uploadToGCS: fails fast when the GCS client was never set up.
async function saveToGoogleStorage(context, encodedFilename, file) {
  if (gcs) {
    return uploadToGCS(context, file, encodedFilename);
  }
  throw new Error('Google Cloud Storage is not initialized');
}

/**
 * Handle a file upload: stage the incoming stream to a temp file, upload it to
 * primary storage (local folder or Azure) and optionally mirror it to GCS in
 * parallel, run any needed format conversion, and resolve with the result.
 *
 * @param {object} context - Azure Functions context (logging, response).
 * @param {string} requestId - correlation id; used as the upload-name prefix.
 * @param {object} body - original request body (body.url used for cleanup on failure).
 * @param {boolean} saveToLocal - true => local folder, false => Azure blob storage.
 * @param {stream.Readable} file - the incoming file stream.
 * @param {string} filename - original client filename.
 * @param {Function} resolve - resolver invoked with the result object.
 * @param {string|null} hash - optional content hash echoed back in the result.
 * @throws rethrows any upload error after attempting cleanup of body.url.
 */
async function uploadFile(
  context,
  requestId,
  body,
  saveToLocal,
  file,
  filename,
  resolve,
  hash = null,
) {
  let tempDir = null;
  try {
    if (!file) {
      context.res = {
        status: 400,
        body: 'No file provided in request',
      };
      resolve(context.res);
      return;
    }

    const ext = path.extname(filename).toLowerCase();
    context.log(`Processing file with extension: ${ext}`);

    // Stage the upload on disk so we can open multiple read streams from it.
    tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'upload-'));
    const tempOriginal = path.join(tempDir, filename);
    context.log(`Created temp directory: ${tempDir}`);

    // Larger buffer for the initial write to disk.
    const writeStream = fs.createWriteStream(tempOriginal, {
      highWaterMark: 1024 * 1024, // 1MB chunks for initial write
      autoClose: true,
    });

    context.log('Writing file to temp location...');
    await pipeline(file, writeStream);
    context.log('File written to temp location successfully');

    const uploadPath = tempOriginal;
    // NOTE(review): the published diff redacted the suffix of this template;
    // reconstructed as `filename` to match the `result.filename` usage — confirm.
    const uploadName = `${requestId || uuidv4()}_${filename}`;
    context.log(`Prepared upload name: ${uploadName}`);

    // Each storage target gets its own read stream over the staged file.
    const createOptimizedReadStream = (p) =>
      fs.createReadStream(p, {
        highWaterMark: 1024 * 1024, // 1MB chunks for storage uploads
        autoClose: true,
      });

    // Upload to primary storage (and GCS, if configured) in parallel.
    const storagePromises = [];
    context.log('Starting primary storage upload...');
    const primaryPromise = saveToLocal
      ? saveToLocalStorage(
          context,
          requestId,
          uploadName,
          createOptimizedReadStream(uploadPath),
        )
      : saveToAzureStorage(
          context,
          uploadName,
          createOptimizedReadStream(uploadPath),
        );
    storagePromises.push(
      primaryPromise.then((url) => {
        context.log('Primary storage upload completed');
        return { url, type: 'primary' };
      }),
    );

    if (gcs) {
      context.log('Starting GCS upload...');
      storagePromises.push(
        saveToGoogleStorage(
          context,
          uploadName,
          createOptimizedReadStream(uploadPath),
        ).then((gcsUrl) => {
          context.log('GCS upload completed');
          return { gcs: gcsUrl, type: 'gcs' };
        }),
      );
    }

    context.log('Waiting for all storage uploads to complete...');
    const results = await Promise.all(storagePromises);
    const result = {
      message: `File '${uploadName}' ${saveToLocal ? 'saved to folder' : 'uploaded'} successfully.`,
      filename: uploadName,
      ...results.reduce((acc, entry) => {
        if (entry.type === 'primary') acc.url = entry.url;
        if (entry.type === 'gcs') acc.gcs = ensureUnencodedGcsUrl(entry.gcs);
        return acc;
      }, {}),
    };

    if (hash) {
      result.hash = hash;
    }

    // Run format conversion (e.g. office docs) after the original is stored.
    const conversionService = new FileConversionService(context, !saveToLocal);
    if (conversionService.needsConversion(filename)) {
      try {
        context.log('Starting file conversion...');
        const conversion = await conversionService.convertFile(
          uploadPath,
          result.url,
        );
        context.log('File conversion completed:', conversion);

        if (conversion.converted) {
          context.log('Saving converted file...');
          const convertedSaveResult = await conversionService._saveConvertedFile(
            conversion.convertedPath,
            requestId,
          );
          context.log('Converted file saved to primary storage');

          // Mirror the converted file to GCS when configured.
          let convertedGcsUrl;
          if (conversionService._isGCSConfigured()) {
            context.log('Saving converted file to GCS...');
            convertedGcsUrl = await conversionService._uploadChunkToGCS(
              conversion.convertedPath,
              requestId,
            );
            context.log('Converted file saved to GCS');
          }

          result.converted = {
            url: convertedSaveResult.url,
            gcs: convertedGcsUrl,
          };
          context.log('Conversion process completed successfully');
        }
      } catch (error) {
        console.error('Error converting file:', error);
        context.log('Error during conversion:', error.message);
        // Don't fail the upload if conversion fails
      }
    }

    context.res = {
      status: 200,
      body: result,
    };

    context.log('Upload process completed successfully');
    resolve(result);
  } catch (error) {
    context.log('Error in upload process:', error);
    if (body.url) {
      try {
        await cleanup(context, [body.url]);
      } catch (cleanupError) {
        context.log('Error during cleanup after failure:', cleanupError);
      }
    }
    throw error;
  } finally {
    // BUGFIX: always remove the staged temp directory — previously it was only
    // deleted on the success path, leaking disk space whenever an upload or
    // conversion step threw.
    if (tempDir) {
      context.log('Cleaning up temporary files...');
      fs.rmSync(tempDir, { recursive: true, force: true });
      context.log('Temporary files cleaned up');
    }
  }
}

// Helper to convert a stream to a buffer.
// Collects every 'data' chunk and concatenates on 'end'; rejects on 'error'.
async function streamToBuffer(stream) {
  return new Promise((resolve, reject) => {
    const chunks = [];
    // BUGFIX: streams with an encoding set emit strings, and Buffer.concat
    // throws on non-Buffer entries — coerce each chunk to a Buffer.
    stream.on('data', (chunk) =>
      chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)),
    );
    stream.on('end', () => resolve(Buffer.concat(chunks)));
    stream.on('error', reject);
  });
}

// Function to delete files that haven't been used in more than a month.
// With no `urls`, sweeps every blob older than one month; with `urls`, deletes
// exactly those blobs. Returns the list of blob names that were removed.
async function cleanup(context, urls = null) {
  const { containerClient } = await getBlobClient();
  const cleanedURLs = [];

  if (!urls) {
    const xMonthAgo = new Date();
    xMonthAgo.setMonth(xMonthAgo.getMonth() - 1);

    const blobs = containerClient.listBlobsFlat();

    for await (const blob of blobs) {
      const lastModified = blob.properties.lastModified;
      if (lastModified < xMonthAgo) {
        try {
          const blockBlobClient = containerClient.getBlockBlobClient(blob.name);
          await blockBlobClient.delete();
          context.log(`Cleaned blob: ${blob.name}`);
          cleanedURLs.push(blob.name);
        } catch (error) {
          if (error.statusCode !== 404) {
            // Ignore "not found" errors
            context.log(`Error cleaning blob ${blob.name}:`, error);
          }
        }
      }
    }
  } else {
    for (const url of urls) {
      try {
        // BUGFIX: upload URLs carry a SAS query string ("?<token>") and, after
        // stripping the container URL, a leading "/" — both must be removed or
        // the derived name never matches the actual blob.
        const blobName = url
          .split('?')[0]
          .replace(containerClient.url, '')
          .replace(/^\//, '');
        const blockBlobClient = containerClient.getBlockBlobClient(blobName);
        await blockBlobClient.delete();
        context.log(`Cleaned blob: ${blobName}`);
        cleanedURLs.push(blobName);
      } catch (error) {
        if (error.statusCode !== 404) {
          // Ignore "not found" errors
          context.log(`Error cleaning blob ${url}:`, error);
        }
      }
    }
  }
  return cleanedURLs;
}

// Sweep Google Cloud Storage: with no `urls`, delete files not updated in the
// last 30 days; with `urls`, delete exactly those objects. Afterwards, remove
// any directory prefixes left with no files. Returns the removed entries.
async function cleanupGCS(urls = null) {
  const bucket = gcs.bucket(GCS_BUCKETNAME);
  const touchedDirectories = new Set();
  const removedEntries = [];

  if (!urls) {
    const daysN = 30;
    const cutoff = new Date(Date.now() - daysN * 24 * 60 * 60 * 1000);
    const [files] = await bucket.getFiles();

    for (const file of files) {
      const [metadata] = await file.getMetadata();
      touchedDirectories.add(path.dirname(file.name));
      if (metadata.updated) {
        const updatedTime = new Date(metadata.updated);
        if (updatedTime.getTime() < cutoff.getTime()) {
          console.log(`Cleaning file: ${file.name}`);
          await file.delete();
          removedEntries.push(file.name);
        }
      }
    }
  } else {
    try {
      for (const url of urls) {
        // Drop the "gs://bucket/" prefix to get the object path.
        const filename = path.join(url.split('/').slice(3).join('/'));
        const file = bucket.file(filename);
        touchedDirectories.add(path.dirname(file.name));
        await file.delete();
        removedEntries.push(url);
      }
    } catch (error) {
      console.error(`Error cleaning up files: ${error}`);
    }
  }

  // Remove directory prefixes that no longer contain any files.
  for (const directory of touchedDirectories) {
    const [files] = await bucket.getFiles({ prefix: directory });
    if (files.length === 0) {
      console.log(`Deleting empty directory: ${directory}`);
      await bucket.deleteFiles({ prefix: directory });
    }
  }

  return removedEntries;
}

/**
 * Delete every GCS object whose name starts with `blobName`.
 * Uses the emulator's HTTP JSON API when STORAGE_EMULATOR_HOST is set,
 * otherwise the real GCS SDK. Returns the list of deleted object names;
 * 404s are treated as already-deleted and yield an empty list.
 *
 * @param {string} blobName - object-name prefix to delete (required).
 * @throws when blobName is missing, GCS is uninitialized, or a non-404 error occurs.
 */
async function deleteGCS(blobName) {
  if (!blobName) throw new Error('Missing blobName parameter');
  if (!gcs) throw new Error('Google Cloud Storage is not initialized');

  try {
    const bucket = gcs.bucket(GCS_BUCKETNAME);
    const deletedFiles = [];

    if (process.env.STORAGE_EMULATOR_HOST) {
      // For fake GCS server, use HTTP API directly.
      // BUGFIX: use the configured emulator host instead of hard-coding
      // http://localhost:4443 — the env var is the source of truth.
      const emulatorBase = process.env.STORAGE_EMULATOR_HOST.replace(/\/$/, '');
      const response = await axios.get(
        `${emulatorBase}/storage/v1/b/${GCS_BUCKETNAME}/o`,
        { params: { prefix: blobName } },
      );
      if (response.data.items) {
        for (const item of response.data.items) {
          await axios.delete(
            `${emulatorBase}/storage/v1/b/${GCS_BUCKETNAME}/o/${encodeURIComponent(item.name)}`,
            { validateStatus: (status) => status === 200 || status === 404 },
          );
          deletedFiles.push(item.name);
        }
      }
    } else {
      // For real GCS, use the SDK
      const [files] = await bucket.getFiles({ prefix: blobName });
      for (const file of files) {
        await file.delete();
        deletedFiles.push(file.name);
      }
    }

    if (deletedFiles.length > 0) {
      console.log(`Cleaned GCS files: ${deletedFiles.join(', ')}`);
    }
    return deletedFiles;
  } catch (error) {
    if (error.code !== 404) {
      console.error(`Error in deleteGCS: ${error}`);
      throw error;
    }
    return [];
  }
}

// Helper function to ensure GCS upload for existing files.
// If GCS is configured but the record has no GCS mirror, stream the file down
// from its primary URL and up to GCS, then record the gs:// URL on the object.
async function ensureGCSUpload(context, existingFile) {
  // Nothing to do when a mirror exists or GCS is not configured.
  if (existingFile.gcs || !gcs) {
    return existingFile;
  }

  context.log('GCS file was missing - uploading.');

  // Derive the object name from the primary URL (query string stripped),
  // URL-encoding it unless it already is.
  let encodedFilename = path.basename(existingFile.url.split('?')[0]);
  if (!isEncoded(encodedFilename)) {
    encodedFilename = encodeURIComponent(encodedFilename);
  }

  // Download the file from Azure/local storage as a stream...
  const response = await axios({
    method: 'get',
    url: existingFile.url,
    responseType: 'stream',
  });

  // ...and pipe it straight into GCS.
  existingFile.gcs = await uploadToGCS(context, response.data, encodedFilename);
  return existingFile;
}

// Helper function to upload a chunk to GCS.
// Stores the chunk under "<requestId>/<encoded base name>" and returns its
// gs:// URL, or null when GCS is not configured.
async function uploadChunkToGCS(chunkPath, requestId) {
  if (!gcs) return null;

  const rawName = path.basename(chunkPath);
  // Encode the base name unless the caller already did.
  const safeName = isEncoded(rawName) ? rawName : encodeURIComponent(rawName);
  const gcsFileName = `${requestId}/${safeName}`;

  await gcs.bucket(GCS_BUCKETNAME).upload(chunkPath, {
    destination: gcsFileName,
  });
  return `gs://${GCS_BUCKETNAME}/${gcsFileName}`;
}

- export { saveFileToBlob, deleteBlob, deleteGCS, uploadBlob, cleanup, cleanupGCS, gcsUrlExists, ensureGCSUpload, gcs, uploadChunkToGCS };
976
+ export {
977
+ saveFileToBlob,
978
+ deleteBlob,
979
+ deleteGCS,
980
+ uploadBlob,
981
+ cleanup,
982
+ cleanupGCS,
983
+ gcsUrlExists,
984
+ ensureGCSUpload,
985
+ gcs,
986
+ uploadChunkToGCS,
987
+ downloadFromGCS,
988
+ };