@aj-archipelago/cortex 1.3.57 → 1.3.59

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/README.md +6 -0
  2. package/config.js +22 -0
  3. package/helper-apps/cortex-file-handler/INTERFACE.md +20 -9
  4. package/helper-apps/cortex-file-handler/package-lock.json +2 -2
  5. package/helper-apps/cortex-file-handler/package.json +1 -1
  6. package/helper-apps/cortex-file-handler/scripts/setup-azure-container.js +17 -17
  7. package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +35 -35
  8. package/helper-apps/cortex-file-handler/src/blobHandler.js +1010 -909
  9. package/helper-apps/cortex-file-handler/src/constants.js +98 -98
  10. package/helper-apps/cortex-file-handler/src/docHelper.js +27 -27
  11. package/helper-apps/cortex-file-handler/src/fileChunker.js +224 -214
  12. package/helper-apps/cortex-file-handler/src/helper.js +93 -93
  13. package/helper-apps/cortex-file-handler/src/index.js +584 -550
  14. package/helper-apps/cortex-file-handler/src/localFileHandler.js +86 -86
  15. package/helper-apps/cortex-file-handler/src/redis.js +186 -90
  16. package/helper-apps/cortex-file-handler/src/services/ConversionService.js +301 -273
  17. package/helper-apps/cortex-file-handler/src/services/FileConversionService.js +55 -55
  18. package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +174 -154
  19. package/helper-apps/cortex-file-handler/src/services/storage/GCSStorageProvider.js +239 -223
  20. package/helper-apps/cortex-file-handler/src/services/storage/LocalStorageProvider.js +161 -159
  21. package/helper-apps/cortex-file-handler/src/services/storage/StorageFactory.js +73 -71
  22. package/helper-apps/cortex-file-handler/src/services/storage/StorageProvider.js +46 -45
  23. package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +256 -213
  24. package/helper-apps/cortex-file-handler/src/start.js +4 -1
  25. package/helper-apps/cortex-file-handler/src/utils/filenameUtils.js +59 -25
  26. package/helper-apps/cortex-file-handler/tests/FileConversionService.test.js +119 -116
  27. package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +257 -257
  28. package/helper-apps/cortex-file-handler/tests/cleanup.test.js +676 -0
  29. package/helper-apps/cortex-file-handler/tests/conversionResilience.test.js +124 -124
  30. package/helper-apps/cortex-file-handler/tests/fileChunker.test.js +249 -208
  31. package/helper-apps/cortex-file-handler/tests/fileUpload.test.js +439 -380
  32. package/helper-apps/cortex-file-handler/tests/getOperations.test.js +299 -263
  33. package/helper-apps/cortex-file-handler/tests/postOperations.test.js +265 -239
  34. package/helper-apps/cortex-file-handler/tests/start.test.js +1230 -1201
  35. package/helper-apps/cortex-file-handler/tests/storage/AzureStorageProvider.test.js +110 -105
  36. package/helper-apps/cortex-file-handler/tests/storage/GCSStorageProvider.test.js +201 -175
  37. package/helper-apps/cortex-file-handler/tests/storage/LocalStorageProvider.test.js +128 -125
  38. package/helper-apps/cortex-file-handler/tests/storage/StorageFactory.test.js +78 -73
  39. package/helper-apps/cortex-file-handler/tests/storage/StorageService.test.js +99 -99
  40. package/helper-apps/cortex-file-handler/tests/testUtils.helper.js +74 -70
  41. package/package.json +1 -1
  42. package/pathways/translate_apptek.js +33 -0
  43. package/pathways/translate_subtitle.js +15 -8
  44. package/server/plugins/apptekTranslatePlugin.js +46 -91
  45. package/tests/apptekTranslatePlugin.test.js +0 -2
  46. package/tests/integration/apptekTranslatePlugin.integration.test.js +159 -93
  47. package/tests/translate_apptek.test.js +16 -0
@@ -1,133 +1,134 @@
1
- import fs from 'fs';
2
- import path from 'path';
3
- import { join } from 'path';
4
- import { PassThrough } from 'stream';
5
- import { pipeline as _pipeline } from 'stream';
6
- import { promisify } from 'util';
7
-
1
+ import fs from "fs";
2
+ import os from "os";
3
+ import path from "path";
4
+ import { promisify } from "util";
5
+ import { pipeline as _pipeline } from "stream";
6
+ import { v4 as uuidv4 } from "uuid";
7
+ import Busboy from "busboy";
8
+ import { PassThrough } from "stream";
9
+ import mime from "mime-types";
10
+ import { Storage } from "@google-cloud/storage";
8
11
  import {
9
- generateBlobSASQueryParameters,
10
- StorageSharedKeyCredential,
11
- BlobServiceClient,
12
- } from '@azure/storage-blob';
13
- import { Storage } from '@google-cloud/storage';
14
- import axios from 'axios';
15
- import Busboy from 'busboy';
16
- import { v4 as uuidv4 } from 'uuid';
17
- const pipeline = promisify(_pipeline);
18
-
19
- import { publicFolder, port, ipAddress } from './start.js';
20
- import { CONVERTED_EXTENSIONS } from './constants.js';
12
+ generateBlobSASQueryParameters,
13
+ StorageSharedKeyCredential,
14
+ BlobServiceClient,
15
+ } from "@azure/storage-blob";
16
+ import axios from "axios";
21
17
 
22
- // eslint-disable-next-line import/no-extraneous-dependencies
23
- import mime from 'mime-types';
24
-
25
- import os from 'os';
26
- import { sanitizeFilename } from './utils/filenameUtils.js';
18
+ import {
19
+ sanitizeFilename,
20
+ generateShortId,
21
+ generateBlobName,
22
+ } from "./utils/filenameUtils.js";
23
+ import { publicFolder, port, ipAddress } from "./start.js";
24
+ import { CONVERTED_EXTENSIONS } from "./constants.js";
25
+ import { FileConversionService } from "./services/FileConversionService.js";
27
26
 
28
- import { FileConversionService } from './services/FileConversionService.js';
27
+ const pipeline = promisify(_pipeline);
29
28
 
30
29
  function isBase64(str) {
31
- try {
32
- return btoa(atob(str)) == str;
33
- } catch (err) {
34
- return false;
35
- }
30
+ try {
31
+ return btoa(atob(str)) == str;
32
+ } catch (err) {
33
+ return false;
34
+ }
36
35
  }
37
36
 
38
37
  const { SAS_TOKEN_LIFE_DAYS = 30 } = process.env;
39
38
  const GCP_SERVICE_ACCOUNT_KEY =
40
39
  process.env.GCP_SERVICE_ACCOUNT_KEY_BASE64 ||
41
40
  process.env.GCP_SERVICE_ACCOUNT_KEY ||
42
- '{}';
41
+ "{}";
43
42
  const GCP_SERVICE_ACCOUNT = isBase64(GCP_SERVICE_ACCOUNT_KEY)
44
- ? JSON.parse(Buffer.from(GCP_SERVICE_ACCOUNT_KEY, 'base64').toString())
45
- : JSON.parse(GCP_SERVICE_ACCOUNT_KEY);
43
+ ? JSON.parse(Buffer.from(GCP_SERVICE_ACCOUNT_KEY, "base64").toString())
44
+ : JSON.parse(GCP_SERVICE_ACCOUNT_KEY);
46
45
  const { project_id: GCP_PROJECT_ID } = GCP_SERVICE_ACCOUNT;
47
46
 
48
47
  let gcs;
49
48
  if (!GCP_PROJECT_ID || !GCP_SERVICE_ACCOUNT) {
50
- console.warn(
51
- 'No Google Cloud Storage credentials provided - GCS will not be used',
52
- );
49
+ console.warn(
50
+ "No Google Cloud Storage credentials provided - GCS will not be used",
51
+ );
53
52
  } else {
54
- try {
55
- gcs = new Storage({
56
- projectId: GCP_PROJECT_ID,
57
- credentials: GCP_SERVICE_ACCOUNT,
58
- });
53
+ try {
54
+ gcs = new Storage({
55
+ projectId: GCP_PROJECT_ID,
56
+ credentials: GCP_SERVICE_ACCOUNT,
57
+ });
59
58
 
60
59
  // Rest of your Google Cloud operations using gcs object
61
- } catch (error) {
62
- console.error(
63
- 'Google Cloud Storage credentials are invalid - GCS will not be used: ',
64
- error,
65
- );
66
- }
60
+ } catch (error) {
61
+ console.error(
62
+ "Google Cloud Storage credentials are invalid - GCS will not be used: ",
63
+ error,
64
+ );
65
+ }
67
66
  }
68
67
 
69
68
  export const AZURE_STORAGE_CONTAINER_NAME =
70
- process.env.AZURE_STORAGE_CONTAINER_NAME || 'whispertempfiles';
71
- export const GCS_BUCKETNAME = process.env.GCS_BUCKETNAME || 'cortextempfiles';
69
+ process.env.AZURE_STORAGE_CONTAINER_NAME || "whispertempfiles";
70
+ export const GCS_BUCKETNAME = process.env.GCS_BUCKETNAME || "cortextempfiles";
72
71
 
73
72
  function isEncoded(str) {
74
- // Checks for any percent-encoded sequence
75
- return /%[0-9A-Fa-f]{2}/.test(str);
73
+ // Checks for any percent-encoded sequence
74
+ return /%[0-9A-Fa-f]{2}/.test(str);
76
75
  }
77
76
 
78
77
  // Helper function to ensure GCS URLs are never encoded
79
78
  function ensureUnencodedGcsUrl(url) {
80
- if (!url || !url.startsWith('gs://')) {
81
- return url;
82
- }
83
- // Split into bucket and path parts
84
- const [bucket, ...pathParts] = url.replace('gs://', '').split('/');
85
- // Reconstruct URL with decoded path parts, handling invalid characters
86
- return `gs://${bucket}/${pathParts.map(part => {
87
- try {
88
- return decodeURIComponent(part);
89
- } catch (error) {
90
- // If decoding fails, sanitize the filename by removing invalid characters
91
- return part.replace(/[^\w\-\.]/g, '_');
92
- }
93
- }).join('/')}`;
79
+ if (!url || !url.startsWith("gs://")) {
80
+ return url;
81
+ }
82
+ // Split into bucket and path parts
83
+ const [bucket, ...pathParts] = url.replace("gs://", "").split("/");
84
+ // Reconstruct URL with decoded path parts, handling invalid characters
85
+ return `gs://${bucket}/${pathParts
86
+ .map((part) => {
87
+ try {
88
+ return decodeURIComponent(part);
89
+ } catch (error) {
90
+ // If decoding fails, sanitize the filename by removing invalid characters
91
+ return part.replace(/[^\w\-\.]/g, "_");
92
+ }
93
+ })
94
+ .join("/")}`;
94
95
  }
95
96
 
96
97
  async function gcsUrlExists(url, defaultReturn = false) {
97
- try {
98
- if (!url || !gcs) {
99
- return defaultReturn; // Cannot check return
100
- }
98
+ try {
99
+ if (!url || !gcs) {
100
+ return defaultReturn; // Cannot check return
101
+ }
101
102
 
102
- // Ensure URL is not encoded
103
- const unencodedUrl = ensureUnencodedGcsUrl(url);
104
- const urlParts = unencodedUrl.replace('gs://', '').split('/');
105
- const bucketName = urlParts[0];
106
- const fileName = urlParts.slice(1).join('/');
103
+ // Ensure URL is not encoded
104
+ const unencodedUrl = ensureUnencodedGcsUrl(url);
105
+ const urlParts = unencodedUrl.replace("gs://", "").split("/");
106
+ const bucketName = urlParts[0];
107
+ const fileName = urlParts.slice(1).join("/");
107
108
 
108
- if (process.env.STORAGE_EMULATOR_HOST) {
109
- try {
110
- const response = await axios.get(
111
- `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${bucketName}/o/${encodeURIComponent(fileName)}`,
112
- { validateStatus: (status) => status === 200 || status === 404 },
113
- );
114
- return response.status === 200;
115
- } catch (error) {
116
- console.error('Error checking emulator file:', error);
117
- return false;
118
- }
119
- }
109
+ if (process.env.STORAGE_EMULATOR_HOST) {
110
+ try {
111
+ const response = await axios.get(
112
+ `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${bucketName}/o/${encodeURIComponent(fileName)}`,
113
+ { validateStatus: (status) => status === 200 || status === 404 },
114
+ );
115
+ return response.status === 200;
116
+ } catch (error) {
117
+ console.error("Error checking emulator file:", error);
118
+ return false;
119
+ }
120
+ }
120
121
 
121
- const bucket = gcs.bucket(bucketName);
122
- const file = bucket.file(fileName);
122
+ const bucket = gcs.bucket(bucketName);
123
+ const file = bucket.file(fileName);
123
124
 
124
- const [exists] = await file.exists();
125
+ const [exists] = await file.exists();
125
126
 
126
- return exists;
127
- } catch (error) {
128
- console.error('Error checking if GCS URL exists:', error);
129
- return false;
130
- }
127
+ return exists;
128
+ } catch (error) {
129
+ console.error("Error checking if GCS URL exists:", error);
130
+ return false;
131
+ }
131
132
  }
132
133
 
133
134
  /**
@@ -137,920 +138,1020 @@ async function gcsUrlExists(url, defaultReturn = false) {
137
138
  * @returns {Promise<void>}
138
139
  */
139
140
  async function downloadFromGCS(gcsUrl, destinationPath) {
140
- if (!gcsUrl || !gcs) {
141
- throw new Error('Invalid GCS URL or GCS client not initialized');
142
- }
143
-
144
- const urlParts = gcsUrl.replace('gs://', '').split('/');
145
- const bucketName = urlParts[0];
146
- const fileName = urlParts.slice(1).join('/');
141
+ if (!gcsUrl || !gcs) {
142
+ throw new Error("Invalid GCS URL or GCS client not initialized");
143
+ }
144
+
145
+ const urlParts = gcsUrl.replace("gs://", "").split("/");
146
+ const bucketName = urlParts[0];
147
+ const fileName = urlParts.slice(1).join("/");
148
+
149
+ if (process.env.STORAGE_EMULATOR_HOST) {
150
+ // Use axios to download from emulator
151
+ const response = await axios({
152
+ method: "GET",
153
+ url: `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${bucketName}/o/${encodeURIComponent(fileName)}?alt=media`,
154
+ responseType: "stream",
155
+ });
147
156
 
148
- if (process.env.STORAGE_EMULATOR_HOST) {
149
- // Use axios to download from emulator
150
- const response = await axios({
151
- method: 'GET',
152
- url: `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${bucketName}/o/${encodeURIComponent(fileName)}?alt=media`,
153
- responseType: 'stream'
154
- });
155
-
156
- // Write the response to file
157
- const writer = fs.createWriteStream(destinationPath);
158
- await new Promise((resolve, reject) => {
159
- response.data.pipe(writer);
160
- writer.on('finish', resolve);
161
- writer.on('error', reject);
162
- });
163
- } else {
164
- // Use GCS client for real GCS
165
- const bucket = gcs.bucket(bucketName);
166
- const file = bucket.file(fileName);
167
- await file.download({ destination: destinationPath });
168
- }
157
+ // Write the response to file
158
+ const writer = fs.createWriteStream(destinationPath);
159
+ await new Promise((resolve, reject) => {
160
+ response.data.pipe(writer);
161
+ writer.on("finish", resolve);
162
+ writer.on("error", reject);
163
+ });
164
+ } else {
165
+ // Use GCS client for real GCS
166
+ const bucket = gcs.bucket(bucketName);
167
+ const file = bucket.file(fileName);
168
+ await file.download({ destination: destinationPath });
169
+ }
169
170
  }
170
171
 
171
172
  export const getBlobClient = async () => {
172
- const connectionString = process.env.AZURE_STORAGE_CONNECTION_STRING;
173
- const containerName = AZURE_STORAGE_CONTAINER_NAME;
174
- if (!connectionString || !containerName) {
175
- throw new Error(
176
- 'Missing Azure Storage connection string or container name environment variable',
177
- );
178
- }
173
+ const connectionString = process.env.AZURE_STORAGE_CONNECTION_STRING;
174
+ const containerName = AZURE_STORAGE_CONTAINER_NAME;
175
+ if (!connectionString || !containerName) {
176
+ throw new Error(
177
+ "Missing Azure Storage connection string or container name environment variable",
178
+ );
179
+ }
179
180
 
180
- const blobServiceClient =
181
+ const blobServiceClient =
181
182
  BlobServiceClient.fromConnectionString(connectionString);
182
183
 
183
- const serviceProperties = await blobServiceClient.getProperties();
184
- if (!serviceProperties.defaultServiceVersion) {
185
- serviceProperties.defaultServiceVersion = '2020-02-10';
186
- await blobServiceClient.setProperties(serviceProperties);
187
- }
184
+ const serviceProperties = await blobServiceClient.getProperties();
185
+ if (!serviceProperties.defaultServiceVersion) {
186
+ serviceProperties.defaultServiceVersion = "2020-02-10";
187
+ await blobServiceClient.setProperties(serviceProperties);
188
+ }
188
189
 
189
- const containerClient = blobServiceClient.getContainerClient(containerName);
190
+ const containerClient = blobServiceClient.getContainerClient(containerName);
190
191
 
191
- return { blobServiceClient, containerClient };
192
+ return { blobServiceClient, containerClient };
192
193
  };
193
194
 
194
- async function saveFileToBlob(chunkPath, requestId) {
195
- const { containerClient } = await getBlobClient();
196
- // Use the filename with a UUID as the blob name
197
- let baseName = path.basename(chunkPath);
198
- // Remove any query parameters from the filename
199
- baseName = baseName.split('?')[0];
200
- // Only encode if not already encoded
201
- if (!isEncoded(baseName)) {
202
- baseName = encodeURIComponent(baseName);
203
- }
204
- const blobName = `${requestId}/${uuidv4()}_${baseName}`;
205
-
206
- // Create a read stream for the chunk file
207
- const fileStream = fs.createReadStream(chunkPath);
208
-
209
- // Upload the chunk to Azure Blob Storage using the stream
210
- const blockBlobClient = containerClient.getBlockBlobClient(blobName);
211
- await blockBlobClient.uploadStream(fileStream);
212
-
213
- // Generate SAS token after successful upload
214
- const sasToken = generateSASToken(containerClient, blobName);
215
-
216
- // Return an object with the URL property
217
- return {
218
- url: `${blockBlobClient.url}?${sasToken}`,
219
- blobName: blobName
220
- };
195
+ async function saveFileToBlob(chunkPath, requestId, filename = null) {
196
+ const { containerClient } = await getBlobClient();
197
+ // Use provided filename or generate LLM-friendly naming
198
+ let blobName;
199
+ if (filename) {
200
+ blobName = generateBlobName(requestId, filename);
201
+ } else {
202
+ const fileExtension = path.extname(chunkPath);
203
+ const shortId = generateShortId();
204
+ blobName = generateBlobName(requestId, `${shortId}${fileExtension}`);
205
+ }
206
+
207
+ // Create a read stream for the chunk file
208
+ const fileStream = fs.createReadStream(chunkPath);
209
+
210
+ // Upload the chunk to Azure Blob Storage using the stream
211
+ const blockBlobClient = containerClient.getBlockBlobClient(blobName);
212
+ await blockBlobClient.uploadStream(fileStream);
213
+
214
+ // Generate SAS token after successful upload
215
+ const sasToken = generateSASToken(containerClient, blobName);
216
+
217
+ // Return an object with the URL property
218
+ return {
219
+ url: `${blockBlobClient.url}?${sasToken}`,
220
+ blobName: blobName,
221
+ };
221
222
  }
222
223
 
223
224
  const generateSASToken = (
224
- containerClient,
225
- blobName,
226
- expiryTimeSeconds = parseInt(SAS_TOKEN_LIFE_DAYS) * 24 * 60 * 60,
225
+ containerClient,
226
+ blobName,
227
+ expiryTimeSeconds = parseInt(SAS_TOKEN_LIFE_DAYS) * 24 * 60 * 60,
227
228
  ) => {
228
- const { accountName, accountKey } = containerClient.credential;
229
- const sharedKeyCredential = new StorageSharedKeyCredential(
230
- accountName,
231
- accountKey,
232
- );
233
-
234
- const sasOptions = {
235
- containerName: containerClient.containerName,
236
- blobName: blobName,
237
- permissions: 'r', // Read permission
238
- startsOn: new Date(),
239
- expiresOn: new Date(new Date().valueOf() + expiryTimeSeconds * 1000),
240
- };
241
-
242
- const sasToken = generateBlobSASQueryParameters(
243
- sasOptions,
244
- sharedKeyCredential,
245
- ).toString();
246
- return sasToken;
229
+ const { accountName, accountKey } = containerClient.credential;
230
+ const sharedKeyCredential = new StorageSharedKeyCredential(
231
+ accountName,
232
+ accountKey,
233
+ );
234
+
235
+ const sasOptions = {
236
+ containerName: containerClient.containerName,
237
+ blobName: blobName,
238
+ permissions: "r", // Read permission
239
+ startsOn: new Date(),
240
+ expiresOn: new Date(new Date().valueOf() + expiryTimeSeconds * 1000),
241
+ };
242
+
243
+ const sasToken = generateBlobSASQueryParameters(
244
+ sasOptions,
245
+ sharedKeyCredential,
246
+ ).toString();
247
+ return sasToken;
247
248
  };
248
249
 
249
250
  //deletes blob that has the requestId
250
251
  async function deleteBlob(requestId) {
251
- if (!requestId) throw new Error('Missing requestId parameter');
252
- const { containerClient } = await getBlobClient();
253
- // List all blobs in the container
254
- const blobs = containerClient.listBlobsFlat();
255
-
256
- const result = [];
257
- // Iterate through the blobs
258
- for await (const blob of blobs) {
252
+ if (!requestId) throw new Error("Missing requestId parameter");
253
+ const { containerClient } = await getBlobClient();
254
+ // List all blobs in the container
255
+ const blobs = containerClient.listBlobsFlat();
256
+
257
+ const result = [];
258
+ // Iterate through the blobs
259
+ for await (const blob of blobs) {
259
260
  // Check if the blob name starts with requestId_ (flat structure)
260
261
  // or is inside a folder named requestId/ (folder structure)
261
- if (
262
- blob.name.startsWith(`${requestId}_`) ||
262
+ if (
263
+ blob.name.startsWith(`${requestId}_`) ||
263
264
  blob.name.startsWith(`${requestId}/`)
264
- ) {
265
- // Delete the matching blob
266
- const blockBlobClient = containerClient.getBlockBlobClient(blob.name);
267
- await blockBlobClient.delete();
268
- console.log(`Cleaned blob: ${blob.name}`);
269
- result.push(blob.name);
270
- }
265
+ ) {
266
+ // Delete the matching blob
267
+ const blockBlobClient = containerClient.getBlockBlobClient(blob.name);
268
+ await blockBlobClient.delete();
269
+ console.log(`Cleaned blob: ${blob.name}`);
270
+ result.push(blob.name);
271
271
  }
272
+ }
272
273
 
273
- return result;
274
+ return result;
274
275
  }
275
276
 
276
277
  function uploadBlob(
277
- context,
278
- req,
279
- saveToLocal = false,
280
- filePath = null,
281
- hash = null,
278
+ context,
279
+ req,
280
+ saveToLocal = false,
281
+ filePath = null,
282
+ hash = null,
282
283
  ) {
283
- return new Promise((resolve, reject) => {
284
- (async () => {
284
+ return new Promise((resolve, reject) => {
285
+ (async () => {
286
+ try {
287
+ let requestId = uuidv4();
288
+ const body = {};
289
+
290
+ // If filePath is given, we are dealing with local file and not form-data
291
+ if (filePath) {
292
+ const file = fs.createReadStream(filePath);
293
+ const filename = path.basename(filePath);
294
+
295
+ // Generate LLM-friendly ID for requestId to match the filename pattern
296
+ const fileExtension = path.extname(filename);
297
+ const shortId = generateShortId();
298
+ const uploadName = `${shortId}${fileExtension}`;
299
+ requestId = shortId; // Use the short ID as requestId
300
+
301
+ try {
302
+ const result = await uploadFile(
303
+ context,
304
+ requestId,
305
+ body,
306
+ saveToLocal,
307
+ file,
308
+ uploadName, // Use the LLM-friendly filename
309
+ resolve,
310
+ hash,
311
+ );
312
+ resolve(result);
313
+ } catch (error) {
314
+ const err = new Error("Error processing file upload.");
315
+ err.status = 500;
316
+ throw err;
317
+ }
318
+ } else {
319
+ // Otherwise, continue working with form-data
320
+ const busboy = Busboy({ headers: req.headers });
321
+ let hasFile = false;
322
+ let errorOccurred = false;
323
+
324
+ busboy.on("field", (fieldname, value) => {
325
+ if (fieldname === "requestId") {
326
+ requestId = value;
327
+ } else if (fieldname === "hash") {
328
+ hash = value;
329
+ }
330
+ });
331
+
332
+ busboy.on("file", async (fieldname, file, info) => {
333
+ if (errorOccurred) return;
334
+ hasFile = true;
335
+
336
+ // Validate file
337
+ if (!info.filename || info.filename.trim() === "") {
338
+ errorOccurred = true;
339
+ const err = new Error("Invalid file: missing filename");
340
+ err.status = 400;
341
+ reject(err);
342
+ return;
343
+ }
344
+
345
+ // Prepare for streaming to cloud destinations
346
+ const filename = info.filename;
347
+ const fileExtension = path.extname(filename);
348
+ const shortId = generateShortId();
349
+ const uploadName = `${shortId}${fileExtension}`;
350
+ const azureStream = !saveToLocal ? new PassThrough() : null;
351
+ const gcsStream = gcs ? new PassThrough() : null;
352
+ let diskWriteStream, tempDir, tempFilePath;
353
+ let diskWritePromise;
354
+ let diskWriteError = null;
355
+ let cloudUploadError = null;
356
+
357
+ // Start local disk write in parallel (non-blocking for response)
358
+ if (saveToLocal) {
359
+ try {
360
+ tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "upload-"));
361
+ } catch (err) {
362
+ console.error("Error creating tempDir:", err);
363
+ errorOccurred = true;
364
+ reject(err);
365
+ return;
366
+ }
367
+ tempFilePath = path.join(tempDir, uploadName);
368
+ try {
369
+ diskWriteStream = fs.createWriteStream(tempFilePath, {
370
+ highWaterMark: 1024 * 1024,
371
+ autoClose: true,
372
+ });
373
+ } catch (err) {
374
+ console.error(
375
+ "Error creating write stream:",
376
+ err,
377
+ "Temp dir exists:",
378
+ fs.existsSync(tempDir),
379
+ );
380
+ errorOccurred = true;
381
+ reject(err);
382
+ return;
383
+ }
384
+ diskWriteStream.on("error", (err) => {
385
+ console.error("Disk write stream error:", err);
386
+ });
387
+ diskWriteStream.on("close", () => {
388
+ console.log("Disk write stream closed for:", tempFilePath);
389
+ });
390
+ diskWritePromise = new Promise((res, rej) => {
391
+ diskWriteStream.on("finish", res);
392
+ diskWriteStream.on("error", (err) => {
393
+ diskWriteError = err;
394
+ rej(err);
395
+ });
396
+ });
397
+ }
398
+
399
+ // Pipe incoming file to all destinations
400
+ let receivedAnyData = false;
401
+ file.on("data", () => {
402
+ receivedAnyData = true;
403
+ });
404
+ if (azureStream) file.pipe(azureStream);
405
+ if (gcsStream) file.pipe(gcsStream);
406
+ if (diskWriteStream) file.pipe(diskWriteStream);
407
+
408
+ // Listen for end event to check for empty file
409
+ file.on("end", async () => {
410
+ if (!receivedAnyData) {
411
+ errorOccurred = true;
412
+ // Abort all streams
413
+ if (azureStream) azureStream.destroy();
414
+ if (gcsStream) gcsStream.destroy();
415
+ if (diskWriteStream) diskWriteStream.destroy();
416
+ const err = new Error("Invalid file: file is empty");
417
+ err.status = 400;
418
+ reject(err);
419
+ }
420
+ });
421
+
422
+ // Start cloud uploads immediately
423
+ let azurePromise;
424
+ if (!saveToLocal) {
425
+ azurePromise = saveToAzureStorage(
426
+ context,
427
+ uploadName,
428
+ azureStream,
429
+ ).catch(async (err) => {
430
+ cloudUploadError = err;
431
+ // Fallback: try from disk if available
432
+ if (diskWritePromise) {
433
+ await diskWritePromise;
434
+ const diskStream = fs.createReadStream(tempFilePath, {
435
+ highWaterMark: 1024 * 1024,
436
+ autoClose: true,
437
+ });
438
+ return saveToAzureStorage(context, uploadName, diskStream);
439
+ }
440
+ throw err;
441
+ });
442
+ }
443
+ let gcsPromise;
444
+ if (gcsStream) {
445
+ gcsPromise = saveToGoogleStorage(
446
+ context,
447
+ uploadName,
448
+ gcsStream,
449
+ ).catch(async (err) => {
450
+ cloudUploadError = err;
451
+ if (diskWritePromise) {
452
+ await diskWritePromise;
453
+ const diskStream = fs.createReadStream(tempFilePath, {
454
+ highWaterMark: 1024 * 1024,
455
+ autoClose: true,
456
+ });
457
+ return saveToGoogleStorage(context, uploadName, diskStream);
458
+ }
459
+ throw err;
460
+ });
461
+ }
462
+
463
+ // Wait for cloud uploads to finish
285
464
  try {
286
- let requestId = uuidv4();
287
- const body = {};
288
-
289
- // If filePath is given, we are dealing with local file and not form-data
290
- if (filePath) {
291
- const file = fs.createReadStream(filePath);
292
- const filename = path.basename(filePath);
293
- try {
294
- const result = await uploadFile(
295
- context,
296
- requestId,
297
- body,
298
- saveToLocal,
299
- file,
300
- filename,
301
- resolve,
302
- hash,
303
- );
304
- resolve(result);
305
- } catch (error) {
306
- const err = new Error('Error processing file upload.');
307
- err.status = 500;
308
- throw err;
465
+ const results = await Promise.all(
466
+ [
467
+ azurePromise
468
+ ? azurePromise.then((url) => ({ url, type: "primary" }))
469
+ : null,
470
+ !azurePromise && saveToLocal
471
+ ? Promise.resolve({ url: null, type: "primary-local" }) // placeholder for local, url handled later
472
+ : null,
473
+ gcsPromise
474
+ ? gcsPromise.then((gcs) => ({ gcs, type: "gcs" }))
475
+ : null,
476
+ ].filter(Boolean),
477
+ );
478
+
479
+ const result = {
480
+ message: `File '${uploadName}' uploaded successfully.`,
481
+ filename: uploadName,
482
+ ...results.reduce((acc, result) => {
483
+ if (result.type === "primary") acc.url = result.url;
484
+ if (result.type === "gcs")
485
+ acc.gcs = ensureUnencodedGcsUrl(result.gcs);
486
+ return acc;
487
+ }, {}),
488
+ };
489
+ if (hash) result.hash = hash;
490
+
491
+ // If saving locally, wait for disk write to finish and then move to public folder
492
+ if (saveToLocal) {
493
+ try {
494
+ if (diskWritePromise) {
495
+ await diskWritePromise; // ensure file fully written
496
+ }
497
+ const localUrl = await saveToLocalStorage(
498
+ context,
499
+ requestId,
500
+ uploadName,
501
+ fs.createReadStream(tempFilePath, {
502
+ highWaterMark: 1024 * 1024,
503
+ autoClose: true,
504
+ }),
505
+ );
506
+ result.url = localUrl;
507
+ } catch (err) {
508
+ console.error("Error saving to local storage:", err);
509
+ throw err;
510
+ }
511
+ }
512
+
513
+ // After original uploads, handle optional conversion
514
+ const conversionService = new FileConversionService(
515
+ context,
516
+ !saveToLocal,
517
+ );
518
+
519
+ if (conversionService.needsConversion(fileExtension)) {
520
+ try {
521
+ context.log("Starting file conversion (busboy)...");
522
+
523
+ // Ensure we have a local copy of the file for conversion
524
+ let localPathForConversion = tempFilePath;
525
+
526
+ if (!localPathForConversion) {
527
+ // No temp file was written (saveToLocal === false). Download from primary URL.
528
+ const tmpDir = fs.mkdtempSync(
529
+ path.join(os.tmpdir(), "convert-"),
530
+ );
531
+ localPathForConversion = path.join(tmpDir, uploadName);
532
+ await conversionService._downloadFile(
533
+ result.url,
534
+ localPathForConversion,
535
+ );
536
+ } else {
537
+ // Wait until disk write completes to guarantee full file is present
538
+ if (diskWritePromise) {
539
+ await diskWritePromise;
309
540
  }
310
- } else {
311
- // Otherwise, continue working with form-data
312
- const busboy = Busboy({ headers: req.headers });
313
- let hasFile = false;
314
- let errorOccurred = false;
315
-
316
- busboy.on('field', (fieldname, value) => {
317
- if (fieldname === 'requestId') {
318
- requestId = value;
319
- } else if (fieldname === 'hash') {
320
- hash = value;
321
- }
322
- });
323
-
324
- busboy.on('file', async (fieldname, file, info) => {
325
- if (errorOccurred) return;
326
- hasFile = true;
327
-
328
- // Validate file
329
- if (!info.filename || info.filename.trim() === '') {
330
- errorOccurred = true;
331
- const err = new Error('Invalid file: missing filename');
332
- err.status = 400;
333
- reject(err);
334
- return;
335
- }
336
-
337
- // Prepare for streaming to cloud destinations
338
- const filename = info.filename;
339
- const safeFilename = path.basename(filename); // Sanitize filename
340
- const uploadName = `${requestId || uuidv4()}_${safeFilename}`;
341
- const azureStream = !saveToLocal ? new PassThrough() : null;
342
- const gcsStream = gcs ? new PassThrough() : null;
343
- let diskWriteStream, tempDir, tempFilePath;
344
- let diskWritePromise;
345
- let diskWriteError = null;
346
- let cloudUploadError = null;
347
-
348
- // Start local disk write in parallel (non-blocking for response)
349
- if (saveToLocal) {
350
- try {
351
- tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'upload-'));
352
- } catch (err) {
353
- console.error('Error creating tempDir:', err);
354
- errorOccurred = true;
355
- reject(err);
356
- return;
357
- }
358
- tempFilePath = path.join(tempDir, safeFilename);
359
- try {
360
- diskWriteStream = fs.createWriteStream(tempFilePath, {
361
- highWaterMark: 1024 * 1024,
362
- autoClose: true,
363
- });
364
- } catch (err) {
365
- console.error('Error creating write stream:', err, 'Temp dir exists:', fs.existsSync(tempDir));
366
- errorOccurred = true;
367
- reject(err);
368
- return;
369
- }
370
- diskWriteStream.on('error', (err) => {
371
- console.error('Disk write stream error:', err);
372
- });
373
- diskWriteStream.on('close', () => {
374
- console.log('Disk write stream closed for:', tempFilePath);
375
- });
376
- diskWritePromise = new Promise((res, rej) => {
377
- diskWriteStream.on('finish', res);
378
- diskWriteStream.on('error', (err) => {
379
- diskWriteError = err;
380
- rej(err);
381
- });
382
- });
383
- }
384
-
385
- // Pipe incoming file to all destinations
386
- let receivedAnyData = false;
387
- file.on('data', () => { receivedAnyData = true; });
388
- if (azureStream) file.pipe(azureStream);
389
- if (gcsStream) file.pipe(gcsStream);
390
- if (diskWriteStream) file.pipe(diskWriteStream);
391
-
392
- // Listen for end event to check for empty file
393
- file.on('end', async () => {
394
- if (!receivedAnyData) {
395
- errorOccurred = true;
396
- // Abort all streams
397
- if (azureStream) azureStream.destroy();
398
- if (gcsStream) gcsStream.destroy();
399
- if (diskWriteStream) diskWriteStream.destroy();
400
- const err = new Error('Invalid file: file is empty');
401
- err.status = 400;
402
- reject(err);
403
- }
404
- });
405
-
406
- // Start cloud uploads immediately
407
- let azurePromise;
408
- if (!saveToLocal) {
409
- azurePromise = saveToAzureStorage(context, uploadName, azureStream)
410
- .catch(async (err) => {
411
- cloudUploadError = err;
412
- // Fallback: try from disk if available
413
- if (diskWritePromise) {
414
- await diskWritePromise;
415
- const diskStream = fs.createReadStream(tempFilePath, {
416
- highWaterMark: 1024 * 1024,
417
- autoClose: true,
418
- });
419
- return saveToAzureStorage(context, uploadName, diskStream);
420
- }
421
- throw err;
422
- });
423
- }
424
- let gcsPromise;
425
- if (gcsStream) {
426
- gcsPromise = saveToGoogleStorage(context, uploadName, gcsStream)
427
- .catch(async (err) => {
428
- cloudUploadError = err;
429
- if (diskWritePromise) {
430
- await diskWritePromise;
431
- const diskStream = fs.createReadStream(tempFilePath, {
432
- highWaterMark: 1024 * 1024,
433
- autoClose: true,
434
- });
435
- return saveToGoogleStorage(context, uploadName, diskStream);
436
- }
437
- throw err;
438
- });
439
- }
440
-
441
- // Wait for cloud uploads to finish
442
- try {
443
- const results = await Promise.all([
444
- azurePromise ? azurePromise.then((url) => ({ url, type: 'primary' })) : null,
445
- (!azurePromise && saveToLocal)
446
- ? Promise.resolve({ url: null, type: 'primary-local' }) // placeholder for local, url handled later
447
- : null,
448
- gcsPromise ? gcsPromise.then((gcs) => ({ gcs, type: 'gcs' })) : null,
449
- ].filter(Boolean));
450
-
451
- const result = {
452
- message: `File '${uploadName}' uploaded successfully.`,
453
- filename: uploadName,
454
- ...results.reduce((acc, result) => {
455
- if (result.type === 'primary') acc.url = result.url;
456
- if (result.type === 'gcs') acc.gcs = ensureUnencodedGcsUrl(result.gcs);
457
- return acc;
458
- }, {}),
459
- };
460
- if (hash) result.hash = hash;
461
-
462
- // If saving locally, wait for disk write to finish and then move to public folder
463
- if (saveToLocal) {
464
- try {
465
- if (diskWritePromise) {
466
- await diskWritePromise; // ensure file fully written
467
- }
468
- const localUrl = await saveToLocalStorage(
469
- context,
470
- requestId,
471
- uploadName,
472
- fs.createReadStream(tempFilePath, {
473
- highWaterMark: 1024 * 1024,
474
- autoClose: true,
475
- }),
476
- );
477
- result.url = localUrl;
478
- } catch (err) {
479
- console.error('Error saving to local storage:', err);
480
- throw err;
481
- }
482
- }
483
-
484
- // After original uploads, handle optional conversion
485
- const conversionService = new FileConversionService(context, !saveToLocal);
486
-
487
- if (conversionService.needsConversion(safeFilename)) {
488
- try {
489
- context.log('Starting file conversion (busboy)...');
490
-
491
- // Ensure we have a local copy of the file for conversion
492
- let localPathForConversion = tempFilePath;
493
-
494
- if (!localPathForConversion) {
495
- // No temp file was written (saveToLocal === false). Download from primary URL.
496
- const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'convert-'));
497
- localPathForConversion = path.join(tmpDir, safeFilename);
498
- await conversionService._downloadFile(result.url, localPathForConversion);
499
- } else {
500
- // Wait until disk write completes to guarantee full file is present
501
- if (diskWritePromise) {
502
- await diskWritePromise;
503
- }
504
- }
505
-
506
- // Perform the conversion
507
- const conversion = await conversionService.convertFile(localPathForConversion, result.url);
508
- context.log('File conversion completed (busboy):', conversion);
509
-
510
- if (conversion.converted) {
511
- context.log('Saving converted file (busboy)...');
512
- // Save converted file to primary storage
513
- const convertedSaveResult = await conversionService._saveConvertedFile(conversion.convertedPath, requestId);
514
-
515
- // Optionally save to GCS
516
- let convertedGcsUrl;
517
- if (conversionService._isGCSConfigured()) {
518
- convertedGcsUrl = await conversionService._uploadChunkToGCS(conversion.convertedPath, requestId);
519
- }
520
-
521
- // Attach to response body
522
- result.converted = {
523
- url: convertedSaveResult.url,
524
- gcs: convertedGcsUrl,
525
- };
526
- context.log('Conversion process (busboy) completed successfully');
527
- }
528
- } catch (convErr) {
529
- console.error('Error converting file (busboy):', convErr);
530
- context.log('Error during conversion (busboy):', convErr.message);
531
- // Continue without failing the upload
532
- }
533
- }
534
-
535
- // Respond after conversion (if any)
536
- context.res = { status: 200, body: result };
537
- resolve(result);
538
- } catch (err) {
539
- errorOccurred = true;
540
- reject(err);
541
- } finally {
542
- // Clean up temp file if written
543
- if (tempDir) {
544
- fs.rmSync(tempDir, { recursive: true, force: true });
545
- }
546
- }
547
- });
548
-
549
- busboy.on('error', (error) => {
550
- if (errorOccurred) return;
551
- errorOccurred = true;
552
- const err = new Error('No file provided in request');
553
- err.status = 400;
554
- reject(err);
555
- });
556
-
557
- busboy.on('finish', () => {
558
- if (errorOccurred) return;
559
- if (!hasFile) {
560
- errorOccurred = true;
561
- const err = new Error('No file provided in request');
562
- err.status = 400;
563
- reject(err);
564
- }
565
- });
566
-
567
- // Handle errors from piping the request
568
- req.on('error', (error) => {
569
- if (errorOccurred) return;
570
- errorOccurred = true;
571
- // Only log unexpected errors
572
- if (error.message !== 'No file provided in request') {
573
- context.log('Error in request stream:', error);
574
- }
575
- const err = new Error('No file provided in request');
576
- err.status = 400;
577
- reject(err);
578
- });
579
-
580
- try {
581
- req.pipe(busboy);
582
- } catch (error) {
583
- if (errorOccurred) return;
584
- errorOccurred = true;
585
- // Only log unexpected errors
586
- if (error.message !== 'No file provided in request') {
587
- context.log('Error piping request to busboy:', error);
588
- }
589
- const err = new Error('No file provided in request');
590
- err.status = 400;
591
- reject(err);
541
+ }
542
+
543
+ // Perform the conversion
544
+ const conversion = await conversionService.convertFile(
545
+ localPathForConversion,
546
+ result.url,
547
+ );
548
+ context.log(
549
+ "File conversion completed (busboy):",
550
+ conversion,
551
+ );
552
+
553
+ if (conversion.converted) {
554
+ context.log("Saving converted file (busboy)...");
555
+ // Save converted file to primary storage
556
+ const convertedSaveResult =
557
+ await conversionService._saveConvertedFile(
558
+ conversion.convertedPath,
559
+ requestId,
560
+ );
561
+
562
+ // Optionally save to GCS
563
+ let convertedGcsUrl;
564
+ if (conversionService._isGCSConfigured()) {
565
+ convertedGcsUrl =
566
+ await conversionService._uploadChunkToGCS(
567
+ conversion.convertedPath,
568
+ requestId,
569
+ );
592
570
  }
571
+
572
+ // Attach to response body
573
+ result.converted = {
574
+ url: convertedSaveResult.url,
575
+ gcs: convertedGcsUrl,
576
+ };
577
+ context.log(
578
+ "Conversion process (busboy) completed successfully",
579
+ );
580
+ }
581
+ } catch (convErr) {
582
+ console.error("Error converting file (busboy):", convErr);
583
+ context.log(
584
+ "Error during conversion (busboy):",
585
+ convErr.message,
586
+ );
587
+ // Continue without failing the upload
593
588
  }
594
- } catch (error) {
595
- // Only log unexpected errors
596
- if (error.message !== 'No file provided in request') {
597
- context.log('Error processing file upload:', error);
598
- }
599
- const err = new Error(error.message || 'Error processing file upload.');
600
- err.status = error.status || 500;
601
- reject(err);
589
+ }
590
+
591
+ // Respond after conversion (if any)
592
+ context.res = { status: 200, body: result };
593
+ resolve(result);
594
+ } catch (err) {
595
+ errorOccurred = true;
596
+ reject(err);
597
+ } finally {
598
+ // Clean up temp file if written
599
+ if (tempDir) {
600
+ fs.rmSync(tempDir, { recursive: true, force: true });
601
+ }
602
602
  }
603
- })();
604
- });
603
+ });
604
+
605
+ busboy.on("error", (error) => {
606
+ if (errorOccurred) return;
607
+ errorOccurred = true;
608
+ const err = new Error("No file provided in request");
609
+ err.status = 400;
610
+ reject(err);
611
+ });
612
+
613
+ busboy.on("finish", () => {
614
+ if (errorOccurred) return;
615
+ if (!hasFile) {
616
+ errorOccurred = true;
617
+ const err = new Error("No file provided in request");
618
+ err.status = 400;
619
+ reject(err);
620
+ }
621
+ });
622
+
623
+ // Handle errors from piping the request
624
+ req.on("error", (error) => {
625
+ if (errorOccurred) return;
626
+ errorOccurred = true;
627
+ // Only log unexpected errors
628
+ if (error.message !== "No file provided in request") {
629
+ context.log("Error in request stream:", error);
630
+ }
631
+ const err = new Error("No file provided in request");
632
+ err.status = 400;
633
+ reject(err);
634
+ });
635
+
636
+ try {
637
+ req.pipe(busboy);
638
+ } catch (error) {
639
+ if (errorOccurred) return;
640
+ errorOccurred = true;
641
+ // Only log unexpected errors
642
+ if (error.message !== "No file provided in request") {
643
+ context.log("Error piping request to busboy:", error);
644
+ }
645
+ const err = new Error("No file provided in request");
646
+ err.status = 400;
647
+ reject(err);
648
+ }
649
+ }
650
+ } catch (error) {
651
+ // Only log unexpected errors
652
+ if (error.message !== "No file provided in request") {
653
+ context.log("Error processing file upload:", error);
654
+ }
655
+ const err = new Error(error.message || "Error processing file upload.");
656
+ err.status = error.status || 500;
657
+ reject(err);
658
+ }
659
+ })();
660
+ });
605
661
  }
606
662
 
607
663
  // Helper function to handle local file storage
608
664
async function saveToLocalStorage(context, requestId, encodedFilename, file) {
  // Ensure the per-request directory exists under the public folder.
  const targetDir = path.join(publicFolder, requestId);
  fs.mkdirSync(targetDir, { recursive: true });

  // Strip characters that are invalid in filenames before writing to disk.
  const safeName = sanitizeFilename(encodedFilename);
  const destination = `${targetDir}/${safeName}`;

  // Stream the upload straight to disk, then hand back the served URL.
  await pipeline(file, fs.createWriteStream(destination));
  return `http://${ipAddress}:${port}/files/${requestId}/${safeName}`;
}
619
675
 
620
676
  // Helper function to handle Azure blob storage
621
677
async function saveToAzureStorage(context, encodedFilename, file) {
  // Resolve the shared container client and guess the MIME type from the name.
  const { containerClient } = await getBlobClient();
  const contentType = mime.lookup(encodedFilename);

  // Sanitize, then URI-encode exactly once (no double encoding) to build the
  // blob name.
  const blobName = encodeURIComponent(sanitizeFilename(encodedFilename));

  const uploadOptions = {
    blobHTTPHeaders: contentType ? { blobContentType: contentType } : {},
    maxConcurrency: 50,
    blockSize: 8 * 1024 * 1024, // 8MB blocks
  };

  const blockBlobClient = containerClient.getBlockBlobClient(blobName);
  context.log(`Uploading to Azure... ${blobName}`);
  await blockBlobClient.uploadStream(file, undefined, undefined, uploadOptions);

  // Append a SAS token so the returned URL is directly downloadable.
  const sasToken = generateSASToken(containerClient, blobName);
  return `${blockBlobClient.url}?${sasToken}`;
}
641
697
 
642
698
  // Helper function to upload a file to Google Cloud Storage
643
699
async function uploadToGCS(context, file, filename) {
  const objectName = sanitizeFilename(filename);
  const destination = gcs.bucket(GCS_BUCKETNAME).file(objectName);

  // Resumable upload with retries; content type falls back to octet-stream
  // when the extension is unknown to the mime lookup.
  const sink = destination.createWriteStream({
    resumable: true,
    validation: false,
    metadata: {
      contentType: mime.lookup(objectName) || "application/octet-stream",
    },
    chunkSize: 8 * 1024 * 1024,
    numRetries: 3,
    retryDelay: 1000,
  });

  context.log(`Uploading to GCS... ${objectName}`);
  await pipeline(file, sink);
  return `gs://${GCS_BUCKETNAME}/${objectName}`;
}
660
716
 
661
717
  // Wrapper that checks if GCS is configured
662
718
async function saveToGoogleStorage(context, encodedFilename, file) {
  // GCS is an optional secondary destination; fail loudly if it was never
  // initialized rather than silently dropping the upload.
  if (gcs) {
    return uploadToGCS(context, file, encodedFilename);
  }
  throw new Error("Google Cloud Storage is not initialized");
}
668
724
 
669
725
/**
 * Stages an uploaded file stream to a temp directory, uploads it to the
 * primary storage (local folder or Azure) and optionally to GCS in parallel,
 * runs an optional format conversion, and resolves with the result payload.
 *
 * @param {object} context - Azure Functions-style context (log/res).
 * @param {string} requestId - Correlation id for this upload.
 * @param {object} body - Original request body (used for cleanup on failure).
 * @param {boolean} saveToLocal - True to store on local disk instead of Azure.
 * @param {stream.Readable} file - Incoming file stream.
 * @param {string} filename - Upload name (already the LLM-friendly name).
 * @param {Function} resolve - Resolver invoked with the result (or 400 res).
 * @param {string|null} hash - Optional content hash echoed back in the result.
 * @throws Re-throws any upload error after attempting cleanup of body.url.
 */
async function uploadFile(
  context,
  requestId,
  body,
  saveToLocal,
  file,
  filename,
  resolve,
  hash = null,
) {
  let tempDir = null;
  try {
    if (!file) {
      context.res = {
        status: 400,
        body: "No file provided in request",
      };
      resolve(context.res);
      return;
    }

    const ext = path.extname(filename).toLowerCase();
    context.log(`Processing file with extension: ${ext}`);

    // Stage the incoming stream to disk first so it can be re-read once per
    // storage destination (primary + optional GCS).
    tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "upload-"));
    const tempOriginal = path.join(tempDir, filename);
    context.log(`Created temp directory: ${tempDir}`);

    const writeStream = fs.createWriteStream(tempOriginal, {
      highWaterMark: 1024 * 1024, // 1MB chunks for initial write
      autoClose: true,
    });

    context.log("Writing file to temp location...");
    await pipeline(file, writeStream);
    context.log("File written to temp location successfully");

    const uploadPath = tempOriginal;
    // Use the filename that was passed in (already the LLM-friendly name).
    const uploadName = filename;
    const fileExtension = path.extname(filename);
    context.log(`Prepared upload name: ${uploadName}`);

    // 1MB read chunks for storage uploads. Parameter renamed so it no longer
    // shadows the `path` module.
    const createOptimizedReadStream = (filePath) =>
      fs.createReadStream(filePath, {
        highWaterMark: 1024 * 1024,
        autoClose: true,
      });

    // Upload the original to all configured destinations in parallel.
    const storagePromises = [];
    context.log("Starting primary storage upload...");
    const primaryPromise = saveToLocal
      ? saveToLocalStorage(
          context,
          requestId,
          uploadName,
          createOptimizedReadStream(uploadPath),
        )
      : saveToAzureStorage(
          context,
          uploadName,
          createOptimizedReadStream(uploadPath),
        );
    storagePromises.push(
      primaryPromise.then((url) => {
        context.log("Primary storage upload completed");
        return { url, type: "primary" };
      }),
    );

    if (gcs) {
      context.log("Starting GCS upload...");
      storagePromises.push(
        saveToGoogleStorage(
          context,
          uploadName,
          createOptimizedReadStream(uploadPath),
        ).then((gcsUrl) => {
          context.log("GCS upload completed");
          return { gcs: gcsUrl, type: "gcs" };
        }),
      );
    }

    context.log("Waiting for all storage uploads to complete...");
    const results = await Promise.all(storagePromises);
    const result = {
      message: `File '${uploadName}' ${saveToLocal ? "saved to folder" : "uploaded"} successfully.`,
      filename: uploadName,
      ...results.reduce((acc, entry) => {
        if (entry.type === "primary") acc.url = entry.url;
        if (entry.type === "gcs") acc.gcs = ensureUnencodedGcsUrl(entry.gcs);
        return acc;
      }, {}),
    };

    if (hash) {
      result.hash = hash;
    }

    // Optional conversion step; failures here do not fail the upload.
    const conversionService = new FileConversionService(context, !saveToLocal);

    if (conversionService.needsConversion(fileExtension)) {
      try {
        context.log("Starting file conversion...");
        const conversion = await conversionService.convertFile(
          uploadPath,
          result.url,
        );
        context.log("File conversion completed:", conversion);

        if (conversion.converted) {
          context.log("Saving converted file...");
          const convertedSaveResult =
            await conversionService._saveConvertedFile(
              conversion.convertedPath,
              requestId,
            );
          context.log("Converted file saved to primary storage");

          // If GCS is configured, mirror the converted file there too.
          let convertedGcsUrl;
          if (conversionService._isGCSConfigured()) {
            context.log("Saving converted file to GCS...");
            convertedGcsUrl = await conversionService._uploadChunkToGCS(
              conversion.convertedPath,
              requestId,
            );
            context.log("Converted file saved to GCS");
          }

          result.converted = {
            url: convertedSaveResult.url,
            gcs: convertedGcsUrl,
          };
          context.log("Conversion process completed successfully");
        }
      } catch (error) {
        console.error("Error converting file:", error);
        context.log("Error during conversion:", error.message);
        // Don't fail the upload if conversion fails
      }
    }

    context.res = {
      status: 200,
      body: result,
    };

    context.log("Upload process completed successfully");
    resolve(result);
  } catch (error) {
    context.log("Error in upload process:", error);
    if (body.url) {
      try {
        await cleanup(context, [body.url]);
      } catch (cleanupError) {
        context.log("Error during cleanup after failure:", cleanupError);
      }
    }
    throw error;
  } finally {
    // BUG FIX: the temp dir was previously removed only on the success path,
    // leaking `upload-*` directories whenever any step threw. Clean up on
    // every exit path.
    if (tempDir) {
      context.log("Cleaning up temporary files...");
      fs.rmSync(tempDir, { recursive: true, force: true });
      context.log("Temporary files cleaned up");
    }
  }
}
842
911
 
843
912
  // Helper to convert a stream to a buffer
844
913
/**
 * Drains a readable stream and returns its full contents as one Buffer.
 *
 * @param {stream.Readable} stream - Source stream emitting Buffer chunks.
 * @returns {Promise<Buffer>} Concatenation of every chunk, in order.
 */
async function streamToBuffer(stream) {
  const pieces = [];
  return new Promise((resolve, reject) => {
    stream.on("error", reject);
    stream.on("data", (piece) => pieces.push(piece));
    stream.on("end", () => resolve(Buffer.concat(pieces)));
  });
}
852
921
 
853
922
  // Function to delete files that haven't been used in more than a month
854
923
/**
 * Deletes Azure blobs. With an explicit URL list, deletes exactly those
 * blobs; with no list, sweeps every blob not modified in the last month.
 *
 * @param {object} context - Logging context.
 * @param {string[]|null} urls - Blob URLs to delete, or null for the sweep.
 * @returns {Promise<string[]>} Names of the blobs that were deleted.
 */
async function cleanup(context, urls = null) {
  const { containerClient } = await getBlobClient();
  const removed = [];

  // Delete one blob, recording success; 404s (already gone) stay silent.
  const removeBlob = async (blobName, label) => {
    try {
      const blockBlobClient = containerClient.getBlockBlobClient(blobName);
      await blockBlobClient.delete();
      context.log(`Cleaned blob: ${blobName}`);
      removed.push(blobName);
    } catch (error) {
      if (error.statusCode !== 404) {
        context.log(`Error cleaning blob ${label}:`, error);
      }
    }
  };

  if (urls) {
    // Targeted deletion: strip the container URL prefix to get blob names.
    for (const url of urls) {
      await removeBlob(url.replace(containerClient.url, ""), url);
    }
  } else {
    // Age-based sweep: anything untouched for over a month goes.
    const cutoff = new Date();
    cutoff.setMonth(cutoff.getMonth() - 1);

    for await (const blob of containerClient.listBlobsFlat()) {
      if (blob.properties.lastModified < cutoff) {
        await removeBlob(blob.name, blob.name);
      }
    }
  }

  return removed;
}
896
965
 
897
966
/**
 * Deletes GCS objects. With an explicit URL list, deletes those objects;
 * otherwise sweeps every object not updated in the last 30 days. Afterwards,
 * prunes directory prefixes that ended up empty.
 *
 * @param {string[]|null} urls - Object URLs to delete, or null for the sweep.
 * @returns {Promise<string[]>} Names/URLs of the objects that were deleted.
 */
async function cleanupGCS(urls = null) {
  // GCS is optional; nothing to do when it was never configured.
  if (!gcs) return [];

  const bucket = gcs.bucket(GCS_BUCKETNAME);
  const touchedDirs = new Set();
  const deleted = [];

  if (urls) {
    for (const url of urls) {
      // Drop the scheme + bucket portion (first three '/'-separated parts).
      const objectPath = url.split("/").slice(3).join("/");
      const object = bucket.file(objectPath);
      touchedDirs.add(path.dirname(object.name));
      await object.delete();
      deleted.push(url);
    }
  } else {
    const daysN = 30;
    const threshold = Date.now() - daysN * 24 * 60 * 60 * 1000;
    const [allFiles] = await bucket.getFiles();

    for (const object of allFiles) {
      const [metadata] = await object.getMetadata();
      touchedDirs.add(path.dirname(object.name));
      if (metadata.updated) {
        const updatedTime = new Date(metadata.updated).getTime();
        if (updatedTime < threshold) {
          await object.delete();
          deleted.push(object.name);
        }
      }
    }
  }

  // Prune any directory prefix that no longer contains files.
  for (const directory of touchedDirs) {
    const [remaining] = await bucket.getFiles({ prefix: directory });
    if (remaining.length === 0) {
      await bucket.deleteFiles({ prefix: directory });
    }
  }

  return deleted;
}
940
1009
 
941
1010
/**
 * Deletes every GCS object whose name starts with `blobName`.
 *
 * Two code paths:
 *  - When STORAGE_EMULATOR_HOST is set, talks to the emulator's JSON API
 *    directly via axios: list objects by prefix, then delete each item.
 *  - Otherwise uses the GCS client library against the real bucket.
 *
 * Errors are logged but never thrown — callers run this as best-effort
 * cleanup and must continue even if GCS deletion fails. A 404 is treated
 * as success (the object was already deleted).
 *
 * @param {string} blobName - Object-name prefix to delete; no-op if falsy.
 * @returns {Promise<void>}
 */
async function deleteGCS(blobName) {
  if (!blobName) {
    console.log("[deleteGCS] No blobName provided, skipping GCS deletion");
    return;
  }

  if (!gcs) {
    console.log("[deleteGCS] GCS not initialized, skipping deletion");
    return;
  }

  try {
    if (process.env.STORAGE_EMULATOR_HOST) {
      console.log(
        `[deleteGCS] Using emulator at ${process.env.STORAGE_EMULATOR_HOST}`,
      );
      console.log(
        `[deleteGCS] Attempting to delete files with prefix: ${blobName}`,
      );

      // List files first. Encode the prefix so names containing reserved
      // characters (#, ?, &, spaces) don't corrupt the query string —
      // the per-object delete URL below already encodes, this now matches.
      const listUrl = `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${GCS_BUCKETNAME}/o?prefix=${encodeURIComponent(blobName)}`;
      console.log(`[deleteGCS] Listing files with URL: ${listUrl}`);

      const listResponse = await axios.get(listUrl, {
        // Never throw on HTTP status; statuses are inspected manually.
        validateStatus: () => true,
      });
      console.log(`[deleteGCS] List response status: ${listResponse.status}`);
      console.log(
        `[deleteGCS] List response data: ${JSON.stringify(listResponse.data)}`,
      );

      if (listResponse.status === 200 && listResponse.data.items) {
        console.log(
          `[deleteGCS] Found ${listResponse.data.items.length} items to delete`,
        );

        // Delete each file. NOTE(review): the list call is not paginated;
        // assumes the emulator returns all matches in one page — confirm.
        for (const item of listResponse.data.items) {
          const deleteUrl = `${process.env.STORAGE_EMULATOR_HOST}/storage/v1/b/${GCS_BUCKETNAME}/o/${encodeURIComponent(item.name)}`;
          console.log(`[deleteGCS] Deleting file: ${item.name}`);
          console.log(`[deleteGCS] Delete URL: ${deleteUrl}`);

          const deleteResponse = await axios.delete(deleteUrl, {
            validateStatus: () => true,
            headers: {
              "Content-Type": "application/json",
            },
          });
          console.log(
            `[deleteGCS] Delete response status: ${deleteResponse.status}`,
          );
          console.log(
            `[deleteGCS] Delete response data: ${JSON.stringify(deleteResponse.data)}`,
          );
        }
        console.log("[deleteGCS] All files deleted successfully");
      } else {
        console.log("[deleteGCS] No files found to delete");
      }
    } else {
      console.log("[deleteGCS] Using real GCS");
      const bucket = gcs.bucket(GCS_BUCKETNAME);
      const [files] = await bucket.getFiles({ prefix: blobName });
      console.log(`[deleteGCS] Found ${files.length} files to delete`);

      if (files.length > 0) {
        await Promise.all(files.map((file) => file.delete()));
        console.log("[deleteGCS] All files deleted successfully");
      } else {
        console.log("[deleteGCS] No files found to delete");
      }
    }
  } catch (error) {
    // If we get a 404 error, it means the file is already gone, which is fine
    if (error.response?.status === 404 || error.code === 404) {
      console.log(
        "[deleteGCS] File not found in GCS (404) - this is expected if file was already deleted",
      );
      return;
    }
    console.error("[deleteGCS] Error during deletion:", error);
    console.error("[deleteGCS] Error details:", {
      message: error.message,
      code: error.code,
      errors: error.errors,
      response: error.response
        ? {
            status: error.response.status,
            statusText: error.response.statusText,
            data: error.response.data,
            headers: error.response.headers,
          }
        : null,
    });
    // Don't throw the error - we want to continue with cleanup even if GCS deletion fails
  }
}
1023
1108
 
1024
1109
  // Helper function to ensure GCS upload for existing files
1025
1110
/**
 * Ensures a file record has a GCS mirror. If the record lacks a `gcs`
 * URL and a GCS client is configured, streams the file from its
 * existing URL and uploads it under a freshly generated short-id name
 * (LLM-friendly naming instead of the original filename).
 *
 * @param {object} context - Request context providing `log`.
 * @param {object} existingFile - File record with `url` and optional `gcs`.
 * @returns {Promise<object>} The same record, with `gcs` populated when uploaded.
 */
async function ensureGCSUpload(context, existingFile) {
  // Nothing to do when the mirror already exists or GCS is unavailable.
  if (existingFile.gcs || !gcs) {
    return existingFile;
  }

  context.log("GCS file was missing - uploading.");
  // Use LLM-friendly naming instead of extracting original filename
  const extension = path.extname(existingFile.url.split("?")[0]);
  const targetName = `${generateShortId()}${extension}`;

  const download = await axios({
    method: "get",
    url: existingFile.url,
    responseType: "stream",
  });
  existingFile.gcs = await uploadToGCS(context, download.data, targetName);
  return existingFile;
}
1034
1126
 
1035
/**
 * Uploads a single chunk file to GCS under a per-request directory.
 *
 * @param {string} chunkPath - Local filesystem path of the chunk to upload.
 * @param {string} [requestId] - Directory name in the bucket; a fresh UUID
 *   is generated when omitted.
 * @param {string|null} [filename] - Destination filename within the
 *   directory. When omitted, an LLM-friendly short-id name is generated,
 *   preserving the chunk's extension.
 * @returns {Promise<string|null>} The `gs://` URI of the uploaded object,
 *   or null when GCS is not configured.
 */
async function uploadChunkToGCS(chunkPath, requestId, filename = null) {
  if (!gcs) return null;
  const dirName = requestId || uuidv4();
  // Use provided filename or generate LLM-friendly naming
  let gcsFileName;
  if (filename) {
    // Fix: interpolate the caller-supplied filename. The previous body used
    // `$(unknown)` — shell-style, not JS interpolation — so every named
    // chunk was literally stored as "<dir>/$(unknown)".
    gcsFileName = `${dirName}/${filename}`;
  } else {
    const fileExtension = path.extname(chunkPath);
    const shortId = generateShortId();
    gcsFileName = `${dirName}/${shortId}${fileExtension}`;
  }
  await gcs
    .bucket(GCS_BUCKETNAME)
    .upload(chunkPath, { destination: gcsFileName });
  return `gs://${GCS_BUCKETNAME}/${gcsFileName}`;
}
1043
1144
 
1044
1145
  export {
1045
- saveFileToBlob,
1046
- deleteBlob,
1047
- deleteGCS,
1048
- uploadBlob,
1049
- cleanup,
1050
- cleanupGCS,
1051
- gcsUrlExists,
1052
- ensureGCSUpload,
1053
- gcs,
1054
- uploadChunkToGCS,
1055
- downloadFromGCS,
1056
- };
1146
+ saveFileToBlob,
1147
+ deleteBlob,
1148
+ deleteGCS,
1149
+ uploadBlob,
1150
+ cleanup,
1151
+ cleanupGCS,
1152
+ gcsUrlExists,
1153
+ ensureGCSUpload,
1154
+ gcs,
1155
+ uploadChunkToGCS,
1156
+ downloadFromGCS,
1157
+ };