@aj-archipelago/cortex 1.3.49 → 1.3.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/config.js +1 -1
  2. package/helper-apps/cortex-browser/Dockerfile +19 -31
  3. package/helper-apps/cortex-browser/function_app.py +708 -181
  4. package/helper-apps/cortex-browser/requirements.txt +4 -4
  5. package/helper-apps/cortex-file-handler/blobHandler.js +850 -429
  6. package/helper-apps/cortex-file-handler/constants.js +64 -48
  7. package/helper-apps/cortex-file-handler/docHelper.js +7 -114
  8. package/helper-apps/cortex-file-handler/fileChunker.js +96 -51
  9. package/helper-apps/cortex-file-handler/function.json +2 -6
  10. package/helper-apps/cortex-file-handler/helper.js +34 -25
  11. package/helper-apps/cortex-file-handler/index.js +324 -136
  12. package/helper-apps/cortex-file-handler/localFileHandler.js +56 -57
  13. package/helper-apps/cortex-file-handler/package-lock.json +6065 -5964
  14. package/helper-apps/cortex-file-handler/package.json +8 -4
  15. package/helper-apps/cortex-file-handler/redis.js +23 -17
  16. package/helper-apps/cortex-file-handler/scripts/setup-azure-container.js +12 -9
  17. package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +21 -18
  18. package/helper-apps/cortex-file-handler/scripts/test-azure.sh +1 -1
  19. package/helper-apps/cortex-file-handler/scripts/test-gcs.sh +1 -1
  20. package/helper-apps/cortex-file-handler/services/ConversionService.js +288 -0
  21. package/helper-apps/cortex-file-handler/services/FileConversionService.js +53 -0
  22. package/helper-apps/cortex-file-handler/start.js +63 -38
  23. package/helper-apps/cortex-file-handler/tests/FileConversionService.test.js +144 -0
  24. package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +88 -64
  25. package/helper-apps/cortex-file-handler/tests/fileChunker.test.js +114 -91
  26. package/helper-apps/cortex-file-handler/tests/fileUpload.test.js +351 -0
  27. package/helper-apps/cortex-file-handler/tests/files/DOCX_TestPage.docx +0 -0
  28. package/helper-apps/cortex-file-handler/tests/files/tests-example.xls +0 -0
  29. package/helper-apps/cortex-file-handler/tests/start.test.js +943 -642
  30. package/helper-apps/cortex-file-handler/tests/testUtils.helper.js +31 -0
  31. package/helper-apps/cortex-markitdown/.funcignore +1 -0
  32. package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/__init__.py +64 -0
  33. package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/function.json +21 -0
  34. package/helper-apps/cortex-markitdown/README.md +94 -0
  35. package/helper-apps/cortex-markitdown/host.json +15 -0
  36. package/helper-apps/cortex-markitdown/requirements.txt +2 -0
  37. package/lib/requestExecutor.js +44 -36
  38. package/package.json +1 -1
  39. package/pathways/system/entity/tools/sys_tool_cognitive_search.js +1 -1
  40. package/pathways/system/entity/tools/sys_tool_readfile.js +24 -2
  41. package/server/plugins/openAiWhisperPlugin.js +59 -87
  42. package/helper-apps/cortex-file-handler/tests/docHelper.test.js +0 -148
@@ -1,24 +1,58 @@
1
- import { downloadFile, splitMediaFile } from './fileChunker.js';
2
- import { saveFileToBlob, deleteBlob, deleteGCS, uploadBlob, cleanup, cleanupGCS, gcsUrlExists, ensureGCSUpload, gcs, AZURE_STORAGE_CONTAINER_NAME, uploadChunkToGCS } from './blobHandler.js';
3
- import { cleanupRedisFileStoreMap, getFileStoreMap, publishRequestProgress, removeFromFileStoreMap, setFileStoreMap } from './redis.js';
4
- import { ensureEncoded, ensureFileExtension, urlExists } from './helper.js';
5
- import { moveFileToPublicFolder, deleteFolder, cleanupLocal } from './localFileHandler.js';
6
- import { documentToText, easyChunker } from './docHelper.js';
7
- import { DOC_EXTENSIONS } from './constants.js';
8
- import path from 'path';
1
+ import fs from 'fs';
9
2
  import os from 'os';
3
+ import path from 'path';
4
+
10
5
  import { v4 as uuidv4 } from 'uuid';
11
- import fs from 'fs';
6
+
7
+ import {
8
+ saveFileToBlob,
9
+ deleteBlob,
10
+ deleteGCS,
11
+ uploadBlob,
12
+ cleanup,
13
+ cleanupGCS,
14
+ gcsUrlExists,
15
+ ensureGCSUpload,
16
+ gcs,
17
+ AZURE_STORAGE_CONTAINER_NAME,
18
+ uploadChunkToGCS,
19
+ downloadFromGCS,
20
+ } from './blobHandler.js';
21
+ import { DOC_EXTENSIONS, CONVERTED_EXTENSIONS } from './constants.js';
22
+ import { easyChunker } from './docHelper.js';
23
+ import { downloadFile, splitMediaFile } from './fileChunker.js';
24
+ import { ensureEncoded, ensureFileExtension, urlExists } from './helper.js';
25
+ import {
26
+ moveFileToPublicFolder,
27
+ deleteFolder,
28
+ cleanupLocal,
29
+ } from './localFileHandler.js';
30
+ import {
31
+ cleanupRedisFileStoreMap,
32
+ getFileStoreMap,
33
+ publishRequestProgress,
34
+ removeFromFileStoreMap,
35
+ setFileStoreMap,
36
+ } from './redis.js';
37
+ import { FileConversionService } from './services/FileConversionService.js';
12
38
 
13
39
  const useAzure = process.env.AZURE_STORAGE_CONNECTION_STRING ? true : false;
14
- const useGCS = process.env.GCP_SERVICE_ACCOUNT_KEY_BASE64 || process.env.GCP_SERVICE_ACCOUNT_KEY ? true : false;
40
+ const useGCS =
41
+ process.env.GCP_SERVICE_ACCOUNT_KEY_BASE64 ||
42
+ process.env.GCP_SERVICE_ACCOUNT_KEY
43
+ ? true
44
+ : false;
15
45
 
16
- console.log(`Storage configuration - ${useAzure ? 'Azure' : 'Local'} Storage${useGCS ? ' and Google Cloud Storage' : ''}`);
46
+ console.log(
47
+ `Storage configuration - ${useAzure ? 'Azure' : 'Local'} Storage${useGCS ? ' and Google Cloud Storage' : ''}`,
48
+ );
17
49
 
18
50
  let isCleanupRunning = false;
19
51
  async function cleanupInactive(context) {
20
52
  try {
21
- if (isCleanupRunning) { return; } //no need to cleanup every call
53
+ if (isCleanupRunning) {
54
+ return;
55
+ } //no need to cleanup every call
22
56
  isCleanupRunning = true;
23
57
  const cleaned = await cleanupRedisFileStoreMap();
24
58
 
@@ -26,24 +60,24 @@ async function cleanupInactive(context) {
26
60
  const cleanedLocal = [];
27
61
  const cleanedGCS = [];
28
62
 
29
- for(const key in cleaned){
63
+ for (const key in cleaned) {
30
64
  const item = cleaned[key];
31
- const {url,gcs} = item;
32
- if(url){
33
- if(url.includes('.blob.core.windows.net/')){
65
+ const { url, gcs } = item;
66
+ if (url) {
67
+ if (url.includes('.blob.core.windows.net/')) {
34
68
  cleanedAzure.push(url);
35
- }else if(url.startsWith('gs://')){
69
+ } else if (url.startsWith('gs://')) {
36
70
  cleanedGCS.push(url);
37
- }else{
71
+ } else {
38
72
  cleanedLocal.push(url);
39
73
  }
40
74
  }
41
75
 
42
- if(item && item.gcs){
76
+ if (item && item.gcs) {
43
77
  cleanedGCS.push(gcs);
44
78
  }
45
79
  }
46
-
80
+
47
81
  try {
48
82
  if (cleanedAzure && cleanedAzure.length > 0) {
49
83
  await cleanup(context, cleanedAzure);
@@ -56,85 +90,110 @@ async function cleanupInactive(context) {
56
90
  if (cleanedLocal && cleanedLocal.length > 0) {
57
91
  await cleanupLocal(cleanedLocal);
58
92
  }
59
- }catch(err){
93
+ } catch (err) {
60
94
  console.log('Error occurred during local cleanup:', err);
61
95
  }
62
96
 
63
- try{
64
- if(cleanedGCS && cleanedGCS.length > 0){
97
+ try {
98
+ if (cleanedGCS && cleanedGCS.length > 0) {
65
99
  await cleanupGCS(cleanedGCS);
66
100
  }
67
- }catch(err){
101
+ } catch (err) {
68
102
  console.log('Error occurred during GCS cleanup:', err);
69
103
  }
70
-
71
104
  } catch (error) {
72
105
  console.log('Error occurred during cleanup:', error);
73
- } finally{
106
+ } finally {
74
107
  isCleanupRunning = false;
75
108
  }
76
109
  }
77
110
 
78
111
  async function CortexFileHandler(context, req) {
79
- const { uri, requestId, save, hash, checkHash, clearHash, fetch, load, restore } = req.body?.params || req.query;
80
- const operation = save ? 'save' :
81
- checkHash ? 'checkHash' :
82
- clearHash ? 'clearHash' :
83
- fetch || load || restore ? 'remoteFile' :
84
- req.method.toLowerCase() === 'delete' || req.query.operation === 'delete' ? 'delete' :
85
- uri ? (DOC_EXTENSIONS.some(ext => uri.toLowerCase().endsWith(ext)) ? 'document_processing' : 'media_chunking') :
86
- 'upload';
87
-
88
- context.log(`Processing ${req.method} request - ${requestId ? `requestId: ${requestId}, ` : ''}${uri ? `uri: ${uri}, ` : ''}${hash ? `hash: ${hash}, ` : ''}operation: ${operation}`);
112
+ const {
113
+ uri,
114
+ requestId,
115
+ save,
116
+ hash,
117
+ checkHash,
118
+ clearHash,
119
+ fetch,
120
+ load,
121
+ restore,
122
+ } = req.body?.params || req.query;
123
+ const operation = save
124
+ ? 'save'
125
+ : checkHash
126
+ ? 'checkHash'
127
+ : clearHash
128
+ ? 'clearHash'
129
+ : fetch || load || restore
130
+ ? 'remoteFile'
131
+ : req.method.toLowerCase() === 'delete' ||
132
+ req.query.operation === 'delete'
133
+ ? 'delete'
134
+ : uri
135
+ ? DOC_EXTENSIONS.some((ext) => uri.toLowerCase().endsWith(ext))
136
+ ? 'document_processing'
137
+ : 'media_chunking'
138
+ : 'upload';
139
+
140
+ context.log(
141
+ `Processing ${req.method} request - ${requestId ? `requestId: ${requestId}, ` : ''}${uri ? `uri: ${uri}, ` : ''}${hash ? `hash: ${hash}, ` : ''}operation: ${operation}`,
142
+ );
89
143
 
90
144
  cleanupInactive(context); //trigger & no need to wait for it
91
145
 
146
+ // Initialize conversion service
147
+ const conversionService = new FileConversionService(context, useAzure);
148
+
92
149
  // Clean up blob when request delete which means processing marked completed
93
150
  if (operation === 'delete') {
94
151
  const deleteRequestId = req.query.requestId || requestId;
95
152
  if (!deleteRequestId) {
96
153
  context.res = {
97
154
  status: 400,
98
- body: "Please pass a requestId on the query string"
155
+ body: 'Please pass a requestId on the query string',
99
156
  };
100
157
  return;
101
158
  }
102
-
159
+
103
160
  // Delete from Azure/Local storage
104
- const azureResult = useAzure ? await deleteBlob(deleteRequestId) : await deleteFolder(deleteRequestId);
161
+ const azureResult = useAzure
162
+ ? await deleteBlob(deleteRequestId)
163
+ : await deleteFolder(deleteRequestId);
105
164
  const gcsResult = [];
106
165
  if (gcs) {
107
- gcsResult.push(...await deleteGCS(deleteRequestId));
166
+ gcsResult.push(...(await deleteGCS(deleteRequestId)));
108
167
  }
109
-
168
+
110
169
  context.res = {
111
170
  status: 200,
112
- body: { body: [...azureResult, ...gcsResult] }
171
+ body: { body: [...azureResult, ...gcsResult] },
113
172
  };
114
173
  return;
115
174
  }
116
175
 
117
176
  const remoteUrl = fetch || restore || load;
118
- if (req.method.toLowerCase() === `get` && remoteUrl) {
177
+ if (req.method.toLowerCase() === 'get' && remoteUrl) {
119
178
  context.log(`Remote file: ${remoteUrl}`);
120
- let filename; // Declare filename outside try block
179
+ let filename; // Declare filename outside try block
121
180
  try {
122
181
  // Validate URL format and accessibility
123
182
  const urlCheck = await urlExists(remoteUrl);
124
183
  if (!urlCheck.valid) {
125
184
  context.res = {
126
185
  status: 400,
127
- body: 'Invalid or inaccessible URL'
186
+ body: 'Invalid or inaccessible URL',
128
187
  };
129
188
  return;
130
189
  }
131
190
 
132
191
  // Check if file already exists (using hash as the key)
133
- let exists = await getFileStoreMap(remoteUrl);
134
- if(exists){
192
+ const exists = await getFileStoreMap(remoteUrl);
193
+ if (exists) {
135
194
  context.res = {
136
195
  status: 200,
137
- body: exists
196
+ body: exists,
138
197
  };
139
198
  //update redis timestamp with current time
140
199
  await setFileStoreMap(remoteUrl, exists);
@@ -143,28 +202,38 @@ async function CortexFileHandler(context, req) {
143
202
 
144
203
  // Download the file first
145
204
  const urlObj = new URL(remoteUrl);
146
- let originalFileName = path.basename(urlObj.pathname);
205
+ let originalFileName = decodeURIComponent(path.basename(urlObj.pathname));
147
206
  if (!originalFileName || originalFileName === '') {
148
207
  originalFileName = urlObj.hostname;
149
208
  }
150
-
209
+
151
210
  // Ensure the filename has the correct extension based on content type
152
- originalFileName = ensureFileExtension(originalFileName, urlCheck.contentType);
211
+ originalFileName = ensureFileExtension(
212
+ originalFileName,
213
+ urlCheck.contentType,
214
+ );
153
215
 
154
216
  const maxLength = 200; // Set the maximum length for the filename
155
217
  let truncatedFileName = originalFileName;
156
218
  if (originalFileName.length > maxLength) {
157
219
  const extension = path.extname(originalFileName);
158
220
  const basename = path.basename(originalFileName, extension);
159
- truncatedFileName = basename.substring(0, maxLength - extension.length) + extension;
221
+ truncatedFileName =
222
+ basename.substring(0, maxLength - extension.length) + extension;
160
223
  }
161
224
 
162
225
  // Use the original-truncated file name when saving the downloaded file
163
226
  filename = path.join(os.tmpdir(), truncatedFileName);
164
227
  await downloadFile(remoteUrl, filename);
165
-
228
+
166
229
  // Now upload the downloaded file
167
- const res = await uploadBlob(context, null, !useAzure, filename, remoteUrl);
230
+ const res = await uploadBlob(
231
+ context,
232
+ null,
233
+ !useAzure,
234
+ filename,
235
+ remoteUrl,
236
+ );
168
237
 
169
238
  //Update Redis (using hash as the key)
170
239
  await setFileStoreMap(remoteUrl, res);
@@ -175,10 +244,10 @@ async function CortexFileHandler(context, req) {
175
244
  body: res,
176
245
  };
177
246
  } catch (error) {
178
- context.log("Error processing remote file request:", error);
247
+ context.log('Error processing remote file request:', error);
179
248
  context.res = {
180
249
  status: 500,
181
- body: `Error processing file: ${error.message}`
250
+ body: `Error processing file: ${error.message}`,
182
251
  };
183
252
  } finally {
184
253
  // Cleanup temp file if it exists
@@ -187,121 +256,208 @@ async function CortexFileHandler(context, req) {
187
256
  fs.unlinkSync(filename);
188
257
  }
189
258
  } catch (err) {
190
- context.log("Error cleaning up temp file:", err);
259
+ context.log('Error cleaning up temp file:', err);
191
260
  }
192
261
  }
193
262
  return;
194
263
  }
195
264
 
196
- if(hash && clearHash){
265
+ if (hash && clearHash) {
197
266
  try {
198
267
  const hashValue = await getFileStoreMap(hash);
199
268
  if (hashValue) {
200
269
  await removeFromFileStoreMap(hash);
201
270
  context.res = {
202
271
  status: 200,
203
- body: `Hash ${hash} removed`
272
+ body: `Hash ${hash} removed`,
204
273
  };
205
274
  } else {
206
275
  context.res = {
207
276
  status: 404,
208
- body: `Hash ${hash} not found`
277
+ body: `Hash ${hash} not found`,
209
278
  };
210
279
  }
211
280
  } catch (error) {
212
281
  context.res = {
213
282
  status: 500,
214
- body: `Error occurred during hash cleanup: ${error}`
283
+ body: `Error occurred during hash cleanup: ${error}`,
215
284
  };
216
285
  console.log('Error occurred during hash cleanup:', error);
217
286
  }
218
287
  return;
219
288
  }
220
289
 
221
- if(hash && checkHash){ //check if hash exists
290
+ if (hash && checkHash) {
222
291
  let hashResult = await getFileStoreMap(hash);
223
292
 
224
- if(hashResult){
293
+ if (hashResult) {
225
294
  context.log(`File exists in map: ${hash}`);
226
-
295
+
296
+ // Log the URL retrieved from Redis before checking existence
297
+ context.log(`Checking existence of URL from Redis: ${hashResult?.url}`);
298
+
299
+ // Detect double-encoding in the blob name
300
+ if (hashResult.url) {
301
+ const urlPath = hashResult.url.split('?')[0];
302
+ const blobName = urlPath.substring(urlPath.lastIndexOf('/') + 1);
303
+ if (/%25[0-9A-Fa-f]{2}/.test(blobName)) {
304
+ context.log(
305
+ `Double-encoded blob detected for hash ${hash}. Invalidating cache entry.`,
306
+ );
307
+ await removeFromFileStoreMap(hash);
308
+ context.res = {
309
+ status: 404,
310
+ body: `Hash ${hash} is double-encoded and has been invalidated. Please re-upload.`,
311
+ };
312
+ return;
313
+ }
314
+ }
315
+
227
316
  // Check primary storage (Azure/Local) first
228
317
  const primaryExists = await urlExists(hashResult?.url);
229
318
  const gcsExists = gcs ? await gcsUrlExists(hashResult?.gcs) : false;
230
319
 
231
320
  // If neither storage has the file, remove from map and return not found
232
321
  if (!primaryExists.valid && !gcsExists) {
233
- context.log(`File not found in any storage. Removing from map: ${hash}`);
322
+ context.log(
323
+ `File not found in any storage. Removing from map: ${hash}`,
324
+ );
234
325
  await removeFromFileStoreMap(hash);
235
326
  context.res = {
236
327
  status: 404,
237
- body: `Hash ${hash} not found in storage`
328
+ body: `Hash ${hash} not found in storage`,
238
329
  };
239
330
  return;
240
331
  }
241
332
 
333
+ // If GCS is missing but primary exists, restore to GCS
334
+ else if (primaryExists.valid && gcs && !gcsExists) {
335
+ context.log(`GCS file missing, restoring from primary: ${hash}`);
336
+ const { gcs: _, ...fileInfo } = hashResult;
337
+ hashResult = await ensureGCSUpload(context, fileInfo);
338
+ }
339
+
242
340
  // If primary is missing but GCS exists, restore from GCS
243
341
  if (!primaryExists.valid && gcsExists) {
244
342
  context.log(`Primary storage file missing, restoring from GCS: ${hash}`);
245
343
  try {
246
- const res = await CortexFileHandler(context, {
247
- method: 'GET',
248
- body: { params: { fetch: hashResult.gcs } }
249
- });
250
- if (res?.body?.url) {
251
- hashResult.url = res.body.url;
344
+ // Create a temporary file to store the downloaded content
345
+ const tempDir = path.join(os.tmpdir(), `${uuidv4()}`);
346
+ fs.mkdirSync(tempDir);
347
+ const downloadedFile = path.join(tempDir, path.basename(hashResult.gcs));
348
+
349
+ // Download from GCS using the new function
350
+ await downloadFromGCS(hashResult.gcs, downloadedFile);
351
+
352
+ // Upload to primary storage
353
+ const res = await uploadBlob(
354
+ context,
355
+ null,
356
+ !useAzure,
357
+ downloadedFile,
358
+ hash
359
+ );
360
+
361
+ // Update the hash result with the new primary storage URL
362
+ hashResult.url = res.url;
363
+
364
+ // Clean up temp file
365
+ try {
366
+ if (downloadedFile && fs.existsSync(downloadedFile)) {
367
+ fs.unlinkSync(downloadedFile);
368
+ }
369
+ if (tempDir && fs.existsSync(tempDir)) {
370
+ fs.rmSync(tempDir, { recursive: true });
371
+ }
372
+ } catch (err) {
373
+ console.log('Error cleaning up temp files:', err);
252
374
  }
253
375
  } catch (error) {
254
376
  console.error('Error restoring from GCS:', error);
255
377
  }
256
378
  }
257
- // If GCS is missing but primary exists, restore to GCS
258
- else if (primaryExists.valid && gcs && !gcsExists) {
259
- context.log(`GCS file missing, restoring from primary: ${hash}`);
260
- const { gcs: _, ...fileInfo } = hashResult; // eslint-disable-line no-unused-vars
261
- hashResult = await ensureGCSUpload(context, fileInfo);
262
- }
379
+
380
+ // Ensure converted version exists if needed
381
+ hashResult = await conversionService.ensureConvertedVersion(hashResult, requestId);
263
382
 
264
383
  // Final check to ensure we have at least one valid storage location
265
384
  const finalPrimaryCheck = await urlExists(hashResult?.url);
266
- if (!finalPrimaryCheck.valid && !await gcsUrlExists(hashResult?.gcs)) {
385
+ if (!finalPrimaryCheck.valid && !(await gcsUrlExists(hashResult?.gcs))) {
267
386
  context.log(`Failed to restore file. Removing from map: ${hash}`);
268
387
  await removeFromFileStoreMap(hash);
269
388
  context.res = {
270
389
  status: 404,
271
- body: `Hash ${hash} not found and restoration failed`
390
+ body: `Hash ${hash} not found and restoration failed`,
272
391
  };
273
392
  return;
274
393
  }
275
394
 
395
+ // Create the response object
396
+ const response = {
397
+ message: `File '${hashResult.filename}' ${useAzure ? 'uploaded' : 'saved'} successfully.`,
398
+ filename: hashResult.filename,
399
+ url: hashResult.url,
400
+ gcs: hashResult.gcs,
401
+ hash: hashResult.hash,
402
+ timestamp: new Date().toISOString()
403
+ };
404
+
405
+ // Add converted info if it exists and has a valid URL
406
+ if (hashResult.converted?.url) {
407
+ context.log(`Adding converted info to final response`);
408
+ response.converted = {
409
+ url: hashResult.converted.url,
410
+ gcs: hashResult.converted.gcs
411
+ };
412
+ } else if (hashResult.converted?.gcs) {
413
+ // If we only have GCS URL, trigger conversion
414
+ context.log(`Only GCS URL exists for converted file, triggering conversion`);
415
+ const convertedResult = await conversionService.convertFile(
416
+ await downloadFile(hashResult.url, path.join(os.tmpdir(), path.basename(hashResult.url))),
417
+ hashResult.url
418
+ );
419
+ if (convertedResult.converted) {
420
+ const convertedSaveResult = await conversionService._saveConvertedFile(convertedResult.convertedPath, requestId);
421
+ response.converted = {
422
+ url: convertedSaveResult.url,
423
+ gcs: hashResult.converted.gcs
424
+ };
425
+ // Update the hash map with the new converted info
426
+ await setFileStoreMap(`${hashResult.hash}_converted`, response.converted);
427
+ }
428
+ } else {
429
+ context.log(`No converted info to add to final response`);
430
+ }
431
+
276
432
  //update redis timestamp with current time
277
433
  await setFileStoreMap(hash, hashResult);
278
434
 
279
435
  context.res = {
280
436
  status: 200,
281
- body: hashResult
437
+ body: response
282
438
  };
283
439
  return;
284
440
  }
285
441
 
286
442
  context.res = {
287
443
  status: 404,
288
- body: `Hash ${hash} not found`
444
+ body: `Hash ${hash} not found`,
289
445
  };
290
446
  return;
291
447
  }
292
448
 
293
- if (req.method.toLowerCase() === `post`) {
294
- await uploadBlob(context, req, !useAzure, null, hash);
295
- if(hash && context?.res?.body){
296
- await setFileStoreMap(hash, context.res.body);
449
+ if (req.method.toLowerCase() === 'post') {
450
+ const result = await uploadBlob(context, req, !useAzure, null, hash);
451
+ if (result?.hash && context?.res?.body) {
452
+ await setFileStoreMap(result.hash, context.res.body);
297
453
  }
298
- return
454
+ return;
299
455
  }
300
456
 
301
457
  if (!uri || !requestId) {
302
458
  context.res = {
303
459
  status: 400,
304
- body: "Please pass a uri and requestId on the query string or in the request body"
460
+ body: 'Please pass a uri and requestId on the query string or in the request body',
305
461
  };
306
462
  return;
307
463
  }
@@ -310,72 +466,96 @@ async function CortexFileHandler(context, req) {
310
466
  let completedCount = 0;
311
467
  let numberOfChunks;
312
468
 
313
- let file = ensureEncoded(uri); // encode url to handle special characters
469
+ const file = ensureEncoded(uri); // encode url to handle special characters
314
470
 
315
471
  const result = [];
316
472
 
317
473
  const sendProgress = async (data = null) => {
318
474
  completedCount++;
319
475
  const progress = completedCount / totalCount;
320
- await publishRequestProgress({ requestId, progress, completedCount, totalCount, numberOfChunks, data });
321
- }
476
+ await publishRequestProgress({
477
+ requestId,
478
+ progress,
479
+ completedCount,
480
+ totalCount,
481
+ numberOfChunks,
482
+ data,
483
+ });
484
+ };
322
485
 
323
486
  try {
324
- // Parse URL and get pathname without query parameters for extension check
487
+ // Parse URL and get pathname without query parameters for extension check
325
488
  const urlObj = new URL(uri);
326
489
  const pathWithoutQuery = urlObj.pathname;
327
-
328
- if (DOC_EXTENSIONS.some(ext => pathWithoutQuery.toLowerCase().endsWith(ext))) {
490
+
491
+ if (
492
+ DOC_EXTENSIONS.some((ext) => pathWithoutQuery.toLowerCase().endsWith(ext))
493
+ ) {
329
494
  const extension = path.extname(pathWithoutQuery).toLowerCase();
330
495
  const tempDir = path.join(os.tmpdir(), `${uuidv4()}`);
331
496
  fs.mkdirSync(tempDir);
332
497
  const downloadedFile = path.join(tempDir, `${uuidv4()}${extension}`);
333
498
  await downloadFile(uri, downloadedFile);
334
- const text = await documentToText(downloadedFile);
335
- let tmpPath;
336
499
 
337
500
  try {
338
501
  if (save) {
339
- const fileName = `${uuidv4()}.txt`; // generate unique file name
340
- const filePath = path.join(tempDir, fileName);
341
- tmpPath = filePath;
342
- fs.writeFileSync(filePath, text); // write text to file
343
-
344
- // save file to the cloud or local file system
345
- const saveResult = useAzure ? await saveFileToBlob(filePath, requestId) : await moveFileToPublicFolder(filePath, requestId);
346
- result.push(saveResult);
347
-
502
+ const saveResults = [];
503
+ const originalFileName = `${uuidv4()}_${encodeURIComponent(path.basename(downloadedFile))}`;
504
+ const originalFilePath = path.join(tempDir, originalFileName);
505
+ await fs.promises.copyFile(downloadedFile, originalFilePath);
506
+ let fileUrl;
507
+ if (useAzure) {
508
+ const savedBlob = await saveFileToBlob(originalFilePath, requestId);
509
+ fileUrl = savedBlob?.url;
510
+ } else {
511
+ fileUrl = await moveFileToPublicFolder(originalFilePath, requestId);
512
+ }
513
+ saveResults.push(fileUrl);
514
+ result.push(fileUrl);
348
515
  } else {
516
+ const text = await conversionService.convertFile(downloadedFile, uri, true);
349
517
  result.push(...easyChunker(text));
350
518
  }
351
- } catch(err) {
352
- console.log(`Error saving file ${uri} with request id ${requestId}:`, err);
519
+ } catch (err) {
520
+ console.log(
521
+ `Error saving file ${uri} with request id ${requestId}:`,
522
+ err,
523
+ );
524
+ throw err; // Re-throw to handle in outer catch
353
525
  } finally {
354
526
  try {
355
527
  // delete temporary files
356
- tmpPath && fs.unlinkSync(tmpPath);
357
- downloadedFile && fs.unlinkSync(downloadedFile);
358
- console.log(`Cleaned temp files ${tmpPath}, ${downloadedFile}`);
359
- } catch(err) {
360
- console.log(`Error cleaning temp files ${tmpPath}, ${downloadedFile}:`, err);
528
+ if (downloadedFile && fs.existsSync(downloadedFile)) {
529
+ fs.unlinkSync(downloadedFile);
530
+ console.log(`Cleaned temp file ${downloadedFile}`);
531
+ }
532
+ } catch (err) {
533
+ console.log(`Error cleaning temp file ${downloadedFile}:`, err);
361
534
  }
362
-
535
+
363
536
  try {
364
537
  //delete uploaded prev nontext file
365
538
  //check cleanup for uploaded files url
366
- const regex = new RegExp(`${AZURE_STORAGE_CONTAINER_NAME}/([a-z0-9-]+)`);
539
+ const regex = new RegExp(
540
+ `${AZURE_STORAGE_CONTAINER_NAME}/([a-z0-9-]+)`,
541
+ );
367
542
  const match = uri.match(regex);
368
543
  if (match && match[1]) {
369
544
  const extractedValue = match[1];
370
- useAzure ? await deleteBlob(extractedValue) : await deleteFolder(extractedValue);
371
- console.log(`Cleaned temp file ${uri} with request id ${extractedValue}`);
545
+ useAzure
546
+ ? await deleteBlob(extractedValue)
547
+ : await deleteFolder(extractedValue);
548
+ console.log(
549
+ `Cleaned temp file ${uri} with request id ${extractedValue}`,
550
+ );
372
551
  }
373
- } catch(err) {
552
+ } catch (err) {
374
553
  console.log(`Error cleaning temp file ${uri}:`, err);
375
554
  }
376
555
  }
377
556
  } else {
378
- const { chunkPromises, chunkOffsets, uniqueOutputPath } = await splitMediaFile(file);
557
+ const { chunkPromises, chunkOffsets, uniqueOutputPath } =
558
+ await splitMediaFile(file);
379
559
 
380
560
  numberOfChunks = chunkPromises.length; // for progress reporting
381
561
  totalCount += chunkPromises.length * 4; // 4 steps for each chunk (download and upload)
@@ -391,21 +571,24 @@ async function CortexFileHandler(context, req) {
391
571
  // sequential processing of chunks
392
572
  for (let index = 0; index < chunks.length; index++) {
393
573
  const chunkPath = chunks[index];
394
- let blobName;
395
- let gcsUrl;
396
-
574
+ let chunkUrl;
575
+ let chunkGcsUrl;
576
+
397
577
  if (useAzure) {
398
- blobName = await saveFileToBlob(chunkPath, requestId);
578
+ const savedBlob = await saveFileToBlob(chunkPath, requestId);
579
+ chunkUrl = savedBlob.url;
399
580
  } else {
400
- blobName = await moveFileToPublicFolder(chunkPath, requestId);
581
+ chunkUrl = await moveFileToPublicFolder(chunkPath, requestId);
401
582
  }
402
-
583
+
403
584
  // If GCS is configured, save to GCS
404
- gcsUrl = await uploadChunkToGCS(chunkPath, requestId);
405
-
585
+ chunkGcsUrl = await uploadChunkToGCS(chunkPath, requestId);
586
+
406
587
  const chunkOffset = chunkOffsets[index];
407
- result.push({ uri: blobName, offset: chunkOffset, gcs: gcsUrl });
408
- console.log(`Saved chunk as: ${blobName}${gcsUrl ? ` and ${gcsUrl}` : ''}`);
588
+ result.push({ uri: chunkUrl, offset: chunkOffset, gcs: chunkGcsUrl });
589
+ console.log(
590
+ `Saved chunk as: ${chunkUrl}${chunkGcsUrl ? ` and ${chunkGcsUrl}` : ''}`,
591
+ );
409
592
  await sendProgress();
410
593
  }
411
594
 
@@ -420,21 +603,26 @@ async function CortexFileHandler(context, req) {
420
603
  }
421
604
  }
422
605
  } catch (error) {
423
- console.error("An error occurred:", error);
606
+ console.error('An error occurred:', error);
424
607
  context.res = {
425
608
  status: 500,
426
- body: error.message || error
609
+ body: error.message || error,
427
610
  };
428
611
  return;
429
612
  }
430
613
 
431
- console.log('result:', result.map(item =>
432
- typeof item === 'object' ? JSON.stringify(item, null, 2) : item
433
- ).join('\n'));
434
-
614
+ console.log(
615
+ 'result:',
616
+ result
617
+ .map((item) =>
618
+ typeof item === 'object' ? JSON.stringify(item, null, 2) : item,
619
+ )
620
+ .join('\n'),
621
+ );
622
+
435
623
  context.res = {
436
- body: result
624
+ body: result,
437
625
  };
438
626
  }
439
627
 
440
- export default CortexFileHandler;
628
+ export default CortexFileHandler;