@aj-archipelago/cortex 1.3.49 → 1.3.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config.js +1 -1
- package/helper-apps/cortex-browser/Dockerfile +19 -31
- package/helper-apps/cortex-browser/function_app.py +708 -181
- package/helper-apps/cortex-browser/requirements.txt +4 -4
- package/helper-apps/cortex-file-handler/blobHandler.js +850 -429
- package/helper-apps/cortex-file-handler/constants.js +64 -48
- package/helper-apps/cortex-file-handler/docHelper.js +7 -114
- package/helper-apps/cortex-file-handler/fileChunker.js +96 -51
- package/helper-apps/cortex-file-handler/function.json +2 -6
- package/helper-apps/cortex-file-handler/helper.js +34 -25
- package/helper-apps/cortex-file-handler/index.js +324 -136
- package/helper-apps/cortex-file-handler/localFileHandler.js +56 -57
- package/helper-apps/cortex-file-handler/package-lock.json +6065 -5964
- package/helper-apps/cortex-file-handler/package.json +8 -4
- package/helper-apps/cortex-file-handler/redis.js +23 -17
- package/helper-apps/cortex-file-handler/scripts/setup-azure-container.js +12 -9
- package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +21 -18
- package/helper-apps/cortex-file-handler/scripts/test-azure.sh +1 -1
- package/helper-apps/cortex-file-handler/scripts/test-gcs.sh +1 -1
- package/helper-apps/cortex-file-handler/services/ConversionService.js +288 -0
- package/helper-apps/cortex-file-handler/services/FileConversionService.js +53 -0
- package/helper-apps/cortex-file-handler/start.js +63 -38
- package/helper-apps/cortex-file-handler/tests/FileConversionService.test.js +144 -0
- package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +88 -64
- package/helper-apps/cortex-file-handler/tests/fileChunker.test.js +114 -91
- package/helper-apps/cortex-file-handler/tests/fileUpload.test.js +351 -0
- package/helper-apps/cortex-file-handler/tests/files/DOCX_TestPage.docx +0 -0
- package/helper-apps/cortex-file-handler/tests/files/tests-example.xls +0 -0
- package/helper-apps/cortex-file-handler/tests/start.test.js +943 -642
- package/helper-apps/cortex-file-handler/tests/testUtils.helper.js +31 -0
- package/helper-apps/cortex-markitdown/.funcignore +1 -0
- package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/__init__.py +64 -0
- package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/function.json +21 -0
- package/helper-apps/cortex-markitdown/README.md +94 -0
- package/helper-apps/cortex-markitdown/host.json +15 -0
- package/helper-apps/cortex-markitdown/requirements.txt +2 -0
- package/lib/requestExecutor.js +44 -36
- package/package.json +1 -1
- package/pathways/system/entity/tools/sys_tool_cognitive_search.js +1 -1
- package/pathways/system/entity/tools/sys_tool_readfile.js +24 -2
- package/server/plugins/openAiWhisperPlugin.js +59 -87
- package/helper-apps/cortex-file-handler/tests/docHelper.test.js +0 -148
|
@@ -1,24 +1,58 @@
|
|
|
1
|
-
import
|
|
2
|
-
import { saveFileToBlob, deleteBlob, deleteGCS, uploadBlob, cleanup, cleanupGCS, gcsUrlExists, ensureGCSUpload, gcs, AZURE_STORAGE_CONTAINER_NAME, uploadChunkToGCS } from './blobHandler.js';
|
|
3
|
-
import { cleanupRedisFileStoreMap, getFileStoreMap, publishRequestProgress, removeFromFileStoreMap, setFileStoreMap } from './redis.js';
|
|
4
|
-
import { ensureEncoded, ensureFileExtension, urlExists } from './helper.js';
|
|
5
|
-
import { moveFileToPublicFolder, deleteFolder, cleanupLocal } from './localFileHandler.js';
|
|
6
|
-
import { documentToText, easyChunker } from './docHelper.js';
|
|
7
|
-
import { DOC_EXTENSIONS } from './constants.js';
|
|
8
|
-
import path from 'path';
|
|
1
|
+
import fs from 'fs';
|
|
9
2
|
import os from 'os';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
|
|
10
5
|
import { v4 as uuidv4 } from 'uuid';
|
|
11
|
-
|
|
6
|
+
|
|
7
|
+
import {
|
|
8
|
+
saveFileToBlob,
|
|
9
|
+
deleteBlob,
|
|
10
|
+
deleteGCS,
|
|
11
|
+
uploadBlob,
|
|
12
|
+
cleanup,
|
|
13
|
+
cleanupGCS,
|
|
14
|
+
gcsUrlExists,
|
|
15
|
+
ensureGCSUpload,
|
|
16
|
+
gcs,
|
|
17
|
+
AZURE_STORAGE_CONTAINER_NAME,
|
|
18
|
+
uploadChunkToGCS,
|
|
19
|
+
downloadFromGCS,
|
|
20
|
+
} from './blobHandler.js';
|
|
21
|
+
import { DOC_EXTENSIONS, CONVERTED_EXTENSIONS } from './constants.js';
|
|
22
|
+
import { easyChunker } from './docHelper.js';
|
|
23
|
+
import { downloadFile, splitMediaFile } from './fileChunker.js';
|
|
24
|
+
import { ensureEncoded, ensureFileExtension, urlExists } from './helper.js';
|
|
25
|
+
import {
|
|
26
|
+
moveFileToPublicFolder,
|
|
27
|
+
deleteFolder,
|
|
28
|
+
cleanupLocal,
|
|
29
|
+
} from './localFileHandler.js';
|
|
30
|
+
import {
|
|
31
|
+
cleanupRedisFileStoreMap,
|
|
32
|
+
getFileStoreMap,
|
|
33
|
+
publishRequestProgress,
|
|
34
|
+
removeFromFileStoreMap,
|
|
35
|
+
setFileStoreMap,
|
|
36
|
+
} from './redis.js';
|
|
37
|
+
import { FileConversionService } from './services/FileConversionService.js';
|
|
12
38
|
|
|
13
39
|
const useAzure = process.env.AZURE_STORAGE_CONNECTION_STRING ? true : false;
|
|
14
|
-
const useGCS =
|
|
40
|
+
const useGCS =
|
|
41
|
+
process.env.GCP_SERVICE_ACCOUNT_KEY_BASE64 ||
|
|
42
|
+
process.env.GCP_SERVICE_ACCOUNT_KEY
|
|
43
|
+
? true
|
|
44
|
+
: false;
|
|
15
45
|
|
|
16
|
-
console.log(
|
|
46
|
+
console.log(
|
|
47
|
+
`Storage configuration - ${useAzure ? 'Azure' : 'Local'} Storage${useGCS ? ' and Google Cloud Storage' : ''}`,
|
|
48
|
+
);
|
|
17
49
|
|
|
18
50
|
let isCleanupRunning = false;
|
|
19
51
|
async function cleanupInactive(context) {
|
|
20
52
|
try {
|
|
21
|
-
if (isCleanupRunning) {
|
|
53
|
+
if (isCleanupRunning) {
|
|
54
|
+
return;
|
|
55
|
+
} //no need to cleanup every call
|
|
22
56
|
isCleanupRunning = true;
|
|
23
57
|
const cleaned = await cleanupRedisFileStoreMap();
|
|
24
58
|
|
|
@@ -26,24 +60,24 @@ async function cleanupInactive(context) {
|
|
|
26
60
|
const cleanedLocal = [];
|
|
27
61
|
const cleanedGCS = [];
|
|
28
62
|
|
|
29
|
-
for(const key in cleaned){
|
|
63
|
+
for (const key in cleaned) {
|
|
30
64
|
const item = cleaned[key];
|
|
31
|
-
const {url,gcs} = item;
|
|
32
|
-
if(url){
|
|
33
|
-
if(url.includes('.blob.core.windows.net/')){
|
|
65
|
+
const { url, gcs } = item;
|
|
66
|
+
if (url) {
|
|
67
|
+
if (url.includes('.blob.core.windows.net/')) {
|
|
34
68
|
cleanedAzure.push(url);
|
|
35
|
-
}else if(url.startsWith('gs://')){
|
|
69
|
+
} else if (url.startsWith('gs://')) {
|
|
36
70
|
cleanedGCS.push(url);
|
|
37
|
-
}else{
|
|
71
|
+
} else {
|
|
38
72
|
cleanedLocal.push(url);
|
|
39
73
|
}
|
|
40
74
|
}
|
|
41
75
|
|
|
42
|
-
if(item && item.gcs){
|
|
76
|
+
if (item && item.gcs) {
|
|
43
77
|
cleanedGCS.push(gcs);
|
|
44
78
|
}
|
|
45
79
|
}
|
|
46
|
-
|
|
80
|
+
|
|
47
81
|
try {
|
|
48
82
|
if (cleanedAzure && cleanedAzure.length > 0) {
|
|
49
83
|
await cleanup(context, cleanedAzure);
|
|
@@ -56,85 +90,110 @@ async function cleanupInactive(context) {
|
|
|
56
90
|
if (cleanedLocal && cleanedLocal.length > 0) {
|
|
57
91
|
await cleanupLocal(cleanedLocal);
|
|
58
92
|
}
|
|
59
|
-
}catch(err){
|
|
93
|
+
} catch (err) {
|
|
60
94
|
console.log('Error occurred during local cleanup:', err);
|
|
61
95
|
}
|
|
62
96
|
|
|
63
|
-
try{
|
|
64
|
-
if(cleanedGCS && cleanedGCS.length > 0){
|
|
97
|
+
try {
|
|
98
|
+
if (cleanedGCS && cleanedGCS.length > 0) {
|
|
65
99
|
await cleanupGCS(cleanedGCS);
|
|
66
100
|
}
|
|
67
|
-
}catch(err){
|
|
101
|
+
} catch (err) {
|
|
68
102
|
console.log('Error occurred during GCS cleanup:', err);
|
|
69
103
|
}
|
|
70
|
-
|
|
71
104
|
} catch (error) {
|
|
72
105
|
console.log('Error occurred during cleanup:', error);
|
|
73
|
-
} finally{
|
|
106
|
+
} finally {
|
|
74
107
|
isCleanupRunning = false;
|
|
75
108
|
}
|
|
76
109
|
}
|
|
77
110
|
|
|
78
111
|
async function CortexFileHandler(context, req) {
|
|
79
|
-
const {
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
112
|
+
const {
|
|
113
|
+
uri,
|
|
114
|
+
requestId,
|
|
115
|
+
save,
|
|
116
|
+
hash,
|
|
117
|
+
checkHash,
|
|
118
|
+
clearHash,
|
|
119
|
+
fetch,
|
|
120
|
+
load,
|
|
121
|
+
restore,
|
|
122
|
+
} = req.body?.params || req.query;
|
|
123
|
+
const operation = save
|
|
124
|
+
? 'save'
|
|
125
|
+
: checkHash
|
|
126
|
+
? 'checkHash'
|
|
127
|
+
: clearHash
|
|
128
|
+
? 'clearHash'
|
|
129
|
+
: fetch || load || restore
|
|
130
|
+
? 'remoteFile'
|
|
131
|
+
: req.method.toLowerCase() === 'delete' ||
|
|
132
|
+
req.query.operation === 'delete'
|
|
133
|
+
? 'delete'
|
|
134
|
+
: uri
|
|
135
|
+
? DOC_EXTENSIONS.some((ext) => uri.toLowerCase().endsWith(ext))
|
|
136
|
+
? 'document_processing'
|
|
137
|
+
: 'media_chunking'
|
|
138
|
+
: 'upload';
|
|
139
|
+
|
|
140
|
+
context.log(
|
|
141
|
+
`Processing ${req.method} request - ${requestId ? `requestId: ${requestId}, ` : ''}${uri ? `uri: ${uri}, ` : ''}${hash ? `hash: ${hash}, ` : ''}operation: ${operation}`,
|
|
142
|
+
);
|
|
89
143
|
|
|
90
144
|
cleanupInactive(context); //trigger & no need to wait for it
|
|
91
145
|
|
|
146
|
+
// Initialize conversion service
|
|
147
|
+
const conversionService = new FileConversionService(context, useAzure);
|
|
148
|
+
|
|
92
149
|
// Clean up blob when request delete which means processing marked completed
|
|
93
150
|
if (operation === 'delete') {
|
|
94
151
|
const deleteRequestId = req.query.requestId || requestId;
|
|
95
152
|
if (!deleteRequestId) {
|
|
96
153
|
context.res = {
|
|
97
154
|
status: 400,
|
|
98
|
-
body:
|
|
155
|
+
body: 'Please pass a requestId on the query string',
|
|
99
156
|
};
|
|
100
157
|
return;
|
|
101
158
|
}
|
|
102
|
-
|
|
159
|
+
|
|
103
160
|
// Delete from Azure/Local storage
|
|
104
|
-
const azureResult = useAzure
|
|
161
|
+
const azureResult = useAzure
|
|
162
|
+
? await deleteBlob(deleteRequestId)
|
|
163
|
+
: await deleteFolder(deleteRequestId);
|
|
105
164
|
const gcsResult = [];
|
|
106
165
|
if (gcs) {
|
|
107
|
-
gcsResult.push(...await deleteGCS(deleteRequestId));
|
|
166
|
+
gcsResult.push(...(await deleteGCS(deleteRequestId)));
|
|
108
167
|
}
|
|
109
|
-
|
|
168
|
+
|
|
110
169
|
context.res = {
|
|
111
170
|
status: 200,
|
|
112
|
-
body: { body: [...azureResult, ...gcsResult] }
|
|
171
|
+
body: { body: [...azureResult, ...gcsResult] },
|
|
113
172
|
};
|
|
114
173
|
return;
|
|
115
174
|
}
|
|
116
175
|
|
|
117
176
|
const remoteUrl = fetch || restore || load;
|
|
118
|
-
if (req.method.toLowerCase() ===
|
|
177
|
+
if (req.method.toLowerCase() === 'get' && remoteUrl) {
|
|
119
178
|
context.log(`Remote file: ${remoteUrl}`);
|
|
120
|
-
let filename;
|
|
179
|
+
let filename; // Declare filename outside try block
|
|
121
180
|
try {
|
|
122
181
|
// Validate URL format and accessibility
|
|
123
182
|
const urlCheck = await urlExists(remoteUrl);
|
|
124
183
|
if (!urlCheck.valid) {
|
|
125
184
|
context.res = {
|
|
126
185
|
status: 400,
|
|
127
|
-
body: 'Invalid or inaccessible URL'
|
|
186
|
+
body: 'Invalid or inaccessible URL',
|
|
128
187
|
};
|
|
129
188
|
return;
|
|
130
189
|
}
|
|
131
190
|
|
|
132
191
|
// Check if file already exists (using hash as the key)
|
|
133
|
-
|
|
134
|
-
if(exists){
|
|
192
|
+
const exists = await getFileStoreMap(remoteUrl);
|
|
193
|
+
if (exists) {
|
|
135
194
|
context.res = {
|
|
136
195
|
status: 200,
|
|
137
|
-
body: exists
|
|
196
|
+
body: exists,
|
|
138
197
|
};
|
|
139
198
|
//update redis timestamp with current time
|
|
140
199
|
await setFileStoreMap(remoteUrl, exists);
|
|
@@ -143,28 +202,38 @@ async function CortexFileHandler(context, req) {
|
|
|
143
202
|
|
|
144
203
|
// Download the file first
|
|
145
204
|
const urlObj = new URL(remoteUrl);
|
|
146
|
-
let originalFileName = path.basename(urlObj.pathname);
|
|
205
|
+
let originalFileName = decodeURIComponent(path.basename(urlObj.pathname));
|
|
147
206
|
if (!originalFileName || originalFileName === '') {
|
|
148
207
|
originalFileName = urlObj.hostname;
|
|
149
208
|
}
|
|
150
|
-
|
|
209
|
+
|
|
151
210
|
// Ensure the filename has the correct extension based on content type
|
|
152
|
-
originalFileName = ensureFileExtension(
|
|
211
|
+
originalFileName = ensureFileExtension(
|
|
212
|
+
originalFileName,
|
|
213
|
+
urlCheck.contentType,
|
|
214
|
+
);
|
|
153
215
|
|
|
154
216
|
const maxLength = 200; // Set the maximum length for the filename
|
|
155
217
|
let truncatedFileName = originalFileName;
|
|
156
218
|
if (originalFileName.length > maxLength) {
|
|
157
219
|
const extension = path.extname(originalFileName);
|
|
158
220
|
const basename = path.basename(originalFileName, extension);
|
|
159
|
-
truncatedFileName =
|
|
221
|
+
truncatedFileName =
|
|
222
|
+
basename.substring(0, maxLength - extension.length) + extension;
|
|
160
223
|
}
|
|
161
224
|
|
|
162
225
|
// Use the original-truncated file name when saving the downloaded file
|
|
163
226
|
filename = path.join(os.tmpdir(), truncatedFileName);
|
|
164
227
|
await downloadFile(remoteUrl, filename);
|
|
165
|
-
|
|
228
|
+
|
|
166
229
|
// Now upload the downloaded file
|
|
167
|
-
const res = await uploadBlob(
|
|
230
|
+
const res = await uploadBlob(
|
|
231
|
+
context,
|
|
232
|
+
null,
|
|
233
|
+
!useAzure,
|
|
234
|
+
filename,
|
|
235
|
+
remoteUrl,
|
|
236
|
+
);
|
|
168
237
|
|
|
169
238
|
//Update Redis (using hash as the key)
|
|
170
239
|
await setFileStoreMap(remoteUrl, res);
|
|
@@ -175,10 +244,10 @@ async function CortexFileHandler(context, req) {
|
|
|
175
244
|
body: res,
|
|
176
245
|
};
|
|
177
246
|
} catch (error) {
|
|
178
|
-
context.log(
|
|
247
|
+
context.log('Error processing remote file request:', error);
|
|
179
248
|
context.res = {
|
|
180
249
|
status: 500,
|
|
181
|
-
body: `Error processing file: ${error.message}
|
|
250
|
+
body: `Error processing file: ${error.message}`,
|
|
182
251
|
};
|
|
183
252
|
} finally {
|
|
184
253
|
// Cleanup temp file if it exists
|
|
@@ -187,121 +256,208 @@ async function CortexFileHandler(context, req) {
|
|
|
187
256
|
fs.unlinkSync(filename);
|
|
188
257
|
}
|
|
189
258
|
} catch (err) {
|
|
190
|
-
context.log(
|
|
259
|
+
context.log('Error cleaning up temp file:', err);
|
|
191
260
|
}
|
|
192
261
|
}
|
|
193
262
|
return;
|
|
194
263
|
}
|
|
195
264
|
|
|
196
|
-
if(hash && clearHash){
|
|
265
|
+
if (hash && clearHash) {
|
|
197
266
|
try {
|
|
198
267
|
const hashValue = await getFileStoreMap(hash);
|
|
199
268
|
if (hashValue) {
|
|
200
269
|
await removeFromFileStoreMap(hash);
|
|
201
270
|
context.res = {
|
|
202
271
|
status: 200,
|
|
203
|
-
body: `Hash ${hash} removed
|
|
272
|
+
body: `Hash ${hash} removed`,
|
|
204
273
|
};
|
|
205
274
|
} else {
|
|
206
275
|
context.res = {
|
|
207
276
|
status: 404,
|
|
208
|
-
body: `Hash ${hash} not found
|
|
277
|
+
body: `Hash ${hash} not found`,
|
|
209
278
|
};
|
|
210
279
|
}
|
|
211
280
|
} catch (error) {
|
|
212
281
|
context.res = {
|
|
213
282
|
status: 500,
|
|
214
|
-
body: `Error occurred during hash cleanup: ${error}
|
|
283
|
+
body: `Error occurred during hash cleanup: ${error}`,
|
|
215
284
|
};
|
|
216
285
|
console.log('Error occurred during hash cleanup:', error);
|
|
217
286
|
}
|
|
218
287
|
return;
|
|
219
288
|
}
|
|
220
289
|
|
|
221
|
-
if(hash && checkHash){
|
|
290
|
+
if (hash && checkHash) {
|
|
222
291
|
let hashResult = await getFileStoreMap(hash);
|
|
223
292
|
|
|
224
|
-
if(hashResult){
|
|
293
|
+
if (hashResult) {
|
|
225
294
|
context.log(`File exists in map: ${hash}`);
|
|
226
|
-
|
|
295
|
+
|
|
296
|
+
// Log the URL retrieved from Redis before checking existence
|
|
297
|
+
context.log(`Checking existence of URL from Redis: ${hashResult?.url}`);
|
|
298
|
+
|
|
299
|
+
// Detect double-encoding in the blob name
|
|
300
|
+
if (hashResult.url) {
|
|
301
|
+
const urlPath = hashResult.url.split('?')[0];
|
|
302
|
+
const blobName = urlPath.substring(urlPath.lastIndexOf('/') + 1);
|
|
303
|
+
if (/%25[0-9A-Fa-f]{2}/.test(blobName)) {
|
|
304
|
+
context.log(
|
|
305
|
+
`Double-encoded blob detected for hash ${hash}. Invalidating cache entry.`,
|
|
306
|
+
);
|
|
307
|
+
await removeFromFileStoreMap(hash);
|
|
308
|
+
context.res = {
|
|
309
|
+
status: 404,
|
|
310
|
+
body: `Hash ${hash} is double-encoded and has been invalidated. Please re-upload.`,
|
|
311
|
+
};
|
|
312
|
+
return;
|
|
313
|
+
}
|
|
314
|
+
}
|
|
315
|
+
|
|
227
316
|
// Check primary storage (Azure/Local) first
|
|
228
317
|
const primaryExists = await urlExists(hashResult?.url);
|
|
229
318
|
const gcsExists = gcs ? await gcsUrlExists(hashResult?.gcs) : false;
|
|
230
319
|
|
|
231
320
|
// If neither storage has the file, remove from map and return not found
|
|
232
321
|
if (!primaryExists.valid && !gcsExists) {
|
|
233
|
-
context.log(
|
|
322
|
+
context.log(
|
|
323
|
+
`File not found in any storage. Removing from map: ${hash}`,
|
|
324
|
+
);
|
|
234
325
|
await removeFromFileStoreMap(hash);
|
|
235
326
|
context.res = {
|
|
236
327
|
status: 404,
|
|
237
|
-
body: `Hash ${hash} not found in storage
|
|
328
|
+
body: `Hash ${hash} not found in storage`,
|
|
238
329
|
};
|
|
239
330
|
return;
|
|
240
331
|
}
|
|
241
332
|
|
|
333
|
+
// If GCS is missing but primary exists, restore to GCS
|
|
334
|
+
else if (primaryExists.valid && gcs && !gcsExists) {
|
|
335
|
+
context.log(`GCS file missing, restoring from primary: ${hash}`);
|
|
336
|
+
const { gcs: _, ...fileInfo } = hashResult;
|
|
337
|
+
hashResult = await ensureGCSUpload(context, fileInfo);
|
|
338
|
+
}
|
|
339
|
+
|
|
242
340
|
// If primary is missing but GCS exists, restore from GCS
|
|
243
341
|
if (!primaryExists.valid && gcsExists) {
|
|
244
342
|
context.log(`Primary storage file missing, restoring from GCS: ${hash}`);
|
|
245
343
|
try {
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
344
|
+
// Create a temporary file to store the downloaded content
|
|
345
|
+
const tempDir = path.join(os.tmpdir(), `${uuidv4()}`);
|
|
346
|
+
fs.mkdirSync(tempDir);
|
|
347
|
+
const downloadedFile = path.join(tempDir, path.basename(hashResult.gcs));
|
|
348
|
+
|
|
349
|
+
// Download from GCS using the new function
|
|
350
|
+
await downloadFromGCS(hashResult.gcs, downloadedFile);
|
|
351
|
+
|
|
352
|
+
// Upload to primary storage
|
|
353
|
+
const res = await uploadBlob(
|
|
354
|
+
context,
|
|
355
|
+
null,
|
|
356
|
+
!useAzure,
|
|
357
|
+
downloadedFile,
|
|
358
|
+
hash
|
|
359
|
+
);
|
|
360
|
+
|
|
361
|
+
// Update the hash result with the new primary storage URL
|
|
362
|
+
hashResult.url = res.url;
|
|
363
|
+
|
|
364
|
+
// Clean up temp file
|
|
365
|
+
try {
|
|
366
|
+
if (downloadedFile && fs.existsSync(downloadedFile)) {
|
|
367
|
+
fs.unlinkSync(downloadedFile);
|
|
368
|
+
}
|
|
369
|
+
if (tempDir && fs.existsSync(tempDir)) {
|
|
370
|
+
fs.rmSync(tempDir, { recursive: true });
|
|
371
|
+
}
|
|
372
|
+
} catch (err) {
|
|
373
|
+
console.log('Error cleaning up temp files:', err);
|
|
252
374
|
}
|
|
253
375
|
} catch (error) {
|
|
254
376
|
console.error('Error restoring from GCS:', error);
|
|
255
377
|
}
|
|
256
378
|
}
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
const { gcs: _, ...fileInfo } = hashResult; // eslint-disable-line no-unused-vars
|
|
261
|
-
hashResult = await ensureGCSUpload(context, fileInfo);
|
|
262
|
-
}
|
|
379
|
+
|
|
380
|
+
// Ensure converted version exists if needed
|
|
381
|
+
hashResult = await conversionService.ensureConvertedVersion(hashResult, requestId);
|
|
263
382
|
|
|
264
383
|
// Final check to ensure we have at least one valid storage location
|
|
265
384
|
const finalPrimaryCheck = await urlExists(hashResult?.url);
|
|
266
|
-
if (!finalPrimaryCheck.valid && !await gcsUrlExists(hashResult?.gcs)) {
|
|
385
|
+
if (!finalPrimaryCheck.valid && !(await gcsUrlExists(hashResult?.gcs))) {
|
|
267
386
|
context.log(`Failed to restore file. Removing from map: ${hash}`);
|
|
268
387
|
await removeFromFileStoreMap(hash);
|
|
269
388
|
context.res = {
|
|
270
389
|
status: 404,
|
|
271
|
-
body: `Hash ${hash} not found and restoration failed
|
|
390
|
+
body: `Hash ${hash} not found and restoration failed`,
|
|
272
391
|
};
|
|
273
392
|
return;
|
|
274
393
|
}
|
|
275
394
|
|
|
395
|
+
// Create the response object
|
|
396
|
+
const response = {
|
|
397
|
+
message: `File '${hashResult.filename}' ${useAzure ? 'uploaded' : 'saved'} successfully.`,
|
|
398
|
+
filename: hashResult.filename,
|
|
399
|
+
url: hashResult.url,
|
|
400
|
+
gcs: hashResult.gcs,
|
|
401
|
+
hash: hashResult.hash,
|
|
402
|
+
timestamp: new Date().toISOString()
|
|
403
|
+
};
|
|
404
|
+
|
|
405
|
+
// Add converted info if it exists and has a valid URL
|
|
406
|
+
if (hashResult.converted?.url) {
|
|
407
|
+
context.log(`Adding converted info to final response`);
|
|
408
|
+
response.converted = {
|
|
409
|
+
url: hashResult.converted.url,
|
|
410
|
+
gcs: hashResult.converted.gcs
|
|
411
|
+
};
|
|
412
|
+
} else if (hashResult.converted?.gcs) {
|
|
413
|
+
// If we only have GCS URL, trigger conversion
|
|
414
|
+
context.log(`Only GCS URL exists for converted file, triggering conversion`);
|
|
415
|
+
const convertedResult = await conversionService.convertFile(
|
|
416
|
+
await downloadFile(hashResult.url, path.join(os.tmpdir(), path.basename(hashResult.url))),
|
|
417
|
+
hashResult.url
|
|
418
|
+
);
|
|
419
|
+
if (convertedResult.converted) {
|
|
420
|
+
const convertedSaveResult = await conversionService._saveConvertedFile(convertedResult.convertedPath, requestId);
|
|
421
|
+
response.converted = {
|
|
422
|
+
url: convertedSaveResult.url,
|
|
423
|
+
gcs: hashResult.converted.gcs
|
|
424
|
+
};
|
|
425
|
+
// Update the hash map with the new converted info
|
|
426
|
+
await setFileStoreMap(`${hashResult.hash}_converted`, response.converted);
|
|
427
|
+
}
|
|
428
|
+
} else {
|
|
429
|
+
context.log(`No converted info to add to final response`);
|
|
430
|
+
}
|
|
431
|
+
|
|
276
432
|
//update redis timestamp with current time
|
|
277
433
|
await setFileStoreMap(hash, hashResult);
|
|
278
434
|
|
|
279
435
|
context.res = {
|
|
280
436
|
status: 200,
|
|
281
|
-
body:
|
|
437
|
+
body: response
|
|
282
438
|
};
|
|
283
439
|
return;
|
|
284
440
|
}
|
|
285
441
|
|
|
286
442
|
context.res = {
|
|
287
443
|
status: 404,
|
|
288
|
-
body: `Hash ${hash} not found
|
|
444
|
+
body: `Hash ${hash} not found`,
|
|
289
445
|
};
|
|
290
446
|
return;
|
|
291
447
|
}
|
|
292
448
|
|
|
293
|
-
if (req.method.toLowerCase() ===
|
|
294
|
-
await uploadBlob(context, req, !useAzure, null, hash);
|
|
295
|
-
if(hash && context?.res?.body){
|
|
296
|
-
await setFileStoreMap(hash, context.res.body);
|
|
449
|
+
if (req.method.toLowerCase() === 'post') {
|
|
450
|
+
const result = await uploadBlob(context, req, !useAzure, null, hash);
|
|
451
|
+
if (result?.hash && context?.res?.body) {
|
|
452
|
+
await setFileStoreMap(result.hash, context.res.body);
|
|
297
453
|
}
|
|
298
|
-
return
|
|
454
|
+
return;
|
|
299
455
|
}
|
|
300
456
|
|
|
301
457
|
if (!uri || !requestId) {
|
|
302
458
|
context.res = {
|
|
303
459
|
status: 400,
|
|
304
|
-
body:
|
|
460
|
+
body: 'Please pass a uri and requestId on the query string or in the request body',
|
|
305
461
|
};
|
|
306
462
|
return;
|
|
307
463
|
}
|
|
@@ -310,72 +466,96 @@ async function CortexFileHandler(context, req) {
|
|
|
310
466
|
let completedCount = 0;
|
|
311
467
|
let numberOfChunks;
|
|
312
468
|
|
|
313
|
-
|
|
469
|
+
const file = ensureEncoded(uri); // encode url to handle special characters
|
|
314
470
|
|
|
315
471
|
const result = [];
|
|
316
472
|
|
|
317
473
|
const sendProgress = async (data = null) => {
|
|
318
474
|
completedCount++;
|
|
319
475
|
const progress = completedCount / totalCount;
|
|
320
|
-
await publishRequestProgress({
|
|
321
|
-
|
|
476
|
+
await publishRequestProgress({
|
|
477
|
+
requestId,
|
|
478
|
+
progress,
|
|
479
|
+
completedCount,
|
|
480
|
+
totalCount,
|
|
481
|
+
numberOfChunks,
|
|
482
|
+
data,
|
|
483
|
+
});
|
|
484
|
+
};
|
|
322
485
|
|
|
323
486
|
try {
|
|
324
|
-
|
|
487
|
+
// Parse URL and get pathname without query parameters for extension check
|
|
325
488
|
const urlObj = new URL(uri);
|
|
326
489
|
const pathWithoutQuery = urlObj.pathname;
|
|
327
|
-
|
|
328
|
-
if (
|
|
490
|
+
|
|
491
|
+
if (
|
|
492
|
+
DOC_EXTENSIONS.some((ext) => pathWithoutQuery.toLowerCase().endsWith(ext))
|
|
493
|
+
) {
|
|
329
494
|
const extension = path.extname(pathWithoutQuery).toLowerCase();
|
|
330
495
|
const tempDir = path.join(os.tmpdir(), `${uuidv4()}`);
|
|
331
496
|
fs.mkdirSync(tempDir);
|
|
332
497
|
const downloadedFile = path.join(tempDir, `${uuidv4()}${extension}`);
|
|
333
498
|
await downloadFile(uri, downloadedFile);
|
|
334
|
-
const text = await documentToText(downloadedFile);
|
|
335
|
-
let tmpPath;
|
|
336
499
|
|
|
337
500
|
try {
|
|
338
501
|
if (save) {
|
|
339
|
-
const
|
|
340
|
-
const
|
|
341
|
-
|
|
342
|
-
fs.
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
502
|
+
const saveResults = [];
|
|
503
|
+
const originalFileName = `${uuidv4()}_${encodeURIComponent(path.basename(downloadedFile))}`;
|
|
504
|
+
const originalFilePath = path.join(tempDir, originalFileName);
|
|
505
|
+
await fs.promises.copyFile(downloadedFile, originalFilePath);
|
|
506
|
+
let fileUrl;
|
|
507
|
+
if (useAzure) {
|
|
508
|
+
const savedBlob = await saveFileToBlob(originalFilePath, requestId);
|
|
509
|
+
fileUrl = savedBlob?.url;
|
|
510
|
+
} else {
|
|
511
|
+
fileUrl = await moveFileToPublicFolder(originalFilePath, requestId);
|
|
512
|
+
}
|
|
513
|
+
saveResults.push(fileUrl);
|
|
514
|
+
result.push(fileUrl);
|
|
348
515
|
} else {
|
|
516
|
+
const text = await conversionService.convertFile(downloadedFile, uri, true);
|
|
349
517
|
result.push(...easyChunker(text));
|
|
350
518
|
}
|
|
351
|
-
} catch(err) {
|
|
352
|
-
console.log(
|
|
519
|
+
} catch (err) {
|
|
520
|
+
console.log(
|
|
521
|
+
`Error saving file ${uri} with request id ${requestId}:`,
|
|
522
|
+
err,
|
|
523
|
+
);
|
|
524
|
+
throw err; // Re-throw to handle in outer catch
|
|
353
525
|
} finally {
|
|
354
526
|
try {
|
|
355
527
|
// delete temporary files
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
528
|
+
if (downloadedFile && fs.existsSync(downloadedFile)) {
|
|
529
|
+
fs.unlinkSync(downloadedFile);
|
|
530
|
+
console.log(`Cleaned temp file ${downloadedFile}`);
|
|
531
|
+
}
|
|
532
|
+
} catch (err) {
|
|
533
|
+
console.log(`Error cleaning temp file ${downloadedFile}:`, err);
|
|
361
534
|
}
|
|
362
|
-
|
|
535
|
+
|
|
363
536
|
try {
|
|
364
537
|
//delete uploaded prev nontext file
|
|
365
538
|
//check cleanup for uploaded files url
|
|
366
|
-
const regex = new RegExp(
|
|
539
|
+
const regex = new RegExp(
|
|
540
|
+
`${AZURE_STORAGE_CONTAINER_NAME}/([a-z0-9-]+)`,
|
|
541
|
+
);
|
|
367
542
|
const match = uri.match(regex);
|
|
368
543
|
if (match && match[1]) {
|
|
369
544
|
const extractedValue = match[1];
|
|
370
|
-
useAzure
|
|
371
|
-
|
|
545
|
+
useAzure
|
|
546
|
+
? await deleteBlob(extractedValue)
|
|
547
|
+
: await deleteFolder(extractedValue);
|
|
548
|
+
console.log(
|
|
549
|
+
`Cleaned temp file ${uri} with request id ${extractedValue}`,
|
|
550
|
+
);
|
|
372
551
|
}
|
|
373
|
-
} catch(err) {
|
|
552
|
+
} catch (err) {
|
|
374
553
|
console.log(`Error cleaning temp file ${uri}:`, err);
|
|
375
554
|
}
|
|
376
555
|
}
|
|
377
556
|
} else {
|
|
378
|
-
const { chunkPromises, chunkOffsets, uniqueOutputPath } =
|
|
557
|
+
const { chunkPromises, chunkOffsets, uniqueOutputPath } =
|
|
558
|
+
await splitMediaFile(file);
|
|
379
559
|
|
|
380
560
|
numberOfChunks = chunkPromises.length; // for progress reporting
|
|
381
561
|
totalCount += chunkPromises.length * 4; // 4 steps for each chunk (download and upload)
|
|
@@ -391,21 +571,24 @@ async function CortexFileHandler(context, req) {
|
|
|
391
571
|
// sequential processing of chunks
|
|
392
572
|
for (let index = 0; index < chunks.length; index++) {
|
|
393
573
|
const chunkPath = chunks[index];
|
|
394
|
-
let
|
|
395
|
-
let
|
|
396
|
-
|
|
574
|
+
let chunkUrl;
|
|
575
|
+
let chunkGcsUrl;
|
|
576
|
+
|
|
397
577
|
if (useAzure) {
|
|
398
|
-
|
|
578
|
+
const savedBlob = await saveFileToBlob(chunkPath, requestId);
|
|
579
|
+
chunkUrl = savedBlob.url;
|
|
399
580
|
} else {
|
|
400
|
-
|
|
581
|
+
chunkUrl = await moveFileToPublicFolder(chunkPath, requestId);
|
|
401
582
|
}
|
|
402
|
-
|
|
583
|
+
|
|
403
584
|
// If GCS is configured, save to GCS
|
|
404
|
-
|
|
405
|
-
|
|
585
|
+
chunkGcsUrl = await uploadChunkToGCS(chunkPath, requestId);
|
|
586
|
+
|
|
406
587
|
const chunkOffset = chunkOffsets[index];
|
|
407
|
-
result.push({ uri:
|
|
408
|
-
console.log(
|
|
588
|
+
result.push({ uri: chunkUrl, offset: chunkOffset, gcs: chunkGcsUrl });
|
|
589
|
+
console.log(
|
|
590
|
+
`Saved chunk as: ${chunkUrl}${chunkGcsUrl ? ` and ${chunkGcsUrl}` : ''}`,
|
|
591
|
+
);
|
|
409
592
|
await sendProgress();
|
|
410
593
|
}
|
|
411
594
|
|
|
@@ -420,21 +603,26 @@ async function CortexFileHandler(context, req) {
|
|
|
420
603
|
}
|
|
421
604
|
}
|
|
422
605
|
} catch (error) {
|
|
423
|
-
console.error(
|
|
606
|
+
console.error('An error occurred:', error);
|
|
424
607
|
context.res = {
|
|
425
608
|
status: 500,
|
|
426
|
-
body: error.message || error
|
|
609
|
+
body: error.message || error,
|
|
427
610
|
};
|
|
428
611
|
return;
|
|
429
612
|
}
|
|
430
613
|
|
|
431
|
-
console.log(
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
614
|
+
console.log(
|
|
615
|
+
'result:',
|
|
616
|
+
result
|
|
617
|
+
.map((item) =>
|
|
618
|
+
typeof item === 'object' ? JSON.stringify(item, null, 2) : item,
|
|
619
|
+
)
|
|
620
|
+
.join('\n'),
|
|
621
|
+
);
|
|
622
|
+
|
|
435
623
|
context.res = {
|
|
436
|
-
body: result
|
|
624
|
+
body: result,
|
|
437
625
|
};
|
|
438
626
|
}
|
|
439
627
|
|
|
440
|
-
export default CortexFileHandler;
|
|
628
|
+
export default CortexFileHandler;
|