@aj-archipelago/cortex 1.1.18 → 1.1.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,176 +1,337 @@
-import fs from 'fs';
-import path from 'path';
-import { BlobServiceClient } from '@azure/storage-blob';
-import { v4 as uuidv4 } from 'uuid';
-import Busboy from 'busboy';
-import { PassThrough } from 'stream';
-import { pipeline as _pipeline } from 'stream';
-import { promisify } from 'util';
+import fs from "fs";
+import path from "path";
+import { BlobServiceClient } from "@azure/storage-blob";
+import { v4 as uuidv4 } from "uuid";
+import Busboy from "busboy";
+import { PassThrough } from "stream";
+import { pipeline as _pipeline } from "stream";
+import { promisify } from "util";
 const pipeline = promisify(_pipeline);
-import { join } from 'path';
+import { join } from "path";
+import { Storage } from "@google-cloud/storage";
+import axios from "axios";
+import { publicFolder, port, ipAddress } from "./start.js";
 
+const IMAGE_EXTENSIONS = [
+  ".jpg",
+  ".jpeg",
+  ".png",
+  ".gif",
+  ".bmp",
+  ".webp",
+  ".tiff",
+  ".svg",
+];
+
+const VIDEO_EXTENSIONS = [
+  ".mp4",
+  ".webm",
+  ".ogg",
+  ".mov",
+  ".avi",
+  ".flv",
+  ".wmv",
+  ".mkv",
+];
+
+function isBase64(str) {
+  try {
+    return btoa(atob(str)) == str;
+  } catch (err) {
+    return false;
+  }
+}
 
-import { publicFolder, port, ipAddress } from "./start.js";
+const GCP_SERVICE_ACCOUNT_KEY =
+  process.env.GCP_SERVICE_ACCOUNT_KEY_BASE64 ||
+  process.env.GCP_SERVICE_ACCOUNT_KEY ||
+  "{}";
+const GCP_SERVICE_ACCOUNT = isBase64(GCP_SERVICE_ACCOUNT_KEY)
+  ? JSON.parse(Buffer.from(GCP_SERVICE_ACCOUNT_KEY, "base64").toString())
+  : JSON.parse(GCP_SERVICE_ACCOUNT_KEY);
+const { project_id: GCP_PROJECT_ID } = GCP_SERVICE_ACCOUNT;
+
+let gcs;
+if (!GCP_PROJECT_ID || !GCP_SERVICE_ACCOUNT) {
+  console.warn(
+    "Google Cloud Project ID or Service Account details are missing"
+  );
+} else {
+  try {
+    gcs = new Storage({
+      projectId: GCP_PROJECT_ID,
+      credentials: GCP_SERVICE_ACCOUNT,
+    });
+
+    // Rest of your Google Cloud operations using gcs object
+  } catch (error) {
+    console.error(
+      "Provided Google Cloud Service Account details are invalid: ",
+      error
+    );
+  }
+}
+
+const GCS_BUCKETNAME = process.env.GCS_BUCKETNAME || "cortextempfiles";
 
 const getBlobClient = () => {
-    const connectionString = process.env.AZURE_STORAGE_CONNECTION_STRING;
-    const containerName = process.env.AZURE_STORAGE_CONTAINER_NAME;
-    if (!connectionString || !containerName) {
-        throw new Error('Missing Azure Storage connection string or container name environment variable');
-    }
+  const connectionString = process.env.AZURE_STORAGE_CONNECTION_STRING;
+  const containerName = process.env.AZURE_STORAGE_CONTAINER_NAME;
+  if (!connectionString || !containerName) {
+    throw new Error(
+      "Missing Azure Storage connection string or container name environment variable"
+    );
+  }
 
-    const blobServiceClient = BlobServiceClient.fromConnectionString(connectionString);
-    const containerClient = blobServiceClient.getContainerClient(containerName);
+  const blobServiceClient =
+    BlobServiceClient.fromConnectionString(connectionString);
+  const containerClient = blobServiceClient.getContainerClient(containerName);
 
-    return { blobServiceClient, containerClient };
-}
+  return { blobServiceClient, containerClient };
+};
 
 async function saveFileToBlob(chunkPath, requestId) {
-    const { containerClient } = getBlobClient();
-    // Use the filename with a UUID as the blob name
-    const blobName = `${requestId}/${uuidv4()}_${path.basename(chunkPath)}`;
+  const { containerClient } = getBlobClient();
+  // Use the filename with a UUID as the blob name
+  const blobName = `${requestId}/${uuidv4()}_${path.basename(chunkPath)}`;
 
-    // Create a read stream for the chunk file
-    const fileStream = fs.createReadStream(chunkPath);
+  // Create a read stream for the chunk file
+  const fileStream = fs.createReadStream(chunkPath);
 
-    // Upload the chunk to Azure Blob Storage using the stream
-    const blockBlobClient = containerClient.getBlockBlobClient(blobName);
-    await blockBlobClient.uploadStream(fileStream);
+  // Upload the chunk to Azure Blob Storage using the stream
+  const blockBlobClient = containerClient.getBlockBlobClient(blobName);
+  await blockBlobClient.uploadStream(fileStream);
 
-    // Return the full URI of the uploaded blob
-    const blobUrl = blockBlobClient.url;
-    return blobUrl;
+  // Return the full URI of the uploaded blob
+  const blobUrl = blockBlobClient.url;
+  return blobUrl;
 }
 
 //deletes blob that has the requestId
 async function deleteBlob(requestId) {
-    if (!requestId) throw new Error('Missing requestId parameter');
-    const { containerClient } = getBlobClient();
-    // List the blobs in the container with the specified prefix
-    const blobs = containerClient.listBlobsFlat({ prefix: `${requestId}/` });
+  if (!requestId) throw new Error("Missing requestId parameter");
+  const { containerClient } = getBlobClient();
+  // List the blobs in the container with the specified prefix
+  const blobs = containerClient.listBlobsFlat({ prefix: `${requestId}/` });
+
+  const result = [];
+  // Iterate through the blobs
+  for await (const blob of blobs) {
+    // Delete the matching blob
+    const blockBlobClient = containerClient.getBlockBlobClient(blob.name);
+    await blockBlobClient.delete();
+    console.log(`Cleaned blob: ${blob.name}`);
+    result.push(blob.name);
+  }
+
+  return result;
+}
 
-    const result = []
-    // Iterate through the blobs
-    for await (const blob of blobs) {
-        // Delete the matching blob
-        const blockBlobClient = containerClient.getBlockBlobClient(blob.name);
-        await blockBlobClient.delete();
-        console.log(`Cleaned blob: ${blob.name}`);
-        result.push(blob.name);
-    }
+async function uploadBlob(
+  context,
+  req,
+  saveToLocal = false,
+  useGoogle = false
+) {
+  return new Promise((resolve, reject) => {
+    try {
+      const busboy = Busboy({ headers: req.headers });
+      let requestId = uuidv4();
+      let body = {};
+
+      busboy.on("field", (fieldname, value) => {
+        if (fieldname === "requestId") {
+          requestId = value;
+        } else if (fieldname === "useGoogle") {
+          useGoogle = value;
+        }
+      });
+
+      busboy.on("file", async (fieldname, file, info) => {
+        //do not use google if file is not image or video
+        const ext = path.extname(info.filename).toLowerCase();
+        const canUseGoogle = IMAGE_EXTENSIONS.includes(ext) || VIDEO_EXTENSIONS.includes(ext);
+        if(!canUseGoogle) {
+          useGoogle = false;
+        }
 
-    return result
-}
+        //check if useGoogle is set but no gcs and warn
+        if(useGoogle && useGoogle !== "false" && !gcs) {
+          context.log.warn("Google Cloud Storage is not initialized reverting google upload ");
+          useGoogle = false;
+        }
+
+        if (saveToLocal) {
+          // Create the target folder if it doesn't exist
+          const localPath = join(publicFolder, requestId);
+          fs.mkdirSync(localPath, { recursive: true });
+
+          const filename = `${uuidv4()}_${info.filename}`;
+          const destinationPath = `${localPath}/${filename}`;
+
+          await pipeline(file, fs.createWriteStream(destinationPath));
+
+          const message = `File '${filename}' saved to folder successfully.`;
+          context.log(message);
+
+          const url = `http://${ipAddress}:${port}/files/${requestId}/${filename}`;
+
+          body = { message, url };
+
+          resolve(body); // Resolve the promise
+        } else {
+          const filename = `${requestId}/${uuidv4()}_${info.filename}`;
+          const { containerClient } = getBlobClient();
+
+          const blockBlobClient = containerClient.getBlockBlobClient(filename);
 
-async function uploadBlob(context, req, saveToLocal = false) {
-    return new Promise((resolve, reject) => {
-        try {
-            const busboy = Busboy({ headers: req.headers });
-            let requestId = uuidv4();
-
-            busboy.on('field', (fieldname, value) => {
-                if (fieldname === 'requestId') {
-                    requestId = value;
-                }
-            });
-
-            busboy.on('file', async (fieldname, file, info) => {
-                if (saveToLocal) {
-                    // Create the target folder if it doesn't exist
-                    const localPath = join(publicFolder, requestId);
-                    fs.mkdirSync(localPath, { recursive: true });
-
-                    const filename = `${uuidv4()}_${info.filename}`;
-                    const destinationPath = `${localPath}/${filename}`;
-
-                    await pipeline(file, fs.createWriteStream(destinationPath));
-
-                    const message = `File '${filename}' saved to folder successfully.`;
-                    context.log(message);
-
-                    const url = `http://${ipAddress}:${port}/files/${requestId}/${filename}`;
-
-                    const body = { message, url };
-
-                    context.res = {
-                        status: 200,
-                        body,
-                    };
-
-
-                    resolve(body); // Resolve the promise
-                } else {
-                    const { containerClient } = getBlobClient();
-                    const filename = `${requestId}/${uuidv4()}_${info.filename}`;
-
-                    const blockBlobClient = containerClient.getBlockBlobClient(filename);
-
-                    const passThroughStream = new PassThrough();
-                    file.pipe(passThroughStream);
-
-                    await blockBlobClient.uploadStream(passThroughStream);
-
-                    const message = `File '${filename}' uploaded successfully.`;
-                    const url = blockBlobClient.url;
-                    context.log(message);
-                    const body = { message, url };
-
-                    context.res = {
-                        status: 200,
-                        body,
-                    };
-
-                    resolve(body); // Resolve the promise
-                }
-            });
-
-            busboy.on('error', (error) => {
-                context.log.error('Error processing file upload:', error);
-                context.res = {
-                    status: 500,
-                    body: 'Error processing file upload.',
-                };
-                reject(error); // Reject the promise
-            });
-
-            req.pipe(busboy);
-        } catch (error) {
-            context.log.error('Error processing file upload:', error);
-            context.res = {
-                status: 500,
-                body: 'Error processing file upload.',
-            };
-            reject(error); // Reject the promise
+          const passThroughStream = new PassThrough();
+          file.pipe(passThroughStream);
+
+          await blockBlobClient.uploadStream(passThroughStream);
+
+          const message = `File '${filename}' uploaded successfully.`;
+          const url = blockBlobClient.url;
+          context.log(message);
+          body = { message, url };
         }
-    });
+
+        context.res = {
+          status: 200,
+          body,
+        };
+
+        if (useGoogle && useGoogle !== "false") {
+          const { url } = body;
+          const filename = `${requestId}/${uuidv4()}_${info.filename}`;
+          const gcsFile = gcs.bucket(GCS_BUCKETNAME).file(filename);
+          const writeStream = gcsFile.createWriteStream();
+
+          const response = await axios({
+            method: "get",
+            url: url,
+            responseType: "stream",
+          });
+
+          // Pipe the Axios response stream directly into the GCS Write Stream
+          response.data.pipe(writeStream);
+
+          await new Promise((resolve, reject) => {
+            writeStream.on("finish", resolve);
+            writeStream.on("error", reject);
+          });
+
+          body.gcs = `gs://${GCS_BUCKETNAME}/${filename}`;
+        }
+
+        resolve(body); // Resolve the promise
+      });
+
+      busboy.on("error", (error) => {
+        context.log.error("Error processing file upload:", error);
+        context.res = {
+          status: 500,
+          body: "Error processing file upload.",
+        };
+        reject(error); // Reject the promise
+      });
+
+      req.pipe(busboy);
+    } catch (error) {
+      context.log.error("Error processing file upload:", error);
+      context.res = {
+        status: 500,
+        body: "Error processing file upload.",
+      };
+      reject(error); // Reject the promise
+    }
+  });
 }
 
 // Function to delete files that haven't been used in more than a month
-async function cleanup() {
-    const { containerClient } = getBlobClient();
-
-    // List all the blobs in the container
-    const blobs = containerClient.listBlobsFlat();
-
-    // Calculate the date that is x month ago
+async function cleanup(urls=null) {
+    const { containerClient } = getBlobClient();
+
+    if(!urls) {
     const xMonthAgo = new Date();
     xMonthAgo.setMonth(xMonthAgo.getMonth() - 1);
-
-    // Iterate through the blobs
+
+    const blobs = containerClient.listBlobsFlat();
+    const cleanedURLs = [];
+
     for await (const blob of blobs) {
-        // Get the last modified date of the blob
         const lastModified = blob.properties.lastModified;
-
-        // Compare the last modified date with one month ago
         if (lastModified < xMonthAgo) {
-            // Delete the blob
            const blockBlobClient = containerClient.getBlockBlobClient(blob.name);
            await blockBlobClient.delete();
            console.log(`Cleaned blob: ${blob.name}`);
+           cleanedURLs.push(blob.name);
        }
    }
+
+    return cleanedURLs;
+    }else{
+        // Delete the blobs with the specified URLs
+        const cleanedURLs = [];
+        for(const url of urls) {
+            // Remove the base url to get the blob name
+            const blobName = url.replace(containerClient.url, '');
+            const blockBlobClient = containerClient.getBlockBlobClient(blobName);
+            await blockBlobClient.delete();
+            console.log(`Cleaned blob: ${blobName}`);
+            cleanedURLs.push(blobName);
+        }
+        return cleanedURLs;
+    }
+}
+
+async function cleanupGCS(urls=null) {
+    const bucket = gcs.bucket(GCS_BUCKETNAME);
+    const directories = new Set();
+    const cleanedURLs = [];
+
+    if(!urls){
+        const daysN = 30;
+        const thirtyDaysAgo = new Date(Date.now() - daysN * 24 * 60 * 60 * 1000);
+        const [files] = await bucket.getFiles();
+
+        for (const file of files) {
+            const [metadata] = await file.getMetadata();
+            const directoryPath = path.dirname(file.name);
+            directories.add(directoryPath);
+            if (metadata.updated) {
+                const updatedTime = new Date(metadata.updated);
+                if (updatedTime.getTime() < thirtyDaysAgo.getTime()) {
+                    console.log(`Cleaning file: ${file.name}`);
+                    await file.delete();
+                    cleanedURLs.push(file.name);
+                }
+            }
+        }
+    }else{
+        try {
+            for(const url of urls) {
+                const filename = path.join(url.split('/').slice(3).join('/'));
+                const file = bucket.file(filename);
+                const directoryPath = path.dirname(file.name);
+                directories.add(directoryPath);
+                await file.delete();
+                cleanedURLs.push(url);
+            }
+        }catch(error){
+            console.error(`Error cleaning up files: ${error}`);
+        }
+    }
+
+    for (const directory of directories) {
+        const [files] = await bucket.getFiles({ prefix: directory });
+        if (files.length === 0) {
+            console.log(`Deleting empty directory: ${directory}`);
+            await bucket.deleteFiles({ prefix: directory });
+        }
+    }
+
+    return cleanedURLs;
 }
 
-export {
-    saveFileToBlob, deleteBlob, uploadBlob, cleanup
-}
+export { saveFileToBlob, deleteBlob, uploadBlob, cleanup, cleanupGCS };
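For orientation, a minimal client-side sketch of the new `useGoogle` path added above: the busboy handler reads optional `requestId` and `useGoogle` multipart fields, mirrors image/video uploads to GCS when the Storage client initialized, and reports the mirror as a `gcs` property on the response body. The endpoint URL and helper name below are assumptions for illustration, not part of this package.

    // Sketch only: the endpoint URL is hypothetical.
    import fs from 'fs';
    import axios from 'axios';
    import FormData from 'form-data';

    async function uploadWithGcsMirror(filePath) {
      const form = new FormData();
      form.append('useGoogle', 'true'); // ignored unless the extension is image/video and GCS is configured
      form.append('file', fs.createReadStream(filePath));

      const { data } = await axios.post('http://localhost:7071/api/upload', form, {
        headers: form.getHeaders(),
      });
      // data = { message, url }, plus data.gcs = "gs://<bucket>/<name>" when mirrored
      return data;
    }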
@@ -1,6 +1,6 @@
 import { downloadFile, processYoutubeUrl, splitMediaFile } from './fileChunker.js';
-import { saveFileToBlob, deleteBlob, uploadBlob, cleanup } from './blobHandler.js';
-import { publishRequestProgress } from './redis.js';
+import { saveFileToBlob, deleteBlob, uploadBlob, cleanup, cleanupGCS } from './blobHandler.js';
+import { cleanupRedisFileStoreMap, getFileStoreMap, publishRequestProgress, removeFromFileStoreMap, setFileStoreMap } from './redis.js';
 import { deleteTempPath, ensureEncoded, isValidYoutubeUrl } from './helper.js';
 import { moveFileToPublicFolder, deleteFolder, cleanupLocal } from './localFileHandler.js';
 import { documentToText, easyChunker } from './docHelper.js';
@@ -8,6 +8,8 @@ import path from 'path';
 import os from 'os';
 import { v4 as uuidv4 } from 'uuid';
 import fs from 'fs';
+import http from 'http';
+import https from 'https';
 
 const DOC_EXTENSIONS = [".txt", ".json", ".csv", ".md", ".xml", ".js", ".html", ".css", '.pdf', '.docx', '.xlsx', '.csv'];
 
@@ -16,15 +18,58 @@ console.log(useAzure ? 'Using Azure Storage' : 'Using local file system');
 
 
 let isCleanupRunning = false;
-async function cleanupInactive(useAzure) {
+async function cleanupInactive() {
     try {
         if (isCleanupRunning) { return; } //no need to cleanup every call
         isCleanupRunning = true;
-        if (useAzure) {
-            await cleanup();
-        } else {
-            await cleanupLocal();
+        const cleaned = await cleanupRedisFileStoreMap();
+
+        const cleanedAzure = [];
+        const cleanedLocal = [];
+        const cleanedGCS = [];
+
+        for(const key in cleaned){
+            const item = cleaned[key];
+            const {url,gcs} = item;
+            if(url){
+                if(url.includes('.blob.core.windows.net/')){
+                    cleanedAzure.push(url);
+                }else if(url.startsWith('gs://')){
+                    cleanedGCS.push(url);
+                }else{
+                    cleanedLocal.push(url);
+                }
+            }
+
+            if(item && item.gcs){
+                cleanedGCS.push(gcs);
+            }
+        }
+
+        try {
+            if (cleanedAzure && cleanedAzure.length > 0) {
+                await cleanup(cleanedAzure);
+            }
+        } catch (error) {
+            console.log('Error occurred during azure cleanup:', error);
+        }
+
+        try {
+            if (cleanedLocal && cleanedLocal.length > 0) {
+                await cleanupLocal(cleanedLocal);
+            }
+        }catch(err){
+            console.log('Error occurred during local cleanup:', err);
+        }
+
+        try{
+            if(cleanedGCS && cleanedGCS.length > 0){
+                await cleanupGCS(cleanedGCS);
+            }
+        }catch(err){
+            console.log('Error occurred during GCS cleanup:', err);
         }
+
     } catch (error) {
         console.log('Error occurred during cleanup:', error);
     } finally{
@@ -32,11 +77,27 @@ async function cleanupInactive(useAzure) {
     }
 }
 
+async function urlExists(url) {
+    if(!url) return false;
+    const httpModule = url.startsWith('https') ? https : http;
+
+    return new Promise((resolve) => {
+        httpModule
+            .get(url, function (response) {
+                // Check if the response status is OK
+                resolve(response.statusCode === 200);
+            })
+            .on('error', function () {
+                resolve(false);
+            });
+    });
+}
+
 
 async function main(context, req) {
     context.log('Starting req processing..');
 
-    cleanupInactive(useAzure); //trigger & no need to wait for it
+    cleanupInactive(); //trigger & no need to wait for it
 
     // Clean up blob when request delete which means processing marked completed
     if (req.method.toLowerCase() === `delete`) {
@@ -55,13 +116,40 @@ async function main(context, req) {
         return;
     }
 
+    const { uri, requestId, save, hash, checkHash } = req.body?.params || req.query;
+
+    if(hash && checkHash){ //check if hash exists
+        context.log(`Checking hash: ${hash}`);
+        const result = await getFileStoreMap(hash);
+
+        const exists = await urlExists(result?.url);
+
+        if(!exists){
+            await removeFromFileStoreMap(hash);
+            return;
+        }
+
+        if(result){
+            context.log(`Hash exists: ${hash}`);
+            //update redis timestamp with current time
+            await setFileStoreMap(hash, result);
+        }
+        context.res = {
+            body: result
+        };
+        return;
+    }
+
     if (req.method.toLowerCase() === `post`) {
-        const { url } = await uploadBlob(context, req, !useAzure);
+        const { useGoogle } = req.body?.params || req.query;
+        const { url } = await uploadBlob(context, req, !useAzure, useGoogle);
         context.log(`File url: ${url}`);
+        if(hash && context?.res?.body){ //save hash after upload
+            await setFileStoreMap(hash, context.res.body);
+        }
         return
     }
 
-    const { uri, requestId, save } = req.body?.params || req.query;
     if (!uri || !requestId) {
         context.res = {
             status: 400,
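The hash flow introduced in this last hunk is self-contained: a request carrying `hash` with `checkHash` looks up a prior upload in the Redis file-store map, drops the entry if its stored URL no longer resolves (via `urlExists`), and refreshes its timestamp otherwise; passing the same `hash` on a POST caches the upload result for future lookups. A hedged sketch of that round trip from a client (the base URL is an assumption; the param names come from the handler above):

    // Sketch only: BASE is a hypothetical mount point for this handler.
    import axios from 'axios';
    const BASE = 'http://localhost:7071/api/upload';

    // Ask whether a file with this content hash was uploaded before;
    // an empty response means a fresh upload is needed.
    async function findExisting(hash) {
      const { data } = await axios.get(BASE, { params: { hash, checkHash: true } });
      return data || null; // { message, url, gcs? } when the cached URL still resolves
    }

    // On a fresh upload, send the same hash as a query param (multipart
    // bodies carry no params) so the handler stores context.res.body under it:
    // await axios.post(`${BASE}?hash=${hash}`, form, { headers: form.getHeaders() });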