npm - @aj-archipelago/cortex - Versions diffs - 1.3.11 → 1.3.14 - Mend

@aj-archipelago/cortex 1.3.11 → 1.3.14

This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (29)

package/helper-apps/cortex-file-handler/index.js CHANGED Viewed

@@ -1,28 +1,22 @@
-import { downloadFile, processYoutubeUrl, splitMediaFile } from './fileChunker.js';
-import { saveFileToBlob, deleteBlob, uploadBlob, cleanup, cleanupGCS, gcsUrlExists } from './blobHandler.js';
+import { downloadFile, splitMediaFile } from './fileChunker.js';
+import { saveFileToBlob, deleteBlob, deleteGCS, uploadBlob, cleanup, cleanupGCS, gcsUrlExists, ensureGCSUpload, gcs, AZURE_STORAGE_CONTAINER_NAME } from './blobHandler.js';
 import { cleanupRedisFileStoreMap, getFileStoreMap, publishRequestProgress, removeFromFileStoreMap, setFileStoreMap } from './redis.js';
-import { deleteTempPath, ensureEncoded, isValidYoutubeUrl } from './helper.js';
+import { ensureEncoded, ensureFileExtension, urlExists } from './helper.js';
 import { moveFileToPublicFolder, deleteFolder, cleanupLocal } from './localFileHandler.js';
 import { documentToText, easyChunker } from './docHelper.js';
+import { DOC_EXTENSIONS } from './constants.js';
 import path from 'path';
 import os from 'os';
 import { v4 as uuidv4 } from 'uuid';
 import fs from 'fs';
-import http from 'http';
-import https from 'https';
-import axios from "axios";
-import { pipeline } from "stream";
-import { promisify } from "util";
-const pipelineUtility = promisify(pipeline); // To pipe streams using async/await
-const DOC_EXTENSIONS =  [".txt", ".json", ".csv", ".md", ".xml", ".js", ".html", ".css", '.pdf', '.docx', '.xlsx', '.csv'];
 const useAzure = process.env.AZURE_STORAGE_CONNECTION_STRING ? true : false;
-console.log(useAzure ? 'Using Azure Storage' : 'Using local file system');
+const useGCS = process.env.GCP_SERVICE_ACCOUNT_KEY_BASE64 || process.env.GCP_SERVICE_ACCOUNT_KEY ? true : false;
+console.log(`Storage configuration - ${useAzure ? 'Azure' : 'Local'} Storage${useGCS ? ' and Google Cloud Storage' : ''}`);
 let isCleanupRunning = false;
-async function cleanupInactive() {
+async function cleanupInactive(context) {
     try {
         if (isCleanupRunning) { return; } //no need to cleanup every call
         isCleanupRunning = true;
@@ -52,7 +46,7 @@ async function cleanupInactive() {
         try {
             if (cleanedAzure && cleanedAzure.length > 0) {
-                await cleanup(cleanedAzure);
+                await cleanup(context, cleanedAzure);
             }
         } catch (error) {
             console.log('Error occurred during azure cleanup:', error);
@@ -81,96 +75,141 @@ async function cleanupInactive() {
     }
 }
-async function urlExists(url) {
-  if(!url) return false;
-  const httpModule = url.startsWith('https') ? https : http;
-  return new Promise((resolve) => {
-    httpModule
-      .get(url, function (response) {
-        // Check if the response status is OK
-        resolve(response.statusCode === 200);
-      })
-      .on('error', function () {
-        resolve(false);
-      });
-  });
-}
-async function main(context, req) {
-    context.log('Starting req processing..');
+async function CortexFileHandler(context, req) {
+    const { uri, requestId, save, hash, checkHash, clearHash, fetch, load, restore } = req.body?.params || req.query;
+    const operation = save ? 'save' :
+                     checkHash ? 'checkHash' :
+                     clearHash ? 'clearHash' :
+                     fetch || load || restore ? 'remoteFile' :
+                     req.method.toLowerCase() === 'delete' || req.query.operation === 'delete' ? 'delete' :
+                     uri ? (DOC_EXTENSIONS.some(ext => uri.toLowerCase().endsWith(ext)) ? 'document_processing' : 'media_chunking') :
+                     'upload';
+    context.log(`Processing ${req.method} request - ${requestId ? `requestId: ${requestId}, ` : ''}${uri ? `uri: ${uri}, ` : ''}${hash ? `hash: ${hash}, ` : ''}operation: ${operation}`);
-    cleanupInactive(); //trigger & no need to wait for it
+    cleanupInactive(context); //trigger & no need to wait for it
     // Clean up blob when request delete which means processing marked completed
-    if (req.method.toLowerCase() === `delete`) {
-        const { requestId } = req.query;
-        if (!requestId) {
+    if (operation === 'delete') {
+        const deleteRequestId = req.query.requestId || requestId;
+        if (!deleteRequestId) {
             context.res = {
                 status: 400,
                 body: "Please pass a requestId on the query string"
             };
             return;
         }
-        const result = useAzure ? await deleteBlob(requestId) : await deleteFolder(requestId);
+        // Delete from Azure/Local storage
+        const azureResult = useAzure ? await deleteBlob(deleteRequestId) : await deleteFolder(deleteRequestId);
+        const gcsResult = [];
+        if (gcs) {
+            for (const blobName of azureResult) {
+                gcsResult.push(...await deleteGCS(blobName));
+            }
+        }
         context.res = {
-            body: result
+            status: 200,
+            body: { body: [...azureResult, ...gcsResult] }
         };
         return;
     }
-    const { uri, requestId, save, hash, checkHash, clearHash, fetch, load, restore } = req.body?.params || req.query;
-    const filepond = fetch || restore || load;
-    if (req.method.toLowerCase() === `get` && filepond) {
-        context.log(`Remote file: ${filepond}`);
-        // Check if file already exists (using hash as the key)
-        const exists = await getFileStoreMap(filepond);
-        if(exists){
-            context.res = {
-                status: 200,
-                body: exists // existing file URL
-            };
-            return;
-        }
+    const remoteUrl = fetch || restore || load;
+    if (req.method.toLowerCase() === `get` && remoteUrl) {
+        context.log(`Remote file: ${remoteUrl}`);
+        let filename;  // Declare filename outside try block
+        try {
+            // Validate URL format and accessibility
+            const urlCheck = await urlExists(remoteUrl);
+            if (!urlCheck.valid) {
+                context.res = {
+                    status: 400,
+                    body: 'Invalid or inaccessible URL'
+                };
+                return;
+            }
-        // Check if it's a youtube url
-        let youtubeDownloadedFile = null;
-        if(isValidYoutubeUrl(filepond)){
-            youtubeDownloadedFile = await processYoutubeUrl(filepond, true);
-        }
-        const filename = path.join(os.tmpdir(), path.basename(youtubeDownloadedFile || filepond));
-        // Download the remote file to a local/temporary location keep name & ext
-        if(!youtubeDownloadedFile){
-            const response = await axios.get(filepond, { responseType: "stream" });
-            await pipelineUtility(response.data, fs.createWriteStream(filename));
-        }
+            // Check if file already exists (using hash as the key)
+            let exists = await getFileStoreMap(remoteUrl);
+            if(exists){
+                context.res = {
+                    status: 200,
+                    body: exists
+                };
+                //update redis timestamp with current time
+                await setFileStoreMap(remoteUrl, exists);
+                return;
+            }
-        const res = await uploadBlob(context, null, !useAzure, true, filename);
-        context.log(`File uploaded: ${JSON.stringify(res)}`);
+            // Download the file first
+            const urlObj = new URL(remoteUrl);
+            let originalFileName = path.basename(urlObj.pathname);
+            if (!originalFileName || originalFileName === '') {
+                originalFileName = urlObj.hostname;
+            }
+            // Ensure the filename has the correct extension based on content type
+            originalFileName = ensureFileExtension(originalFileName, urlCheck.contentType);
+            const maxLength = 200; // Set the maximum length for the filename
+            let truncatedFileName = originalFileName;
+            if (originalFileName.length > maxLength) {
+                const extension = path.extname(originalFileName);
+                const basename = path.basename(originalFileName, extension);
+                truncatedFileName = basename.substring(0, maxLength - extension.length) + extension;
+            }
-        //Update Redis (using hash as the key)
-        await setFileStoreMap(filepond, res);
+            // Use the original-truncated file name when saving the downloaded file
+            filename = path.join(os.tmpdir(), truncatedFileName);
+            await downloadFile(remoteUrl, filename);
+            // Now upload the downloaded file
+            const res = await uploadBlob(context, null, !useAzure, filename, remoteUrl);
-        // Return the file URL
-        context.res = {
-            status: 200,
-            body: res,
-        };
+            //Update Redis (using hash as the key)
+            await setFileStoreMap(remoteUrl, res);
+            // Return the file URL
+            context.res = {
+                status: 200,
+                body: res,
+            };
+        } catch (error) {
+            context.log("Error processing remote file request:", error);
+            context.res = {
+                status: 500,
+                body: `Error processing file: ${error.message}`
+            };
+        } finally {
+            // Cleanup temp file if it exists
+            try {
+                if (filename && fs.existsSync(filename)) {
+                    fs.unlinkSync(filename);
+                }
+            } catch (err) {
+                context.log("Error cleaning up temp file:", err);
+            }
+        }
         return;
     }
     if(hash && clearHash){
         try {
             const hashValue = await getFileStoreMap(hash);
-            await removeFromFileStoreMap(hash);
-            context.res = {
-                status: 200,
-                body: hashValue ? `Hash ${hash} removed` : `Hash ${hash} not found`
-            };
+            if (hashValue) {
+                await removeFromFileStoreMap(hash);
+                context.res = {
+                    status: 200,
+                    body: `Hash ${hash} removed`
+                };
+            } else {
+                context.res = {
+                    status: 404,
+                    body: `Hash ${hash} not found`
+                };
+            }
         } catch (error) {
             context.res = {
                 status: 500,
@@ -178,37 +217,84 @@ async function main(context, req) {
             };
             console.log('Error occurred during hash cleanup:', error);
         }
-        return
+        return;
     }
     if(hash && checkHash){ //check if hash exists
-        context.log(`Checking hash: ${hash}`);
-        const result = await getFileStoreMap(hash);
+        let hashResult = await getFileStoreMap(hash);
+        if(hashResult){
+            context.log(`File exists in map: ${hash}`);
+            // Check primary storage (Azure/Local) first
+            const primaryExists = await urlExists(hashResult?.url);
+            const gcsExists = gcs ? await gcsUrlExists(hashResult?.gcs) : false;
+            // If neither storage has the file, remove from map and return not found
+            if (!primaryExists.valid && !gcsExists) {
+                context.log(`File not found in any storage. Removing from map: ${hash}`);
+                await removeFromFileStoreMap(hash);
+                context.res = {
+                    status: 404,
+                    body: `Hash ${hash} not found in storage`
+                };
+                return;
+            }
-        if(result){
-            const exists = await urlExists(result?.url);
-            const gcsExists = await gcsUrlExists(result?.gcs);
+            // If primary is missing but GCS exists, restore from GCS
+            if (!primaryExists.valid && gcsExists) {
+                context.log(`Primary storage file missing, restoring from GCS: ${hash}`);
+                try {
+                    const res = await CortexFileHandler(context, {
+                        method: 'GET',
+                        body: { params: { fetch: hashResult.gcs } }
+                    });
+                    if (res?.body?.url) {
+                        hashResult.url = res.body.url;
+                    }
+                } catch (error) {
+                    console.error('Error restoring from GCS:', error);
+                }
+            }
+            // If GCS is missing but primary exists, restore to GCS
+            else if (primaryExists.valid && gcs && !gcsExists) {
+                context.log(`GCS file missing, restoring from primary: ${hash}`);
+                const { gcs: _, ...fileInfo } = hashResult; // eslint-disable-line no-unused-vars
+                hashResult = await ensureGCSUpload(context, fileInfo);
+            }
-            if(!exists || !gcsExists){
+            // Final check to ensure we have at least one valid storage location
+            const finalPrimaryCheck = await urlExists(hashResult?.url);
+            if (!finalPrimaryCheck.valid && !await gcsUrlExists(hashResult?.gcs)) {
+                context.log(`Failed to restore file. Removing from map: ${hash}`);
                 await removeFromFileStoreMap(hash);
+                context.res = {
+                    status: 404,
+                    body: `Hash ${hash} not found and restoration failed`
+                };
                 return;
             }
-            context.log(`Hash exists: ${hash}`);
             //update redis timestamp with current time
-            await setFileStoreMap(hash, result);
+            await setFileStoreMap(hash, hashResult);
+            context.res = {
+                status: 200,
+                body: hashResult
+            };
+            return;
         }
         context.res = {
-            body: result
+            status: 404,
+            body: `Hash ${hash} not found`
         };
         return;
     }
     if (req.method.toLowerCase() === `post`) {
-        const { useGoogle } = req.body?.params || req.query;
-        const { url } = await uploadBlob(context, req, !useAzure, useGoogle, null, hash);
-        context.log(`File url: ${url}`);
-        if(hash && context?.res?.body){ //save hash after upload
+        await uploadBlob(context, req, !useAzure, null, hash);
+        if(hash && context?.res?.body){
             await setFileStoreMap(hash, context.res.body);
         }
         return
@@ -227,8 +313,6 @@ async function main(context, req) {
     let numberOfChunks;
     let file = ensureEncoded(uri); // encode url to handle special characters
-    let folder;
-    const isYoutubeUrl = isValidYoutubeUrl(uri);
     const result = [];
@@ -238,20 +322,24 @@ async function main(context, req) {
         await publishRequestProgress({ requestId, progress, completedCount, totalCount, numberOfChunks, data });
     }
-    const isDocument = DOC_EXTENSIONS.some(ext => uri.toLowerCase().endsWith(ext));
     try {
-        if (isDocument) {
-            const extension = path.extname(uri).toLowerCase();
-            const file = path.join(os.tmpdir(), `${uuidv4()}${extension}`);
-            await downloadFile(uri, file)
-            const text = await documentToText(file);
+        // Parse URL and get pathname without query parameters for extension check
+        const urlObj = new URL(uri);
+        const pathWithoutQuery = urlObj.pathname;
+        if (DOC_EXTENSIONS.some(ext => pathWithoutQuery.toLowerCase().endsWith(ext))) {
+            const extension = path.extname(pathWithoutQuery).toLowerCase();
+            const tempDir = path.join(os.tmpdir(), `${uuidv4()}`);
+            fs.mkdirSync(tempDir);
+            const downloadedFile = path.join(tempDir, `${uuidv4()}${extension}`);
+            await downloadFile(uri, downloadedFile);
+            const text = await documentToText(downloadedFile);
             let tmpPath;
-            try{
+            try {
                 if (save) {
                     const fileName = `${uuidv4()}.txt`; // generate unique file name
-                    const filePath = path.join(os.tmpdir(), fileName);
+                    const filePath = path.join(tempDir, fileName);
                     tmpPath = filePath;
                     fs.writeFileSync(filePath, text); // write text to file
@@ -262,79 +350,73 @@ async function main(context, req) {
                 } else {
                     result.push(...easyChunker(text));
                 }
-            }catch(err){
+            } catch(err) {
                 console.log(`Error saving file ${uri} with request id ${requestId}:`, err);
-            }finally{
-                try{
+            } finally {
+                try {
                     // delete temporary files
                     tmpPath && fs.unlinkSync(tmpPath);
-                    file && fs.unlinkSync(file);
-                    console.log(`Cleaned temp files ${tmpPath}, ${file}`);
-                }catch(err){
-                    console.log(`Error cleaning temp files ${tmpPath}, ${file}:`, err);
+                    downloadedFile && fs.unlinkSync(downloadedFile);
+                    console.log(`Cleaned temp files ${tmpPath}, ${downloadedFile}`);
+                } catch(err) {
+                    console.log(`Error cleaning temp files ${tmpPath}, ${downloadedFile}:`, err);
                 }
-                try{
+                try {
                     //delete uploaded prev nontext file
-                    //check cleanup for whisper temp uploaded files url
-                    const regex = /whispertempfiles\/([a-z0-9-]+)/;
+                    //check cleanup for uploaded files url
+                    const regex = new RegExp(`${AZURE_STORAGE_CONTAINER_NAME}/([a-z0-9-]+)`);
                     const match = uri.match(regex);
                     if (match && match[1]) {
                         const extractedValue = match[1];
                         useAzure ? await deleteBlob(extractedValue) : await deleteFolder(extractedValue);
                         console.log(`Cleaned temp file ${uri} with request id ${extractedValue}`);
                     }
-                }catch(err){
+                } catch(err) {
                     console.log(`Error cleaning temp file ${uri}:`, err);
                 }
             }
-        }else{
-            if (isYoutubeUrl) {
-                // totalCount += 1; // extra 1 step for youtube download
-                const processAsVideo = req.body?.params?.processAsVideo || req.query?.processAsVideo;
-                file = await processYoutubeUrl(file, processAsVideo);
-            }
+        } else {
             const { chunkPromises, chunkOffsets, uniqueOutputPath } = await splitMediaFile(file);
-            folder = uniqueOutputPath;
             numberOfChunks = chunkPromises.length; // for progress reporting
             totalCount += chunkPromises.length * 4; // 4 steps for each chunk (download and upload)
-            // isYoutubeUrl && sendProgress(); // send progress for youtube download after total count is calculated
             // sequential download of chunks
             const chunks = [];
             for (const chunkPromise of chunkPromises) {
-                chunks.push(await chunkPromise);
-                sendProgress();
+                const chunkPath = await chunkPromise;
+                chunks.push(chunkPath);
+                await sendProgress();
             }
             // sequential processing of chunks
             for (let index = 0; index < chunks.length; index++) {
-                const chunk = chunks[index];
-                const blobName = useAzure ? await saveFileToBlob(chunk, requestId) : await moveFileToPublicFolder(chunk, requestId);
+                const chunkPath = chunks[index];
+                const blobName = useAzure ? await saveFileToBlob(chunkPath, requestId) : await moveFileToPublicFolder(chunkPath, requestId);
                 const chunkOffset = chunkOffsets[index];
-                result.push({ uri:blobName, offset:chunkOffset });
-                context.log(`Saved chunk as: ${blobName}`);
-                sendProgress();
+                result.push({ uri: blobName, offset: chunkOffset });
+                console.log(`Saved chunk as: ${blobName}`);
+                await sendProgress();
             }
-            // parallel processing, dropped
-            // result = await Promise.all(mediaSplit.chunks.map(processChunk));
+            // Cleanup the temp directory
+            try {
+                if (uniqueOutputPath && fs.existsSync(uniqueOutputPath)) {
+                    fs.rmSync(uniqueOutputPath, { recursive: true });
+                    console.log(`Cleaned temp directory: ${uniqueOutputPath}`);
+                }
+            } catch (err) {
+                console.log(`Error cleaning temp directory ${uniqueOutputPath}:`, err);
+            }
         }
     } catch (error) {
         console.error("An error occurred:", error);
-        context.res.status(500);
-        context.res.body = error.message || error;
+        context.res = {
+            status: 500,
+            body: error.message || error
+        };
         return;
-    } finally {
-        try {
-            (isYoutubeUrl) && (await deleteTempPath(file));
-            folder && (await deleteTempPath(folder));
-        } catch (error) {
-            console.error("An error occurred while deleting:", error);
-        }
     }
     console.log('result:', result.map(item =>
@@ -344,8 +426,6 @@ async function main(context, req) {
     context.res = {
         body: result
     };
 }
-export default main;
+export default CortexFileHandler;

package/helper-apps/cortex-file-handler/localFileHandler.js CHANGED Viewed

@@ -25,13 +25,31 @@ async function moveFileToPublicFolder(chunkPath, requestId) {
 async function deleteFolder(requestId) {
     if (!requestId) throw new Error('Missing requestId parameter');
     const targetFolder = join(publicFolder, requestId);
-    await fs.rm(targetFolder, { recursive: true });
-    console.log(`Cleaned folder: ${targetFolder}`);
+    try {
+        // Check if folder exists first
+        const stats = await fs.stat(targetFolder);
+        if (stats.isDirectory()) {
+            // Get list of files before deleting
+            const files = await fs.readdir(targetFolder);
+            const deletedFiles = files.map(file => join(requestId, file));
+            // Delete the folder
+            await fs.rm(targetFolder, { recursive: true });
+            console.log(`Cleaned folder: ${targetFolder}`);
+            return deletedFiles;
+        }
+        return [];
+    } catch (error) {
+        if (error.code === 'ENOENT') {
+            // Folder doesn't exist, return empty array
+            return [];
+        }
+        throw error;
+    }
 }
 async function cleanupLocal(urls=null) {
+  const cleanedUrls = [];
   if(!urls){
-    const cleanedUrls = []; // initialize array for holding cleaned file URLs
     try {
       // Read the directory
       const items = await fs.readdir(publicFolder);