npm - @aj-archipelago/cortex - Versions diffs - 1.3.49 → 1.3.51 - Mend

@aj-archipelago/cortex 1.3.49 → 1.3.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

package/helper-apps/cortex-file-handler/tests/testUtils.helper.js ADDED Viewed

@@ -0,0 +1,31 @@
+import axios from 'axios';
+export async function cleanupHashAndFile(hash, uploadedUrl, baseUrl) {
+    if (uploadedUrl) {
+        try {
+            const fileUrl = new URL(uploadedUrl);
+            const fileIdentifier = fileUrl.pathname.split('/').pop().split('_')[0];
+            const deleteUrl = `${baseUrl}?operation=delete&requestId=${fileIdentifier}`;
+            await axios.delete(deleteUrl, { validateStatus: () => true });
+        } catch (e) {
+            // ignore
+        }
+    }
+    await axios.get(baseUrl, {
+        params: { hash, clearHash: true },
+        validateStatus: (status) => true,
+    });
+    await axios.get(baseUrl, {
+        params: { hash: `${hash}_converted`, clearHash: true },
+        validateStatus: (status) => true,
+    });
+}
+export function getFolderNameFromUrl(url) {
+    const urlObj = new URL(url);
+    const parts = urlObj.pathname.split('/');
+    if (url.includes('127.0.0.1:10000')) {
+        return parts[3].split('_')[0];
+    }
+    return parts[2].split('_')[0];
+}

package/helper-apps/cortex-markitdown/.funcignore ADDED Viewed

@@ -0,0 +1 @@
+.venv

package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/__init__.py ADDED Viewed

@@ -0,0 +1,64 @@
+import logging
+import azure.functions as func
+from markitdown import MarkItDown
+import json
+# Module-level MarkItDown converter: created once at import time and reused
+# by every call to main() below instead of being rebuilt per request.
+# For LLM-based image description, you might need to configure llm_client and llm_model
+# e.g., md = MarkItDown(llm_client=OpenAI(), llm_model="gpt-4o")
+# Only the basic setup is used here: plugins enabled, no LLM client configured.
+md = MarkItDown(enable_plugins=True)
+def main(req: func.HttpRequest) -> func.HttpResponse:
+    logging.info('Python HTTP trigger function processed a request.')
+    uri = req.params.get('uri')
+    if not uri:
+        try:
+            req_body = req.get_json()
+        except ValueError:
+            pass
+        else:
+            uri = req_body.get('uri')
+    if uri:
+        try:
+            logging.info(f"Processing URI: {uri}")
+            # The MarkItDown library's convert method can take a URI directly.
+            # It can also handle local file paths if the function has access to them,
+            # but for a typical HTTP-triggered Azure Function, a web URI is expected.
+            result = md.convert(uri)
+            # The result object has a text_content attribute
+            markdown_content = result.text_content
+            # Return the markdown content
+            # We'll return it as JSON for easier consumption by clients
+            response_data = {
+                "uri": uri,
+                "markdown": markdown_content
+            }
+            return func.HttpResponse(
+                 json.dumps(response_data),
+                 mimetype="application/json",
+                 status_code=200
+            )
+        except Exception as e:
+            logging.error(f"Error converting URI {uri}: {str(e)}")
+            error_response = {
+                "error": "Failed to convert URI to Markdown.",
+                "details": str(e)
+            }
+            return func.HttpResponse(
+                 json.dumps(error_response),
+                 mimetype="application/json",
+                 status_code=500
+            )
+    else:
+        logging.warning("No URI provided in the request.")
+        return func.HttpResponse(
+             "Please pass a URI on the query string or in the request body",
+             status_code=400
+        )

package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/function.json ADDED Viewed

@@ -0,0 +1,21 @@
+{
+  "scriptFile": "__init__.py",
+  "bindings": [
+    {
+      "authLevel": "function",
+      "type": "httpTrigger",
+      "direction": "in",
+      "name": "req",
+      "methods": [
+        "get",
+        "post"
+      ],
+      "route": "convert"
+    },
+    {
+      "type": "http",
+      "direction": "out",
+      "name": "$return"
+    }
+  ]
+}

package/helper-apps/cortex-markitdown/README.md ADDED Viewed

@@ -0,0 +1,94 @@
+# Markitdown Azure Function Converter
+This Azure Function App provides an HTTP endpoint to convert various file formats (specified by a URI) to Markdown using the `microsoft/markitdown` Python library.
+## Function: MarkitdownConverterFunction
+*   **Trigger**: HTTP (GET, POST)
+*   **Route**: `/api/convert` (or as configured by your Azure Function host settings)
+*   **Authentication**: Function (requires a function key for access)
+### Input
+The function expects a `uri` parameter, either in the query string (for GET requests) or in the JSON body (for POST requests).
+**Example GET Request:**
+```
+GET /api/convert?uri=https://www.example.com/somefile.pdf
+```
+**Example POST Request:**
+```
+POST /api/convert
+Content-Type: application/json
+{
+  "uri": "https://www.example.com/somefile.docx"
+}
+```
+### Output
+*   **Success (200 OK):** Returns a JSON object containing the original URI and the converted Markdown content.
+    ```json
+    {
+      "uri": "https://www.example.com/somefile.pdf",
+      "markdown": "# Converted Markdown Content\n..."
+    }
+    ```
+*   **Bad Request (400):** If the `uri` parameter is missing.
+*   **Internal Server Error (500):** If an error occurs during the conversion process. The response will contain an error message and details.
+    ```json
+    {
+        "error": "Failed to convert URI to Markdown.",
+        "details": "<specific error message from the library>"
+    }
+    ```
+## Project Structure
+```
+cortex-markitdown/
+├── MarkitdownConverterFunction/
+│   ├── __init__.py       # The Python code for the Azure Function
+│   └── function.json     # Configuration file for the Azure Function (bindings, triggers)
+├── .gitignore            # Standard Python .gitignore
+├── host.json             # Configuration for the Azure Functions host
+├── requirements.txt      # Python package dependencies
+└── README.md             # This file
+```
+## Prerequisites
+*   Azure Functions Core Tools
+*   Python 3.8+ (check Azure Functions Python version compatibility)
+*   An Azure account (for deployment)
+## Setup and Local Development
+1.  **Clone the repository (if applicable).**
+2.  **Create and activate a virtual environment:**
+    ```bash
+    python -m venv .venv
+    source .venv/bin/activate  # On Windows use `.venv\Scripts\activate`
+    ```
+3.  **Install dependencies:**
+    ```bash
+    pip install -r requirements.txt
+    ```
+4.  **Run the Azure Function locally:**
+    ```bash
+    func start
+    ```
+    The function should be available at `http://localhost:7071/api/convert` (the port might vary).
+## Dependencies
+*   `azure-functions`: For creating Azure Functions.
+*   `markitdown[all]`: The core library used for file conversion to Markdown. The `[all]` option installs all optional dependencies for handling various file types.
+## Notes
+*   The `MarkItDown` instance in `__init__.py` is initialized with `enable_plugins=True` to allow for extended file format support through plugins.
+*   For handling images that require OCR or descriptions, the `markitdown` library might need an LLM client (e.g., OpenAI) configured. This is not included in the basic setup provided but can be added by modifying the `MarkItDown()` instantiation in `__init__.py` and ensuring the necessary environment variables (like API keys) are available to the function.
+*   Ensure that the URIs provided to the function are publicly accessible or accessible from the environment where the Azure Function is running.

package/helper-apps/cortex-markitdown/host.json ADDED Viewed

@@ -0,0 +1,15 @@
+{
+  "version": "2.0",
+  "logging": {
+    "applicationInsights": {
+      "samplingSettings": {
+        "isEnabled": true,
+        "excludedTypes": "Request"
+      }
+    }
+  },
+  "extensionBundle": {
+    "id": "Microsoft.Azure.Functions.ExtensionBundle",
+    "version": "[4.*, 5.0.0)"
+  }
+}

package/helper-apps/cortex-markitdown/requirements.txt ADDED Viewed

@@ -0,0 +1,2 @@
+azure-functions
+markitdown[all]>=0.1.0

package/lib/requestExecutor.js CHANGED Viewed

@@ -214,7 +214,7 @@ const requestWithMonitor = async (endpoint, url, data, axiosConfigObj) => {
     return { response, duration };
 }
-const MAX_RETRY = 5; // retries for error handling
+const MAX_RETRY = 6; // retries for error handling
 const MAX_DUPLICATE_REQUESTS = 3; // duplicate requests to manage latency spikes
 const DUPLICATE_REQUEST_AFTER = 10; // 10 seconds
@@ -312,49 +312,57 @@ const makeRequest = async (cortexRequest) => {
             const { response, duration } = await Promise.race(promises);
             // if response status is 2xx
-            if (response.status >= 200 && response.status < 300) {
+            if (response?.status >= 200 && response?.status < 300) {
                 return { response, duration };
             } else {
-                throw new Error(`Received error response: ${response.status}`);
+                const error = new Error(`Request failed with status ${response?.status}`);
+                error.response = response;
+                error.duration = duration;
+                throw error;
             }
         } catch (error) {
-            const { response, duration, code } = error;
-            if (response || code === 'ECONNRESET') {
-                const status = response?.status || 502; // default to 502 if ECONNRESET
-                // if there is only one endpoint, only retry select error codes
-                if (cortexRequest.model.endpoints.length === 1) {
-                    if (status !== 429 &&
-                        status !== 408 &&
-                        status !== 500 &&
-                        status !== 502 &&
-                        status !== 503 &&
-                        status !== 504) {
-                        return { response, duration };
-                    }
-                    // set up for a retry by reinitializing the request
-                    cortexRequest.initRequest();
-                } else {
-                    // if there are multiple endpoints, retry everything by default
-                    // as it could be a temporary issue with one endpoint
-                    // certain errors (e.g. 400) are problems with the request itself
-                    // and should not be retried
-                    if (status == 400 || status == 413) {
-                        return { response, duration };
-                    }
-                    // set up for a retry by selecting a new endpoint, which will also reinitialize the request
-                    cortexRequest.selectNewEndpoint();
+            // Handle both cases: error with response object and direct error object
+            const status = error?.response?.status || error?.status || 502; // default to 502 if no status
+            const duration = error?.duration;
+            const response = error?.response || {error: error};
+            // Calculate backoff time - use Retry-After for 429s if available
+            let backoffTime = 1000 * Math.pow(2, i);
+            if (status === 429 && (response?.headers?.['retry-after'] || error?.headers?.['retry-after'])) {
+                backoffTime = parseInt(response?.headers?.['retry-after'] || error?.headers?.['retry-after']) * 1000;
+                logger.warn(`>>> [${requestId}] Rate limited (429). Retry-After: ${response?.headers?.['retry-after'] || error?.headers?.['retry-after']}s`);
+            }
+            const jitter = backoffTime * 0.2 * Math.random();
+            // if there is only one endpoint, only retry select error codes
+            if (cortexRequest.model.endpoints.length === 1) {
+                if (status !== 429 &&
+                    status !== 408 &&
+                    status !== 500 &&
+                    status !== 502 &&
+                    status !== 503 &&
+                    status !== 504) {
+                    return { response, duration };
                 }
-                logger.info(`>>> [${requestId}] retrying request (${duration}ms) due to ${status} response. Retry count: ${i + 1}`);
-                if (i < MAX_RETRY - 1) {
-                    const backoffTime = 1000 * Math.pow(2, i);
-                    const jitter = backoffTime * 0.2 * Math.random();
-                    await new Promise(r => setTimeout(r, backoffTime + jitter));
-                } else {
+                // set up for a retry by reinitializing the request
+                cortexRequest.initRequest();
+            } else {
+                // if there are multiple endpoints, retry everything by default
+                // as it could be a temporary issue with one endpoint
+                // certain errors (e.g. 400) are problems with the request itself
+                // and should not be retried
+                if (status == 400 || status == 413) {
                     return { response, duration };
                 }
+                // set up for a retry by selecting a new endpoint, which will also reinitialize the request
+                cortexRequest.selectNewEndpoint();
+            }
+            if (i < MAX_RETRY - 1) {
+                logger.info(`>>> [${requestId}] retrying request due to ${status} response. Retry count: ${i + 1}. Retrying in ${backoffTime + jitter}ms`);
+                await new Promise(r => setTimeout(r, backoffTime + jitter));
             } else {
-                throw error;
+                return { response, duration };
             }
         }
     }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@aj-archipelago/cortex",
-  "version": "1.3.49",
+  "version": "1.3.51",
   "description": "Cortex is a GraphQL API for AI. It provides a simple, extensible interface for using AI services from OpenAI, Azure and others.",
   "private": false,
   "repository": {

package/pathways/system/entity/tools/sys_tool_cognitive_search.js CHANGED Viewed

@@ -162,7 +162,7 @@ export default {
         // Map tool names to index names
         const toolToIndex = {
-            'searchpersonal': 'indexcortex',
+            'searchpersonalindex': 'indexcortex',
             'searchaja': 'indexucmsaja',
             'searchaje': 'indexucmsaje',
             'searchwires': 'indexwires'

package/pathways/system/entity/tools/sys_tool_readfile.js CHANGED Viewed

@@ -49,7 +49,7 @@ export default {
         icon: "📝",
         function: {
             name: "AnalyzeText",
-            description: "Use specifically for reading, analyzing, and answering questions about text and csv files.",
+            description: "Use specifically for reading, analyzing, and answering questions about text files (including csv, json, html, etc.).",
             parameters: {
                 type: "object",
                 properties: {
@@ -65,7 +65,29 @@ export default {
                 required: ["detailedInstructions", "userMessage"]
             }
         }
-    },
+    },
+    {
+        type: "function",
+        icon: "📝",
+        function: {
+            name: "AnalyzeMarkdown",
+            description: "Use specifically for reading, analyzing, and answering questions about markdown files.",
+            parameters: {
+                type: "object",
+                properties: {
+                    detailedInstructions: {
+                        type: "string",
+                        description: "Detailed instructions about what you need the tool to do - questions you need answered about the files, etc."
+                        },
+                    userMessage: {
+                        type: "string",
+                        description: "A user-friendly message that describes what you're doing with this tool"
+                        }
+                },
+                required: ["detailedInstructions", "userMessage"]
+            }
+        }
+    },
     {
         type: "function",
         icon: "🖼️",

package/server/plugins/openAiWhisperPlugin.js CHANGED Viewed

@@ -8,12 +8,6 @@ import logger from '../../lib/logger.js';
 import CortexRequest from '../../lib/cortexRequest.js';
 import { downloadFile, deleteTempPath, convertSrtToText, alignSubtitles, getMediaChunks, markCompletedForCleanUp  } from '../../lib/util.js';
-const WHISPER_TS_API_URL  = config.get('whisperTSApiUrl');
-if(WHISPER_TS_API_URL){
-    logger.info(`WHISPER API URL using ${WHISPER_TS_API_URL}`);
-}else{
-    logger.warn(`WHISPER API URL not set using default OpenAI API Whisper`);
-}
 const OFFSET_CHUNK = 500; //seconds of each chunk offset, only used if helper does not provide
@@ -41,7 +35,6 @@ class OpenAIWhisperPlugin extends ModelPlugin {
                 const response_format = responseFormat || 'text';
                 const whisperInitCallback = (requestInstance) => {
                     const formData = new FormData();
                     formData.append('file', fs.createReadStream(chunk));
                     formData.append('model', requestInstance.params.model);
@@ -51,7 +44,6 @@ class OpenAIWhisperPlugin extends ModelPlugin {
                     requestInstance.data = formData;
                     requestInstance.addHeaders = { ...formData.getHeaders() };
                 };
                 cortexRequest.initCallback = whisperInitCallback;
@@ -64,7 +56,6 @@ class OpenAIWhisperPlugin extends ModelPlugin {
         }
         const processTS = async (uri) => {
             const tsparams = { fileurl:uri };
             const { language } = parameters;
             if(language) tsparams.language = language;
@@ -75,37 +66,18 @@ class OpenAIWhisperPlugin extends ModelPlugin {
             tsparams.word_timestamps = !wordTimestamped ? "False" : wordTimestamped;
             const cortexRequest = new CortexRequest({ pathwayResolver });
-            cortexRequest.url = WHISPER_TS_API_URL;
-            cortexRequest.data = tsparams;
             const whisperInitCallback = (requestInstance) => {
-                requestInstance.url = WHISPER_TS_API_URL;
                 requestInstance.data = tsparams;
             };
             cortexRequest.initCallback = whisperInitCallback;
-            const MAX_RETRIES = 3;
-            let attempt = 0;
-            let res = null;
-            while(attempt < MAX_RETRIES){
-                sendProgress(true, true);
-                try {
-                    res = await this.executeRequest(cortexRequest);
-                    if (!res) {
-                        throw new Error('Received null or empty response');
-                    }
-                    if(res?.statusCode && res?.statusCode >= 400){
-                        throw new Error(res?.message || 'An error occurred.');
-                    }
-                    break;
-                }
-                catch(err){
-                    logger.warn(`Error calling timestamped API: ${err}. Retrying ${attempt+1} of ${MAX_RETRIES}...`);
-                    attempt++;
-                }
+            sendProgress(true, true);
+            const res = await this.executeRequest(cortexRequest);
+            if (!res) {
+                throw new Error('Received null or empty response');
             }
-            if (res?.statusCode && res?.statusCode >= 400) {
-                throw new Error(res.message || 'An error occurred.');
+            if(res?.statusCode && res?.statusCode >= 400){
+                throw new Error(res?.message || 'An error occurred.');
             }
             if(!wordTimestamped && !responseFormat){
@@ -151,71 +123,71 @@ class OpenAIWhisperPlugin extends ModelPlugin {
             });
         }
-async function processURI(uri) {
-    let result = null;
-    let _promise = null;
-    let errorOccurred = false;
+        const processURI = async (uri) => {
+            let result = null;
+            let _promise = null;
+            let errorOccurred = false;
-    const intervalId = setInterval(() => sendProgress(true), 3000);
+            const intervalId = setInterval(() => sendProgress(true), 3000);
-    //const useTS = WHISPER_TS_API_URL && (wordTimestamped || highlightWords); // use TS API only for word timestamped
-    const useTS = !!WHISPER_TS_API_URL; // use TS API always if URL is set
+            // use Timestamped API if model is oai-whisper-ts
+            const useTS = this.modelName === 'oai-whisper-ts';
-    if (useTS) {
-        _promise = processTS;
-    } else {
-        _promise = processChunk;
-    }
+            if (useTS) {
+                _promise = processTS;
+            } else {
+                _promise = processChunk;
+            }
-    await _promise(uri).then((ts) => {
-        result = ts;
-    }).catch((err) => {
-        errorOccurred = err;
-    }).finally(() => {
-        clearInterval(intervalId);
-        sendProgress();
-    });
-    if(errorOccurred) {
-        throw errorOccurred;
-    }
+            await _promise(uri).then((ts) => {
+                result = ts;
+            }).catch((err) => {
+                errorOccurred = err;
+            }).finally(() => {
+                clearInterval(intervalId);
+                sendProgress();
+            });
-    return result;
-}
+            if(errorOccurred) {
+                throw errorOccurred;
+            }
-let offsets = [];
-let uris = []
+            return result;
+        }
-try {
-    const mediaChunks = await getMediaChunks(file, requestId);
-    if (!mediaChunks || !mediaChunks.length) {
-        throw new Error(`Error in getting chunks from media helper for file ${file}`);
-    }
+        let offsets = [];
+        let uris = []
-    uris = mediaChunks.map((chunk) => chunk?.uri || chunk);
-    offsets = mediaChunks.map((chunk, index) => chunk?.offset || index * OFFSET_CHUNK);
+        try {
+            const mediaChunks = await getMediaChunks(file, requestId);
+            if (!mediaChunks || !mediaChunks.length) {
+                throw new Error(`Error in getting chunks from media helper for file ${file}`);
+            }
-    totalCount = mediaChunks.length + 1; // total number of chunks that will be processed
+            uris = mediaChunks.map((chunk) => chunk?.uri || chunk);
+            offsets = mediaChunks.map((chunk, index) => chunk?.offset || index * OFFSET_CHUNK);
-    const batchSize = 4;
-    sendProgress();
+            totalCount = mediaChunks.length + 1; // total number of chunks that will be processed
-    for (let i = 0; i < uris.length; i += batchSize) {
-        const currentBatchURIs = uris.slice(i, i + batchSize);
-        const promisesToProcess = currentBatchURIs.map(uri => processURI(uri));
-        const results = await Promise.all(promisesToProcess);
-        for(const res of results) {
-            result.push(res);
-        }
-    }
+            const batchSize = 4;
+            sendProgress();
-} catch (error) {
-    const errMsg = `Transcribe error: ${error?.response?.data || error?.message || error}`;
-    logger.error(errMsg);
-    return errMsg;
-}
+            for (let i = 0; i < uris.length; i += batchSize) {
+                const currentBatchURIs = uris.slice(i, i + batchSize);
+                const promisesToProcess = currentBatchURIs.map(uri => processURI(uri));
+                const results = await Promise.all(promisesToProcess);
+                for(const res of results) {
+                    result.push(res);
+                }
+            }
+        } catch (error) {
+            const errMsg = `Transcribe error: ${error?.response?.data || error?.message || error}`;
+            logger.error(errMsg);
+            return errMsg;
+        }
         finally {
             try {
                 for (const chunk of chunks) {