@aj-archipelago/cortex 1.1.3 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintignore +3 -3
- package/README.md +17 -4
- package/config.js +45 -9
- package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/Dockerfile +1 -1
- package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/fileChunker.js +4 -1
- package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/package-lock.json +25 -216
- package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/package.json +2 -2
- package/helper-apps/cortex-whisper-wrapper/.dockerignore +27 -0
- package/helper-apps/cortex-whisper-wrapper/Dockerfile +32 -0
- package/helper-apps/cortex-whisper-wrapper/app.py +104 -0
- package/helper-apps/cortex-whisper-wrapper/docker-compose.debug.yml +12 -0
- package/helper-apps/cortex-whisper-wrapper/docker-compose.yml +10 -0
- package/helper-apps/cortex-whisper-wrapper/models/.gitkeep +0 -0
- package/helper-apps/cortex-whisper-wrapper/requirements.txt +5 -0
- package/lib/cortexRequest.js +117 -0
- package/lib/pathwayTools.js +2 -1
- package/lib/redisSubscription.js +2 -2
- package/lib/requestExecutor.js +360 -0
- package/lib/requestMonitor.js +131 -28
- package/package.json +2 -1
- package/pathways/summary.js +3 -3
- package/server/graphql.js +6 -6
- package/server/{pathwayPrompter.js → modelExecutor.js} +24 -21
- package/server/pathwayResolver.js +22 -17
- package/server/plugins/azureCognitivePlugin.js +25 -20
- package/server/plugins/azureTranslatePlugin.js +6 -10
- package/server/plugins/cohereGeneratePlugin.js +5 -12
- package/server/plugins/cohereSummarizePlugin.js +5 -12
- package/server/plugins/localModelPlugin.js +3 -3
- package/server/plugins/modelPlugin.js +18 -12
- package/server/plugins/openAiChatExtensionPlugin.js +5 -5
- package/server/plugins/openAiChatPlugin.js +8 -10
- package/server/plugins/openAiCompletionPlugin.js +9 -12
- package/server/plugins/openAiDallE3Plugin.js +14 -31
- package/server/plugins/openAiEmbeddingsPlugin.js +6 -9
- package/server/plugins/openAiImagePlugin.js +19 -15
- package/server/plugins/openAiWhisperPlugin.js +168 -100
- package/server/plugins/palmChatPlugin.js +9 -10
- package/server/plugins/palmCodeCompletionPlugin.js +2 -2
- package/server/plugins/palmCompletionPlugin.js +11 -12
- package/server/resolver.js +2 -2
- package/server/rest.js +1 -1
- package/tests/config.test.js +1 -1
- package/tests/mocks.js +5 -0
- package/tests/modelPlugin.test.js +3 -10
- package/tests/openAiChatPlugin.test.js +9 -8
- package/tests/openai_api.test.js +3 -3
- package/tests/palmChatPlugin.test.js +1 -1
- package/tests/palmCompletionPlugin.test.js +1 -1
- package/tests/pathwayResolver.test.js +2 -1
- package/tests/requestMonitor.test.js +94 -0
- package/tests/{requestDurationEstimator.test.js → requestMonitorDurationEstimator.test.js} +21 -17
- package/tests/truncateMessages.test.js +1 -1
- package/lib/request.js +0 -259
- package/lib/requestDurationEstimator.js +0 -90
- /package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/blobHandler.js +0 -0
- /package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/docHelper.js +0 -0
- /package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/function.json +0 -0
- /package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/helper.js +0 -0
- /package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/index.js +0 -0
- /package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/localFileHandler.js +0 -0
- /package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/redis.js +0 -0
- /package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/start.js +0 -0
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import uvicorn
|
|
2
|
+
from fastapi import FastAPI, HTTPException, Request
|
|
3
|
+
from uuid import uuid4
|
|
4
|
+
import os
|
|
5
|
+
import asyncio
|
|
6
|
+
import whisper
|
|
7
|
+
from whisper.utils import get_writer
|
|
8
|
+
from fastapi.encoders import jsonable_encoder
|
|
9
|
+
import time
|
|
10
|
+
|
|
11
|
+
model_download_root = './models'
|
|
12
|
+
model = whisper.load_model("large", download_root=model_download_root) #large, tiny
|
|
13
|
+
|
|
14
|
+
# Create a semaphore with a limit of 1
|
|
15
|
+
semaphore = asyncio.Semaphore(1)
|
|
16
|
+
|
|
17
|
+
app = FastAPI()
|
|
18
|
+
|
|
19
|
+
save_directory = "./tmp" # folder for downloaded files
|
|
20
|
+
os.makedirs(save_directory, exist_ok=True)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def delete_tmp_file(file_path):
    """Best-effort removal of a temporary file.

    Never raises: a failed delete (missing file, permissions) is logged and
    swallowed so cleanup can never break a transcription response.

    :param file_path: path of the temporary file to remove
    """
    try:
        os.remove(file_path)
        print(f"Temporary file '{file_path}' has been deleted.")
    except OSError as e:
        # Include the offending path; the bare strerror alone did not say
        # which file failed to delete.
        print(f"Error deleting '{file_path}': {e.strerror}")
|
|
29
|
+
|
|
30
|
+
def transcribe(params):
    """Run the module-level Whisper model on params['fileurl'] and return the
    transcription rendered as an SRT string.

    :param params: request parameters (all values arrive as strings)
    :raises HTTPException: 400 when 'fileurl' is missing
    """
    if 'fileurl' not in params:
        raise HTTPException(status_code=400, detail="fileurl parameter is required")

    fileurl = params["fileurl"]

    # word_timestamps defaults to True; only the literal string 'False'
    # disables it.
    word_timestamps = params.get('word_timestamps') != 'False'

    print(f"Transcribing file {fileurl} with word_timestamps={word_timestamps}")
    start_time = time.time()
    result = model.transcribe(fileurl, word_timestamps=word_timestamps)
    execution_time = time.time() - start_time
    print("Transcribe execution time:", execution_time, "seconds")

    srtpath = os.path.join(save_directory, str(uuid4()) + ".srt")

    print(f"Saving transcription as : {srtpath}")
    writer = get_writer("srt", save_directory)

    # Writer options come in as strings; coerce each one to its expected type.
    writer_args = {'highlight_words': False, 'max_line_count': None, 'max_line_width': None, 'max_words_per_line': None}
    if 'highlight_words' in params:
        writer_args['highlight_words'] = params['highlight_words'] == 'True'
    for int_option in ('max_line_count', 'max_line_width', 'max_words_per_line'):
        if int_option in params:
            writer_args[int_option] = int(params[int_option])

    # Bare request (just fileurl, optionally word_timestamps) with word-level
    # timestamps enabled: emit one word per subtitle line.
    if fileurl and word_timestamps and len(params) <= 2:
        writer_args['max_words_per_line'] = 1

    writer(result, srtpath, **writer_args)

    with open(srtpath, "r") as f:
        srtstr = f.read()

    # clean up tmp out files
    delete_tmp_file(srtpath)

    print(f"Transcription of file {fileurl} completed")
    return srtstr
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
async def get_params(request: Request):
    """Collect request parameters uniformly: the JSON body for POST requests,
    the query string for everything else. Returns a plain dict."""
    if request.method == "POST":
        return jsonable_encoder(await request.json())
    return dict(request.query_params)
|
|
90
|
+
|
|
91
|
+
@app.get("/")
|
|
92
|
+
@app.post("/")
|
|
93
|
+
async def root(request: Request):
|
|
94
|
+
if semaphore.locked():
|
|
95
|
+
raise HTTPException(status_code=429, detail="Too Many Requests")
|
|
96
|
+
|
|
97
|
+
params = await get_params(request)
|
|
98
|
+
async with semaphore:
|
|
99
|
+
result = await asyncio.to_thread(transcribe, params)
|
|
100
|
+
return result
|
|
101
|
+
|
|
102
|
+
# Entry point when executed directly: serve the FastAPI app on all interfaces.
if __name__ == "__main__":
    print("Starting APP Whisper server", flush=True)
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
version: '3.4'
|
|
2
|
+
|
|
3
|
+
services:
|
|
4
|
+
cortex:
|
|
5
|
+
image: arc/whisper
|
|
6
|
+
build:
|
|
7
|
+
context: .
|
|
8
|
+
dockerfile: ./Dockerfile
|
|
9
|
+
command: ["sh", "-c", "pip install debugpy -t /tmp && python /tmp/debugpy --wait-for-client --listen 0.0.0.0:5678 -m uvicorn helper_apps.WhisperX/app:app --host 0.0.0.0 --port 8000"]
|
|
10
|
+
ports:
|
|
11
|
+
- 8000:8000
|
|
12
|
+
- 5678:5678
|
|
File without changes
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import { selectEndpoint } from './requestExecutor.js';
|
|
2
|
+
|
|
3
|
+
// CortexRequest bundles everything needed for one model API call: target url,
// request body, query params, headers, cache settings, the model definition,
// the owning pathwayResolver, and the currently selected endpoint.
class CortexRequest {
    constructor( { url, data, params, headers, cache, model, pathwayResolver, selectedEndpoint } = {}) {
        this._url = url || '';
        this._data = data || {};
        this._params = params || {};
        this._headers = headers || {};
        this._cache = cache || {};
        this._model = model || '';
        this._pathwayResolver = pathwayResolver || {};
        this._selectedEndpoint = selectedEndpoint || {};

        // Adopt the resolver's model only when one was actually supplied.
        // (Previously `this._pathwayResolver` was tested, which is always
        // truthy because it defaults to {}, so an explicitly passed `model`
        // was clobbered with undefined whenever no resolver was given.)
        if (pathwayResolver && pathwayResolver.model) {
            this._model = pathwayResolver.model;
        }

        if (this._model) {
            this.selectNewEndpoint();
        }
    }

    // Pick a (possibly new) endpoint for the current model and merge its
    // url/params/headers into this request.
    selectNewEndpoint() {
        const sep = selectEndpoint(this._model);
        if (sep) {
            this._selectedEndpoint = sep;
            this._url = sep.url;
            // NOTE(review): endpoint params are merged into both `data` and
            // `params` — presumably so either transport style sees them;
            // confirm before changing.
            this._data = { ...this._data, ...sep.params };
            this._headers = { ...this._headers, ...sep.headers };
            this._params = { ...this._params, ...sep.params };
        }
    }

    // url getter and setter
    get url() {
        return this._url;
    }

    set url(value) {
        this._url = value;
    }

    // data getter and setter
    get data() {
        return this._data;
    }

    set data(value) {
        this._data = value;
    }

    // params getter and setter
    get params() {
        return this._params;
    }

    set params(value) {
        this._params = value;
    }

    // headers getter and setter
    get headers() {
        return this._headers;
    }

    set headers(value) {
        this._headers = value;
    }

    // cache getter and setter
    get cache() {
        return this._cache;
    }

    set cache(value) {
        this._cache = value;
    }

    // model getter and setter
    get model() {
        return this._model;
    }

    set model(value) {
        this._model = value;
    }

    // requestId getter (delegates to the owning resolver)
    get requestId() {
        return this._pathwayResolver.requestId;
    }

    // pathway getter (delegates to the owning resolver)
    get pathway() {
        return this._pathwayResolver.pathway;
    }

    // selectedEndpoint getter and setter
    get selectedEndpoint() {
        return this._selectedEndpoint;
    }

    set selectedEndpoint(value) {
        this._selectedEndpoint = value;
    }

    // pathwayResolver getter and setter
    get pathwayResolver() {
        return this._pathwayResolver;
    }

    set pathwayResolver(value) {
        this._pathwayResolver = value;
    }
}
|
|
116
|
+
|
|
117
|
+
export default CortexRequest;
|
package/lib/pathwayTools.js
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
// pathwayTools.js
|
|
2
2
|
import { encode , decode } from 'gpt-3-encoder';
|
|
3
|
+
import { config } from '../config.js';
|
|
3
4
|
|
|
4
5
|
// callPathway - call a pathway from another pathway
|
|
5
|
-
const callPathway = async (
|
|
6
|
+
const callPathway = async (pathwayName, args) => {
|
|
6
7
|
const pathway = config.get(`pathways.${pathwayName}`);
|
|
7
8
|
if (!pathway) {
|
|
8
9
|
throw new Error(`Pathway ${pathwayName} not found`);
|
package/lib/redisSubscription.js
CHANGED
|
@@ -125,7 +125,7 @@ async function publishRequestProgressSubscription(data) {
|
|
|
125
125
|
requestState[requestId].useRedis = false;
|
|
126
126
|
logger.info(`Starting local execution for registered async request: ${requestId}`);
|
|
127
127
|
const { resolver, args } = requestState[requestId];
|
|
128
|
-
resolver(args, false);
|
|
128
|
+
resolver && resolver(args, false);
|
|
129
129
|
}
|
|
130
130
|
} else {
|
|
131
131
|
idsToForward.push(requestId);
|
|
@@ -163,7 +163,7 @@ function handleSubscription(data){
|
|
|
163
163
|
requestState[requestId].useRedis = true;
|
|
164
164
|
logger.info(`Starting execution for registered async request: ${requestId}`);
|
|
165
165
|
const { resolver, args } = requestState[requestId];
|
|
166
|
-
resolver(args);
|
|
166
|
+
resolver && resolver(args);
|
|
167
167
|
}
|
|
168
168
|
}
|
|
169
169
|
}
|
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
import Bottleneck from 'bottleneck/es5.js';
|
|
2
|
+
import RequestMonitor from './requestMonitor.js';
|
|
3
|
+
import { config } from '../config.js';
|
|
4
|
+
import axios from 'axios';
|
|
5
|
+
import { setupCache } from 'axios-cache-interceptor';
|
|
6
|
+
import Redis from 'ioredis';
|
|
7
|
+
import logger from './logger.js';
|
|
8
|
+
import { v4 as uuidv4 } from 'uuid';
|
|
9
|
+
|
|
10
|
+
const connectionString = config.get('storageConnectionString');
|
|
11
|
+
|
|
12
|
+
if (!connectionString) {
|
|
13
|
+
logger.info('No STORAGE_CONNECTION_STRING found in environment. Redis features (caching, pubsub, clustered limiters) disabled.')
|
|
14
|
+
} else {
|
|
15
|
+
logger.info('Using Redis connection specified in STORAGE_CONNECTION_STRING.');
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
let client;
|
|
19
|
+
|
|
20
|
+
if (connectionString) {
|
|
21
|
+
try {
|
|
22
|
+
client = new Redis(connectionString);
|
|
23
|
+
} catch (error) {
|
|
24
|
+
logger.error(`Redis connection error: ${error}`);
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
const cortexId = config.get('cortexId');
|
|
29
|
+
const connection = client && new Bottleneck.IORedisConnection({ client: client });
|
|
30
|
+
|
|
31
|
+
let modelEndpoints = {};
|
|
32
|
+
|
|
33
|
+
// Build a Bottleneck rate limiter for one model endpoint and attach it as
// endpoint.limiter. Throughput comes from endpoint.requestsPerSecond
// (default 100). When a Redis connection exists the limiter is clustered
// across Cortex instances.
const createLimiter = (endpoint, name, index) => {
    const rps = endpoint.requestsPerSecond ?? 100;
    const options = {
        minTime: 1000 / rps,
        maxConcurrent: rps,
        reservoir: rps, // Number of tokens available initially
        reservoirRefreshAmount: rps, // Number of tokens added per interval
        reservoirRefreshInterval: 1000, // Interval in milliseconds
    };

    // A shared Redis connection plus a unique id enables clustered limiting.
    if (connection) {
        options.id = `${cortexId}-${name}-${index}-limiter`;
        options.connection = connection;
    }

    endpoint.limiter = new Bottleneck(options);

    endpoint.limiter.on('error', (err) => {
        // Tear down the broken limiter and replace it with a fresh one.
        logger.error(`Limiter error for ${cortexId}-${name}-${index}: ${err}`);
        endpoint.limiter.disconnect();
        createLimiter(endpoint, name, index);
        logger.info(`New limiter created for ${cortexId}-${name}-${index}`)
    });

    endpoint.limiter.on('failed', (error, info) => {
        const jobId = info.options.id || 'none';
        if (error.name === 'CanceledError') {
            logger.debug(`Request cancelled for ${cortexId}-${name}-${index}: Id: ${jobId}`);
        } else {
            logger.error(`Request failed for ${cortexId}-${name}-${index}: Id: ${jobId}: ${error}`);
        }
    });

    endpoint.limiter.on('debug', (message) => {
        // Redis heartbeat chatter is noise; log everything else.
        if (!message.includes('heartbeat.lua')) {
            logger.debug(`Limiter ${cortexId}-${name}-${index}: ${message}`);
        }
    });
}
|
|
72
|
+
|
|
73
|
+
// Deep-copy the model configuration and attach a rate limiter and a
// RequestMonitor to every endpoint of every model.
const buildModelEndpoints = (config) => {
    modelEndpoints = JSON.parse(JSON.stringify(config.get('models')));
    logger.info(`Building ${connection ? 'Redis clustered' : 'local'} model rate limiters for ${cortexId}...`);
    for (const [name, model] of Object.entries(modelEndpoints)) {
        let index = 0;
        for (const endpoint of model.endpoints) {
            createLimiter(endpoint, name, index);
            endpoint.monitor = new RequestMonitor();
            index += 1;
        }
    }
}
|
|
83
|
+
|
|
84
|
+
let currentIndex = 0; // for round-robin selection
|
|
85
|
+
|
|
86
|
+
// Choose which endpoint of a model should serve the next request:
// - no/empty endpoint list -> null
// - single endpoint -> that endpoint
// - no healthy endpoints -> round-robin over all endpoints
// - similar average latencies -> round-robin over the healthy ones
// - otherwise -> the healthy endpoint with the lowest average latency
const selectEndpoint = (model) => {
    if (!model || !Array.isArray(model.endpoints) || model.endpoints.length === 0) {
        return null;
    }

    logger.debug(`Selecting endpoint for model ${model.name}...`);
    if (model.endpoints.length === 1) {
        logger.debug(`Only one endpoint for model ${model.name}. No selection required.`);
        return model.endpoints[0];
    }

    const healthyEndpoints = model.endpoints.filter((endpoint) => endpoint.monitor.healthy);
    if (healthyEndpoints.length === 0) {
        // Nothing healthy: round-robin across everything rather than failing.
        const fallback = model.endpoints[currentIndex % model.endpoints.length];
        currentIndex++;
        logger.warn(`No healthy endpoints for model ${model.name}. Using round-robin selection. Selected: ${fallback.name || 'default'}`);
        return fallback;
    }

    for (const endpoint of healthyEndpoints) {
        logger.debug(`Healthy endpoint: ${endpoint.name || 'default'}, duration: ${endpoint.monitor.getAverageCallDuration()}ms`);
    }

    const durations = healthyEndpoints.map((endpoint) => endpoint.monitor.getAverageCallDuration());
    if (shouldUseRoundRobin(durations)) {
        const chosen = healthyEndpoints[currentIndex % healthyEndpoints.length];
        currentIndex++;
        logger.debug(`All endpoints are performing similarly. Using round-robin selection. Selected: ${chosen.name || 'default'}`);
        return chosen;
    }

    const fastest = fastestEndpoint(healthyEndpoints);
    logger.debug(`Selected fastest endpoint: ${fastest.name || 'default'}`);
    return fastest;
}
|
|
122
|
+
|
|
123
|
+
// Population standard deviation of an array of call durations (ms).
// Returns 0 for an empty/missing array — the original divided by
// durations.length and produced NaN.
const calculateStandardDeviation = (durations) => {
    if (!durations || durations.length === 0) {
        return 0;
    }
    const mean = durations.reduce((total, value) => total + value, 0) / durations.length;
    const variance = durations.reduce((total, value) => total + Math.pow(value - mean, 2), 0) / durations.length;
    return Math.sqrt(variance);
}

// Endpoints count as "performing similarly" when the spread of their average
// call durations is within a fixed 10ms threshold; in that case round-robin
// is as good as picking the fastest.
const shouldUseRoundRobin = (durations) => {
    const standardDeviation = calculateStandardDeviation(durations);
    const threshold = 10;
    return standardDeviation <= threshold;
}
|
|
134
|
+
|
|
135
|
+
// Return the endpoint with the lowest average call duration; ties keep the
// earlier endpoint (strict <). Returns null for an empty/missing list — the
// original reduce() with no initial value threw a TypeError on [].
const fastestEndpoint = (endpoints) => {
    if (!endpoints || endpoints.length === 0) {
        return null;
    }
    return endpoints.reduce((fastest, current) =>
        current.monitor.getAverageCallDuration() < fastest.monitor.getAverageCallDuration()
            ? current
            : fastest
    );
}
|
|
144
|
+
|
|
145
|
+
let cortexAxios = axios;
|
|
146
|
+
|
|
147
|
+
if (config.get('enableCache')) {
|
|
148
|
+
// Setup cache
|
|
149
|
+
cortexAxios = setupCache(axios, {
|
|
150
|
+
// enable cache for all requests by default
|
|
151
|
+
methods: ['get', 'post', 'put', 'delete', 'patch'],
|
|
152
|
+
interpretHeader: false,
|
|
153
|
+
ttl: 1000 * 60 * 60 * 24 * 7, // 7 days
|
|
154
|
+
});
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
// Every 10 seconds, log call-rate / latency / error-rate statistics for every
// monitored endpoint of every model (debug level only; endpoints with no
// traffic in the window are silent).
setInterval(() => {
    // Iterate over each model
    for (const [name, model] of Object.entries(modelEndpoints)) {
        // Iterate over each endpoint in the current model
        let endpointIndex = 0;
        model.endpoints.forEach((endpoint) => {
            const monitor = endpoint.monitor;
            if (!monitor) {
                // Skip if monitor does not exist
                return;
            }

            const callRate = monitor.getPeakCallRate();

            // Only log endpoints that actually served calls this window.
            if (callRate > 0) {
                const error429Rate = monitor.getError429Rate();
                const errorRate = monitor.getErrorRate();
                const avgCallDuration = monitor.getAverageCallDuration();
                logger.debug('------------------------');
                logger.debug(`Monitor of ${name} endpoint ${endpoint.name || endpointIndex} Call rate: ${callRate} calls/sec, duration: ${avgCallDuration}ms, 429 errors: ${error429Rate * 100}%, errors: ${errorRate * 100}%`);
                logger.debug('------------------------');
            }
            endpointIndex++;
        });
    }
}, 10000); // Log rates every 10 seconds (10000 ms).
|
|
183
|
+
|
|
184
|
+
// Thin wrapper around the (possibly cache-enabled) axios instance.
// NOTE(review): `endpoint` is currently unused — presumably reserved for
// attaching per-endpoint monitoring around the call; confirm before removing.
const postWithMonitor = async (endpoint, url, data, axiosConfigObj) => {
    return cortexAxios.post(url, data, axiosConfigObj);
}
|
|
187
|
+
|
|
188
|
+
const MAX_RETRY = 10; // retries for error handling
|
|
189
|
+
const MAX_DUPLICATE_REQUESTS = 3; // duplicate requests to manage latency spikes
|
|
190
|
+
const DUPLICATE_REQUEST_AFTER = 10; // 10 seconds
|
|
191
|
+
|
|
192
|
+
// Issue the HTTP POST for a CortexRequest with up to MAX_RETRY attempts.
// Two modes:
//  - streaming (caller asked for a stream AND the model supports it): a single
//    limiter-scheduled request with responseType 'stream';
//  - non-streaming: up to MAX_DUPLICATE_REQUESTS staggered duplicate requests
//    (exponentially delayed, with jitter) racing each other to mitigate
//    latency spikes; the first settled response aborts the rest.
// On HTTP errors the endpoint monitor is updated, a new endpoint may be
// selected (multi-endpoint models), and the attempt is retried with
// exponential backoff. Returns the winning axios response, or the last error
// response when retries are exhausted; rethrows non-HTTP errors.
const postRequest = async (cortexRequest) => {
    let promises = [];
    for (let i = 0; i < MAX_RETRY; i++) {
        // Re-read per attempt: selectNewEndpoint() below can change these.
        const { url, data, params, headers, cache, selectedEndpoint, requestId, pathway, model} = cortexRequest;
        const enableDuplicateRequests = pathway?.enableDuplicateRequests !== undefined ? pathway.enableDuplicateRequests : config.get('enableDuplicateRequests');
        let maxDuplicateRequests = enableDuplicateRequests ? MAX_DUPLICATE_REQUESTS : 1;
        let duplicateRequestAfter = (pathway?.duplicateRequestAfter || DUPLICATE_REQUEST_AFTER) * 1000;

        if (enableDuplicateRequests) {
            //logger.info(`>>> [${requestId}] Duplicate requests enabled after ${duplicateRequestAfter / 1000} seconds`);
        }

        const axiosConfigObj = { params, headers, cache };
        const streamRequested = (params?.stream || data?.stream);
        if (streamRequested && model.supportsStreaming) {
            // Streaming path: one scheduled request, stream passed through.
            axiosConfigObj.responseType = 'stream';
            promises.push(selectedEndpoint.limiter.schedule({expiration: pathway.timeout * 1000 + 1000, id: `${requestId}_${uuidv4()}`},() => postWithMonitor(selectedEndpoint, url, data, axiosConfigObj)));
        } else {
            if (streamRequested) {
                // Model can't stream: downgrade the request to non-streaming.
                logger.info(`>>> [${requestId}] ${model} does not support streaming - sending non-streaming request`);
                axiosConfigObj.params.stream = false;
                data.stream = false;
            }
            // One AbortController per duplicate so the winner can cancel the rest.
            const controllers = Array.from({ length: maxDuplicateRequests }, () => new AbortController());
            promises = controllers.map((controller, index) =>
                new Promise((resolve, reject) => {
                    // Duplicate i fires after (2^i - 1) * duplicateRequestAfter,
                    // plus up to 20% jitter; index 0 fires immediately.
                    const duplicateRequestTime = duplicateRequestAfter * Math.pow(2, index) - duplicateRequestAfter;
                    const jitter = duplicateRequestTime * 0.2 * Math.random();
                    const duplicateRequestTimeout = Math.max(0, duplicateRequestTime + jitter);
                    setTimeout(async () => {
                        try {
                            const endpointName = selectedEndpoint.name || model;
                            if (!selectedEndpoint.limiter) {
                                throw new Error(`No limiter for endpoint ${endpointName}!`);
                            }
                            // Fresh config per duplicate (signal/headers differ).
                            const axiosConfigObj = { params, headers, cache };

                            let response = null;

                            if (!controller.signal?.aborted) {

                                axiosConfigObj.signal = controller.signal;
                                axiosConfigObj.headers['X-Cortex-Request-Index'] = index;

                                if (index === 0) {
                                    //logger.info(`>>> [${requestId}] sending request to ${endpointName} API ${axiosConfigObj.responseType === 'stream' ? 'with streaming' : ''}`);
                                } else {
                                    // Duplicates stream (uncached) when possible so
                                    // they can be buffered and raced fairly.
                                    if (model.supportsStreaming) {
                                        axiosConfigObj.responseType = 'stream';
                                        axiosConfigObj.cache = false;
                                    }
                                    const logMessage = `>>> [${requestId}] taking too long - sending duplicate request ${index} to ${endpointName} API ${axiosConfigObj.responseType === 'stream' ? 'with streaming' : ''}`;
                                    const header = '>'.repeat(logMessage.length);
                                    logger.info(`\n${header}\n${logMessage}`);
                                }

                                response = await selectedEndpoint.limiter.schedule({expiration: pathway.timeout * 1000 + 1000, id: `${requestId}_${uuidv4()}`}, () => postWithMonitor(selectedEndpoint, url, data, axiosConfigObj));

                                if (!controller.signal?.aborted) {

                                    //logger.info(`<<< [${requestId}] received response for request ${index}`);

                                    if (axiosConfigObj.responseType === 'stream') {
                                        // Buffering and collecting the stream data
                                        logger.info(`<<< [${requestId}] buffering streaming response for request ${index}`);
                                        response = await new Promise((resolve, reject) => {
                                            let responseData = '';
                                            response.data.on('data', (chunk) => {
                                                responseData += chunk;
                                                //logger.info(`<<< [${requestId}] received chunk for request ${index}`);
                                            });
                                            response.data.on('end', () => {
                                                response.data = JSON.parse(responseData);
                                                resolve(response);
                                            });
                                            response.data.on('error', (error) => {
                                                reject(error);
                                            });
                                        });
                                    }
                                }
                            }

                            resolve(response);

                        } catch (error) {
                            if (error.name === 'AbortError' || error.name === 'CanceledError') {
                                //logger.info(`XXX [${requestId}] request ${index} was cancelled`);
                                reject(error);
                            } else {
                                logger.error(`!!! [${requestId}] request ${index} failed with error: ${error?.response?.data?.error?.message || error}`);
                                reject(error);
                            }
                        } finally {
                            // First settled duplicate (success OR failure) aborts all.
                            controllers.forEach(controller => controller.abort());
                        }
                    }, duplicateRequestTimeout);
                })
            );
        }

        try {
            const response = await Promise.race(promises);

            // if response status is 2xx
            if (response.status >= 200 && response.status < 300) {
                return response;
            } else {
                throw new Error(`Received error response: ${response.status}`);
            }
        } catch (error) {
            if (error.response) {
                selectedEndpoint.monitor.incrementErrorCount();
                const status = error.response.status;

                if (status === 429) {
                    selectedEndpoint.monitor.incrementError429Count();
                }

                if (cortexRequest.model.endpoints.length === 1) {
                    // Single endpoint: only 429s are worth retrying.
                    if (status !== 429) {
                        return error.response;
                    }
                } else {
                    // if there are multiple endpoints, retry everything
                    cortexRequest.selectNewEndpoint();
                }

                logger.info(`>>> [${requestId}] retrying request due to ${status} response. Retry count: ${i + 1}`);
                if (i < MAX_RETRY - 1) {
                    // Exponential backoff (200ms * 2^i) with up to 20% jitter.
                    const backoffTime = 200 * Math.pow(2, i);
                    const jitter = backoffTime * 0.2 * Math.random();
                    await new Promise(r => setTimeout(r, backoffTime + jitter));
                } else {
                    return error.response;
                }
            } else {
                // Non-HTTP failure (network, abort, coding error): propagate.
                throw error;
            }
        }
    }
};
|
|
334
|
+
|
|
335
|
+
// Execute a CortexRequest end-to-end: time the call with the endpoint's
// monitor, delegate to postRequest (retry/duplicate logic lives there), and
// unwrap the response. Returns the payload, or an { error } object on failure.
const executeRequest = async (cortexRequest) => {
    try {
        const { selectedEndpoint: endpoint, requestId } = cortexRequest;
        const callId = endpoint?.monitor?.startCall();
        const result = await postRequest(cortexRequest);
        endpoint?.monitor?.endCall(callId);

        const { error, data, cached } = result;
        if (cached) {
            logger.info(`<<< [${requestId}] served with cached response.`);
        }
        // Cache-interceptor style error arrays: surface the most recent one.
        if (error && error.length > 0) {
            const lastError = error[error.length - 1];
            return { error: lastError.toJSON() ?? lastError ?? error };
        }
        //logger.info(`<<< [${requestId}] response: ${data.choices[0].delta || data.choices[0]}`)
        return data;
    } catch (err) {
        logger.error(`Error in request: ${err.message || err}`);
        return { error: err };
    }
}
|
|
357
|
+
|
|
358
|
+
export {
|
|
359
|
+
axios, executeRequest, buildModelEndpoints, selectEndpoint, modelEndpoints
|
|
360
|
+
};
|