@aj-archipelago/cortex 1.1.3 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. package/.eslintignore +3 -3
  2. package/README.md +17 -4
  3. package/config.js +45 -9
  4. package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/Dockerfile +1 -1
  5. package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/fileChunker.js +4 -1
  6. package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/package-lock.json +25 -216
  7. package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/package.json +2 -2
  8. package/helper-apps/cortex-whisper-wrapper/.dockerignore +27 -0
  9. package/helper-apps/cortex-whisper-wrapper/Dockerfile +32 -0
  10. package/helper-apps/cortex-whisper-wrapper/app.py +104 -0
  11. package/helper-apps/cortex-whisper-wrapper/docker-compose.debug.yml +12 -0
  12. package/helper-apps/cortex-whisper-wrapper/docker-compose.yml +10 -0
  13. package/helper-apps/cortex-whisper-wrapper/models/.gitkeep +0 -0
  14. package/helper-apps/cortex-whisper-wrapper/requirements.txt +5 -0
  15. package/lib/cortexRequest.js +117 -0
  16. package/lib/pathwayTools.js +2 -1
  17. package/lib/redisSubscription.js +2 -2
  18. package/lib/requestExecutor.js +360 -0
  19. package/lib/requestMonitor.js +131 -28
  20. package/package.json +2 -1
  21. package/pathways/summary.js +3 -3
  22. package/server/graphql.js +6 -6
  23. package/server/{pathwayPrompter.js → modelExecutor.js} +24 -21
  24. package/server/pathwayResolver.js +22 -17
  25. package/server/plugins/azureCognitivePlugin.js +25 -20
  26. package/server/plugins/azureTranslatePlugin.js +6 -10
  27. package/server/plugins/cohereGeneratePlugin.js +5 -12
  28. package/server/plugins/cohereSummarizePlugin.js +5 -12
  29. package/server/plugins/localModelPlugin.js +3 -3
  30. package/server/plugins/modelPlugin.js +18 -12
  31. package/server/plugins/openAiChatExtensionPlugin.js +5 -5
  32. package/server/plugins/openAiChatPlugin.js +8 -10
  33. package/server/plugins/openAiCompletionPlugin.js +9 -12
  34. package/server/plugins/openAiDallE3Plugin.js +14 -31
  35. package/server/plugins/openAiEmbeddingsPlugin.js +6 -9
  36. package/server/plugins/openAiImagePlugin.js +19 -15
  37. package/server/plugins/openAiWhisperPlugin.js +168 -100
  38. package/server/plugins/palmChatPlugin.js +9 -10
  39. package/server/plugins/palmCodeCompletionPlugin.js +2 -2
  40. package/server/plugins/palmCompletionPlugin.js +11 -12
  41. package/server/resolver.js +2 -2
  42. package/server/rest.js +1 -1
  43. package/tests/config.test.js +1 -1
  44. package/tests/mocks.js +5 -0
  45. package/tests/modelPlugin.test.js +3 -10
  46. package/tests/openAiChatPlugin.test.js +9 -8
  47. package/tests/openai_api.test.js +3 -3
  48. package/tests/palmChatPlugin.test.js +1 -1
  49. package/tests/palmCompletionPlugin.test.js +1 -1
  50. package/tests/pathwayResolver.test.js +2 -1
  51. package/tests/requestMonitor.test.js +94 -0
  52. package/tests/{requestDurationEstimator.test.js → requestMonitorDurationEstimator.test.js} +21 -17
  53. package/tests/truncateMessages.test.js +1 -1
  54. package/lib/request.js +0 -259
  55. package/lib/requestDurationEstimator.js +0 -90
  56. /package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/blobHandler.js +0 -0
  57. /package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/docHelper.js +0 -0
  58. /package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/function.json +0 -0
  59. /package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/helper.js +0 -0
  60. /package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/index.js +0 -0
  61. /package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/localFileHandler.js +0 -0
  62. /package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/redis.js +0 -0
  63. /package/{helper_apps/CortexFileHandler → helper-apps/cortex-file-handler}/start.js +0 -0
@@ -0,0 +1,104 @@
1
+ import uvicorn
2
+ from fastapi import FastAPI, HTTPException, Request
3
+ from uuid import uuid4
4
+ import os
5
+ import asyncio
6
+ import whisper
7
+ from whisper.utils import get_writer
8
+ from fastapi.encoders import jsonable_encoder
9
+ import time
10
+
11
+ model_download_root = './models'
12
+ model = whisper.load_model("large", download_root=model_download_root) #large, tiny
13
+
14
+ # Create a semaphore with a limit of 1
15
+ semaphore = asyncio.Semaphore(1)
16
+
17
+ app = FastAPI()
18
+
19
+ save_directory = "./tmp" # folder for downloaded files
20
+ os.makedirs(save_directory, exist_ok=True)
21
+
22
+
23
+ def delete_tmp_file(file_path):
24
+ try:
25
+ os.remove(file_path)
26
+ print(f"Temporary file '{file_path}' has been deleted.")
27
+ except OSError as e:
28
+ print(f"Error: {e.strerror}")
29
+
30
+ def transcribe(params):
31
+ if 'fileurl' not in params:
32
+ raise HTTPException(status_code=400, detail="fileurl parameter is required")
33
+
34
+ fileurl = params["fileurl"]
35
+
36
+ #word_timestamps bool, default True
37
+ word_timestamps = True
38
+ if 'word_timestamps' in params: #parse as bool
39
+ word_timestamps = False if params['word_timestamps'] == 'False' else True
40
+
41
+ print(f"Transcribing file {fileurl} with word_timestamps={word_timestamps}")
42
+ start_time = time.time()
43
+ result = model.transcribe(fileurl, word_timestamps=word_timestamps)
44
+ end_time = time.time()
45
+ execution_time = end_time - start_time
46
+ print("Transcribe execution time:", execution_time, "seconds")
47
+
48
+ srtpath = os.path.join(save_directory, str(uuid4()) + ".srt")
49
+
50
+ print(f"Saving transcription as : {srtpath}")
51
+ writer = get_writer("srt", save_directory)
52
+
53
+
54
+ writer_args = {'highlight_words': False, 'max_line_count': None, 'max_line_width': None, 'max_words_per_line': None}
55
+ if 'highlight_words' in params: #parse as bool
56
+ writer_args['highlight_words'] = params['highlight_words'] == 'True'
57
+ if 'max_line_count' in params: #parse as int
58
+ writer_args['max_line_count'] = int(params['max_line_count'])
59
+ if 'max_line_width' in params: #parse as int
60
+ writer_args['max_line_width'] = int(params['max_line_width'])
61
+ if 'max_words_per_line' in params: #parse as int
62
+ writer_args['max_words_per_line'] = int(params['max_words_per_line'])
63
+
64
+ # if and only if fileurl and word_timestamps=True, max_words_per_line=1
65
+ if fileurl and word_timestamps and len(params) <= 2:
66
+ writer_args['max_words_per_line'] = 1
67
+
68
+ # writer_args = {arg: args.pop(arg) for arg in word_options if arg in args}
69
+ writer(result, srtpath, **writer_args)
70
+
71
+
72
+ with open(srtpath, "r") as f:
73
+ srtstr = f.read()
74
+
75
+ # clean up tmp out files
76
+ delete_tmp_file(srtpath)
77
+
78
+ print(f"Transcription of file {fileurl} completed")
79
+ return srtstr
80
+
81
+
82
+ async def get_params(request: Request):
83
+ params = {}
84
+ if request.method == "POST":
85
+ body = jsonable_encoder(await request.json())
86
+ params = body
87
+ else:
88
+ params = dict(request.query_params)
89
+ return params
90
+
91
+ @app.get("/")
92
+ @app.post("/")
93
+ async def root(request: Request):
94
+ if semaphore.locked():
95
+ raise HTTPException(status_code=429, detail="Too Many Requests")
96
+
97
+ params = await get_params(request)
98
+ async with semaphore:
99
+ result = await asyncio.to_thread(transcribe, params)
100
+ return result
101
+
102
+ if __name__ == "__main__":
103
+ print("Starting APP Whisper server", flush=True)
104
+ uvicorn.run(app, host="0.0.0.0", port=8000)
@@ -0,0 +1,12 @@
1
+ version: '3.4'
2
+
3
+ services:
4
+ cortex:
5
+ image: arc/whisper
6
+ build:
7
+ context: .
8
+ dockerfile: ./Dockerfile
9
+ command: ["sh", "-c", "pip install debugpy -t /tmp && python /tmp/debugpy --wait-for-client --listen 0.0.0.0:5678 -m uvicorn helper_apps.WhisperX/app:app --host 0.0.0.0 --port 8000"]
10
+ ports:
11
+ - 8000:8000
12
+ - 5678:5678
@@ -0,0 +1,10 @@
1
+ version: '3.4'
2
+
3
+ services:
4
+ cortex:
5
+ image: arc/whisper
6
+ build:
7
+ context: .
8
+ dockerfile: ./Dockerfile
9
+ ports:
10
+ - 8000:8000
@@ -0,0 +1,5 @@
1
+ # To ensure app dependencies are ported from your virtual environment/host machine into your container, run 'pip freeze > requirements.txt' in the terminal to overwrite this file
2
+ fastapi[all]==0.89.0
3
+ uvicorn[standard]==0.20.0
4
+ gunicorn==20.1.0
5
+ openai-whisper
@@ -0,0 +1,117 @@
1
+ import { selectEndpoint } from './requestExecutor.js';
2
+
3
+ class CortexRequest {
4
+ constructor( { url, data, params, headers, cache, model, pathwayResolver, selectedEndpoint } = {}) {
5
+ this._url = url || '';
6
+ this._data = data || {};
7
+ this._params = params || {};
8
+ this._headers = headers || {};
9
+ this._cache = cache || {};
10
+ this._model = model || '';
11
+ this._pathwayResolver = pathwayResolver || {};
12
+ this._selectedEndpoint = selectedEndpoint || {};
13
+
14
+ if (this._pathwayResolver) {
15
+ this._model = this._pathwayResolver.model;
16
+ }
17
+
18
+ if (this._model) {
19
+ this.selectNewEndpoint();
20
+ }
21
+ }
22
+
23
+ selectNewEndpoint() {
24
+ const sep = selectEndpoint(this._model);
25
+ if (sep) {
26
+ this._selectedEndpoint = sep;
27
+ this._url = sep.url;
28
+ this._data = { ...this._data, ...sep.params };
29
+ this._headers = { ...this._headers, ...sep.headers };
30
+ this._params = { ...this._params, ...sep.params };
31
+ }
32
+ }
33
+
34
+ // url getter and setter
35
+ get url() {
36
+ return this._url;
37
+ }
38
+
39
+ set url(value) {
40
+ this._url = value;
41
+ }
42
+
43
+ // data getter and setter
44
+ get data() {
45
+ return this._data;
46
+ }
47
+
48
+ set data(value) {
49
+ this._data = value;
50
+ }
51
+
52
+ // params getter and setter
53
+ get params() {
54
+ return this._params;
55
+ }
56
+
57
+ set params(value) {
58
+ this._params = value;
59
+ }
60
+
61
+ // headers getter and setter
62
+ get headers() {
63
+ return this._headers;
64
+ }
65
+
66
+ set headers(value) {
67
+ this._headers = value;
68
+ }
69
+
70
+ // cache getter and setter
71
+ get cache() {
72
+ return this._cache;
73
+ }
74
+
75
+ set cache(value) {
76
+ this._cache = value;
77
+ }
78
+
79
+ // model getter and setter
80
+ get model() {
81
+ return this._model;
82
+ }
83
+
84
+ set model(value) {
85
+ this._model = value;
86
+ }
87
+
88
+ // requestId getter
89
+ get requestId() {
90
+ return this._pathwayResolver.requestId;
91
+ }
92
+
93
+ // pathway getter and setter
94
+ get pathway() {
95
+ return this._pathwayResolver.pathway;
96
+ }
97
+
98
+ // selectedEndpoint getter and setter
99
+ get selectedEndpoint() {
100
+ return this._selectedEndpoint;
101
+ }
102
+
103
+ set selectedEndpoint(value) {
104
+ this._selectedEndpoint = value;
105
+ }
106
+
107
+ // pathwayResolver getter and setter
108
+ get pathwayResolver() {
109
+ return this._pathwayResolver;
110
+ }
111
+
112
+ set pathwayResolver(value) {
113
+ this._pathwayResolver = value;
114
+ }
115
+ }
116
+
117
+ export default CortexRequest;
@@ -1,8 +1,9 @@
1
1
  // pathwayTools.js
2
2
  import { encode , decode } from 'gpt-3-encoder';
3
+ import { config } from '../config.js';
3
4
 
4
5
  // callPathway - call a pathway from another pathway
5
- const callPathway = async (config, pathwayName, args) => {
6
+ const callPathway = async (pathwayName, args) => {
6
7
  const pathway = config.get(`pathways.${pathwayName}`);
7
8
  if (!pathway) {
8
9
  throw new Error(`Pathway ${pathwayName} not found`);
@@ -125,7 +125,7 @@ async function publishRequestProgressSubscription(data) {
125
125
  requestState[requestId].useRedis = false;
126
126
  logger.info(`Starting local execution for registered async request: ${requestId}`);
127
127
  const { resolver, args } = requestState[requestId];
128
- resolver(args, false);
128
+ resolver && resolver(args, false);
129
129
  }
130
130
  } else {
131
131
  idsToForward.push(requestId);
@@ -163,7 +163,7 @@ function handleSubscription(data){
163
163
  requestState[requestId].useRedis = true;
164
164
  logger.info(`Starting execution for registered async request: ${requestId}`);
165
165
  const { resolver, args } = requestState[requestId];
166
- resolver(args);
166
+ resolver && resolver(args);
167
167
  }
168
168
  }
169
169
  }
@@ -0,0 +1,360 @@
1
+ import Bottleneck from 'bottleneck/es5.js';
2
+ import RequestMonitor from './requestMonitor.js';
3
+ import { config } from '../config.js';
4
+ import axios from 'axios';
5
+ import { setupCache } from 'axios-cache-interceptor';
6
+ import Redis from 'ioredis';
7
+ import logger from './logger.js';
8
+ import { v4 as uuidv4 } from 'uuid';
9
+
10
+ const connectionString = config.get('storageConnectionString');
11
+
12
+ if (!connectionString) {
13
+ logger.info('No STORAGE_CONNECTION_STRING found in environment. Redis features (caching, pubsub, clustered limiters) disabled.')
14
+ } else {
15
+ logger.info('Using Redis connection specified in STORAGE_CONNECTION_STRING.');
16
+ }
17
+
18
+ let client;
19
+
20
+ if (connectionString) {
21
+ try {
22
+ client = new Redis(connectionString);
23
+ } catch (error) {
24
+ logger.error(`Redis connection error: ${error}`);
25
+ }
26
+ }
27
+
28
+ const cortexId = config.get('cortexId');
29
+ const connection = client && new Bottleneck.IORedisConnection({ client: client });
30
+
31
+ let modelEndpoints = {};
32
+
33
+ const createLimiter = (endpoint, name, index) => {
34
+ const rps = endpoint.requestsPerSecond ?? 100;
35
+ let limiterOptions = {
36
+ minTime: 1000 / rps,
37
+ maxConcurrent: rps,
38
+ reservoir: rps, // Number of tokens available initially
39
+ reservoirRefreshAmount: rps, // Number of tokens added per interval
40
+ reservoirRefreshInterval: 1000, // Interval in milliseconds
41
+ };
42
+
43
+ // If Redis connection exists, add id and connection to enable clustering
44
+ if (connection) {
45
+ limiterOptions.id = `${cortexId}-${name}-${index}-limiter`; // Unique id for each limiter
46
+ limiterOptions.connection = connection; // Shared Redis connection
47
+ }
48
+
49
+ endpoint.limiter = new Bottleneck(limiterOptions);
50
+
51
+ endpoint.limiter.on('error', (err) => {
52
+ logger.error(`Limiter error for ${cortexId}-${name}-${index}: ${err}`);
53
+ endpoint.limiter.disconnect();
54
+ createLimiter(endpoint, name, index);
55
+ logger.info(`New limiter created for ${cortexId}-${name}-${index}`)
56
+ });
57
+
58
+ endpoint.limiter.on('failed', (error, info) => {
59
+ if (error.name === 'CanceledError') {
60
+ logger.debug(`Request cancelled for ${cortexId}-${name}-${index}: Id: ${info.options.id || 'none'}`);
61
+ } else {
62
+ logger.error(`Request failed for ${cortexId}-${name}-${index}: Id: ${info.options.id || 'none'}: ${error}`);
63
+ }
64
+ });
65
+
66
+ endpoint.limiter.on('debug', (message) => {
67
+ if (!message.includes('heartbeat.lua')) {
68
+ logger.debug(`Limiter ${cortexId}-${name}-${index}: ${message}`);
69
+ }
70
+ });
71
+ }
72
+
73
+ const buildModelEndpoints = (config) => {
74
+ modelEndpoints = JSON.parse(JSON.stringify(config.get('models')));
75
+ logger.info(`Building ${connection ? 'Redis clustered' : 'local'} model rate limiters for ${cortexId}...`);
76
+ for (const [name, model] of Object.entries(modelEndpoints)) {
77
+ model.endpoints.forEach((endpoint, index) => {
78
+ createLimiter(endpoint, name, index)
79
+ endpoint.monitor = new RequestMonitor();
80
+ });
81
+ }
82
+ }
83
+
84
+ let currentIndex = 0; // for round-robin selection
85
+
86
+ const selectEndpoint = (model) => {
87
+ if (!model || !Array.isArray(model.endpoints) || model.endpoints.length === 0) {
88
+ return null;
89
+ } else {
90
+ logger.debug(`Selecting endpoint for model ${model.name}...`);
91
+ if (model.endpoints.length === 1) {
92
+ logger.debug(`Only one endpoint for model ${model.name}. No selection required.`);
93
+ return model.endpoints[0];
94
+ }
95
+
96
+ let healthyEndpoints = model.endpoints.filter(endpoint => endpoint.monitor.healthy);
97
+ if (healthyEndpoints.length === 0) {
98
+ const selectedEndpoint = model.endpoints[currentIndex % model.endpoints.length];
99
+ currentIndex++;
100
+ logger.warn(`No healthy endpoints for model ${model.name}. Using round-robin selection. Selected: ${selectedEndpoint.name || 'default'}`);
101
+ return selectedEndpoint;
102
+ }
103
+
104
+ healthyEndpoints.forEach(endpoint =>{
105
+ logger.debug(`Healthy endpoint: ${endpoint.name || 'default'}, duration: ${endpoint.monitor.getAverageCallDuration()}ms`);
106
+ })
107
+
108
+ let selectedEndpoint;
109
+ const durations = healthyEndpoints.map(endpoint => endpoint.monitor.getAverageCallDuration());
110
+ if (shouldUseRoundRobin(durations)) {
111
+ selectedEndpoint = healthyEndpoints[currentIndex % healthyEndpoints.length];
112
+ currentIndex++;
113
+ logger.debug(`All endpoints are performing similarly. Using round-robin selection. Selected: ${selectedEndpoint.name || 'default'}`);
114
+ } else {
115
+ selectedEndpoint = fastestEndpoint(healthyEndpoints);
116
+ logger.debug(`Selected fastest endpoint: ${selectedEndpoint.name || 'default'}`);
117
+ }
118
+
119
+ return selectedEndpoint;
120
+ }
121
+ }
122
+
123
+ const calculateStandardDeviation = (durations) => {
124
+ const mean = durations.reduce((total, value) => total + value, 0) / durations.length;
125
+ const variance = durations.reduce((total, value) => total + Math.pow(value - mean, 2), 0) / durations.length;
126
+ return Math.sqrt(variance);
127
+ }
128
+
129
+ const shouldUseRoundRobin = (durations) => {
130
+ const standardDeviation = calculateStandardDeviation(durations);
131
+ const threshold = 10;
132
+ return standardDeviation <= threshold;
133
+ }
134
+
135
+ const fastestEndpoint = (endpoints) => {
136
+ return endpoints.reduce((fastest, current) => {
137
+ if (current.monitor.getAverageCallDuration() < fastest.monitor.getAverageCallDuration()) {
138
+ return current;
139
+ } else {
140
+ return fastest;
141
+ }
142
+ });
143
+ }
144
+
145
+ let cortexAxios = axios;
146
+
147
+ if (config.get('enableCache')) {
148
+ // Setup cache
149
+ cortexAxios = setupCache(axios, {
150
+ // enable cache for all requests by default
151
+ methods: ['get', 'post', 'put', 'delete', 'patch'],
152
+ interpretHeader: false,
153
+ ttl: 1000 * 60 * 60 * 24 * 7, // 7 days
154
+ });
155
+ }
156
+
157
+ setInterval(() => {
158
+ // Iterate over each model
159
+ for (const [name, model] of Object.entries(modelEndpoints)) {
160
+ // Iterate over each endpoint in the current model
161
+ let endpointIndex = 0;
162
+ model.endpoints.forEach((endpoint) => {
163
+ const monitor = endpoint.monitor;
164
+ if (!monitor) {
165
+ // Skip if monitor does not exist
166
+ return;
167
+ }
168
+
169
+ const callRate = monitor.getPeakCallRate();
170
+
171
+ if (callRate > 0) {
172
+ const error429Rate = monitor.getError429Rate();
173
+ const errorRate = monitor.getErrorRate();
174
+ const avgCallDuration = monitor.getAverageCallDuration();
175
+ logger.debug('------------------------');
176
+ logger.debug(`Monitor of ${name} endpoint ${endpoint.name || endpointIndex} Call rate: ${callRate} calls/sec, duration: ${avgCallDuration}ms, 429 errors: ${error429Rate * 100}%, errors: ${errorRate * 100}%`);
177
+ logger.debug('------------------------');
178
+ }
179
+ endpointIndex++;
180
+ });
181
+ }
182
+ }, 10000); // Log rates every 10 seconds (10000 ms).
183
+
184
+ const postWithMonitor = async (endpoint, url, data, axiosConfigObj) => {
185
+ return cortexAxios.post(url, data, axiosConfigObj);
186
+ }
187
+
188
+ const MAX_RETRY = 10; // retries for error handling
189
+ const MAX_DUPLICATE_REQUESTS = 3; // duplicate requests to manage latency spikes
190
+ const DUPLICATE_REQUEST_AFTER = 10; // 10 seconds
191
+
192
+ const postRequest = async (cortexRequest) => {
193
+ let promises = [];
194
+ for (let i = 0; i < MAX_RETRY; i++) {
195
+ const { url, data, params, headers, cache, selectedEndpoint, requestId, pathway, model} = cortexRequest;
196
+ const enableDuplicateRequests = pathway?.enableDuplicateRequests !== undefined ? pathway.enableDuplicateRequests : config.get('enableDuplicateRequests');
197
+ let maxDuplicateRequests = enableDuplicateRequests ? MAX_DUPLICATE_REQUESTS : 1;
198
+ let duplicateRequestAfter = (pathway?.duplicateRequestAfter || DUPLICATE_REQUEST_AFTER) * 1000;
199
+
200
+ if (enableDuplicateRequests) {
201
+ //logger.info(`>>> [${requestId}] Duplicate requests enabled after ${duplicateRequestAfter / 1000} seconds`);
202
+ }
203
+
204
+ const axiosConfigObj = { params, headers, cache };
205
+ const streamRequested = (params?.stream || data?.stream);
206
+ if (streamRequested && model.supportsStreaming) {
207
+ axiosConfigObj.responseType = 'stream';
208
+ promises.push(selectedEndpoint.limiter.schedule({expiration: pathway.timeout * 1000 + 1000, id: `${requestId}_${uuidv4()}`},() => postWithMonitor(selectedEndpoint, url, data, axiosConfigObj)));
209
+ } else {
210
+ if (streamRequested) {
211
+ logger.info(`>>> [${requestId}] ${model} does not support streaming - sending non-streaming request`);
212
+ axiosConfigObj.params.stream = false;
213
+ data.stream = false;
214
+ }
215
+ const controllers = Array.from({ length: maxDuplicateRequests }, () => new AbortController());
216
+ promises = controllers.map((controller, index) =>
217
+ new Promise((resolve, reject) => {
218
+ const duplicateRequestTime = duplicateRequestAfter * Math.pow(2, index) - duplicateRequestAfter;
219
+ const jitter = duplicateRequestTime * 0.2 * Math.random();
220
+ const duplicateRequestTimeout = Math.max(0, duplicateRequestTime + jitter);
221
+ setTimeout(async () => {
222
+ try {
223
+ const endpointName = selectedEndpoint.name || model;
224
+ if (!selectedEndpoint.limiter) {
225
+ throw new Error(`No limiter for endpoint ${endpointName}!`);
226
+ }
227
+ const axiosConfigObj = { params, headers, cache };
228
+
229
+ let response = null;
230
+
231
+ if (!controller.signal?.aborted) {
232
+
233
+ axiosConfigObj.signal = controller.signal;
234
+ axiosConfigObj.headers['X-Cortex-Request-Index'] = index;
235
+
236
+ if (index === 0) {
237
+ //logger.info(`>>> [${requestId}] sending request to ${endpointName} API ${axiosConfigObj.responseType === 'stream' ? 'with streaming' : ''}`);
238
+ } else {
239
+ if (model.supportsStreaming) {
240
+ axiosConfigObj.responseType = 'stream';
241
+ axiosConfigObj.cache = false;
242
+ }
243
+ const logMessage = `>>> [${requestId}] taking too long - sending duplicate request ${index} to ${endpointName} API ${axiosConfigObj.responseType === 'stream' ? 'with streaming' : ''}`;
244
+ const header = '>'.repeat(logMessage.length);
245
+ logger.info(`\n${header}\n${logMessage}`);
246
+ }
247
+
248
+ response = await selectedEndpoint.limiter.schedule({expiration: pathway.timeout * 1000 + 1000, id: `${requestId}_${uuidv4()}`}, () => postWithMonitor(selectedEndpoint, url, data, axiosConfigObj));
249
+
250
+ if (!controller.signal?.aborted) {
251
+
252
+ //logger.info(`<<< [${requestId}] received response for request ${index}`);
253
+
254
+ if (axiosConfigObj.responseType === 'stream') {
255
+ // Buffering and collecting the stream data
256
+ logger.info(`<<< [${requestId}] buffering streaming response for request ${index}`);
257
+ response = await new Promise((resolve, reject) => {
258
+ let responseData = '';
259
+ response.data.on('data', (chunk) => {
260
+ responseData += chunk;
261
+ //logger.info(`<<< [${requestId}] received chunk for request ${index}`);
262
+ });
263
+ response.data.on('end', () => {
264
+ response.data = JSON.parse(responseData);
265
+ resolve(response);
266
+ });
267
+ response.data.on('error', (error) => {
268
+ reject(error);
269
+ });
270
+ });
271
+ }
272
+ }
273
+ }
274
+
275
+ resolve(response);
276
+
277
+ } catch (error) {
278
+ if (error.name === 'AbortError' || error.name === 'CanceledError') {
279
+ //logger.info(`XXX [${requestId}] request ${index} was cancelled`);
280
+ reject(error);
281
+ } else {
282
+ logger.error(`!!! [${requestId}] request ${index} failed with error: ${error?.response?.data?.error?.message || error}`);
283
+ reject(error);
284
+ }
285
+ } finally {
286
+ controllers.forEach(controller => controller.abort());
287
+ }
288
+ }, duplicateRequestTimeout);
289
+ })
290
+ );
291
+ }
292
+
293
+ try {
294
+ const response = await Promise.race(promises);
295
+
296
+ // if response status is 2xx
297
+ if (response.status >= 200 && response.status < 300) {
298
+ return response;
299
+ } else {
300
+ throw new Error(`Received error response: ${response.status}`);
301
+ }
302
+ } catch (error) {
303
+ if (error.response) {
304
+ selectedEndpoint.monitor.incrementErrorCount();
305
+ const status = error.response.status;
306
+
307
+ if (status === 429) {
308
+ selectedEndpoint.monitor.incrementError429Count();
309
+ }
310
+
311
+ if (cortexRequest.model.endpoints.length === 1) {
312
+ if (status !== 429) {
313
+ return error.response;
314
+ }
315
+ } else {
316
+ // if there are multiple endpoints, retry everything
317
+ cortexRequest.selectNewEndpoint();
318
+ }
319
+
320
+ logger.info(`>>> [${requestId}] retrying request due to ${status} response. Retry count: ${i + 1}`);
321
+ if (i < MAX_RETRY - 1) {
322
+ const backoffTime = 200 * Math.pow(2, i);
323
+ const jitter = backoffTime * 0.2 * Math.random();
324
+ await new Promise(r => setTimeout(r, backoffTime + jitter));
325
+ } else {
326
+ return error.response;
327
+ }
328
+ } else {
329
+ throw error;
330
+ }
331
+ }
332
+ }
333
+ };
334
+
335
+ const executeRequest = async (cortexRequest) => {
336
+ try {
337
+ const endpoint = cortexRequest.selectedEndpoint;
338
+ const callId = endpoint?.monitor?.startCall();
339
+ const response = await postRequest(cortexRequest);
340
+ endpoint?.monitor?.endCall(callId);
341
+ const requestId = cortexRequest.requestId;
342
+ const { error, data, cached } = response;
343
+ if (cached) {
344
+ logger.info(`<<< [${requestId}] served with cached response.`);
345
+ }
346
+ if (error && error.length > 0) {
347
+ const lastError = error[error.length - 1];
348
+ return { error: lastError.toJSON() ?? lastError ?? error };
349
+ }
350
+ //logger.info(`<<< [${requestId}] response: ${data.choices[0].delta || data.choices[0]}`)
351
+ return data;
352
+ } catch (error) {
353
+ logger.error(`Error in request: ${error.message || error}`);
354
+ return { error: error };
355
+ }
356
+ }
357
+
358
+ export {
359
+ axios, executeRequest, buildModelEndpoints, selectEndpoint, modelEndpoints
360
+ };