@aj-archipelago/cortex 1.3.49 → 1.3.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config.js +1 -1
- package/helper-apps/cortex-browser/Dockerfile +19 -31
- package/helper-apps/cortex-browser/function_app.py +708 -181
- package/helper-apps/cortex-browser/requirements.txt +4 -4
- package/helper-apps/cortex-file-handler/blobHandler.js +850 -429
- package/helper-apps/cortex-file-handler/constants.js +64 -48
- package/helper-apps/cortex-file-handler/docHelper.js +7 -114
- package/helper-apps/cortex-file-handler/fileChunker.js +96 -51
- package/helper-apps/cortex-file-handler/function.json +2 -6
- package/helper-apps/cortex-file-handler/helper.js +34 -25
- package/helper-apps/cortex-file-handler/index.js +324 -136
- package/helper-apps/cortex-file-handler/localFileHandler.js +56 -57
- package/helper-apps/cortex-file-handler/package-lock.json +6065 -5964
- package/helper-apps/cortex-file-handler/package.json +8 -4
- package/helper-apps/cortex-file-handler/redis.js +23 -17
- package/helper-apps/cortex-file-handler/scripts/setup-azure-container.js +12 -9
- package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +21 -18
- package/helper-apps/cortex-file-handler/scripts/test-azure.sh +1 -1
- package/helper-apps/cortex-file-handler/scripts/test-gcs.sh +1 -1
- package/helper-apps/cortex-file-handler/services/ConversionService.js +288 -0
- package/helper-apps/cortex-file-handler/services/FileConversionService.js +53 -0
- package/helper-apps/cortex-file-handler/start.js +63 -38
- package/helper-apps/cortex-file-handler/tests/FileConversionService.test.js +144 -0
- package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +88 -64
- package/helper-apps/cortex-file-handler/tests/fileChunker.test.js +114 -91
- package/helper-apps/cortex-file-handler/tests/fileUpload.test.js +351 -0
- package/helper-apps/cortex-file-handler/tests/files/DOCX_TestPage.docx +0 -0
- package/helper-apps/cortex-file-handler/tests/files/tests-example.xls +0 -0
- package/helper-apps/cortex-file-handler/tests/start.test.js +943 -642
- package/helper-apps/cortex-file-handler/tests/testUtils.helper.js +31 -0
- package/helper-apps/cortex-markitdown/.funcignore +1 -0
- package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/__init__.py +64 -0
- package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/function.json +21 -0
- package/helper-apps/cortex-markitdown/README.md +94 -0
- package/helper-apps/cortex-markitdown/host.json +15 -0
- package/helper-apps/cortex-markitdown/requirements.txt +2 -0
- package/lib/requestExecutor.js +44 -36
- package/package.json +1 -1
- package/pathways/system/entity/tools/sys_tool_cognitive_search.js +1 -1
- package/pathways/system/entity/tools/sys_tool_readfile.js +24 -2
- package/server/plugins/openAiWhisperPlugin.js +59 -87
- package/helper-apps/cortex-file-handler/tests/docHelper.test.js +0 -148
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import axios from 'axios';
|
|
2
|
+
|
|
3
|
+
/**
 * Test helper that removes an uploaded file and clears its hash entries
 * through the file-handler HTTP endpoint.
 *
 * Steps, in order:
 *  1. If `uploadedUrl` is given, take the last path segment of that URL, keep
 *     the part before the first underscore and issue a DELETE to
 *     `${baseUrl}?operation=delete&requestId=<that part>`. Any failure in this
 *     step (bad URL, network error) is deliberately ignored.
 *  2. GET `baseUrl` with `{ hash, clearHash: true }`.
 *  3. GET `baseUrl` with `{ hash: `${hash}_converted`, clearHash: true }`.
 *
 * Every request accepts any HTTP status, so a non-2xx reply never rejects.
 * Transport-level errors in steps 2 and 3 are NOT caught and propagate.
 *
 * @param {string} hash - Hash key to clear.
 * @param {string} [uploadedUrl] - URL of the uploaded file; skipped when falsy.
 * @param {string} baseUrl - File-handler endpoint the requests are sent to.
 * @returns {Promise<void>}
 */
export async function cleanupHashAndFile(hash, uploadedUrl, baseUrl) {
  // Treat every HTTP status as success so cleanup is not derailed by 4xx/5xx.
  const acceptAnyStatus = () => true;

  // Issues one "clear this hash" request against the handler.
  const clearHashEntry = (key) =>
    axios.get(baseUrl, {
      params: { hash: key, clearHash: true },
      validateStatus: acceptAnyStatus,
    });

  if (uploadedUrl) {
    try {
      const { pathname } = new URL(uploadedUrl);
      const lastSegment = pathname.split('/').pop();
      const [fileIdentifier] = lastSegment.split('_');
      await axios.delete(
        `${baseUrl}?operation=delete&requestId=${fileIdentifier}`,
        { validateStatus: acceptAnyStatus },
      );
    } catch (e) {
      // ignore
    }
  }

  await clearHashEntry(hash);
  await clearHashEntry(`${hash}_converted`);
}
|
|
23
|
+
|
|
24
|
+
/**
 * Extracts the folder prefix from an uploaded-file URL.
 *
 * The URL path is split on '/', one segment is selected, and the text before
 * that segment's first underscore is returned:
 *  - host `127.0.0.1:10000` (presumably the local blob emulator, whose paths
 *    carry an extra leading account segment - confirm against the test setup):
 *    element 3 of `pathname.split('/')`;
 *  - any other host: element 2.
 *
 * @param {string} url - Absolute URL of the uploaded file.
 * @returns {string} The selected path segment up to (not including) its first '_'.
 * @throws {TypeError} If `url` is not a valid absolute URL, or its path has no
 *   segment at the expected position.
 */
export function getFolderNameFromUrl(url) {
  const urlObj = new URL(url);
  const parts = urlObj.pathname.split('/');

  // Compare the parsed host instead of searching the raw string, so that
  // "127.0.0.1:10000" appearing in the path or query string of some other
  // host's URL cannot select the wrong segment.
  const folderIndex = urlObj.host === '127.0.0.1:10000' ? 3 : 2;

  const segment = parts[folderIndex];
  if (segment === undefined) {
    // Same exception type the bare `undefined.split` produced, but actionable.
    throw new TypeError(
      `Cannot derive folder name: URL path "${urlObj.pathname}" has no segment at index ${folderIndex}`,
    );
  }
  return segment.split('_')[0];
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
.venv
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import azure.functions as func
|
|
3
|
+
from markitdown import MarkItDown
|
|
4
|
+
import json
|
|
5
|
+
|
|
6
|
+
# Initialize MarkItDown converter (do this once, outside the function handler if possible)
|
|
7
|
+
# This is a global instance to be reused across invocations for efficiency.
|
|
8
|
+
# For LLM-based image description, you might need to configure llm_client and llm_model
|
|
9
|
+
# e.g., md = MarkItDown(llm_client=OpenAI(), llm_model="gpt-4o")
|
|
10
|
+
# For simplicity, we'll use the basic setup here.
|
|
11
|
+
md = MarkItDown(enable_plugins=True)
|
|
12
|
+
|
|
13
|
+
def main(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP entry point: convert the document at ``uri`` to Markdown.

    The ``uri`` is read from the query string first and, if absent there, from
    the ``uri`` key of a JSON object in the request body.

    Returns:
        * 200 with JSON ``{"uri": ..., "markdown": ...}`` on success.
        * 400 (plain text) when no ``uri`` is supplied, or when it is not an
          ``http``/``https`` URL.
        * 500 with JSON ``{"error": ..., "details": ...}`` when the conversion
          raises.
    """
    # Function-scope import keeps this handler self-contained (standard library only).
    from urllib.parse import urlparse

    logging.info('Python HTTP trigger function processed a request.')

    uri = req.params.get('uri')
    if not uri:
        try:
            req_body = req.get_json()
        except ValueError:
            req_body = None
        # A syntactically valid JSON body need not be an object (it may be a
        # list, string or number); only an object can carry a "uri" key.
        if isinstance(req_body, dict):
            uri = req_body.get('uri')

    if not uri:
        logging.warning("No URI provided in the request.")
        return func.HttpResponse(
            "Please pass a URI on the query string or in the request body",
            status_code=400
        )

    # SECURITY: ``uri`` is caller-controlled. MarkItDown's convert() also accepts
    # local file paths and file:/data: URIs, which would let a caller read files
    # from the Function host. Only web URLs are expected here, so anything that
    # is not http(s) is rejected before it reaches the converter.
    # NOTE(review): http(s) URLs pointing at internal-only hosts are still
    # fetched; add a host allow-list if this endpoint is reachable by untrusted
    # callers.
    try:
        scheme = urlparse(uri).scheme if isinstance(uri, str) else ""
    except ValueError:
        # urlparse raises ValueError on some malformed inputs (e.g. bad IPv6 literals).
        scheme = ""
    if scheme not in ("http", "https"):
        logging.warning("Rejected URI that is not an http(s) URL.")
        return func.HttpResponse(
            "The uri parameter must be an http or https URL",
            status_code=400
        )

    try:
        logging.info(f"Processing URI: {uri}")
        result = md.convert(uri)

        # The result object exposes the converted document as text_content.
        markdown_content = result.text_content

        # Returned as JSON for easier consumption by clients.
        response_data = {
            "uri": uri,
            "markdown": markdown_content
        }
        return func.HttpResponse(
            json.dumps(response_data),
            mimetype="application/json",
            status_code=200
        )
    except Exception as e:
        # Boundary handler: any converter failure becomes a structured 500
        # response instead of an unhandled exception in the Functions host.
        logging.error(f"Error converting URI {uri}: {str(e)}", exc_info=True)
        error_response = {
            "error": "Failed to convert URI to Markdown.",
            "details": str(e)
        }
        return func.HttpResponse(
            json.dumps(error_response),
            mimetype="application/json",
            status_code=500
        )
|
|
64
|
+
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
{
|
|
2
|
+
"scriptFile": "__init__.py",
|
|
3
|
+
"bindings": [
|
|
4
|
+
{
|
|
5
|
+
"authLevel": "function",
|
|
6
|
+
"type": "httpTrigger",
|
|
7
|
+
"direction": "in",
|
|
8
|
+
"name": "req",
|
|
9
|
+
"methods": [
|
|
10
|
+
"get",
|
|
11
|
+
"post"
|
|
12
|
+
],
|
|
13
|
+
"route": "convert"
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"type": "http",
|
|
17
|
+
"direction": "out",
|
|
18
|
+
"name": "$return"
|
|
19
|
+
}
|
|
20
|
+
]
|
|
21
|
+
}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Markitdown Azure Function Converter
|
|
2
|
+
|
|
3
|
+
This Azure Function App provides an HTTP endpoint to convert various file formats (specified by a URI) to Markdown using the `microsoft/markitdown` Python library.
|
|
4
|
+
|
|
5
|
+
## Function: MarkitdownConverterFunction
|
|
6
|
+
|
|
7
|
+
* **Trigger**: HTTP (GET, POST)
|
|
8
|
+
* **Route**: `/api/convert` (or as configured by your Azure Function host settings)
|
|
9
|
+
* **Authentication**: Function (requires a function key for access)
|
|
10
|
+
|
|
11
|
+
### Input
|
|
12
|
+
|
|
13
|
+
The function expects a `uri` parameter, either in the query string (for GET requests) or in the JSON body (for POST requests).
|
|
14
|
+
|
|
15
|
+
**Example GET Request:**
|
|
16
|
+
```
|
|
17
|
+
GET /api/convert?uri=https://www.example.com/somefile.pdf
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
**Example POST Request:**
|
|
21
|
+
```
|
|
22
|
+
POST /api/convert
|
|
23
|
+
Content-Type: application/json
|
|
24
|
+
|
|
25
|
+
{
|
|
26
|
+
"uri": "https://www.example.com/somefile.docx"
|
|
27
|
+
}
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
### Output
|
|
31
|
+
|
|
32
|
+
* **Success (200 OK):** Returns a JSON object containing the original URI and the converted Markdown content.
|
|
33
|
+
```json
|
|
34
|
+
{
|
|
35
|
+
"uri": "https://www.example.com/somefile.pdf",
|
|
36
|
+
"markdown": "# Converted Markdown Content\n..."
|
|
37
|
+
}
|
|
38
|
+
```
|
|
39
|
+
* **Bad Request (400):** If the `uri` parameter is missing.
|
|
40
|
+
* **Internal Server Error (500):** If an error occurs during the conversion process. The response will contain an error message and details.
|
|
41
|
+
```json
|
|
42
|
+
{
|
|
43
|
+
"error": "Failed to convert URI to Markdown.",
|
|
44
|
+
"details": "<specific error message from the library>"
|
|
45
|
+
}
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Project Structure
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
cortex-markitdown/
|
|
52
|
+
├── MarkitdownConverterFunction/
|
|
53
|
+
│ ├── __init__.py # The Python code for the Azure Function
|
|
54
|
+
│ └── function.json # Configuration file for the Azure Function (bindings, triggers)
|
|
55
|
+
├── .gitignore # Standard Python .gitignore
|
|
56
|
+
├── host.json # Configuration for the Azure Functions host
|
|
57
|
+
├── requirements.txt # Python package dependencies
|
|
58
|
+
└── README.md # This file
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Prerequisites
|
|
62
|
+
|
|
63
|
+
* Azure Functions Core Tools
|
|
64
|
+
* Python 3.10+ (required by `markitdown`; also check Azure Functions Python version compatibility)
|
|
65
|
+
* An Azure account (for deployment)
|
|
66
|
+
|
|
67
|
+
## Setup and Local Development
|
|
68
|
+
|
|
69
|
+
1. **Clone the repository (if applicable).**
|
|
70
|
+
2. **Create and activate a virtual environment:**
|
|
71
|
+
```bash
|
|
72
|
+
python -m venv .venv
|
|
73
|
+
source .venv/bin/activate # On Windows use `.venv\Scripts\activate`
|
|
74
|
+
```
|
|
75
|
+
3. **Install dependencies:**
|
|
76
|
+
```bash
|
|
77
|
+
pip install -r requirements.txt
|
|
78
|
+
```
|
|
79
|
+
4. **Run the Azure Function locally:**
|
|
80
|
+
```bash
|
|
81
|
+
func start
|
|
82
|
+
```
|
|
83
|
+
The function should be available at `http://localhost:7071/api/convert` (the port might vary).
|
|
84
|
+
|
|
85
|
+
## Dependencies
|
|
86
|
+
|
|
87
|
+
* `azure-functions`: For creating Azure Functions.
|
|
88
|
+
* `markitdown[all]`: The core library used for file conversion to Markdown. The `[all]` option installs all optional dependencies for handling various file types.
|
|
89
|
+
|
|
90
|
+
## Notes
|
|
91
|
+
|
|
92
|
+
* The `MarkItDown` instance in `__init__.py` is initialized with `enable_plugins=True` to allow for extended file format support through plugins.
|
|
93
|
+
* For handling images that require OCR or descriptions, the `markitdown` library might need an LLM client (e.g., OpenAI) configured. This is not included in the basic setup provided but can be added by modifying the `MarkItDown()` instantiation in `__init__.py` and ensuring the necessary environment variables (like API keys) are available to the function.
|
|
94
|
+
* Ensure that the URIs provided to the function are publicly accessible or accessible from the environment where the Azure Function is running.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": "2.0",
|
|
3
|
+
"logging": {
|
|
4
|
+
"applicationInsights": {
|
|
5
|
+
"samplingSettings": {
|
|
6
|
+
"isEnabled": true,
|
|
7
|
+
"excludedTypes": "Request"
|
|
8
|
+
}
|
|
9
|
+
}
|
|
10
|
+
},
|
|
11
|
+
"extensionBundle": {
|
|
12
|
+
"id": "Microsoft.Azure.Functions.ExtensionBundle",
|
|
13
|
+
"version": "[4.*, 5.0.0)"
|
|
14
|
+
}
|
|
15
|
+
}
|
package/lib/requestExecutor.js
CHANGED
|
@@ -214,7 +214,7 @@ const requestWithMonitor = async (endpoint, url, data, axiosConfigObj) => {
|
|
|
214
214
|
return { response, duration };
|
|
215
215
|
}
|
|
216
216
|
|
|
217
|
-
const MAX_RETRY =
|
|
217
|
+
const MAX_RETRY = 6; // retries for error handling
|
|
218
218
|
const MAX_DUPLICATE_REQUESTS = 3; // duplicate requests to manage latency spikes
|
|
219
219
|
const DUPLICATE_REQUEST_AFTER = 10; // 10 seconds
|
|
220
220
|
|
|
@@ -312,49 +312,57 @@ const makeRequest = async (cortexRequest) => {
|
|
|
312
312
|
const { response, duration } = await Promise.race(promises);
|
|
313
313
|
|
|
314
314
|
// if response status is 2xx
|
|
315
|
-
if (response
|
|
315
|
+
if (response?.status >= 200 && response?.status < 300) {
|
|
316
316
|
return { response, duration };
|
|
317
317
|
} else {
|
|
318
|
-
|
|
318
|
+
const error = new Error(`Request failed with status ${response?.status}`);
|
|
319
|
+
error.response = response;
|
|
320
|
+
error.duration = duration;
|
|
321
|
+
throw error;
|
|
319
322
|
}
|
|
320
323
|
} catch (error) {
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
}
|
|
344
|
-
// set up for a retry by selecting a new endpoint, which will also reinitialize the request
|
|
345
|
-
cortexRequest.selectNewEndpoint();
|
|
324
|
+
// Handle both cases: error with response object and direct error object
|
|
325
|
+
const status = error?.response?.status || error?.status || 502; // default to 502 if no status
|
|
326
|
+
const duration = error?.duration;
|
|
327
|
+
const response = error?.response || {error: error};
|
|
328
|
+
|
|
329
|
+
// Calculate backoff time - use Retry-After for 429s if available
|
|
330
|
+
let backoffTime = 1000 * Math.pow(2, i);
|
|
331
|
+
if (status === 429 && (response?.headers?.['retry-after'] || error?.headers?.['retry-after'])) {
|
|
332
|
+
backoffTime = parseInt(response?.headers?.['retry-after'] || error?.headers?.['retry-after']) * 1000;
|
|
333
|
+
logger.warn(`>>> [${requestId}] Rate limited (429). Retry-After: ${response?.headers?.['retry-after'] || error?.headers?.['retry-after']}s`);
|
|
334
|
+
}
|
|
335
|
+
const jitter = backoffTime * 0.2 * Math.random();
|
|
336
|
+
|
|
337
|
+
// if there is only one endpoint, only retry select error codes
|
|
338
|
+
if (cortexRequest.model.endpoints.length === 1) {
|
|
339
|
+
if (status !== 429 &&
|
|
340
|
+
status !== 408 &&
|
|
341
|
+
status !== 500 &&
|
|
342
|
+
status !== 502 &&
|
|
343
|
+
status !== 503 &&
|
|
344
|
+
status !== 504) {
|
|
345
|
+
return { response, duration };
|
|
346
346
|
}
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
347
|
+
// set up for a retry by reinitializing the request
|
|
348
|
+
cortexRequest.initRequest();
|
|
349
|
+
} else {
|
|
350
|
+
// if there are multiple endpoints, retry everything by default
|
|
351
|
+
// as it could be a temporary issue with one endpoint
|
|
352
|
+
// certain errors (e.g. 400) are problems with the request itself
|
|
353
|
+
// and should not be retried
|
|
354
|
+
if (status == 400 || status == 413) {
|
|
354
355
|
return { response, duration };
|
|
355
356
|
}
|
|
357
|
+
// set up for a retry by selecting a new endpoint, which will also reinitialize the request
|
|
358
|
+
cortexRequest.selectNewEndpoint();
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
if (i < MAX_RETRY - 1) {
|
|
362
|
+
logger.info(`>>> [${requestId}] retrying request due to ${status} response. Retry count: ${i + 1}. Retrying in ${backoffTime + jitter}ms`);
|
|
363
|
+
await new Promise(r => setTimeout(r, backoffTime + jitter));
|
|
356
364
|
} else {
|
|
357
|
-
|
|
365
|
+
return { response, duration };
|
|
358
366
|
}
|
|
359
367
|
}
|
|
360
368
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@aj-archipelago/cortex",
|
|
3
|
-
"version": "1.3.
|
|
3
|
+
"version": "1.3.51",
|
|
4
4
|
"description": "Cortex is a GraphQL API for AI. It provides a simple, extensible interface for using AI services from OpenAI, Azure and others.",
|
|
5
5
|
"private": false,
|
|
6
6
|
"repository": {
|
|
@@ -162,7 +162,7 @@ export default {
|
|
|
162
162
|
|
|
163
163
|
// Map tool names to index names
|
|
164
164
|
const toolToIndex = {
|
|
165
|
-
'
|
|
165
|
+
'searchpersonalindex': 'indexcortex',
|
|
166
166
|
'searchaja': 'indexucmsaja',
|
|
167
167
|
'searchaje': 'indexucmsaje',
|
|
168
168
|
'searchwires': 'indexwires'
|
|
@@ -49,7 +49,7 @@ export default {
|
|
|
49
49
|
icon: "📝",
|
|
50
50
|
function: {
|
|
51
51
|
name: "AnalyzeText",
|
|
52
|
-
description: "Use specifically for reading, analyzing, and answering questions about text
|
|
52
|
+
description: "Use specifically for reading, analyzing, and answering questions about text files (including csv, json, html, etc.).",
|
|
53
53
|
parameters: {
|
|
54
54
|
type: "object",
|
|
55
55
|
properties: {
|
|
@@ -65,7 +65,29 @@ export default {
|
|
|
65
65
|
required: ["detailedInstructions", "userMessage"]
|
|
66
66
|
}
|
|
67
67
|
}
|
|
68
|
-
},
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
type: "function",
|
|
71
|
+
icon: "📝",
|
|
72
|
+
function: {
|
|
73
|
+
name: "AnalyzeMarkdown",
|
|
74
|
+
description: "Use specifically for reading, analyzing, and answering questions about markdown files.",
|
|
75
|
+
parameters: {
|
|
76
|
+
type: "object",
|
|
77
|
+
properties: {
|
|
78
|
+
detailedInstructions: {
|
|
79
|
+
type: "string",
|
|
80
|
+
description: "Detailed instructions about what you need the tool to do - questions you need answered about the files, etc."
|
|
81
|
+
},
|
|
82
|
+
userMessage: {
|
|
83
|
+
type: "string",
|
|
84
|
+
description: "A user-friendly message that describes what you're doing with this tool"
|
|
85
|
+
}
|
|
86
|
+
},
|
|
87
|
+
required: ["detailedInstructions", "userMessage"]
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
},
|
|
69
91
|
{
|
|
70
92
|
type: "function",
|
|
71
93
|
icon: "🖼️",
|
|
@@ -8,12 +8,6 @@ import logger from '../../lib/logger.js';
|
|
|
8
8
|
import CortexRequest from '../../lib/cortexRequest.js';
|
|
9
9
|
import { downloadFile, deleteTempPath, convertSrtToText, alignSubtitles, getMediaChunks, markCompletedForCleanUp } from '../../lib/util.js';
|
|
10
10
|
|
|
11
|
-
const WHISPER_TS_API_URL = config.get('whisperTSApiUrl');
|
|
12
|
-
if(WHISPER_TS_API_URL){
|
|
13
|
-
logger.info(`WHISPER API URL using ${WHISPER_TS_API_URL}`);
|
|
14
|
-
}else{
|
|
15
|
-
logger.warn(`WHISPER API URL not set using default OpenAI API Whisper`);
|
|
16
|
-
}
|
|
17
11
|
|
|
18
12
|
const OFFSET_CHUNK = 500; //seconds of each chunk offset, only used if helper does not provide
|
|
19
13
|
|
|
@@ -41,7 +35,6 @@ class OpenAIWhisperPlugin extends ModelPlugin {
|
|
|
41
35
|
const response_format = responseFormat || 'text';
|
|
42
36
|
|
|
43
37
|
const whisperInitCallback = (requestInstance) => {
|
|
44
|
-
|
|
45
38
|
const formData = new FormData();
|
|
46
39
|
formData.append('file', fs.createReadStream(chunk));
|
|
47
40
|
formData.append('model', requestInstance.params.model);
|
|
@@ -51,7 +44,6 @@ class OpenAIWhisperPlugin extends ModelPlugin {
|
|
|
51
44
|
|
|
52
45
|
requestInstance.data = formData;
|
|
53
46
|
requestInstance.addHeaders = { ...formData.getHeaders() };
|
|
54
|
-
|
|
55
47
|
};
|
|
56
48
|
|
|
57
49
|
cortexRequest.initCallback = whisperInitCallback;
|
|
@@ -64,7 +56,6 @@ class OpenAIWhisperPlugin extends ModelPlugin {
|
|
|
64
56
|
}
|
|
65
57
|
|
|
66
58
|
const processTS = async (uri) => {
|
|
67
|
-
|
|
68
59
|
const tsparams = { fileurl:uri };
|
|
69
60
|
const { language } = parameters;
|
|
70
61
|
if(language) tsparams.language = language;
|
|
@@ -75,37 +66,18 @@ class OpenAIWhisperPlugin extends ModelPlugin {
|
|
|
75
66
|
tsparams.word_timestamps = !wordTimestamped ? "False" : wordTimestamped;
|
|
76
67
|
|
|
77
68
|
const cortexRequest = new CortexRequest({ pathwayResolver });
|
|
78
|
-
cortexRequest.url = WHISPER_TS_API_URL;
|
|
79
|
-
cortexRequest.data = tsparams;
|
|
80
69
|
const whisperInitCallback = (requestInstance) => {
|
|
81
|
-
requestInstance.url = WHISPER_TS_API_URL;
|
|
82
70
|
requestInstance.data = tsparams;
|
|
83
71
|
};
|
|
84
72
|
cortexRequest.initCallback = whisperInitCallback;
|
|
85
73
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
sendProgress(true, true);
|
|
91
|
-
try {
|
|
92
|
-
res = await this.executeRequest(cortexRequest);
|
|
93
|
-
if (!res) {
|
|
94
|
-
throw new Error('Received null or empty response');
|
|
95
|
-
}
|
|
96
|
-
if(res?.statusCode && res?.statusCode >= 400){
|
|
97
|
-
throw new Error(res?.message || 'An error occurred.');
|
|
98
|
-
}
|
|
99
|
-
break;
|
|
100
|
-
}
|
|
101
|
-
catch(err){
|
|
102
|
-
logger.warn(`Error calling timestamped API: ${err}. Retrying ${attempt+1} of ${MAX_RETRIES}...`);
|
|
103
|
-
attempt++;
|
|
104
|
-
}
|
|
74
|
+
sendProgress(true, true);
|
|
75
|
+
const res = await this.executeRequest(cortexRequest);
|
|
76
|
+
if (!res) {
|
|
77
|
+
throw new Error('Received null or empty response');
|
|
105
78
|
}
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
throw new Error(res.message || 'An error occurred.');
|
|
79
|
+
if(res?.statusCode && res?.statusCode >= 400){
|
|
80
|
+
throw new Error(res?.message || 'An error occurred.');
|
|
109
81
|
}
|
|
110
82
|
|
|
111
83
|
if(!wordTimestamped && !responseFormat){
|
|
@@ -151,71 +123,71 @@ class OpenAIWhisperPlugin extends ModelPlugin {
|
|
|
151
123
|
});
|
|
152
124
|
}
|
|
153
125
|
|
|
154
|
-
async
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
126
|
+
// Transcribes a single media URI and returns the transcription result.
// While the request is in flight, progress is reported every 3 seconds; the
// timer is always stopped and a final progress update sent, whether the
// transcription succeeded or failed. A failure is re-thrown to the caller
// after that cleanup has run.
const processURI = async (uri) => {
    const heartbeatId = setInterval(() => sendProgress(true), 3000);

    // use Timestamped API if model is oai-whisper-ts
    const transcribe = this.modelName === 'oai-whisper-ts' ? processTS : processChunk;

    let transcript = null;
    let failure = false;

    try {
        transcript = await transcribe(uri);
    } catch (err) {
        // Remember the error; it is re-thrown only after cleanup below.
        failure = err;
    } finally {
        clearInterval(heartbeatId);
        sendProgress();
    }

    if (failure) {
        throw failure;
    }

    return transcript;
}
|
|
188
157
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
if (!mediaChunks || !mediaChunks.length) {
|
|
193
|
-
throw new Error(`Error in getting chunks from media helper for file ${file}`);
|
|
194
|
-
}
|
|
158
|
+
let offsets = [];
|
|
159
|
+
let uris = []
|
|
195
160
|
|
|
196
|
-
|
|
197
|
-
|
|
161
|
+
try {
|
|
162
|
+
const mediaChunks = await getMediaChunks(file, requestId);
|
|
163
|
+
|
|
164
|
+
if (!mediaChunks || !mediaChunks.length) {
|
|
165
|
+
throw new Error(`Error in getting chunks from media helper for file ${file}`);
|
|
166
|
+
}
|
|
198
167
|
|
|
199
|
-
|
|
168
|
+
uris = mediaChunks.map((chunk) => chunk?.uri || chunk);
|
|
169
|
+
offsets = mediaChunks.map((chunk, index) => chunk?.offset || index * OFFSET_CHUNK);
|
|
200
170
|
|
|
201
|
-
|
|
202
|
-
sendProgress();
|
|
171
|
+
totalCount = mediaChunks.length + 1; // total number of chunks that will be processed
|
|
203
172
|
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
const promisesToProcess = currentBatchURIs.map(uri => processURI(uri));
|
|
207
|
-
const results = await Promise.all(promisesToProcess);
|
|
208
|
-
|
|
209
|
-
for(const res of results) {
|
|
210
|
-
result.push(res);
|
|
211
|
-
}
|
|
212
|
-
}
|
|
173
|
+
const batchSize = 4;
|
|
174
|
+
sendProgress();
|
|
213
175
|
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
176
|
+
for (let i = 0; i < uris.length; i += batchSize) {
|
|
177
|
+
const currentBatchURIs = uris.slice(i, i + batchSize);
|
|
178
|
+
const promisesToProcess = currentBatchURIs.map(uri => processURI(uri));
|
|
179
|
+
const results = await Promise.all(promisesToProcess);
|
|
180
|
+
|
|
181
|
+
for(const res of results) {
|
|
182
|
+
result.push(res);
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
} catch (error) {
|
|
187
|
+
const errMsg = `Transcribe error: ${error?.response?.data || error?.message || error}`;
|
|
188
|
+
logger.error(errMsg);
|
|
189
|
+
return errMsg;
|
|
190
|
+
}
|
|
219
191
|
finally {
|
|
220
192
|
try {
|
|
221
193
|
for (const chunk of chunks) {
|