@aj-archipelago/cortex 1.0.13 → 1.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/helper_apps/MediaFileChunker/Dockerfile +3 -4
- package/helper_apps/MediaFileChunker/docHelper.js +15 -0
- package/helper_apps/MediaFileChunker/index.js +20 -4
- package/helper_apps/MediaFileChunker/package-lock.json +11 -0
- package/helper_apps/MediaFileChunker/package.json +1 -0
- package/package.json +1 -1
- package/pathways/index.js +2 -0
- package/server/plugins/azureCognitivePlugin.js +46 -2
|
@@ -2,6 +2,7 @@ import pdfjsLib from 'pdfjs-dist';
|
|
|
2
2
|
import fs from 'fs/promises';
|
|
3
3
|
import mammoth from 'mammoth';
|
|
4
4
|
import XLSX from 'xlsx';
|
|
5
|
+
import Papa from 'papaparse';
|
|
5
6
|
|
|
6
7
|
export async function txtToText(filePath) {
|
|
7
8
|
const text = await fs.readFile(filePath, 'utf-8');
|
|
@@ -43,6 +44,18 @@ export async function pdfToText(filePath) {
|
|
|
43
44
|
return finalText;
|
|
44
45
|
}
|
|
45
46
|
|
|
47
|
+
/**
 * Read a CSV file and flatten it into plain text.
 *
 * Each parsed CSV row becomes one output line whose cells are separated by a
 * single space; every line is terminated with '\n'.
 *
 * @param {string} filePath - Path of the CSV file to read (decoded as UTF-8).
 * @returns {Promise<string>} The flattened text, or '' when the file has no rows.
 */
export async function csvToText(filePath) {
    const text = await fs.readFile(filePath, 'utf-8');

    // With default options Papa Parse reports blank lines (including the one
    // implied by a trailing newline) as [''] rows, which would surface as
    // spurious empty lines in the extracted text. Skip them at parse time.
    const { data: rows } = Papa.parse(text, { skipEmptyLines: true });

    return rows.map((row) => `${row.join(' ')}\n`).join('');
}
|
|
58
|
+
|
|
46
59
|
export async function documentToText(filePath) {
|
|
47
60
|
const fileExtension = filePath.split('.').pop();
|
|
48
61
|
|
|
@@ -55,6 +68,8 @@ export async function documentToText(filePath) {
|
|
|
55
68
|
return docxToText(filePath);
|
|
56
69
|
case 'xlsx':
|
|
57
70
|
return xlsxToText(filePath);
|
|
71
|
+
case 'csv':
|
|
72
|
+
return csvToText(filePath);
|
|
58
73
|
default:
|
|
59
74
|
throw new Error(`Unsupported file type: ${fileExtension}`);
|
|
60
75
|
}
|
|
@@ -7,6 +7,7 @@ import { documentToText, easyChunker } from './docHelper.js';
|
|
|
7
7
|
import path from 'path';
|
|
8
8
|
import os from 'os';
|
|
9
9
|
import { v4 as uuidv4 } from 'uuid';
|
|
10
|
+
import fs from 'fs';
|
|
10
11
|
|
|
11
12
|
const useAzure = process.env.AZURE_STORAGE_CONNECTION_STRING ? true : false;
|
|
12
13
|
console.log(useAzure ? 'Using Azure Storage' : 'Using local file system');
|
|
@@ -38,7 +39,7 @@ async function main(context, req) {
|
|
|
38
39
|
return
|
|
39
40
|
}
|
|
40
41
|
|
|
41
|
-
const { uri, requestId } = req.body?.params || req.query;
|
|
42
|
+
const { uri, requestId, save } = req.body?.params || req.query;
|
|
42
43
|
if (!uri || !requestId) {
|
|
43
44
|
context.res = {
|
|
44
45
|
status: 400,
|
|
@@ -63,14 +64,29 @@ async function main(context, req) {
|
|
|
63
64
|
await publishRequestProgress({ requestId, progress, completedCount, totalCount, numberOfChunks, data });
|
|
64
65
|
}
|
|
65
66
|
|
|
66
|
-
const isDocument = ['.pdf', '.txt', '.docx', '.xlsx'].some(ext => uri.toLowerCase().endsWith(ext));
|
|
67
|
+
const isDocument = ['.pdf', '.txt', '.docx', '.xlsx', '.csv'].some(ext => uri.toLowerCase().endsWith(ext));
|
|
67
68
|
|
|
68
69
|
try {
|
|
69
70
|
if (isDocument) {
|
|
70
71
|
const extension = path.extname(uri).toLowerCase();
|
|
71
72
|
const file = path.join(os.tmpdir(), `${uuidv4()}${extension}`);
|
|
72
|
-
await downloadFile(uri,file)
|
|
73
|
-
|
|
73
|
+
await downloadFile(uri, file)
|
|
74
|
+
const text = await documentToText(file);
|
|
75
|
+
if (save) {
|
|
76
|
+
const fileName = `${uuidv4()}.txt`; // generate unique file name
|
|
77
|
+
const filePath = path.join(os.tmpdir(), fileName);
|
|
78
|
+
const tmpPath = filePath;
|
|
79
|
+
fs.writeFileSync(filePath, text); // write text to file
|
|
80
|
+
|
|
81
|
+
// save file to the cloud or local file system
|
|
82
|
+
const saveResult = useAzure ? await saveFileToBlob(filePath, requestId) : await moveFileToPublicFolder(filePath, requestId);
|
|
83
|
+
result.push(saveResult);
|
|
84
|
+
|
|
85
|
+
// delete temporary file
|
|
86
|
+
fs.unlinkSync(tmpPath);
|
|
87
|
+
} else {
|
|
88
|
+
result.push(...easyChunker(text));
|
|
89
|
+
}
|
|
74
90
|
}else{
|
|
75
91
|
|
|
76
92
|
if (isYoutubeUrl) {
|
|
@@ -18,6 +18,7 @@
|
|
|
18
18
|
"fluent-ffmpeg": "^2.1.2",
|
|
19
19
|
"ioredis": "^5.3.1",
|
|
20
20
|
"mammoth": "^1.6.0",
|
|
21
|
+
"papaparse": "^5.4.1",
|
|
21
22
|
"pdfjs-dist": "^3.9.179",
|
|
22
23
|
"public-ip": "^6.0.1",
|
|
23
24
|
"uuid": "^9.0.0",
|
|
@@ -1989,6 +1990,11 @@
|
|
|
1989
1990
|
"resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz",
|
|
1990
1991
|
"integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw=="
|
|
1991
1992
|
},
|
|
1993
|
+
"node_modules/papaparse": {
|
|
1994
|
+
"version": "5.4.1",
|
|
1995
|
+
"resolved": "https://registry.npmjs.org/papaparse/-/papaparse-5.4.1.tgz",
|
|
1996
|
+
"integrity": "sha512-HipMsgJkZu8br23pW15uvo6sib6wne/4woLZPlFf3rpDyMe9ywEXUsuD7+6K9PRkJlVT51j/sCOYDKGGS3ZJrw=="
|
|
1997
|
+
},
|
|
1992
1998
|
"node_modules/parseurl": {
|
|
1993
1999
|
"version": "1.3.3",
|
|
1994
2000
|
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
|
|
@@ -4098,6 +4104,11 @@
|
|
|
4098
4104
|
"resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz",
|
|
4099
4105
|
"integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw=="
|
|
4100
4106
|
},
|
|
4107
|
+
"papaparse": {
|
|
4108
|
+
"version": "5.4.1",
|
|
4109
|
+
"resolved": "https://registry.npmjs.org/papaparse/-/papaparse-5.4.1.tgz",
|
|
4110
|
+
"integrity": "sha512-HipMsgJkZu8br23pW15uvo6sib6wne/4woLZPlFf3rpDyMe9ywEXUsuD7+6K9PRkJlVT51j/sCOYDKGGS3ZJrw=="
|
|
4111
|
+
},
|
|
4101
4112
|
"parseurl": {
|
|
4102
4113
|
"version": "1.3.3",
|
|
4103
4114
|
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
|
package/package.json
CHANGED
package/pathways/index.js
CHANGED
|
@@ -6,6 +6,7 @@ import cognitive_insert from './cognitive_insert.js';
|
|
|
6
6
|
import cognitive_search from './cognitive_search.js';
|
|
7
7
|
import complete from './complete.js';
|
|
8
8
|
import entities from './entities.js';
|
|
9
|
+
import language from './language.js';
|
|
9
10
|
import paraphrase from './paraphrase.js';
|
|
10
11
|
import sentiment from './sentiment.js';
|
|
11
12
|
import summary from './summary.js';
|
|
@@ -28,6 +29,7 @@ export {
|
|
|
28
29
|
complete,
|
|
29
30
|
embeddings,
|
|
30
31
|
entities,
|
|
32
|
+
language,
|
|
31
33
|
paraphrase,
|
|
32
34
|
sentiment,
|
|
33
35
|
summary,
|
|
@@ -2,6 +2,11 @@
|
|
|
2
2
|
import { callPathway } from '../../lib/pathwayTools.js';
|
|
3
3
|
import ModelPlugin from './modelPlugin.js';
|
|
4
4
|
import { v4 as uuidv4 } from 'uuid';
|
|
5
|
+
import path from 'path';
|
|
6
|
+
import { config } from '../../config.js';
|
|
7
|
+
import { axios } from '../../lib/request.js';
|
|
8
|
+
|
|
9
|
+
const API_URL = config.get('whisperMediaApiUrl');
|
|
5
10
|
|
|
6
11
|
const TOP = 1000;
|
|
7
12
|
|
|
@@ -55,7 +60,8 @@ class AzureCognitivePlugin extends ModelPlugin {
|
|
|
55
60
|
content: text,
|
|
56
61
|
contentVector: inputVector || (await calculateInputVector()),
|
|
57
62
|
owner: savedContextId,
|
|
58
|
-
docId: docId || uuidv4()
|
|
63
|
+
docId: docId || uuidv4(),
|
|
64
|
+
createdAt: new Date().toISOString()
|
|
59
65
|
}
|
|
60
66
|
// if(!privateData){
|
|
61
67
|
// delete doc.owner;
|
|
@@ -108,17 +114,55 @@ class AzureCognitivePlugin extends ModelPlugin {
|
|
|
108
114
|
}
|
|
109
115
|
}
|
|
110
116
|
|
|
117
|
+
async markCompletedForCleanUp(requestId) {
|
|
118
|
+
try {
|
|
119
|
+
if (API_URL) {
|
|
120
|
+
//call helper api to mark processing as completed
|
|
121
|
+
const res = await axios.delete(API_URL, { params: { requestId } });
|
|
122
|
+
console.log(`Marked request ${requestId} as completed:`, res.data);
|
|
123
|
+
return res.data;
|
|
124
|
+
}
|
|
125
|
+
} catch (err) {
|
|
126
|
+
console.log(`Error marking request ${requestId} as completed:`, err);
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
|
|
111
130
|
// Execute the request to the Azure Cognitive API
|
|
112
131
|
async execute(text, parameters, prompt, pathwayResolver) {
|
|
113
|
-
const { requestId, pathway, savedContextId } = pathwayResolver;
|
|
132
|
+
const { requestId, pathway, savedContextId, savedContext } = pathwayResolver;
|
|
114
133
|
const mode = this.promptParameters.mode || 'search';
|
|
115
134
|
let url = this.ensureMode(this.requestUrl(text), mode == 'delete' ? 'index' : mode);
|
|
116
135
|
const indexName = parameters.indexName || 'indexcortex';
|
|
117
136
|
url = this.ensureIndex(url, indexName);
|
|
118
137
|
const headers = this.model.headers;
|
|
119
138
|
|
|
139
|
+
const { file } = parameters;
|
|
140
|
+
if(file){
|
|
141
|
+
let url = file;
|
|
142
|
+
//if not txt file, use helper app to convert to txt
|
|
143
|
+
const extension = path.extname(file).toLowerCase();
|
|
144
|
+
if (extension !== '.txt') {
|
|
145
|
+
try {
|
|
146
|
+
const {data} = await axios.get(API_URL, { params: { uri: file, requestId, save: true } });
|
|
147
|
+
url = data[0]
|
|
148
|
+
} catch (error) {
|
|
149
|
+
console.log(`Error converting file ${file} to txt:`, error);
|
|
150
|
+
throw error;
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
const { data } = await axios.get(url);
|
|
155
|
+
await this.markCompletedForCleanUp(requestId);
|
|
156
|
+
|
|
157
|
+
//return await this.execute(data, {...parameters, file:null}, prompt, pathwayResolver);
|
|
158
|
+
return await callPathway(this.config, 'cognitive_insert', {...parameters, file:null, text:data });
|
|
159
|
+
}
|
|
160
|
+
|
|
120
161
|
const { data, params } = await this.getRequestParameters(text, parameters, prompt, mode, indexName, savedContextId, {headers, requestId, pathway, url});
|
|
121
162
|
|
|
163
|
+
// update contextid last used
|
|
164
|
+
savedContext["lastUsed"] = new Date().toISOString();
|
|
165
|
+
|
|
122
166
|
if (mode === 'delete' && data.value.length == 0){
|
|
123
167
|
return; // nothing to delete
|
|
124
168
|
}
|