@aj-archipelago/cortex 1.0.14 → 1.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/helper_apps/MediaFileChunker/docHelper.js +15 -0
- package/helper_apps/MediaFileChunker/index.js +20 -4
- package/helper_apps/MediaFileChunker/package-lock.json +11 -0
- package/helper_apps/MediaFileChunker/package.json +1 -0
- package/package.json +1 -1
- package/pathways/index.js +2 -0
- package/server/plugins/azureCognitivePlugin.js +40 -0
|
@@ -2,6 +2,7 @@ import pdfjsLib from 'pdfjs-dist';
|
|
|
2
2
|
import fs from 'fs/promises';
|
|
3
3
|
import mammoth from 'mammoth';
|
|
4
4
|
import XLSX from 'xlsx';
|
|
5
|
+
import Papa from 'papaparse';
|
|
5
6
|
|
|
6
7
|
export async function txtToText(filePath) {
|
|
7
8
|
const text = await fs.readFile(filePath, 'utf-8');
|
|
@@ -43,6 +44,18 @@ export async function pdfToText(filePath) {
|
|
|
43
44
|
return finalText;
|
|
44
45
|
}
|
|
45
46
|
|
|
47
|
+
export async function csvToText(filePath) {
|
|
48
|
+
const text = await fs.readFile(filePath, 'utf-8');
|
|
49
|
+
const results = Papa.parse(text);
|
|
50
|
+
let finalText = '';
|
|
51
|
+
|
|
52
|
+
results.data.forEach(row => {
|
|
53
|
+
finalText += row.join(' ') + '\n';
|
|
54
|
+
});
|
|
55
|
+
|
|
56
|
+
return finalText;
|
|
57
|
+
}
|
|
58
|
+
|
|
46
59
|
export async function documentToText(filePath) {
|
|
47
60
|
const fileExtension = filePath.split('.').pop();
|
|
48
61
|
|
|
@@ -55,6 +68,8 @@ export async function documentToText(filePath) {
|
|
|
55
68
|
return docxToText(filePath);
|
|
56
69
|
case 'xlsx':
|
|
57
70
|
return xlsxToText(filePath);
|
|
71
|
+
case 'csv':
|
|
72
|
+
return csvToText(filePath);
|
|
58
73
|
default:
|
|
59
74
|
throw new Error(`Unsupported file type: ${fileExtension}`);
|
|
60
75
|
}
|
|
@@ -7,6 +7,7 @@ import { documentToText, easyChunker } from './docHelper.js';
|
|
|
7
7
|
import path from 'path';
|
|
8
8
|
import os from 'os';
|
|
9
9
|
import { v4 as uuidv4 } from 'uuid';
|
|
10
|
+
import fs from 'fs';
|
|
10
11
|
|
|
11
12
|
const useAzure = process.env.AZURE_STORAGE_CONNECTION_STRING ? true : false;
|
|
12
13
|
console.log(useAzure ? 'Using Azure Storage' : 'Using local file system');
|
|
@@ -38,7 +39,7 @@ async function main(context, req) {
|
|
|
38
39
|
return
|
|
39
40
|
}
|
|
40
41
|
|
|
41
|
-
const { uri, requestId } = req.body?.params || req.query;
|
|
42
|
+
const { uri, requestId, save } = req.body?.params || req.query;
|
|
42
43
|
if (!uri || !requestId) {
|
|
43
44
|
context.res = {
|
|
44
45
|
status: 400,
|
|
@@ -63,14 +64,29 @@ async function main(context, req) {
|
|
|
63
64
|
await publishRequestProgress({ requestId, progress, completedCount, totalCount, numberOfChunks, data });
|
|
64
65
|
}
|
|
65
66
|
|
|
66
|
-
const isDocument = ['.pdf', '.txt', '.docx', '.xlsx'].some(ext => uri.toLowerCase().endsWith(ext));
|
|
67
|
+
const isDocument = ['.pdf', '.txt', '.docx', '.xlsx', '.csv'].some(ext => uri.toLowerCase().endsWith(ext));
|
|
67
68
|
|
|
68
69
|
try {
|
|
69
70
|
if (isDocument) {
|
|
70
71
|
const extension = path.extname(uri).toLowerCase();
|
|
71
72
|
const file = path.join(os.tmpdir(), `${uuidv4()}${extension}`);
|
|
72
|
-
await downloadFile(uri,file)
|
|
73
|
-
|
|
73
|
+
await downloadFile(uri, file)
|
|
74
|
+
const text = await documentToText(file);
|
|
75
|
+
if (save) {
|
|
76
|
+
const fileName = `${uuidv4()}.txt`; // generate unique file name
|
|
77
|
+
const filePath = path.join(os.tmpdir(), fileName);
|
|
78
|
+
const tmpPath = filePath;
|
|
79
|
+
fs.writeFileSync(filePath, text); // write text to file
|
|
80
|
+
|
|
81
|
+
// save file to the cloud or local file system
|
|
82
|
+
const saveResult = useAzure ? await saveFileToBlob(filePath, requestId) : await moveFileToPublicFolder(filePath, requestId);
|
|
83
|
+
result.push(saveResult);
|
|
84
|
+
|
|
85
|
+
// delete temporary file
|
|
86
|
+
fs.unlinkSync(tmpPath);
|
|
87
|
+
} else {
|
|
88
|
+
result.push(...easyChunker(text));
|
|
89
|
+
}
|
|
74
90
|
}else{
|
|
75
91
|
|
|
76
92
|
if (isYoutubeUrl) {
|
|
@@ -18,6 +18,7 @@
|
|
|
18
18
|
"fluent-ffmpeg": "^2.1.2",
|
|
19
19
|
"ioredis": "^5.3.1",
|
|
20
20
|
"mammoth": "^1.6.0",
|
|
21
|
+
"papaparse": "^5.4.1",
|
|
21
22
|
"pdfjs-dist": "^3.9.179",
|
|
22
23
|
"public-ip": "^6.0.1",
|
|
23
24
|
"uuid": "^9.0.0",
|
|
@@ -1989,6 +1990,11 @@
|
|
|
1989
1990
|
"resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz",
|
|
1990
1991
|
"integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw=="
|
|
1991
1992
|
},
|
|
1993
|
+
"node_modules/papaparse": {
|
|
1994
|
+
"version": "5.4.1",
|
|
1995
|
+
"resolved": "https://registry.npmjs.org/papaparse/-/papaparse-5.4.1.tgz",
|
|
1996
|
+
"integrity": "sha512-HipMsgJkZu8br23pW15uvo6sib6wne/4woLZPlFf3rpDyMe9ywEXUsuD7+6K9PRkJlVT51j/sCOYDKGGS3ZJrw=="
|
|
1997
|
+
},
|
|
1992
1998
|
"node_modules/parseurl": {
|
|
1993
1999
|
"version": "1.3.3",
|
|
1994
2000
|
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
|
|
@@ -4098,6 +4104,11 @@
|
|
|
4098
4104
|
"resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz",
|
|
4099
4105
|
"integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw=="
|
|
4100
4106
|
},
|
|
4107
|
+
"papaparse": {
|
|
4108
|
+
"version": "5.4.1",
|
|
4109
|
+
"resolved": "https://registry.npmjs.org/papaparse/-/papaparse-5.4.1.tgz",
|
|
4110
|
+
"integrity": "sha512-HipMsgJkZu8br23pW15uvo6sib6wne/4woLZPlFf3rpDyMe9ywEXUsuD7+6K9PRkJlVT51j/sCOYDKGGS3ZJrw=="
|
|
4111
|
+
},
|
|
4101
4112
|
"parseurl": {
|
|
4102
4113
|
"version": "1.3.3",
|
|
4103
4114
|
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
|
package/package.json
CHANGED
package/pathways/index.js
CHANGED
|
@@ -6,6 +6,7 @@ import cognitive_insert from './cognitive_insert.js';
|
|
|
6
6
|
import cognitive_search from './cognitive_search.js';
|
|
7
7
|
import complete from './complete.js';
|
|
8
8
|
import entities from './entities.js';
|
|
9
|
+
import language from './language.js';
|
|
9
10
|
import paraphrase from './paraphrase.js';
|
|
10
11
|
import sentiment from './sentiment.js';
|
|
11
12
|
import summary from './summary.js';
|
|
@@ -28,6 +29,7 @@ export {
|
|
|
28
29
|
complete,
|
|
29
30
|
embeddings,
|
|
30
31
|
entities,
|
|
32
|
+
language,
|
|
31
33
|
paraphrase,
|
|
32
34
|
sentiment,
|
|
33
35
|
summary,
|
|
@@ -2,6 +2,11 @@
|
|
|
2
2
|
import { callPathway } from '../../lib/pathwayTools.js';
|
|
3
3
|
import ModelPlugin from './modelPlugin.js';
|
|
4
4
|
import { v4 as uuidv4 } from 'uuid';
|
|
5
|
+
import path from 'path';
|
|
6
|
+
import { config } from '../../config.js';
|
|
7
|
+
import { axios } from '../../lib/request.js';
|
|
8
|
+
|
|
9
|
+
const API_URL = config.get('whisperMediaApiUrl');
|
|
5
10
|
|
|
6
11
|
const TOP = 1000;
|
|
7
12
|
|
|
@@ -109,6 +114,19 @@ class AzureCognitivePlugin extends ModelPlugin {
|
|
|
109
114
|
}
|
|
110
115
|
}
|
|
111
116
|
|
|
117
|
+
async markCompletedForCleanUp(requestId) {
|
|
118
|
+
try {
|
|
119
|
+
if (API_URL) {
|
|
120
|
+
//call helper api to mark processing as completed
|
|
121
|
+
const res = await axios.delete(API_URL, { params: { requestId } });
|
|
122
|
+
console.log(`Marked request ${requestId} as completed:`, res.data);
|
|
123
|
+
return res.data;
|
|
124
|
+
}
|
|
125
|
+
} catch (err) {
|
|
126
|
+
console.log(`Error marking request ${requestId} as completed:`, err);
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
|
|
112
130
|
// Execute the request to the Azure Cognitive API
|
|
113
131
|
async execute(text, parameters, prompt, pathwayResolver) {
|
|
114
132
|
const { requestId, pathway, savedContextId, savedContext } = pathwayResolver;
|
|
@@ -118,6 +136,28 @@ class AzureCognitivePlugin extends ModelPlugin {
|
|
|
118
136
|
url = this.ensureIndex(url, indexName);
|
|
119
137
|
const headers = this.model.headers;
|
|
120
138
|
|
|
139
|
+
const { file } = parameters;
|
|
140
|
+
if(file){
|
|
141
|
+
let url = file;
|
|
142
|
+
//if not txt file, use helper app to convert to txt
|
|
143
|
+
const extension = path.extname(file).toLowerCase();
|
|
144
|
+
if (extension !== '.txt') {
|
|
145
|
+
try {
|
|
146
|
+
const {data} = await axios.get(API_URL, { params: { uri: file, requestId, save: true } });
|
|
147
|
+
url = data[0]
|
|
148
|
+
} catch (error) {
|
|
149
|
+
console.log(`Error converting file ${file} to txt:`, error);
|
|
150
|
+
throw error;
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
const { data } = await axios.get(url);
|
|
155
|
+
await this.markCompletedForCleanUp(requestId);
|
|
156
|
+
|
|
157
|
+
//return await this.execute(data, {...parameters, file:null}, prompt, pathwayResolver);
|
|
158
|
+
return await callPathway(this.config, 'cognitive_insert', {...parameters, file:null, text:data });
|
|
159
|
+
}
|
|
160
|
+
|
|
121
161
|
const { data, params } = await this.getRequestParameters(text, parameters, prompt, mode, indexName, savedContextId, {headers, requestId, pathway, url});
|
|
122
162
|
|
|
123
163
|
// update contextid last used
|