@aj-archipelago/cortex 1.0.16 → 1.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -38,10 +38,10 @@ export async function pdfToText(filePath) {
|
|
|
38
38
|
const page = await pdf.getPage(i);
|
|
39
39
|
const textContent = await page.getTextContent();
|
|
40
40
|
const strings = textContent.items.map(item => item.str);
|
|
41
|
-
finalText += strings.join(' ');
|
|
41
|
+
finalText += strings.join(' ') + '\n';
|
|
42
42
|
}
|
|
43
43
|
|
|
44
|
-
return finalText;
|
|
44
|
+
return finalText.trim();
|
|
45
45
|
}
|
|
46
46
|
|
|
47
47
|
export async function csvToText(filePath) {
|
|
@@ -9,6 +9,8 @@ import os from 'os';
|
|
|
9
9
|
import { v4 as uuidv4 } from 'uuid';
|
|
10
10
|
import fs from 'fs';
|
|
11
11
|
|
|
12
|
+
// Lower-case file extensions that mark a request URI as a document.
// The URI is lower-cased and matched with endsWith() against each entry, so
// every entry must be lower-case and include the leading dot. Each extension
// is listed once; a repeated entry would not change the match result.
const DOC_EXTENSIONS = [".txt", ".json", ".csv", ".md", ".xml", ".js", ".html", ".css", ".pdf", ".docx", ".xlsx"];
|
|
13
|
+
|
|
12
14
|
const useAzure = process.env.AZURE_STORAGE_CONNECTION_STRING ? true : false;
|
|
13
15
|
console.log(useAzure ? 'Using Azure Storage' : 'Using local file system');
|
|
14
16
|
|
|
@@ -64,7 +66,7 @@ async function main(context, req) {
|
|
|
64
66
|
await publishRequestProgress({ requestId, progress, completedCount, totalCount, numberOfChunks, data });
|
|
65
67
|
}
|
|
66
68
|
|
|
67
|
-
const isDocument =
|
|
69
|
+
const isDocument = DOC_EXTENSIONS.some(ext => uri.toLowerCase().endsWith(ext));
|
|
68
70
|
|
|
69
71
|
try {
|
|
70
72
|
if (isDocument) {
|
|
@@ -72,20 +74,47 @@ async function main(context, req) {
|
|
|
72
74
|
const file = path.join(os.tmpdir(), `${uuidv4()}${extension}`);
|
|
73
75
|
await downloadFile(uri, file)
|
|
74
76
|
const text = await documentToText(file);
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
77
|
+
let tmpPath;
|
|
78
|
+
|
|
79
|
+
try{
|
|
80
|
+
if (save) {
|
|
81
|
+
const fileName = `${uuidv4()}.txt`; // generate unique file name
|
|
82
|
+
const filePath = path.join(os.tmpdir(), fileName);
|
|
83
|
+
tmpPath = filePath;
|
|
84
|
+
fs.writeFileSync(filePath, text); // write text to file
|
|
85
|
+
|
|
86
|
+
// save file to the cloud or local file system
|
|
87
|
+
const saveResult = useAzure ? await saveFileToBlob(filePath, requestId) : await moveFileToPublicFolder(filePath, requestId);
|
|
88
|
+
result.push(saveResult);
|
|
89
|
+
|
|
90
|
+
} else {
|
|
91
|
+
result.push(...easyChunker(text));
|
|
92
|
+
}
|
|
93
|
+
}catch(err){
|
|
94
|
+
console.log(`Error saving file ${uri} with request id ${requestId}:`, err);
|
|
95
|
+
}finally{
|
|
96
|
+
try{
|
|
97
|
+
// delete temporary files
|
|
98
|
+
tmpPath && fs.unlinkSync(tmpPath);
|
|
99
|
+
file && fs.unlinkSync(file);
|
|
100
|
+
console.log(`Cleaned temp files ${tmpPath}, ${file}`);
|
|
101
|
+
}catch(err){
|
|
102
|
+
console.log(`Error cleaning temp files ${tmpPath}, ${file}:`, err);
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
try{
|
|
106
|
+
//delete uploaded prev nontext file
|
|
107
|
+
//check cleanup for whisper temp uploaded files url
|
|
108
|
+
const regex = /whispertempfiles\/([a-z0-9-]+)/;
|
|
109
|
+
const match = uri.match(regex);
|
|
110
|
+
if (match && match[1]) {
|
|
111
|
+
const extractedValue = match[1];
|
|
112
|
+
useAzure ? await deleteBlob(extractedValue) : await deleteFolder(extractedValue);
|
|
113
|
+
console.log(`Cleaned temp file ${uri} with request id ${extractedValue}`);
|
|
114
|
+
}
|
|
115
|
+
}catch(err){
|
|
116
|
+
console.log(`Error cleaning temp file ${uri}:`, err);
|
|
117
|
+
}
|
|
89
118
|
}
|
|
90
119
|
}else{
|
|
91
120
|
|
|
@@ -123,7 +152,7 @@ async function main(context, req) {
|
|
|
123
152
|
console.error("An error occurred:", error);
|
|
124
153
|
} finally {
|
|
125
154
|
try {
|
|
126
|
-
(isYoutubeUrl
|
|
155
|
+
(isYoutubeUrl) && (await deleteTempPath(file));
|
|
127
156
|
folder && (await deleteTempPath(folder));
|
|
128
157
|
} catch (error) {
|
|
129
158
|
console.error("An error occurred while deleting:", error);
|
package/package.json
CHANGED
|
@@ -10,6 +10,8 @@ const API_URL = config.get('whisperMediaApiUrl');
|
|
|
10
10
|
|
|
11
11
|
const TOP = 1000;
|
|
12
12
|
|
|
13
|
+
// Extensions (lower-case, with leading dot) of files whose URL is fetched
// as-is. A file whose extension is NOT in this list is first sent to the
// helper API to be converted to text. Declared const because the binding is
// only read, never reassigned, in the code that uses it.
const DIRECT_FILE_EXTENSIONS = [".txt", ".json", ".csv", ".md", ".xml", ".js", ".html", ".css"];
|
|
14
|
+
|
|
13
15
|
class AzureCognitivePlugin extends ModelPlugin {
|
|
14
16
|
constructor(config, pathway, modelName, model) {
|
|
15
17
|
super(config, pathway, modelName, model);
|
|
@@ -51,14 +53,24 @@ class AzureCognitivePlugin extends ModelPlugin {
|
|
|
51
53
|
}
|
|
52
54
|
|
|
53
55
|
if (mode == 'index') {
|
|
56
|
+
|
|
57
|
+
/*
|
|
54
58
|
const calculateInputVector = async () => {
|
|
55
|
-
|
|
59
|
+
try{
|
|
60
|
+
if(!text || !text.trim()){
|
|
61
|
+
return;
|
|
62
|
+
}
|
|
63
|
+
return JSON.parse(await callPathway(this.config, 'embeddings', { text }))[0];
|
|
64
|
+
}catch(err){
|
|
65
|
+
console.log(`Error in calculating input vector for text: ${text}, error: ${err}`);
|
|
66
|
+
}
|
|
56
67
|
}
|
|
68
|
+
*/
|
|
57
69
|
|
|
58
70
|
const doc = {
|
|
59
71
|
id: uuidv4(),
|
|
60
72
|
content: text,
|
|
61
|
-
contentVector: inputVector || (await calculateInputVector()),
|
|
73
|
+
//contentVector: inputVector || (await calculateInputVector()),
|
|
62
74
|
owner: savedContextId,
|
|
63
75
|
docId: docId || uuidv4(),
|
|
64
76
|
createdAt: new Date().toISOString()
|
|
@@ -141,7 +153,7 @@ class AzureCognitivePlugin extends ModelPlugin {
|
|
|
141
153
|
let url = file;
|
|
142
154
|
//if not txt file, use helper app to convert to txt
|
|
143
155
|
const extension = path.extname(file).toLowerCase();
|
|
144
|
-
if (extension
|
|
156
|
+
if (!DIRECT_FILE_EXTENSIONS.includes(extension)) {
|
|
145
157
|
try {
|
|
146
158
|
const {data} = await axios.get(API_URL, { params: { uri: file, requestId, save: true } });
|
|
147
159
|
url = data[0]
|
|
@@ -154,10 +166,18 @@ class AzureCognitivePlugin extends ModelPlugin {
|
|
|
154
166
|
const { data } = await axios.get(url);
|
|
155
167
|
await this.markCompletedForCleanUp(requestId);
|
|
156
168
|
|
|
169
|
+
if(!data){
|
|
170
|
+
throw Error(`No data can be extracted out of file!`);
|
|
171
|
+
}
|
|
172
|
+
|
|
157
173
|
//return await this.execute(data, {...parameters, file:null}, prompt, pathwayResolver);
|
|
158
174
|
return await callPathway(this.config, 'cognitive_insert', {...parameters, file:null, text:data });
|
|
159
175
|
}
|
|
160
176
|
|
|
177
|
+
if (mode === 'index' && (!text || !text.trim()) ){
|
|
178
|
+
return; // nothing to index
|
|
179
|
+
}
|
|
180
|
+
|
|
161
181
|
const { data, params } = await this.getRequestParameters(text, parameters, prompt, mode, indexName, savedContextId, {headers, requestId, pathway, url});
|
|
162
182
|
|
|
163
183
|
// update contextid last used
|