@aj-archipelago/cortex 1.0.16 → 1.0.17
This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
|
@@ -38,10 +38,10 @@ export async function pdfToText(filePath) {
|
|
|
38
38
|
const page = await pdf.getPage(i);
|
|
39
39
|
const textContent = await page.getTextContent();
|
|
40
40
|
const strings = textContent.items.map(item => item.str);
|
|
41
|
-
finalText += strings.join(' ');
|
|
41
|
+
finalText += strings.join(' ') + '\n';
|
|
42
42
|
}
|
|
43
43
|
|
|
44
|
-
return finalText;
|
|
44
|
+
return finalText.trim();
|
|
45
45
|
}
|
|
46
46
|
|
|
47
47
|
export async function csvToText(filePath) {
|
|
@@ -9,6 +9,8 @@ import os from 'os';
|
|
|
9
9
|
import { v4 as uuidv4 } from 'uuid';
|
|
10
10
|
import fs from 'fs';
|
|
11
11
|
|
|
12
|
+
const DOC_EXTENSIONS = [".txt", ".json", ".csv", ".md", ".xml", ".js", ".html", ".css", '.pdf', '.docx', '.xlsx', '.csv'];
|
|
13
|
+
|
|
12
14
|
const useAzure = process.env.AZURE_STORAGE_CONNECTION_STRING ? true : false;
|
|
13
15
|
console.log(useAzure ? 'Using Azure Storage' : 'Using local file system');
|
|
14
16
|
|
|
@@ -64,7 +66,7 @@ async function main(context, req) {
|
|
|
64
66
|
await publishRequestProgress({ requestId, progress, completedCount, totalCount, numberOfChunks, data });
|
|
65
67
|
}
|
|
66
68
|
|
|
67
|
-
const isDocument =
|
|
69
|
+
const isDocument = DOC_EXTENSIONS.some(ext => uri.toLowerCase().endsWith(ext));
|
|
68
70
|
|
|
69
71
|
try {
|
|
70
72
|
if (isDocument) {
|
|
@@ -72,20 +74,47 @@ async function main(context, req) {
|
|
|
72
74
|
const file = path.join(os.tmpdir(), `${uuidv4()}${extension}`);
|
|
73
75
|
await downloadFile(uri, file)
|
|
74
76
|
const text = await documentToText(file);
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
77
|
+
let tmpPath;
|
|
78
|
+
|
|
79
|
+
try{
|
|
80
|
+
if (save) {
|
|
81
|
+
const fileName = `${uuidv4()}.txt`; // generate unique file name
|
|
82
|
+
const filePath = path.join(os.tmpdir(), fileName);
|
|
83
|
+
tmpPath = filePath;
|
|
84
|
+
fs.writeFileSync(filePath, text); // write text to file
|
|
85
|
+
|
|
86
|
+
// save file to the cloud or local file system
|
|
87
|
+
const saveResult = useAzure ? await saveFileToBlob(filePath, requestId) : await moveFileToPublicFolder(filePath, requestId);
|
|
88
|
+
result.push(saveResult);
|
|
89
|
+
|
|
90
|
+
} else {
|
|
91
|
+
result.push(...easyChunker(text));
|
|
92
|
+
}
|
|
93
|
+
}catch(err){
|
|
94
|
+
console.log(`Error saving file ${uri} with request id ${requestId}:`, err);
|
|
95
|
+
}finally{
|
|
96
|
+
try{
|
|
97
|
+
// delete temporary files
|
|
98
|
+
tmpPath && fs.unlinkSync(tmpPath);
|
|
99
|
+
file && fs.unlinkSync(file);
|
|
100
|
+
console.log(`Cleaned temp files ${tmpPath}, ${file}`);
|
|
101
|
+
}catch(err){
|
|
102
|
+
console.log(`Error cleaning temp files ${tmpPath}, ${file}:`, err);
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
try{
|
|
106
|
+
//delete uploaded prev nontext file
|
|
107
|
+
//check cleanup for whisper temp uploaded files url
|
|
108
|
+
const regex = /whispertempfiles\/([a-z0-9-]+)/;
|
|
109
|
+
const match = uri.match(regex);
|
|
110
|
+
if (match && match[1]) {
|
|
111
|
+
const extractedValue = match[1];
|
|
112
|
+
useAzure ? await deleteBlob(extractedValue) : await deleteFolder(extractedValue);
|
|
113
|
+
console.log(`Cleaned temp file ${uri} with request id ${extractedValue}`);
|
|
114
|
+
}
|
|
115
|
+
}catch(err){
|
|
116
|
+
console.log(`Error cleaning temp file ${uri}:`, err);
|
|
117
|
+
}
|
|
89
118
|
}
|
|
90
119
|
}else{
|
|
91
120
|
|
|
@@ -123,7 +152,7 @@ async function main(context, req) {
|
|
|
123
152
|
console.error("An error occurred:", error);
|
|
124
153
|
} finally {
|
|
125
154
|
try {
|
|
126
|
-
(isYoutubeUrl
|
|
155
|
+
(isYoutubeUrl) && (await deleteTempPath(file));
|
|
127
156
|
folder && (await deleteTempPath(folder));
|
|
128
157
|
} catch (error) {
|
|
129
158
|
console.error("An error occurred while deleting:", error);
|
package/package.json
CHANGED
|
@@ -10,6 +10,8 @@ const API_URL = config.get('whisperMediaApiUrl');
|
|
|
10
10
|
|
|
11
11
|
const TOP = 1000;
|
|
12
12
|
|
|
13
|
+
let DIRECT_FILE_EXTENSIONS = [".txt", ".json", ".csv", ".md", ".xml", ".js", ".html", ".css"];
|
|
14
|
+
|
|
13
15
|
class AzureCognitivePlugin extends ModelPlugin {
|
|
14
16
|
constructor(config, pathway, modelName, model) {
|
|
15
17
|
super(config, pathway, modelName, model);
|
|
@@ -52,7 +54,14 @@ class AzureCognitivePlugin extends ModelPlugin {
|
|
|
52
54
|
|
|
53
55
|
if (mode == 'index') {
|
|
54
56
|
const calculateInputVector = async () => {
|
|
55
|
-
|
|
57
|
+
try{
|
|
58
|
+
if(!text || !text.trim()){
|
|
59
|
+
return;
|
|
60
|
+
}
|
|
61
|
+
return JSON.parse(await callPathway(this.config, 'embeddings', { text }))[0];
|
|
62
|
+
}catch(err){
|
|
63
|
+
console.log(`Error in calculating input vector for text: ${text}, error: ${err}`);
|
|
64
|
+
}
|
|
56
65
|
}
|
|
57
66
|
|
|
58
67
|
const doc = {
|
|
@@ -141,7 +150,7 @@ class AzureCognitivePlugin extends ModelPlugin {
|
|
|
141
150
|
let url = file;
|
|
142
151
|
//if not txt file, use helper app to convert to txt
|
|
143
152
|
const extension = path.extname(file).toLowerCase();
|
|
144
|
-
if (extension
|
|
153
|
+
if (!DIRECT_FILE_EXTENSIONS.includes(extension)) {
|
|
145
154
|
try {
|
|
146
155
|
const {data} = await axios.get(API_URL, { params: { uri: file, requestId, save: true } });
|
|
147
156
|
url = data[0]
|
|
@@ -154,10 +163,18 @@ class AzureCognitivePlugin extends ModelPlugin {
|
|
|
154
163
|
const { data } = await axios.get(url);
|
|
155
164
|
await this.markCompletedForCleanUp(requestId);
|
|
156
165
|
|
|
166
|
+
if(!data){
|
|
167
|
+
throw Error(`No data can be extracted out of file!`);
|
|
168
|
+
}
|
|
169
|
+
|
|
157
170
|
//return await this.execute(data, {...parameters, file:null}, prompt, pathwayResolver);
|
|
158
171
|
return await callPathway(this.config, 'cognitive_insert', {...parameters, file:null, text:data });
|
|
159
172
|
}
|
|
160
173
|
|
|
174
|
+
if (mode === 'index' && (!text || !text.trim()) ){
|
|
175
|
+
return; // nothing to index
|
|
176
|
+
}
|
|
177
|
+
|
|
161
178
|
const { data, params } = await this.getRequestParameters(text, parameters, prompt, mode, indexName, savedContextId, {headers, requestId, pathway, url});
|
|
162
179
|
|
|
163
180
|
// update contextid last used
|