@aj-archipelago/cortex 1.0.12 → 1.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config.js +21 -1
- package/helper_apps/MediaFileChunker/Dockerfile +3 -4
- package/helper_apps/MediaFileChunker/docHelper.js +94 -0
- package/helper_apps/MediaFileChunker/fileChunker.js +1 -1
- package/helper_apps/MediaFileChunker/index.js +44 -31
- package/helper_apps/MediaFileChunker/package-lock.json +1647 -17
- package/helper_apps/MediaFileChunker/package.json +5 -1
- package/helper_apps/MediaFileChunker/start.js +2 -1
- package/lib/request.js +3 -3
- package/package.json +1 -1
- package/pathways/cognitive_delete.js +10 -0
- package/pathways/cognitive_insert.js +14 -0
- package/pathways/cognitive_search.js +13 -0
- package/pathways/embeddings.js +14 -0
- package/pathways/index.js +8 -0
- package/pathways/language.js +10 -0
- package/server/pathwayPrompter.js +8 -0
- package/server/pathwayResolver.js +1 -1
- package/server/plugins/azureCognitivePlugin.js +147 -0
- package/server/plugins/openAiEmbeddingsPlugin.js +38 -0
package/config.js
CHANGED
|
@@ -87,7 +87,27 @@ var config = convict({
|
|
|
87
87
|
"params": {
|
|
88
88
|
"model": "whisper-1"
|
|
89
89
|
},
|
|
90
|
-
}
|
|
90
|
+
},
|
|
91
|
+
"azure-cognitive": {
|
|
92
|
+
"type": "AZURE-COGNITIVE",
|
|
93
|
+
"url": "https://archipelago-cognitive-search.search.windows.net/indexes/indexcortex/docs/search?api-version=2023-07-01-Preview",
|
|
94
|
+
"headers": {
|
|
95
|
+
"api-key": "{{AZURE_COGNITIVE_API_KEY}}",
|
|
96
|
+
"Content-Type": "application/json"
|
|
97
|
+
},
|
|
98
|
+
},
|
|
99
|
+
"oai-embeddings": {
|
|
100
|
+
"type": "OPENAI-EMBEDDINGS",
|
|
101
|
+
"url": "https://archipelago-openai.openai.azure.com/openai/deployments/archipelago-embedding/embeddings?api-version=2023-05-15",
|
|
102
|
+
"headers": {
|
|
103
|
+
"api-key": "{{ARCHIPELAGO_OPENAI_KEY}}",
|
|
104
|
+
"Content-Type": "application/json"
|
|
105
|
+
},
|
|
106
|
+
"params": {
|
|
107
|
+
"model": "text-embedding-ada-002"
|
|
108
|
+
},
|
|
109
|
+
"maxTokenLength": 8192,
|
|
110
|
+
},
|
|
91
111
|
},
|
|
92
112
|
env: 'CORTEX_MODELS'
|
|
93
113
|
},
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import pdfjsLib from 'pdfjs-dist';
|
|
2
|
+
import fs from 'fs/promises';
|
|
3
|
+
import mammoth from 'mammoth';
|
|
4
|
+
import XLSX from 'xlsx';
|
|
5
|
+
|
|
6
|
+
/**
 * Reads a plain-text file and returns its contents decoded as UTF-8.
 *
 * @param {string} filePath - Path of the text file to read.
 * @returns {Promise<string>} Resolves with the decoded file contents.
 */
export async function txtToText(filePath) {
    return fs.readFile(filePath, { encoding: 'utf-8' });
}
|
|
10
|
+
|
|
11
|
+
/**
 * Extracts the raw text of a .docx file.
 *
 * The file is read into memory and handed to `mammoth.extractRawText`;
 * the `value` field of mammoth's result is returned.
 *
 * @param {string} filePath - Path of the .docx file to read.
 * @returns {Promise<string>} Resolves with the `value` produced by mammoth.
 */
export async function docxToText(filePath) {
    const docxBuffer = await fs.readFile(filePath);
    const { value } = await mammoth.extractRawText({ buffer: docxBuffer });
    return value;
}
|
|
16
|
+
|
|
17
|
+
/**
 * Converts every sheet of a spreadsheet file into plain text.
 *
 * Each row becomes one line: its cells are joined with a single space and the
 * line is terminated with '\n'. Sheets are emitted in `workbook.SheetNames`
 * order with no separator between them.
 *
 * The file is read with the promise-based `fs` API and parsed from the
 * resulting buffer, so the read does not go through `XLSX.readFile` and this
 * function stays consistent with the other converters in this module.
 *
 * @param {string} filePath - Path of the spreadsheet file to read.
 * @returns {Promise<string>} Resolves with the concatenated row text.
 */
export async function xlsxToText(filePath) {
    const data = await fs.readFile(filePath);
    const workbook = XLSX.read(data, { type: 'buffer' });

    // Collect lines and join once instead of growing a string cell by cell.
    const lines = [];
    for (const sheetName of workbook.SheetNames) {
        // `header: 1` makes sheet_to_json return each row as an array of cells.
        const rows = XLSX.utils.sheet_to_json(workbook.Sheets[sheetName], { header: 1 });
        for (const row of rows) {
            lines.push(`${row.join(' ')}\n`);
        }
    }

    return lines.join('');
}
|
|
31
|
+
|
|
32
|
+
/**
 * Extracts the text of every page of a PDF.
 *
 * The `str` values of each page's text items are joined with a single space,
 * and pages are joined with a single space as well, so the last word of one
 * page is never fused with the first word of the next.
 *
 * @param {string} filePath - Source handed to `pdfjsLib.getDocument`.
 * @returns {Promise<string>} Resolves with the text of all pages.
 */
export async function pdfToText(filePath) {
    const pdf = await pdfjsLib.getDocument(filePath).promise;

    try {
        const pageTexts = [];
        // Pages are requested 1-based, from 1 to numPages inclusive.
        for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
            const page = await pdf.getPage(pageNumber);
            const { items } = await page.getTextContent();
            pageTexts.push(items.map(item => item.str).join(' '));
        }
        return pageTexts.join(' ');
    } finally {
        // Release the loaded document whether extraction succeeded or failed.
        await pdf.destroy();
    }
}
|
|
45
|
+
|
|
46
|
+
/**
 * Converts a supported document to plain text by dispatching on the file
 * extension (the text after the last '.' in `filePath`).
 *
 * The extension is compared case-insensitively, so `REPORT.PDF` is handled
 * the same way as `report.pdf`.
 *
 * @param {string} filePath - Path of the document to convert.
 * @returns {Promise<string>} Resolves with the extracted text.
 * @throws {Error} If the extension is not one of pdf, txt, docx or xlsx.
 */
export async function documentToText(filePath) {
    const fileExtension = filePath.split('.').pop().toLowerCase();

    switch (fileExtension) {
        case 'pdf':
            return pdfToText(filePath);
        case 'txt':
            return txtToText(filePath);
        case 'docx':
            return docxToText(filePath);
        case 'xlsx':
            return xlsxToText(filePath);
        default:
            throw new Error(`Unsupported file type: ${fileExtension}`);
    }
}
|
|
62
|
+
|
|
63
|
+
/**
 * Splits text into consecutive chunks of at most `maxChunkLength` characters.
 *
 * A chunk boundary is moved back to the nearest '.' or ' ' inside the window
 * so that words are not cut in half; the separator itself starts the next
 * chunk. If the window contains no separator, the text is cut at exactly
 * `maxChunkLength` characters. Concatenating the returned chunks always
 * reproduces `text`.
 *
 * @param {string} text - The text to split.
 * @param {number} [maxChunkLength=10000] - Maximum characters per chunk;
 *   must be a positive integer.
 * @returns {string[]} The chunks in order. Text that already fits is
 *   returned as a single-element array.
 * @throws {RangeError} If `maxChunkLength` is not a positive integer.
 */
export function easyChunker(text, maxChunkLength = 10000) {
    if (!Number.isInteger(maxChunkLength) || maxChunkLength <= 0) {
        // A non-positive window would never advance and loop forever.
        throw new RangeError(`maxChunkLength must be a positive integer, got: ${maxChunkLength}`);
    }

    // Text that fits in one chunk is returned as is.
    if (text.length <= maxChunkLength) {
        return [text];
    }

    const chunks = [];
    let startIndex = 0;

    while (startIndex < text.length) {
        const windowEnd = Math.min(startIndex + maxChunkLength, text.length);
        let endIndex = windowEnd;

        // Only look for a break point when more text follows the window.
        // Once the window reaches the end of the text, the remainder already
        // fits in one chunk and must not be split again.
        if (windowEnd < text.length) {
            let breakIndex = windowEnd;
            while (breakIndex > startIndex && text[breakIndex] !== '.' && text[breakIndex] !== ' ') {
                breakIndex--;
            }
            // No separator in the window: keep the hard cut at windowEnd.
            if (breakIndex > startIndex) {
                endIndex = breakIndex;
            }
        }

        chunks.push(text.substring(startIndex, endIndex));
        startIndex = endIndex;
    }

    return chunks;
}
|
|
@@ -1,8 +1,12 @@
|
|
|
1
|
-
import { processYoutubeUrl, splitMediaFile } from './fileChunker.js';
|
|
1
|
+
import { downloadFile, processYoutubeUrl, splitMediaFile } from './fileChunker.js';
|
|
2
2
|
import { saveFileToBlob, deleteBlob, uploadBlob } from './blobHandler.js';
|
|
3
3
|
import { publishRequestProgress } from './redis.js';
|
|
4
4
|
import { deleteTempPath, ensureEncoded, isValidYoutubeUrl } from './helper.js';
|
|
5
5
|
import { moveFileToPublicFolder, deleteFolder } from './localFileHandler.js';
|
|
6
|
+
import { documentToText, easyChunker } from './docHelper.js';
|
|
7
|
+
import path from 'path';
|
|
8
|
+
import os from 'os';
|
|
9
|
+
import { v4 as uuidv4 } from 'uuid';
|
|
6
10
|
|
|
7
11
|
const useAzure = process.env.AZURE_STORAGE_CONNECTION_STRING ? true : false;
|
|
8
12
|
console.log(useAzure ? 'Using Azure Storage' : 'Using local file system');
|
|
@@ -59,42 +63,51 @@ async function main(context, req) {
|
|
|
59
63
|
await publishRequestProgress({ requestId, progress, completedCount, totalCount, numberOfChunks, data });
|
|
60
64
|
}
|
|
61
65
|
|
|
62
|
-
|
|
63
|
-
if (isYoutubeUrl) {
|
|
64
|
-
// totalCount += 1; // extra 1 step for youtube download
|
|
65
|
-
file = await processYoutubeUrl(file);
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
const { chunkPromises, uniqueOutputPath } = await splitMediaFile(file);
|
|
69
|
-
folder = uniqueOutputPath;
|
|
66
|
+
const isDocument = ['.pdf', '.txt', '.docx', '.xlsx'].some(ext => uri.toLowerCase().endsWith(ext));
|
|
70
67
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
const
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
68
|
+
try {
|
|
69
|
+
if (isDocument) {
|
|
70
|
+
const extension = path.extname(uri).toLowerCase();
|
|
71
|
+
const file = path.join(os.tmpdir(), `${uuidv4()}${extension}`);
|
|
72
|
+
await downloadFile(uri,file)
|
|
73
|
+
result.push(...easyChunker(await documentToText(file)));
|
|
74
|
+
}else{
|
|
75
|
+
|
|
76
|
+
if (isYoutubeUrl) {
|
|
77
|
+
// totalCount += 1; // extra 1 step for youtube download
|
|
78
|
+
file = await processYoutubeUrl(file);
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
const { chunkPromises, uniqueOutputPath } = await splitMediaFile(file);
|
|
82
|
+
folder = uniqueOutputPath;
|
|
83
|
+
|
|
84
|
+
numberOfChunks = chunkPromises.length; // for progress reporting
|
|
85
|
+
totalCount += chunkPromises.length * 4; // 4 steps for each chunk (download and upload)
|
|
86
|
+
// isYoutubeUrl && sendProgress(); // send progress for youtube download after total count is calculated
|
|
87
|
+
|
|
88
|
+
// sequential download of chunks
|
|
89
|
+
const chunks = [];
|
|
90
|
+
for (const chunkPromise of chunkPromises) {
|
|
91
|
+
chunks.push(await chunkPromise);
|
|
92
|
+
sendProgress();
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
// sequential processing of chunks
|
|
96
|
+
for (const chunk of chunks) {
|
|
97
|
+
const blobName = useAzure ? await saveFileToBlob(chunk, requestId) : await moveFileToPublicFolder(chunk, requestId);
|
|
98
|
+
result.push(blobName);
|
|
99
|
+
context.log(`Saved chunk as: ${blobName}`);
|
|
100
|
+
sendProgress();
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
// parallel processing, dropped
|
|
104
|
+
// result = await Promise.all(mediaSplit.chunks.map(processChunk));
|
|
88
105
|
}
|
|
89
|
-
|
|
90
|
-
// parallel processing, dropped
|
|
91
|
-
// result = await Promise.all(mediaSplit.chunks.map(processChunk));
|
|
92
|
-
|
|
93
106
|
} catch (error) {
|
|
94
107
|
console.error("An error occurred:", error);
|
|
95
108
|
} finally {
|
|
96
109
|
try {
|
|
97
|
-
isYoutubeUrl && (await deleteTempPath(file));
|
|
110
|
+
(isYoutubeUrl||isDocument) && (await deleteTempPath(file));
|
|
98
111
|
folder && (await deleteTempPath(folder));
|
|
99
112
|
} catch (error) {
|
|
100
113
|
console.error("An error occurred while deleting:", error);
|