@aj-archipelago/cortex 1.0.11 → 1.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/config.js CHANGED
@@ -87,7 +87,27 @@ var config = convict({
87
87
  "params": {
88
88
  "model": "whisper-1"
89
89
  },
90
- }
90
+ },
91
+ "azure-cognitive": {
92
+ "type": "AZURE-COGNITIVE",
93
+ "url": "https://archipelago-cognitive-search.search.windows.net/indexes/indexcortex/docs/search?api-version=2023-07-01-Preview",
94
+ "headers": {
95
+ "api-key": "{{AZURE_COGNITIVE_API_KEY}}",
96
+ "Content-Type": "application/json"
97
+ },
98
+ },
99
+ "oai-embeddings": {
100
+ "type": "OPENAI-EMBEDDINGS",
101
+ "url": "https://archipelago-openai.openai.azure.com/openai/deployments/archipelago-embedding/embeddings?api-version=2023-05-15",
102
+ "headers": {
103
+ "api-key": "{{ARCHIPELAGO_OPENAI_KEY}}",
104
+ "Content-Type": "application/json"
105
+ },
106
+ "params": {
107
+ "model": "text-embedding-ada-002"
108
+ },
109
+ "maxTokenLength": 8192,
110
+ },
91
111
  },
92
112
  env: 'CORTEX_MODELS'
93
113
  },
@@ -0,0 +1,94 @@
1
+ import pdfjsLib from 'pdfjs-dist';
2
+ import fs from 'fs/promises';
3
+ import mammoth from 'mammoth';
4
+ import XLSX from 'xlsx';
5
+
6
/**
 * Reads a plain-text file from disk.
 *
 * @param {string} filePath - path to the .txt file
 * @returns {Promise<string>} the file's contents decoded as UTF-8
 */
export async function txtToText(filePath) {
    const contents = await fs.readFile(filePath, 'utf-8');
    return contents;
}
10
+
11
/**
 * Extracts the raw (unformatted) text of a .docx document.
 * Loads the file into memory and hands the buffer to mammoth,
 * which strips all formatting and returns plain text.
 *
 * @param {string} filePath - path to the .docx file
 * @returns {Promise<string>} the extracted plain text
 */
export async function docxToText(filePath) {
    const fileContents = await fs.readFile(filePath);
    const extraction = await mammoth.extractRawText({ buffer: fileContents });
    return extraction.value;
}
16
+
17
/**
 * Flattens every sheet of an .xlsx workbook into plain text.
 * Each spreadsheet row becomes one line, with cell values
 * separated by single spaces.
 *
 * @param {string} filePath - path to the .xlsx file
 * @returns {Promise<string>} newline-terminated text, one line per row
 */
export async function xlsxToText(filePath) {
    const workbook = XLSX.readFile(filePath);
    const lines = [];

    for (const sheetName of workbook.SheetNames) {
        // header: 1 yields each row as a plain array of cell values.
        const rows = XLSX.utils.sheet_to_json(workbook.Sheets[sheetName], { header: 1 });
        for (const row of rows) {
            lines.push(row.join(' '));
        }
    }

    return lines.map((line) => line + '\n').join('');
}
31
+
32
/**
 * Extracts the text content of a PDF file, page by page.
 * Text items within a page are joined with spaces; pages are
 * concatenated with no separator between them.
 *
 * @param {string} filePath - path to the PDF file
 * @returns {Promise<string>} the concatenated text of all pages
 */
export async function pdfToText(filePath) {
    const pdf = await pdfjsLib.getDocument(filePath).promise;
    const pageTexts = [];

    // pdf.js page numbers are 1-based.
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
        const page = await pdf.getPage(pageNumber);
        const content = await page.getTextContent();
        pageTexts.push(content.items.map(({ str }) => str).join(' '));
    }

    return pageTexts.join('');
}
45
+
46
/**
 * Dispatches a document file to the matching text extractor based on its
 * file extension. Supported types: pdf, txt, docx, xlsx.
 *
 * @param {string} filePath - path to the document on disk
 * @returns {Promise<string>} the extracted plain text
 * @throws {Error} when the extension is not a supported type
 */
export async function documentToText(filePath) {
    // Lowercase so "Report.PDF" is handled the same as "report.pdf"
    // (the caller's document detection is already case-insensitive).
    const fileExtension = filePath.split('.').pop().toLowerCase();

    switch (fileExtension) {
        case 'pdf':
            return pdfToText(filePath);
        case 'txt':
            return txtToText(filePath);
        case 'docx':
            return docxToText(filePath);
        case 'xlsx':
            return xlsxToText(filePath);
        default:
            throw new Error(`Unsupported file type: ${fileExtension}`);
    }
}
62
+
63
/**
 * Splits a long text into chunks of at most `chunkSize` characters,
 * preferring to break at a sentence ('.') or word (' ') boundary.
 * Chunks concatenate back to exactly the original text (the break
 * character begins the following chunk).
 *
 * @param {string} text - the text to split
 * @param {number} [chunkSize=10000] - maximum characters per chunk
 * @returns {string[]} the ordered list of chunks
 */
export function easyChunker(text, chunkSize = 10000) {
    // Short texts need no splitting.
    if (text.length <= chunkSize) {
        return [text];
    }

    const result = [];
    let startIndex = 0;

    while (startIndex < text.length) {
        let endIndex = Math.min(startIndex + chunkSize, text.length);

        // Only look for a natural break when we are cutting mid-text;
        // the final chunk already fits the limit and must not be split
        // further (the previous version backtracked here and emitted a
        // spurious extra tail chunk).
        if (endIndex < text.length) {
            let breakIndex = endIndex;
            while (breakIndex > startIndex && text[breakIndex] !== '.' && text[breakIndex] !== ' ') {
                breakIndex--;
            }
            // No break in this window => fall back to a hard cut at chunkSize.
            if (breakIndex > startIndex) {
                endIndex = breakIndex;
            }
        }

        result.push(text.substring(startIndex, endIndex));
        startIndex = endIndex;
    }

    return result;
}
@@ -163,5 +163,5 @@ const processYoutubeUrl = async (url) => {
163
163
  }
164
164
 
165
165
  export {
166
- splitMediaFile, processYoutubeUrl
166
+ splitMediaFile, processYoutubeUrl, downloadFile
167
167
  };
@@ -1,8 +1,12 @@
1
- import { processYoutubeUrl, splitMediaFile } from './fileChunker.js';
1
+ import { downloadFile, processYoutubeUrl, splitMediaFile } from './fileChunker.js';
2
2
  import { saveFileToBlob, deleteBlob, uploadBlob } from './blobHandler.js';
3
3
  import { publishRequestProgress } from './redis.js';
4
4
  import { deleteTempPath, ensureEncoded, isValidYoutubeUrl } from './helper.js';
5
5
  import { moveFileToPublicFolder, deleteFolder } from './localFileHandler.js';
6
+ import { documentToText, easyChunker } from './docHelper.js';
7
+ import path from 'path';
8
+ import os from 'os';
9
+ import { v4 as uuidv4 } from 'uuid';
6
10
 
7
11
  const useAzure = process.env.AZURE_STORAGE_CONNECTION_STRING ? true : false;
8
12
  console.log(useAzure ? 'Using Azure Storage' : 'Using local file system');
@@ -59,42 +63,51 @@ async function main(context, req) {
59
63
  await publishRequestProgress({ requestId, progress, completedCount, totalCount, numberOfChunks, data });
60
64
  }
61
65
 
62
- try {
63
- if (isYoutubeUrl) {
64
- // totalCount += 1; // extra 1 step for youtube download
65
- file = await processYoutubeUrl(file);
66
- }
67
-
68
- const { chunkPromises, uniqueOutputPath } = await splitMediaFile(file);
69
- folder = uniqueOutputPath;
66
+ const isDocument = ['.pdf', '.txt', '.docx', '.xlsx'].some(ext => uri.toLowerCase().endsWith(ext));
70
67
 
71
- numberOfChunks = chunkPromises.length; // for progress reporting
72
- totalCount += chunkPromises.length * 4; // 4 steps for each chunk (download and upload)
73
- // isYoutubeUrl && sendProgress(); // send progress for youtube download after total count is calculated
74
-
75
- // sequential download of chunks
76
- const chunks = [];
77
- for (const chunkPromise of chunkPromises) {
78
- chunks.push(await chunkPromise);
79
- sendProgress();
80
- }
81
-
82
- // sequential processing of chunks
83
- for (const chunk of chunks) {
84
- const blobName = useAzure ? await saveFileToBlob(chunk, requestId) : await moveFileToPublicFolder(chunk, requestId);
85
- result.push(blobName);
86
- context.log(`Saved chunk as: ${blobName}`);
87
- sendProgress();
68
+ try {
69
+ if (isDocument) {
70
+ const extension = path.extname(uri).toLowerCase();
71
+ const file = path.join(os.tmpdir(), `${uuidv4()}${extension}`);
72
+ await downloadFile(uri,file)
73
+ result.push(...easyChunker(await documentToText(file)));
74
+ }else{
75
+
76
+ if (isYoutubeUrl) {
77
+ // totalCount += 1; // extra 1 step for youtube download
78
+ file = await processYoutubeUrl(file);
79
+ }
80
+
81
+ const { chunkPromises, uniqueOutputPath } = await splitMediaFile(file);
82
+ folder = uniqueOutputPath;
83
+
84
+ numberOfChunks = chunkPromises.length; // for progress reporting
85
+ totalCount += chunkPromises.length * 4; // 4 steps for each chunk (download and upload)
86
+ // isYoutubeUrl && sendProgress(); // send progress for youtube download after total count is calculated
87
+
88
+ // sequential download of chunks
89
+ const chunks = [];
90
+ for (const chunkPromise of chunkPromises) {
91
+ chunks.push(await chunkPromise);
92
+ sendProgress();
93
+ }
94
+
95
+ // sequential processing of chunks
96
+ for (const chunk of chunks) {
97
+ const blobName = useAzure ? await saveFileToBlob(chunk, requestId) : await moveFileToPublicFolder(chunk, requestId);
98
+ result.push(blobName);
99
+ context.log(`Saved chunk as: ${blobName}`);
100
+ sendProgress();
101
+ }
102
+
103
+ // parallel processing, dropped
104
+ // result = await Promise.all(mediaSplit.chunks.map(processChunk));
88
105
  }
89
-
90
- // parallel processing, dropped
91
- // result = await Promise.all(mediaSplit.chunks.map(processChunk));
92
-
93
106
  } catch (error) {
94
107
  console.error("An error occurred:", error);
95
108
  } finally {
96
109
  try {
97
- isYoutubeUrl && (await deleteTempPath(file));
110
+ (isYoutubeUrl||isDocument) && (await deleteTempPath(file));
98
111
  folder && (await deleteTempPath(folder));
99
112
  } catch (error) {
100
113
  console.error("An error occurred while deleting:", error);