@aj-archipelago/cortex 1.3.49 → 1.3.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config.js +1 -1
- package/helper-apps/cortex-browser/Dockerfile +19 -31
- package/helper-apps/cortex-browser/function_app.py +708 -181
- package/helper-apps/cortex-browser/requirements.txt +4 -4
- package/helper-apps/cortex-file-handler/blobHandler.js +850 -429
- package/helper-apps/cortex-file-handler/constants.js +64 -48
- package/helper-apps/cortex-file-handler/docHelper.js +7 -114
- package/helper-apps/cortex-file-handler/fileChunker.js +96 -51
- package/helper-apps/cortex-file-handler/function.json +2 -6
- package/helper-apps/cortex-file-handler/helper.js +34 -25
- package/helper-apps/cortex-file-handler/index.js +324 -136
- package/helper-apps/cortex-file-handler/localFileHandler.js +56 -57
- package/helper-apps/cortex-file-handler/package-lock.json +6065 -5964
- package/helper-apps/cortex-file-handler/package.json +8 -4
- package/helper-apps/cortex-file-handler/redis.js +23 -17
- package/helper-apps/cortex-file-handler/scripts/setup-azure-container.js +12 -9
- package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +21 -18
- package/helper-apps/cortex-file-handler/scripts/test-azure.sh +1 -1
- package/helper-apps/cortex-file-handler/scripts/test-gcs.sh +1 -1
- package/helper-apps/cortex-file-handler/services/ConversionService.js +288 -0
- package/helper-apps/cortex-file-handler/services/FileConversionService.js +53 -0
- package/helper-apps/cortex-file-handler/start.js +63 -38
- package/helper-apps/cortex-file-handler/tests/FileConversionService.test.js +144 -0
- package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +88 -64
- package/helper-apps/cortex-file-handler/tests/fileChunker.test.js +114 -91
- package/helper-apps/cortex-file-handler/tests/fileUpload.test.js +351 -0
- package/helper-apps/cortex-file-handler/tests/files/DOCX_TestPage.docx +0 -0
- package/helper-apps/cortex-file-handler/tests/files/tests-example.xls +0 -0
- package/helper-apps/cortex-file-handler/tests/start.test.js +943 -642
- package/helper-apps/cortex-file-handler/tests/testUtils.helper.js +31 -0
- package/helper-apps/cortex-markitdown/.funcignore +1 -0
- package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/__init__.py +64 -0
- package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/function.json +21 -0
- package/helper-apps/cortex-markitdown/README.md +94 -0
- package/helper-apps/cortex-markitdown/host.json +15 -0
- package/helper-apps/cortex-markitdown/requirements.txt +2 -0
- package/lib/requestExecutor.js +44 -36
- package/package.json +1 -1
- package/pathways/system/entity/tools/sys_tool_cognitive_search.js +1 -1
- package/pathways/system/entity/tools/sys_tool_readfile.js +24 -2
- package/server/plugins/openAiWhisperPlugin.js +59 -87
- package/helper-apps/cortex-file-handler/tests/docHelper.test.js +0 -148
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@aj-archipelago/cortex-file-handler",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.1.01",
|
|
4
4
|
"description": "File handling service for Cortex - handles file uploads, media chunking, and document processing",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"scripts": {
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
"express": "^4.21.1",
|
|
22
22
|
"fluent-ffmpeg": "^2.1.3",
|
|
23
23
|
"ioredis": "^5.3.1",
|
|
24
|
-
"
|
|
24
|
+
"mime-types": "^3.0.1",
|
|
25
25
|
"papaparse": "^5.4.1",
|
|
26
26
|
"pdfjs-dist": "^4.2.67",
|
|
27
27
|
"public-ip": "^6.0.1",
|
|
@@ -29,13 +29,17 @@
|
|
|
29
29
|
"xlsx": "^0.18.5"
|
|
30
30
|
},
|
|
31
31
|
"devDependencies": {
|
|
32
|
+
"@eslint/js": "^9.26.0",
|
|
32
33
|
"ava": "^5.3.1",
|
|
33
34
|
"dotenv": "^16.3.1",
|
|
34
|
-
"
|
|
35
|
+
"eslint-plugin-import": "^2.31.0",
|
|
36
|
+
"globals": "^16.1.0",
|
|
37
|
+
"nock": "^13.3.0",
|
|
38
|
+
"typescript-eslint": "^8.32.1"
|
|
35
39
|
},
|
|
36
40
|
"ava": {
|
|
37
41
|
"files": [
|
|
38
|
-
"tests
|
|
42
|
+
"tests/**/*.test.js",
|
|
39
43
|
"!tests/test-files/**/*",
|
|
40
44
|
"!tests/test-docs/**/*",
|
|
41
45
|
"!tests/mocks/**/*"
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import redis from 'ioredis';
|
|
2
|
-
const connectionString = process.env[
|
|
2
|
+
const connectionString = process.env['REDIS_CONNECTION_STRING'];
|
|
3
3
|
const client = redis.createClient(connectionString);
|
|
4
4
|
// client.connect();
|
|
5
5
|
|
|
@@ -30,7 +30,7 @@ const publishRequestProgress = async (data) => {
|
|
|
30
30
|
// Function to get all key value pairs in "FileStoreMap" hash map
|
|
31
31
|
const getAllFileStoreMap = async () => {
|
|
32
32
|
try {
|
|
33
|
-
const allKeyValuePairs = await client.hgetall(
|
|
33
|
+
const allKeyValuePairs = await client.hgetall('FileStoreMap');
|
|
34
34
|
// Parse each JSON value in the returned object
|
|
35
35
|
for (const key in allKeyValuePairs) {
|
|
36
36
|
try {
|
|
@@ -43,7 +43,9 @@ const getAllFileStoreMap = async () => {
|
|
|
43
43
|
}
|
|
44
44
|
return allKeyValuePairs;
|
|
45
45
|
} catch (error) {
|
|
46
|
-
console.error(
|
|
46
|
+
console.error(
|
|
47
|
+
`Error getting all key-value pairs from FileStoreMap: ${error}`,
|
|
48
|
+
);
|
|
47
49
|
return {}; // Return null or any default value indicating an error occurred
|
|
48
50
|
}
|
|
49
51
|
};
|
|
@@ -52,7 +54,7 @@ const getAllFileStoreMap = async () => {
|
|
|
52
54
|
const setFileStoreMap = async (key, value) => {
|
|
53
55
|
try {
|
|
54
56
|
value.timestamp = new Date().toISOString();
|
|
55
|
-
await client.hset(
|
|
57
|
+
await client.hset('FileStoreMap', key, JSON.stringify(value));
|
|
56
58
|
} catch (error) {
|
|
57
59
|
console.error(`Error setting key in FileStoreMap: ${error}`);
|
|
58
60
|
}
|
|
@@ -60,7 +62,7 @@ const setFileStoreMap = async (key, value) => {
|
|
|
60
62
|
|
|
61
63
|
const getFileStoreMap = async (key) => {
|
|
62
64
|
try {
|
|
63
|
-
const value = await client.hget(
|
|
65
|
+
const value = await client.hget('FileStoreMap', key);
|
|
64
66
|
if (value) {
|
|
65
67
|
try {
|
|
66
68
|
// parse the value back to an object before returning
|
|
@@ -80,9 +82,9 @@ const getFileStoreMap = async (key) => {
|
|
|
80
82
|
// Function to remove key from "FileStoreMap" hash map
|
|
81
83
|
const removeFromFileStoreMap = async (key) => {
|
|
82
84
|
try {
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
const result = await client.hdel(
|
|
85
|
+
// hdel returns the number of keys that were removed.
|
|
86
|
+
// If the key does not exist, 0 is returned.
|
|
87
|
+
const result = await client.hdel('FileStoreMap', key);
|
|
86
88
|
if (result === 0) {
|
|
87
89
|
console.log(`The key ${key} does not exist`);
|
|
88
90
|
} else {
|
|
@@ -93,31 +95,35 @@ const removeFromFileStoreMap = async (key) => {
|
|
|
93
95
|
}
|
|
94
96
|
};
|
|
95
97
|
|
|
96
|
-
const cleanupRedisFileStoreMap = async (nDays=1) => {
|
|
97
|
-
|
|
98
|
+
const cleanupRedisFileStoreMap = async (nDays = 1) => {
|
|
99
|
+
const cleaned = [];
|
|
98
100
|
try {
|
|
99
101
|
const map = await getAllFileStoreMap();
|
|
100
102
|
const nDaysAgo = new Date(Date.now() - nDays * 24 * 60 * 60 * 1000);
|
|
101
103
|
|
|
102
|
-
for(const key in map){
|
|
104
|
+
for (const key in map) {
|
|
103
105
|
const value = map[key];
|
|
104
106
|
const timestamp = value?.timestamp ? new Date(value.timestamp) : null;
|
|
105
|
-
if(!timestamp || timestamp.getTime() < nDaysAgo.getTime()){
|
|
107
|
+
if (!timestamp || timestamp.getTime() < nDaysAgo.getTime()) {
|
|
106
108
|
// Remove the key from the "FileStoreMap" hash map
|
|
107
109
|
await removeFromFileStoreMap(key);
|
|
108
110
|
console.log(`Removed key ${key} from FileStoreMap`);
|
|
109
|
-
cleaned.push(Object.assign({hash:key}, value));
|
|
111
|
+
cleaned.push(Object.assign({ hash: key }, value));
|
|
110
112
|
}
|
|
111
113
|
}
|
|
112
114
|
} catch (error) {
|
|
113
115
|
console.error(`Error cleaning FileStoreMap: ${error}`);
|
|
114
116
|
} finally {
|
|
115
|
-
|
|
117
|
+
// Cleanup code if needed
|
|
116
118
|
}
|
|
117
119
|
return cleaned;
|
|
118
120
|
};
|
|
119
121
|
|
|
120
|
-
|
|
121
122
|
export {
|
|
122
|
-
publishRequestProgress,
|
|
123
|
-
|
|
123
|
+
publishRequestProgress,
|
|
124
|
+
connectClient,
|
|
125
|
+
setFileStoreMap,
|
|
126
|
+
getFileStoreMap,
|
|
127
|
+
removeFromFileStoreMap,
|
|
128
|
+
cleanupRedisFileStoreMap,
|
|
129
|
+
};
|
|
@@ -2,21 +2,24 @@ import { BlobServiceClient } from '@azure/storage-blob';
|
|
|
2
2
|
|
|
3
3
|
async function createContainer() {
|
|
4
4
|
try {
|
|
5
|
-
const blobServiceClient = BlobServiceClient.fromConnectionString(
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
5
|
+
const blobServiceClient = BlobServiceClient.fromConnectionString(
|
|
6
|
+
'UseDevelopmentStorage=true',
|
|
7
|
+
);
|
|
8
|
+
const containerClient =
|
|
9
|
+
blobServiceClient.getContainerClient('test-container');
|
|
10
|
+
|
|
11
|
+
console.log('Creating container...');
|
|
9
12
|
await containerClient.create();
|
|
10
|
-
console.log(
|
|
13
|
+
console.log('Container created successfully');
|
|
11
14
|
} catch (error) {
|
|
12
|
-
|
|
15
|
+
// Ignore if container already exists
|
|
13
16
|
if (error.statusCode === 409) {
|
|
14
|
-
console.log(
|
|
17
|
+
console.log('Container already exists');
|
|
15
18
|
} else {
|
|
16
|
-
console.error(
|
|
19
|
+
console.error('Error creating container:', error);
|
|
17
20
|
process.exit(1);
|
|
18
21
|
}
|
|
19
22
|
}
|
|
20
23
|
}
|
|
21
24
|
|
|
22
|
-
createContainer();
|
|
25
|
+
createContainer();
|
|
@@ -3,18 +3,21 @@ import { Storage } from '@google-cloud/storage';
|
|
|
3
3
|
|
|
4
4
|
async function createAzureContainer() {
|
|
5
5
|
try {
|
|
6
|
-
const blobServiceClient = BlobServiceClient.fromConnectionString(
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
6
|
+
const blobServiceClient = BlobServiceClient.fromConnectionString(
|
|
7
|
+
'UseDevelopmentStorage=true',
|
|
8
|
+
);
|
|
9
|
+
const containerClient =
|
|
10
|
+
blobServiceClient.getContainerClient('test-container');
|
|
11
|
+
|
|
12
|
+
console.log('Creating Azure container...');
|
|
10
13
|
await containerClient.create();
|
|
11
|
-
console.log(
|
|
14
|
+
console.log('Azure container created successfully');
|
|
12
15
|
} catch (error) {
|
|
13
|
-
|
|
16
|
+
// Ignore if container already exists
|
|
14
17
|
if (error.statusCode === 409) {
|
|
15
|
-
console.log(
|
|
18
|
+
console.log('Azure container already exists');
|
|
16
19
|
} else {
|
|
17
|
-
console.error(
|
|
20
|
+
console.error('Error creating Azure container:', error);
|
|
18
21
|
process.exit(1);
|
|
19
22
|
}
|
|
20
23
|
}
|
|
@@ -23,19 +26,19 @@ async function createAzureContainer() {
|
|
|
23
26
|
async function createGCSBucket() {
|
|
24
27
|
try {
|
|
25
28
|
const storage = new Storage({
|
|
26
|
-
projectId:
|
|
27
|
-
apiEndpoint:
|
|
29
|
+
projectId: 'test-project',
|
|
30
|
+
apiEndpoint: 'http://localhost:4443',
|
|
28
31
|
});
|
|
29
|
-
|
|
30
|
-
console.log(
|
|
31
|
-
await storage.createBucket(
|
|
32
|
-
console.log(
|
|
32
|
+
|
|
33
|
+
console.log('Creating GCS bucket...');
|
|
34
|
+
await storage.createBucket('cortextempfiles');
|
|
35
|
+
console.log('GCS bucket created successfully');
|
|
33
36
|
} catch (error) {
|
|
34
|
-
|
|
37
|
+
// Ignore if bucket already exists
|
|
35
38
|
if (error.code === 409) {
|
|
36
|
-
console.log(
|
|
39
|
+
console.log('GCS bucket already exists');
|
|
37
40
|
} else {
|
|
38
|
-
console.error(
|
|
41
|
+
console.error('Error creating GCS bucket:', error);
|
|
39
42
|
process.exit(1);
|
|
40
43
|
}
|
|
41
44
|
}
|
|
@@ -46,4 +49,4 @@ async function setup() {
|
|
|
46
49
|
await createGCSBucket();
|
|
47
50
|
}
|
|
48
51
|
|
|
49
|
-
setup();
|
|
52
|
+
setup();
|
|
@@ -20,7 +20,7 @@ echo "Starting test environment..."
|
|
|
20
20
|
# Start Azurite if not running
|
|
21
21
|
if ! nc -z localhost 10000; then
|
|
22
22
|
echo "Starting Azurite..."
|
|
23
|
-
azurite --silent --location .azurite --debug .azurite/debug.log &
|
|
23
|
+
azurite --silent --skipApiVersionCheck --location .azurite --debug .azurite/debug.log &
|
|
24
24
|
AZURITE_PID=$!
|
|
25
25
|
# Wait for Azurite to be ready
|
|
26
26
|
until nc -z localhost 10000; do
|
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
import fs from 'fs/promises';
|
|
2
|
+
import os from 'os';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
import { createReadStream, createWriteStream } from 'fs';
|
|
5
|
+
import { pipeline } from 'stream/promises';
|
|
6
|
+
import axios from 'axios';
|
|
7
|
+
import XLSX from 'xlsx';
|
|
8
|
+
import { CONVERTED_EXTENSIONS } from '../constants.js';
|
|
9
|
+
import { v4 as uuidv4 } from 'uuid';
|
|
10
|
+
|
|
11
|
+
const MARKITDOWN_CONVERT_URL = process.env.MARKITDOWN_CONVERT_URL;
|
|
12
|
+
|
|
13
|
+
if (!MARKITDOWN_CONVERT_URL) {
|
|
14
|
+
throw new Error('MARKITDOWN_CONVERT_URL is not set');
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
/**
 * Base class that converts uploaded files into text-friendly formats:
 * Excel workbooks become CSV and Word/PowerPoint documents become Markdown.
 *
 * Storage and network access are delegated to the `_`-prefixed hook methods
 * near the bottom of the class; in this base class most of them throw
 * "must be implemented" and are expected to be overridden by a subclass.
 */
export class ConversionService {
  /**
   * @param {{log: Function}} context - Object whose `log` method receives
   *   progress and error messages.
   */
  constructor(context) {
    this.context = context;
  }

  /**
   * Determines if a file needs conversion based on its extension
   * @param {string} filename - The name of the file to check. It must carry a
   *   base name before the extension: `path.extname('.docx')` is `''`, so a
   *   bare extension never matches.
   * @returns {boolean} - Whether the file needs conversion
   */
  needsConversion(filename) {
    const ext = path.extname(filename).toLowerCase();
    return CONVERTED_EXTENSIONS.includes(ext);
  }

  /**
   * Converts a file to its appropriate format
   *
   * A temporary directory is only created when a conversion is actually
   * attempted, and it is removed again if the conversion throws. On success
   * the caller owns `convertedPath` (and its parent directory).
   *
   * @param {string} filePath - Path to the file to convert
   * @param {string} originalUrl - Original URL of the file (required for document conversion)
   * @param {boolean} forceConversion - If true, bypasses extension check and forces document conversion
   * @returns {Promise<{convertedPath: string, convertedName: string, converted: boolean}>}
   *   `{ converted: false }` (no other keys) when the extension is not handled.
   * @throws Re-throws whatever the underlying conversion step throws.
   */
  async convertFile(filePath, originalUrl = null, forceConversion = false) {
    this.context.log('Converting file:', { filePath, originalUrl, forceConversion });

    // Clean the file path by removing any query parameters
    const cleanFilePath = filePath.split('?')[0];
    const ext = path.extname(cleanFilePath).toLowerCase();
    const isExcel = ext === '.xlsx' || ext === '.xls';
    const isDocument = ['.docx', '.doc', '.ppt', '.pptx'].includes(ext);

    // Decide before touching the filesystem so that "nothing to do" does not
    // leave an empty temp directory behind.
    if (!forceConversion && !isExcel && !isDocument) {
      this.context.log('No conversion needed for this file type');
      return { converted: false };
    }

    const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'convert-'));

    try {
      // forceConversion takes precedence over the Excel branch.
      if (!forceConversion && isExcel) {
        return await this._handleExcelConversion(filePath, tempDir);
      }
      return await this._handleDocumentConversion(filePath, originalUrl, tempDir);
    } catch (error) {
      this.context.log('Error in convertFile:', error);
      // Clean up temp directory on error
      await fs.rm(tempDir, { recursive: true, force: true });
      throw error;
    }
  }

  /**
   * Ensures a file has both original and converted versions
   *
   * Looks up `<hash>_converted` in the file store map and reuses it when the
   * stored URLs still exist; otherwise downloads the original, converts it,
   * saves the result and records it in the map. Any failure while converting
   * is logged and the original `fileInfo` is returned unchanged.
   *
   * @param {Object} fileInfo - Information about the file (`url`, `hash`, optional `gcs`)
   * @param {string} requestId - Request ID for storage
   * @returns {Promise<Object>} - Updated file info with conversion if needed
   */
  async ensureConvertedVersion(fileInfo, requestId) {
    const { url, gcs } = fileInfo;

    // Without a URL there is nothing to download or convert.
    if (typeof url !== 'string' || url === '') {
      return fileInfo;
    }

    // Pass a real file name: needsConversion() relies on path.extname(), and
    // a bare ".docx" has no extension as far as path.extname() is concerned.
    const fileName = this._fileNameFromUrl(url);

    // If file doesn't need conversion, return original info
    if (!this.needsConversion(fileName)) {
      return fileInfo;
    }

    // Check if converted version exists in the hash map
    const convertedKey = `${fileInfo.hash}_converted`;
    const convertedInfo = await this._getFileStoreMap(convertedKey);

    if (convertedInfo) {
      // Verify both primary and GCS URLs exist
      const primaryExists = await this._urlExists(convertedInfo.url);
      const gcsExists = gcs ? await this._gcsUrlExists(convertedInfo.gcs) : false;

      // If both URLs exist, return the info
      if (primaryExists?.valid && (!gcs || gcsExists)) {
        return { ...fileInfo, converted: convertedInfo };
      }

      this.context.log('Conversion needed - missing URLs:', {
        primaryExists: primaryExists?.valid,
        gcsExists,
        convertedInfo,
      });
    } else {
      this.context.log('Conversion needed - no converted info in map');
    }

    let tempDir;
    let conversion;
    try {
      tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'download-'));

      // Download original file
      const downloadedFile = path.join(tempDir, fileName);
      await this._downloadFile(url, downloadedFile);

      // Convert the file
      conversion = await this.convertFile(downloadedFile, url);
      if (!conversion.converted) {
        throw new Error('File conversion failed');
      }

      // Save converted file to primary storage
      const convertedSaveResult = await this._saveConvertedFile(
        conversion.convertedPath,
        requestId,
      );
      if (!convertedSaveResult) {
        throw new Error('Failed to save converted file to primary storage');
      }

      // If GCS is configured, also save to GCS
      let gcsUrl;
      if (this._isGCSConfigured()) {
        gcsUrl = await this._uploadChunkToGCS(conversion.convertedPath, requestId);
      }

      const convertedFileInfo = {
        url: convertedSaveResult.url,
        gcs: gcsUrl,
      };

      // Only store in map if we have at least the primary URL
      if (!convertedFileInfo.url) {
        throw new Error('Failed to get primary URL for converted file');
      }
      await this._setFileStoreMap(convertedKey, convertedFileInfo);
      this.context.log('Stored converted file info:', convertedFileInfo);

      return { ...fileInfo, converted: convertedFileInfo };
    } catch (error) {
      this.context.log('Error ensuring converted version:', error);
      // Don't return partial conversion results
      return fileInfo;
    } finally {
      // Runs on success and on failure so temp data never accumulates.
      await this._removeTempArtifacts(tempDir, conversion?.convertedPath);
    }
  }

  // Private helper methods

  /**
   * Extracts the last path segment of a URL or path, ignoring any query
   * string or fragment (for example a signed-URL token).
   * @param {string} url
   * @returns {string} The file name, or `''` when there is none.
   */
  _fileNameFromUrl(url) {
    return path.basename(url.split(/[?#]/)[0]);
  }

  /**
   * Best-effort removal of the temp data produced by ensureConvertedVersion.
   * Never throws; problems are logged through the context.
   * @param {string|undefined} tempDir - Download directory created by this class.
   * @param {string|undefined} convertedPath - Converted file returned by convertFile.
   */
  async _removeTempArtifacts(tempDir, convertedPath) {
    await this._cleanupTempFiles(convertedPath);

    if (convertedPath) {
      try {
        // Non-recursive on purpose: rmdir only succeeds on an empty
        // directory, so nothing but the emptied conversion dir can be lost.
        await fs.rmdir(path.dirname(convertedPath));
      } catch (err) {
        if (err.code !== 'ENOENT' && err.code !== 'ENOTEMPTY') {
          this.context.log('Error cleaning up temp directory:', err);
        }
      }
    }

    if (tempDir) {
      try {
        await fs.rm(tempDir, { recursive: true, force: true });
      } catch (err) {
        this.context.log('Error cleaning up temp directory:', err);
      }
    }
  }

  /**
   * Converts an Excel workbook to a single CSV file inside `tempDir`.
   * @param {string} filePath - Path of the workbook on disk.
   * @param {string} tempDir - Directory that receives the CSV.
   * @returns {Promise<{convertedPath: string, convertedName: string, converted: boolean}>}
   */
  async _handleExcelConversion(filePath, tempDir) {
    this.context.log('Handling Excel file conversion');
    // _xlsxToCsv writes next to the source file; the result is then copied
    // into tempDir and the intermediate file removed.
    const csvPath = await this._xlsxToCsv(filePath);
    const ext = path.extname(filePath);
    const convertedPath = path.join(
      tempDir,
      `${path.basename(filePath, ext)}.csv`,
    );

    await pipeline(
      createReadStream(csvPath, { highWaterMark: 64 * 1024 }),
      createWriteStream(convertedPath, { highWaterMark: 64 * 1024 }),
    );
    await fs.unlink(csvPath);

    return {
      convertedPath,
      convertedName: path.basename(convertedPath),
      converted: true,
    };
  }

  /**
   * Converts a document to Markdown via _convertToMarkdown and writes the
   * result as `<basename>.md` inside `tempDir`.
   * @param {string} filePath - Local path; only used to derive the output name.
   * @param {string} originalUrl - URL handed to the Markdown converter.
   * @param {string} tempDir - Directory that receives the Markdown file.
   * @returns {Promise<{convertedPath: string, convertedName: string, converted: boolean}>}
   * @throws {Error} When `originalUrl` is missing or the converter returns nothing.
   */
  async _handleDocumentConversion(filePath, originalUrl, tempDir) {
    this.context.log('Handling document conversion');
    if (!originalUrl) {
      throw new Error('Original URL is required for document conversion');
    }

    const markdown = await this._convertToMarkdown(originalUrl);
    if (!markdown) {
      throw new Error('Markdown conversion returned empty result');
    }

    const ext = path.extname(filePath);
    const baseFilename = path.basename(filePath, ext);
    const convertedPath = path.join(tempDir, `${baseFilename}.md`);
    await fs.writeFile(convertedPath, markdown);

    return {
      convertedPath,
      convertedName: path.basename(convertedPath),
      converted: true,
    };
  }

  /**
   * Requests a Markdown rendering of `fileUrl` by issuing a GET to
   * MARKITDOWN_CONVERT_URL with the URL-encoded file URL appended.
   * @param {string} fileUrl
   * @returns {Promise<string>} The `markdown` field of the response body, or `''`.
   * @throws Re-throws request errors after logging them.
   */
  async _convertToMarkdown(fileUrl) {
    try {
      const apiUrl = `${MARKITDOWN_CONVERT_URL}${encodeURIComponent(fileUrl)}`;
      const response = await axios.get(apiUrl);
      return response.data.markdown || '';
    } catch (err) {
      this.context.log('Error converting to markdown:', err);
      throw err;
    }
  }

  /**
   * Writes every sheet of a workbook into one CSV file next to the source,
   * each sheet preceded by a `Sheet: <name>` line.
   * @param {string} filePath - Path of the workbook.
   * @returns {Promise<string>} Path of the CSV file that was written.
   */
  async _xlsxToCsv(filePath) {
    const workbook = XLSX.readFile(filePath, { type: 'buffer' });
    const outputPath = filePath.replace(/\.[^/.]+$/, '.csv');
    let csvContent = '';

    workbook.SheetNames.forEach((sheetName) => {
      const sheet = workbook.Sheets[sheetName];
      const csv = XLSX.utils.sheet_to_csv(sheet);
      csvContent += `Sheet: ${sheetName}\n${csv}\n\n`;
    });

    await fs.writeFile(outputPath, csvContent);
    return outputPath;
  }

  // Storage-related methods (to be implemented by the caller)
  async _getFileStoreMap(key) {
    throw new Error('Method _getFileStoreMap must be implemented');
  }

  async _setFileStoreMap(key, value) {
    throw new Error('Method _setFileStoreMap must be implemented');
  }

  // ensureConvertedVersion reads the `valid` property of the resolved value.
  async _urlExists(url) {
    throw new Error('Method _urlExists must be implemented');
  }

  async _gcsUrlExists(url) {
    throw new Error('Method _gcsUrlExists must be implemented');
  }

  async _downloadFile(url, destination) {
    throw new Error('Method _downloadFile must be implemented');
  }

  // ensureConvertedVersion reads the `url` property of the resolved value.
  async _saveConvertedFile(filePath, requestId) {
    throw new Error('Method _saveConvertedFile must be implemented');
  }

  async _uploadChunkToGCS(filePath, requestId) {
    throw new Error('Method _uploadChunkToGCS must be implemented');
  }

  _isGCSConfigured() {
    throw new Error('Method _isGCSConfigured must be implemented');
  }

  /**
   * Deletes the given files if they exist. Falsy entries are skipped and
   * failures are logged rather than thrown.
   * @param {...(string|undefined)} files - File paths to delete.
   */
  async _cleanupTempFiles(...files) {
    for (const file of files) {
      try {
        if (file && await fs.access(file).then(() => true).catch(() => false)) {
          await fs.unlink(file);
        }
      } catch (err) {
        this.context.log('Error cleaning up temp file:', err);
      }
    }
  }
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import { ConversionService } from './ConversionService.js';
|
|
2
|
+
import { getFileStoreMap, setFileStoreMap } from '../redis.js';
|
|
3
|
+
import { urlExists } from '../helper.js';
|
|
4
|
+
import { gcsUrlExists, uploadChunkToGCS, gcs } from '../blobHandler.js';
|
|
5
|
+
import { downloadFile } from '../fileChunker.js';
|
|
6
|
+
import { saveFileToBlob } from '../blobHandler.js';
|
|
7
|
+
import { moveFileToPublicFolder } from '../localFileHandler.js';
|
|
8
|
+
|
|
9
|
+
/**
 * ConversionService wired to this package's concrete helpers: the Redis file
 * store map, URL existence checks, file download, and blob / local / GCS
 * storage. Each hook simply forwards to the imported function of the same
 * purpose.
 */
export class FileConversionService extends ConversionService {
  /**
   * @param {Object} context - Passed through to ConversionService.
   * @param {boolean} [useAzure=true] - When true, converted files are saved
   *   with saveFileToBlob; otherwise with moveFileToPublicFolder.
   */
  constructor(context, useAzure = true) {
    super(context);
    this.useAzure = useAzure;
  }

  /** Reads an entry from the file store map. */
  async _getFileStoreMap(key) {
    return getFileStoreMap(key);
  }

  /** Writes an entry to the file store map. */
  async _setFileStoreMap(key, value) {
    return setFileStoreMap(key, value);
  }

  /** Delegates to the `urlExists` helper. */
  async _urlExists(url) {
    return urlExists(url);
  }

  /** Delegates to the `gcsUrlExists` helper. */
  async _gcsUrlExists(url) {
    return gcsUrlExists(url);
  }

  /** Delegates to the `downloadFile` helper. */
  async _downloadFile(url, destination) {
    return downloadFile(url, destination);
  }

  /**
   * Stores the converted file in primary storage.
   * @param {string} filePath - Path of the converted file.
   * @param {string} requestId - Request ID forwarded to the storage helper.
   * @returns {Promise<{url: *}>} Object holding the URL reported by the helper.
   */
  async _saveConvertedFile(filePath, requestId) {
    if (!this.useAzure) {
      const publicUrl = await moveFileToPublicFolder(filePath, requestId);
      return { url: publicUrl };
    }

    const { url } = await saveFileToBlob(filePath, requestId);
    return { url };
  }

  /** Delegates to the `uploadChunkToGCS` helper. */
  async _uploadChunkToGCS(filePath, requestId) {
    return uploadChunkToGCS(filePath, requestId);
  }

  /** True when the `gcs` export of blobHandler.js is truthy. */
  _isGCSConfigured() {
    return Boolean(gcs);
  }
}
|