@aj-archipelago/cortex 1.3.49 → 1.3.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config.js +1 -1
- package/helper-apps/cortex-browser/Dockerfile +19 -31
- package/helper-apps/cortex-browser/function_app.py +708 -181
- package/helper-apps/cortex-browser/requirements.txt +4 -4
- package/helper-apps/cortex-file-handler/blobHandler.js +850 -429
- package/helper-apps/cortex-file-handler/constants.js +64 -48
- package/helper-apps/cortex-file-handler/docHelper.js +7 -114
- package/helper-apps/cortex-file-handler/fileChunker.js +96 -51
- package/helper-apps/cortex-file-handler/function.json +2 -6
- package/helper-apps/cortex-file-handler/helper.js +34 -25
- package/helper-apps/cortex-file-handler/index.js +324 -136
- package/helper-apps/cortex-file-handler/localFileHandler.js +56 -57
- package/helper-apps/cortex-file-handler/package-lock.json +6065 -5964
- package/helper-apps/cortex-file-handler/package.json +8 -4
- package/helper-apps/cortex-file-handler/redis.js +23 -17
- package/helper-apps/cortex-file-handler/scripts/setup-azure-container.js +12 -9
- package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +21 -18
- package/helper-apps/cortex-file-handler/scripts/test-azure.sh +1 -1
- package/helper-apps/cortex-file-handler/scripts/test-gcs.sh +1 -1
- package/helper-apps/cortex-file-handler/services/ConversionService.js +288 -0
- package/helper-apps/cortex-file-handler/services/FileConversionService.js +53 -0
- package/helper-apps/cortex-file-handler/start.js +63 -38
- package/helper-apps/cortex-file-handler/tests/FileConversionService.test.js +144 -0
- package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +88 -64
- package/helper-apps/cortex-file-handler/tests/fileChunker.test.js +114 -91
- package/helper-apps/cortex-file-handler/tests/fileUpload.test.js +351 -0
- package/helper-apps/cortex-file-handler/tests/files/DOCX_TestPage.docx +0 -0
- package/helper-apps/cortex-file-handler/tests/files/tests-example.xls +0 -0
- package/helper-apps/cortex-file-handler/tests/start.test.js +943 -642
- package/helper-apps/cortex-file-handler/tests/testUtils.helper.js +31 -0
- package/helper-apps/cortex-markitdown/.funcignore +1 -0
- package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/__init__.py +64 -0
- package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/function.json +21 -0
- package/helper-apps/cortex-markitdown/README.md +94 -0
- package/helper-apps/cortex-markitdown/host.json +15 -0
- package/helper-apps/cortex-markitdown/requirements.txt +2 -0
- package/lib/requestExecutor.js +44 -36
- package/package.json +1 -1
- package/pathways/system/entity/tools/sys_tool_cognitive_search.js +1 -1
- package/pathways/system/entity/tools/sys_tool_readfile.js +24 -2
- package/server/plugins/openAiWhisperPlugin.js +59 -87
- package/helper-apps/cortex-file-handler/tests/docHelper.test.js +0 -148
|
@@ -1,48 +1,42 @@
|
|
|
1
1
|
export const DOC_EXTENSIONS = [
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
2
|
+
'.txt',
|
|
3
|
+
'.json',
|
|
4
|
+
'.csv',
|
|
5
|
+
'.md',
|
|
6
|
+
'.xml',
|
|
7
|
+
'.js',
|
|
8
|
+
'.html',
|
|
9
|
+
'.css',
|
|
10
|
+
'.doc',
|
|
11
|
+
'.docx',
|
|
12
|
+
'.xls',
|
|
13
|
+
'.xlsx',
|
|
14
14
|
];
|
|
15
15
|
|
|
16
16
|
export const IMAGE_EXTENSIONS = [
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
17
|
+
'.jpg',
|
|
18
|
+
'.jpeg',
|
|
19
|
+
'.png',
|
|
20
|
+
'.webp',
|
|
21
|
+
'.heic',
|
|
22
|
+
'.heif',
|
|
23
|
+
'.pdf',
|
|
24
24
|
];
|
|
25
|
-
|
|
25
|
+
|
|
26
26
|
export const VIDEO_EXTENSIONS = [
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
];
|
|
37
|
-
|
|
38
|
-
export const AUDIO_EXTENSIONS = [
|
|
39
|
-
".wav",
|
|
40
|
-
".mp3",
|
|
41
|
-
".aac",
|
|
42
|
-
".ogg",
|
|
43
|
-
".flac"
|
|
27
|
+
'.mp4',
|
|
28
|
+
'.mpeg',
|
|
29
|
+
'.mov',
|
|
30
|
+
'.avi',
|
|
31
|
+
'.flv',
|
|
32
|
+
'.mpg',
|
|
33
|
+
'.webm',
|
|
34
|
+
'.wmv',
|
|
35
|
+
'.3gp',
|
|
44
36
|
];
|
|
45
37
|
|
|
38
|
+
export const AUDIO_EXTENSIONS = ['.wav', '.mp3', '.aac', '.ogg', '.flac'];
|
|
39
|
+
|
|
46
40
|
export const ACCEPTED_MIME_TYPES = {
|
|
47
41
|
// Document types
|
|
48
42
|
'text/plain': ['.txt'],
|
|
@@ -53,24 +47,35 @@ export const ACCEPTED_MIME_TYPES = {
|
|
|
53
47
|
'text/javascript': ['.js'],
|
|
54
48
|
'text/html': ['.html'],
|
|
55
49
|
'text/css': ['.css'],
|
|
56
|
-
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': [
|
|
57
|
-
|
|
50
|
+
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': [
|
|
51
|
+
'.docx',
|
|
52
|
+
],
|
|
53
|
+
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': [
|
|
54
|
+
'.xlsx',
|
|
55
|
+
],
|
|
58
56
|
'application/msword': ['.doc'],
|
|
59
57
|
'application/vnd.ms-excel': ['.xls'],
|
|
60
58
|
'application/vnd.ms-word.document.macroEnabled.12': ['.docm'],
|
|
61
59
|
'application/vnd.ms-excel.sheet.macroEnabled.12': ['.xlsm'],
|
|
62
60
|
'application/vnd.ms-word.template.macroEnabled.12': ['.dotm'],
|
|
63
61
|
'application/vnd.ms-excel.template.macroEnabled.12': ['.xltm'],
|
|
64
|
-
|
|
62
|
+
|
|
65
63
|
// Image types
|
|
66
64
|
'image/jpeg': ['.jpg', '.jpeg'],
|
|
67
65
|
'image/png': ['.png'],
|
|
68
66
|
'image/webp': ['.webp'],
|
|
69
67
|
'image/heic': ['.heic'],
|
|
70
68
|
'image/heif': ['.heif'],
|
|
71
|
-
'application/octet-stream': [
|
|
69
|
+
'application/octet-stream': [
|
|
70
|
+
'.jpg',
|
|
71
|
+
'.jpeg',
|
|
72
|
+
'.png',
|
|
73
|
+
'.webp',
|
|
74
|
+
'.heic',
|
|
75
|
+
'.heif',
|
|
76
|
+
],
|
|
72
77
|
'application/pdf': ['.pdf'],
|
|
73
|
-
|
|
78
|
+
|
|
74
79
|
// Audio types
|
|
75
80
|
'audio/wav': ['.wav'],
|
|
76
81
|
'audio/mpeg': ['.mp3'],
|
|
@@ -81,7 +86,7 @@ export const ACCEPTED_MIME_TYPES = {
|
|
|
81
86
|
'audio/x-m4a': ['.m4a'],
|
|
82
87
|
'audio/mp3': ['.mp3'],
|
|
83
88
|
'audio/mp4': ['.mp4'],
|
|
84
|
-
|
|
89
|
+
|
|
85
90
|
// Video types
|
|
86
91
|
'video/mp4': ['.mp4'],
|
|
87
92
|
'video/mpeg': ['.mpeg', '.mpg'],
|
|
@@ -108,8 +113,19 @@ export function getExtensionsForMimeType(mimeType) {
|
|
|
108
113
|
|
|
109
114
|
// Helper function to check if an extension is accepted
|
|
110
115
|
export function isAcceptedExtension(extension) {
|
|
111
|
-
return
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
+
return (
|
|
117
|
+
DOC_EXTENSIONS.includes(extension) ||
|
|
118
|
+
IMAGE_EXTENSIONS.includes(extension) ||
|
|
119
|
+
VIDEO_EXTENSIONS.includes(extension) ||
|
|
120
|
+
AUDIO_EXTENSIONS.includes(extension)
|
|
121
|
+
);
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
export const CONVERTED_EXTENSIONS = [
|
|
125
|
+
'.doc',
|
|
126
|
+
'.docx',
|
|
127
|
+
'.xls',
|
|
128
|
+
'.xlsx',
|
|
129
|
+
'.ppt',
|
|
130
|
+
'.pptx',
|
|
131
|
+
];
|
|
@@ -1,115 +1,4 @@
|
|
|
1
|
-
|
|
2
|
-
import fs from 'fs/promises';
|
|
3
|
-
import mammoth from 'mammoth';
|
|
4
|
-
import XLSX from 'xlsx';
|
|
5
|
-
import Papa from 'papaparse';
|
|
6
|
-
|
|
7
|
-
export async function txtToText(filePath) {
|
|
8
|
-
const text = await fs.readFile(filePath, 'utf-8');
|
|
9
|
-
return text;
|
|
10
|
-
}
|
|
11
|
-
|
|
12
|
-
export async function docxToText(filePath) {
|
|
13
|
-
const buffer = await fs.readFile(filePath);
|
|
14
|
-
const result = await mammoth.extractRawText({ buffer: buffer });
|
|
15
|
-
return result.value;
|
|
16
|
-
}
|
|
17
|
-
|
|
18
|
-
export async function xlsxToText(filePath) {
|
|
19
|
-
const workbook = XLSX.readFile(filePath);
|
|
20
|
-
let finalText = '';
|
|
21
|
-
|
|
22
|
-
workbook.SheetNames.forEach(sheetName => {
|
|
23
|
-
const sheet = workbook.Sheets[sheetName];
|
|
24
|
-
const sheetAsJson = XLSX.utils.sheet_to_json(sheet, { header: 1 });
|
|
25
|
-
sheetAsJson.forEach(row => {
|
|
26
|
-
finalText += row.join(' ') + '\n';
|
|
27
|
-
});
|
|
28
|
-
});
|
|
29
|
-
|
|
30
|
-
return finalText;
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
async function pdfToText(filePath) {
|
|
34
|
-
const pdf = await pdfjsLib.getDocument(filePath).promise;
|
|
35
|
-
const meta = await pdf.getMetadata();
|
|
36
|
-
|
|
37
|
-
// Check if pdf is scanned
|
|
38
|
-
if (meta && meta.metadata && meta.metadata._metadataMap && meta.metadata._metadataMap.has('dc:format')) {
|
|
39
|
-
const format = meta.metadata._metadataMap.get('dc:format');
|
|
40
|
-
if (format && format._value && format._value.toLowerCase() === 'application/pdf; version=1.3') {
|
|
41
|
-
throw new Error('Scanned PDFs are not supported');
|
|
42
|
-
}
|
|
43
|
-
}
|
|
44
|
-
|
|
45
|
-
// Check if pdf is encrypted
|
|
46
|
-
if (pdf._pdfInfo && pdf._pdfInfo.encrypt) {
|
|
47
|
-
throw new Error('Encrypted PDFs are not supported');
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
// Check if pdf is password protected
|
|
51
|
-
if (pdf._passwordNeeded) {
|
|
52
|
-
throw new Error('Password protected PDFs are not supported');
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
let finalText = '';
|
|
56
|
-
let ocrNeeded = true; // Initialize the variable as true
|
|
57
|
-
|
|
58
|
-
for (let i = 1; i <= pdf.numPages; i++) {
|
|
59
|
-
const page = await pdf.getPage(i);
|
|
60
|
-
const operatorList = await page.getOperatorList();
|
|
61
|
-
|
|
62
|
-
// Check if there are any fonts used in the PDF
|
|
63
|
-
if (operatorList.fnArray.some(fn => fn === pdfjsLib.OPS.setFont)) {
|
|
64
|
-
ocrNeeded = false; // Set ocrNeeded to false if fonts are found
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
const textContent = await page.getTextContent();
|
|
68
|
-
const strings = textContent.items.map(item => item.str);
|
|
69
|
-
finalText += strings.join(' ') + '\n';
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
if (ocrNeeded) {
|
|
73
|
-
throw new Error('OCR might be needed for this document!');
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
return finalText.trim();
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
export async function csvToText(filePath) {
|
|
80
|
-
const text = await fs.readFile(filePath, 'utf-8');
|
|
81
|
-
const results = Papa.parse(text);
|
|
82
|
-
let finalText = '';
|
|
83
|
-
|
|
84
|
-
results.data.forEach(row => {
|
|
85
|
-
finalText += row.join(' ') + '\n';
|
|
86
|
-
});
|
|
87
|
-
|
|
88
|
-
return finalText;
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
export async function documentToText(filePath) {
|
|
92
|
-
const fileExtension = filePath.split('.').pop();
|
|
93
|
-
|
|
94
|
-
switch (fileExtension) {
|
|
95
|
-
case 'pdf':
|
|
96
|
-
return pdfToText(filePath);
|
|
97
|
-
case 'txt':
|
|
98
|
-
case 'html':
|
|
99
|
-
return txtToText(filePath);
|
|
100
|
-
case 'docx':
|
|
101
|
-
case 'doc':
|
|
102
|
-
return docxToText(filePath);
|
|
103
|
-
case 'xlsx':
|
|
104
|
-
case 'xls':
|
|
105
|
-
return xlsxToText(filePath);
|
|
106
|
-
case 'csv':
|
|
107
|
-
return csvToText(filePath);
|
|
108
|
-
default:
|
|
109
|
-
throw new Error(`Unsupported file type: ${fileExtension}`);
|
|
110
|
-
}
|
|
111
|
-
}
|
|
112
|
-
|
|
1
|
+
// Utility function for chunking text into smaller pieces
|
|
113
2
|
export function easyChunker(text) {
|
|
114
3
|
const result = [];
|
|
115
4
|
const n = 10000;
|
|
@@ -124,7 +13,11 @@ export function easyChunker(text) {
|
|
|
124
13
|
let endIndex = Math.min(startIndex + n, text.length);
|
|
125
14
|
|
|
126
15
|
// Make sure we don't split in the middle of a sentence
|
|
127
|
-
while (
|
|
16
|
+
while (
|
|
17
|
+
endIndex > startIndex &&
|
|
18
|
+
text[endIndex] !== '.' &&
|
|
19
|
+
text[endIndex] !== ' '
|
|
20
|
+
) {
|
|
128
21
|
endIndex--;
|
|
129
22
|
}
|
|
130
23
|
|
|
@@ -141,4 +34,4 @@ export function easyChunker(text) {
|
|
|
141
34
|
}
|
|
142
35
|
|
|
143
36
|
return result;
|
|
144
|
-
}
|
|
37
|
+
}
|
|
@@ -1,14 +1,17 @@
|
|
|
1
1
|
import fs from 'fs';
|
|
2
|
-
import
|
|
3
|
-
import
|
|
4
|
-
import { v4 as uuidv4 } from 'uuid';
|
|
2
|
+
import http from 'http';
|
|
3
|
+
import https from 'https';
|
|
5
4
|
import os from 'os';
|
|
5
|
+
import path from 'path';
|
|
6
|
+
import { Transform } from 'stream';
|
|
7
|
+
import { pipeline } from 'stream/promises';
|
|
6
8
|
import { promisify } from 'util';
|
|
9
|
+
|
|
7
10
|
import axios from 'axios';
|
|
11
|
+
import ffmpeg from 'fluent-ffmpeg';
|
|
12
|
+
import { v4 as uuidv4 } from 'uuid';
|
|
13
|
+
|
|
8
14
|
import { ensureEncoded } from './helper.js';
|
|
9
|
-
import http from 'http';
|
|
10
|
-
import https from 'https';
|
|
11
|
-
import { pipeline } from 'stream/promises';
|
|
12
15
|
|
|
13
16
|
const ffmpegProbe = promisify(ffmpeg.ffprobe);
|
|
14
17
|
|
|
@@ -18,7 +21,6 @@ const tempDirectories = new Map(); // dir -> { createdAt, requestId }
|
|
|
18
21
|
|
|
19
22
|
// Temp directory cleanup
|
|
20
23
|
async function cleanupTempDirectories() {
|
|
21
|
-
|
|
22
24
|
for (const [dir, info] of tempDirectories) {
|
|
23
25
|
try {
|
|
24
26
|
// Cleanup directories older than 1 hour
|
|
@@ -43,7 +45,7 @@ setInterval(async () => {
|
|
|
43
45
|
}
|
|
44
46
|
}, CLEANUP_INTERVAL_MS);
|
|
45
47
|
|
|
46
|
-
// Process a single chunk with streaming
|
|
48
|
+
// Process a single chunk with streaming and progress tracking
|
|
47
49
|
async function processChunk(inputPath, outputFileName, start, duration) {
|
|
48
50
|
return new Promise((resolve, reject) => {
|
|
49
51
|
const command = ffmpeg(inputPath)
|
|
@@ -71,8 +73,11 @@ async function processChunk(inputPath, outputFileName, start, duration) {
|
|
|
71
73
|
resolve(outputFileName);
|
|
72
74
|
});
|
|
73
75
|
|
|
74
|
-
// Use
|
|
75
|
-
|
|
76
|
+
// Use pipeline for better error handling and backpressure
|
|
77
|
+
pipeline(
|
|
78
|
+
command,
|
|
79
|
+
fs.createWriteStream(outputFileName, { highWaterMark: 4 * 1024 * 1024 }), // 4MB chunks
|
|
80
|
+
).catch(reject);
|
|
76
81
|
});
|
|
77
82
|
}
|
|
78
83
|
|
|
@@ -80,38 +85,55 @@ const generateUniqueFolderName = () => {
|
|
|
80
85
|
const uniqueFolderName = uuidv4();
|
|
81
86
|
const tempFolderPath = os.tmpdir();
|
|
82
87
|
return path.join(tempFolderPath, uniqueFolderName);
|
|
83
|
-
}
|
|
88
|
+
};
|
|
84
89
|
|
|
85
90
|
async function downloadFile(url, outputPath) {
|
|
86
91
|
try {
|
|
92
|
+
const agent = {
|
|
93
|
+
http: new http.Agent({
|
|
94
|
+
keepAlive: true,
|
|
95
|
+
maxSockets: 10,
|
|
96
|
+
maxFreeSockets: 10,
|
|
97
|
+
timeout: 60000,
|
|
98
|
+
}),
|
|
99
|
+
https: new https.Agent({
|
|
100
|
+
keepAlive: true,
|
|
101
|
+
maxSockets: 10,
|
|
102
|
+
maxFreeSockets: 10,
|
|
103
|
+
timeout: 60000,
|
|
104
|
+
}),
|
|
105
|
+
};
|
|
106
|
+
|
|
87
107
|
let response;
|
|
88
108
|
try {
|
|
89
|
-
response = await axios.get(decodeURIComponent(url), {
|
|
109
|
+
response = await axios.get(decodeURIComponent(url), {
|
|
90
110
|
responseType: 'stream',
|
|
91
|
-
// Add timeout and maxContentLength
|
|
92
111
|
timeout: 30000,
|
|
93
112
|
maxContentLength: Infinity,
|
|
94
|
-
// Enable streaming download
|
|
95
113
|
decompress: true,
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
114
|
+
httpAgent: agent.http,
|
|
115
|
+
httpsAgent: agent.https,
|
|
116
|
+
maxRedirects: 5,
|
|
117
|
+
validateStatus: (status) => status >= 200 && status < 300,
|
|
99
118
|
});
|
|
100
119
|
} catch (error) {
|
|
101
|
-
response = await axios.get(url, {
|
|
120
|
+
response = await axios.get(url, {
|
|
102
121
|
responseType: 'stream',
|
|
103
122
|
timeout: 30000,
|
|
104
123
|
maxContentLength: Infinity,
|
|
105
124
|
decompress: true,
|
|
106
|
-
httpAgent:
|
|
107
|
-
httpsAgent:
|
|
125
|
+
httpAgent: agent.http,
|
|
126
|
+
httpsAgent: agent.https,
|
|
127
|
+
maxRedirects: 5,
|
|
128
|
+
validateStatus: (status) => status >= 200 && status < 300,
|
|
108
129
|
});
|
|
109
130
|
}
|
|
110
131
|
|
|
111
|
-
const writer = fs.createWriteStream(outputPath);
|
|
112
|
-
|
|
113
132
|
// Use pipeline for better error handling and memory management
|
|
114
|
-
await pipeline(
|
|
133
|
+
await pipeline(
|
|
134
|
+
response.data,
|
|
135
|
+
fs.createWriteStream(outputPath, { highWaterMark: 4 * 1024 * 1024 }), // 4MB chunks
|
|
136
|
+
);
|
|
115
137
|
|
|
116
138
|
if (!fs.existsSync(outputPath) || fs.statSync(outputPath).size === 0) {
|
|
117
139
|
throw new Error('Download failed or file is empty');
|
|
@@ -124,25 +146,30 @@ async function downloadFile(url, outputPath) {
|
|
|
124
146
|
}
|
|
125
147
|
}
|
|
126
148
|
|
|
127
|
-
async function splitMediaFile(
|
|
149
|
+
async function splitMediaFile(
|
|
150
|
+
inputPath,
|
|
151
|
+
chunkDurationInSeconds = 500,
|
|
152
|
+
requestId = uuidv4(),
|
|
153
|
+
) {
|
|
128
154
|
let tempPath = null;
|
|
129
155
|
let uniqueOutputPath = null;
|
|
130
156
|
let inputStream = null;
|
|
131
|
-
|
|
157
|
+
|
|
132
158
|
try {
|
|
133
159
|
uniqueOutputPath = generateUniqueFolderName();
|
|
134
160
|
fs.mkdirSync(uniqueOutputPath, { recursive: true });
|
|
135
|
-
|
|
161
|
+
|
|
136
162
|
tempDirectories.set(uniqueOutputPath, {
|
|
137
163
|
createdAt: Date.now(),
|
|
138
|
-
requestId
|
|
164
|
+
requestId,
|
|
139
165
|
});
|
|
140
166
|
|
|
141
167
|
// Handle URL downloads with streaming
|
|
142
168
|
const isUrl = /^(https?|ftp):\/\/[^\s/$.?#].[^\s]*$/i.test(inputPath);
|
|
143
169
|
if (isUrl) {
|
|
144
170
|
const urlObj = new URL(ensureEncoded(inputPath));
|
|
145
|
-
const originalFileName =
|
|
171
|
+
const originalFileName =
|
|
172
|
+
path.basename(urlObj.pathname) || 'downloaded_file';
|
|
146
173
|
tempPath = path.join(uniqueOutputPath, originalFileName);
|
|
147
174
|
console.log('Downloading file to:', tempPath);
|
|
148
175
|
await downloadFile(inputPath, tempPath);
|
|
@@ -155,9 +182,9 @@ async function splitMediaFile(inputPath, chunkDurationInSeconds = 500, requestId
|
|
|
155
182
|
}
|
|
156
183
|
|
|
157
184
|
// Use a larger chunk size for better throughput while still managing memory
|
|
158
|
-
inputStream = fs.createReadStream(inputPath, {
|
|
185
|
+
inputStream = fs.createReadStream(inputPath, {
|
|
159
186
|
highWaterMark: 4 * 1024 * 1024, // 4MB chunks
|
|
160
|
-
autoClose: true
|
|
187
|
+
autoClose: true,
|
|
161
188
|
});
|
|
162
189
|
|
|
163
190
|
console.log('Probing file:', inputPath);
|
|
@@ -168,33 +195,50 @@ async function splitMediaFile(inputPath, chunkDurationInSeconds = 500, requestId
|
|
|
168
195
|
|
|
169
196
|
const duration = metadata.format.duration;
|
|
170
197
|
const numChunks = Math.ceil((duration - 1) / chunkDurationInSeconds);
|
|
171
|
-
console.log(
|
|
198
|
+
console.log(
|
|
199
|
+
`Processing ${numChunks} chunks of ${chunkDurationInSeconds} seconds each`,
|
|
200
|
+
);
|
|
172
201
|
|
|
173
202
|
const chunkResults = new Array(numChunks); // Pre-allocate array to maintain order
|
|
174
203
|
const chunkOffsets = new Array(numChunks); // Pre-allocate offsets array
|
|
175
204
|
|
|
176
205
|
// Process chunks in parallel with a concurrency limit
|
|
177
|
-
const CONCURRENT_CHUNKS = 3; //
|
|
206
|
+
const CONCURRENT_CHUNKS = Math.min(3, os.cpus().length); // Use CPU count to determine concurrency
|
|
207
|
+
const chunkPromises = [];
|
|
208
|
+
|
|
178
209
|
for (let i = 0; i < numChunks; i += CONCURRENT_CHUNKS) {
|
|
179
210
|
const chunkBatch = [];
|
|
180
211
|
for (let j = 0; j < CONCURRENT_CHUNKS && i + j < numChunks; j++) {
|
|
181
212
|
const chunkIndex = i + j;
|
|
182
|
-
const outputFileName = path.join(
|
|
213
|
+
const outputFileName = path.join(
|
|
214
|
+
uniqueOutputPath,
|
|
215
|
+
`chunk-${chunkIndex + 1}-${path.parse(inputPath).name}.mp3`,
|
|
216
|
+
);
|
|
183
217
|
const offset = chunkIndex * chunkDurationInSeconds;
|
|
184
|
-
|
|
185
|
-
chunkBatch.push(
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
218
|
+
|
|
219
|
+
chunkBatch.push(
|
|
220
|
+
processChunk(
|
|
221
|
+
inputPath,
|
|
222
|
+
outputFileName,
|
|
223
|
+
offset,
|
|
224
|
+
chunkDurationInSeconds,
|
|
225
|
+
)
|
|
226
|
+
.then((result) => {
|
|
227
|
+
chunkResults[chunkIndex] = result; // Store in correct position
|
|
228
|
+
chunkOffsets[chunkIndex] = offset; // Store offset in correct position
|
|
229
|
+
console.log(`Completed chunk ${chunkIndex + 1}/${numChunks}`);
|
|
230
|
+
return result;
|
|
231
|
+
})
|
|
232
|
+
.catch((error) => {
|
|
233
|
+
console.error(
|
|
234
|
+
`Failed to process chunk ${chunkIndex + 1}:`,
|
|
235
|
+
error,
|
|
236
|
+
);
|
|
237
|
+
return null;
|
|
238
|
+
}),
|
|
239
|
+
);
|
|
196
240
|
}
|
|
197
|
-
|
|
241
|
+
|
|
198
242
|
// Wait for the current batch to complete before starting the next
|
|
199
243
|
await Promise.all(chunkBatch);
|
|
200
244
|
}
|
|
@@ -207,7 +251,11 @@ async function splitMediaFile(inputPath, chunkDurationInSeconds = 500, requestId
|
|
|
207
251
|
throw new Error('No chunks were successfully processed');
|
|
208
252
|
}
|
|
209
253
|
|
|
210
|
-
return {
|
|
254
|
+
return {
|
|
255
|
+
chunkPromises: validChunks,
|
|
256
|
+
chunkOffsets: validOffsets,
|
|
257
|
+
uniqueOutputPath,
|
|
258
|
+
};
|
|
211
259
|
} catch (err) {
|
|
212
260
|
if (uniqueOutputPath && fs.existsSync(uniqueOutputPath)) {
|
|
213
261
|
try {
|
|
@@ -230,7 +278,4 @@ async function splitMediaFile(inputPath, chunkDurationInSeconds = 500, requestId
|
|
|
230
278
|
}
|
|
231
279
|
}
|
|
232
280
|
|
|
233
|
-
export {
|
|
234
|
-
splitMediaFile,
|
|
235
|
-
downloadFile
|
|
236
|
-
};
|
|
281
|
+
export { splitMediaFile, downloadFile };
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import fs from 'fs';
|
|
2
|
-
import { ACCEPTED_MIME_TYPES, isAcceptedMimeType } from './constants.js';
|
|
3
|
-
import path from 'path';
|
|
4
2
|
import http from 'http';
|
|
5
3
|
import https from 'https';
|
|
4
|
+
import path from 'path';
|
|
5
|
+
|
|
6
|
+
import { ACCEPTED_MIME_TYPES, isAcceptedMimeType } from './constants.js';
|
|
6
7
|
|
|
7
8
|
export async function deleteTempPath(path) {
|
|
8
9
|
try {
|
|
@@ -20,7 +21,9 @@ export async function deleteTempPath(path) {
|
|
|
20
21
|
console.log(`Temporary file ${path} deleted successfully.`);
|
|
21
22
|
} else if (stats.isDirectory()) {
|
|
22
23
|
fs.rmSync(path, { recursive: true });
|
|
23
|
-
console.log(
|
|
24
|
+
console.log(
|
|
25
|
+
`Temporary folder ${path} and its contents deleted successfully.`,
|
|
26
|
+
);
|
|
24
27
|
}
|
|
25
28
|
} catch (err) {
|
|
26
29
|
console.error('Error occurred while deleting the temporary path:', err);
|
|
@@ -38,7 +41,7 @@ export function getExtensionForMimeType(mimeType) {
|
|
|
38
41
|
// Ensure a filename has the correct extension based on its mime type
|
|
39
42
|
export function ensureFileExtension(filename, mimeType) {
|
|
40
43
|
if (!mimeType) return filename;
|
|
41
|
-
|
|
44
|
+
|
|
42
45
|
const extension = getExtensionForMimeType(mimeType);
|
|
43
46
|
if (!extension) return filename;
|
|
44
47
|
|
|
@@ -49,12 +52,12 @@ export function ensureFileExtension(filename, mimeType) {
|
|
|
49
52
|
|
|
50
53
|
// Get the current extension if any
|
|
51
54
|
const currentExt = path.extname(filename);
|
|
52
|
-
|
|
55
|
+
|
|
53
56
|
// If there's no current extension, just append the new one
|
|
54
57
|
if (!currentExt) {
|
|
55
58
|
return `${filename}${extension}`;
|
|
56
59
|
}
|
|
57
|
-
|
|
60
|
+
|
|
58
61
|
// Replace the current extension with the new one
|
|
59
62
|
return filename.slice(0, -currentExt.length) + extension;
|
|
60
63
|
}
|
|
@@ -69,39 +72,45 @@ export function ensureEncoded(url) {
|
|
|
69
72
|
}
|
|
70
73
|
|
|
71
74
|
export async function urlExists(url) {
|
|
72
|
-
if(!url) return false;
|
|
73
|
-
|
|
75
|
+
if (!url) return false;
|
|
76
|
+
|
|
74
77
|
try {
|
|
75
|
-
|
|
78
|
+
// Basic URL validation
|
|
76
79
|
const urlObj = new URL(url);
|
|
77
80
|
if (!['http:', 'https:'].includes(urlObj.protocol)) {
|
|
78
81
|
throw new Error('Invalid protocol - only HTTP and HTTPS are supported');
|
|
79
82
|
}
|
|
80
83
|
|
|
81
84
|
const httpModule = urlObj.protocol === 'https:' ? https : http;
|
|
82
|
-
|
|
85
|
+
|
|
83
86
|
return new Promise((resolve) => {
|
|
84
|
-
const request = httpModule.request(
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
87
|
+
const request = httpModule.request(
|
|
88
|
+
url,
|
|
89
|
+
{ method: 'HEAD' },
|
|
90
|
+
function (response) {
|
|
91
|
+
if (response.statusCode >= 200 && response.statusCode < 400) {
|
|
92
|
+
const contentType = response.headers['content-type'];
|
|
93
|
+
const cleanContentType = contentType
|
|
94
|
+
? contentType.split(';')[0].trim()
|
|
95
|
+
: '';
|
|
96
|
+
// Check if the content type is one we accept
|
|
97
|
+
if (cleanContentType && isAcceptedMimeType(cleanContentType)) {
|
|
98
|
+
resolve({ valid: true, contentType: cleanContentType });
|
|
99
|
+
} else {
|
|
100
|
+
console.log(`Unsupported content type: ${contentType}`);
|
|
101
|
+
resolve({ valid: false });
|
|
102
|
+
}
|
|
91
103
|
} else {
|
|
92
|
-
console.log(`Unsupported content type: ${contentType}`);
|
|
93
104
|
resolve({ valid: false });
|
|
94
105
|
}
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
request.on('error', function(err) {
|
|
106
|
+
},
|
|
107
|
+
);
|
|
108
|
+
|
|
109
|
+
request.on('error', function (err) {
|
|
101
110
|
console.error('URL validation error:', err.message);
|
|
102
111
|
resolve({ valid: false });
|
|
103
112
|
});
|
|
104
|
-
|
|
113
|
+
|
|
105
114
|
request.end();
|
|
106
115
|
});
|
|
107
116
|
} catch (error) {
|