@aj-archipelago/cortex 1.3.57 → 1.3.59
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -0
- package/config.js +22 -0
- package/helper-apps/cortex-file-handler/INTERFACE.md +20 -9
- package/helper-apps/cortex-file-handler/package-lock.json +2 -2
- package/helper-apps/cortex-file-handler/package.json +1 -1
- package/helper-apps/cortex-file-handler/scripts/setup-azure-container.js +17 -17
- package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +35 -35
- package/helper-apps/cortex-file-handler/src/blobHandler.js +1010 -909
- package/helper-apps/cortex-file-handler/src/constants.js +98 -98
- package/helper-apps/cortex-file-handler/src/docHelper.js +27 -27
- package/helper-apps/cortex-file-handler/src/fileChunker.js +224 -214
- package/helper-apps/cortex-file-handler/src/helper.js +93 -93
- package/helper-apps/cortex-file-handler/src/index.js +584 -550
- package/helper-apps/cortex-file-handler/src/localFileHandler.js +86 -86
- package/helper-apps/cortex-file-handler/src/redis.js +186 -90
- package/helper-apps/cortex-file-handler/src/services/ConversionService.js +301 -273
- package/helper-apps/cortex-file-handler/src/services/FileConversionService.js +55 -55
- package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +174 -154
- package/helper-apps/cortex-file-handler/src/services/storage/GCSStorageProvider.js +239 -223
- package/helper-apps/cortex-file-handler/src/services/storage/LocalStorageProvider.js +161 -159
- package/helper-apps/cortex-file-handler/src/services/storage/StorageFactory.js +73 -71
- package/helper-apps/cortex-file-handler/src/services/storage/StorageProvider.js +46 -45
- package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +256 -213
- package/helper-apps/cortex-file-handler/src/start.js +4 -1
- package/helper-apps/cortex-file-handler/src/utils/filenameUtils.js +59 -25
- package/helper-apps/cortex-file-handler/tests/FileConversionService.test.js +119 -116
- package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +257 -257
- package/helper-apps/cortex-file-handler/tests/cleanup.test.js +676 -0
- package/helper-apps/cortex-file-handler/tests/conversionResilience.test.js +124 -124
- package/helper-apps/cortex-file-handler/tests/fileChunker.test.js +249 -208
- package/helper-apps/cortex-file-handler/tests/fileUpload.test.js +439 -380
- package/helper-apps/cortex-file-handler/tests/getOperations.test.js +299 -263
- package/helper-apps/cortex-file-handler/tests/postOperations.test.js +265 -239
- package/helper-apps/cortex-file-handler/tests/start.test.js +1230 -1201
- package/helper-apps/cortex-file-handler/tests/storage/AzureStorageProvider.test.js +110 -105
- package/helper-apps/cortex-file-handler/tests/storage/GCSStorageProvider.test.js +201 -175
- package/helper-apps/cortex-file-handler/tests/storage/LocalStorageProvider.test.js +128 -125
- package/helper-apps/cortex-file-handler/tests/storage/StorageFactory.test.js +78 -73
- package/helper-apps/cortex-file-handler/tests/storage/StorageService.test.js +99 -99
- package/helper-apps/cortex-file-handler/tests/testUtils.helper.js +74 -70
- package/package.json +1 -1
- package/pathways/translate_apptek.js +33 -0
- package/pathways/translate_subtitle.js +15 -8
- package/server/plugins/apptekTranslatePlugin.js +46 -91
- package/tests/apptekTranslatePlugin.test.js +0 -2
- package/tests/integration/apptekTranslatePlugin.integration.test.js +159 -93
- package/tests/translate_apptek.test.js +16 -0
|
@@ -1,17 +1,18 @@
|
|
|
1
|
-
import fs from
|
|
2
|
-
import http from
|
|
3
|
-
import https from
|
|
4
|
-
import os from
|
|
5
|
-
import path from
|
|
6
|
-
import { Transform } from
|
|
7
|
-
import { pipeline } from
|
|
8
|
-
import { promisify } from
|
|
1
|
+
import fs from "fs";
|
|
2
|
+
import http from "http";
|
|
3
|
+
import https from "https";
|
|
4
|
+
import os from "os";
|
|
5
|
+
import path from "path";
|
|
6
|
+
import { Transform } from "stream";
|
|
7
|
+
import { pipeline } from "stream/promises";
|
|
8
|
+
import { promisify } from "util";
|
|
9
9
|
|
|
10
|
-
import axios from
|
|
11
|
-
import ffmpeg from
|
|
12
|
-
import { v4 as uuidv4 } from
|
|
10
|
+
import axios from "axios";
|
|
11
|
+
import ffmpeg from "fluent-ffmpeg";
|
|
12
|
+
import { v4 as uuidv4 } from "uuid";
|
|
13
13
|
|
|
14
|
-
import { ensureEncoded } from
|
|
14
|
+
import { ensureEncoded } from "./helper.js";
|
|
15
|
+
import { generateShortId } from "./utils/filenameUtils.js";
|
|
15
16
|
|
|
16
17
|
// Promise-returning wrapper around the callback-style ffmpeg.ffprobe, so probing can be awaited.
const ffmpegProbe = promisify(ffmpeg.ffprobe);
|
|
17
18
|
|
|
@@ -21,248 +22,257 @@ const tempDirectories = new Map(); // dir -> { createdAt, requestId }
|
|
|
21
22
|
|
|
22
23
|
// Temp directory cleanup
|
|
23
24
|
/**
 * Removes tracked temp directories that are more than one hour old.
 *
 * Walks every entry of the module-level `tempDirectories` map. Entries whose
 * `createdAt` timestamp is older than the threshold are removed from disk
 * (recursively) and dropped from the map; younger entries are left untouched.
 *
 * Removal is best-effort: a failure on one directory is logged and never
 * propagates to the caller. The failing entry is still dropped from the map
 * so the same path is not retried (and re-logged) on every run.
 *
 * @returns {Promise<void>} Resolves once every tracked directory was examined.
 */
async function cleanupTempDirectories() {
  const MAX_TEMP_DIR_AGE_MS = 60 * 60 * 1000; // 1 hour

  for (const [dir, info] of tempDirectories) {
    try {
      if (Date.now() - info.createdAt <= MAX_TEMP_DIR_AGE_MS) {
        continue;
      }

      // `force: true` already tolerates a directory that no longer exists, so
      // anything that reaches the catch below is a genuine failure.
      await fs.promises.rm(dir, { recursive: true, force: true });
      tempDirectories.delete(dir);
      console.log(`Cleaned up old temp directory: ${dir}`);
    } catch (err) {
      console.error(`Failed to clean up temp directory ${dir}:`, err);
      tempDirectories.delete(dir);
    }
  }
}
|
|
38
39
|
|
|
39
40
|
// Setup periodic cleanup
|
|
40
41
|
// Run the temp-directory sweep every CLEANUP_INTERVAL_MS; a failed sweep is
// logged and does not stop later runs.
setInterval(() => {
  cleanupTempDirectories().catch((err) => {
    console.error("Error during periodic cleanup:", err);
  });
}, CLEANUP_INTERVAL_MS);
|
|
47
48
|
|
|
48
49
|
// Process a single chunk with streaming and progress tracking
|
|
49
50
|
/**
 * Transcodes one time slice of a media file to MP3 and writes it to disk.
 *
 * The ffmpeg command seeks to `start`, limits the output to `duration`, and
 * encodes with libmp3lame at an audio bitrate setting of 128. Its output is
 * piped into a write stream for `outputFileName`.
 *
 * @param {string} inputPath - Media file handed to ffmpeg.
 * @param {string} outputFileName - Path of the MP3 chunk to write.
 * @param {number} start - Seek position of the slice (logged as seconds).
 * @param {number} duration - Length of the slice (logged as seconds).
 * @returns {Promise<string>} Resolves with `outputFileName` when ffmpeg emits
 *   "end"; rejects when ffmpeg emits "error" or the pipeline rejects.
 */
async function processChunk(inputPath, outputFileName, start, duration) {
  return new Promise((resolve, reject) => {
    const handleStart = () => {
      console.log(`Processing chunk: ${start}s -> ${start + duration}s`);
    };

    const handleProgress = (progress) => {
      if (progress.percent) {
        console.log(`Chunk progress: ${progress.percent}%`);
      }
    };

    const handleError = (err, stdout, stderr) => {
      console.error("FFmpeg error:", err.message);
      if (stdout) {
        console.log("FFmpeg stdout:", stdout);
      }
      if (stderr) {
        console.error("FFmpeg stderr:", stderr);
      }
      reject(err);
    };

    const handleEnd = () => {
      console.log(`Chunk complete: ${outputFileName}`);
      resolve(outputFileName);
    };

    const transcode = ffmpeg(inputPath)
      .seekInput(start)
      .duration(duration)
      .format("mp3")
      .audioCodec("libmp3lame")
      .audioBitrate(128)
      .on("start", handleStart)
      .on("progress", handleProgress)
      .on("error", handleError)
      .on("end", handleEnd);

    // 4MB write buffer for the chunk file.
    const chunkFile = fs.createWriteStream(outputFileName, {
      highWaterMark: 4 * 1024 * 1024,
    });

    // pipeline gives error propagation and backpressure between ffmpeg and the file.
    pipeline(transcode, chunkFile).catch(reject);
  });
}
|
|
83
84
|
|
|
84
85
|
/**
 * Builds a UUID-named path underneath the OS temp directory.
 * Only the path string is produced; nothing is created on disk here.
 *
 * @returns {string} Path of the form `<os.tmpdir()>/<uuid>`.
 */
const generateUniqueFolderName = () => path.join(os.tmpdir(), uuidv4());
|
|
89
90
|
|
|
90
91
|
/**
 * Streams the resource at `url` into the file at `outputPath`.
 *
 * The URL is passed to axios exactly as given (no decoding). Only 2xx
 * responses are accepted, up to 5 redirects are followed, and the body is
 * piped to disk through `pipeline` so it is never buffered whole in memory.
 * After the transfer the file must exist and be non-empty.
 *
 * On any failure the partially written file is removed (best-effort) and the
 * original error is rethrown. The per-call HTTP/HTTPS agents are always
 * destroyed so their keep-alive sockets do not linger after the call.
 *
 * @param {string} url - Resource to download.
 * @param {string} outputPath - Destination file path.
 * @returns {Promise<void>} Resolves once the file is fully written and verified.
 * @throws {Error} The request/stream error, or "Download failed or file is
 *   empty" when nothing was written.
 */
async function downloadFile(url, outputPath) {
  const agentOptions = {
    keepAlive: true,
    maxSockets: 10,
    maxFreeSockets: 10,
    timeout: 60000,
  };
  const httpAgent = new http.Agent(agentOptions);
  const httpsAgent = new https.Agent(agentOptions);

  try {
    // Use the original URL without any decoding
    const response = await axios.get(url, {
      responseType: "stream",
      timeout: 30000,
      maxContentLength: Infinity,
      decompress: true,
      httpAgent,
      httpsAgent,
      maxRedirects: 5,
      validateStatus: (status) => status >= 200 && status < 300,
    });

    // Use pipeline for better error handling and memory management
    await pipeline(
      response.data,
      fs.createWriteStream(outputPath, { highWaterMark: 4 * 1024 * 1024 }), // 4MB chunks
    );

    if (!fs.existsSync(outputPath) || fs.statSync(outputPath).size === 0) {
      throw new Error("Download failed or file is empty");
    }
  } catch (error) {
    // Remove any partial file, but never let a cleanup failure hide the
    // error that actually caused the download to fail.
    try {
      fs.rmSync(outputPath, { force: true });
    } catch (cleanupError) {
      console.error("Error removing partial download:", cleanupError);
    }
    throw error;
  } finally {
    // The agents are created per call; without this their idle keep-alive
    // sockets would stay open after the download finished.
    httpAgent.destroy();
    httpsAgent.destroy();
  }
}
|
|
135
136
|
|
|
136
137
|
/**
 * Splits a media file into sequential MP3 chunks inside a fresh temp directory.
 *
 * Steps:
 *  1. Create a unique temp directory and register it in `tempDirectories`
 *     (with its creation time and `requestId`) so it can be swept later.
 *  2. If `inputPath` is an http/https/ftp URL, download it into that directory
 *     under a generated short name, keeping the URL's file extension
 *     (".mp3" when the URL path has none).
 *  3. Probe the file for its duration and derive the number of chunks.
 *  4. Transcode the chunks with `processChunk`, a few at a time.
 *
 * A chunk that fails is logged and left out of the result; the call only
 * fails when no chunk at all succeeds. On failure the temp directory is
 * removed and unregistered.
 *
 * @param {string} inputPath - Local file path or URL of the media to split.
 * @param {number} [chunkDurationInSeconds=500] - Length of each chunk; must be
 *   a positive, finite number.
 * @param {string} [requestId] - Identifier stored alongside the temp directory.
 * @returns {Promise<{chunkPromises: string[], chunkOffsets: number[],
 *   uniqueOutputPath: string, chunkBaseName: string}>}
 *   `chunkPromises` holds the paths of the successfully written chunk files
 *   (in order), `chunkOffsets` the matching start offsets, `uniqueOutputPath`
 *   the temp directory, and `chunkBaseName` the shared "<id>.mp3" suffix.
 * @throws {Error} "Error processing media file: <reason>", with the original
 *   error attached as `cause`.
 */
async function splitMediaFile(
  inputPath,
  chunkDurationInSeconds = 500,
  requestId = uuidv4(),
) {
  let uniqueOutputPath = null;

  try {
    // Reject a non-positive or non-numeric chunk length up front; otherwise it
    // surfaces later as an opaque "Invalid array length" failure.
    const chunkSeconds = Number(chunkDurationInSeconds);
    if (!Number.isFinite(chunkSeconds) || chunkSeconds <= 0) {
      throw new Error(
        `chunkDurationInSeconds must be a positive, finite number (received ${chunkDurationInSeconds})`,
      );
    }

    uniqueOutputPath = generateUniqueFolderName();
    fs.mkdirSync(uniqueOutputPath, { recursive: true });

    tempDirectories.set(uniqueOutputPath, {
      createdAt: Date.now(),
      requestId,
    });

    // Handle URL downloads with streaming
    let localPath = inputPath;
    const isUrl = /^(https?|ftp):\/\/[^\s/$.?#].[^\s]*$/i.test(inputPath);
    if (isUrl) {
      const urlObj = new URL(ensureEncoded(inputPath));
      // Use LLM-friendly naming for temp files instead of original filename
      const fileExtension = path.extname(urlObj.pathname) || ".mp3";
      const tempPath = path.join(
        uniqueOutputPath,
        `${generateShortId()}${fileExtension}`,
      );
      console.log("Downloading file to:", tempPath);
      await downloadFile(inputPath, tempPath);
      localPath = tempPath;
    }

    const sourcePath = path.resolve(localPath);
    if (!fs.existsSync(sourcePath)) {
      throw new Error(`Input file not found: ${sourcePath}`);
    }

    console.log("Probing file:", sourcePath);
    const metadata = await ffmpegProbe(sourcePath);
    if (!metadata?.format?.duration) {
      throw new Error("Invalid media file or unable to determine duration");
    }

    const duration = metadata.format.duration;
    // The one-second allowance presumably avoids a tiny trailing chunk. Clamp
    // to at least one chunk so media of a second or less is still processed.
    const numChunks = Math.max(
      1,
      Math.ceil((duration - 1) / chunkDurationInSeconds),
    );
    console.log(
      `Processing ${numChunks} chunks of ${chunkDurationInSeconds} seconds each`,
    );

    // Generate filename once for all chunks
    // Since we're converting to MP3 format for transcription, use .mp3 extension
    const chunkBaseName = `${generateShortId()}.mp3`;

    const chunkResults = new Array(numChunks); // Indexed by chunk to maintain order
    const chunkOffsets = new Array(numChunks);

    // At most 3 chunks at a time, fewer on small machines. os.cpus() can
    // return an empty list, so never let the batch size drop to 0 (the batch
    // loop below would then never advance).
    const concurrentChunks = Math.max(1, Math.min(3, os.cpus().length));

    for (
      let batchStart = 0;
      batchStart < numChunks;
      batchStart += concurrentChunks
    ) {
      const batchEnd = Math.min(batchStart + concurrentChunks, numChunks);
      const chunkBatch = [];

      for (let chunkIndex = batchStart; chunkIndex < batchEnd; chunkIndex++) {
        // Use the same base filename for all chunks
        const outputFileName = path.join(
          uniqueOutputPath,
          `chunk-${chunkIndex + 1}-${chunkBaseName}`,
        );
        const offset = chunkIndex * chunkDurationInSeconds;

        chunkBatch.push(
          processChunk(
            sourcePath,
            outputFileName,
            offset,
            chunkDurationInSeconds,
          )
            .then((result) => {
              chunkResults[chunkIndex] = result; // Store in correct position
              chunkOffsets[chunkIndex] = offset; // Store offset in correct position
              console.log(`Completed chunk ${chunkIndex + 1}/${numChunks}`);
              return result;
            })
            .catch((error) => {
              // A failed chunk is skipped rather than failing the whole split.
              console.error(
                `Failed to process chunk ${chunkIndex + 1}:`,
                error,
              );
              return null;
            }),
        );
      }

      // Wait for the current batch to complete before starting the next
      await Promise.all(chunkBatch);
    }

    // Filter out any failed chunks
    const validChunks = chunkResults.filter(Boolean);
    const validOffsets = chunkOffsets.filter((_, index) => chunkResults[index]);

    if (validChunks.length === 0) {
      throw new Error("No chunks were successfully processed");
    }

    return {
      chunkPromises: validChunks,
      chunkOffsets: validOffsets,
      uniqueOutputPath,
      chunkBaseName, // Return the base filename for consistent naming
    };
  } catch (err) {
    if (uniqueOutputPath) {
      try {
        fs.rmSync(uniqueOutputPath, { recursive: true, force: true });
        tempDirectories.delete(uniqueOutputPath);
      } catch (cleanupErr) {
        console.error("Error during cleanup:", cleanupErr);
      }
    }
    console.error("Error in splitMediaFile:", err);
    throw new Error(`Error processing media file: ${err.message}`, {
      cause: err,
    });
  }
}
|
|
267
277
|
|
|
268
278
|
export { splitMediaFile, downloadFile };
|