@aj-archipelago/cortex 1.3.7 → 1.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +578 -80
- package/helper-apps/cortex-file-handler/blobHandler.js +27 -8
- package/helper-apps/cortex-file-handler/index.js +20 -2
- package/helper-apps/cortex-realtime-voice-server/client/src/chat/Chat.tsx +51 -11
- package/helper-apps/cortex-realtime-voice-server/src/SocketServer.ts +220 -183
- package/helper-apps/cortex-realtime-voice-server/src/Tools.ts +18 -34
- package/helper-apps/cortex-realtime-voice-server/src/cortex/utils.ts +29 -15
- package/helper-apps/cortex-realtime-voice-server/src/realtime/client.ts +47 -1
- package/helper-apps/cortex-realtime-voice-server/src/utils/prompt.ts +2 -11
- package/package.json +1 -1
- package/pathways/system/entity/memory/sys_search_memory.js +2 -1
- package/pathways/system/entity/sys_entity_start.js +6 -7
- package/pathways/system/entity/sys_generator_voice_sample.js +2 -2
- package/pathways/translate_gpt4_omni.js +20 -0
- package/pathways/translate_subtitle.js +326 -135
- package/pathways/translate_subtitle_helper.js +4 -16
- package/server/plugins/azureVideoTranslatePlugin.js +27 -15
- package/server/plugins/claude3VertexPlugin.js +10 -17
- package/server/plugins/gemini15VisionPlugin.js +16 -3
- package/server/plugins/modelPlugin.js +27 -0
- package/server/plugins/openAiVisionPlugin.js +26 -8
- package/tests/multimodal_conversion.test.js +88 -12
- package/tests/translate_srt.test.js +66 -14
|
@@ -1,164 +1,291 @@
|
|
|
1
|
-
import subsrt from "subsrt";
|
|
2
1
|
import logger from "../lib/logger.js";
|
|
3
2
|
import { callPathway } from "../lib/pathwayTools.js";
|
|
4
|
-
import { publishRequestProgress } from "../lib/redisSubscription.js";
|
|
5
3
|
|
|
6
|
-
function preprocessStr(str) {
|
|
4
|
+
/**
 * Normalizes raw SRT/VTT subtitle text into a canonical form for parsing.
 *
 * Steps: unify line endings to "\n", strip trailing whitespace from every
 * line, drop a bare "WEBVTT" header, convert SRT comma timestamps to dot
 * timestamps, and keep only blocks that look like subtitle cues. Kept blocks
 * are joined with exactly one blank line and the result ends with "\n\n".
 *
 * @param {string} str - Raw subtitle text.
 * @param {string} format - 'srt' enables comma-to-dot timestamp conversion;
 *   any other value leaves timestamps untouched.
 * @returns {string} Normalized text, or "" for empty input or on any error
 *   (errors are logged, never thrown).
 */
function preprocessStr(str, format) {
    try {
        if (!str) return "";

        let content = str
            // Normalize line endings
            .replace(/\r\n?/g, "\n")
            // Strip trailing whitespace per line: an index line such as "12 "
            // would otherwise fail the strict /^\d+$/ check applied when the
            // block is parsed as SRT, silently dropping the caption.
            .replace(/[^\S\n]+$/gm, "")
            // Remove WEBVTT header for processing
            .replace(/^WEBVTT\n\n/, '');

        // For SRT, convert commas to dots in timestamps
        if (format === 'srt') {
            content = content.replace(/(\d{2}:\d{2}:\d{2}),(\d{3})/g, "$1.$2");
        }

        const looksLikeCue = (block) => {
            if (!block) return false;
            const firstLine = block.split('\n')[0];
            return (
                /^\d+$/.test(firstLine) ||           // SRT style: numeric index
                /^\d{2}:\d{2}/.test(firstLine) ||    // VTT style without identifier
                /^[^\n]+\n\d{2}:\d{2}/.test(block)   // VTT style with identifier
            );
        };

        return content
            // Ensure each subtitle block is properly separated
            .split(/\n\s*\n/)
            .map(block => block.trim())
            .filter(looksLikeCue)
            .join("\n\n")
            + "\n\n";
    } catch (e) {
        logger.error(`An error occurred in content text preprocessing: ${e}`);
        return "";
    }
}
|
|
21
38
|
|
|
22
|
-
|
|
23
|
-
const
|
|
24
|
-
|
|
25
|
-
|
|
39
|
+
/**
 * Converts an "HH:MM:SS", "HH:MM:SS.mmm" or "HH:MM:SS,mmm" timestamp to
 * milliseconds.
 *
 * The fractional part is read as a decimal fraction of a second: it is
 * right-padded to three digits (".5" is 500 ms) and truncated beyond three,
 * and a missing fraction counts as 0 ms.
 *
 * @param {string} timeStr - Timestamp with '.' or ',' before the fraction.
 * @returns {number} Milliseconds; NaN if a component is not numeric.
 */
function timeToMs(timeStr) {
    const [time, fraction = ""] = timeStr.split(/[.,]/);
    const [hours, minutes, seconds] = time.split(':').map(Number);
    const millis = Number.parseInt(fraction.padEnd(3, "0").slice(0, 3), 10);
    return (hours * 3600 + minutes * 60 + seconds) * 1000 + millis;
}
|
|
26
44
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
45
|
+
/**
 * Formats a millisecond offset as "HH:MM:SS,mmm" (SRT) or "HH:MM:SS.mmm"
 * (any other format).
 *
 * The fields are computed arithmetically rather than through Date, so
 * offsets of 24 hours or more keep counting hours instead of wrapping back
 * to 00. Fractional milliseconds are truncated and negative offsets are
 * clamped to zero.
 *
 * @param {number} ms - Offset in milliseconds.
 * @param {string} format - 'srt' selects ',' before the milliseconds.
 * @returns {string} Formatted timestamp.
 * @throws {RangeError} If ms is not a finite number.
 */
function msToTimestamp(ms, format) {
    const value = Number(ms);
    if (!Number.isFinite(value)) {
        throw new RangeError(`Invalid time value: ${ms}`);
    }

    const total = Math.max(0, Math.trunc(value));
    const pad = (n, width) => String(n).padStart(width, "0");

    const hours = Math.floor(total / 3600000);
    const minutes = Math.floor((total % 3600000) / 60000);
    const seconds = Math.floor((total % 60000) / 1000);
    const millis = total % 1000;
    const separator = format === 'srt' ? ',' : '.';

    return `${pad(hours, 2)}:${pad(minutes, 2)}:${pad(seconds, 2)}${separator}${pad(millis, 3)}`;
}
|
|
32
50
|
|
|
33
|
-
|
|
34
|
-
const
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
translatedLines.splice(batch.length - 1, translatedLines.length - batch.length + 1, mergedLines.join(" "));
|
|
52
|
-
}else {
|
|
53
|
-
const emptyLines = Array(batch.length - translatedLines.length).fill("-");
|
|
54
|
-
translatedLines.push(...emptyLines);
|
|
51
|
+
/**
 * Parses subtitle text into caption objects.
 *
 * Blocks are separated by blank lines. For 'srt' the first line of a block
 * must be a numeric index and the second the timing line. For any other
 * format (VTT) the timing line may come first, or follow one identifier line.
 * Blocks with fewer than two lines or without a valid
 * "HH:MM:SS.mmm --> HH:MM:SS.mmm" timing line are skipped.
 *
 * CR/CRLF line endings and stray whitespace around a block or on its
 * index/timing lines are tolerated, so such cues are no longer dropped.
 *
 * @param {string} content - Subtitle text.
 * @param {string} format - 'srt' for SubRip; anything else is parsed as VTT.
 * @returns {Array<{type: string, index: number, identifier: (string|null),
 *   start: number, end: number, duration: number, content: string,
 *   text: string}>} Captions in document order; start/end/duration are in
 *   milliseconds.
 */
function parseSubtitles(content, format) {
    const timingPattern = /^(\d{2}:\d{2}:\d{2}[.,]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[.,]\d{3})/;
    const blocks = content
        .replace(/\r\n?/g, "\n")
        .split(/\n\s*\n/)
        .filter(block => block.trim());
    const captions = [];

    for (const block of blocks) {
        const lines = block.trim().split('\n');
        if (lines.length < 2) continue;

        const firstLine = lines[0].trim();
        let index;
        let timelineIndex;
        if (format === 'srt') {
            // SRT format: numeric index required
            if (!/^\d+$/.test(firstLine)) continue;
            index = Number.parseInt(firstLine, 10);
            timelineIndex = 1;
        } else {
            // VTT format: optional identifier
            timelineIndex = /^\d{2}:\d{2}/.test(firstLine) ? 0 : 1;
            index = timelineIndex === 0 ? captions.length + 1 : firstLine;
        }

        const timeMatch = lines[timelineIndex].trim().match(timingPattern);
        if (!timeMatch) continue;

        // Convert each boundary once and derive the duration from the results.
        const start = timeToMs(timeMatch[1].replace(',', '.'));
        const end = timeToMs(timeMatch[2].replace(',', '.'));
        const text = lines.slice(timelineIndex + 1).join('\n');

        captions.push({
            type: "caption",
            // A string index is a VTT identifier; the numeric index then
            // falls back to the caption's 1-based position.
            index: typeof index === 'number' ? index : captions.length + 1,
            identifier: typeof index === 'string' ? index : null,
            start,
            end,
            duration: end - start,
            content: text,
            text,
        });
    }

    return captions;
}
|
|
57
92
|
|
|
93
|
+
/**
 * Splits captions into consecutive windows of up to `chunkSize` items, where
 * each window starts `chunkSize - overlap` items after the previous one, so
 * neighbouring windows share `overlap` captions.
 *
 * Iteration stops as soon as a window reaches the last caption. Without that
 * stop, a trailing window lying entirely inside the previous one would be
 * emitted, adding no coverage.
 *
 * @param {Array} captions - Captions in document order.
 * @param {number} [chunkSize=20] - Maximum captions per chunk.
 * @param {number} [overlap=3] - Captions shared by consecutive chunks.
 * @returns {Array<{captions: Array, startIndex: number, endIndex: number,
 *   isOverlap: boolean}>} Chunks with inclusive 0-based bounds; `isOverlap`
 *   is true unless the chunk covers the whole input.
 * @throws {RangeError} If overlap is negative or not smaller than chunkSize
 *   (the former would skip captions, the latter would never advance).
 */
function splitIntoOverlappingChunks(captions, chunkSize = 20, overlap = 3) {
    if (!(overlap >= 0 && chunkSize > overlap)) {
        throw new RangeError(
            `Invalid chunking parameters: chunkSize (${chunkSize}) must be greater than overlap (${overlap}), and overlap must not be negative`
        );
    }

    const step = chunkSize - overlap;
    const chunks = [];
    for (let i = 0; i < captions.length; i += step) {
        const end = Math.min(i + chunkSize, captions.length);
        chunks.push({
            captions: captions.slice(i, end),
            startIndex: i,
            endIndex: end - 1,
            isOverlap: i > 0 || end < captions.length
        });
        // Everything is covered; a further window would be a strict subset
        // of this one.
        if (end === captions.length) break;
    }
    return chunks;
}
|
|
58
107
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
while (lastNonEmptyIndex >= 0 && translatedLines[lastNonEmptyIndex].trim() === "") {
|
|
63
|
-
lastNonEmptyIndex--;
|
|
64
|
-
}
|
|
65
|
-
if (lastNonEmptyIndex >= 0) {
|
|
66
|
-
translatedLines[translatedLines.length - 1] = translatedLines[lastNonEmptyIndex];
|
|
67
|
-
translatedLines[lastNonEmptyIndex] = "";
|
|
68
|
-
}
|
|
69
|
-
}
|
|
70
|
-
|
|
108
|
+
/**
 * Chooses one translation of a caption when overlapping chunks produced
 * several.
 *
 * Each candidate carries the bounds of the chunk it came from (`chunkStart`,
 * `chunkEnd`). The candidate whose chunk boundary is farthest from the
 * caption wins, i.e. the one translated with the most surrounding context,
 * which avoids edge effects. On a tie the earliest candidate is kept.
 *
 * Previously the comparison was inverted and selected the candidate closest
 * to a chunk edge.
 *
 * @param {Array<{chunkStart: number, chunkEnd: number}>} translations -
 *   Candidate translations; must not be empty.
 * @param {number} startIndex - Caption position compared against chunkStart.
 * @param {number} endIndex - Caption position compared against chunkEnd.
 * @returns {Object} The selected candidate (same object reference).
 */
function selectBestTranslation(translations, startIndex, endIndex) {
    // If we only have one translation for this caption, use it
    if (translations.length === 1) return translations[0];

    // Distance from the caption to the nearest edge of the candidate's chunk.
    const edgeDistance = (candidate) => Math.min(
        Math.abs(candidate.chunkStart - startIndex),
        Math.abs(candidate.chunkEnd - endIndex)
    );

    // Larger edge distance means the caption sat deeper inside its chunk.
    return translations.reduce((best, current) =>
        edgeDistance(current) > edgeDistance(best) ? current : best
    );
}
|
|
78
126
|
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
127
|
+
/**
 * Sanity-checks a reconstructed subtitle document before it is returned.
 *
 * Checks, in order: the result is non-blank; VTT output starts with
 * "WEBVTT\n\n"; at least one timing line in the format's notation exists;
 * the result is at least half as long as the original text; and every block
 * (after an optional WEBVTT header block) has at least two lines, contains a
 * timing line, and has content after it. The first failed check is logged
 * via logger.error and ends validation.
 *
 * @param {string} result - Reconstructed subtitle text.
 * @param {string} originalText - Input text, used for the length-ratio check.
 * @param {string} format - 'srt' expects comma timestamps; anything else
 *   expects dot timestamps, and 'vtt' additionally requires the header.
 * @returns {boolean} true when every check passes, false otherwise.
 */
function validateFinalOutput(result, originalText, format) {
    // Log the reason and report failure in one step.
    const fail = (message) => {
        logger.error(message);
        return false;
    };

    // Basic structure validation
    if (!result || !result.trim()) {
        return fail("Empty or whitespace-only result");
    }

    // Check for VTT header if needed
    if (format === 'vtt' && !result.startsWith('WEBVTT\n\n')) {
        return fail("Missing WEBVTT header");
    }

    // Check for timestamp format
    const timestampPattern = format === 'srt'
        ? /\d{2}:\d{2}:\d{2},\d{3}\s*-->\s*\d{2}:\d{2}:\d{2},\d{3}/
        : /\d{2}:\d{2}:\d{2}\.\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}\.\d{3}/;

    if (!timestampPattern.test(result)) {
        return fail(`No valid ${format.toUpperCase()} timestamps found in result`);
    }

    // Check overall length ratio
    if (result.length < originalText.length * 0.5) {
        return fail(`Result length (${result.length}) is less than 50% of original length (${originalText.length})`);
    }

    // Validate subtitle block structure
    const blocks = result.split(/\n\s*\n/).filter(block => block.trim());

    // Skip WEBVTT header for VTT format
    const firstCueBlock = format === 'vtt' && blocks[0].trim() === 'WEBVTT' ? 1 : 0;

    // every() stops at the first block that fails, like an early return.
    return blocks.every((block, i) => {
        if (i < firstCueBlock) return true;

        const lines = block.trim().split('\n');
        if (lines.length < 2) {
            return fail(`Block ${i + 1} has insufficient lines (${lines.length}):\n${block}`);
        }

        // Find the timestamp line
        const timestampLineIndex = lines.findIndex(line => timestampPattern.test(line));
        if (timestampLineIndex === -1) {
            return fail(`Block ${i + 1} has no valid timestamp line:\n${block}`);
        }

        // Check that we have content after the timestamp
        if (timestampLineIndex === lines.length - 1) {
            return fail(`Block ${i + 1} has no content after timestamp:\n${block}`);
        }

        // Log the content for inspection
        logger.debug(`Block ${i + 1} content:\n${lines.slice(timestampLineIndex + 1).join('\n')}`);
        return true;
    });
}
|
|
198
|
+
|
|
199
|
+
/**
 * Translates one chunk of captions through the "translate_subtitle_helper"
 * pathway, retrying when the response is unusable.
 *
 * The chunk is serialized as "index / timing / content" blocks and sent as
 * `text`. The reply must wrap its output in <SUBTITLES> tags; its blocks are
 * matched to the chunk's captions by position, and everything from a block's
 * third line onward is taken as the translated content.
 *
 * @param {{captions: Array, startIndex: number, endIndex: number}} chunk -
 *   Chunk to translate.
 * @param {Object} args - Pathway arguments, forwarded to the helper pathway;
 *   `args.format` defaults to 'srt'.
 * @param {number} [maxRetries=3] - Maximum number of attempts.
 * @returns {Promise<Array<Object>>} One caption per input caption, each
 *   tagged with `chunkStart`/`chunkEnd`. If the last attempt still has
 *   missing or malformed blocks, those captions keep their original content.
 * @throws {Error} The pathway error from the last attempt, or a generic
 *   error when no attempt produced a usable response.
 */
async function translateChunk(chunk, args, maxRetries = 3) {
    const format = args.format || 'srt';

    // Serialize the chunk in the same block layout the model must return.
    const serialized = [];
    for (const c of chunk.captions) {
        const startTime = msToTimestamp(c.start, format);
        const endTime = msToTimestamp(c.end, format);
        const label = format === 'srt' || !c.identifier ? c.index : c.identifier;
        serialized.push(`${label}\n${startTime} --> ${endTime}\n${c.content}`);
    }
    const chunkText = serialized.join('\n\n');

    const finalAttempt = maxRetries - 1;
    const tagWithChunk = (caption) => ({
        ...caption,
        chunkStart: chunk.startIndex,
        chunkEnd: chunk.endIndex
    });

    for (let attempt = 0; attempt < maxRetries; attempt++) {
        try {
            const translated = await callPathway("translate_subtitle_helper", {
                ...args,
                text: chunkText,
                async: false,
            });

            // Basic validation - just check for SUBTITLES tags and some content
            const tagged = translated.match(/<SUBTITLES>([\s\S]*)<\/SUBTITLES>/);
            const payload = tagged ? tagged[1].trim() : "";
            if (!payload) {
                logger.warn(`Attempt ${attempt + 1}: Invalid translation format`);
                continue;
            }

            const blocks = payload.split(/\n\s*\n/);

            // Record a problem with one caption and leave its slot empty.
            let hasEmptyBlocks = false;
            const reject = (reason, caption) => {
                logger.warn(`Attempt ${attempt + 1}: ${reason} for caption ${caption.index}`);
                hasEmptyBlocks = true;
                return null;
            };

            const processedBlocks = chunk.captions.map((caption, position) => {
                const block = blocks[position];
                if (!block) return reject("Empty block", caption);

                const lines = block.split('\n');
                if (lines.length < 3) return reject("Invalid block structure", caption);

                // Lines 0 and 1 are the index and timing; the rest is text.
                const content = lines.slice(2).join('\n').trim();
                if (!content) return reject("Empty content", caption);

                return { ...tagWithChunk(caption), content: content, text: content };
            });

            // If no empty blocks, return the processed blocks
            if (!hasEmptyBlocks) {
                return processedBlocks;
            }

            // If this was the last attempt and we still have empty blocks,
            // return what we have but keep original content for empty blocks
            if (attempt === finalAttempt) {
                logger.warn(`Failed to get valid translations for all blocks after ${maxRetries} attempts`);
                return chunk.captions.map(
                    (caption, position) => processedBlocks[position] || tagWithChunk(caption)
                );
            }

            // Otherwise, try again
            logger.info(`Retrying chunk due to empty blocks (attempt ${attempt + 1}/${maxRetries})`);
        } catch (e) {
            logger.error(`Error translating chunk ${chunk.startIndex}-${chunk.endIndex} (attempt ${attempt + 1}): ${e}`);
            if (attempt === finalAttempt) throw e;
        }
    }

    throw new Error(`Failed to translate chunk ${chunk.startIndex}-${chunk.endIndex} after ${maxRetries} attempts`);
}
|
|
163
290
|
|
|
164
291
|
export default {
|
|
@@ -173,9 +300,73 @@ export default {
|
|
|
173
300
|
model: "oai-gpt4o",
|
|
174
301
|
enableDuplicateRequests: false,
|
|
175
302
|
timeout: 3600,
|
|
176
|
-
executePathway: async (
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
303
|
+
executePathway: async ({args}) => {
|
|
304
|
+
try {
|
|
305
|
+
const { text, format = 'srt' } = args;
|
|
306
|
+
const preprocessedText = preprocessStr(text, format);
|
|
307
|
+
const captions = parseSubtitles(preprocessedText, format);
|
|
308
|
+
|
|
309
|
+
if (!captions || captions.length === 0) {
|
|
310
|
+
throw new Error("No captions found in input");
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
// Split into overlapping chunks
|
|
314
|
+
const chunks = splitIntoOverlappingChunks(captions);
|
|
315
|
+
logger.info(`Split subtitles into ${chunks.length} overlapping chunks`);
|
|
316
|
+
|
|
317
|
+
// Translate all chunks in parallel
|
|
318
|
+
const chunkPromises = chunks.map(chunk => translateChunk(chunk, args));
|
|
319
|
+
const translatedChunks = await Promise.all(chunkPromises);
|
|
320
|
+
|
|
321
|
+
// Create a map of caption index to all its translations
|
|
322
|
+
const translationMap = new Map();
|
|
323
|
+
translatedChunks.flat().forEach(caption => {
|
|
324
|
+
if (!translationMap.has(caption.index)) {
|
|
325
|
+
translationMap.set(caption.index, []);
|
|
326
|
+
}
|
|
327
|
+
translationMap.get(caption.index).push(caption);
|
|
328
|
+
});
|
|
329
|
+
|
|
330
|
+
// Select best translation for each caption
|
|
331
|
+
const finalCaptions = captions.map(caption => {
|
|
332
|
+
const translations = translationMap.get(caption.index) || [caption];
|
|
333
|
+
return selectBestTranslation(translations, caption.index, caption.index);
|
|
334
|
+
});
|
|
335
|
+
|
|
336
|
+
// Format the output
|
|
337
|
+
let result = finalCaptions
|
|
338
|
+
.map(caption => {
|
|
339
|
+
const startTime = msToTimestamp(caption.start, format);
|
|
340
|
+
const endTime = msToTimestamp(caption.end, format);
|
|
341
|
+
// Only include index/identifier if it was in the original
|
|
342
|
+
const hasIdentifier = caption.identifier !== null || format === 'srt';
|
|
343
|
+
const index = format === 'srt' || !caption.identifier ? caption.index : caption.identifier;
|
|
344
|
+
return hasIdentifier ?
|
|
345
|
+
`${index}\n${startTime} --> ${endTime}\n${caption.content}` :
|
|
346
|
+
`${startTime} --> ${endTime}\n${caption.content}`;
|
|
347
|
+
})
|
|
348
|
+
.join('\n\n')
|
|
349
|
+
.trim();
|
|
350
|
+
|
|
351
|
+
// Add final newline only if input had one
|
|
352
|
+
if (text.endsWith('\n')) {
|
|
353
|
+
result += '\n';
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
// Add WEBVTT header for VTT format
|
|
357
|
+
if (format === 'vtt') {
|
|
358
|
+
result = 'WEBVTT\n\n' + result;
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
// Validate final output
|
|
362
|
+
if (!validateFinalOutput(result, text, format)) {
|
|
363
|
+
throw new Error("Final subtitle reconstruction failed validation");
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
return result;
|
|
367
|
+
} catch (e) {
|
|
368
|
+
logger.error(`Subtitle translation failed: ${e}`);
|
|
369
|
+
throw e;
|
|
370
|
+
}
|
|
180
371
|
},
|
|
181
372
|
};
|
|
@@ -7,25 +7,13 @@ export default {
|
|
|
7
7
|
{
|
|
8
8
|
role: "system",
|
|
9
9
|
content:
|
|
10
|
-
`
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
2. Output MUST have EXACTLY the same line count as input.
|
|
14
|
-
3. One input line = One output line. Always.
|
|
15
|
-
4. Only translations. Nothing extra.
|
|
16
|
-
5. Non-translatable stays unchanged.
|
|
17
|
-
6. Keep all formatting and characters.
|
|
18
|
-
7. Prefix: "LINE#lineNumber:".
|
|
19
|
-
8. Untranslatable: Copy as-is with prefix.
|
|
20
|
-
9. Internal checks: Verify line count and content after each line.
|
|
21
|
-
10. Final verification: Recount, check numbering, confirm content, cross-check with input.
|
|
22
|
-
|
|
23
|
-
Translate ALL lines. Constant vigilance. Exhaustive final cross-check.`
|
|
10
|
+
`You are an expert subtitle translator. You will be given a block of subtitles and asked to translate them into {{to}}.
|
|
11
|
+
You must maintain the original format (caption numbers and timestamps) exactly and make the content fit as naturally as possible.
|
|
12
|
+
Output only the translated subtitles in a <SUBTITLES> tag with no other text or commentary.`
|
|
24
13
|
},
|
|
25
14
|
{
|
|
26
15
|
role: "user",
|
|
27
|
-
|
|
28
|
-
content: `{{{text}}}`,
|
|
16
|
+
content: `<SUBTITLES>\n{{{text}}}\n</SUBTITLES>`,
|
|
29
17
|
},
|
|
30
18
|
],
|
|
31
19
|
}),
|
|
@@ -81,17 +81,6 @@ class AzureVideoTranslatePlugin extends ModelPlugin {
|
|
|
81
81
|
throw new Error(this.jsonBuffer);
|
|
82
82
|
}
|
|
83
83
|
|
|
84
|
-
if (isValidJSON(this.jsonBuffer)) {
|
|
85
|
-
const parsedData = JSON.parse(this.jsonBuffer);
|
|
86
|
-
if (parsedData.progress !== undefined) {
|
|
87
|
-
publishRequestProgress({
|
|
88
|
-
requestId: this.requestId,
|
|
89
|
-
progress: parsedData.progress,
|
|
90
|
-
info: this.jsonBuffer
|
|
91
|
-
});
|
|
92
|
-
}
|
|
93
|
-
}
|
|
94
|
-
|
|
95
84
|
onData(this.jsonBuffer);
|
|
96
85
|
this.jsonBuffer = '';
|
|
97
86
|
this.jsonDepth = 0;
|
|
@@ -118,11 +107,34 @@ class AzureVideoTranslatePlugin extends ModelPlugin {
|
|
|
118
107
|
let finalJson = '';
|
|
119
108
|
this.handleStream(response.data,
|
|
120
109
|
(data) => {
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
110
|
+
let sent = false;
|
|
111
|
+
if (isValidJSON(data)) {
|
|
112
|
+
const parsedData = JSON.parse(data);
|
|
113
|
+
if (parsedData.progress !== undefined) {
|
|
114
|
+
let timeInfo = '';
|
|
115
|
+
if (parsedData.estimated_time_remaining && parsedData.elapsed_time) {
|
|
116
|
+
const minutes = Math.ceil(parsedData.estimated_time_remaining / 60);
|
|
117
|
+
timeInfo = minutes <= 2
|
|
118
|
+
? `Should be done soon (${parsedData.elapsed_time} elapsed)`
|
|
119
|
+
: `Estimated ${minutes} minutes remaining`;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
publishRequestProgress({
|
|
123
|
+
requestId: this.requestId,
|
|
124
|
+
progress: parsedData.progress,
|
|
125
|
+
info: timeInfo
|
|
126
|
+
});
|
|
127
|
+
sent = true;
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
if (!sent) {
|
|
131
|
+
publishRequestProgress({
|
|
132
|
+
requestId: this.requestId,
|
|
133
|
+
info: data
|
|
134
|
+
});
|
|
135
|
+
}
|
|
125
136
|
logger.debug('Data:', data);
|
|
137
|
+
|
|
126
138
|
// Extract JSON content if message contains targetLocales
|
|
127
139
|
const jsonMatch = data.match(/{[\s\S]*"targetLocales"[\s\S]*}/);
|
|
128
140
|
if (jsonMatch) {
|
|
@@ -2,9 +2,7 @@ import OpenAIVisionPlugin from "./openAiVisionPlugin.js";
|
|
|
2
2
|
import logger from "../../lib/logger.js";
|
|
3
3
|
import axios from 'axios';
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
async function convertContentItem(item, maxImageSize) {
|
|
5
|
+
async function convertContentItem(item, maxImageSize, plugin) {
|
|
8
6
|
let imageUrl = "";
|
|
9
7
|
|
|
10
8
|
try {
|
|
@@ -26,6 +24,12 @@ async function convertContentItem(item, maxImageSize) {
|
|
|
26
24
|
}
|
|
27
25
|
|
|
28
26
|
try {
|
|
27
|
+
// First validate the image URL
|
|
28
|
+
if (!await plugin.validateImageUrl(imageUrl)) {
|
|
29
|
+
return null;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
// Then fetch and convert to base64 if needed
|
|
29
33
|
const urlData = imageUrl.startsWith("data:") ? imageUrl : await fetchImageAsDataURL(imageUrl);
|
|
30
34
|
if (!urlData) { return null; }
|
|
31
35
|
|
|
@@ -69,25 +73,14 @@ async function convertContentItem(item, maxImageSize) {
|
|
|
69
73
|
// Fetch image and convert to base 64 data URL
|
|
70
74
|
async function fetchImageAsDataURL(imageUrl) {
|
|
71
75
|
try {
|
|
72
|
-
//
|
|
73
|
-
const headResponse = await axios.head(imageUrl, {
|
|
74
|
-
timeout: 30000, // 30 second timeout
|
|
75
|
-
maxRedirects: 5
|
|
76
|
-
});
|
|
77
|
-
|
|
78
|
-
const contentType = headResponse.headers['content-type'];
|
|
79
|
-
if (!contentType || !allowedMIMETypes.includes(contentType)) {
|
|
80
|
-
logger.warn(`Unsupported image type: ${contentType} - skipping image content.`);
|
|
81
|
-
return null;
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
// Then get the actual image data
|
|
76
|
+
// Get the actual image data
|
|
85
77
|
const dataResponse = await axios.get(imageUrl, {
|
|
86
78
|
timeout: 30000,
|
|
87
79
|
responseType: 'arraybuffer',
|
|
88
80
|
maxRedirects: 5
|
|
89
81
|
});
|
|
90
82
|
|
|
83
|
+
const contentType = dataResponse.headers['content-type'];
|
|
91
84
|
const base64Image = Buffer.from(dataResponse.data).toString('base64');
|
|
92
85
|
return `data:${contentType};base64,${base64Image}`;
|
|
93
86
|
}
|
|
@@ -161,7 +154,7 @@ class Claude3VertexPlugin extends OpenAIVisionPlugin {
|
|
|
161
154
|
const claude3Messages = await Promise.all(
|
|
162
155
|
finalMessages.map(async (message) => {
|
|
163
156
|
const contentArray = Array.isArray(message.content) ? message.content : [message.content];
|
|
164
|
-
const claude3Content = await Promise.all(contentArray.map(item => convertContentItem(item, this.getModelMaxImageSize())));
|
|
157
|
+
const claude3Content = await Promise.all(contentArray.map(item => convertContentItem(item, this.getModelMaxImageSize(), this)));
|
|
165
158
|
return {
|
|
166
159
|
role: message.role,
|
|
167
160
|
content: claude3Content.filter(Boolean),
|