@mux/ai 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{index-Nxf6BaBO.d.ts → index-C8-E3VR9.d.ts} +59 -4
- package/dist/{index-CkJStzYO.d.ts → index-CA7bG50u.d.ts} +29 -2
- package/dist/index.d.ts +3 -3
- package/dist/index.js +711 -31
- package/dist/index.js.map +1 -1
- package/dist/primitives/index.d.ts +1 -1
- package/dist/primitives/index.js +336 -14
- package/dist/primitives/index.js.map +1 -1
- package/dist/workflows/index.d.ts +1 -1
- package/dist/workflows/index.js +703 -30
- package/dist/workflows/index.js.map +1 -1
- package/package.json +1 -1
package/dist/workflows/index.js
CHANGED
|
@@ -987,24 +987,82 @@ function findCaptionTrack(asset, languageCode) {
|
|
|
987
987
|
(track) => track.text_type === "subtitles" && track.language_code === languageCode
|
|
988
988
|
);
|
|
989
989
|
}
|
|
990
|
+
/**
 * Normalizes every line terminator in `value` to a single line feed.
 *
 * WebVTT accepts CRLF, a lone CR, and LF as line terminators, so all three
 * are folded to "\n" before the content is split into lines or blocks.
 *
 * @param {string} value - Raw text that may contain CR or CRLF terminators.
 * @returns {string} The same text using "\n" as its only line terminator.
 */
function normalizeLineEndings(value) {
  // `\r\n?` consumes a CRLF pair as one terminator and also catches a lone CR.
  return value.replace(/\r\n?/g, "\n");
}
|
|
993
|
+
/**
 * Reports whether `line` contains the cue-timing arrow ("-->").
 *
 * @param {string} line - A single line of VTT content.
 * @returns {boolean} True when the arrow appears anywhere in the line.
 */
function isTimingLine(line) {
  const arrowPosition = line.indexOf("-->");
  return arrowPosition !== -1;
}
|
|
996
|
+
/**
 * Parses a cue identifier that consists solely of decimal digits.
 *
 * @param {string} line - Candidate identifier line (expected to be trimmed).
 * @returns {number|null} The parsed integer, or null when the line contains
 *   anything other than digits (including when it is empty).
 */
function parseNumericCueIdentifier(line) {
  const isAllDigits = /^\d+$/.test(line);
  return isAllDigits ? Number.parseInt(line, 10) : null;
}
|
|
1002
|
+
/**
 * Reports whether `line` has the shape "<digits> - <title>", i.e. a number,
 * whitespace, a hyphen, whitespace, and then at least one non-space character.
 *
 * @param {string} line - Candidate identifier line (expected to be trimmed).
 * @returns {boolean}
 */
function isLikelyTitledCueIdentifier(line) {
  const titledIdentifierPattern = /^\d+\s+-\s+\S.*$/;
  return titledIdentifierPattern.test(line);
}
|
|
1005
|
+
/**
 * Heuristically decides whether `line` is a cue identifier rather than cue text.
 *
 * A line only qualifies when the line after it is a timing line. A purely
 * numeric line must continue the running sequence (1 when there is no previous
 * identifier, otherwise `previousCueIdentifier + 1`); any other line must look
 * like a titled identifier such as "3 - Intro".
 *
 * @param {object} params
 * @param {string} [params.line] - Trimmed candidate line.
 * @param {string} [params.nextLine] - Trimmed line that follows the candidate.
 * @param {number|null} [params.previousCueIdentifier] - Numeric identifier of
 *   the preceding cue, or null/undefined when there is none.
 * @returns {boolean}
 */
function isLikelyCueIdentifier({
  line,
  nextLine,
  previousCueIdentifier
}) {
  if (!line || !nextLine) {
    return false;
  }
  if (!isTimingLine(nextLine)) {
    return false;
  }
  const sequenceNumber = parseNumericCueIdentifier(line);
  if (sequenceNumber === null) {
    return isLikelyTitledCueIdentifier(line);
  }
  // `== null` deliberately matches both null and undefined.
  const expectedNumber = previousCueIdentifier == null ? 1 : previousCueIdentifier + 1;
  return sequenceNumber === expectedNumber;
}
|
|
1022
|
+
/**
 * Finds the index of the cue-identifier line that belongs to a timing line.
 *
 * Only the line directly above the timing line is considered.
 *
 * @param {string[]} lines - All lines of the VTT content.
 * @param {number} timingLineIndex - Index of the cue's timing line in `lines`.
 * @param {number|null} previousCueIdentifier - Numeric identifier of the
 *   preceding cue, or null when there is none.
 * @returns {number} The identifier's line index, or -1 when the preceding line
 *   is missing, blank, itself a timing line, or does not look like an identifier.
 */
function getCueIdentifierLineIndex(lines, timingLineIndex, previousCueIdentifier) {
  const candidateIndex = timingLineIndex - 1;
  if (candidateIndex < 0) {
    return -1;
  }
  const candidateText = lines[candidateIndex].trim();
  if (candidateText === "" || isTimingLine(candidateText)) {
    return -1;
  }
  const looksLikeIdentifier = isLikelyCueIdentifier({
    line: candidateText,
    nextLine: lines[timingLineIndex]?.trim(),
    previousCueIdentifier
  });
  if (looksLikeIdentifier) {
    return candidateIndex;
  }
  return -1;
}
|
|
990
1037
|
function extractTextFromVTT(vttContent) {
|
|
991
1038
|
if (!vttContent.trim()) {
|
|
992
1039
|
return "";
|
|
993
1040
|
}
|
|
994
1041
|
const lines = vttContent.split("\n");
|
|
995
1042
|
const textLines = [];
|
|
1043
|
+
let previousCueIdentifier = null;
|
|
1044
|
+
let isInsideNoteBlock = false;
|
|
996
1045
|
for (let i = 0; i < lines.length; i++) {
|
|
997
1046
|
const line = lines[i].trim();
|
|
998
|
-
|
|
1047
|
+
const nextLine = lines[i + 1]?.trim();
|
|
1048
|
+
if (!line) {
|
|
1049
|
+
isInsideNoteBlock = false;
|
|
1050
|
+
continue;
|
|
1051
|
+
}
|
|
1052
|
+
if (isInsideNoteBlock)
|
|
999
1053
|
continue;
|
|
1000
1054
|
if (line === "WEBVTT")
|
|
1001
1055
|
continue;
|
|
1002
|
-
if (line.startsWith("NOTE "))
|
|
1056
|
+
if (line === "NOTE" || line.startsWith("NOTE ")) {
|
|
1057
|
+
isInsideNoteBlock = true;
|
|
1003
1058
|
continue;
|
|
1004
|
-
|
|
1059
|
+
}
|
|
1060
|
+
if (isTimingLine(line))
|
|
1005
1061
|
continue;
|
|
1006
|
-
if (
|
|
1062
|
+
if (isLikelyCueIdentifier({ line, nextLine, previousCueIdentifier })) {
|
|
1063
|
+
previousCueIdentifier = parseNumericCueIdentifier(line);
|
|
1007
1064
|
continue;
|
|
1065
|
+
}
|
|
1008
1066
|
if (line.startsWith("STYLE") || line.startsWith("REGION"))
|
|
1009
1067
|
continue;
|
|
1010
1068
|
const cleanLine = line.replace(/<[^>]*>/g, "").trim();
|
|
@@ -1053,20 +1111,34 @@ function parseVTTCues(vttContent) {
|
|
|
1053
1111
|
return [];
|
|
1054
1112
|
const lines = vttContent.split("\n");
|
|
1055
1113
|
const cues = [];
|
|
1114
|
+
let previousCueIdentifier = null;
|
|
1056
1115
|
for (let i = 0; i < lines.length; i++) {
|
|
1057
1116
|
const line = lines[i].trim();
|
|
1058
|
-
if (line
|
|
1117
|
+
if (isTimingLine(line)) {
|
|
1059
1118
|
const [startStr, endStr] = line.split(" --> ").map((s) => s.trim());
|
|
1060
1119
|
const startTime = vttTimestampToSeconds(startStr);
|
|
1061
1120
|
const endTime = vttTimestampToSeconds(endStr.split(" ")[0]);
|
|
1062
|
-
const
|
|
1121
|
+
const currentCueIdentifierLine = lines[i - 1]?.trim() ?? "";
|
|
1122
|
+
const currentCueIdentifier = isLikelyCueIdentifier({
|
|
1123
|
+
line: currentCueIdentifierLine,
|
|
1124
|
+
nextLine: line,
|
|
1125
|
+
previousCueIdentifier
|
|
1126
|
+
}) ? parseNumericCueIdentifier(currentCueIdentifierLine) : null;
|
|
1127
|
+
const rawTextLines = [];
|
|
1063
1128
|
let j = i + 1;
|
|
1064
|
-
while (j < lines.length && lines[j].trim() && !lines[j].
|
|
1065
|
-
|
|
1066
|
-
if (cleanLine)
|
|
1067
|
-
textLines.push(cleanLine);
|
|
1129
|
+
while (j < lines.length && lines[j].trim() && !isTimingLine(lines[j].trim())) {
|
|
1130
|
+
rawTextLines.push(lines[j].trim());
|
|
1068
1131
|
j++;
|
|
1069
1132
|
}
|
|
1133
|
+
const trailingNumericLine = parseNumericCueIdentifier(rawTextLines.at(-1) ?? "");
|
|
1134
|
+
if (trailingNumericLine !== null && isLikelyCueIdentifier({
|
|
1135
|
+
line: rawTextLines.at(-1) ?? "",
|
|
1136
|
+
nextLine: lines[j]?.trim(),
|
|
1137
|
+
previousCueIdentifier: currentCueIdentifier
|
|
1138
|
+
}) && rawTextLines.length > 1) {
|
|
1139
|
+
rawTextLines.pop();
|
|
1140
|
+
}
|
|
1141
|
+
const textLines = rawTextLines.map((textLine) => textLine.replace(/<[^>]*>/g, "")).filter(Boolean);
|
|
1070
1142
|
if (textLines.length > 0) {
|
|
1071
1143
|
cues.push({
|
|
1072
1144
|
startTime,
|
|
@@ -1074,10 +1146,102 @@ function parseVTTCues(vttContent) {
|
|
|
1074
1146
|
text: textLines.join(" ")
|
|
1075
1147
|
});
|
|
1076
1148
|
}
|
|
1149
|
+
previousCueIdentifier = currentCueIdentifier;
|
|
1077
1150
|
}
|
|
1078
1151
|
}
|
|
1079
1152
|
return cues;
|
|
1080
1153
|
}
|
|
1154
|
+
/**
 * Splits VTT content into its preamble (everything before the first cue) and
 * a list of cue blocks.
 *
 * Blocks are normally separated by blank lines. When any block from the first
 * cue onward contains more than one "-->" (cues not separated by a blank
 * line), the content is instead re-split line by line around each timing line,
 * attaching a preceding cue-identifier line to its cue when one is detected.
 *
 * @param {string} vttContent - Raw VTT text.
 * @returns {{ preamble: string, cueBlocks: string[] }} The preamble (falls
 *   back to "WEBVTT" when empty) and the trimmed cue blocks in source order.
 */
function splitVttPreambleAndCueBlocks(vttContent) {
  const content = normalizeLineEndings(vttContent).trim();
  if (content === "") {
    return {
      preamble: "WEBVTT",
      cueBlocks: []
    };
  }

  const blocks = content
    .split(/\n{2,}/)
    .map((block) => block.trim())
    .filter(Boolean);
  const firstCueBlockIndex = blocks.findIndex((block) => block.includes("-->"));
  if (firstCueBlockIndex === -1) {
    // No cues at all: the whole content is preamble, prefixed with the header if missing.
    return {
      preamble: content.startsWith("WEBVTT") ? content : `WEBVTT\n\n${content}`,
      cueBlocks: []
    };
  }

  const countArrows = (block) => (block.match(/-->/g) ?? []).length;
  const blocksFromFirstCue = blocks.slice(firstCueBlockIndex);
  const hasMergedCueBlocks = blocksFromFirstCue.some((block) => countArrows(block) > 1);
  if (!hasMergedCueBlocks) {
    const preambleBlocks = blocks.slice(0, firstCueBlockIndex);
    return {
      preamble: preambleBlocks.length > 0 ? preambleBlocks.join("\n\n") : "WEBVTT",
      cueBlocks: blocksFromFirstCue
    };
  }

  // Merged cues: rebuild the cue list line by line around every timing line.
  const lines = content.split("\n");
  const timingIndices = [];
  for (let lineIndex = 0; lineIndex < lines.length; lineIndex += 1) {
    if (isTimingLine(lines[lineIndex].trim())) {
      timingIndices.push(lineIndex);
    }
  }

  const headIdentifierIndex = getCueIdentifierLineIndex(lines, timingIndices[0], null);
  const preambleEnd = headIdentifierIndex >= 0 ? headIdentifierIndex : timingIndices[0];
  const preamble = lines.slice(0, preambleEnd).join("\n").trim() || "WEBVTT";

  const isBlankLine = (lineIndex) => !lines[lineIndex].trim();
  const cueBlocks = [];
  let previousIdentifier = null;
  for (let position = 0; position < timingIndices.length; position += 1) {
    const timingIndex = timingIndices[position];
    const identifierIndex = getCueIdentifierLineIndex(lines, timingIndex, previousIdentifier);
    const hasIdentifier = identifierIndex >= 0;
    const startIndex = hasIdentifier ? identifierIndex : timingIndex;
    const identifier = hasIdentifier ? parseNumericCueIdentifier(lines[identifierIndex].trim()) : null;

    const isLastCue = position === timingIndices.length - 1;
    const nextTimingIndex = isLastCue ? lines.length : timingIndices[position + 1];
    let endIndex = nextTimingIndex - 1;
    while (endIndex > timingIndex && isBlankLine(endIndex)) {
      endIndex -= 1;
    }
    // The last non-blank line may be the NEXT cue's identifier; exclude it from this cue.
    const nextIdentifierIndex = isLastCue ? -1 : getCueIdentifierLineIndex(lines, nextTimingIndex, identifier);
    if (nextIdentifierIndex === endIndex) {
      endIndex -= 1;
    }
    while (endIndex > timingIndex && isBlankLine(endIndex)) {
      endIndex -= 1;
    }

    previousIdentifier = identifier;
    cueBlocks.push(lines.slice(startIndex, endIndex + 1).join("\n").trim());
  }

  return {
    preamble,
    cueBlocks
  };
}
|
|
1212
|
+
/**
 * Assembles VTT text from a preamble and a list of cue blocks.
 *
 * The trimmed preamble and each trimmed block are separated by one blank line,
 * and the result always ends with a single trailing newline.
 *
 * @param {string[]} cueBlocks - Cue blocks in output order.
 * @param {string} [preamble="WEBVTT"] - Header text placed before the cues.
 * @returns {string} The assembled VTT document.
 */
function buildVttFromCueBlocks(cueBlocks, preamble = "WEBVTT") {
  const sections = [preamble.trim(), ...cueBlocks.map((block) => block.trim())];
  return `${sections.join("\n\n")}\n`;
}
|
|
1222
|
+
/**
 * Replaces the text payload of a cue block while keeping its header lines
 * (everything up to and including the timing line).
 *
 * Blank lines are dropped and every remaining line is trimmed, both in the
 * cue block and in the replacement text.
 *
 * @param {string} cueBlock - Original cue block (optional identifier, timing line, text).
 * @param {string} translatedText - Replacement text; may span multiple lines.
 * @returns {string} The cue block with its text replaced.
 * @throws {Error} When the cue block contains no line with "-->".
 */
function replaceCueText(cueBlock, translatedText) {
  const toNonEmptyLines = (text) => normalizeLineEndings(text)
    .split("\n")
    .map((line) => line.trim())
    .filter(Boolean);

  const blockLines = toNonEmptyLines(cueBlock);
  const timingLineIndex = blockLines.findIndex((line) => line.includes("-->"));
  if (timingLineIndex === -1) {
    throw new Error("Cue block is missing a timestamp line");
  }
  const header = blockLines.slice(0, timingLineIndex + 1);
  return header.concat(toNonEmptyLines(translatedText)).join("\n");
}
|
|
1232
|
+
/**
 * Builds a VTT document by pairing each original cue block with its
 * translated text (matched by position).
 *
 * @param {string[]} cueBlocks - Original cue blocks.
 * @param {string[]} translatedTexts - One translated text per cue block.
 * @param {string} [preamble="WEBVTT"] - Header text placed before the cues.
 * @returns {string} The translated VTT document.
 * @throws {Error} When the two arrays differ in length.
 */
function buildVttFromTranslatedCueBlocks(cueBlocks, translatedTexts, preamble = "WEBVTT") {
  const expectedCount = cueBlocks.length;
  const receivedCount = translatedTexts.length;
  if (expectedCount !== receivedCount) {
    throw new Error(`Expected ${expectedCount} translated cues, received ${receivedCount}`);
  }
  const translatedBlocks = cueBlocks.map(
    (originalBlock, position) => replaceCueText(originalBlock, translatedTexts[position])
  );
  return buildVttFromCueBlocks(translatedBlocks, preamble);
}
|
|
1241
|
+
/**
 * Joins several VTT segments into one document: each segment's own preamble
 * is discarded, its cue blocks are kept in order, and the supplied preamble
 * is used for the result.
 *
 * @param {string[]} segments - VTT texts to concatenate, in output order.
 * @param {string} [preamble="WEBVTT"] - Header text for the combined document.
 * @returns {string} The combined VTT document.
 */
function concatenateVttSegments(segments, preamble = "WEBVTT") {
  const combinedCueBlocks = segments
    .map((segment) => splitVttPreambleAndCueBlocks(segment).cueBlocks)
    .flat();
  return buildVttFromCueBlocks(combinedCueBlocks, preamble);
}
|
|
1081
1245
|
async function buildTranscriptUrl(playbackId, trackId, shouldSign = false, credentials) {
|
|
1082
1246
|
"use step";
|
|
1083
1247
|
const baseUrl = `https://stream.mux.com/${playbackId}/text/${trackId}.vtt`;
|
|
@@ -2016,6 +2180,14 @@ async function generateChapters(assetId, languageCode, options = {}) {
|
|
|
2016
2180
|
import { embed } from "ai";
|
|
2017
2181
|
|
|
2018
2182
|
// src/primitives/text-chunking.ts
|
|
2183
|
+
var DEFAULT_MIN_CHUNK_DURATION_RATIO = 2 / 3;
|
|
2184
|
+
var DEFAULT_BOUNDARY_LOOKAHEAD_CUES = 12;
|
|
2185
|
+
var DEFAULT_BOUNDARY_PAUSE_SECONDS = 1.25;
|
|
2186
|
+
var STRONG_BOUNDARY_SCORE = 4;
|
|
2187
|
+
var PREFERRED_BOUNDARY_WINDOW_SECONDS = 5 * 60;
|
|
2188
|
+
var SENTENCE_BOUNDARY_REGEX = /[.!?]["')\]]*$/;
|
|
2189
|
+
var CLAUSE_BOUNDARY_REGEX = /[,;:]["')\]]*$/;
|
|
2190
|
+
var NEXT_SENTENCE_START_REGEX = /^[A-Z0-9"'([{]/;
|
|
2019
2191
|
function estimateTokenCount(text) {
|
|
2020
2192
|
const words = text.trim().split(/\s+/).length;
|
|
2021
2193
|
return Math.ceil(words / 0.75);
|
|
@@ -2088,6 +2260,151 @@ function chunkVTTCues(cues, maxTokens, overlapCues = 2) {
|
|
|
2088
2260
|
}
|
|
2089
2261
|
return chunks;
|
|
2090
2262
|
}
|
|
2263
|
+
/**
 * Scores how suitable the gap after `cues[index]` is as a chunk boundary.
 *
 * Points: +4 when the cue text matches SENTENCE_BOUNDARY_REGEX, otherwise +2
 * when it matches CLAUSE_BOUNDARY_REGEX; +2 when the gap before the next cue
 * is at least `boundaryPauseSeconds`; +1 when the next cue's text matches
 * NEXT_SENTENCE_START_REGEX.
 *
 * @param {Array<{ text: string, startTime: number, endTime: number }>} cues
 * @param {number} index - Index of the cue that would end the chunk.
 * @param {number} boundaryPauseSeconds - Minimum gap that counts as a pause.
 * @returns {number} The boundary score, or positive infinity when there is no
 *   following cue.
 */
function scoreCueBoundary(cues, index, boundaryPauseSeconds) {
  const followingCue = cues[index + 1];
  if (!followingCue) {
    return Number.POSITIVE_INFINITY;
  }
  const currentCue = cues[index];
  const currentText = currentCue.text.trim();

  let punctuationPoints = 0;
  if (SENTENCE_BOUNDARY_REGEX.test(currentText)) {
    punctuationPoints = 4;
  } else if (CLAUSE_BOUNDARY_REGEX.test(currentText)) {
    punctuationPoints = 2;
  }

  const gapSeconds = followingCue.startTime - currentCue.endTime;
  const pausePoints = gapSeconds >= boundaryPauseSeconds ? 2 : 0;
  const nextStartPoints = NEXT_SENTENCE_START_REGEX.test(followingCue.text.trim()) ? 1 : 0;

  return punctuationPoints + pausePoints + nextStartPoints;
}
|
|
2284
|
+
/**
 * Groups consecutive cues into chunks limited by cue count and, optionally,
 * by estimated text tokens.
 *
 * A chunk is closed before a cue that would exceed either limit. A single cue
 * is never split, so a chunk holding one cue may exceed the token limit.
 *
 * @param {Array<{ text: string, startTime: number, endTime: number }>} cues
 * @param {{ maxCuesPerChunk: number, maxTextTokensPerChunk?: number }} options
 *   A falsy `maxTextTokensPerChunk` disables the token limit.
 * @returns {Array<{ id: string, cueStartIndex: number, cueEndIndex: number,
 *   cueCount: number, startTime: number, endTime: number }>} Chunk descriptors
 *   with inclusive cue index ranges; empty when `cues` is empty.
 */
function chunkVTTCuesByBudget(cues, options) {
  if (cues.length === 0) {
    return [];
  }
  const cueLimit = Math.max(1, options.maxCuesPerChunk);
  const tokenLimit = options.maxTextTokensPerChunk ?
    Math.max(1, options.maxTextTokensPerChunk) :
    Number.POSITIVE_INFINITY;

  const chunks = [];
  // Records the chunk covering cues[firstCue..lastCue]; ids follow insertion order.
  const closeChunk = (firstCue, lastCue) => {
    chunks.push({
      id: `chunk-${chunks.length}`,
      cueStartIndex: firstCue,
      cueEndIndex: lastCue,
      cueCount: lastCue - firstCue + 1,
      startTime: cues[firstCue].startTime,
      endTime: cues[lastCue].endTime
    });
  };

  let firstCueInChunk = 0;
  let tokensInChunk = 0;
  for (let position = 0; position < cues.length; position += 1) {
    const cueTokens = estimateTokenCount(cues[position].text);
    const cuesInChunk = position - firstCueInChunk;
    const cueLimitReached = cuesInChunk >= cueLimit;
    const tokenLimitReached = cuesInChunk > 0 && tokensInChunk + cueTokens > tokenLimit;
    if (cueLimitReached || tokenLimitReached) {
      closeChunk(firstCueInChunk, position - 1);
      firstCueInChunk = position;
      tokensInChunk = 0;
    }
    tokensInChunk += cueTokens;
  }
  closeChunk(firstCueInChunk, cues.length - 1);
  return chunks;
}
|
|
2328
|
+
/**
 * Groups consecutive cues into chunks of roughly `targetChunkDurationSeconds`,
 * preferring to cut at well-scoring boundaries (see `scoreCueBoundary`).
 *
 * While extending a chunk, the best-scoring boundary past the minimum duration
 * is tracked, along with the best one inside the "preferred" window that ends
 * at the target duration (later cues win ties). The chunk is cut when:
 *  - the target is reached and the tracked boundary scores at least
 *    STRONG_BOUNDARY_SCORE;
 *  - the target is reached and the chunk already spans at least
 *    `boundaryLookaheadCues` cues beyond its first; or
 *  - adding the next cue would exceed the maximum duration.
 * In every case the cut lands on the tracked boundary when one exists,
 * otherwise on the current cue.
 *
 * @param {Array<{ text: string, startTime: number, endTime: number }>} cues
 * @param {object} options
 * @param {number} options.targetChunkDurationSeconds - Clamped to at least 1.
 * @param {number} options.maxChunkDurationSeconds - Clamped to at least the target.
 * @param {number} [options.minChunkDurationSeconds] - Defaults to
 *   floor(target * DEFAULT_MIN_CHUNK_DURATION_RATIO); clamped to [1, target].
 * @param {number} [options.boundaryLookaheadCues] - Defaults to
 *   DEFAULT_BOUNDARY_LOOKAHEAD_CUES; clamped to at least 1.
 * @param {number} [options.boundaryPauseSeconds] - Defaults to
 *   DEFAULT_BOUNDARY_PAUSE_SECONDS.
 * @returns {Array<{ id: string, cueStartIndex: number, cueEndIndex: number,
 *   cueCount: number, startTime: number, endTime: number }>} Chunk descriptors
 *   with inclusive cue index ranges; empty when `cues` is empty.
 */
function chunkVTTCuesByDuration(cues, options) {
  if (cues.length === 0) {
    return [];
  }

  const targetSeconds = Math.max(1, options.targetChunkDurationSeconds);
  const maxSeconds = Math.max(targetSeconds, options.maxChunkDurationSeconds);
  const requestedMinSeconds = options.minChunkDurationSeconds ??
    Math.floor(targetSeconds * DEFAULT_MIN_CHUNK_DURATION_RATIO);
  const minSeconds = Math.min(targetSeconds, Math.max(1, requestedMinSeconds));
  const lookaheadCues = Math.max(1, options.boundaryLookaheadCues ?? DEFAULT_BOUNDARY_LOOKAHEAD_CUES);
  const pauseSeconds = options.boundaryPauseSeconds ?? DEFAULT_BOUNDARY_PAUSE_SECONDS;
  const preferredWindowSeconds = Math.min(PREFERRED_BOUNDARY_WINDOW_SECONDS, targetSeconds / 6);
  const preferredStartSeconds = Math.max(minSeconds, targetSeconds - preferredWindowSeconds);

  const chunks = [];
  let firstCue = 0;
  while (firstCue < cues.length) {
    const chunkStartTime = cues[firstCue].startTime;
    // Best boundary seen anywhere past the minimum duration...
    let bestOverall = { index: -1, score: -1 };
    // ...and the best one seen inside the preferred window.
    let bestInWindow = { index: -1, score: -1 };
    let lastCue = firstCue;

    for (;;) {
      const elapsedSeconds = cues[lastCue].endTime - chunkStartTime;
      if (elapsedSeconds >= minSeconds) {
        const score = scoreCueBoundary(cues, lastCue, pauseSeconds);
        // `>=` so that, on a tie, the later boundary wins.
        if (score >= bestOverall.score) {
          bestOverall = { index: lastCue, score };
        }
        if (elapsedSeconds >= preferredStartSeconds && score >= bestInWindow.score) {
          bestInWindow = { index: lastCue, score };
        }
      }

      const upcomingCue = cues[lastCue + 1];
      if (!upcomingCue) {
        break;
      }

      const elapsedWithNextSeconds = upcomingCue.endTime - chunkStartTime;
      const candidate = bestInWindow.index >= firstCue ? bestInWindow : bestOverall;
      const hasCandidate = candidate.index >= firstCue;
      const reachedTarget = elapsedSeconds >= targetSeconds;
      const nextExceedsMax = elapsedWithNextSeconds > maxSeconds;
      const lookaheadExceeded = lastCue - firstCue >= lookaheadCues;

      const strongBoundaryAvailable = reachedTarget && hasCandidate && candidate.score >= STRONG_BOUNDARY_SCORE;
      const mustCutNow = nextExceedsMax || (reachedTarget && lookaheadExceeded);
      if (strongBoundaryAvailable || mustCutNow) {
        if (hasCandidate) {
          lastCue = candidate.index;
        }
        break;
      }
      lastCue += 1;
    }

    chunks.push({
      id: `chunk-${chunks.length}`,
      cueStartIndex: firstCue,
      cueEndIndex: lastCue,
      cueCount: lastCue - firstCue + 1,
      startTime: cues[firstCue].startTime,
      endTime: cues[lastCue].endTime
    });
    firstCue = lastCue + 1;
  }
  return chunks;
}
|
|
2091
2408
|
function chunkText(text, strategy) {
|
|
2092
2409
|
switch (strategy.type) {
|
|
2093
2410
|
case "token": {
|
|
@@ -2343,10 +2660,8 @@ async function getThumbnailUrls(playbackId, duration, options = {}) {
|
|
|
2343
2660
|
}
|
|
2344
2661
|
const baseUrl = getMuxThumbnailBaseUrl(playbackId);
|
|
2345
2662
|
const urlPromises = timestamps.map(async (time) => {
|
|
2346
|
-
|
|
2347
|
-
|
|
2348
|
-
}
|
|
2349
|
-
return `${baseUrl}?time=${time}&width=${width}`;
|
|
2663
|
+
const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
|
|
2664
|
+
return { url, time };
|
|
2350
2665
|
});
|
|
2351
2666
|
return Promise.all(urlPromises);
|
|
2352
2667
|
}
|
|
@@ -2420,6 +2735,7 @@ async function moderateImageWithOpenAI(entry) {
|
|
|
2420
2735
|
const categoryScores = json.results?.[0]?.category_scores || {};
|
|
2421
2736
|
return {
|
|
2422
2737
|
url: entry.url,
|
|
2738
|
+
time: entry.time,
|
|
2423
2739
|
sexual: categoryScores.sexual || 0,
|
|
2424
2740
|
violence: categoryScores.violence || 0,
|
|
2425
2741
|
error: false
|
|
@@ -2428,6 +2744,7 @@ async function moderateImageWithOpenAI(entry) {
|
|
|
2428
2744
|
console.error("OpenAI moderation failed:", error);
|
|
2429
2745
|
return {
|
|
2430
2746
|
url: entry.url,
|
|
2747
|
+
time: entry.time,
|
|
2431
2748
|
sexual: 0,
|
|
2432
2749
|
violence: 0,
|
|
2433
2750
|
error: true,
|
|
@@ -2435,11 +2752,13 @@ async function moderateImageWithOpenAI(entry) {
|
|
|
2435
2752
|
};
|
|
2436
2753
|
}
|
|
2437
2754
|
}
|
|
2438
|
-
async function requestOpenAIModeration(
|
|
2755
|
+
async function requestOpenAIModeration(images, model, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
|
|
2439
2756
|
"use step";
|
|
2757
|
+
const imageUrls = images.map((img) => img.url);
|
|
2758
|
+
const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
|
|
2440
2759
|
const targetUrls = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map(
|
|
2441
|
-
(img) => ({ url: img.url, image: img.base64Data, model, credentials })
|
|
2442
|
-
) :
|
|
2760
|
+
(img) => ({ url: img.url, time: timeByUrl.get(img.url), image: img.base64Data, model, credentials })
|
|
2761
|
+
) : images.map((img) => ({ url: img.url, time: img.time, image: img.url, model, credentials }));
|
|
2443
2762
|
return processConcurrently(targetUrls, moderateImageWithOpenAI, maxConcurrent);
|
|
2444
2763
|
}
|
|
2445
2764
|
async function requestOpenAITextModeration(text, model, url, credentials) {
|
|
@@ -2584,6 +2903,7 @@ async function moderateImageWithHive(entry) {
|
|
|
2584
2903
|
const violence = getHiveCategoryScores(classes, HIVE_VIOLENCE_CATEGORIES);
|
|
2585
2904
|
return {
|
|
2586
2905
|
url: entry.url,
|
|
2906
|
+
time: entry.time,
|
|
2587
2907
|
sexual,
|
|
2588
2908
|
violence,
|
|
2589
2909
|
error: false
|
|
@@ -2591,6 +2911,7 @@ async function moderateImageWithHive(entry) {
|
|
|
2591
2911
|
} catch (error) {
|
|
2592
2912
|
return {
|
|
2593
2913
|
url: entry.url,
|
|
2914
|
+
time: entry.time,
|
|
2594
2915
|
sexual: 0,
|
|
2595
2916
|
violence: 0,
|
|
2596
2917
|
error: true,
|
|
@@ -2598,19 +2919,23 @@ async function moderateImageWithHive(entry) {
|
|
|
2598
2919
|
};
|
|
2599
2920
|
}
|
|
2600
2921
|
}
|
|
2601
|
-
async function requestHiveModeration(
|
|
2922
|
+
async function requestHiveModeration(images, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
|
|
2602
2923
|
"use step";
|
|
2924
|
+
const imageUrls = images.map((img) => img.url);
|
|
2925
|
+
const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
|
|
2603
2926
|
const targets = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map((img) => ({
|
|
2604
2927
|
url: img.url,
|
|
2928
|
+
time: timeByUrl.get(img.url),
|
|
2605
2929
|
source: {
|
|
2606
2930
|
kind: "file",
|
|
2607
2931
|
buffer: img.buffer,
|
|
2608
2932
|
contentType: img.contentType
|
|
2609
2933
|
},
|
|
2610
2934
|
credentials
|
|
2611
|
-
})) :
|
|
2612
|
-
url,
|
|
2613
|
-
|
|
2935
|
+
})) : images.map((img) => ({
|
|
2936
|
+
url: img.url,
|
|
2937
|
+
time: img.time,
|
|
2938
|
+
source: { kind: "url", value: img.url },
|
|
2614
2939
|
credentials
|
|
2615
2940
|
}));
|
|
2616
2941
|
return await processConcurrently(targets, moderateImageWithHive, maxConcurrent);
|
|
@@ -2621,10 +2946,8 @@ async function getThumbnailUrlsFromTimestamps(playbackId, timestampsMs, options)
|
|
|
2621
2946
|
const baseUrl = getMuxThumbnailBaseUrl(playbackId);
|
|
2622
2947
|
const urlPromises = timestampsMs.map(async (tsMs) => {
|
|
2623
2948
|
const time = Number((tsMs / 1e3).toFixed(2));
|
|
2624
|
-
|
|
2625
|
-
|
|
2626
|
-
}
|
|
2627
|
-
return `${baseUrl}?time=${time}&width=${width}`;
|
|
2949
|
+
const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
|
|
2950
|
+
return { url, time };
|
|
2628
2951
|
});
|
|
2629
2952
|
return Promise.all(urlPromises);
|
|
2630
2953
|
}
|
|
@@ -3905,12 +4228,187 @@ async function translateAudio(assetId, toLanguageCode, options = {}) {
|
|
|
3905
4228
|
}
|
|
3906
4229
|
|
|
3907
4230
|
// src/workflows/translate-captions.ts
|
|
3908
|
-
import {
|
|
4231
|
+
import {
|
|
4232
|
+
APICallError,
|
|
4233
|
+
generateText as generateText5,
|
|
4234
|
+
NoObjectGeneratedError,
|
|
4235
|
+
Output as Output5,
|
|
4236
|
+
RetryError,
|
|
4237
|
+
TypeValidationError
|
|
4238
|
+
} from "ai";
|
|
4239
|
+
import dedent5 from "dedent";
|
|
3909
4240
|
import { z as z6 } from "zod";
|
|
3910
4241
|
var translationSchema = z6.object({
|
|
3911
4242
|
translation: z6.string()
|
|
3912
4243
|
});
|
|
3913
|
-
var SYSTEM_PROMPT4 =
|
|
4244
|
+
var SYSTEM_PROMPT4 = dedent5`
|
|
4245
|
+
You are a subtitle translation expert. Translate VTT subtitle files to the target language specified by the user.
|
|
4246
|
+
You may receive either a full VTT file or a chunk from a larger VTT.
|
|
4247
|
+
Preserve all timestamps, cue ordering, and VTT formatting exactly as they appear.
|
|
4248
|
+
Return JSON with a single key "translation" containing the translated VTT content.
|
|
4249
|
+
`;
|
|
4250
|
+
var CUE_TRANSLATION_SYSTEM_PROMPT = dedent5`
|
|
4251
|
+
You are a subtitle translation expert.
|
|
4252
|
+
You will receive a sequence of subtitle cues extracted from a VTT file.
|
|
4253
|
+
Translate the cues to the requested target language while preserving their original order.
|
|
4254
|
+
Treat the cue list as continuous context so the translation reads naturally across adjacent lines.
|
|
4255
|
+
Return JSON with a single key "translations" containing exactly one translated string for each input cue.
|
|
4256
|
+
Do not merge, split, omit, reorder, or add cues.
|
|
4257
|
+
`;
|
|
4258
|
+
var DEFAULT_TRANSLATION_CHUNKING = {
|
|
4259
|
+
enabled: true,
|
|
4260
|
+
minimumAssetDurationSeconds: 30 * 60,
|
|
4261
|
+
targetChunkDurationSeconds: 30 * 60,
|
|
4262
|
+
maxConcurrentTranslations: 4,
|
|
4263
|
+
maxCuesPerChunk: 80,
|
|
4264
|
+
maxCueTextTokensPerChunk: 2e3
|
|
4265
|
+
};
|
|
4266
|
+
var TOKEN_USAGE_FIELDS = [
|
|
4267
|
+
"inputTokens",
|
|
4268
|
+
"outputTokens",
|
|
4269
|
+
"totalTokens",
|
|
4270
|
+
"reasoningTokens",
|
|
4271
|
+
"cachedInputTokens"
|
|
4272
|
+
];
|
|
4273
|
+
/**
 * Raised when a translated cue chunk fails validation.
 *
 * Accepts the standard `Error` options bag so callers can attach the
 * underlying failure as `cause`; omitting it behaves exactly like a plain
 * `new Error(message)`.
 */
var TranslationChunkValidationError = class extends Error {
  /**
   * @param {string} message - Human-readable description of the failure.
   * @param {{ cause?: unknown }} [options] - Standard `Error` options.
   */
  constructor(message, options) {
    super(message, options);
    this.name = "TranslationChunkValidationError";
  }
};
/**
 * Type guard for {@link TranslationChunkValidationError}.
 *
 * @param {unknown} error - Any thrown value.
 * @returns {boolean} True when `error` is a TranslationChunkValidationError.
 */
function isTranslationChunkValidationError(error) {
  return error instanceof TranslationChunkValidationError;
}
|
|
4282
|
+
/**
 * Reports whether `error` is, or wraps, a provider API call failure.
 *
 * The error chain is followed through `RetryError.lastError` and through
 * `Error.cause` until an `APICallError` is found. The walk is iterative and
 * remembers visited values, so a cyclic chain ends with `false` instead of
 * overflowing the stack.
 *
 * @param {unknown} error - Any thrown value.
 * @returns {boolean} True when an `APICallError` is found in the chain.
 */
function isProviderServiceError(error) {
  const visited = new Set();
  let current = error;
  while (current && !visited.has(current)) {
    visited.add(current);
    if (RetryError.isInstance(current)) {
      current = current.lastError;
      continue;
    }
    if (APICallError.isInstance(current)) {
      return true;
    }
    if (current instanceof Error && "cause" in current) {
      current = current.cause;
      continue;
    }
    return false;
  }
  return false;
}
|
|
4297
|
+
/**
 * Decides whether a failed chunk translation is worth retrying on smaller
 * chunks.
 *
 * Provider service errors never qualify. Otherwise the error qualifies when it
 * is a NoObjectGeneratedError, a TypeValidationError, or a
 * TranslationChunkValidationError.
 *
 * @param {unknown} error - The error thrown by the chunk translation.
 * @returns {boolean}
 */
function shouldSplitChunkTranslationError(error) {
  if (isProviderServiceError(error)) {
    return false;
  }
  if (NoObjectGeneratedError.isInstance(error)) {
    return true;
  }
  if (TypeValidationError.isInstance(error)) {
    return true;
  }
  return isTranslationChunkValidationError(error);
}
|
|
4303
|
+
/**
 * Reports whether a token-usage field holds a usable number.
 *
 * NaN is rejected explicitly: `typeof NaN` is "number", and letting it through
 * would turn any sum that includes it into NaN.
 *
 * @param {unknown} value - A token-usage field value.
 * @returns {boolean} True for any number other than NaN.
 */
function isDefinedTokenUsageValue(value) {
  return typeof value === "number" && !Number.isNaN(value);
}
|
|
4306
|
+
/**
 * Fills in missing translation-chunking options from
 * DEFAULT_TRANSLATION_CHUNKING and clamps every numeric option to at least 1.
 *
 * @param {object} [options] - Caller-supplied overrides; may be null/undefined.
 * @returns {{ enabled: boolean, minimumAssetDurationSeconds: number,
 *   targetChunkDurationSeconds: number, maxConcurrentTranslations: number,
 *   maxCuesPerChunk: number, maxCueTextTokensPerChunk: number }}
 */
function resolveTranslationChunkingOptions(options) {
  // Reads `key` from the overrides, falls back to the default, and clamps to >= 1.
  const resolveAtLeastOne = (key) => Math.max(1, options?.[key] ?? DEFAULT_TRANSLATION_CHUNKING[key]);

  const targetChunkDurationSeconds = resolveAtLeastOne("targetChunkDurationSeconds");
  return {
    enabled: options?.enabled ?? DEFAULT_TRANSLATION_CHUNKING.enabled,
    minimumAssetDurationSeconds: resolveAtLeastOne("minimumAssetDurationSeconds"),
    targetChunkDurationSeconds,
    maxConcurrentTranslations: resolveAtLeastOne("maxConcurrentTranslations"),
    maxCuesPerChunk: resolveAtLeastOne("maxCuesPerChunk"),
    maxCueTextTokensPerChunk: resolveAtLeastOne("maxCueTextTokensPerChunk")
  };
}
|
|
4332
|
+
/**
 * Sums token usage across several usage records, field by field.
 *
 * Only the fields listed in TOKEN_USAGE_FIELDS are considered. A field appears
 * in the result only when at least one record holds a value accepted by
 * `isDefinedTokenUsageValue` for it.
 *
 * @param {object[]} usages - Usage records to combine.
 * @returns {object} An object mapping each reported field to its total.
 */
function aggregateTokenUsage(usages) {
  const totals = {};
  for (const field of TOKEN_USAGE_FIELDS) {
    const reportedValues = [];
    for (const usage of usages) {
      const value = usage[field];
      if (isDefinedTokenUsageValue(value)) {
        reportedValues.push(value);
      }
    }
    if (reportedValues.length === 0) {
      continue;
    }
    let fieldTotal = 0;
    for (const value of reportedValues) {
      fieldTotal += value;
    }
    totals[field] = fieldTotal;
  }
  return totals;
}
|
|
4341
|
+
/**
 * Bundles a run of cues and their source cue blocks into one translation
 * request descriptor.
 *
 * @param {string} id - Identifier for the chunk.
 * @param {Array<{ startTime: number, endTime: number }>} cues - Cues in the
 *   chunk; the first and last entries are read, so it must not be empty.
 * @param {string[]} cueBlocks - Raw cue blocks for the same cues.
 * @returns {{ id: string, cueCount: number, startTime: number, endTime: number,
 *   cues: object[], cueBlocks: string[] }}
 */
function createTranslationChunkRequest(id, cues, cueBlocks) {
  const cueCount = cues.length;
  const firstCue = cues[0];
  const lastCue = cues[cueCount - 1];
  return {
    id,
    cueCount,
    startTime: firstCue.startTime,
    endTime: lastCue.endTime,
    cues,
    cueBlocks
  };
}
|
|
4351
|
+
/**
 * Splits one chunk's cues into translation requests that respect the cue-count
 * and text-token budgets.
 *
 * When the budget yields a single chunk the original `id` is kept; otherwise
 * each part is named `${id}-part-${index}`.
 *
 * @param {string} id - Identifier of the chunk being split.
 * @param {object[]} cues - Cues belonging to the chunk.
 * @param {string[]} cueBlocks - Raw cue blocks aligned with `cues`.
 * @param {number} maxCuesPerChunk - Maximum cues per request.
 * @param {number} maxCueTextTokensPerChunk - Maximum estimated text tokens per request.
 * @returns {object[]} One translation request per budget chunk.
 */
function splitTranslationChunkRequestByBudget(id, cues, cueBlocks, maxCuesPerChunk, maxCueTextTokensPerChunk) {
  const budgetChunks = chunkVTTCuesByBudget(cues, {
    maxCuesPerChunk,
    maxTextTokensPerChunk: maxCueTextTokensPerChunk
  });
  const fitsInOneRequest = budgetChunks.length === 1;
  return budgetChunks.map(({ cueStartIndex, cueEndIndex }, partIndex) => {
    const requestId = fitsInOneRequest ? id : `${id}-part-${partIndex}`;
    const sliceEnd = cueEndIndex + 1;
    return createTranslationChunkRequest(
      requestId,
      cues.slice(cueStartIndex, sliceEnd),
      cueBlocks.slice(cueStartIndex, sliceEnd)
    );
  });
}
|
|
4364
|
+
/**
 * Plans the cue-level translation requests for a VTT document.
 *
 * Returns null (the warning text refers to this as falling back to full-VTT
 * translation) when no cues can be parsed or when the number of cue blocks
 * differs from the number of parsed cues. A single request covering every cue
 * is produced when chunking is disabled, when the asset duration is not a
 * number, or when it is below the configured minimum. Otherwise cues are
 * chunked by duration and each duration chunk is further split by the
 * cue-count and token budgets.
 *
 * @param {string} vttContent - Source VTT text.
 * @param {number} [assetDurationSeconds] - Duration of the asset, if known.
 * @param {object} [chunkingOptions] - Overrides for the chunking defaults.
 * @returns {{ preamble: string, chunks: object[] } | null}
 */
function buildTranslationChunkRequests(vttContent, assetDurationSeconds, chunkingOptions) {
  const chunking = resolveTranslationChunkingOptions(chunkingOptions);
  const cues = parseVTTCues(vttContent);
  if (cues.length === 0) {
    return null;
  }

  const { preamble, cueBlocks } = splitVttPreambleAndCueBlocks(vttContent);
  if (cueBlocks.length !== cues.length) {
    console.warn(
      `Falling back to full-VTT caption translation because cue block count (${cueBlocks.length}) does not match parsed cue count (${cues.length}).`
    );
    return null;
  }

  // Written as a negated `<` so a NaN duration is still treated as "long enough".
  const isLongEnough = typeof assetDurationSeconds === "number" &&
    !(assetDurationSeconds < chunking.minimumAssetDurationSeconds);
  if (!chunking.enabled || !isLongEnough) {
    return {
      preamble,
      chunks: [
        createTranslationChunkRequest("chunk-0", cues, cueBlocks)
      ]
    };
  }

  const targetSeconds = chunking.targetChunkDurationSeconds;
  const durationChunks = chunkVTTCuesByDuration(cues, {
    targetChunkDurationSeconds: targetSeconds,
    maxChunkDurationSeconds: Math.max(targetSeconds, Math.round(targetSeconds * (7 / 6))),
    minChunkDurationSeconds: Math.max(1, Math.round(targetSeconds * (2 / 3)))
  });

  const chunks = [];
  for (const durationChunk of durationChunks) {
    const sliceEnd = durationChunk.cueEndIndex + 1;
    const parts = splitTranslationChunkRequestByBudget(
      durationChunk.id,
      cues.slice(durationChunk.cueStartIndex, sliceEnd),
      cueBlocks.slice(durationChunk.cueStartIndex, sliceEnd),
      chunking.maxCuesPerChunk,
      chunking.maxCueTextTokensPerChunk
    );
    for (const part of parts) {
      chunks.push(part);
    }
  }
  return {
    preamble,
    chunks
  };
}
|
|
3914
4412
|
async function fetchVttFromMux(vttUrl) {
|
|
3915
4413
|
"use step";
|
|
3916
4414
|
const vttResponse = await fetch(vttUrl);
|
|
@@ -3956,6 +4454,176 @@ ${vttContent}`
|
|
|
3956
4454
|
}
|
|
3957
4455
|
};
|
|
3958
4456
|
}
|
|
4457
|
+
/**
 * Sends a list of cues to the configured language model and returns the
 * translated cue texts together with the token usage of the call.
 *
 * The structured-output schema requires exactly `cues.length` non-empty strings.
 *
 * @param {object} params
 * @param {Array<{startTime: *, endTime: *, text: *}>} params.cues - Cues to translate.
 * @param {string} params.fromLanguageCode - Source language, interpolated into the prompt.
 * @param {string} params.toLanguageCode - Target language, interpolated into the prompt.
 * @param {*} params.provider - Forwarded to `createLanguageModelFromConfig`.
 * @param {*} params.modelId - Forwarded to `createLanguageModelFromConfig`.
 * @param {*} params.credentials - Forwarded to `createLanguageModelFromConfig`.
 * @returns {Promise<{translations: string[], usage: object}>}
 */
async function translateCueChunkWithAI({
  cues,
  fromLanguageCode,
  toLanguageCode,
  provider,
  modelId,
  credentials
}) {
  "use step";
  const languageModel = await createLanguageModelFromConfig(provider, modelId, credentials);
  const expectedCount = cues.length;
  const translationSchema = z6.object({
    translations: z6.array(z6.string().min(1)).length(expectedCount)
  });

  // Only the position, timing and text of each cue are shown to the model.
  const serializedCues = JSON.stringify(
    cues.map(({ startTime, endTime, text }, index) => ({ index, startTime, endTime, text })),
    null,
    2
  );
  const userPrompt = `Translate from ${fromLanguageCode} to ${toLanguageCode}.
Return exactly ${expectedCount} translated cues in the same order as the input.

${serializedCues}`;

  const response = await generateText5({
    model: languageModel,
    output: Output5.object({ schema: translationSchema }),
    messages: [
      { role: "system", content: CUE_TRANSLATION_SYSTEM_PROMPT },
      { role: "user", content: userPrompt }
    ]
  });

  const translations = response.output.translations;
  const {
    inputTokens,
    outputTokens,
    totalTokens,
    reasoningTokens,
    cachedInputTokens
  } = response.usage;
  return {
    translations,
    usage: { inputTokens, outputTokens, totalTokens, reasoningTokens, cachedInputTokens }
  };
}
|
|
4504
|
+
/**
 * Divides a translation chunk into two chunk requests at
 * `Math.floor(cueCount / 2)`.
 *
 * @param {{id: string, cueCount: number, cues: Array, cueBlocks: Array}} chunk
 * @returns {Array} Two `createTranslationChunkRequest` results with ids
 *   `${chunk.id}-a` (cues before the split index) and `${chunk.id}-b` (the rest).
 * @throws {Error} When the split index is `<= 0` or `>= cueCount`
 *   (for example a chunk holding zero or one cue).
 */
function splitTranslationChunkAtMidpoint(chunk) {
  const { id, cueCount, cues, cueBlocks } = chunk;
  const splitIndex = Math.floor(cueCount / 2);
  if (splitIndex <= 0 || splitIndex >= cueCount) {
    throw new Error(`Cannot split chunk ${id} with cueCount=${cueCount}`);
  }
  const firstHalf = createTranslationChunkRequest(
    `${id}-a`,
    cues.slice(0, splitIndex),
    cueBlocks.slice(0, splitIndex)
  );
  const secondHalf = createTranslationChunkRequest(
    `${id}-b`,
    cues.slice(splitIndex),
    cueBlocks.slice(splitIndex)
  );
  return [firstHalf, secondHalf];
}
|
|
4522
|
+
/**
 * Translates one chunk of cues, retrying on two half-sized chunks when the
 * attempt fails with an error that `shouldSplitChunkTranslationError` accepts.
 *
 * @param {object} params
 * @param {{id: string, cues: Array, cueBlocks: Array, cueCount: number, startTime: number, endTime: number}} params.chunk
 *   Chunk request to translate.
 * @param {string} params.fromLanguageCode - Source language code.
 * @param {string} params.toLanguageCode - Target language code.
 * @param {*} params.provider - Forwarded to `translateCueChunkWithAI`.
 * @param {*} params.modelId - Forwarded to `translateCueChunkWithAI`.
 * @param {*} params.credentials - Forwarded to `translateCueChunkWithAI`.
 * @returns {Promise<{translatedVtt: *, usage: *}>} Translated VTT for the chunk
 *   and the token usage (aggregated over both halves when a split occurred).
 * @throws {Error} When the failure is not splittable or the chunk has at most
 *   one cue. The original error is attached as `cause`.
 */
async function translateChunkWithFallback({
  chunk,
  fromLanguageCode,
  toLanguageCode,
  provider,
  modelId,
  credentials
}) {
  "use step";
  // Everything except `chunk` is passed through unchanged to nested calls.
  const translationArgs = { fromLanguageCode, toLanguageCode, provider, modelId, credentials };
  try {
    const result = await translateCueChunkWithAI({
      cues: chunk.cues,
      ...translationArgs
    });
    if (result.translations.length !== chunk.cueCount) {
      throw new TranslationChunkValidationError(
        `Chunk ${chunk.id} returned ${result.translations.length} cues, expected ${chunk.cueCount} for ${Math.round(chunk.startTime)}s-${Math.round(chunk.endTime)}s`
      );
    }
    return {
      translatedVtt: buildVttFromTranslatedCueBlocks(chunk.cueBlocks, result.translations),
      usage: result.usage
    };
  } catch (error) {
    if (!shouldSplitChunkTranslationError(error) || chunk.cueCount <= 1) {
      // Keep the underlying error reachable via `cause` so its stack and type
      // are not lost when the message is wrapped with chunk context.
      throw new Error(
        `Chunk ${chunk.id} failed for ${Math.round(chunk.startTime)}s-${Math.round(chunk.endTime)}s: ${error instanceof Error ? error.message : "Unknown error"}`,
        { cause: error }
      );
    }
    // Retry on both halves concurrently; failures from the halves propagate
    // as-is because they are raised outside the `try` above.
    const [leftChunk, rightChunk] = splitTranslationChunkAtMidpoint(chunk);
    const [leftResult, rightResult] = await Promise.all([
      translateChunkWithFallback({ chunk: leftChunk, ...translationArgs }),
      translateChunkWithFallback({ chunk: rightChunk, ...translationArgs })
    ]);
    return {
      translatedVtt: concatenateVttSegments([leftResult.translatedVtt, rightResult.translatedVtt]),
      usage: aggregateTokenUsage([leftResult.usage, rightResult.usage])
    };
  }
}
|
|
4580
|
+
/**
 * Translates a whole caption track, chunk by chunk when a chunk plan can be
 * built and as a single full-VTT request otherwise.
 *
 * @param {object} params
 * @param {string} params.vttContent - Source VTT text.
 * @param {number|undefined} params.assetDurationSeconds - Forwarded to
 *   `buildTranslationChunkRequests`.
 * @param {string} params.fromLanguageCode - Source language code.
 * @param {string} params.toLanguageCode - Target language code.
 * @param {*} params.provider - Model provider, forwarded to the translators.
 * @param {*} params.modelId - Model id, forwarded to the translators.
 * @param {*} params.credentials - Credentials, forwarded to the translators.
 * @param {object} [params.chunking] - Chunking options, resolved via
 *   `resolveTranslationChunkingOptions`.
 * @returns {Promise<{translatedVtt: *, usage: *}>} With a chunk plan: the
 *   concatenated translated VTT and aggregated usage. Without one: whatever
 *   `translateVttWithAI` returns.
 */
async function translateCaptionTrack({
  vttContent,
  assetDurationSeconds,
  fromLanguageCode,
  toLanguageCode,
  provider,
  modelId,
  credentials,
  chunking
}) {
  "use step";
  const translationArgs = { fromLanguageCode, toLanguageCode, provider, modelId, credentials };
  const plan = buildTranslationChunkRequests(vttContent, assetDurationSeconds, chunking);
  if (!plan) {
    // No usable chunk plan: translate the whole document in one request.
    return translateVttWithAI({ vttContent, ...translationArgs });
  }

  const { maxConcurrentTranslations } = resolveTranslationChunkingOptions(chunking);
  const translatedSegments = [];
  const usageByChunk = [];
  // Process the plan in consecutive batches; each batch runs concurrently and
  // results are appended in plan order.
  for (let offset = 0; offset < plan.chunks.length; offset += maxConcurrentTranslations) {
    const batch = plan.chunks.slice(offset, offset + maxConcurrentTranslations);
    const batchResults = await Promise.all(
      batch.map((chunk) => translateChunkWithFallback({ chunk, ...translationArgs }))
    );
    for (const { translatedVtt } of batchResults) {
      translatedSegments.push(translatedVtt);
    }
    for (const { usage } of batchResults) {
      usageByChunk.push(usage);
    }
  }

  return {
    translatedVtt: concatenateVttSegments(translatedSegments, plan.preamble),
    usage: aggregateTokenUsage(usageByChunk)
  };
}
|
|
3959
4627
|
async function uploadVttToS3({
|
|
3960
4628
|
translatedVtt,
|
|
3961
4629
|
assetId,
|
|
@@ -4016,7 +4684,8 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
|
|
|
4016
4684
|
s3Bucket: providedS3Bucket,
|
|
4017
4685
|
uploadToMux: uploadToMuxOption,
|
|
4018
4686
|
storageAdapter,
|
|
4019
|
-
credentials: providedCredentials
|
|
4687
|
+
credentials: providedCredentials,
|
|
4688
|
+
chunking
|
|
4020
4689
|
} = options;
|
|
4021
4690
|
const credentials = providedCredentials;
|
|
4022
4691
|
const effectiveStorageAdapter = storageAdapter;
|
|
@@ -4077,13 +4746,15 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
|
|
|
4077
4746
|
let translatedVtt;
|
|
4078
4747
|
let usage;
|
|
4079
4748
|
try {
|
|
4080
|
-
const result = await
|
|
4749
|
+
const result = await translateCaptionTrack({
|
|
4081
4750
|
vttContent,
|
|
4751
|
+
assetDurationSeconds,
|
|
4082
4752
|
fromLanguageCode,
|
|
4083
4753
|
toLanguageCode,
|
|
4084
4754
|
provider: modelConfig.provider,
|
|
4085
4755
|
modelId: modelConfig.modelId,
|
|
4086
|
-
credentials
|
|
4756
|
+
credentials,
|
|
4757
|
+
chunking
|
|
4087
4758
|
});
|
|
4088
4759
|
translatedVtt = result.translatedVtt;
|
|
4089
4760
|
usage = result.usage;
|
|
@@ -4156,6 +4827,7 @@ export {
|
|
|
4156
4827
|
HIVE_SEXUAL_CATEGORIES,
|
|
4157
4828
|
HIVE_VIOLENCE_CATEGORIES,
|
|
4158
4829
|
SUMMARY_KEYWORD_LIMIT,
|
|
4830
|
+
aggregateTokenUsage,
|
|
4159
4831
|
askQuestions,
|
|
4160
4832
|
burnedInCaptionsSchema,
|
|
4161
4833
|
chapterSchema,
|
|
@@ -4167,6 +4839,7 @@ export {
|
|
|
4167
4839
|
getSummaryAndTags,
|
|
4168
4840
|
hasBurnedInCaptions,
|
|
4169
4841
|
questionAnswerSchema,
|
|
4842
|
+
shouldSplitChunkTranslationError,
|
|
4170
4843
|
summarySchema,
|
|
4171
4844
|
translateAudio,
|
|
4172
4845
|
translateCaptions,
|