@mux/ai 0.8.2 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{index-Nxf6BaBO.d.ts → index-C8-E3VR9.d.ts} +59 -4
- package/dist/{index-DP02N3iR.d.ts → index-CA7bG50u.d.ts} +41 -2
- package/dist/index.d.ts +3 -3
- package/dist/index.js +908 -199
- package/dist/index.js.map +1 -1
- package/dist/primitives/index.d.ts +1 -1
- package/dist/primitives/index.js +336 -14
- package/dist/primitives/index.js.map +1 -1
- package/dist/workflows/index.d.ts +1 -1
- package/dist/workflows/index.js +900 -198
- package/dist/workflows/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -5,7 +5,7 @@ var __export = (target, all) => {
|
|
|
5
5
|
};
|
|
6
6
|
|
|
7
7
|
// package.json
|
|
8
|
-
var version = "0.
|
|
8
|
+
var version = "0.10.0";
|
|
9
9
|
|
|
10
10
|
// src/env.ts
|
|
11
11
|
import { z } from "zod";
|
|
@@ -783,9 +783,14 @@ var primitives_exports = {};
|
|
|
783
783
|
__export(primitives_exports, {
|
|
784
784
|
DEFAULT_STORYBOARD_WIDTH: () => DEFAULT_STORYBOARD_WIDTH,
|
|
785
785
|
buildTranscriptUrl: () => buildTranscriptUrl,
|
|
786
|
+
buildVttFromCueBlocks: () => buildVttFromCueBlocks,
|
|
787
|
+
buildVttFromTranslatedCueBlocks: () => buildVttFromTranslatedCueBlocks,
|
|
786
788
|
chunkByTokens: () => chunkByTokens,
|
|
787
789
|
chunkText: () => chunkText,
|
|
788
790
|
chunkVTTCues: () => chunkVTTCues,
|
|
791
|
+
chunkVTTCuesByBudget: () => chunkVTTCuesByBudget,
|
|
792
|
+
chunkVTTCuesByDuration: () => chunkVTTCuesByDuration,
|
|
793
|
+
concatenateVttSegments: () => concatenateVttSegments,
|
|
789
794
|
estimateTokenCount: () => estimateTokenCount,
|
|
790
795
|
extractTextFromVTT: () => extractTextFromVTT,
|
|
791
796
|
extractTimestampedTranscript: () => extractTimestampedTranscript,
|
|
@@ -801,7 +806,9 @@ __export(primitives_exports, {
|
|
|
801
806
|
getStoryboardUrl: () => getStoryboardUrl,
|
|
802
807
|
getThumbnailUrls: () => getThumbnailUrls,
|
|
803
808
|
parseVTTCues: () => parseVTTCues,
|
|
809
|
+
replaceCueText: () => replaceCueText,
|
|
804
810
|
secondsToTimestamp: () => secondsToTimestamp,
|
|
811
|
+
splitVttPreambleAndCueBlocks: () => splitVttPreambleAndCueBlocks,
|
|
805
812
|
vttTimestampToSeconds: () => vttTimestampToSeconds
|
|
806
813
|
});
|
|
807
814
|
|
|
@@ -1162,6 +1169,14 @@ async function getStoryboardUrl(playbackId, width = DEFAULT_STORYBOARD_WIDTH, sh
|
|
|
1162
1169
|
}
|
|
1163
1170
|
|
|
1164
1171
|
// src/primitives/text-chunking.ts
|
|
1172
|
+
// Tuning knobs for duration-based cue chunking (see chunkVTTCuesByDuration / scoreCueBoundary).

// Fraction of the target duration used as the minimum chunk duration when the caller gives none.
var DEFAULT_MIN_CHUNK_DURATION_RATIO = 2 / 3;
// Default for `options.boundaryLookaheadCues`.
var DEFAULT_BOUNDARY_LOOKAHEAD_CUES = 12;
// Default gap (seconds) between two cues that counts as a pause when scoring a boundary.
var DEFAULT_BOUNDARY_PAUSE_SECONDS = 1.25;
// A boundary scoring at least this much is accepted as soon as the target duration is reached.
var STRONG_BOUNDARY_SCORE = 4;
// Upper bound (seconds) on how far before the target duration the "preferred" boundary window opens.
var PREFERRED_BOUNDARY_WINDOW_SECONDS = 5 * 60;

// Cue text ending in sentence punctuation, optionally followed by closing quotes/brackets.
var SENTENCE_BOUNDARY_REGEX = /[.!?]["')\]]*$/;
// Cue text ending in clause punctuation, optionally followed by closing quotes/brackets.
var CLAUSE_BOUNDARY_REGEX = /[,;:]["')\]]*$/;
// Cue text starting with an uppercase ASCII letter, a digit, or an opening quote/bracket.
var NEXT_SENTENCE_START_REGEX = /^[A-Z0-9"'([{]/;
|
|
1165
1180
|
function estimateTokenCount(text) {
|
|
1166
1181
|
const words = text.trim().split(/\s+/).length;
|
|
1167
1182
|
return Math.ceil(words / 0.75);
|
|
@@ -1234,6 +1249,151 @@ function chunkVTTCues(cues, maxTokens, overlapCues = 2) {
|
|
|
1234
1249
|
}
|
|
1235
1250
|
return chunks;
|
|
1236
1251
|
}
|
|
1252
|
+
/**
 * Rates how suitable the gap after `cues[index]` is as a chunk boundary.
 *
 * Points: +4 if the cue text ends a sentence, otherwise +2 if it ends a clause;
 * +2 if the gap to the next cue is at least `boundaryPauseSeconds`;
 * +1 if the next cue's text looks like the start of a sentence.
 *
 * @param {Array<{startTime: number, endTime: number, text: string}>} cues
 * @param {number} index - Index of the cue that would end the chunk.
 * @param {number} boundaryPauseSeconds - Minimum gap that counts as a pause.
 * @returns {number} The score, or `Infinity` when there is no following cue.
 */
function scoreCueBoundary(cues, index, boundaryPauseSeconds) {
  const following = cues[index + 1];
  if (!following) {
    // Nothing comes after this cue, so ending here is always acceptable.
    return Number.POSITIVE_INFINITY;
  }
  const current = cues[index];
  const currentText = current.text.trim();

  let punctuationPoints = 0;
  if (SENTENCE_BOUNDARY_REGEX.test(currentText)) {
    punctuationPoints = 4;
  } else if (CLAUSE_BOUNDARY_REGEX.test(currentText)) {
    punctuationPoints = 2;
  }

  const gapSeconds = following.startTime - current.endTime;
  const pausePoints = gapSeconds >= boundaryPauseSeconds ? 2 : 0;
  const nextStartPoints = NEXT_SENTENCE_START_REGEX.test(following.text.trim()) ? 1 : 0;

  return punctuationPoints + pausePoints + nextStartPoints;
}
|
|
1273
|
+
/**
 * Splits cues into consecutive chunks bounded by a cue count and, optionally,
 * an estimated text-token budget.
 *
 * A new chunk is started before a cue when the open chunk already holds
 * `maxCuesPerChunk` cues, or when adding the cue's tokens would exceed
 * `maxTextTokensPerChunk` (a chunk always receives at least one cue).
 *
 * @param {Array<{startTime: number, endTime: number, text: string}>} cues
 * @param {{maxCuesPerChunk: number, maxTextTokensPerChunk?: number}} options
 *   Both limits are clamped to at least 1; a falsy token limit means "no token limit".
 * @returns {Array<{id: string, cueStartIndex: number, cueEndIndex: number,
 *   cueCount: number, startTime: number, endTime: number}>} Chunks with inclusive cue index ranges.
 */
function chunkVTTCuesByBudget(cues, options) {
  if (cues.length === 0) {
    return [];
  }
  const cueLimit = Math.max(1, options.maxCuesPerChunk);
  const tokenLimit = options.maxTextTokensPerChunk ?
    Math.max(1, options.maxTextTokensPerChunk) :
    Number.POSITIVE_INFINITY;

  const chunks = [];
  // Records the inclusive cue range [firstIndex, lastIndex] as the next chunk.
  const closeChunk = (firstIndex, lastIndex) => {
    chunks.push({
      id: `chunk-${chunks.length}`,
      cueStartIndex: firstIndex,
      cueEndIndex: lastIndex,
      cueCount: lastIndex - firstIndex + 1,
      startTime: cues[firstIndex].startTime,
      endTime: cues[lastIndex].endTime
    });
  };

  let openStart = 0;
  let openTokens = 0;
  for (let position = 0; position < cues.length; position += 1) {
    const tokens = estimateTokenCount(cues[position].text);
    const openCount = position - openStart;
    const cueBudgetSpent = openCount >= cueLimit;
    // The token budget never blocks the first cue of a chunk.
    const tokenBudgetSpent = openCount > 0 && openTokens + tokens > tokenLimit;
    if (cueBudgetSpent || tokenBudgetSpent) {
      closeChunk(openStart, position - 1);
      openStart = position;
      openTokens = 0;
    }
    openTokens += tokens;
  }
  // Whatever is still open becomes the final chunk.
  closeChunk(openStart, cues.length - 1);
  return chunks;
}
|
|
1317
|
+
/**
 * Splits cues into consecutive chunks of roughly `targetChunkDurationSeconds`,
 * preferring to cut where `scoreCueBoundary` reports a good boundary.
 *
 * For each chunk the cues are scanned forward. Every cue at or past the minimum
 * duration is a boundary candidate; the best-scoring one overall and the
 * best-scoring one inside the preferred window (shortly before the target) are
 * remembered, later cues winning ties. The chunk is closed when:
 *  - the target duration is reached and the remembered candidate scores at
 *    least STRONG_BOUNDARY_SCORE, or
 *  - including the next cue would exceed the maximum duration, or
 *  - the target duration is reached and the scan is already
 *    `boundaryLookaheadCues` cues past the chunk's first cue, or
 *  - there are no more cues.
 * In the second and third cases the chunk ends at the remembered candidate if
 * there is one, otherwise at the current cue.
 *
 * @param {Array<{startTime: number, endTime: number, text: string}>} cues
 * @param {{targetChunkDurationSeconds: number, maxChunkDurationSeconds: number,
 *   minChunkDurationSeconds?: number, boundaryLookaheadCues?: number,
 *   boundaryPauseSeconds?: number}} options
 * @returns {Array<{id: string, cueStartIndex: number, cueEndIndex: number,
 *   cueCount: number, startTime: number, endTime: number}>} Chunks with inclusive cue index ranges.
 */
function chunkVTTCuesByDuration(cues, options) {
  if (cues.length === 0) {
    return [];
  }

  // Resolve and clamp the duration settings: 1 <= min <= target <= max.
  const targetSeconds = Math.max(1, options.targetChunkDurationSeconds);
  const maxSeconds = Math.max(targetSeconds, options.maxChunkDurationSeconds);
  const requestedMinSeconds = options.minChunkDurationSeconds ?? Math.floor(targetSeconds * DEFAULT_MIN_CHUNK_DURATION_RATIO);
  const minSeconds = Math.min(targetSeconds, Math.max(1, requestedMinSeconds));
  const lookaheadCues = Math.max(1, options.boundaryLookaheadCues ?? DEFAULT_BOUNDARY_LOOKAHEAD_CUES);
  const pauseSeconds = options.boundaryPauseSeconds ?? DEFAULT_BOUNDARY_PAUSE_SECONDS;
  // The preferred window opens a sixth of the target (capped) before the target, never before the minimum.
  const preferredFromSeconds = Math.max(
    minSeconds,
    targetSeconds - Math.min(PREFERRED_BOUNDARY_WINDOW_SECONDS, targetSeconds / 6)
  );

  const chunks = [];
  let firstIndex = 0;
  while (firstIndex < cues.length) {
    const chunkStart = cues[firstIndex].startTime;
    let lastIndex = firstIndex;
    // Best candidate anywhere past the minimum, and best one inside the preferred window.
    let anyIndex = -1;
    let anyScore = -1;
    let windowIndex = -1;
    let windowScore = -1;

    while (lastIndex < cues.length) {
      const elapsed = cues[lastIndex].endTime - chunkStart;
      if (elapsed >= minSeconds) {
        const score = scoreCueBoundary(cues, lastIndex, pauseSeconds);
        // `>=` lets a later cue replace an earlier one with the same score.
        if (score >= anyScore) {
          anyIndex = lastIndex;
          anyScore = score;
        }
        if (elapsed >= preferredFromSeconds && score >= windowScore) {
          windowIndex = lastIndex;
          windowScore = score;
        }
      }

      const upcoming = cues[lastIndex + 1];
      if (!upcoming) {
        break;
      }

      const hasWindowCandidate = windowIndex >= firstIndex;
      const candidateIndex = hasWindowCandidate ? windowIndex : anyIndex;
      const candidateScore = hasWindowCandidate ? windowScore : anyScore;
      const hasCandidate = candidateIndex >= firstIndex;
      const reachedTarget = elapsed >= targetSeconds;
      const wouldOverrun = upcoming.endTime - chunkStart > maxSeconds;
      const lookaheadSpent = lastIndex - firstIndex >= lookaheadCues;

      if (reachedTarget && hasCandidate && candidateScore >= STRONG_BOUNDARY_SCORE) {
        lastIndex = candidateIndex;
        break;
      }
      if (wouldOverrun || (reachedTarget && lookaheadSpent)) {
        if (hasCandidate) {
          lastIndex = candidateIndex;
        }
        break;
      }
      lastIndex += 1;
    }

    chunks.push({
      id: `chunk-${chunks.length}`,
      cueStartIndex: firstIndex,
      cueEndIndex: lastIndex,
      cueCount: lastIndex - firstIndex + 1,
      startTime: cues[firstIndex].startTime,
      endTime: cues[lastIndex].endTime
    });
    firstIndex = lastIndex + 1;
  }
  return chunks;
}
|
|
1237
1397
|
function chunkText(text, strategy) {
|
|
1238
1398
|
switch (strategy.type) {
|
|
1239
1399
|
case "token": {
|
|
@@ -1275,10 +1435,8 @@ async function getThumbnailUrls(playbackId, duration, options = {}) {
|
|
|
1275
1435
|
}
|
|
1276
1436
|
const baseUrl = getMuxThumbnailBaseUrl(playbackId);
|
|
1277
1437
|
const urlPromises = timestamps.map(async (time) => {
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
}
|
|
1281
|
-
return `${baseUrl}?time=${time}&width=${width}`;
|
|
1438
|
+
const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
|
|
1439
|
+
return { url, time };
|
|
1282
1440
|
});
|
|
1283
1441
|
return Promise.all(urlPromises);
|
|
1284
1442
|
}
|
|
@@ -1300,24 +1458,82 @@ function findCaptionTrack(asset, languageCode) {
|
|
|
1300
1458
|
(track) => track.text_type === "subtitles" && track.language_code === languageCode
|
|
1301
1459
|
);
|
|
1302
1460
|
}
|
|
1461
|
+
/**
 * Converts every line terminator in `value` to a single "\n".
 *
 * Handles CRLF pairs and lone CR characters. A lone CR is a valid WebVTT line
 * terminator, and leaving it in place would make later `split("\n")` calls
 * treat several lines as one.
 *
 * @param {string} value - Text that may mix "\r\n", "\r" and "\n".
 * @returns {string} The text with only "\n" line terminators.
 */
function normalizeLineEndings(value) {
  return value.replace(/\r\n?/g, "\n");
}
|
|
1464
|
+
/**
 * Tells whether a line contains the `-->` arrow used by cue timing lines.
 *
 * @param {string} line
 * @returns {boolean} True when `-->` appears anywhere in the line.
 */
function isTimingLine(line) {
  const arrowPosition = line.indexOf("-->");
  return arrowPosition !== -1;
}
|
|
1467
|
+
/**
 * Parses a line made up only of ASCII digits as a base-10 integer.
 *
 * @param {string} line - Candidate line; no trimming is done here.
 * @returns {number|null} The parsed integer, or null when the line is not digits-only.
 */
function parseNumericCueIdentifier(line) {
  const isDigitsOnly = /^\d+$/.test(line);
  return isDigitsOnly ? Number.parseInt(line, 10) : null;
}
|
|
1473
|
+
/**
 * Tells whether a line has the shape "<digits> - <title>", e.g. "3 - Intro".
 *
 * @param {string} line
 * @returns {boolean} True when the line is digits, whitespace, a dash, whitespace, then non-blank text.
 */
function isLikelyTitledCueIdentifier(line) {
  const titledIdentifierPattern = /^\d+\s+-\s+\S.*$/;
  return titledIdentifierPattern.test(line);
}
|
|
1476
|
+
/**
 * Decides whether `line` is a cue identifier rather than cue text.
 *
 * The line must be non-empty and directly followed by a timing line. A purely
 * numeric line only counts when it continues the sequence: 1 when no previous
 * identifier is known, otherwise previous + 1. A non-numeric line counts when
 * it looks like a titled identifier ("<digits> - <title>").
 *
 * @param {{line: string, nextLine: (string|undefined), previousCueIdentifier: (number|null|undefined)}} params
 * @returns {boolean}
 */
function isLikelyCueIdentifier({
  line,
  nextLine,
  previousCueIdentifier
}) {
  const precedesTimingLine = Boolean(line) && Boolean(nextLine) && isTimingLine(nextLine);
  if (!precedesTimingLine) {
    return false;
  }
  const numericValue = parseNumericCueIdentifier(line);
  if (numericValue === null) {
    return isLikelyTitledCueIdentifier(line);
  }
  // `== null` covers both null and undefined: no identifier seen yet, so expect 1.
  const expectedValue = previousCueIdentifier == null ? 1 : previousCueIdentifier + 1;
  return numericValue === expectedValue;
}
|
|
1493
|
+
/**
 * Finds the index of the cue identifier line that belongs to a timing line.
 *
 * Only the line directly above the timing line is considered. It must be
 * non-blank, must not itself be a timing line, and must pass
 * `isLikelyCueIdentifier`.
 *
 * @param {string[]} lines - All lines of the VTT content.
 * @param {number} timingLineIndex - Index of the cue's timing line.
 * @param {number|null} previousCueIdentifier - Numeric identifier of the previous cue, if any.
 * @returns {number} Index of the identifier line, or -1 when there is none.
 */
function getCueIdentifierLineIndex(lines, timingLineIndex, previousCueIdentifier) {
  const candidateIndex = timingLineIndex - 1;
  if (candidateIndex < 0) {
    return -1;
  }
  const candidateLine = lines[candidateIndex].trim();
  if (candidateLine === "" || isTimingLine(candidateLine)) {
    return -1;
  }
  const looksLikeIdentifier = isLikelyCueIdentifier({
    line: candidateLine,
    nextLine: lines[timingLineIndex]?.trim(),
    previousCueIdentifier
  });
  return looksLikeIdentifier ? candidateIndex : -1;
}
|
|
1303
1508
|
function extractTextFromVTT(vttContent) {
|
|
1304
1509
|
if (!vttContent.trim()) {
|
|
1305
1510
|
return "";
|
|
1306
1511
|
}
|
|
1307
1512
|
const lines = vttContent.split("\n");
|
|
1308
1513
|
const textLines = [];
|
|
1514
|
+
let previousCueIdentifier = null;
|
|
1515
|
+
let isInsideNoteBlock = false;
|
|
1309
1516
|
for (let i = 0; i < lines.length; i++) {
|
|
1310
1517
|
const line = lines[i].trim();
|
|
1311
|
-
|
|
1518
|
+
const nextLine = lines[i + 1]?.trim();
|
|
1519
|
+
if (!line) {
|
|
1520
|
+
isInsideNoteBlock = false;
|
|
1521
|
+
continue;
|
|
1522
|
+
}
|
|
1523
|
+
if (isInsideNoteBlock)
|
|
1312
1524
|
continue;
|
|
1313
1525
|
if (line === "WEBVTT")
|
|
1314
1526
|
continue;
|
|
1315
|
-
if (line.startsWith("NOTE "))
|
|
1527
|
+
if (line === "NOTE" || line.startsWith("NOTE ")) {
|
|
1528
|
+
isInsideNoteBlock = true;
|
|
1316
1529
|
continue;
|
|
1317
|
-
|
|
1530
|
+
}
|
|
1531
|
+
if (isTimingLine(line))
|
|
1318
1532
|
continue;
|
|
1319
|
-
if (
|
|
1533
|
+
if (isLikelyCueIdentifier({ line, nextLine, previousCueIdentifier })) {
|
|
1534
|
+
previousCueIdentifier = parseNumericCueIdentifier(line);
|
|
1320
1535
|
continue;
|
|
1536
|
+
}
|
|
1321
1537
|
if (line.startsWith("STYLE") || line.startsWith("REGION"))
|
|
1322
1538
|
continue;
|
|
1323
1539
|
const cleanLine = line.replace(/<[^>]*>/g, "").trim();
|
|
@@ -1376,20 +1592,34 @@ function parseVTTCues(vttContent) {
|
|
|
1376
1592
|
return [];
|
|
1377
1593
|
const lines = vttContent.split("\n");
|
|
1378
1594
|
const cues = [];
|
|
1595
|
+
let previousCueIdentifier = null;
|
|
1379
1596
|
for (let i = 0; i < lines.length; i++) {
|
|
1380
1597
|
const line = lines[i].trim();
|
|
1381
|
-
if (line
|
|
1598
|
+
if (isTimingLine(line)) {
|
|
1382
1599
|
const [startStr, endStr] = line.split(" --> ").map((s) => s.trim());
|
|
1383
1600
|
const startTime = vttTimestampToSeconds(startStr);
|
|
1384
1601
|
const endTime = vttTimestampToSeconds(endStr.split(" ")[0]);
|
|
1385
|
-
const
|
|
1602
|
+
const currentCueIdentifierLine = lines[i - 1]?.trim() ?? "";
|
|
1603
|
+
const currentCueIdentifier = isLikelyCueIdentifier({
|
|
1604
|
+
line: currentCueIdentifierLine,
|
|
1605
|
+
nextLine: line,
|
|
1606
|
+
previousCueIdentifier
|
|
1607
|
+
}) ? parseNumericCueIdentifier(currentCueIdentifierLine) : null;
|
|
1608
|
+
const rawTextLines = [];
|
|
1386
1609
|
let j = i + 1;
|
|
1387
|
-
while (j < lines.length && lines[j].trim() && !lines[j].
|
|
1388
|
-
|
|
1389
|
-
if (cleanLine)
|
|
1390
|
-
textLines.push(cleanLine);
|
|
1610
|
+
while (j < lines.length && lines[j].trim() && !isTimingLine(lines[j].trim())) {
|
|
1611
|
+
rawTextLines.push(lines[j].trim());
|
|
1391
1612
|
j++;
|
|
1392
1613
|
}
|
|
1614
|
+
const trailingNumericLine = parseNumericCueIdentifier(rawTextLines.at(-1) ?? "");
|
|
1615
|
+
if (trailingNumericLine !== null && isLikelyCueIdentifier({
|
|
1616
|
+
line: rawTextLines.at(-1) ?? "",
|
|
1617
|
+
nextLine: lines[j]?.trim(),
|
|
1618
|
+
previousCueIdentifier: currentCueIdentifier
|
|
1619
|
+
}) && rawTextLines.length > 1) {
|
|
1620
|
+
rawTextLines.pop();
|
|
1621
|
+
}
|
|
1622
|
+
const textLines = rawTextLines.map((textLine) => textLine.replace(/<[^>]*>/g, "")).filter(Boolean);
|
|
1393
1623
|
if (textLines.length > 0) {
|
|
1394
1624
|
cues.push({
|
|
1395
1625
|
startTime,
|
|
@@ -1397,10 +1627,102 @@ function parseVTTCues(vttContent) {
|
|
|
1397
1627
|
text: textLines.join(" ")
|
|
1398
1628
|
});
|
|
1399
1629
|
}
|
|
1630
|
+
previousCueIdentifier = currentCueIdentifier;
|
|
1400
1631
|
}
|
|
1401
1632
|
}
|
|
1402
1633
|
return cues;
|
|
1403
1634
|
}
|
|
1635
|
+
/**
 * Separates VTT content into its preamble (everything before the first cue)
 * and a list of cue blocks.
 *
 * Blocks are normally delimited by blank lines. When any block from the first
 * cue onward contains more than one `-->` (cues not separated by a blank
 * line), the content is re-split line by line around each timing line,
 * attaching a preceding cue identifier line to its cue.
 *
 * @param {string} vttContent
 * @returns {{preamble: string, cueBlocks: string[]}} The preamble falls back to
 *   "WEBVTT" when empty; content with no cues is returned entirely as preamble,
 *   prefixed with "WEBVTT" if it does not already start with it.
 */
function splitVttPreambleAndCueBlocks(vttContent) {
  const content = normalizeLineEndings(vttContent).trim();
  if (!content) {
    return { preamble: "WEBVTT", cueBlocks: [] };
  }

  const blocks = content.split(/\n{2,}/).map((block) => block.trim()).filter(Boolean);
  const firstCueBlock = blocks.findIndex((block) => block.includes("-->"));
  if (firstCueBlock === -1) {
    return {
      preamble: content.startsWith("WEBVTT") ? content : `WEBVTT\n\n${content}`,
      cueBlocks: []
    };
  }

  const countArrows = (block) => (block.match(/-->/g) ?? []).length;
  const cueSection = blocks.slice(firstCueBlock);
  const hasMergedCues = cueSection.some((block) => countArrows(block) > 1);

  if (!hasMergedCues) {
    // Simple case: every cue is already its own blank-line-delimited block.
    const leadingBlocks = blocks.slice(0, firstCueBlock);
    return {
      preamble: leadingBlocks.length > 0 ? leadingBlocks.join("\n\n") : "WEBVTT",
      cueBlocks: cueSection
    };
  }

  // Merged case: rebuild the cue blocks from the individual lines.
  const lines = content.split("\n");
  const timingIndices = [];
  lines.forEach((line, lineIndex) => {
    if (isTimingLine(line.trim())) {
      timingIndices.push(lineIndex);
    }
  });

  const firstIdentifierIndex = getCueIdentifierLineIndex(lines, timingIndices[0], null);
  const preambleEnd = firstIdentifierIndex >= 0 ? firstIdentifierIndex : timingIndices[0];
  const preamble = lines.slice(0, preambleEnd).join("\n").trim() || "WEBVTT";

  // Walks backward from `from` over blank lines, never going at or below `floor`.
  const skipBlankLinesBackward = (from, floor) => {
    let position = from;
    while (position > floor && !lines[position].trim()) {
      position--;
    }
    return position;
  };

  const cueBlocks = [];
  let previousIdentifier = null;
  for (let k = 0; k < timingIndices.length; k++) {
    const timingIndex = timingIndices[k];
    const identifierIndex = getCueIdentifierLineIndex(lines, timingIndex, previousIdentifier);
    const hasIdentifier = identifierIndex >= 0;
    const blockStart = hasIdentifier ? identifierIndex : timingIndex;
    const identifier = hasIdentifier ? parseNumericCueIdentifier(lines[identifierIndex].trim()) : null;

    const nextTimingIndex = timingIndices[k + 1] ?? lines.length;
    let blockEnd = skipBlankLinesBackward(nextTimingIndex - 1, timingIndex);

    // If the last line before the next timing line is that cue's identifier, it is not ours.
    const isLastCue = k >= timingIndices.length - 1;
    const nextIdentifierIndex = isLastCue ? -1 : getCueIdentifierLineIndex(lines, nextTimingIndex, identifier);
    if (nextIdentifierIndex === blockEnd) {
      blockEnd--;
    }
    blockEnd = skipBlankLinesBackward(blockEnd, timingIndex);

    previousIdentifier = identifier;
    cueBlocks.push(lines.slice(blockStart, blockEnd + 1).join("\n").trim());
  }

  return { preamble, cueBlocks };
}
|
|
1693
|
+
/**
 * Assembles VTT text from a preamble and cue blocks.
 *
 * The trimmed preamble comes first; trimmed cue blocks follow, separated from
 * the preamble and from each other by one blank line. The result always ends
 * with a single newline.
 *
 * @param {string[]} cueBlocks
 * @param {string} [preamble="WEBVTT"]
 * @returns {string}
 */
function buildVttFromCueBlocks(cueBlocks, preamble = "WEBVTT") {
  const header = preamble.trim();
  if (cueBlocks.length === 0) {
    return `${header}\n`;
  }
  const body = cueBlocks.map((block) => block.trim()).join("\n\n");
  return `${header}\n\n${body}\n`;
}
|
|
1703
|
+
/**
 * Replaces the text of a cue block, keeping everything up to and including
 * its timing line (i.e. any identifier line plus the timing line).
 *
 * Lines of both inputs are trimmed and blank lines dropped.
 *
 * @param {string} cueBlock - Original cue block.
 * @param {string} translatedText - New cue text; may span several lines.
 * @returns {string} The rebuilt cue block, lines joined with "\n".
 * @throws {Error} When the cue block contains no line with `-->`.
 */
function replaceCueText(cueBlock, translatedText) {
  const toCleanLines = (text) => normalizeLineEndings(text).split("\n").map((line) => line.trim()).filter(Boolean);

  const blockLines = toCleanLines(cueBlock);
  const timingPosition = blockLines.findIndex((line) => line.includes("-->"));
  if (timingPosition === -1) {
    throw new Error("Cue block is missing a timestamp line");
  }
  const keptLines = blockLines.slice(0, timingPosition + 1);
  return keptLines.concat(toCleanLines(translatedText)).join("\n");
}
|
|
1713
|
+
/**
 * Builds VTT text in which each cue block's text is replaced by the
 * translation at the same index.
 *
 * @param {string[]} cueBlocks - Original cue blocks.
 * @param {string[]} translatedTexts - One translated text per cue block.
 * @param {string} [preamble="WEBVTT"]
 * @returns {string}
 * @throws {Error} When the two arrays differ in length.
 */
function buildVttFromTranslatedCueBlocks(cueBlocks, translatedTexts, preamble = "WEBVTT") {
  const expectedCount = cueBlocks.length;
  const receivedCount = translatedTexts.length;
  if (expectedCount !== receivedCount) {
    throw new Error(`Expected ${cueBlocks.length} translated cues, received ${translatedTexts.length}`);
  }
  const translatedBlocks = cueBlocks.map((originalBlock, position) => {
    const replacementText = translatedTexts[position];
    return replaceCueText(originalBlock, replacementText);
  });
  return buildVttFromCueBlocks(translatedBlocks, preamble);
}
|
|
1722
|
+
/**
 * Merges several VTT documents into one: the cue blocks of every segment are
 * collected in order and emitted under a single preamble. The segments' own
 * preambles are discarded.
 *
 * @param {string[]} segments - VTT documents to concatenate.
 * @param {string} [preamble="WEBVTT"] - Preamble of the combined document.
 * @returns {string}
 */
function concatenateVttSegments(segments, preamble = "WEBVTT") {
  const blocksPerSegment = segments.map((segment) => {
    const { cueBlocks } = splitVttPreambleAndCueBlocks(segment);
    return cueBlocks;
  });
  return buildVttFromCueBlocks(blocksPerSegment.flat(), preamble);
}
|
|
1404
1726
|
async function buildTranscriptUrl(playbackId, trackId, shouldSign = false, credentials) {
|
|
1405
1727
|
"use step";
|
|
1406
1728
|
const baseUrl = `https://stream.mux.com/${playbackId}/text/${trackId}.vtt`;
|
|
@@ -1466,6 +1788,7 @@ __export(workflows_exports, {
|
|
|
1466
1788
|
HIVE_SEXUAL_CATEGORIES: () => HIVE_SEXUAL_CATEGORIES,
|
|
1467
1789
|
HIVE_VIOLENCE_CATEGORIES: () => HIVE_VIOLENCE_CATEGORIES,
|
|
1468
1790
|
SUMMARY_KEYWORD_LIMIT: () => SUMMARY_KEYWORD_LIMIT,
|
|
1791
|
+
aggregateTokenUsage: () => aggregateTokenUsage,
|
|
1469
1792
|
askQuestions: () => askQuestions,
|
|
1470
1793
|
burnedInCaptionsSchema: () => burnedInCaptionsSchema,
|
|
1471
1794
|
chapterSchema: () => chapterSchema,
|
|
@@ -1477,6 +1800,7 @@ __export(workflows_exports, {
|
|
|
1477
1800
|
getSummaryAndTags: () => getSummaryAndTags,
|
|
1478
1801
|
hasBurnedInCaptions: () => hasBurnedInCaptions,
|
|
1479
1802
|
questionAnswerSchema: () => questionAnswerSchema,
|
|
1803
|
+
shouldSplitChunkTranslationError: () => shouldSplitChunkTranslationError,
|
|
1480
1804
|
summarySchema: () => summarySchema,
|
|
1481
1805
|
translateAudio: () => translateAudio,
|
|
1482
1806
|
translateCaptions: () => translateCaptions,
|
|
@@ -1718,6 +2042,12 @@ function createToneSection(instruction) {
|
|
|
1718
2042
|
content: instruction
|
|
1719
2043
|
};
|
|
1720
2044
|
}
|
|
2045
|
+
/**
 * Creates the `language` prompt section that instructs the model to write all
 * output in the given language.
 *
 * @param {string} languageName - Human-readable language name, interpolated as-is.
 * @returns {{tag: string, content: string}}
 */
function createLanguageSection(languageName) {
  const content = `All output (title, description, keywords, chapter titles) MUST be written in ${languageName}.`;
  return { tag: "language", content };
}
|
|
1721
2051
|
|
|
1722
2052
|
// src/lib/retry.ts
|
|
1723
2053
|
var DEFAULT_RETRY_OPTIONS = {
|
|
@@ -1828,6 +2158,7 @@ var SYSTEM_PROMPT = dedent`
|
|
|
1828
2158
|
- Only describe observable evidence from frames or transcript
|
|
1829
2159
|
- Do not fabricate details or make unsupported assumptions
|
|
1830
2160
|
- Return structured data matching the requested schema exactly
|
|
2161
|
+
- Provide reasoning in the same language as the question
|
|
1831
2162
|
</constraints>
|
|
1832
2163
|
|
|
1833
2164
|
<language_guidelines>
|
|
@@ -2228,6 +2559,166 @@ async function hasBurnedInCaptions(assetId, options = {}) {
|
|
|
2228
2559
|
import { generateText as generateText3, Output as Output3 } from "ai";
|
|
2229
2560
|
import dedent3 from "dedent";
|
|
2230
2561
|
import { z as z4 } from "zod";
|
|
2562
|
+
|
|
2563
|
+
// src/lib/language-codes.ts
|
|
2564
|
+
// Two-letter (ISO 639-1) to three-letter (ISO 639-3) language codes.
var ISO639_1_TO_3 = {
  // Major world languages
  en: "eng", // English
  es: "spa", // Spanish
  fr: "fra", // French
  de: "deu", // German
  it: "ita", // Italian
  pt: "por", // Portuguese
  ru: "rus", // Russian
  zh: "zho", // Chinese
  ja: "jpn", // Japanese
  ko: "kor", // Korean
  ar: "ara", // Arabic
  hi: "hin", // Hindi
  // European languages
  nl: "nld", // Dutch
  pl: "pol", // Polish
  sv: "swe", // Swedish
  da: "dan", // Danish
  no: "nor", // Norwegian
  fi: "fin", // Finnish
  el: "ell", // Greek
  cs: "ces", // Czech
  hu: "hun", // Hungarian
  ro: "ron", // Romanian
  bg: "bul", // Bulgarian
  hr: "hrv", // Croatian
  sk: "slk", // Slovak
  sl: "slv", // Slovenian
  uk: "ukr", // Ukrainian
  tr: "tur", // Turkish
  // Asian languages
  th: "tha", // Thai
  vi: "vie", // Vietnamese
  id: "ind", // Indonesian
  ms: "msa", // Malay
  tl: "tgl", // Tagalog/Filipino
  // Other languages
  he: "heb", // Hebrew
  fa: "fas", // Persian/Farsi
  bn: "ben", // Bengali
  ta: "tam", // Tamil
  te: "tel", // Telugu
  mr: "mar", // Marathi
  gu: "guj", // Gujarati
  kn: "kan", // Kannada
  ml: "mal", // Malayalam
  pa: "pan", // Punjabi
  ur: "urd", // Urdu
  sw: "swa", // Swahili
  af: "afr", // Afrikaans
  ca: "cat", // Catalan
  eu: "eus", // Basque
  gl: "glg", // Galician
  is: "isl", // Icelandic
  et: "est", // Estonian
  lv: "lav", // Latvian
  lt: "lit" // Lithuanian
};
// Reverse lookup (three-letter to two-letter), derived from the table above.
var ISO639_3_TO_1 = Object.fromEntries(
  Object.keys(ISO639_1_TO_3).map((twoLetter) => [ISO639_1_TO_3[twoLetter], twoLetter])
);
|
|
2679
|
+
/**
 * Converts a language code to its three-letter form.
 *
 * The code is lowercased and trimmed. Three-character codes are returned
 * unchanged; other codes are looked up in ISO639_1_TO_3 and returned
 * unchanged (normalized) when there is no entry.
 *
 * @param {string} code
 * @returns {string}
 */
function toISO639_3(code) {
  const normalized = code.toLowerCase().trim();
  if (normalized.length === 3) {
    return normalized;
  }
  // Own-property check: a plain `table[key] ?? key` would return inherited
  // members such as Object.prototype.constructor for inputs like "constructor".
  return Object.hasOwn(ISO639_1_TO_3, normalized) ? ISO639_1_TO_3[normalized] : normalized;
}
|
|
2686
|
+
/**
 * Converts a language code to its two-letter form.
 *
 * The code is lowercased and trimmed. Two-character codes are returned
 * unchanged; other codes are looked up in ISO639_3_TO_1 and returned
 * unchanged (normalized) when there is no entry.
 *
 * @param {string} code
 * @returns {string}
 */
function toISO639_1(code) {
  const normalized = code.toLowerCase().trim();
  if (normalized.length === 2) {
    return normalized;
  }
  // Own-property check: a plain `table[key] ?? key` would return inherited
  // members such as Object.prototype.constructor for inputs like "constructor".
  return Object.hasOwn(ISO639_3_TO_1, normalized) ? ISO639_3_TO_1[normalized] : normalized;
}
|
|
2693
|
+
/**
 * Returns both the two-letter and three-letter forms of a language code.
 *
 * The code is lowercased and trimmed first. A 2-character code is kept as the
 * two-letter form and converted for the other; a 3-character code is handled
 * the other way round; any other length is returned unchanged in both fields.
 *
 * @param {string} code
 * @returns {{iso639_1: string, iso639_3: string}}
 */
function getLanguageCodePair(code) {
  const normalized = code.toLowerCase().trim();
  switch (normalized.length) {
    case 2:
      return { iso639_1: normalized, iso639_3: toISO639_3(normalized) };
    case 3:
      return { iso639_1: toISO639_1(normalized), iso639_3: normalized };
    default:
      return { iso639_1: normalized, iso639_3: normalized };
  }
}
|
|
2711
|
+
/**
 * Returns the English display name of a language code.
 *
 * The code is converted with `toISO639_1` and resolved through
 * `Intl.DisplayNames`. If that yields nothing or throws, the original code is
 * returned in upper case.
 *
 * @param {string} code
 * @returns {string}
 */
function getLanguageName(code) {
  const shortCode = toISO639_1(code);
  const fallbackName = () => code.toUpperCase();
  try {
    const englishNames = new Intl.DisplayNames(["en"], { type: "language" });
    const resolvedName = englishNames.of(shortCode);
    return resolvedName ?? fallbackName();
  } catch {
    return fallbackName();
  }
}
|
|
2720
|
+
|
|
2721
|
+
// src/workflows/chapters.ts
|
|
2231
2722
|
var chapterSchema = z4.object({
|
|
2232
2723
|
startTime: z4.number(),
|
|
2233
2724
|
title: z4.string()
|
|
@@ -2288,7 +2779,8 @@ var chapterSystemPromptBuilder = createPromptBuilder({
|
|
|
2288
2779
|
content: dedent3`
|
|
2289
2780
|
- Only use information present in the transcript
|
|
2290
2781
|
- Return structured data that matches the requested JSON schema
|
|
2291
|
-
- Do not add commentary or extra text outside the JSON
|
|
2782
|
+
- Do not add commentary or extra text outside the JSON
|
|
2783
|
+
- When a <language> section is provided, all chapter titles MUST be written in that language`
|
|
2292
2784
|
},
|
|
2293
2785
|
qualityGuidelines: {
|
|
2294
2786
|
tag: "quality_guidelines",
|
|
@@ -2336,7 +2828,7 @@ var chaptersPromptBuilder = createPromptBuilder({
|
|
|
2336
2828
|
content: dedent3`
|
|
2337
2829
|
- Keep titles concise and descriptive
|
|
2338
2830
|
- Avoid filler or generic labels like "Chapter 1"
|
|
2339
|
-
- Use the transcript's
|
|
2831
|
+
- Use the transcript's terminology`
|
|
2340
2832
|
}
|
|
2341
2833
|
},
|
|
2342
2834
|
sectionOrder: ["task", "outputFormat", "chapterGuidelines", "titleGuidelines"]
|
|
@@ -2345,7 +2837,8 @@ function buildUserPrompt3({
|
|
|
2345
2837
|
timestampedTranscript,
|
|
2346
2838
|
promptOverrides,
|
|
2347
2839
|
minChaptersPerHour = 3,
|
|
2348
|
-
maxChaptersPerHour = 8
|
|
2840
|
+
maxChaptersPerHour = 8,
|
|
2841
|
+
languageName
|
|
2349
2842
|
}) {
|
|
2350
2843
|
const contextSections = [
|
|
2351
2844
|
{
|
|
@@ -2354,6 +2847,9 @@ function buildUserPrompt3({
|
|
|
2354
2847
|
attributes: { format: "seconds" }
|
|
2355
2848
|
}
|
|
2356
2849
|
];
|
|
2850
|
+
if (languageName) {
|
|
2851
|
+
contextSections.push(createLanguageSection(languageName));
|
|
2852
|
+
}
|
|
2357
2853
|
const dynamicChapterGuidelines = dedent3`
|
|
2358
2854
|
- Create at least ${minChaptersPerHour} and at most ${maxChaptersPerHour} chapters per hour of content
|
|
2359
2855
|
- Use start times in seconds (not HH:MM:SS)
|
|
@@ -2373,7 +2869,8 @@ async function generateChapters(assetId, languageCode, options = {}) {
|
|
|
2373
2869
|
promptOverrides,
|
|
2374
2870
|
minChaptersPerHour,
|
|
2375
2871
|
maxChaptersPerHour,
|
|
2376
|
-
credentials
|
|
2872
|
+
credentials,
|
|
2873
|
+
outputLanguageCode
|
|
2377
2874
|
} = options;
|
|
2378
2875
|
const modelConfig = resolveLanguageModelConfig({
|
|
2379
2876
|
...options,
|
|
@@ -2417,11 +2914,14 @@ async function generateChapters(assetId, languageCode, options = {}) {
|
|
|
2417
2914
|
const contentLabel = isAudioOnly ? "transcript" : "caption track";
|
|
2418
2915
|
throw new Error(`No usable content found in ${contentLabel}`);
|
|
2419
2916
|
}
|
|
2917
|
+
const resolvedLanguageCode = outputLanguageCode && outputLanguageCode !== "auto" ? outputLanguageCode : transcriptResult.track?.language_code ?? languageCode;
|
|
2918
|
+
const languageName = resolvedLanguageCode ? getLanguageName(resolvedLanguageCode) : void 0;
|
|
2420
2919
|
const userPrompt = buildUserPrompt3({
|
|
2421
2920
|
timestampedTranscript,
|
|
2422
2921
|
promptOverrides,
|
|
2423
2922
|
minChaptersPerHour,
|
|
2424
|
-
maxChaptersPerHour
|
|
2923
|
+
maxChaptersPerHour,
|
|
2924
|
+
languageName
|
|
2425
2925
|
});
|
|
2426
2926
|
let chaptersData = null;
|
|
2427
2927
|
try {
|
|
@@ -2748,6 +3248,7 @@ async function moderateImageWithOpenAI(entry) {
|
|
|
2748
3248
|
const categoryScores = json.results?.[0]?.category_scores || {};
|
|
2749
3249
|
return {
|
|
2750
3250
|
url: entry.url,
|
|
3251
|
+
time: entry.time,
|
|
2751
3252
|
sexual: categoryScores.sexual || 0,
|
|
2752
3253
|
violence: categoryScores.violence || 0,
|
|
2753
3254
|
error: false
|
|
@@ -2756,6 +3257,7 @@ async function moderateImageWithOpenAI(entry) {
|
|
|
2756
3257
|
console.error("OpenAI moderation failed:", error);
|
|
2757
3258
|
return {
|
|
2758
3259
|
url: entry.url,
|
|
3260
|
+
time: entry.time,
|
|
2759
3261
|
sexual: 0,
|
|
2760
3262
|
violence: 0,
|
|
2761
3263
|
error: true,
|
|
@@ -2763,11 +3265,13 @@ async function moderateImageWithOpenAI(entry) {
|
|
|
2763
3265
|
};
|
|
2764
3266
|
}
|
|
2765
3267
|
}
|
|
2766
|
-
async function requestOpenAIModeration(
|
|
3268
|
+
async function requestOpenAIModeration(images, model, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
|
|
2767
3269
|
"use step";
|
|
3270
|
+
const imageUrls = images.map((img) => img.url);
|
|
3271
|
+
const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
|
|
2768
3272
|
const targetUrls = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map(
|
|
2769
|
-
(img) => ({ url: img.url, image: img.base64Data, model, credentials })
|
|
2770
|
-
) :
|
|
3273
|
+
(img) => ({ url: img.url, time: timeByUrl.get(img.url), image: img.base64Data, model, credentials })
|
|
3274
|
+
) : images.map((img) => ({ url: img.url, time: img.time, image: img.url, model, credentials }));
|
|
2771
3275
|
return processConcurrently(targetUrls, moderateImageWithOpenAI, maxConcurrent);
|
|
2772
3276
|
}
|
|
2773
3277
|
async function requestOpenAITextModeration(text, model, url, credentials) {
|
|
@@ -2912,6 +3416,7 @@ async function moderateImageWithHive(entry) {
|
|
|
2912
3416
|
const violence = getHiveCategoryScores(classes, HIVE_VIOLENCE_CATEGORIES);
|
|
2913
3417
|
return {
|
|
2914
3418
|
url: entry.url,
|
|
3419
|
+
time: entry.time,
|
|
2915
3420
|
sexual,
|
|
2916
3421
|
violence,
|
|
2917
3422
|
error: false
|
|
@@ -2919,6 +3424,7 @@ async function moderateImageWithHive(entry) {
|
|
|
2919
3424
|
} catch (error) {
|
|
2920
3425
|
return {
|
|
2921
3426
|
url: entry.url,
|
|
3427
|
+
time: entry.time,
|
|
2922
3428
|
sexual: 0,
|
|
2923
3429
|
violence: 0,
|
|
2924
3430
|
error: true,
|
|
@@ -2926,19 +3432,23 @@ async function moderateImageWithHive(entry) {
|
|
|
2926
3432
|
};
|
|
2927
3433
|
}
|
|
2928
3434
|
}
|
|
2929
|
-
async function requestHiveModeration(
|
|
3435
|
+
async function requestHiveModeration(images, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
|
|
2930
3436
|
"use step";
|
|
3437
|
+
const imageUrls = images.map((img) => img.url);
|
|
3438
|
+
const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
|
|
2931
3439
|
const targets = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map((img) => ({
|
|
2932
3440
|
url: img.url,
|
|
3441
|
+
time: timeByUrl.get(img.url),
|
|
2933
3442
|
source: {
|
|
2934
3443
|
kind: "file",
|
|
2935
3444
|
buffer: img.buffer,
|
|
2936
3445
|
contentType: img.contentType
|
|
2937
3446
|
},
|
|
2938
3447
|
credentials
|
|
2939
|
-
})) :
|
|
2940
|
-
url,
|
|
2941
|
-
|
|
3448
|
+
})) : images.map((img) => ({
|
|
3449
|
+
url: img.url,
|
|
3450
|
+
time: img.time,
|
|
3451
|
+
source: { kind: "url", value: img.url },
|
|
2942
3452
|
credentials
|
|
2943
3453
|
}));
|
|
2944
3454
|
return await processConcurrently(targets, moderateImageWithHive, maxConcurrent);
|
|
@@ -2949,10 +3459,8 @@ async function getThumbnailUrlsFromTimestamps(playbackId, timestampsMs, options)
|
|
|
2949
3459
|
const baseUrl = getMuxThumbnailBaseUrl(playbackId);
|
|
2950
3460
|
const urlPromises = timestampsMs.map(async (tsMs) => {
|
|
2951
3461
|
const time = Number((tsMs / 1e3).toFixed(2));
|
|
2952
|
-
|
|
2953
|
-
|
|
2954
|
-
}
|
|
2955
|
-
return `${baseUrl}?time=${time}&width=${width}`;
|
|
3462
|
+
const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
|
|
3463
|
+
return { url, time };
|
|
2956
3464
|
});
|
|
2957
3465
|
return Promise.all(urlPromises);
|
|
2958
3466
|
}
|
|
@@ -3256,6 +3764,7 @@ var SYSTEM_PROMPT3 = dedent4`
|
|
|
3256
3764
|
- Do not fabricate details or make unsupported assumptions
|
|
3257
3765
|
- Return structured data matching the requested schema
|
|
3258
3766
|
- Output only the JSON object; no markdown or extra text
|
|
3767
|
+
- When a <language> section is provided, all output text MUST be written in that language
|
|
3259
3768
|
</constraints>
|
|
3260
3769
|
|
|
3261
3770
|
<tone_guidance>
|
|
@@ -3310,6 +3819,7 @@ var AUDIO_ONLY_SYSTEM_PROMPT = dedent4`
|
|
|
3310
3819
|
- Return structured data matching the requested schema
|
|
3311
3820
|
- Focus entirely on audio/spoken content - there are no visual elements
|
|
3312
3821
|
- Output only the JSON object; no markdown or extra text
|
|
3822
|
+
- When a <language> section is provided, all output text MUST be written in that language
|
|
3313
3823
|
</constraints>
|
|
3314
3824
|
|
|
3315
3825
|
<tone_guidance>
|
|
@@ -3340,9 +3850,13 @@ function buildUserPrompt4({
|
|
|
3340
3850
|
isAudioOnly = false,
|
|
3341
3851
|
titleLength,
|
|
3342
3852
|
descriptionLength,
|
|
3343
|
-
tagCount
|
|
3853
|
+
tagCount,
|
|
3854
|
+
languageName
|
|
3344
3855
|
}) {
|
|
3345
3856
|
const contextSections = [createToneSection(TONE_INSTRUCTIONS[tone])];
|
|
3857
|
+
if (languageName) {
|
|
3858
|
+
contextSections.push(createLanguageSection(languageName));
|
|
3859
|
+
}
|
|
3346
3860
|
if (transcriptText) {
|
|
3347
3861
|
const format = isCleanTranscript ? "plain text" : "WebVTT";
|
|
3348
3862
|
contextSections.push(createTranscriptSection(transcriptText, format));
|
|
@@ -3455,7 +3969,8 @@ async function getSummaryAndTags(assetId, options) {
|
|
|
3455
3969
|
credentials,
|
|
3456
3970
|
titleLength,
|
|
3457
3971
|
descriptionLength,
|
|
3458
|
-
tagCount
|
|
3972
|
+
tagCount,
|
|
3973
|
+
outputLanguageCode
|
|
3459
3974
|
} = options ?? {};
|
|
3460
3975
|
if (!VALID_TONES.includes(tone)) {
|
|
3461
3976
|
throw new Error(
|
|
@@ -3482,12 +3997,15 @@ async function getSummaryAndTags(assetId, options) {
|
|
|
3482
3997
|
"Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
|
|
3483
3998
|
);
|
|
3484
3999
|
}
|
|
3485
|
-
const
|
|
4000
|
+
const transcriptResult = includeTranscript ? await fetchTranscriptForAsset(assetData, playbackId, {
|
|
3486
4001
|
cleanTranscript,
|
|
3487
4002
|
shouldSign: policy === "signed",
|
|
3488
4003
|
credentials: workflowCredentials,
|
|
3489
4004
|
required: isAudioOnly
|
|
3490
|
-
})
|
|
4005
|
+
}) : void 0;
|
|
4006
|
+
const transcriptText = transcriptResult?.transcriptText ?? "";
|
|
4007
|
+
const resolvedLanguageCode = outputLanguageCode && outputLanguageCode !== "auto" ? outputLanguageCode : transcriptResult?.track?.language_code ?? getReadyTextTracks(assetData)[0]?.language_code;
|
|
4008
|
+
const languageName = resolvedLanguageCode ? getLanguageName(resolvedLanguageCode) : void 0;
|
|
3491
4009
|
const userPrompt = buildUserPrompt4({
|
|
3492
4010
|
tone,
|
|
3493
4011
|
transcriptText,
|
|
@@ -3496,7 +4014,8 @@ async function getSummaryAndTags(assetId, options) {
|
|
|
3496
4014
|
isAudioOnly,
|
|
3497
4015
|
titleLength,
|
|
3498
4016
|
descriptionLength,
|
|
3499
|
-
tagCount
|
|
4017
|
+
tagCount,
|
|
4018
|
+
languageName
|
|
3500
4019
|
});
|
|
3501
4020
|
let analysisResponse;
|
|
3502
4021
|
let imageUrl;
|
|
@@ -3566,164 +4085,6 @@ async function getSummaryAndTags(assetId, options) {
|
|
|
3566
4085
|
};
|
|
3567
4086
|
}
|
|
3568
4087
|
|
|
3569
|
-
// src/lib/language-codes.ts
|
|
3570
|
-
var ISO639_1_TO_3 = {
|
|
3571
|
-
// Major world languages
|
|
3572
|
-
en: "eng",
|
|
3573
|
-
// English
|
|
3574
|
-
es: "spa",
|
|
3575
|
-
// Spanish
|
|
3576
|
-
fr: "fra",
|
|
3577
|
-
// French
|
|
3578
|
-
de: "deu",
|
|
3579
|
-
// German
|
|
3580
|
-
it: "ita",
|
|
3581
|
-
// Italian
|
|
3582
|
-
pt: "por",
|
|
3583
|
-
// Portuguese
|
|
3584
|
-
ru: "rus",
|
|
3585
|
-
// Russian
|
|
3586
|
-
zh: "zho",
|
|
3587
|
-
// Chinese
|
|
3588
|
-
ja: "jpn",
|
|
3589
|
-
// Japanese
|
|
3590
|
-
ko: "kor",
|
|
3591
|
-
// Korean
|
|
3592
|
-
ar: "ara",
|
|
3593
|
-
// Arabic
|
|
3594
|
-
hi: "hin",
|
|
3595
|
-
// Hindi
|
|
3596
|
-
// European languages
|
|
3597
|
-
nl: "nld",
|
|
3598
|
-
// Dutch
|
|
3599
|
-
pl: "pol",
|
|
3600
|
-
// Polish
|
|
3601
|
-
sv: "swe",
|
|
3602
|
-
// Swedish
|
|
3603
|
-
da: "dan",
|
|
3604
|
-
// Danish
|
|
3605
|
-
no: "nor",
|
|
3606
|
-
// Norwegian
|
|
3607
|
-
fi: "fin",
|
|
3608
|
-
// Finnish
|
|
3609
|
-
el: "ell",
|
|
3610
|
-
// Greek
|
|
3611
|
-
cs: "ces",
|
|
3612
|
-
// Czech
|
|
3613
|
-
hu: "hun",
|
|
3614
|
-
// Hungarian
|
|
3615
|
-
ro: "ron",
|
|
3616
|
-
// Romanian
|
|
3617
|
-
bg: "bul",
|
|
3618
|
-
// Bulgarian
|
|
3619
|
-
hr: "hrv",
|
|
3620
|
-
// Croatian
|
|
3621
|
-
sk: "slk",
|
|
3622
|
-
// Slovak
|
|
3623
|
-
sl: "slv",
|
|
3624
|
-
// Slovenian
|
|
3625
|
-
uk: "ukr",
|
|
3626
|
-
// Ukrainian
|
|
3627
|
-
tr: "tur",
|
|
3628
|
-
// Turkish
|
|
3629
|
-
// Asian languages
|
|
3630
|
-
th: "tha",
|
|
3631
|
-
// Thai
|
|
3632
|
-
vi: "vie",
|
|
3633
|
-
// Vietnamese
|
|
3634
|
-
id: "ind",
|
|
3635
|
-
// Indonesian
|
|
3636
|
-
ms: "msa",
|
|
3637
|
-
// Malay
|
|
3638
|
-
tl: "tgl",
|
|
3639
|
-
// Tagalog/Filipino
|
|
3640
|
-
// Other languages
|
|
3641
|
-
he: "heb",
|
|
3642
|
-
// Hebrew
|
|
3643
|
-
fa: "fas",
|
|
3644
|
-
// Persian/Farsi
|
|
3645
|
-
bn: "ben",
|
|
3646
|
-
// Bengali
|
|
3647
|
-
ta: "tam",
|
|
3648
|
-
// Tamil
|
|
3649
|
-
te: "tel",
|
|
3650
|
-
// Telugu
|
|
3651
|
-
mr: "mar",
|
|
3652
|
-
// Marathi
|
|
3653
|
-
gu: "guj",
|
|
3654
|
-
// Gujarati
|
|
3655
|
-
kn: "kan",
|
|
3656
|
-
// Kannada
|
|
3657
|
-
ml: "mal",
|
|
3658
|
-
// Malayalam
|
|
3659
|
-
pa: "pan",
|
|
3660
|
-
// Punjabi
|
|
3661
|
-
ur: "urd",
|
|
3662
|
-
// Urdu
|
|
3663
|
-
sw: "swa",
|
|
3664
|
-
// Swahili
|
|
3665
|
-
af: "afr",
|
|
3666
|
-
// Afrikaans
|
|
3667
|
-
ca: "cat",
|
|
3668
|
-
// Catalan
|
|
3669
|
-
eu: "eus",
|
|
3670
|
-
// Basque
|
|
3671
|
-
gl: "glg",
|
|
3672
|
-
// Galician
|
|
3673
|
-
is: "isl",
|
|
3674
|
-
// Icelandic
|
|
3675
|
-
et: "est",
|
|
3676
|
-
// Estonian
|
|
3677
|
-
lv: "lav",
|
|
3678
|
-
// Latvian
|
|
3679
|
-
lt: "lit"
|
|
3680
|
-
// Lithuanian
|
|
3681
|
-
};
|
|
3682
|
-
var ISO639_3_TO_1 = Object.fromEntries(
|
|
3683
|
-
Object.entries(ISO639_1_TO_3).map(([iso1, iso3]) => [iso3, iso1])
|
|
3684
|
-
);
|
|
3685
|
-
function toISO639_3(code) {
|
|
3686
|
-
const normalized = code.toLowerCase().trim();
|
|
3687
|
-
if (normalized.length === 3) {
|
|
3688
|
-
return normalized;
|
|
3689
|
-
}
|
|
3690
|
-
return ISO639_1_TO_3[normalized] ?? normalized;
|
|
3691
|
-
}
|
|
3692
|
-
function toISO639_1(code) {
|
|
3693
|
-
const normalized = code.toLowerCase().trim();
|
|
3694
|
-
if (normalized.length === 2) {
|
|
3695
|
-
return normalized;
|
|
3696
|
-
}
|
|
3697
|
-
return ISO639_3_TO_1[normalized] ?? normalized;
|
|
3698
|
-
}
|
|
3699
|
-
function getLanguageCodePair(code) {
|
|
3700
|
-
const normalized = code.toLowerCase().trim();
|
|
3701
|
-
if (normalized.length === 2) {
|
|
3702
|
-
return {
|
|
3703
|
-
iso639_1: normalized,
|
|
3704
|
-
iso639_3: toISO639_3(normalized)
|
|
3705
|
-
};
|
|
3706
|
-
} else if (normalized.length === 3) {
|
|
3707
|
-
return {
|
|
3708
|
-
iso639_1: toISO639_1(normalized),
|
|
3709
|
-
iso639_3: normalized
|
|
3710
|
-
};
|
|
3711
|
-
}
|
|
3712
|
-
return {
|
|
3713
|
-
iso639_1: normalized,
|
|
3714
|
-
iso639_3: normalized
|
|
3715
|
-
};
|
|
3716
|
-
}
|
|
3717
|
-
function getLanguageName(code) {
|
|
3718
|
-
const iso639_1 = toISO639_1(code);
|
|
3719
|
-
try {
|
|
3720
|
-
const displayNames = new Intl.DisplayNames(["en"], { type: "language" });
|
|
3721
|
-
return displayNames.of(iso639_1) ?? code.toUpperCase();
|
|
3722
|
-
} catch {
|
|
3723
|
-
return code.toUpperCase();
|
|
3724
|
-
}
|
|
3725
|
-
}
|
|
3726
|
-
|
|
3727
4088
|
// src/lib/storage-adapter.ts
|
|
3728
4089
|
function requireCredentials(accessKeyId, secretAccessKey) {
|
|
3729
4090
|
if (!accessKeyId || !secretAccessKey) {
|
|
@@ -4168,12 +4529,187 @@ async function translateAudio(assetId, toLanguageCode, options = {}) {
|
|
|
4168
4529
|
}
|
|
4169
4530
|
|
|
4170
4531
|
// src/workflows/translate-captions.ts
|
|
4171
|
-
import {
|
|
4532
|
+
import {
|
|
4533
|
+
APICallError,
|
|
4534
|
+
generateText as generateText5,
|
|
4535
|
+
NoObjectGeneratedError,
|
|
4536
|
+
Output as Output5,
|
|
4537
|
+
RetryError,
|
|
4538
|
+
TypeValidationError
|
|
4539
|
+
} from "ai";
|
|
4540
|
+
import dedent5 from "dedent";
|
|
4172
4541
|
import { z as z6 } from "zod";
|
|
4173
4542
|
var translationSchema = z6.object({
|
|
4174
4543
|
translation: z6.string()
|
|
4175
4544
|
});
|
|
4176
|
-
var SYSTEM_PROMPT4 =
|
|
4545
|
+
var SYSTEM_PROMPT4 = dedent5`
|
|
4546
|
+
You are a subtitle translation expert. Translate VTT subtitle files to the target language specified by the user.
|
|
4547
|
+
You may receive either a full VTT file or a chunk from a larger VTT.
|
|
4548
|
+
Preserve all timestamps, cue ordering, and VTT formatting exactly as they appear.
|
|
4549
|
+
Return JSON with a single key "translation" containing the translated VTT content.
|
|
4550
|
+
`;
|
|
4551
|
+
var CUE_TRANSLATION_SYSTEM_PROMPT = dedent5`
|
|
4552
|
+
You are a subtitle translation expert.
|
|
4553
|
+
You will receive a sequence of subtitle cues extracted from a VTT file.
|
|
4554
|
+
Translate the cues to the requested target language while preserving their original order.
|
|
4555
|
+
Treat the cue list as continuous context so the translation reads naturally across adjacent lines.
|
|
4556
|
+
Return JSON with a single key "translations" containing exactly one translated string for each input cue.
|
|
4557
|
+
Do not merge, split, omit, reorder, or add cues.
|
|
4558
|
+
`;
|
|
4559
|
+
var DEFAULT_TRANSLATION_CHUNKING = {
|
|
4560
|
+
enabled: true,
|
|
4561
|
+
minimumAssetDurationSeconds: 30 * 60,
|
|
4562
|
+
targetChunkDurationSeconds: 30 * 60,
|
|
4563
|
+
maxConcurrentTranslations: 4,
|
|
4564
|
+
maxCuesPerChunk: 80,
|
|
4565
|
+
maxCueTextTokensPerChunk: 2e3
|
|
4566
|
+
};
|
|
4567
|
+
var TOKEN_USAGE_FIELDS = [
|
|
4568
|
+
"inputTokens",
|
|
4569
|
+
"outputTokens",
|
|
4570
|
+
"totalTokens",
|
|
4571
|
+
"reasoningTokens",
|
|
4572
|
+
"cachedInputTokens"
|
|
4573
|
+
];
|
|
4574
|
+
var TranslationChunkValidationError = class extends Error {
|
|
4575
|
+
constructor(message) {
|
|
4576
|
+
super(message);
|
|
4577
|
+
this.name = "TranslationChunkValidationError";
|
|
4578
|
+
}
|
|
4579
|
+
};
|
|
4580
|
+
function isTranslationChunkValidationError(error) {
|
|
4581
|
+
return error instanceof TranslationChunkValidationError;
|
|
4582
|
+
}
|
|
4583
|
+
function isProviderServiceError(error) {
|
|
4584
|
+
if (!error) {
|
|
4585
|
+
return false;
|
|
4586
|
+
}
|
|
4587
|
+
if (RetryError.isInstance(error)) {
|
|
4588
|
+
return isProviderServiceError(error.lastError);
|
|
4589
|
+
}
|
|
4590
|
+
if (APICallError.isInstance(error)) {
|
|
4591
|
+
return true;
|
|
4592
|
+
}
|
|
4593
|
+
if (error instanceof Error && "cause" in error) {
|
|
4594
|
+
return isProviderServiceError(error.cause);
|
|
4595
|
+
}
|
|
4596
|
+
return false;
|
|
4597
|
+
}
|
|
4598
|
+
function shouldSplitChunkTranslationError(error) {
|
|
4599
|
+
if (isProviderServiceError(error)) {
|
|
4600
|
+
return false;
|
|
4601
|
+
}
|
|
4602
|
+
return NoObjectGeneratedError.isInstance(error) || TypeValidationError.isInstance(error) || isTranslationChunkValidationError(error);
|
|
4603
|
+
}
|
|
4604
|
+
function isDefinedTokenUsageValue(value) {
|
|
4605
|
+
return typeof value === "number";
|
|
4606
|
+
}
|
|
4607
|
+
function resolveTranslationChunkingOptions(options) {
|
|
4608
|
+
const targetChunkDurationSeconds = Math.max(
|
|
4609
|
+
1,
|
|
4610
|
+
options?.targetChunkDurationSeconds ?? DEFAULT_TRANSLATION_CHUNKING.targetChunkDurationSeconds
|
|
4611
|
+
);
|
|
4612
|
+
return {
|
|
4613
|
+
enabled: options?.enabled ?? DEFAULT_TRANSLATION_CHUNKING.enabled,
|
|
4614
|
+
minimumAssetDurationSeconds: Math.max(
|
|
4615
|
+
1,
|
|
4616
|
+
options?.minimumAssetDurationSeconds ?? DEFAULT_TRANSLATION_CHUNKING.minimumAssetDurationSeconds
|
|
4617
|
+
),
|
|
4618
|
+
targetChunkDurationSeconds,
|
|
4619
|
+
maxConcurrentTranslations: Math.max(
|
|
4620
|
+
1,
|
|
4621
|
+
options?.maxConcurrentTranslations ?? DEFAULT_TRANSLATION_CHUNKING.maxConcurrentTranslations
|
|
4622
|
+
),
|
|
4623
|
+
maxCuesPerChunk: Math.max(
|
|
4624
|
+
1,
|
|
4625
|
+
options?.maxCuesPerChunk ?? DEFAULT_TRANSLATION_CHUNKING.maxCuesPerChunk
|
|
4626
|
+
),
|
|
4627
|
+
maxCueTextTokensPerChunk: Math.max(
|
|
4628
|
+
1,
|
|
4629
|
+
options?.maxCueTextTokensPerChunk ?? DEFAULT_TRANSLATION_CHUNKING.maxCueTextTokensPerChunk
|
|
4630
|
+
)
|
|
4631
|
+
};
|
|
4632
|
+
}
|
|
4633
|
+
function aggregateTokenUsage(usages) {
|
|
4634
|
+
return TOKEN_USAGE_FIELDS.reduce((aggregate, field) => {
|
|
4635
|
+
const values = usages.map((usage) => usage[field]).filter(isDefinedTokenUsageValue);
|
|
4636
|
+
if (values.length > 0) {
|
|
4637
|
+
aggregate[field] = values.reduce((total, value) => total + value, 0);
|
|
4638
|
+
}
|
|
4639
|
+
return aggregate;
|
|
4640
|
+
}, {});
|
|
4641
|
+
}
|
|
4642
|
+
function createTranslationChunkRequest(id, cues, cueBlocks) {
|
|
4643
|
+
return {
|
|
4644
|
+
id,
|
|
4645
|
+
cueCount: cues.length,
|
|
4646
|
+
startTime: cues[0].startTime,
|
|
4647
|
+
endTime: cues[cues.length - 1].endTime,
|
|
4648
|
+
cues,
|
|
4649
|
+
cueBlocks
|
|
4650
|
+
};
|
|
4651
|
+
}
|
|
4652
|
+
function splitTranslationChunkRequestByBudget(id, cues, cueBlocks, maxCuesPerChunk, maxCueTextTokensPerChunk) {
|
|
4653
|
+
const chunks = chunkVTTCuesByBudget(cues, {
|
|
4654
|
+
maxCuesPerChunk,
|
|
4655
|
+
maxTextTokensPerChunk: maxCueTextTokensPerChunk
|
|
4656
|
+
});
|
|
4657
|
+
return chunks.map(
|
|
4658
|
+
(chunk, index) => createTranslationChunkRequest(
|
|
4659
|
+
chunks.length === 1 ? id : `${id}-part-${index}`,
|
|
4660
|
+
cues.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1),
|
|
4661
|
+
cueBlocks.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1)
|
|
4662
|
+
)
|
|
4663
|
+
);
|
|
4664
|
+
}
|
|
4665
|
+
function buildTranslationChunkRequests(vttContent, assetDurationSeconds, chunkingOptions) {
|
|
4666
|
+
const resolvedChunking = resolveTranslationChunkingOptions(chunkingOptions);
|
|
4667
|
+
const cues = parseVTTCues(vttContent);
|
|
4668
|
+
if (cues.length === 0) {
|
|
4669
|
+
return null;
|
|
4670
|
+
}
|
|
4671
|
+
const { preamble, cueBlocks } = splitVttPreambleAndCueBlocks(vttContent);
|
|
4672
|
+
if (cueBlocks.length !== cues.length) {
|
|
4673
|
+
console.warn(
|
|
4674
|
+
`Falling back to full-VTT caption translation because cue block count (${cueBlocks.length}) does not match parsed cue count (${cues.length}).`
|
|
4675
|
+
);
|
|
4676
|
+
return null;
|
|
4677
|
+
}
|
|
4678
|
+
if (!resolvedChunking.enabled) {
|
|
4679
|
+
return {
|
|
4680
|
+
preamble,
|
|
4681
|
+
chunks: [
|
|
4682
|
+
createTranslationChunkRequest("chunk-0", cues, cueBlocks)
|
|
4683
|
+
]
|
|
4684
|
+
};
|
|
4685
|
+
}
|
|
4686
|
+
if (typeof assetDurationSeconds !== "number" || assetDurationSeconds < resolvedChunking.minimumAssetDurationSeconds) {
|
|
4687
|
+
return {
|
|
4688
|
+
preamble,
|
|
4689
|
+
chunks: [
|
|
4690
|
+
createTranslationChunkRequest("chunk-0", cues, cueBlocks)
|
|
4691
|
+
]
|
|
4692
|
+
};
|
|
4693
|
+
}
|
|
4694
|
+
const targetChunkDurationSeconds = resolvedChunking.targetChunkDurationSeconds;
|
|
4695
|
+
const durationChunks = chunkVTTCuesByDuration(cues, {
|
|
4696
|
+
targetChunkDurationSeconds,
|
|
4697
|
+
maxChunkDurationSeconds: Math.max(targetChunkDurationSeconds, Math.round(targetChunkDurationSeconds * (7 / 6))),
|
|
4698
|
+
minChunkDurationSeconds: Math.max(1, Math.round(targetChunkDurationSeconds * (2 / 3)))
|
|
4699
|
+
});
|
|
4700
|
+
return {
|
|
4701
|
+
preamble,
|
|
4702
|
+
chunks: durationChunks.flatMap(
|
|
4703
|
+
(chunk) => splitTranslationChunkRequestByBudget(
|
|
4704
|
+
chunk.id,
|
|
4705
|
+
cues.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1),
|
|
4706
|
+
cueBlocks.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1),
|
|
4707
|
+
resolvedChunking.maxCuesPerChunk,
|
|
4708
|
+
resolvedChunking.maxCueTextTokensPerChunk
|
|
4709
|
+
)
|
|
4710
|
+
)
|
|
4711
|
+
};
|
|
4712
|
+
}
|
|
4177
4713
|
async function fetchVttFromMux(vttUrl) {
|
|
4178
4714
|
"use step";
|
|
4179
4715
|
const vttResponse = await fetch(vttUrl);
|
|
@@ -4219,6 +4755,176 @@ ${vttContent}`
|
|
|
4219
4755
|
}
|
|
4220
4756
|
};
|
|
4221
4757
|
}
|
|
4758
|
+
async function translateCueChunkWithAI({
|
|
4759
|
+
cues,
|
|
4760
|
+
fromLanguageCode,
|
|
4761
|
+
toLanguageCode,
|
|
4762
|
+
provider,
|
|
4763
|
+
modelId,
|
|
4764
|
+
credentials
|
|
4765
|
+
}) {
|
|
4766
|
+
"use step";
|
|
4767
|
+
const model = await createLanguageModelFromConfig(provider, modelId, credentials);
|
|
4768
|
+
const schema = z6.object({
|
|
4769
|
+
translations: z6.array(z6.string().min(1)).length(cues.length)
|
|
4770
|
+
});
|
|
4771
|
+
const cuePayload = cues.map((cue, index) => ({
|
|
4772
|
+
index,
|
|
4773
|
+
startTime: cue.startTime,
|
|
4774
|
+
endTime: cue.endTime,
|
|
4775
|
+
text: cue.text
|
|
4776
|
+
}));
|
|
4777
|
+
const response = await generateText5({
|
|
4778
|
+
model,
|
|
4779
|
+
output: Output5.object({ schema }),
|
|
4780
|
+
messages: [
|
|
4781
|
+
{
|
|
4782
|
+
role: "system",
|
|
4783
|
+
content: CUE_TRANSLATION_SYSTEM_PROMPT
|
|
4784
|
+
},
|
|
4785
|
+
{
|
|
4786
|
+
role: "user",
|
|
4787
|
+
content: `Translate from ${fromLanguageCode} to ${toLanguageCode}.
|
|
4788
|
+
Return exactly ${cues.length} translated cues in the same order as the input.
|
|
4789
|
+
|
|
4790
|
+
${JSON.stringify(cuePayload, null, 2)}`
|
|
4791
|
+
}
|
|
4792
|
+
]
|
|
4793
|
+
});
|
|
4794
|
+
return {
|
|
4795
|
+
translations: response.output.translations,
|
|
4796
|
+
usage: {
|
|
4797
|
+
inputTokens: response.usage.inputTokens,
|
|
4798
|
+
outputTokens: response.usage.outputTokens,
|
|
4799
|
+
totalTokens: response.usage.totalTokens,
|
|
4800
|
+
reasoningTokens: response.usage.reasoningTokens,
|
|
4801
|
+
cachedInputTokens: response.usage.cachedInputTokens
|
|
4802
|
+
}
|
|
4803
|
+
};
|
|
4804
|
+
}
|
|
4805
|
+
function splitTranslationChunkAtMidpoint(chunk) {
|
|
4806
|
+
const midpoint = Math.floor(chunk.cueCount / 2);
|
|
4807
|
+
if (midpoint <= 0 || midpoint >= chunk.cueCount) {
|
|
4808
|
+
throw new Error(`Cannot split chunk ${chunk.id} with cueCount=${chunk.cueCount}`);
|
|
4809
|
+
}
|
|
4810
|
+
return [
|
|
4811
|
+
createTranslationChunkRequest(
|
|
4812
|
+
`${chunk.id}-a`,
|
|
4813
|
+
chunk.cues.slice(0, midpoint),
|
|
4814
|
+
chunk.cueBlocks.slice(0, midpoint)
|
|
4815
|
+
),
|
|
4816
|
+
createTranslationChunkRequest(
|
|
4817
|
+
`${chunk.id}-b`,
|
|
4818
|
+
chunk.cues.slice(midpoint),
|
|
4819
|
+
chunk.cueBlocks.slice(midpoint)
|
|
4820
|
+
)
|
|
4821
|
+
];
|
|
4822
|
+
}
|
|
4823
|
+
async function translateChunkWithFallback({
|
|
4824
|
+
chunk,
|
|
4825
|
+
fromLanguageCode,
|
|
4826
|
+
toLanguageCode,
|
|
4827
|
+
provider,
|
|
4828
|
+
modelId,
|
|
4829
|
+
credentials
|
|
4830
|
+
}) {
|
|
4831
|
+
"use step";
|
|
4832
|
+
try {
|
|
4833
|
+
const result = await translateCueChunkWithAI({
|
|
4834
|
+
cues: chunk.cues,
|
|
4835
|
+
fromLanguageCode,
|
|
4836
|
+
toLanguageCode,
|
|
4837
|
+
provider,
|
|
4838
|
+
modelId,
|
|
4839
|
+
credentials
|
|
4840
|
+
});
|
|
4841
|
+
if (result.translations.length !== chunk.cueCount) {
|
|
4842
|
+
throw new TranslationChunkValidationError(
|
|
4843
|
+
`Chunk ${chunk.id} returned ${result.translations.length} cues, expected ${chunk.cueCount} for ${Math.round(chunk.startTime)}s-${Math.round(chunk.endTime)}s`
|
|
4844
|
+
);
|
|
4845
|
+
}
|
|
4846
|
+
return {
|
|
4847
|
+
translatedVtt: buildVttFromTranslatedCueBlocks(chunk.cueBlocks, result.translations),
|
|
4848
|
+
usage: result.usage
|
|
4849
|
+
};
|
|
4850
|
+
} catch (error) {
|
|
4851
|
+
if (!shouldSplitChunkTranslationError(error) || chunk.cueCount <= 1) {
|
|
4852
|
+
throw new Error(
|
|
4853
|
+
`Chunk ${chunk.id} failed for ${Math.round(chunk.startTime)}s-${Math.round(chunk.endTime)}s: ${error instanceof Error ? error.message : "Unknown error"}`
|
|
4854
|
+
);
|
|
4855
|
+
}
|
|
4856
|
+
const [leftChunk, rightChunk] = splitTranslationChunkAtMidpoint(chunk);
|
|
4857
|
+
const [leftResult, rightResult] = await Promise.all([
|
|
4858
|
+
translateChunkWithFallback({
|
|
4859
|
+
chunk: leftChunk,
|
|
4860
|
+
fromLanguageCode,
|
|
4861
|
+
toLanguageCode,
|
|
4862
|
+
provider,
|
|
4863
|
+
modelId,
|
|
4864
|
+
credentials
|
|
4865
|
+
}),
|
|
4866
|
+
translateChunkWithFallback({
|
|
4867
|
+
chunk: rightChunk,
|
|
4868
|
+
fromLanguageCode,
|
|
4869
|
+
toLanguageCode,
|
|
4870
|
+
provider,
|
|
4871
|
+
modelId,
|
|
4872
|
+
credentials
|
|
4873
|
+
})
|
|
4874
|
+
]);
|
|
4875
|
+
return {
|
|
4876
|
+
translatedVtt: concatenateVttSegments([leftResult.translatedVtt, rightResult.translatedVtt]),
|
|
4877
|
+
usage: aggregateTokenUsage([leftResult.usage, rightResult.usage])
|
|
4878
|
+
};
|
|
4879
|
+
}
|
|
4880
|
+
}
|
|
4881
|
+
async function translateCaptionTrack({
|
|
4882
|
+
vttContent,
|
|
4883
|
+
assetDurationSeconds,
|
|
4884
|
+
fromLanguageCode,
|
|
4885
|
+
toLanguageCode,
|
|
4886
|
+
provider,
|
|
4887
|
+
modelId,
|
|
4888
|
+
credentials,
|
|
4889
|
+
chunking
|
|
4890
|
+
}) {
|
|
4891
|
+
"use step";
|
|
4892
|
+
const chunkPlan = buildTranslationChunkRequests(vttContent, assetDurationSeconds, chunking);
|
|
4893
|
+
if (!chunkPlan) {
|
|
4894
|
+
return translateVttWithAI({
|
|
4895
|
+
vttContent,
|
|
4896
|
+
fromLanguageCode,
|
|
4897
|
+
toLanguageCode,
|
|
4898
|
+
provider,
|
|
4899
|
+
modelId,
|
|
4900
|
+
credentials
|
|
4901
|
+
});
|
|
4902
|
+
}
|
|
4903
|
+
const resolvedChunking = resolveTranslationChunkingOptions(chunking);
|
|
4904
|
+
const translatedSegments = [];
|
|
4905
|
+
const usageByChunk = [];
|
|
4906
|
+
for (let index = 0; index < chunkPlan.chunks.length; index += resolvedChunking.maxConcurrentTranslations) {
|
|
4907
|
+
const batch = chunkPlan.chunks.slice(index, index + resolvedChunking.maxConcurrentTranslations);
|
|
4908
|
+
const batchResults = await Promise.all(
|
|
4909
|
+
batch.map(
|
|
4910
|
+
(chunk) => translateChunkWithFallback({
|
|
4911
|
+
chunk,
|
|
4912
|
+
fromLanguageCode,
|
|
4913
|
+
toLanguageCode,
|
|
4914
|
+
provider,
|
|
4915
|
+
modelId,
|
|
4916
|
+
credentials
|
|
4917
|
+
})
|
|
4918
|
+
)
|
|
4919
|
+
);
|
|
4920
|
+
translatedSegments.push(...batchResults.map((result) => result.translatedVtt));
|
|
4921
|
+
usageByChunk.push(...batchResults.map((result) => result.usage));
|
|
4922
|
+
}
|
|
4923
|
+
return {
|
|
4924
|
+
translatedVtt: concatenateVttSegments(translatedSegments, chunkPlan.preamble),
|
|
4925
|
+
usage: aggregateTokenUsage(usageByChunk)
|
|
4926
|
+
};
|
|
4927
|
+
}
|
|
4222
4928
|
async function uploadVttToS3({
|
|
4223
4929
|
translatedVtt,
|
|
4224
4930
|
assetId,
|
|
@@ -4279,7 +4985,8 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
|
|
|
4279
4985
|
s3Bucket: providedS3Bucket,
|
|
4280
4986
|
uploadToMux: uploadToMuxOption,
|
|
4281
4987
|
storageAdapter,
|
|
4282
|
-
credentials: providedCredentials
|
|
4988
|
+
credentials: providedCredentials,
|
|
4989
|
+
chunking
|
|
4283
4990
|
} = options;
|
|
4284
4991
|
const credentials = providedCredentials;
|
|
4285
4992
|
const effectiveStorageAdapter = storageAdapter;
|
|
@@ -4340,13 +5047,15 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
|
|
|
4340
5047
|
let translatedVtt;
|
|
4341
5048
|
let usage;
|
|
4342
5049
|
try {
|
|
4343
|
-
const result = await
|
|
5050
|
+
const result = await translateCaptionTrack({
|
|
4344
5051
|
vttContent,
|
|
5052
|
+
assetDurationSeconds,
|
|
4345
5053
|
fromLanguageCode,
|
|
4346
5054
|
toLanguageCode,
|
|
4347
5055
|
provider: modelConfig.provider,
|
|
4348
5056
|
modelId: modelConfig.modelId,
|
|
4349
|
-
credentials
|
|
5057
|
+
credentials,
|
|
5058
|
+
chunking
|
|
4350
5059
|
});
|
|
4351
5060
|
translatedVtt = result.translatedVtt;
|
|
4352
5061
|
usage = result.usage;
|