@mux/ai 0.8.2 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -5,7 +5,7 @@ var __export = (target, all) => {
5
5
  };
6
6
 
7
7
  // package.json
8
- var version = "0.8.2";
8
+ var version = "0.10.0";
9
9
 
10
10
  // src/env.ts
11
11
  import { z } from "zod";
@@ -783,9 +783,14 @@ var primitives_exports = {};
783
783
  __export(primitives_exports, {
784
784
  DEFAULT_STORYBOARD_WIDTH: () => DEFAULT_STORYBOARD_WIDTH,
785
785
  buildTranscriptUrl: () => buildTranscriptUrl,
786
+ buildVttFromCueBlocks: () => buildVttFromCueBlocks,
787
+ buildVttFromTranslatedCueBlocks: () => buildVttFromTranslatedCueBlocks,
786
788
  chunkByTokens: () => chunkByTokens,
787
789
  chunkText: () => chunkText,
788
790
  chunkVTTCues: () => chunkVTTCues,
791
+ chunkVTTCuesByBudget: () => chunkVTTCuesByBudget,
792
+ chunkVTTCuesByDuration: () => chunkVTTCuesByDuration,
793
+ concatenateVttSegments: () => concatenateVttSegments,
789
794
  estimateTokenCount: () => estimateTokenCount,
790
795
  extractTextFromVTT: () => extractTextFromVTT,
791
796
  extractTimestampedTranscript: () => extractTimestampedTranscript,
@@ -801,7 +806,9 @@ __export(primitives_exports, {
801
806
  getStoryboardUrl: () => getStoryboardUrl,
802
807
  getThumbnailUrls: () => getThumbnailUrls,
803
808
  parseVTTCues: () => parseVTTCues,
809
+ replaceCueText: () => replaceCueText,
804
810
  secondsToTimestamp: () => secondsToTimestamp,
811
+ splitVttPreambleAndCueBlocks: () => splitVttPreambleAndCueBlocks,
805
812
  vttTimestampToSeconds: () => vttTimestampToSeconds
806
813
  });
807
814
 
@@ -1162,6 +1169,14 @@ async function getStoryboardUrl(playbackId, width = DEFAULT_STORYBOARD_WIDTH, sh
1162
1169
  }
1163
1170
 
1164
1171
  // src/primitives/text-chunking.ts
// Tuning knobs for cue-boundary scoring and duration-based chunking.
// Kept as top-level `var` declarations to match the rest of this bundled module.

// Default minimum chunk duration, expressed as a fraction of the target duration.
var DEFAULT_MIN_CHUNK_DURATION_RATIO = 2 / 3;
// Fallback for `options.boundaryLookaheadCues` in chunkVTTCuesByDuration.
var DEFAULT_BOUNDARY_LOOKAHEAD_CUES = 12;
// Fallback for `options.boundaryPauseSeconds`: a gap at least this long between cues scores as a pause.
var DEFAULT_BOUNDARY_PAUSE_SECONDS = 1.25;
// Once the target duration is reached, a boundary scoring at least this much is taken immediately.
var STRONG_BOUNDARY_SCORE = 4;
// Upper limit (in seconds) on the "preferred boundary" window that precedes the target duration.
var PREFERRED_BOUNDARY_WINDOW_SECONDS = 5 * 60;
// Cue text ending in sentence punctuation, optionally followed by closing quotes/brackets.
var SENTENCE_BOUNDARY_REGEX = /[.!?]["')\]]*$/;
// Cue text ending in clause punctuation, optionally followed by closing quotes/brackets.
var CLAUSE_BOUNDARY_REGEX = /[,;:]["')\]]*$/;
// Cue text that starts with an uppercase ASCII letter, a digit, or an opening quote/bracket.
var NEXT_SENTENCE_START_REGEX = /^[A-Z0-9"'([{]/;
1165
1180
  function estimateTokenCount(text) {
1166
1181
  const words = text.trim().split(/\s+/).length;
1167
1182
  return Math.ceil(words / 0.75);
@@ -1234,6 +1249,151 @@ function chunkVTTCues(cues, maxTokens, overlapCues = 2) {
1234
1249
  }
1235
1250
  return chunks;
1236
1251
  }
/**
 * Rates how suitable the gap after `cues[index]` is as a chunk boundary.
 *
 * Points: +4 when the cue text ends a sentence, otherwise +2 when it ends a
 * clause; +2 when the gap before the next cue is at least
 * `boundaryPauseSeconds`; +1 when the next cue looks like a sentence start.
 *
 * @param {Array<{startTime: number, endTime: number, text: string}>} cues
 * @param {number} index - Position of the cue that would end the chunk.
 * @param {number} boundaryPauseSeconds - Minimum gap that counts as a pause.
 * @returns {number} The score, or `Infinity` when there is no following cue.
 */
function scoreCueBoundary(cues, index, boundaryPauseSeconds) {
  const following = cues[index + 1];
  if (!following) {
    // The end of the transcript is always an acceptable boundary.
    return Number.POSITIVE_INFINITY;
  }
  const current = cues[index];
  const endingText = current.text.trim();

  let punctuationPoints = 0;
  if (SENTENCE_BOUNDARY_REGEX.test(endingText)) {
    punctuationPoints = 4;
  } else if (CLAUSE_BOUNDARY_REGEX.test(endingText)) {
    punctuationPoints = 2;
  }

  const gapSeconds = following.startTime - current.endTime;
  const pausePoints = gapSeconds >= boundaryPauseSeconds ? 2 : 0;
  const sentenceStartPoints = NEXT_SENTENCE_START_REGEX.test(following.text.trim()) ? 1 : 0;

  return punctuationPoints + pausePoints + sentenceStartPoints;
}
/**
 * Splits cues into consecutive chunks limited by cue count and, optionally,
 * by estimated text tokens.
 *
 * A chunk is closed before a cue that would push it past either limit. A
 * single cue is never split, so a chunk holding one oversized cue may exceed
 * the token limit.
 *
 * @param {Array<{startTime: number, endTime: number, text: string}>} cues
 * @param {{maxCuesPerChunk: number, maxTextTokensPerChunk?: number}} options
 *   `maxCuesPerChunk` is clamped to at least 1; a falsy
 *   `maxTextTokensPerChunk` disables the token limit.
 * @returns {Array<{id: string, cueStartIndex: number, cueEndIndex: number,
 *   cueCount: number, startTime: number, endTime: number}>} Chunk descriptors
 *   with inclusive cue index ranges; empty when `cues` is empty.
 */
function chunkVTTCuesByBudget(cues, options) {
  if (cues.length === 0) {
    return [];
  }
  const cueLimit = Math.max(1, options.maxCuesPerChunk);
  const tokenLimit = options.maxTextTokensPerChunk
    ? Math.max(1, options.maxTextTokensPerChunk)
    : Number.POSITIVE_INFINITY;

  const chunks = [];
  let firstCueOfChunk = 0;
  let tokensInChunk = 0;

  // Records the chunk spanning firstCueOfChunk..lastCueOfChunk (inclusive).
  const closeChunk = (lastCueOfChunk) => {
    chunks.push({
      id: `chunk-${chunks.length}`,
      cueStartIndex: firstCueOfChunk,
      cueEndIndex: lastCueOfChunk,
      cueCount: lastCueOfChunk - firstCueOfChunk + 1,
      startTime: cues[firstCueOfChunk].startTime,
      endTime: cues[lastCueOfChunk].endTime
    });
  };

  for (let position = 0; position < cues.length; position++) {
    const cueTokens = estimateTokenCount(cues[position].text);
    const cuesInChunk = position - firstCueOfChunk;
    const cueBudgetSpent = cuesInChunk >= cueLimit;
    // An empty chunk always accepts its first cue, whatever its size.
    const tokenBudgetSpent = cuesInChunk > 0 && tokensInChunk + cueTokens > tokenLimit;
    if (cueBudgetSpent || tokenBudgetSpent) {
      closeChunk(position - 1);
      firstCueOfChunk = position;
      tokensInChunk = 0;
    }
    tokensInChunk += cueTokens;
  }

  closeChunk(cues.length - 1);
  return chunks;
}
/**
 * Splits cues into consecutive chunks of roughly `targetChunkDurationSeconds`,
 * preferring to cut at boundaries that score well in `scoreCueBoundary`.
 *
 * Boundaries are only considered once a chunk has reached the minimum
 * duration. Candidates inside the "preferred" window just before the target
 * take precedence over earlier ones; among equal scores the later cue wins.
 *
 * @param {Array<{startTime: number, endTime: number, text: string}>} cues
 * @param {object} options
 * @param {number} options.targetChunkDurationSeconds - Clamped to at least 1.
 * @param {number} options.maxChunkDurationSeconds - Raised to the target when smaller.
 * @param {number} [options.minChunkDurationSeconds] - Defaults to
 *   floor(target * DEFAULT_MIN_CHUNK_DURATION_RATIO); clamped to [1, target].
 * @param {number} [options.boundaryLookaheadCues] - Clamped to at least 1.
 * @param {number} [options.boundaryPauseSeconds]
 * @returns {Array<{id: string, cueStartIndex: number, cueEndIndex: number,
 *   cueCount: number, startTime: number, endTime: number}>} Chunk descriptors
 *   with inclusive cue index ranges; empty when `cues` is empty.
 */
function chunkVTTCuesByDuration(cues, options) {
  if (cues.length === 0) {
    return [];
  }
  const targetSeconds = Math.max(1, options.targetChunkDurationSeconds);
  const maxSeconds = Math.max(targetSeconds, options.maxChunkDurationSeconds);
  const requestedMinSeconds = options.minChunkDurationSeconds ?? Math.floor(targetSeconds * DEFAULT_MIN_CHUNK_DURATION_RATIO);
  const minSeconds = Math.min(targetSeconds, Math.max(1, requestedMinSeconds));
  const lookaheadCues = Math.max(1, options.boundaryLookaheadCues ?? DEFAULT_BOUNDARY_LOOKAHEAD_CUES);
  const pauseSeconds = options.boundaryPauseSeconds ?? DEFAULT_BOUNDARY_PAUSE_SECONDS;
  const preferredWindowSeconds = Math.min(PREFERRED_BOUNDARY_WINDOW_SECONDS, targetSeconds / 6);
  const preferredFromSeconds = Math.max(minSeconds, targetSeconds - preferredWindowSeconds);

  // Returns the inclusive index of the last cue of the chunk starting at firstIndex.
  const findChunkEnd = (firstIndex) => {
    const chunkStartTime = cues[firstIndex].startTime;
    let bestAnywhere = { index: -1, score: -1 };
    let bestPreferred = { index: -1, score: -1 };

    for (let cursor = firstIndex; ; cursor++) {
      const elapsed = cues[cursor].endTime - chunkStartTime;
      if (elapsed >= minSeconds) {
        const score = scoreCueBoundary(cues, cursor, pauseSeconds);
        // `>=` so that, among equal scores, the latest boundary is kept.
        if (score >= bestAnywhere.score) {
          bestAnywhere = { index: cursor, score };
        }
        if (elapsed >= preferredFromSeconds && score >= bestPreferred.score) {
          bestPreferred = { index: cursor, score };
        }
      }

      const upcoming = cues[cursor + 1];
      if (!upcoming) {
        // Ran out of cues: the chunk extends to the end of the transcript.
        return cursor;
      }

      const elapsedWithNext = upcoming.endTime - chunkStartTime;
      const candidate = bestPreferred.index >= firstIndex ? bestPreferred : bestAnywhere;
      const hasCandidate = candidate.index >= firstIndex;
      const targetReached = elapsed >= targetSeconds;

      if (targetReached && hasCandidate && candidate.score >= STRONG_BOUNDARY_SCORE) {
        return candidate.index;
      }

      // The lookahead is counted in cues from the start of the chunk.
      const lookaheadSpent = cursor - firstIndex >= lookaheadCues;
      const mustCutNow = elapsedWithNext > maxSeconds || (targetReached && lookaheadSpent);
      if (mustCutNow) {
        return hasCandidate ? candidate.index : cursor;
      }
    }
  };

  const chunks = [];
  let firstIndex = 0;
  while (firstIndex < cues.length) {
    const lastIndex = findChunkEnd(firstIndex);
    chunks.push({
      id: `chunk-${chunks.length}`,
      cueStartIndex: firstIndex,
      cueEndIndex: lastIndex,
      cueCount: lastIndex - firstIndex + 1,
      startTime: cues[firstIndex].startTime,
      endTime: cues[lastIndex].endTime
    });
    firstIndex = lastIndex + 1;
  }
  return chunks;
}
1237
1397
  function chunkText(text, strategy) {
1238
1398
  switch (strategy.type) {
1239
1399
  case "token": {
@@ -1275,10 +1435,8 @@ async function getThumbnailUrls(playbackId, duration, options = {}) {
1275
1435
  }
1276
1436
  const baseUrl = getMuxThumbnailBaseUrl(playbackId);
1277
1437
  const urlPromises = timestamps.map(async (time) => {
1278
- if (shouldSign) {
1279
- return signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials);
1280
- }
1281
- return `${baseUrl}?time=${time}&width=${width}`;
1438
+ const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
1439
+ return { url, time };
1282
1440
  });
1283
1441
  return Promise.all(urlPromises);
1284
1442
  }
@@ -1300,24 +1458,82 @@ function findCaptionTrack(asset, languageCode) {
1300
1458
  (track) => track.text_type === "subtitles" && track.language_code === languageCode
1301
1459
  );
1302
1460
  }
/**
 * Converts every line terminator in `value` to a single "\n".
 *
 * WebVTT permits CRLF, LF and a lone CR as line terminators, so both "\r\n"
 * and a bare "\r" are rewritten. Handling only CRLF would leave CR-only files
 * as one long line for the "\n"-based splitting done by callers.
 *
 * @param {string} value - Raw text.
 * @returns {string} The text with LF-only line endings.
 */
function normalizeLineEndings(value) {
  // "\r\n?" consumes a CRLF pair as one terminator before matching a lone CR.
  return value.replace(/\r\n?/g, "\n");
}
/**
 * Tells whether a line contains the "-->" timing arrow.
 *
 * @param {string} line
 * @returns {boolean} True when the line contains "-->".
 */
function isTimingLine(line) {
  const containsTimingArrow = line.includes("-->");
  return containsTimingArrow;
}
/**
 * Parses a line made up solely of decimal digits as a cue number.
 *
 * @param {string} line - Candidate identifier line; it is not trimmed here.
 * @returns {number|null} The parsed integer, or null when the line is not all digits.
 */
function parseNumericCueIdentifier(line) {
  const isAllDigits = /^\d+$/.test(line);
  return isAllDigits ? Number.parseInt(line, 10) : null;
}
/**
 * Recognizes identifiers shaped like "<number> - <title>", e.g. "3 - Intro".
 *
 * @param {string} line
 * @returns {boolean} True when the line is digits, a spaced hyphen, then non-blank text.
 */
function isLikelyTitledCueIdentifier(line) {
  const titledIdentifierPattern = /^\d+\s+-\s+\S.*$/;
  return titledIdentifierPattern.test(line);
}
/**
 * Decides whether `line` is a cue identifier rather than cue text.
 *
 * The line must sit directly above a timing line. A purely numeric line
 * qualifies only when it continues the sequence: 1 when no identifier has
 * been seen yet, otherwise the previous identifier plus one. Any other line
 * qualifies only in the "<number> - <title>" form.
 *
 * @param {object} params
 * @param {string} params.line - Candidate identifier line.
 * @param {string|undefined} params.nextLine - The line following the candidate.
 * @param {number|null|undefined} params.previousCueIdentifier - Last numeric identifier seen, if any.
 * @returns {boolean}
 */
function isLikelyCueIdentifier({ line, nextLine, previousCueIdentifier }) {
  const precedesTimingLine = Boolean(line) && Boolean(nextLine) && isTimingLine(nextLine);
  if (!precedesTimingLine) {
    return false;
  }
  const numericIdentifier = parseNumericCueIdentifier(line);
  if (numericIdentifier === null) {
    return isLikelyTitledCueIdentifier(line);
  }
  // `== null` matches both null and undefined: no identifier seen so far.
  const expectedIdentifier = previousCueIdentifier == null ? 1 : previousCueIdentifier + 1;
  return numericIdentifier === expectedIdentifier;
}
/**
 * Finds the identifier line belonging to the cue whose timing line sits at
 * `timingLineIndex`.
 *
 * Only the line immediately above the timing line is examined.
 *
 * @param {string[]} lines - All lines of the VTT document.
 * @param {number} timingLineIndex - Index of the cue's timing line.
 * @param {number|null|undefined} previousCueIdentifier - Last numeric identifier seen, if any.
 * @returns {number} Index of the identifier line, or -1 when the cue has none.
 */
function getCueIdentifierLineIndex(lines, timingLineIndex, previousCueIdentifier) {
  const candidateIndex = timingLineIndex - 1;
  if (candidateIndex < 0) {
    return -1;
  }
  const candidate = lines[candidateIndex].trim();
  const isBlankOrTiming = !candidate || isTimingLine(candidate);
  if (isBlankOrTiming) {
    return -1;
  }
  const looksLikeIdentifier = isLikelyCueIdentifier({
    line: candidate,
    nextLine: lines[timingLineIndex]?.trim(),
    previousCueIdentifier
  });
  if (looksLikeIdentifier) {
    return candidateIndex;
  }
  return -1;
}
1303
1508
  function extractTextFromVTT(vttContent) {
1304
1509
  if (!vttContent.trim()) {
1305
1510
  return "";
1306
1511
  }
1307
1512
  const lines = vttContent.split("\n");
1308
1513
  const textLines = [];
1514
+ let previousCueIdentifier = null;
1515
+ let isInsideNoteBlock = false;
1309
1516
  for (let i = 0; i < lines.length; i++) {
1310
1517
  const line = lines[i].trim();
1311
- if (!line)
1518
+ const nextLine = lines[i + 1]?.trim();
1519
+ if (!line) {
1520
+ isInsideNoteBlock = false;
1521
+ continue;
1522
+ }
1523
+ if (isInsideNoteBlock)
1312
1524
  continue;
1313
1525
  if (line === "WEBVTT")
1314
1526
  continue;
1315
- if (line.startsWith("NOTE "))
1527
+ if (line === "NOTE" || line.startsWith("NOTE ")) {
1528
+ isInsideNoteBlock = true;
1316
1529
  continue;
1317
- if (line.includes("-->"))
1530
+ }
1531
+ if (isTimingLine(line))
1318
1532
  continue;
1319
- if (/^[\w-]+$/.test(line) && !line.includes(" "))
1533
+ if (isLikelyCueIdentifier({ line, nextLine, previousCueIdentifier })) {
1534
+ previousCueIdentifier = parseNumericCueIdentifier(line);
1320
1535
  continue;
1536
+ }
1321
1537
  if (line.startsWith("STYLE") || line.startsWith("REGION"))
1322
1538
  continue;
1323
1539
  const cleanLine = line.replace(/<[^>]*>/g, "").trim();
@@ -1376,20 +1592,34 @@ function parseVTTCues(vttContent) {
1376
1592
  return [];
1377
1593
  const lines = vttContent.split("\n");
1378
1594
  const cues = [];
1595
+ let previousCueIdentifier = null;
1379
1596
  for (let i = 0; i < lines.length; i++) {
1380
1597
  const line = lines[i].trim();
1381
- if (line.includes("-->")) {
1598
+ if (isTimingLine(line)) {
1382
1599
  const [startStr, endStr] = line.split(" --> ").map((s) => s.trim());
1383
1600
  const startTime = vttTimestampToSeconds(startStr);
1384
1601
  const endTime = vttTimestampToSeconds(endStr.split(" ")[0]);
1385
- const textLines = [];
1602
+ const currentCueIdentifierLine = lines[i - 1]?.trim() ?? "";
1603
+ const currentCueIdentifier = isLikelyCueIdentifier({
1604
+ line: currentCueIdentifierLine,
1605
+ nextLine: line,
1606
+ previousCueIdentifier
1607
+ }) ? parseNumericCueIdentifier(currentCueIdentifierLine) : null;
1608
+ const rawTextLines = [];
1386
1609
  let j = i + 1;
1387
- while (j < lines.length && lines[j].trim() && !lines[j].includes("-->")) {
1388
- const cleanLine = lines[j].trim().replace(/<[^>]*>/g, "");
1389
- if (cleanLine)
1390
- textLines.push(cleanLine);
1610
+ while (j < lines.length && lines[j].trim() && !isTimingLine(lines[j].trim())) {
1611
+ rawTextLines.push(lines[j].trim());
1391
1612
  j++;
1392
1613
  }
1614
+ const trailingNumericLine = parseNumericCueIdentifier(rawTextLines.at(-1) ?? "");
1615
+ if (trailingNumericLine !== null && isLikelyCueIdentifier({
1616
+ line: rawTextLines.at(-1) ?? "",
1617
+ nextLine: lines[j]?.trim(),
1618
+ previousCueIdentifier: currentCueIdentifier
1619
+ }) && rawTextLines.length > 1) {
1620
+ rawTextLines.pop();
1621
+ }
1622
+ const textLines = rawTextLines.map((textLine) => textLine.replace(/<[^>]*>/g, "")).filter(Boolean);
1393
1623
  if (textLines.length > 0) {
1394
1624
  cues.push({
1395
1625
  startTime,
@@ -1397,10 +1627,102 @@ function parseVTTCues(vttContent) {
1397
1627
  text: textLines.join(" ")
1398
1628
  });
1399
1629
  }
1630
+ previousCueIdentifier = currentCueIdentifier;
1400
1631
  }
1401
1632
  }
1402
1633
  return cues;
1403
1634
  }
/**
 * Separates a VTT document into its preamble and a list of cue blocks.
 *
 * Blocks are normally delimited by blank lines. When a block from the first
 * cue onward holds more than one timing line (cues not separated by a blank
 * line), the document is re-split line by line around each timing line, with
 * each cue's identifier line, if any, kept with its cue.
 *
 * @param {string} vttContent - Raw VTT text.
 * @returns {{preamble: string, cueBlocks: string[]}} `preamble` falls back to
 *   "WEBVTT" when nothing precedes the first cue; when there are no cues it
 *   is the whole content, prefixed with "WEBVTT" if not already starting with it.
 */
function splitVttPreambleAndCueBlocks(vttContent) {
  const content = normalizeLineEndings(vttContent).trim();
  if (!content) {
    return { preamble: "WEBVTT", cueBlocks: [] };
  }

  const blocks = content.split(/\n{2,}/).map((block) => block.trim()).filter(Boolean);
  const firstCueBlockIndex = blocks.findIndex((block) => block.includes("-->"));
  if (firstCueBlockIndex === -1) {
    const preambleOnly = content.startsWith("WEBVTT") ? content : `WEBVTT\n\n${content}`;
    return { preamble: preambleOnly, cueBlocks: [] };
  }

  const blocksFromFirstCue = blocks.slice(firstCueBlockIndex);
  const hasMergedCueBlocks = blocksFromFirstCue.some((block) => (block.match(/-->/g) ?? []).length > 1);
  if (!hasMergedCueBlocks) {
    const preambleBlocks = blocks.slice(0, firstCueBlockIndex);
    return {
      preamble: preambleBlocks.length > 0 ? preambleBlocks.join("\n\n") : "WEBVTT",
      cueBlocks: blocksFromFirstCue
    };
  }

  // Merged cues: rebuild the cue boundaries from the timing lines themselves.
  const lines = content.split("\n");
  const timingLineIndices = [];
  lines.forEach((line, lineIndex) => {
    if (isTimingLine(line.trim())) {
      timingLineIndices.push(lineIndex);
    }
  });

  const firstIdentifierIndex = getCueIdentifierLineIndex(lines, timingLineIndices[0], null);
  const preambleEndIndex = firstIdentifierIndex >= 0 ? firstIdentifierIndex : timingLineIndices[0];
  const preamble = lines.slice(0, preambleEndIndex).join("\n").trim() || "WEBVTT";

  // Walks `fromIndex` back over blank lines, never reaching the timing line at `floorIndex`.
  const skipTrailingBlankLines = (fromIndex, floorIndex) => {
    let lineIndex = fromIndex;
    while (lineIndex > floorIndex && !lines[lineIndex].trim()) {
      lineIndex--;
    }
    return lineIndex;
  };

  const cueBlocks = [];
  let previousCueIdentifier = null;
  for (let cueNumber = 0; cueNumber < timingLineIndices.length; cueNumber++) {
    const timingLineIndex = timingLineIndices[cueNumber];
    const identifierIndex = getCueIdentifierLineIndex(lines, timingLineIndex, previousCueIdentifier);
    const hasIdentifier = identifierIndex >= 0;
    const blockStartIndex = hasIdentifier ? identifierIndex : timingLineIndex;
    const currentCueIdentifier = hasIdentifier ? parseNumericCueIdentifier(lines[identifierIndex].trim()) : null;

    const isLastCue = cueNumber === timingLineIndices.length - 1;
    const nextTimingLineIndex = isLastCue ? lines.length : timingLineIndices[cueNumber + 1];
    let blockEndIndex = skipTrailingBlankLines(nextTimingLineIndex - 1, timingLineIndex);

    // The next cue's identifier line belongs to that cue, not to this one's text.
    const nextIdentifierIndex = isLastCue ? -1 : getCueIdentifierLineIndex(lines, nextTimingLineIndex, currentCueIdentifier);
    if (nextIdentifierIndex === blockEndIndex) {
      blockEndIndex--;
    }
    blockEndIndex = skipTrailingBlankLines(blockEndIndex, timingLineIndex);

    previousCueIdentifier = currentCueIdentifier;
    cueBlocks.push(lines.slice(blockStartIndex, blockEndIndex + 1).join("\n").trim());
  }

  return { preamble, cueBlocks };
}
/**
 * Assembles a VTT document from a preamble and cue blocks.
 *
 * The trimmed preamble comes first, then the trimmed cue blocks separated by
 * blank lines, and the result always ends with a single newline.
 *
 * @param {string[]} cueBlocks - Cue blocks.
 * @param {string} [preamble="WEBVTT"] - Header text placed before the cues.
 * @returns {string} The assembled document.
 */
function buildVttFromCueBlocks(cueBlocks, preamble = "WEBVTT") {
  const header = preamble.trim();
  if (cueBlocks.length === 0) {
    return `${header}\n`;
  }
  const body = cueBlocks.map((block) => block.trim()).join("\n\n");
  return `${header}\n\n${body}\n`;
}
/**
 * Replaces the text of a cue block, keeping its identifier and timing lines.
 *
 * Everything up to and including the first line containing "-->" is kept;
 * the remaining lines are replaced by the non-blank, trimmed lines of
 * `translatedText`.
 *
 * @param {string} cueBlock - Original cue block.
 * @param {string} translatedText - Replacement cue text; it may span several lines.
 * @returns {string} The rebuilt cue block, joined with "\n".
 * @throws {Error} When the cue block has no timing line.
 */
function replaceCueText(cueBlock, translatedText) {
  // Normalizes line endings, then keeps only the trimmed non-blank lines.
  const toCleanLines = (text) => normalizeLineEndings(text).split("\n").map((line) => line.trim()).filter(Boolean);

  const blockLines = toCleanLines(cueBlock);
  const timingLineIndex = blockLines.findIndex((line) => line.includes("-->"));
  if (timingLineIndex === -1) {
    throw new Error("Cue block is missing a timestamp line");
  }
  const headerLines = blockLines.slice(0, timingLineIndex + 1);
  return headerLines.concat(toCleanLines(translatedText)).join("\n");
}
/**
 * Builds a VTT document in which each cue block's text is replaced by the
 * translation at the same position.
 *
 * @param {string[]} cueBlocks - Original cue blocks.
 * @param {string[]} translatedTexts - One replacement text per cue block, in order.
 * @param {string} [preamble="WEBVTT"] - Header text placed before the cues.
 * @returns {string} The rebuilt VTT document.
 * @throws {Error} When the two arrays differ in length.
 */
function buildVttFromTranslatedCueBlocks(cueBlocks, translatedTexts, preamble = "WEBVTT") {
  const expectedCount = cueBlocks.length;
  const receivedCount = translatedTexts.length;
  if (expectedCount !== receivedCount) {
    throw new Error(`Expected ${expectedCount} translated cues, received ${receivedCount}`);
  }
  const translatedBlocks = cueBlocks.map((cueBlock, position) => replaceCueText(cueBlock, translatedTexts[position]));
  return buildVttFromCueBlocks(translatedBlocks, preamble);
}
/**
 * Joins several VTT documents into one.
 *
 * The cue blocks of every segment are collected in order and written under a
 * single preamble; the segments' own preambles are discarded.
 *
 * @param {string[]} segments - VTT documents to concatenate.
 * @param {string} [preamble="WEBVTT"] - Header text for the combined document.
 * @returns {string} The combined VTT document.
 */
function concatenateVttSegments(segments, preamble = "WEBVTT") {
  const collectedCueBlocks = segments
    .map((segment) => splitVttPreambleAndCueBlocks(segment).cueBlocks)
    .flat();
  return buildVttFromCueBlocks(collectedCueBlocks, preamble);
}
1404
1726
  async function buildTranscriptUrl(playbackId, trackId, shouldSign = false, credentials) {
1405
1727
  "use step";
1406
1728
  const baseUrl = `https://stream.mux.com/${playbackId}/text/${trackId}.vtt`;
@@ -1466,6 +1788,7 @@ __export(workflows_exports, {
1466
1788
  HIVE_SEXUAL_CATEGORIES: () => HIVE_SEXUAL_CATEGORIES,
1467
1789
  HIVE_VIOLENCE_CATEGORIES: () => HIVE_VIOLENCE_CATEGORIES,
1468
1790
  SUMMARY_KEYWORD_LIMIT: () => SUMMARY_KEYWORD_LIMIT,
1791
+ aggregateTokenUsage: () => aggregateTokenUsage,
1469
1792
  askQuestions: () => askQuestions,
1470
1793
  burnedInCaptionsSchema: () => burnedInCaptionsSchema,
1471
1794
  chapterSchema: () => chapterSchema,
@@ -1477,6 +1800,7 @@ __export(workflows_exports, {
1477
1800
  getSummaryAndTags: () => getSummaryAndTags,
1478
1801
  hasBurnedInCaptions: () => hasBurnedInCaptions,
1479
1802
  questionAnswerSchema: () => questionAnswerSchema,
1803
+ shouldSplitChunkTranslationError: () => shouldSplitChunkTranslationError,
1480
1804
  summarySchema: () => summarySchema,
1481
1805
  translateAudio: () => translateAudio,
1482
1806
  translateCaptions: () => translateCaptions,
@@ -1718,6 +2042,12 @@ function createToneSection(instruction) {
1718
2042
  content: instruction
1719
2043
  };
1720
2044
  }
/**
 * Creates the "language" prompt section that pins the output language.
 *
 * @param {string} languageName - Name of the language, interpolated into the instruction.
 * @returns {{tag: string, content: string}} Section object tagged "language".
 */
function createLanguageSection(languageName) {
  const content = `All output (title, description, keywords, chapter titles) MUST be written in ${languageName}.`;
  return { tag: "language", content };
}
1721
2051
 
1722
2052
  // src/lib/retry.ts
1723
2053
  var DEFAULT_RETRY_OPTIONS = {
@@ -1828,6 +2158,7 @@ var SYSTEM_PROMPT = dedent`
1828
2158
  - Only describe observable evidence from frames or transcript
1829
2159
  - Do not fabricate details or make unsupported assumptions
1830
2160
  - Return structured data matching the requested schema exactly
2161
+ - Provide reasoning in the same language as the question
1831
2162
  </constraints>
1832
2163
 
1833
2164
  <language_guidelines>
@@ -2228,6 +2559,166 @@ async function hasBurnedInCaptions(assetId, options = {}) {
2228
2559
  import { generateText as generateText3, Output as Output3 } from "ai";
2229
2560
  import dedent3 from "dedent";
2230
2561
  import { z as z4 } from "zod";
2562
+
2563
+ // src/lib/language-codes.ts
// Two-letter (ISO 639-1) to three-letter (ISO 639-3) language codes.
var ISO639_1_TO_3 = {
  // Major world languages
  en: "eng", // English
  es: "spa", // Spanish
  fr: "fra", // French
  de: "deu", // German
  it: "ita", // Italian
  pt: "por", // Portuguese
  ru: "rus", // Russian
  zh: "zho", // Chinese
  ja: "jpn", // Japanese
  ko: "kor", // Korean
  ar: "ara", // Arabic
  hi: "hin", // Hindi
  // European languages
  nl: "nld", // Dutch
  pl: "pol", // Polish
  sv: "swe", // Swedish
  da: "dan", // Danish
  no: "nor", // Norwegian
  fi: "fin", // Finnish
  el: "ell", // Greek
  cs: "ces", // Czech
  hu: "hun", // Hungarian
  ro: "ron", // Romanian
  bg: "bul", // Bulgarian
  hr: "hrv", // Croatian
  sk: "slk", // Slovak
  sl: "slv", // Slovenian
  uk: "ukr", // Ukrainian
  tr: "tur", // Turkish
  // Asian languages
  th: "tha", // Thai
  vi: "vie", // Vietnamese
  id: "ind", // Indonesian
  ms: "msa", // Malay
  tl: "tgl", // Tagalog/Filipino
  // Other languages
  he: "heb", // Hebrew
  fa: "fas", // Persian/Farsi
  bn: "ben", // Bengali
  ta: "tam", // Tamil
  te: "tel", // Telugu
  mr: "mar", // Marathi
  gu: "guj", // Gujarati
  kn: "kan", // Kannada
  ml: "mal", // Malayalam
  pa: "pan", // Punjabi
  ur: "urd", // Urdu
  sw: "swa", // Swahili
  af: "afr", // Afrikaans
  ca: "cat", // Catalan
  eu: "eus", // Basque
  gl: "glg", // Galician
  is: "isl", // Icelandic
  et: "est", // Estonian
  lv: "lav", // Latvian
  lt: "lit" // Lithuanian
};
// Reverse lookup (three-letter to two-letter), derived so the two tables cannot drift apart.
var ISO639_3_TO_1 = {};
for (const [iso1, iso3] of Object.entries(ISO639_1_TO_3)) {
  ISO639_3_TO_1[iso3] = iso1;
}
/**
 * Converts a language code to its three-letter form.
 *
 * The code is lower-cased and trimmed first. Three-character codes are
 * returned as they are; other codes are looked up in ISO639_1_TO_3 and
 * returned unchanged when the table has no entry for them.
 *
 * @param {string} code - Language code, e.g. "en" or "eng".
 * @returns {string} The three-letter code, or the normalized input when unmapped.
 */
function toISO639_3(code) {
  const normalized = code.toLowerCase().trim();
  if (normalized.length === 3) {
    return normalized;
  }
  // Own-property check: a bare `table[key]` would resolve inherited names
  // such as "constructor" and return a non-string value.
  return Object.hasOwn(ISO639_1_TO_3, normalized) ? ISO639_1_TO_3[normalized] : normalized;
}
/**
 * Converts a language code to its two-letter form.
 *
 * The code is lower-cased and trimmed first. Two-character codes are returned
 * as they are; other codes are looked up in ISO639_3_TO_1 and returned
 * unchanged when the table has no entry for them.
 *
 * @param {string} code - Language code, e.g. "eng" or "en".
 * @returns {string} The two-letter code, or the normalized input when unmapped.
 */
function toISO639_1(code) {
  const normalized = code.toLowerCase().trim();
  if (normalized.length === 2) {
    return normalized;
  }
  // Own-property check: a bare `table[key]` would resolve inherited names
  // such as "constructor" and return a non-string value.
  return Object.hasOwn(ISO639_3_TO_1, normalized) ? ISO639_3_TO_1[normalized] : normalized;
}
/**
 * Returns both the two-letter and three-letter form of a language code.
 *
 * The code is lower-cased and trimmed first. Its length decides which side is
 * taken as given and which is converted; for any other length both fields
 * carry the normalized input.
 *
 * @param {string} code - Language code.
 * @returns {{iso639_1: string, iso639_3: string}}
 */
function getLanguageCodePair(code) {
  const normalized = code.toLowerCase().trim();
  switch (normalized.length) {
    case 2:
      return { iso639_1: normalized, iso639_3: toISO639_3(normalized) };
    case 3:
      return { iso639_1: toISO639_1(normalized), iso639_3: normalized };
    default:
      return { iso639_1: normalized, iso639_3: normalized };
  }
}
/**
 * Resolves the English display name of a language code.
 *
 * The code is converted to its two-letter form where possible, then passed to
 * `Intl.DisplayNames`. When the lookup throws or yields no name, the
 * upper-cased input is returned instead.
 *
 * @param {string} code - Language code.
 * @returns {string} The English language name, or the upper-cased code as a fallback.
 */
function getLanguageName(code) {
  const iso639_1 = toISO639_1(code);
  let displayName;
  try {
    const englishNames = new Intl.DisplayNames(["en"], { type: "language" });
    displayName = englishNames.of(iso639_1);
  } catch {
    // Best-effort lookup: any failure here falls back to the raw code.
    return code.toUpperCase();
  }
  return displayName ?? code.toUpperCase();
}
2720
+
2721
+ // src/workflows/chapters.ts
2231
2722
  var chapterSchema = z4.object({
2232
2723
  startTime: z4.number(),
2233
2724
  title: z4.string()
@@ -2288,7 +2779,8 @@ var chapterSystemPromptBuilder = createPromptBuilder({
2288
2779
  content: dedent3`
2289
2780
  - Only use information present in the transcript
2290
2781
  - Return structured data that matches the requested JSON schema
2291
- - Do not add commentary or extra text outside the JSON`
2782
+ - Do not add commentary or extra text outside the JSON
2783
+ - When a <language> section is provided, all chapter titles MUST be written in that language`
2292
2784
  },
2293
2785
  qualityGuidelines: {
2294
2786
  tag: "quality_guidelines",
@@ -2336,7 +2828,7 @@ var chaptersPromptBuilder = createPromptBuilder({
2336
2828
  content: dedent3`
2337
2829
  - Keep titles concise and descriptive
2338
2830
  - Avoid filler or generic labels like "Chapter 1"
2339
- - Use the transcript's language and terminology`
2831
+ - Use the transcript's terminology`
2340
2832
  }
2341
2833
  },
2342
2834
  sectionOrder: ["task", "outputFormat", "chapterGuidelines", "titleGuidelines"]
@@ -2345,7 +2837,8 @@ function buildUserPrompt3({
2345
2837
  timestampedTranscript,
2346
2838
  promptOverrides,
2347
2839
  minChaptersPerHour = 3,
2348
- maxChaptersPerHour = 8
2840
+ maxChaptersPerHour = 8,
2841
+ languageName
2349
2842
  }) {
2350
2843
  const contextSections = [
2351
2844
  {
@@ -2354,6 +2847,9 @@ function buildUserPrompt3({
2354
2847
  attributes: { format: "seconds" }
2355
2848
  }
2356
2849
  ];
2850
+ if (languageName) {
2851
+ contextSections.push(createLanguageSection(languageName));
2852
+ }
2357
2853
  const dynamicChapterGuidelines = dedent3`
2358
2854
  - Create at least ${minChaptersPerHour} and at most ${maxChaptersPerHour} chapters per hour of content
2359
2855
  - Use start times in seconds (not HH:MM:SS)
@@ -2373,7 +2869,8 @@ async function generateChapters(assetId, languageCode, options = {}) {
2373
2869
  promptOverrides,
2374
2870
  minChaptersPerHour,
2375
2871
  maxChaptersPerHour,
2376
- credentials
2872
+ credentials,
2873
+ outputLanguageCode
2377
2874
  } = options;
2378
2875
  const modelConfig = resolveLanguageModelConfig({
2379
2876
  ...options,
@@ -2417,11 +2914,14 @@ async function generateChapters(assetId, languageCode, options = {}) {
2417
2914
  const contentLabel = isAudioOnly ? "transcript" : "caption track";
2418
2915
  throw new Error(`No usable content found in ${contentLabel}`);
2419
2916
  }
2917
+ const resolvedLanguageCode = outputLanguageCode && outputLanguageCode !== "auto" ? outputLanguageCode : transcriptResult.track?.language_code ?? languageCode;
2918
+ const languageName = resolvedLanguageCode ? getLanguageName(resolvedLanguageCode) : void 0;
2420
2919
  const userPrompt = buildUserPrompt3({
2421
2920
  timestampedTranscript,
2422
2921
  promptOverrides,
2423
2922
  minChaptersPerHour,
2424
- maxChaptersPerHour
2923
+ maxChaptersPerHour,
2924
+ languageName
2425
2925
  });
2426
2926
  let chaptersData = null;
2427
2927
  try {
@@ -2748,6 +3248,7 @@ async function moderateImageWithOpenAI(entry) {
2748
3248
  const categoryScores = json.results?.[0]?.category_scores || {};
2749
3249
  return {
2750
3250
  url: entry.url,
3251
+ time: entry.time,
2751
3252
  sexual: categoryScores.sexual || 0,
2752
3253
  violence: categoryScores.violence || 0,
2753
3254
  error: false
@@ -2756,6 +3257,7 @@ async function moderateImageWithOpenAI(entry) {
2756
3257
  console.error("OpenAI moderation failed:", error);
2757
3258
  return {
2758
3259
  url: entry.url,
3260
+ time: entry.time,
2759
3261
  sexual: 0,
2760
3262
  violence: 0,
2761
3263
  error: true,
@@ -2763,11 +3265,13 @@ async function moderateImageWithOpenAI(entry) {
2763
3265
  };
2764
3266
  }
2765
3267
  }
2766
- async function requestOpenAIModeration(imageUrls, model, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
3268
+ async function requestOpenAIModeration(images, model, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
2767
3269
  "use step";
3270
+ const imageUrls = images.map((img) => img.url);
3271
+ const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
2768
3272
  const targetUrls = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map(
2769
- (img) => ({ url: img.url, image: img.base64Data, model, credentials })
2770
- ) : imageUrls.map((url) => ({ url, image: url, model, credentials }));
3273
+ (img) => ({ url: img.url, time: timeByUrl.get(img.url), image: img.base64Data, model, credentials })
3274
+ ) : images.map((img) => ({ url: img.url, time: img.time, image: img.url, model, credentials }));
2771
3275
  return processConcurrently(targetUrls, moderateImageWithOpenAI, maxConcurrent);
2772
3276
  }
2773
3277
  async function requestOpenAITextModeration(text, model, url, credentials) {
@@ -2912,6 +3416,7 @@ async function moderateImageWithHive(entry) {
2912
3416
  const violence = getHiveCategoryScores(classes, HIVE_VIOLENCE_CATEGORIES);
2913
3417
  return {
2914
3418
  url: entry.url,
3419
+ time: entry.time,
2915
3420
  sexual,
2916
3421
  violence,
2917
3422
  error: false
@@ -2919,6 +3424,7 @@ async function moderateImageWithHive(entry) {
2919
3424
  } catch (error) {
2920
3425
  return {
2921
3426
  url: entry.url,
3427
+ time: entry.time,
2922
3428
  sexual: 0,
2923
3429
  violence: 0,
2924
3430
  error: true,
@@ -2926,19 +3432,23 @@ async function moderateImageWithHive(entry) {
2926
3432
  };
2927
3433
  }
2928
3434
  }
2929
- async function requestHiveModeration(imageUrls, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
3435
+ async function requestHiveModeration(images, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
2930
3436
  "use step";
3437
+ const imageUrls = images.map((img) => img.url);
3438
+ const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
2931
3439
  const targets = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map((img) => ({
2932
3440
  url: img.url,
3441
+ time: timeByUrl.get(img.url),
2933
3442
  source: {
2934
3443
  kind: "file",
2935
3444
  buffer: img.buffer,
2936
3445
  contentType: img.contentType
2937
3446
  },
2938
3447
  credentials
2939
- })) : imageUrls.map((url) => ({
2940
- url,
2941
- source: { kind: "url", value: url },
3448
+ })) : images.map((img) => ({
3449
+ url: img.url,
3450
+ time: img.time,
3451
+ source: { kind: "url", value: img.url },
2942
3452
  credentials
2943
3453
  }));
2944
3454
  return await processConcurrently(targets, moderateImageWithHive, maxConcurrent);
@@ -2949,10 +3459,8 @@ async function getThumbnailUrlsFromTimestamps(playbackId, timestampsMs, options)
2949
3459
  const baseUrl = getMuxThumbnailBaseUrl(playbackId);
2950
3460
  const urlPromises = timestampsMs.map(async (tsMs) => {
2951
3461
  const time = Number((tsMs / 1e3).toFixed(2));
2952
- if (shouldSign) {
2953
- return signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials);
2954
- }
2955
- return `${baseUrl}?time=${time}&width=${width}`;
3462
+ const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
3463
+ return { url, time };
2956
3464
  });
2957
3465
  return Promise.all(urlPromises);
2958
3466
  }
@@ -3256,6 +3764,7 @@ var SYSTEM_PROMPT3 = dedent4`
3256
3764
  - Do not fabricate details or make unsupported assumptions
3257
3765
  - Return structured data matching the requested schema
3258
3766
  - Output only the JSON object; no markdown or extra text
3767
+ - When a <language> section is provided, all output text MUST be written in that language
3259
3768
  </constraints>
3260
3769
 
3261
3770
  <tone_guidance>
@@ -3310,6 +3819,7 @@ var AUDIO_ONLY_SYSTEM_PROMPT = dedent4`
3310
3819
  - Return structured data matching the requested schema
3311
3820
  - Focus entirely on audio/spoken content - there are no visual elements
3312
3821
  - Output only the JSON object; no markdown or extra text
3822
+ - When a <language> section is provided, all output text MUST be written in that language
3313
3823
  </constraints>
3314
3824
 
3315
3825
  <tone_guidance>
@@ -3340,9 +3850,13 @@ function buildUserPrompt4({
3340
3850
  isAudioOnly = false,
3341
3851
  titleLength,
3342
3852
  descriptionLength,
3343
- tagCount
3853
+ tagCount,
3854
+ languageName
3344
3855
  }) {
3345
3856
  const contextSections = [createToneSection(TONE_INSTRUCTIONS[tone])];
3857
+ if (languageName) {
3858
+ contextSections.push(createLanguageSection(languageName));
3859
+ }
3346
3860
  if (transcriptText) {
3347
3861
  const format = isCleanTranscript ? "plain text" : "WebVTT";
3348
3862
  contextSections.push(createTranscriptSection(transcriptText, format));
@@ -3455,7 +3969,8 @@ async function getSummaryAndTags(assetId, options) {
3455
3969
  credentials,
3456
3970
  titleLength,
3457
3971
  descriptionLength,
3458
- tagCount
3972
+ tagCount,
3973
+ outputLanguageCode
3459
3974
  } = options ?? {};
3460
3975
  if (!VALID_TONES.includes(tone)) {
3461
3976
  throw new Error(
@@ -3482,12 +3997,15 @@ async function getSummaryAndTags(assetId, options) {
3482
3997
  "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
3483
3998
  );
3484
3999
  }
3485
- const transcriptText = includeTranscript ? (await fetchTranscriptForAsset(assetData, playbackId, {
4000
+ const transcriptResult = includeTranscript ? await fetchTranscriptForAsset(assetData, playbackId, {
3486
4001
  cleanTranscript,
3487
4002
  shouldSign: policy === "signed",
3488
4003
  credentials: workflowCredentials,
3489
4004
  required: isAudioOnly
3490
- })).transcriptText : "";
4005
+ }) : void 0;
4006
+ const transcriptText = transcriptResult?.transcriptText ?? "";
4007
+ const resolvedLanguageCode = outputLanguageCode && outputLanguageCode !== "auto" ? outputLanguageCode : transcriptResult?.track?.language_code ?? getReadyTextTracks(assetData)[0]?.language_code;
4008
+ const languageName = resolvedLanguageCode ? getLanguageName(resolvedLanguageCode) : void 0;
3491
4009
  const userPrompt = buildUserPrompt4({
3492
4010
  tone,
3493
4011
  transcriptText,
@@ -3496,7 +4014,8 @@ async function getSummaryAndTags(assetId, options) {
3496
4014
  isAudioOnly,
3497
4015
  titleLength,
3498
4016
  descriptionLength,
3499
- tagCount
4017
+ tagCount,
4018
+ languageName
3500
4019
  });
3501
4020
  let analysisResponse;
3502
4021
  let imageUrl;
@@ -3566,164 +4085,6 @@ async function getSummaryAndTags(assetId, options) {
3566
4085
  };
3567
4086
  }
3568
4087
 
3569
- // src/lib/language-codes.ts
3570
- var ISO639_1_TO_3 = {
3571
- // Major world languages
3572
- en: "eng",
3573
- // English
3574
- es: "spa",
3575
- // Spanish
3576
- fr: "fra",
3577
- // French
3578
- de: "deu",
3579
- // German
3580
- it: "ita",
3581
- // Italian
3582
- pt: "por",
3583
- // Portuguese
3584
- ru: "rus",
3585
- // Russian
3586
- zh: "zho",
3587
- // Chinese
3588
- ja: "jpn",
3589
- // Japanese
3590
- ko: "kor",
3591
- // Korean
3592
- ar: "ara",
3593
- // Arabic
3594
- hi: "hin",
3595
- // Hindi
3596
- // European languages
3597
- nl: "nld",
3598
- // Dutch
3599
- pl: "pol",
3600
- // Polish
3601
- sv: "swe",
3602
- // Swedish
3603
- da: "dan",
3604
- // Danish
3605
- no: "nor",
3606
- // Norwegian
3607
- fi: "fin",
3608
- // Finnish
3609
- el: "ell",
3610
- // Greek
3611
- cs: "ces",
3612
- // Czech
3613
- hu: "hun",
3614
- // Hungarian
3615
- ro: "ron",
3616
- // Romanian
3617
- bg: "bul",
3618
- // Bulgarian
3619
- hr: "hrv",
3620
- // Croatian
3621
- sk: "slk",
3622
- // Slovak
3623
- sl: "slv",
3624
- // Slovenian
3625
- uk: "ukr",
3626
- // Ukrainian
3627
- tr: "tur",
3628
- // Turkish
3629
- // Asian languages
3630
- th: "tha",
3631
- // Thai
3632
- vi: "vie",
3633
- // Vietnamese
3634
- id: "ind",
3635
- // Indonesian
3636
- ms: "msa",
3637
- // Malay
3638
- tl: "tgl",
3639
- // Tagalog/Filipino
3640
- // Other languages
3641
- he: "heb",
3642
- // Hebrew
3643
- fa: "fas",
3644
- // Persian/Farsi
3645
- bn: "ben",
3646
- // Bengali
3647
- ta: "tam",
3648
- // Tamil
3649
- te: "tel",
3650
- // Telugu
3651
- mr: "mar",
3652
- // Marathi
3653
- gu: "guj",
3654
- // Gujarati
3655
- kn: "kan",
3656
- // Kannada
3657
- ml: "mal",
3658
- // Malayalam
3659
- pa: "pan",
3660
- // Punjabi
3661
- ur: "urd",
3662
- // Urdu
3663
- sw: "swa",
3664
- // Swahili
3665
- af: "afr",
3666
- // Afrikaans
3667
- ca: "cat",
3668
- // Catalan
3669
- eu: "eus",
3670
- // Basque
3671
- gl: "glg",
3672
- // Galician
3673
- is: "isl",
3674
- // Icelandic
3675
- et: "est",
3676
- // Estonian
3677
- lv: "lav",
3678
- // Latvian
3679
- lt: "lit"
3680
- // Lithuanian
3681
- };
3682
- var ISO639_3_TO_1 = Object.fromEntries(
3683
- Object.entries(ISO639_1_TO_3).map(([iso1, iso3]) => [iso3, iso1])
3684
- );
3685
- function toISO639_3(code) {
3686
- const normalized = code.toLowerCase().trim();
3687
- if (normalized.length === 3) {
3688
- return normalized;
3689
- }
3690
- return ISO639_1_TO_3[normalized] ?? normalized;
3691
- }
3692
- function toISO639_1(code) {
3693
- const normalized = code.toLowerCase().trim();
3694
- if (normalized.length === 2) {
3695
- return normalized;
3696
- }
3697
- return ISO639_3_TO_1[normalized] ?? normalized;
3698
- }
3699
- function getLanguageCodePair(code) {
3700
- const normalized = code.toLowerCase().trim();
3701
- if (normalized.length === 2) {
3702
- return {
3703
- iso639_1: normalized,
3704
- iso639_3: toISO639_3(normalized)
3705
- };
3706
- } else if (normalized.length === 3) {
3707
- return {
3708
- iso639_1: toISO639_1(normalized),
3709
- iso639_3: normalized
3710
- };
3711
- }
3712
- return {
3713
- iso639_1: normalized,
3714
- iso639_3: normalized
3715
- };
3716
- }
3717
- function getLanguageName(code) {
3718
- const iso639_1 = toISO639_1(code);
3719
- try {
3720
- const displayNames = new Intl.DisplayNames(["en"], { type: "language" });
3721
- return displayNames.of(iso639_1) ?? code.toUpperCase();
3722
- } catch {
3723
- return code.toUpperCase();
3724
- }
3725
- }
3726
-
3727
4088
  // src/lib/storage-adapter.ts
3728
4089
  function requireCredentials(accessKeyId, secretAccessKey) {
3729
4090
  if (!accessKeyId || !secretAccessKey) {
@@ -4168,12 +4529,187 @@ async function translateAudio(assetId, toLanguageCode, options = {}) {
4168
4529
  }
4169
4530
 
4170
4531
  // src/workflows/translate-captions.ts
4171
- import { generateText as generateText5, Output as Output5 } from "ai";
4532
+ import {
4533
+ APICallError,
4534
+ generateText as generateText5,
4535
+ NoObjectGeneratedError,
4536
+ Output as Output5,
4537
+ RetryError,
4538
+ TypeValidationError
4539
+ } from "ai";
4540
+ import dedent5 from "dedent";
4172
4541
  import { z as z6 } from "zod";
4173
4542
  var translationSchema = z6.object({
4174
4543
  translation: z6.string()
4175
4544
  });
4176
- var SYSTEM_PROMPT4 = 'You are a subtitle translation expert. Translate VTT subtitle files to the target language specified by the user. Preserve all timestamps and VTT formatting exactly as they appear. Return JSON with a single key "translation" containing the translated VTT content.';
4545
+ var SYSTEM_PROMPT4 = dedent5`
4546
+ You are a subtitle translation expert. Translate VTT subtitle files to the target language specified by the user.
4547
+ You may receive either a full VTT file or a chunk from a larger VTT.
4548
+ Preserve all timestamps, cue ordering, and VTT formatting exactly as they appear.
4549
+ Return JSON with a single key "translation" containing the translated VTT content.
4550
+ `;
4551
+ var CUE_TRANSLATION_SYSTEM_PROMPT = dedent5`
4552
+ You are a subtitle translation expert.
4553
+ You will receive a sequence of subtitle cues extracted from a VTT file.
4554
+ Translate the cues to the requested target language while preserving their original order.
4555
+ Treat the cue list as continuous context so the translation reads naturally across adjacent lines.
4556
+ Return JSON with a single key "translations" containing exactly one translated string for each input cue.
4557
+ Do not merge, split, omit, reorder, or add cues.
4558
+ `;
4559
+ var DEFAULT_TRANSLATION_CHUNKING = {
4560
+ enabled: true,
4561
+ minimumAssetDurationSeconds: 30 * 60,
4562
+ targetChunkDurationSeconds: 30 * 60,
4563
+ maxConcurrentTranslations: 4,
4564
+ maxCuesPerChunk: 80,
4565
+ maxCueTextTokensPerChunk: 2e3
4566
+ };
4567
+ var TOKEN_USAGE_FIELDS = [
4568
+ "inputTokens",
4569
+ "outputTokens",
4570
+ "totalTokens",
4571
+ "reasoningTokens",
4572
+ "cachedInputTokens"
4573
+ ];
4574
+ var TranslationChunkValidationError = class extends Error {
4575
+ constructor(message) {
4576
+ super(message);
4577
+ this.name = "TranslationChunkValidationError";
4578
+ }
4579
+ };
4580
+ function isTranslationChunkValidationError(error) {
4581
+ return error instanceof TranslationChunkValidationError;
4582
+ }
4583
+ function isProviderServiceError(error) {
4584
+ if (!error) {
4585
+ return false;
4586
+ }
4587
+ if (RetryError.isInstance(error)) {
4588
+ return isProviderServiceError(error.lastError);
4589
+ }
4590
+ if (APICallError.isInstance(error)) {
4591
+ return true;
4592
+ }
4593
+ if (error instanceof Error && "cause" in error) {
4594
+ return isProviderServiceError(error.cause);
4595
+ }
4596
+ return false;
4597
+ }
4598
+ function shouldSplitChunkTranslationError(error) {
4599
+ if (isProviderServiceError(error)) {
4600
+ return false;
4601
+ }
4602
+ return NoObjectGeneratedError.isInstance(error) || TypeValidationError.isInstance(error) || isTranslationChunkValidationError(error);
4603
+ }
4604
+ function isDefinedTokenUsageValue(value) {
4605
+ return typeof value === "number";
4606
+ }
4607
+ function resolveTranslationChunkingOptions(options) {
4608
+ const targetChunkDurationSeconds = Math.max(
4609
+ 1,
4610
+ options?.targetChunkDurationSeconds ?? DEFAULT_TRANSLATION_CHUNKING.targetChunkDurationSeconds
4611
+ );
4612
+ return {
4613
+ enabled: options?.enabled ?? DEFAULT_TRANSLATION_CHUNKING.enabled,
4614
+ minimumAssetDurationSeconds: Math.max(
4615
+ 1,
4616
+ options?.minimumAssetDurationSeconds ?? DEFAULT_TRANSLATION_CHUNKING.minimumAssetDurationSeconds
4617
+ ),
4618
+ targetChunkDurationSeconds,
4619
+ maxConcurrentTranslations: Math.max(
4620
+ 1,
4621
+ options?.maxConcurrentTranslations ?? DEFAULT_TRANSLATION_CHUNKING.maxConcurrentTranslations
4622
+ ),
4623
+ maxCuesPerChunk: Math.max(
4624
+ 1,
4625
+ options?.maxCuesPerChunk ?? DEFAULT_TRANSLATION_CHUNKING.maxCuesPerChunk
4626
+ ),
4627
+ maxCueTextTokensPerChunk: Math.max(
4628
+ 1,
4629
+ options?.maxCueTextTokensPerChunk ?? DEFAULT_TRANSLATION_CHUNKING.maxCueTextTokensPerChunk
4630
+ )
4631
+ };
4632
+ }
4633
+ function aggregateTokenUsage(usages) {
4634
+ return TOKEN_USAGE_FIELDS.reduce((aggregate, field) => {
4635
+ const values = usages.map((usage) => usage[field]).filter(isDefinedTokenUsageValue);
4636
+ if (values.length > 0) {
4637
+ aggregate[field] = values.reduce((total, value) => total + value, 0);
4638
+ }
4639
+ return aggregate;
4640
+ }, {});
4641
+ }
4642
+ function createTranslationChunkRequest(id, cues, cueBlocks) {
4643
+ return {
4644
+ id,
4645
+ cueCount: cues.length,
4646
+ startTime: cues[0].startTime,
4647
+ endTime: cues[cues.length - 1].endTime,
4648
+ cues,
4649
+ cueBlocks
4650
+ };
4651
+ }
4652
+ function splitTranslationChunkRequestByBudget(id, cues, cueBlocks, maxCuesPerChunk, maxCueTextTokensPerChunk) {
4653
+ const chunks = chunkVTTCuesByBudget(cues, {
4654
+ maxCuesPerChunk,
4655
+ maxTextTokensPerChunk: maxCueTextTokensPerChunk
4656
+ });
4657
+ return chunks.map(
4658
+ (chunk, index) => createTranslationChunkRequest(
4659
+ chunks.length === 1 ? id : `${id}-part-${index}`,
4660
+ cues.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1),
4661
+ cueBlocks.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1)
4662
+ )
4663
+ );
4664
+ }
4665
+ function buildTranslationChunkRequests(vttContent, assetDurationSeconds, chunkingOptions) {
4666
+ const resolvedChunking = resolveTranslationChunkingOptions(chunkingOptions);
4667
+ const cues = parseVTTCues(vttContent);
4668
+ if (cues.length === 0) {
4669
+ return null;
4670
+ }
4671
+ const { preamble, cueBlocks } = splitVttPreambleAndCueBlocks(vttContent);
4672
+ if (cueBlocks.length !== cues.length) {
4673
+ console.warn(
4674
+ `Falling back to full-VTT caption translation because cue block count (${cueBlocks.length}) does not match parsed cue count (${cues.length}).`
4675
+ );
4676
+ return null;
4677
+ }
4678
+ if (!resolvedChunking.enabled) {
4679
+ return {
4680
+ preamble,
4681
+ chunks: [
4682
+ createTranslationChunkRequest("chunk-0", cues, cueBlocks)
4683
+ ]
4684
+ };
4685
+ }
4686
+ if (typeof assetDurationSeconds !== "number" || assetDurationSeconds < resolvedChunking.minimumAssetDurationSeconds) {
4687
+ return {
4688
+ preamble,
4689
+ chunks: [
4690
+ createTranslationChunkRequest("chunk-0", cues, cueBlocks)
4691
+ ]
4692
+ };
4693
+ }
4694
+ const targetChunkDurationSeconds = resolvedChunking.targetChunkDurationSeconds;
4695
+ const durationChunks = chunkVTTCuesByDuration(cues, {
4696
+ targetChunkDurationSeconds,
4697
+ maxChunkDurationSeconds: Math.max(targetChunkDurationSeconds, Math.round(targetChunkDurationSeconds * (7 / 6))),
4698
+ minChunkDurationSeconds: Math.max(1, Math.round(targetChunkDurationSeconds * (2 / 3)))
4699
+ });
4700
+ return {
4701
+ preamble,
4702
+ chunks: durationChunks.flatMap(
4703
+ (chunk) => splitTranslationChunkRequestByBudget(
4704
+ chunk.id,
4705
+ cues.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1),
4706
+ cueBlocks.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1),
4707
+ resolvedChunking.maxCuesPerChunk,
4708
+ resolvedChunking.maxCueTextTokensPerChunk
4709
+ )
4710
+ )
4711
+ };
4712
+ }
4177
4713
  async function fetchVttFromMux(vttUrl) {
4178
4714
  "use step";
4179
4715
  const vttResponse = await fetch(vttUrl);
@@ -4219,6 +4755,176 @@ ${vttContent}`
4219
4755
  }
4220
4756
  };
4221
4757
  }
4758
+ async function translateCueChunkWithAI({
4759
+ cues,
4760
+ fromLanguageCode,
4761
+ toLanguageCode,
4762
+ provider,
4763
+ modelId,
4764
+ credentials
4765
+ }) {
4766
+ "use step";
4767
+ const model = await createLanguageModelFromConfig(provider, modelId, credentials);
4768
+ const schema = z6.object({
4769
+ translations: z6.array(z6.string().min(1)).length(cues.length)
4770
+ });
4771
+ const cuePayload = cues.map((cue, index) => ({
4772
+ index,
4773
+ startTime: cue.startTime,
4774
+ endTime: cue.endTime,
4775
+ text: cue.text
4776
+ }));
4777
+ const response = await generateText5({
4778
+ model,
4779
+ output: Output5.object({ schema }),
4780
+ messages: [
4781
+ {
4782
+ role: "system",
4783
+ content: CUE_TRANSLATION_SYSTEM_PROMPT
4784
+ },
4785
+ {
4786
+ role: "user",
4787
+ content: `Translate from ${fromLanguageCode} to ${toLanguageCode}.
4788
+ Return exactly ${cues.length} translated cues in the same order as the input.
4789
+
4790
+ ${JSON.stringify(cuePayload, null, 2)}`
4791
+ }
4792
+ ]
4793
+ });
4794
+ return {
4795
+ translations: response.output.translations,
4796
+ usage: {
4797
+ inputTokens: response.usage.inputTokens,
4798
+ outputTokens: response.usage.outputTokens,
4799
+ totalTokens: response.usage.totalTokens,
4800
+ reasoningTokens: response.usage.reasoningTokens,
4801
+ cachedInputTokens: response.usage.cachedInputTokens
4802
+ }
4803
+ };
4804
+ }
4805
+ function splitTranslationChunkAtMidpoint(chunk) {
4806
+ const midpoint = Math.floor(chunk.cueCount / 2);
4807
+ if (midpoint <= 0 || midpoint >= chunk.cueCount) {
4808
+ throw new Error(`Cannot split chunk ${chunk.id} with cueCount=${chunk.cueCount}`);
4809
+ }
4810
+ return [
4811
+ createTranslationChunkRequest(
4812
+ `${chunk.id}-a`,
4813
+ chunk.cues.slice(0, midpoint),
4814
+ chunk.cueBlocks.slice(0, midpoint)
4815
+ ),
4816
+ createTranslationChunkRequest(
4817
+ `${chunk.id}-b`,
4818
+ chunk.cues.slice(midpoint),
4819
+ chunk.cueBlocks.slice(midpoint)
4820
+ )
4821
+ ];
4822
+ }
4823
+ async function translateChunkWithFallback({
4824
+ chunk,
4825
+ fromLanguageCode,
4826
+ toLanguageCode,
4827
+ provider,
4828
+ modelId,
4829
+ credentials
4830
+ }) {
4831
+ "use step";
4832
+ try {
4833
+ const result = await translateCueChunkWithAI({
4834
+ cues: chunk.cues,
4835
+ fromLanguageCode,
4836
+ toLanguageCode,
4837
+ provider,
4838
+ modelId,
4839
+ credentials
4840
+ });
4841
+ if (result.translations.length !== chunk.cueCount) {
4842
+ throw new TranslationChunkValidationError(
4843
+ `Chunk ${chunk.id} returned ${result.translations.length} cues, expected ${chunk.cueCount} for ${Math.round(chunk.startTime)}s-${Math.round(chunk.endTime)}s`
4844
+ );
4845
+ }
4846
+ return {
4847
+ translatedVtt: buildVttFromTranslatedCueBlocks(chunk.cueBlocks, result.translations),
4848
+ usage: result.usage
4849
+ };
4850
+ } catch (error) {
4851
+ if (!shouldSplitChunkTranslationError(error) || chunk.cueCount <= 1) {
4852
+ throw new Error(
4853
+ `Chunk ${chunk.id} failed for ${Math.round(chunk.startTime)}s-${Math.round(chunk.endTime)}s: ${error instanceof Error ? error.message : "Unknown error"}`
4854
+ );
4855
+ }
4856
+ const [leftChunk, rightChunk] = splitTranslationChunkAtMidpoint(chunk);
4857
+ const [leftResult, rightResult] = await Promise.all([
4858
+ translateChunkWithFallback({
4859
+ chunk: leftChunk,
4860
+ fromLanguageCode,
4861
+ toLanguageCode,
4862
+ provider,
4863
+ modelId,
4864
+ credentials
4865
+ }),
4866
+ translateChunkWithFallback({
4867
+ chunk: rightChunk,
4868
+ fromLanguageCode,
4869
+ toLanguageCode,
4870
+ provider,
4871
+ modelId,
4872
+ credentials
4873
+ })
4874
+ ]);
4875
+ return {
4876
+ translatedVtt: concatenateVttSegments([leftResult.translatedVtt, rightResult.translatedVtt]),
4877
+ usage: aggregateTokenUsage([leftResult.usage, rightResult.usage])
4878
+ };
4879
+ }
4880
+ }
4881
+ async function translateCaptionTrack({
4882
+ vttContent,
4883
+ assetDurationSeconds,
4884
+ fromLanguageCode,
4885
+ toLanguageCode,
4886
+ provider,
4887
+ modelId,
4888
+ credentials,
4889
+ chunking
4890
+ }) {
4891
+ "use step";
4892
+ const chunkPlan = buildTranslationChunkRequests(vttContent, assetDurationSeconds, chunking);
4893
+ if (!chunkPlan) {
4894
+ return translateVttWithAI({
4895
+ vttContent,
4896
+ fromLanguageCode,
4897
+ toLanguageCode,
4898
+ provider,
4899
+ modelId,
4900
+ credentials
4901
+ });
4902
+ }
4903
+ const resolvedChunking = resolveTranslationChunkingOptions(chunking);
4904
+ const translatedSegments = [];
4905
+ const usageByChunk = [];
4906
+ for (let index = 0; index < chunkPlan.chunks.length; index += resolvedChunking.maxConcurrentTranslations) {
4907
+ const batch = chunkPlan.chunks.slice(index, index + resolvedChunking.maxConcurrentTranslations);
4908
+ const batchResults = await Promise.all(
4909
+ batch.map(
4910
+ (chunk) => translateChunkWithFallback({
4911
+ chunk,
4912
+ fromLanguageCode,
4913
+ toLanguageCode,
4914
+ provider,
4915
+ modelId,
4916
+ credentials
4917
+ })
4918
+ )
4919
+ );
4920
+ translatedSegments.push(...batchResults.map((result) => result.translatedVtt));
4921
+ usageByChunk.push(...batchResults.map((result) => result.usage));
4922
+ }
4923
+ return {
4924
+ translatedVtt: concatenateVttSegments(translatedSegments, chunkPlan.preamble),
4925
+ usage: aggregateTokenUsage(usageByChunk)
4926
+ };
4927
+ }
4222
4928
  async function uploadVttToS3({
4223
4929
  translatedVtt,
4224
4930
  assetId,
@@ -4279,7 +4985,8 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
4279
4985
  s3Bucket: providedS3Bucket,
4280
4986
  uploadToMux: uploadToMuxOption,
4281
4987
  storageAdapter,
4282
- credentials: providedCredentials
4988
+ credentials: providedCredentials,
4989
+ chunking
4283
4990
  } = options;
4284
4991
  const credentials = providedCredentials;
4285
4992
  const effectiveStorageAdapter = storageAdapter;
@@ -4340,13 +5047,15 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
4340
5047
  let translatedVtt;
4341
5048
  let usage;
4342
5049
  try {
4343
- const result = await translateVttWithAI({
5050
+ const result = await translateCaptionTrack({
4344
5051
  vttContent,
5052
+ assetDurationSeconds,
4345
5053
  fromLanguageCode,
4346
5054
  toLanguageCode,
4347
5055
  provider: modelConfig.provider,
4348
5056
  modelId: modelConfig.modelId,
4349
- credentials
5057
+ credentials,
5058
+ chunking
4350
5059
  });
4351
5060
  translatedVtt = result.translatedVtt;
4352
5061
  usage = result.usage;