@mux/ai 0.8.2 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -837,6 +837,12 @@ function createToneSection(instruction) {
837
837
  content: instruction
838
838
  };
839
839
  }
840
/**
 * Builds the prompt section that pins the output language.
 *
 * @param {string} languageName - Human-readable language name interpolated into the instruction.
 * @returns {{ tag: string, content: string }} Section tagged "language" carrying the instruction text.
 */
function createLanguageSection(languageName) {
  const content = `All output (title, description, keywords, chapter titles) MUST be written in ${languageName}.`;
  return { tag: "language", content };
}
840
846
 
841
847
  // src/lib/retry.ts
842
848
  var DEFAULT_RETRY_OPTIONS = {
@@ -981,24 +987,82 @@ function findCaptionTrack(asset, languageCode) {
981
987
  (track) => track.text_type === "subtitles" && track.language_code === languageCode
982
988
  );
983
989
  }
990
/**
 * Converts every line terminator in `value` to a single "\n".
 *
 * WebVTT accepts CRLF, LF and a bare CR as line terminators; callers split on
 * "\n", so a bare CR must be normalized as well or the content stays on one line.
 *
 * @param {string} value - Raw text that may use mixed line endings.
 * @returns {string} The text with CRLF and lone CR replaced by LF.
 */
function normalizeLineEndings(value) {
  // `\r\n?` consumes a CRLF pair as one terminator, so it never yields two newlines.
  return value.replace(/\r\n?/g, "\n");
}
993
/**
 * Reports whether a line contains the cue timing arrow ("-->").
 *
 * @param {string} line - A single line of VTT text.
 * @returns {boolean} True when the arrow appears anywhere in the line.
 */
function isTimingLine(line) {
  const timingArrow = "-->";
  return line.includes(timingArrow);
}
996
/**
 * Parses a line that consists solely of decimal digits.
 *
 * @param {string} line - Candidate cue identifier line.
 * @returns {number | null} The base-10 integer value, or null when the line is not all digits.
 */
function parseNumericCueIdentifier(line) {
  const isAllDigits = /^\d+$/.test(line);
  return isAllDigits ? Number.parseInt(line, 10) : null;
}
1002
/**
 * Tests for an identifier shaped like "<digits> - <text>": digits, whitespace,
 * a hyphen, whitespace, then at least one non-space character.
 *
 * @param {string} line - Candidate cue identifier line.
 * @returns {boolean} True when the line matches that shape.
 */
function isLikelyTitledCueIdentifier(line) {
  const titledIdentifierPattern = /^\d+\s+-\s+\S.*$/;
  return titledIdentifierPattern.test(line);
}
1005
/**
 * Decides whether `line` is a cue identifier rather than cue text.
 *
 * A line only qualifies when the following line is a timing line. A purely
 * numeric line must continue the sequence (1 when no previous identifier is
 * known, otherwise previous + 1); any other line must match the titled form.
 *
 * @param {object} params
 * @param {string} params.line - Trimmed candidate line.
 * @param {string | undefined} params.nextLine - Trimmed line that follows the candidate.
 * @param {number | null | undefined} params.previousCueIdentifier - Last numeric identifier seen, if any.
 * @returns {boolean} True when the line looks like a cue identifier.
 */
function isLikelyCueIdentifier({ line, nextLine, previousCueIdentifier }) {
  const precedesTimingLine = Boolean(line) && Boolean(nextLine) && isTimingLine(nextLine);
  if (!precedesTimingLine) {
    return false;
  }

  const cueNumber = parseNumericCueIdentifier(line);
  if (cueNumber === null) {
    return isLikelyTitledCueIdentifier(line);
  }

  // `== null` covers both null and undefined, i.e. "no earlier identifier".
  const expectedNumber = previousCueIdentifier == null ? 1 : previousCueIdentifier + 1;
  return cueNumber === expectedNumber;
}
1022
/**
 * Finds the index of the cue identifier line directly above a timing line.
 *
 * @param {string[]} lines - All lines of the VTT content.
 * @param {number} timingLineIndex - Index of the timing line being inspected.
 * @param {number | null | undefined} previousCueIdentifier - Last numeric identifier seen, if any.
 * @returns {number} Index of the identifier line, or -1 when the preceding line
 *   is missing, blank, itself a timing line, or does not look like an identifier.
 */
function getCueIdentifierLineIndex(lines, timingLineIndex, previousCueIdentifier) {
  const NOT_FOUND = -1;
  const candidateIndex = timingLineIndex - 1;
  if (candidateIndex < 0) {
    return NOT_FOUND;
  }

  const candidateLine = lines[candidateIndex].trim();
  if (!candidateLine || isTimingLine(candidateLine)) {
    return NOT_FOUND;
  }

  const timingLine = lines[timingLineIndex]?.trim();
  const looksLikeIdentifier = isLikelyCueIdentifier({
    line: candidateLine,
    nextLine: timingLine,
    previousCueIdentifier
  });
  return looksLikeIdentifier ? candidateIndex : NOT_FOUND;
}
984
1037
  function extractTextFromVTT(vttContent) {
985
1038
  if (!vttContent.trim()) {
986
1039
  return "";
987
1040
  }
988
1041
  const lines = vttContent.split("\n");
989
1042
  const textLines = [];
1043
+ let previousCueIdentifier = null;
1044
+ let isInsideNoteBlock = false;
990
1045
  for (let i = 0; i < lines.length; i++) {
991
1046
  const line = lines[i].trim();
992
- if (!line)
1047
+ const nextLine = lines[i + 1]?.trim();
1048
+ if (!line) {
1049
+ isInsideNoteBlock = false;
1050
+ continue;
1051
+ }
1052
+ if (isInsideNoteBlock)
993
1053
  continue;
994
1054
  if (line === "WEBVTT")
995
1055
  continue;
996
- if (line.startsWith("NOTE "))
1056
+ if (line === "NOTE" || line.startsWith("NOTE ")) {
1057
+ isInsideNoteBlock = true;
997
1058
  continue;
998
- if (line.includes("-->"))
1059
+ }
1060
+ if (isTimingLine(line))
999
1061
  continue;
1000
- if (/^[\w-]+$/.test(line) && !line.includes(" "))
1062
+ if (isLikelyCueIdentifier({ line, nextLine, previousCueIdentifier })) {
1063
+ previousCueIdentifier = parseNumericCueIdentifier(line);
1001
1064
  continue;
1065
+ }
1002
1066
  if (line.startsWith("STYLE") || line.startsWith("REGION"))
1003
1067
  continue;
1004
1068
  const cleanLine = line.replace(/<[^>]*>/g, "").trim();
@@ -1047,20 +1111,34 @@ function parseVTTCues(vttContent) {
1047
1111
  return [];
1048
1112
  const lines = vttContent.split("\n");
1049
1113
  const cues = [];
1114
+ let previousCueIdentifier = null;
1050
1115
  for (let i = 0; i < lines.length; i++) {
1051
1116
  const line = lines[i].trim();
1052
- if (line.includes("-->")) {
1117
+ if (isTimingLine(line)) {
1053
1118
  const [startStr, endStr] = line.split(" --> ").map((s) => s.trim());
1054
1119
  const startTime = vttTimestampToSeconds(startStr);
1055
1120
  const endTime = vttTimestampToSeconds(endStr.split(" ")[0]);
1056
- const textLines = [];
1121
+ const currentCueIdentifierLine = lines[i - 1]?.trim() ?? "";
1122
+ const currentCueIdentifier = isLikelyCueIdentifier({
1123
+ line: currentCueIdentifierLine,
1124
+ nextLine: line,
1125
+ previousCueIdentifier
1126
+ }) ? parseNumericCueIdentifier(currentCueIdentifierLine) : null;
1127
+ const rawTextLines = [];
1057
1128
  let j = i + 1;
1058
- while (j < lines.length && lines[j].trim() && !lines[j].includes("-->")) {
1059
- const cleanLine = lines[j].trim().replace(/<[^>]*>/g, "");
1060
- if (cleanLine)
1061
- textLines.push(cleanLine);
1129
+ while (j < lines.length && lines[j].trim() && !isTimingLine(lines[j].trim())) {
1130
+ rawTextLines.push(lines[j].trim());
1062
1131
  j++;
1063
1132
  }
1133
+ const trailingNumericLine = parseNumericCueIdentifier(rawTextLines.at(-1) ?? "");
1134
+ if (trailingNumericLine !== null && isLikelyCueIdentifier({
1135
+ line: rawTextLines.at(-1) ?? "",
1136
+ nextLine: lines[j]?.trim(),
1137
+ previousCueIdentifier: currentCueIdentifier
1138
+ }) && rawTextLines.length > 1) {
1139
+ rawTextLines.pop();
1140
+ }
1141
+ const textLines = rawTextLines.map((textLine) => textLine.replace(/<[^>]*>/g, "")).filter(Boolean);
1064
1142
  if (textLines.length > 0) {
1065
1143
  cues.push({
1066
1144
  startTime,
@@ -1068,10 +1146,102 @@ function parseVTTCues(vttContent) {
1068
1146
  text: textLines.join(" ")
1069
1147
  });
1070
1148
  }
1149
+ previousCueIdentifier = currentCueIdentifier;
1071
1150
  }
1072
1151
  }
1073
1152
  return cues;
1074
1153
  }
1154
/**
 * Splits VTT content into its preamble (everything before the first cue) and
 * a list of cue blocks.
 *
 * Blocks are normally separated by blank lines. When any block at or after the
 * first cue holds more than one timing line (cues not separated by a blank
 * line), the content is re-scanned line by line and each cue is cut out around
 * its timing line, including a preceding identifier line when one is detected.
 *
 * @param {string} vttContent - Raw VTT text.
 * @returns {{ preamble: string, cueBlocks: string[] }} Preamble (defaults to
 *   "WEBVTT" when empty) and the trimmed cue blocks in document order.
 */
function splitVttPreambleAndCueBlocks(vttContent) {
  const DEFAULT_PREAMBLE = "WEBVTT";
  const content = normalizeLineEndings(vttContent).trim();
  if (!content) {
    return { preamble: DEFAULT_PREAMBLE, cueBlocks: [] };
  }

  const blocks = content
    .split(/\n{2,}/)
    .map((block) => block.trim())
    .filter(Boolean);
  const firstCueBlock = blocks.findIndex((block) => block.includes("-->"));
  if (firstCueBlock === -1) {
    // No cues at all: the whole content is preamble, prefixed with a header if it lacks one.
    const preamble = content.startsWith("WEBVTT") ? content : `WEBVTT\n\n${content}`;
    return { preamble, cueBlocks: [] };
  }

  const countTimingArrows = (block) => (block.match(/-->/g) ?? []).length;
  const hasMergedCueBlocks = blocks.slice(firstCueBlock).some((block) => countTimingArrows(block) > 1);
  if (!hasMergedCueBlocks) {
    const preambleBlocks = blocks.slice(0, firstCueBlock);
    return {
      preamble: preambleBlocks.length > 0 ? preambleBlocks.join("\n\n") : DEFAULT_PREAMBLE,
      cueBlocks: blocks.slice(firstCueBlock)
    };
  }

  // Merged cues: blank-line splitting is unreliable, so work from timing lines instead.
  const lines = content.split("\n");
  const timingLineIndices = [];
  for (let lineIndex = 0; lineIndex < lines.length; lineIndex++) {
    if (isTimingLine(lines[lineIndex].trim())) {
      timingLineIndices.push(lineIndex);
    }
  }

  const firstTimingLine = timingLineIndices[0];
  const firstIdentifierLine = getCueIdentifierLineIndex(lines, firstTimingLine, null);
  const preambleEnd = firstIdentifierLine >= 0 ? firstIdentifierLine : firstTimingLine;
  const preamble = lines.slice(0, preambleEnd).join("\n").trim() || DEFAULT_PREAMBLE;

  // Walks upward from `fromIndex` past blank lines, never crossing `floorIndex`.
  const skipBlankLinesBackward = (fromIndex, floorIndex) => {
    let cursor = fromIndex;
    while (cursor > floorIndex && !lines[cursor].trim()) {
      cursor--;
    }
    return cursor;
  };

  const cueBlocks = [];
  let previousCueIdentifier = null;
  for (let position = 0; position < timingLineIndices.length; position++) {
    const timingLineIndex = timingLineIndices[position];
    const identifierLine = getCueIdentifierLineIndex(lines, timingLineIndex, previousCueIdentifier);
    const hasIdentifier = identifierLine >= 0;
    const cueStart = hasIdentifier ? identifierLine : timingLineIndex;
    const cueIdentifier = hasIdentifier ? parseNumericCueIdentifier(lines[identifierLine].trim()) : null;

    const isLastCue = position === timingLineIndices.length - 1;
    const nextTimingLineIndex = timingLineIndices[position + 1] ?? lines.length;
    let cueEnd = skipBlankLinesBackward(nextTimingLineIndex - 1, timingLineIndex);

    // The last non-blank line may be the NEXT cue's identifier; it is not part of this cue.
    const nextIdentifierLine = isLastCue ? -1 : getCueIdentifierLineIndex(lines, nextTimingLineIndex, cueIdentifier);
    if (nextIdentifierLine === cueEnd) {
      cueEnd--;
    }
    cueEnd = skipBlankLinesBackward(cueEnd, timingLineIndex);

    cueBlocks.push(lines.slice(cueStart, cueEnd + 1).join("\n").trim());
    previousCueIdentifier = cueIdentifier;
  }

  return { preamble, cueBlocks };
}
1212
/**
 * Assembles VTT text from a preamble and cue blocks.
 *
 * @param {string[]} cueBlocks - Cue blocks; each is trimmed before joining.
 * @param {string} [preamble="WEBVTT"] - Header text placed before the cues (trimmed).
 * @returns {string} Preamble and cues separated by blank lines, ending with a newline.
 */
function buildVttFromCueBlocks(cueBlocks, preamble = "WEBVTT") {
  if (cueBlocks.length === 0) {
    return `${preamble.trim()}\n`;
  }
  const sections = [preamble.trim(), ...cueBlocks.map((block) => block.trim())];
  return `${sections.join("\n\n")}\n`;
}
1222
/**
 * Replaces the text of a cue block while keeping its header lines.
 *
 * Everything up to and including the first line containing "-->" is kept;
 * the remaining lines are replaced by the non-empty, trimmed lines of
 * `translatedText`.
 *
 * @param {string} cueBlock - Original cue block.
 * @param {string} translatedText - Replacement cue text (may span several lines).
 * @returns {string} The rebuilt cue block joined with "\n".
 * @throws {Error} When the cue block has no timing line.
 */
function replaceCueText(cueBlock, translatedText) {
  const toNonEmptyLines = (text) =>
    normalizeLineEndings(text)
      .split("\n")
      .map((line) => line.trim())
      .filter(Boolean);

  const cueLines = toNonEmptyLines(cueBlock);
  const timingLineIndex = cueLines.findIndex((line) => line.includes("-->"));
  if (timingLineIndex === -1) {
    throw new Error("Cue block is missing a timestamp line");
  }

  const header = cueLines.slice(0, timingLineIndex + 1);
  return header.concat(toNonEmptyLines(translatedText)).join("\n");
}
1232
/**
 * Builds VTT text by pairing each cue block with its translated text.
 *
 * @param {string[]} cueBlocks - Original cue blocks.
 * @param {string[]} translatedTexts - Translated text for each cue, in the same order.
 * @param {string} [preamble="WEBVTT"] - Header text placed before the cues.
 * @returns {string} The assembled VTT text.
 * @throws {Error} When the two arrays differ in length.
 */
function buildVttFromTranslatedCueBlocks(cueBlocks, translatedTexts, preamble = "WEBVTT") {
  const expectedCount = cueBlocks.length;
  const receivedCount = translatedTexts.length;
  if (expectedCount !== receivedCount) {
    throw new Error(`Expected ${expectedCount} translated cues, received ${receivedCount}`);
  }

  const translatedBlocks = cueBlocks.map((cueBlock, index) => replaceCueText(cueBlock, translatedTexts[index]));
  return buildVttFromCueBlocks(translatedBlocks, preamble);
}
1241
/**
 * Merges several VTT documents into one: the cue blocks of every segment are
 * collected in order and each segment's own preamble is discarded.
 *
 * @param {string[]} segments - VTT documents to merge.
 * @param {string} [preamble="WEBVTT"] - Header text for the merged document.
 * @returns {string} The merged VTT text.
 */
function concatenateVttSegments(segments, preamble = "WEBVTT") {
  const cueBlocksPerSegment = segments.map((segment) => splitVttPreambleAndCueBlocks(segment).cueBlocks);
  return buildVttFromCueBlocks(cueBlocksPerSegment.flat(), preamble);
}
1075
1245
  async function buildTranscriptUrl(playbackId, trackId, shouldSign = false, credentials) {
1076
1246
  "use step";
1077
1247
  const baseUrl = `https://stream.mux.com/${playbackId}/text/${trackId}.vtt`;
@@ -1200,6 +1370,7 @@ var SYSTEM_PROMPT = dedent`
1200
1370
  - Only describe observable evidence from frames or transcript
1201
1371
  - Do not fabricate details or make unsupported assumptions
1202
1372
  - Return structured data matching the requested schema exactly
1373
+ - Provide reasoning in the same language as the question
1203
1374
  </constraints>
1204
1375
 
1205
1376
  <language_guidelines>
@@ -1600,6 +1771,166 @@ async function hasBurnedInCaptions(assetId, options = {}) {
1600
1771
  import { generateText as generateText3, Output as Output3 } from "ai";
1601
1772
  import dedent3 from "dedent";
1602
1773
  import { z as z4 } from "zod";
1774
+
1775
+ // src/lib/language-codes.ts
1776
+ var ISO639_1_TO_3 = {
1777
+ // Major world languages
1778
+ en: "eng",
1779
+ // English
1780
+ es: "spa",
1781
+ // Spanish
1782
+ fr: "fra",
1783
+ // French
1784
+ de: "deu",
1785
+ // German
1786
+ it: "ita",
1787
+ // Italian
1788
+ pt: "por",
1789
+ // Portuguese
1790
+ ru: "rus",
1791
+ // Russian
1792
+ zh: "zho",
1793
+ // Chinese
1794
+ ja: "jpn",
1795
+ // Japanese
1796
+ ko: "kor",
1797
+ // Korean
1798
+ ar: "ara",
1799
+ // Arabic
1800
+ hi: "hin",
1801
+ // Hindi
1802
+ // European languages
1803
+ nl: "nld",
1804
+ // Dutch
1805
+ pl: "pol",
1806
+ // Polish
1807
+ sv: "swe",
1808
+ // Swedish
1809
+ da: "dan",
1810
+ // Danish
1811
+ no: "nor",
1812
+ // Norwegian
1813
+ fi: "fin",
1814
+ // Finnish
1815
+ el: "ell",
1816
+ // Greek
1817
+ cs: "ces",
1818
+ // Czech
1819
+ hu: "hun",
1820
+ // Hungarian
1821
+ ro: "ron",
1822
+ // Romanian
1823
+ bg: "bul",
1824
+ // Bulgarian
1825
+ hr: "hrv",
1826
+ // Croatian
1827
+ sk: "slk",
1828
+ // Slovak
1829
+ sl: "slv",
1830
+ // Slovenian
1831
+ uk: "ukr",
1832
+ // Ukrainian
1833
+ tr: "tur",
1834
+ // Turkish
1835
+ // Asian languages
1836
+ th: "tha",
1837
+ // Thai
1838
+ vi: "vie",
1839
+ // Vietnamese
1840
+ id: "ind",
1841
+ // Indonesian
1842
+ ms: "msa",
1843
+ // Malay
1844
+ tl: "tgl",
1845
+ // Tagalog/Filipino
1846
+ // Other languages
1847
+ he: "heb",
1848
+ // Hebrew
1849
+ fa: "fas",
1850
+ // Persian/Farsi
1851
+ bn: "ben",
1852
+ // Bengali
1853
+ ta: "tam",
1854
+ // Tamil
1855
+ te: "tel",
1856
+ // Telugu
1857
+ mr: "mar",
1858
+ // Marathi
1859
+ gu: "guj",
1860
+ // Gujarati
1861
+ kn: "kan",
1862
+ // Kannada
1863
+ ml: "mal",
1864
+ // Malayalam
1865
+ pa: "pan",
1866
+ // Punjabi
1867
+ ur: "urd",
1868
+ // Urdu
1869
+ sw: "swa",
1870
+ // Swahili
1871
+ af: "afr",
1872
+ // Afrikaans
1873
+ ca: "cat",
1874
+ // Catalan
1875
+ eu: "eus",
1876
+ // Basque
1877
+ gl: "glg",
1878
+ // Galician
1879
+ is: "isl",
1880
+ // Icelandic
1881
+ et: "est",
1882
+ // Estonian
1883
+ lv: "lav",
1884
+ // Latvian
1885
+ lt: "lit"
1886
+ // Lithuanian
1887
+ };
1888
+ var ISO639_3_TO_1 = Object.fromEntries(
1889
+ Object.entries(ISO639_1_TO_3).map(([iso1, iso3]) => [iso3, iso1])
1890
+ );
1891
+ function toISO639_3(code) {
1892
+ const normalized = code.toLowerCase().trim();
1893
+ if (normalized.length === 3) {
1894
+ return normalized;
1895
+ }
1896
+ return ISO639_1_TO_3[normalized] ?? normalized;
1897
+ }
1898
+ function toISO639_1(code) {
1899
+ const normalized = code.toLowerCase().trim();
1900
+ if (normalized.length === 2) {
1901
+ return normalized;
1902
+ }
1903
+ return ISO639_3_TO_1[normalized] ?? normalized;
1904
+ }
1905
+ function getLanguageCodePair(code) {
1906
+ const normalized = code.toLowerCase().trim();
1907
+ if (normalized.length === 2) {
1908
+ return {
1909
+ iso639_1: normalized,
1910
+ iso639_3: toISO639_3(normalized)
1911
+ };
1912
+ } else if (normalized.length === 3) {
1913
+ return {
1914
+ iso639_1: toISO639_1(normalized),
1915
+ iso639_3: normalized
1916
+ };
1917
+ }
1918
+ return {
1919
+ iso639_1: normalized,
1920
+ iso639_3: normalized
1921
+ };
1922
+ }
1923
+ function getLanguageName(code) {
1924
+ const iso639_1 = toISO639_1(code);
1925
+ try {
1926
+ const displayNames = new Intl.DisplayNames(["en"], { type: "language" });
1927
+ return displayNames.of(iso639_1) ?? code.toUpperCase();
1928
+ } catch {
1929
+ return code.toUpperCase();
1930
+ }
1931
+ }
1932
+
1933
+ // src/workflows/chapters.ts
1603
1934
  var chapterSchema = z4.object({
1604
1935
  startTime: z4.number(),
1605
1936
  title: z4.string()
@@ -1660,7 +1991,8 @@ var chapterSystemPromptBuilder = createPromptBuilder({
1660
1991
  content: dedent3`
1661
1992
  - Only use information present in the transcript
1662
1993
  - Return structured data that matches the requested JSON schema
1663
- - Do not add commentary or extra text outside the JSON`
1994
+ - Do not add commentary or extra text outside the JSON
1995
+ - When a <language> section is provided, all chapter titles MUST be written in that language`
1664
1996
  },
1665
1997
  qualityGuidelines: {
1666
1998
  tag: "quality_guidelines",
@@ -1708,7 +2040,7 @@ var chaptersPromptBuilder = createPromptBuilder({
1708
2040
  content: dedent3`
1709
2041
  - Keep titles concise and descriptive
1710
2042
  - Avoid filler or generic labels like "Chapter 1"
1711
- - Use the transcript's language and terminology`
2043
+ - Use the transcript's terminology`
1712
2044
  }
1713
2045
  },
1714
2046
  sectionOrder: ["task", "outputFormat", "chapterGuidelines", "titleGuidelines"]
@@ -1717,7 +2049,8 @@ function buildUserPrompt3({
1717
2049
  timestampedTranscript,
1718
2050
  promptOverrides,
1719
2051
  minChaptersPerHour = 3,
1720
- maxChaptersPerHour = 8
2052
+ maxChaptersPerHour = 8,
2053
+ languageName
1721
2054
  }) {
1722
2055
  const contextSections = [
1723
2056
  {
@@ -1726,6 +2059,9 @@ function buildUserPrompt3({
1726
2059
  attributes: { format: "seconds" }
1727
2060
  }
1728
2061
  ];
2062
+ if (languageName) {
2063
+ contextSections.push(createLanguageSection(languageName));
2064
+ }
1729
2065
  const dynamicChapterGuidelines = dedent3`
1730
2066
  - Create at least ${minChaptersPerHour} and at most ${maxChaptersPerHour} chapters per hour of content
1731
2067
  - Use start times in seconds (not HH:MM:SS)
@@ -1745,7 +2081,8 @@ async function generateChapters(assetId, languageCode, options = {}) {
1745
2081
  promptOverrides,
1746
2082
  minChaptersPerHour,
1747
2083
  maxChaptersPerHour,
1748
- credentials
2084
+ credentials,
2085
+ outputLanguageCode
1749
2086
  } = options;
1750
2087
  const modelConfig = resolveLanguageModelConfig({
1751
2088
  ...options,
@@ -1789,11 +2126,14 @@ async function generateChapters(assetId, languageCode, options = {}) {
1789
2126
  const contentLabel = isAudioOnly ? "transcript" : "caption track";
1790
2127
  throw new Error(`No usable content found in ${contentLabel}`);
1791
2128
  }
2129
+ const resolvedLanguageCode = outputLanguageCode && outputLanguageCode !== "auto" ? outputLanguageCode : transcriptResult.track?.language_code ?? languageCode;
2130
+ const languageName = resolvedLanguageCode ? getLanguageName(resolvedLanguageCode) : void 0;
1792
2131
  const userPrompt = buildUserPrompt3({
1793
2132
  timestampedTranscript,
1794
2133
  promptOverrides,
1795
2134
  minChaptersPerHour,
1796
- maxChaptersPerHour
2135
+ maxChaptersPerHour,
2136
+ languageName
1797
2137
  });
1798
2138
  let chaptersData = null;
1799
2139
  try {
@@ -1840,6 +2180,14 @@ async function generateChapters(assetId, languageCode, options = {}) {
1840
2180
  import { embed } from "ai";
1841
2181
 
1842
2182
  // src/primitives/text-chunking.ts
2183
// Defaults used by duration-based cue chunking.
// Minimum chunk duration as a fraction of the target duration.
var DEFAULT_MIN_CHUNK_DURATION_RATIO = 2 / 3;
// Default value for the `boundaryLookaheadCues` option.
var DEFAULT_BOUNDARY_LOOKAHEAD_CUES = 12;
// Gap (seconds) between consecutive cues that counts as a pause.
var DEFAULT_BOUNDARY_PAUSE_SECONDS = 1.25;
// Boundary score at or above which a chunk may end as soon as the target is reached.
var STRONG_BOUNDARY_SCORE = 4;
// Upper bound (seconds) on the window before the target in which boundaries are preferred.
var PREFERRED_BOUNDARY_WINDOW_SECONDS = 5 * 60;

// Punctuation patterns used to score cue boundaries.
// Cue text ending in ., ! or ?, optionally followed by closing quotes/brackets.
var SENTENCE_BOUNDARY_REGEX = /[.!?]["')\]]*$/;
// Cue text ending in , ; or :, optionally followed by closing quotes/brackets.
var CLAUSE_BOUNDARY_REGEX = /[,;:]["')\]]*$/;
// Cue text starting with an ASCII capital, a digit, or an opening quote/bracket.
var NEXT_SENTENCE_START_REGEX = /^[A-Z0-9"'([{]/;
1843
2191
  function estimateTokenCount(text) {
1844
2192
  const words = text.trim().split(/\s+/).length;
1845
2193
  return Math.ceil(words / 0.75);
@@ -1912,6 +2260,151 @@ function chunkVTTCues(cues, maxTokens, overlapCues = 2) {
1912
2260
  }
1913
2261
  return chunks;
1914
2262
  }
2263
/**
 * Scores how suitable the gap after `cues[index]` is as a chunk boundary.
 *
 * Points: 4 when the cue ends a sentence, otherwise 2 when it ends a clause;
 * +2 when the gap before the next cue is at least `boundaryPauseSeconds`;
 * +1 when the next cue looks like the start of a sentence.
 *
 * @param {Array<{ startTime: number, endTime: number, text: string }>} cues - All cues.
 * @param {number} index - Index of the cue that would end the chunk.
 * @param {number} boundaryPauseSeconds - Minimum gap that counts as a pause.
 * @returns {number} The boundary score, or positive infinity for the last cue.
 */
function scoreCueBoundary(cues, index, boundaryPauseSeconds) {
  const current = cues[index];
  const following = cues[index + 1];
  if (!following) {
    // Nothing follows, so ending here is always acceptable.
    return Number.POSITIVE_INFINITY;
  }

  const text = current.text.trim();
  let punctuationScore = 0;
  if (SENTENCE_BOUNDARY_REGEX.test(text)) {
    punctuationScore = 4;
  } else if (CLAUSE_BOUNDARY_REGEX.test(text)) {
    punctuationScore = 2;
  }

  const gapSeconds = following.startTime - current.endTime;
  const pauseScore = gapSeconds >= boundaryPauseSeconds ? 2 : 0;
  const sentenceStartScore = NEXT_SENTENCE_START_REGEX.test(following.text.trim()) ? 1 : 0;

  return punctuationScore + pauseScore + sentenceStartScore;
}
2284
/**
 * Groups consecutive cues into chunks bounded by a cue count and, optionally,
 * an estimated token count.
 *
 * A new chunk starts when the current one already holds `maxCuesPerChunk`
 * cues, or when adding the next cue would push a non-empty chunk past
 * `maxTextTokensPerChunk`. A single cue is never split, so one cue may exceed
 * the token budget on its own.
 *
 * @param {Array<{ startTime: number, endTime: number, text: string }>} cues - Cues in order.
 * @param {{ maxCuesPerChunk: number, maxTextTokensPerChunk?: number }} options - Budgets;
 *   a falsy token budget means no token limit.
 * @returns {Array<{ id: string, cueStartIndex: number, cueEndIndex: number, cueCount: number, startTime: number, endTime: number }>}
 *   Chunk descriptors with inclusive cue index ranges; empty when there are no cues.
 */
function chunkVTTCuesByBudget(cues, options) {
  if (cues.length === 0) {
    return [];
  }

  const cueLimit = Math.max(1, options.maxCuesPerChunk);
  const tokenLimit = options.maxTextTokensPerChunk
    ? Math.max(1, options.maxTextTokensPerChunk)
    : Number.POSITIVE_INFINITY;

  const chunks = [];
  let chunkStart = 0;
  let tokensInChunk = 0;

  // Records the chunk spanning chunkStart..lastCueIndex (inclusive).
  const closeChunk = (lastCueIndex) => {
    chunks.push({
      id: `chunk-${chunks.length}`,
      cueStartIndex: chunkStart,
      cueEndIndex: lastCueIndex,
      cueCount: lastCueIndex - chunkStart + 1,
      startTime: cues[chunkStart].startTime,
      endTime: cues[lastCueIndex].endTime
    });
  };

  for (const [cueIndex, cue] of cues.entries()) {
    const cueTokens = estimateTokenCount(cue.text);
    const cuesInChunk = cueIndex - chunkStart;
    const cueBudgetReached = cuesInChunk >= cueLimit;
    const tokenBudgetExceeded = cuesInChunk > 0 && tokensInChunk + cueTokens > tokenLimit;

    if (cueBudgetReached || tokenBudgetExceeded) {
      closeChunk(cueIndex - 1);
      chunkStart = cueIndex;
      tokensInChunk = 0;
    }
    tokensInChunk += cueTokens;
  }

  closeChunk(cues.length - 1);
  return chunks;
}
2328
/**
 * Groups consecutive cues into chunks of roughly `targetChunkDurationSeconds`,
 * preferring to end each chunk on a well-scored cue boundary.
 *
 * While a chunk grows, the best-scoring boundary seen once the minimum
 * duration is reached is tracked, plus a separate best boundary inside the
 * preferred window just before the target. Later cues win ties. The chunk ends
 * when (a) the target is reached and the tracked boundary is strong, (b) the
 * target is reached and the cue-count lookahead is used up, or (c) adding the
 * next cue would exceed the maximum duration. In each case the chunk is cut at
 * the tracked boundary when one exists, otherwise at the current cue.
 *
 * @param {Array<{ startTime: number, endTime: number, text: string }>} cues - Cues in order.
 * @param {object} options
 * @param {number} options.targetChunkDurationSeconds - Desired chunk length (at least 1).
 * @param {number} options.maxChunkDurationSeconds - Upper bound (at least the target).
 * @param {number} [options.minChunkDurationSeconds] - Lower bound before boundaries are considered.
 * @param {number} [options.boundaryLookaheadCues] - Cue count after which a chunk at target stops growing.
 * @param {number} [options.boundaryPauseSeconds] - Gap that counts as a pause when scoring.
 * @returns {Array<{ id: string, cueStartIndex: number, cueEndIndex: number, cueCount: number, startTime: number, endTime: number }>}
 *   Chunk descriptors with inclusive cue index ranges; empty when there are no cues.
 */
function chunkVTTCuesByDuration(cues, options) {
  if (cues.length === 0) {
    return [];
  }

  const targetSeconds = Math.max(1, options.targetChunkDurationSeconds);
  const maxSeconds = Math.max(targetSeconds, options.maxChunkDurationSeconds);
  const requestedMinSeconds =
    options.minChunkDurationSeconds ?? Math.floor(targetSeconds * DEFAULT_MIN_CHUNK_DURATION_RATIO);
  const minSeconds = Math.min(targetSeconds, Math.max(1, requestedMinSeconds));
  const lookaheadCues = Math.max(1, options.boundaryLookaheadCues ?? DEFAULT_BOUNDARY_LOOKAHEAD_CUES);
  const pauseSeconds = options.boundaryPauseSeconds ?? DEFAULT_BOUNDARY_PAUSE_SECONDS;
  const preferredWindowSeconds = Math.min(PREFERRED_BOUNDARY_WINDOW_SECONDS, targetSeconds / 6);
  const preferredStartSeconds = Math.max(minSeconds, targetSeconds - preferredWindowSeconds);

  const chunks = [];
  let chunkStart = 0;
  while (chunkStart < cues.length) {
    const chunkStartTime = cues[chunkStart].startTime;
    // Best boundary anywhere past the minimum, and best one inside the preferred window.
    const bestAny = { index: -1, score: -1 };
    const bestPreferred = { index: -1, score: -1 };
    let chunkEnd = chunkStart;

    while (chunkEnd < cues.length) {
      const elapsed = cues[chunkEnd].endTime - chunkStartTime;
      if (elapsed >= minSeconds) {
        const score = scoreCueBoundary(cues, chunkEnd, pauseSeconds);
        // `>=` so that, among equal scores, the latest boundary is kept.
        if (score >= bestAny.score) {
          bestAny.index = chunkEnd;
          bestAny.score = score;
        }
        if (elapsed >= preferredStartSeconds && score >= bestPreferred.score) {
          bestPreferred.index = chunkEnd;
          bestPreferred.score = score;
        }
      }

      const upcoming = cues[chunkEnd + 1];
      if (!upcoming) {
        break;
      }

      const elapsedWithNext = upcoming.endTime - chunkStartTime;
      const candidate = bestPreferred.index >= chunkStart ? bestPreferred : bestAny;
      const hasCandidate = candidate.index >= chunkStart;
      const reachedTarget = elapsed >= targetSeconds;
      const lookaheadExceeded = chunkEnd - chunkStart >= lookaheadCues;
      const wouldExceedMax = elapsedWithNext > maxSeconds;

      const stopAtStrongBoundary =
        reachedTarget && hasCandidate && candidate.score >= STRONG_BOUNDARY_SCORE;
      const mustStop = wouldExceedMax || (reachedTarget && lookaheadExceeded);
      if (stopAtStrongBoundary || mustStop) {
        if (hasCandidate) {
          chunkEnd = candidate.index;
        }
        break;
      }
      chunkEnd++;
    }

    chunks.push({
      id: `chunk-${chunks.length}`,
      cueStartIndex: chunkStart,
      cueEndIndex: chunkEnd,
      cueCount: chunkEnd - chunkStart + 1,
      startTime: cues[chunkStart].startTime,
      endTime: cues[chunkEnd].endTime
    });
    chunkStart = chunkEnd + 1;
  }

  return chunks;
}
1915
2408
  function chunkText(text, strategy) {
1916
2409
  switch (strategy.type) {
1917
2410
  case "token": {
@@ -2167,10 +2660,8 @@ async function getThumbnailUrls(playbackId, duration, options = {}) {
2167
2660
  }
2168
2661
  const baseUrl = getMuxThumbnailBaseUrl(playbackId);
2169
2662
  const urlPromises = timestamps.map(async (time) => {
2170
- if (shouldSign) {
2171
- return signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials);
2172
- }
2173
- return `${baseUrl}?time=${time}&width=${width}`;
2663
+ const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
2664
+ return { url, time };
2174
2665
  });
2175
2666
  return Promise.all(urlPromises);
2176
2667
  }
@@ -2244,6 +2735,7 @@ async function moderateImageWithOpenAI(entry) {
2244
2735
  const categoryScores = json.results?.[0]?.category_scores || {};
2245
2736
  return {
2246
2737
  url: entry.url,
2738
+ time: entry.time,
2247
2739
  sexual: categoryScores.sexual || 0,
2248
2740
  violence: categoryScores.violence || 0,
2249
2741
  error: false
@@ -2252,6 +2744,7 @@ async function moderateImageWithOpenAI(entry) {
2252
2744
  console.error("OpenAI moderation failed:", error);
2253
2745
  return {
2254
2746
  url: entry.url,
2747
+ time: entry.time,
2255
2748
  sexual: 0,
2256
2749
  violence: 0,
2257
2750
  error: true,
@@ -2259,11 +2752,13 @@ async function moderateImageWithOpenAI(entry) {
2259
2752
  };
2260
2753
  }
2261
2754
  }
2262
- async function requestOpenAIModeration(imageUrls, model, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
2755
+ async function requestOpenAIModeration(images, model, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
2263
2756
  "use step";
2757
+ const imageUrls = images.map((img) => img.url);
2758
+ const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
2264
2759
  const targetUrls = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map(
2265
- (img) => ({ url: img.url, image: img.base64Data, model, credentials })
2266
- ) : imageUrls.map((url) => ({ url, image: url, model, credentials }));
2760
+ (img) => ({ url: img.url, time: timeByUrl.get(img.url), image: img.base64Data, model, credentials })
2761
+ ) : images.map((img) => ({ url: img.url, time: img.time, image: img.url, model, credentials }));
2267
2762
  return processConcurrently(targetUrls, moderateImageWithOpenAI, maxConcurrent);
2268
2763
  }
2269
2764
  async function requestOpenAITextModeration(text, model, url, credentials) {
@@ -2408,6 +2903,7 @@ async function moderateImageWithHive(entry) {
2408
2903
  const violence = getHiveCategoryScores(classes, HIVE_VIOLENCE_CATEGORIES);
2409
2904
  return {
2410
2905
  url: entry.url,
2906
+ time: entry.time,
2411
2907
  sexual,
2412
2908
  violence,
2413
2909
  error: false
@@ -2415,6 +2911,7 @@ async function moderateImageWithHive(entry) {
2415
2911
  } catch (error) {
2416
2912
  return {
2417
2913
  url: entry.url,
2914
+ time: entry.time,
2418
2915
  sexual: 0,
2419
2916
  violence: 0,
2420
2917
  error: true,
@@ -2422,19 +2919,23 @@ async function moderateImageWithHive(entry) {
2422
2919
  };
2423
2920
  }
2424
2921
  }
2425
- async function requestHiveModeration(imageUrls, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
2922
+ async function requestHiveModeration(images, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
2426
2923
  "use step";
2924
+ const imageUrls = images.map((img) => img.url);
2925
+ const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
2427
2926
  const targets = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map((img) => ({
2428
2927
  url: img.url,
2928
+ time: timeByUrl.get(img.url),
2429
2929
  source: {
2430
2930
  kind: "file",
2431
2931
  buffer: img.buffer,
2432
2932
  contentType: img.contentType
2433
2933
  },
2434
2934
  credentials
2435
- })) : imageUrls.map((url) => ({
2436
- url,
2437
- source: { kind: "url", value: url },
2935
+ })) : images.map((img) => ({
2936
+ url: img.url,
2937
+ time: img.time,
2938
+ source: { kind: "url", value: img.url },
2438
2939
  credentials
2439
2940
  }));
2440
2941
  return await processConcurrently(targets, moderateImageWithHive, maxConcurrent);
@@ -2445,10 +2946,8 @@ async function getThumbnailUrlsFromTimestamps(playbackId, timestampsMs, options)
2445
2946
  const baseUrl = getMuxThumbnailBaseUrl(playbackId);
2446
2947
  const urlPromises = timestampsMs.map(async (tsMs) => {
2447
2948
  const time = Number((tsMs / 1e3).toFixed(2));
2448
- if (shouldSign) {
2449
- return signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials);
2450
- }
2451
- return `${baseUrl}?time=${time}&width=${width}`;
2949
+ const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
2950
+ return { url, time };
2452
2951
  });
2453
2952
  return Promise.all(urlPromises);
2454
2953
  }
@@ -2752,6 +3251,7 @@ var SYSTEM_PROMPT3 = dedent4`
2752
3251
  - Do not fabricate details or make unsupported assumptions
2753
3252
  - Return structured data matching the requested schema
2754
3253
  - Output only the JSON object; no markdown or extra text
3254
+ - When a <language> section is provided, all output text MUST be written in that language
2755
3255
  </constraints>
2756
3256
 
2757
3257
  <tone_guidance>
@@ -2806,6 +3306,7 @@ var AUDIO_ONLY_SYSTEM_PROMPT = dedent4`
2806
3306
  - Return structured data matching the requested schema
2807
3307
  - Focus entirely on audio/spoken content - there are no visual elements
2808
3308
  - Output only the JSON object; no markdown or extra text
3309
+ - When a <language> section is provided, all output text MUST be written in that language
2809
3310
  </constraints>
2810
3311
 
2811
3312
  <tone_guidance>
@@ -2836,9 +3337,13 @@ function buildUserPrompt4({
2836
3337
  isAudioOnly = false,
2837
3338
  titleLength,
2838
3339
  descriptionLength,
2839
- tagCount
3340
+ tagCount,
3341
+ languageName
2840
3342
  }) {
2841
3343
  const contextSections = [createToneSection(TONE_INSTRUCTIONS[tone])];
3344
+ if (languageName) {
3345
+ contextSections.push(createLanguageSection(languageName));
3346
+ }
2842
3347
  if (transcriptText) {
2843
3348
  const format = isCleanTranscript ? "plain text" : "WebVTT";
2844
3349
  contextSections.push(createTranscriptSection(transcriptText, format));
@@ -2951,7 +3456,8 @@ async function getSummaryAndTags(assetId, options) {
2951
3456
  credentials,
2952
3457
  titleLength,
2953
3458
  descriptionLength,
2954
- tagCount
3459
+ tagCount,
3460
+ outputLanguageCode
2955
3461
  } = options ?? {};
2956
3462
  if (!VALID_TONES.includes(tone)) {
2957
3463
  throw new Error(
@@ -2978,12 +3484,15 @@ async function getSummaryAndTags(assetId, options) {
2978
3484
  "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
2979
3485
  );
2980
3486
  }
2981
- const transcriptText = includeTranscript ? (await fetchTranscriptForAsset(assetData, playbackId, {
3487
+ const transcriptResult = includeTranscript ? await fetchTranscriptForAsset(assetData, playbackId, {
2982
3488
  cleanTranscript,
2983
3489
  shouldSign: policy === "signed",
2984
3490
  credentials: workflowCredentials,
2985
3491
  required: isAudioOnly
2986
- })).transcriptText : "";
3492
+ }) : void 0;
3493
+ const transcriptText = transcriptResult?.transcriptText ?? "";
3494
+ const resolvedLanguageCode = outputLanguageCode && outputLanguageCode !== "auto" ? outputLanguageCode : transcriptResult?.track?.language_code ?? getReadyTextTracks(assetData)[0]?.language_code;
3495
+ const languageName = resolvedLanguageCode ? getLanguageName(resolvedLanguageCode) : void 0;
2987
3496
  const userPrompt = buildUserPrompt4({
2988
3497
  tone,
2989
3498
  transcriptText,
@@ -2992,7 +3501,8 @@ async function getSummaryAndTags(assetId, options) {
2992
3501
  isAudioOnly,
2993
3502
  titleLength,
2994
3503
  descriptionLength,
2995
- tagCount
3504
+ tagCount,
3505
+ languageName
2996
3506
  });
2997
3507
  let analysisResponse;
2998
3508
  let imageUrl;
@@ -3062,164 +3572,6 @@ async function getSummaryAndTags(assetId, options) {
3062
3572
  };
3063
3573
  }
3064
3574
 
3065
- // src/lib/language-codes.ts
3066
- var ISO639_1_TO_3 = {
3067
- // Major world languages
3068
- en: "eng",
3069
- // English
3070
- es: "spa",
3071
- // Spanish
3072
- fr: "fra",
3073
- // French
3074
- de: "deu",
3075
- // German
3076
- it: "ita",
3077
- // Italian
3078
- pt: "por",
3079
- // Portuguese
3080
- ru: "rus",
3081
- // Russian
3082
- zh: "zho",
3083
- // Chinese
3084
- ja: "jpn",
3085
- // Japanese
3086
- ko: "kor",
3087
- // Korean
3088
- ar: "ara",
3089
- // Arabic
3090
- hi: "hin",
3091
- // Hindi
3092
- // European languages
3093
- nl: "nld",
3094
- // Dutch
3095
- pl: "pol",
3096
- // Polish
3097
- sv: "swe",
3098
- // Swedish
3099
- da: "dan",
3100
- // Danish
3101
- no: "nor",
3102
- // Norwegian
3103
- fi: "fin",
3104
- // Finnish
3105
- el: "ell",
3106
- // Greek
3107
- cs: "ces",
3108
- // Czech
3109
- hu: "hun",
3110
- // Hungarian
3111
- ro: "ron",
3112
- // Romanian
3113
- bg: "bul",
3114
- // Bulgarian
3115
- hr: "hrv",
3116
- // Croatian
3117
- sk: "slk",
3118
- // Slovak
3119
- sl: "slv",
3120
- // Slovenian
3121
- uk: "ukr",
3122
- // Ukrainian
3123
- tr: "tur",
3124
- // Turkish
3125
- // Asian languages
3126
- th: "tha",
3127
- // Thai
3128
- vi: "vie",
3129
- // Vietnamese
3130
- id: "ind",
3131
- // Indonesian
3132
- ms: "msa",
3133
- // Malay
3134
- tl: "tgl",
3135
- // Tagalog/Filipino
3136
- // Other languages
3137
- he: "heb",
3138
- // Hebrew
3139
- fa: "fas",
3140
- // Persian/Farsi
3141
- bn: "ben",
3142
- // Bengali
3143
- ta: "tam",
3144
- // Tamil
3145
- te: "tel",
3146
- // Telugu
3147
- mr: "mar",
3148
- // Marathi
3149
- gu: "guj",
3150
- // Gujarati
3151
- kn: "kan",
3152
- // Kannada
3153
- ml: "mal",
3154
- // Malayalam
3155
- pa: "pan",
3156
- // Punjabi
3157
- ur: "urd",
3158
- // Urdu
3159
- sw: "swa",
3160
- // Swahili
3161
- af: "afr",
3162
- // Afrikaans
3163
- ca: "cat",
3164
- // Catalan
3165
- eu: "eus",
3166
- // Basque
3167
- gl: "glg",
3168
- // Galician
3169
- is: "isl",
3170
- // Icelandic
3171
- et: "est",
3172
- // Estonian
3173
- lv: "lav",
3174
- // Latvian
3175
- lt: "lit"
3176
- // Lithuanian
3177
- };
3178
- var ISO639_3_TO_1 = Object.fromEntries(
3179
- Object.entries(ISO639_1_TO_3).map(([iso1, iso3]) => [iso3, iso1])
3180
- );
3181
- function toISO639_3(code) {
3182
- const normalized = code.toLowerCase().trim();
3183
- if (normalized.length === 3) {
3184
- return normalized;
3185
- }
3186
- return ISO639_1_TO_3[normalized] ?? normalized;
3187
- }
3188
- function toISO639_1(code) {
3189
- const normalized = code.toLowerCase().trim();
3190
- if (normalized.length === 2) {
3191
- return normalized;
3192
- }
3193
- return ISO639_3_TO_1[normalized] ?? normalized;
3194
- }
3195
- function getLanguageCodePair(code) {
3196
- const normalized = code.toLowerCase().trim();
3197
- if (normalized.length === 2) {
3198
- return {
3199
- iso639_1: normalized,
3200
- iso639_3: toISO639_3(normalized)
3201
- };
3202
- } else if (normalized.length === 3) {
3203
- return {
3204
- iso639_1: toISO639_1(normalized),
3205
- iso639_3: normalized
3206
- };
3207
- }
3208
- return {
3209
- iso639_1: normalized,
3210
- iso639_3: normalized
3211
- };
3212
- }
3213
- function getLanguageName(code) {
3214
- const iso639_1 = toISO639_1(code);
3215
- try {
3216
- const displayNames = new Intl.DisplayNames(["en"], { type: "language" });
3217
- return displayNames.of(iso639_1) ?? code.toUpperCase();
3218
- } catch {
3219
- return code.toUpperCase();
3220
- }
3221
- }
3222
-
3223
3575
  // src/lib/s3-sigv4.ts
3224
3576
  var AWS4_ALGORITHM = "AWS4-HMAC-SHA256";
3225
3577
  var AWS4_REQUEST_TERMINATOR = "aws4_request";
@@ -3876,12 +4228,187 @@ async function translateAudio(assetId, toLanguageCode, options = {}) {
3876
4228
  }
3877
4229
 
3878
4230
  // src/workflows/translate-captions.ts
3879
- import { generateText as generateText5, Output as Output5 } from "ai";
4231
+ import {
4232
+ APICallError,
4233
+ generateText as generateText5,
4234
+ NoObjectGeneratedError,
4235
+ Output as Output5,
4236
+ RetryError,
4237
+ TypeValidationError
4238
+ } from "ai";
4239
+ import dedent5 from "dedent";
3880
4240
  import { z as z6 } from "zod";
3881
4241
  var translationSchema = z6.object({
3882
4242
  translation: z6.string()
3883
4243
  });
3884
- var SYSTEM_PROMPT4 = 'You are a subtitle translation expert. Translate VTT subtitle files to the target language specified by the user. Preserve all timestamps and VTT formatting exactly as they appear. Return JSON with a single key "translation" containing the translated VTT content.';
4244
+ var SYSTEM_PROMPT4 = dedent5`
4245
+ You are a subtitle translation expert. Translate VTT subtitle files to the target language specified by the user.
4246
+ You may receive either a full VTT file or a chunk from a larger VTT.
4247
+ Preserve all timestamps, cue ordering, and VTT formatting exactly as they appear.
4248
+ Return JSON with a single key "translation" containing the translated VTT content.
4249
+ `;
4250
+ var CUE_TRANSLATION_SYSTEM_PROMPT = dedent5`
4251
+ You are a subtitle translation expert.
4252
+ You will receive a sequence of subtitle cues extracted from a VTT file.
4253
+ Translate the cues to the requested target language while preserving their original order.
4254
+ Treat the cue list as continuous context so the translation reads naturally across adjacent lines.
4255
+ Return JSON with a single key "translations" containing exactly one translated string for each input cue.
4256
+ Do not merge, split, omit, reorder, or add cues.
4257
+ `;
4258
+ var DEFAULT_TRANSLATION_CHUNKING = {
4259
+ enabled: true,
4260
+ minimumAssetDurationSeconds: 30 * 60,
4261
+ targetChunkDurationSeconds: 30 * 60,
4262
+ maxConcurrentTranslations: 4,
4263
+ maxCuesPerChunk: 80,
4264
+ maxCueTextTokensPerChunk: 2e3
4265
+ };
4266
+ var TOKEN_USAGE_FIELDS = [
4267
+ "inputTokens",
4268
+ "outputTokens",
4269
+ "totalTokens",
4270
+ "reasoningTokens",
4271
+ "cachedInputTokens"
4272
+ ];
4273
+ var TranslationChunkValidationError = class extends Error {
4274
+ constructor(message) {
4275
+ super(message);
4276
+ this.name = "TranslationChunkValidationError";
4277
+ }
4278
+ };
4279
+ function isTranslationChunkValidationError(error) {
4280
+ return error instanceof TranslationChunkValidationError;
4281
+ }
4282
+ function isProviderServiceError(error) {
4283
+ if (!error) {
4284
+ return false;
4285
+ }
4286
+ if (RetryError.isInstance(error)) {
4287
+ return isProviderServiceError(error.lastError);
4288
+ }
4289
+ if (APICallError.isInstance(error)) {
4290
+ return true;
4291
+ }
4292
+ if (error instanceof Error && "cause" in error) {
4293
+ return isProviderServiceError(error.cause);
4294
+ }
4295
+ return false;
4296
+ }
4297
+ function shouldSplitChunkTranslationError(error) {
4298
+ if (isProviderServiceError(error)) {
4299
+ return false;
4300
+ }
4301
+ return NoObjectGeneratedError.isInstance(error) || TypeValidationError.isInstance(error) || isTranslationChunkValidationError(error);
4302
+ }
4303
+ function isDefinedTokenUsageValue(value) {
4304
+ return typeof value === "number";
4305
+ }
4306
+ function resolveTranslationChunkingOptions(options) {
4307
+ const targetChunkDurationSeconds = Math.max(
4308
+ 1,
4309
+ options?.targetChunkDurationSeconds ?? DEFAULT_TRANSLATION_CHUNKING.targetChunkDurationSeconds
4310
+ );
4311
+ return {
4312
+ enabled: options?.enabled ?? DEFAULT_TRANSLATION_CHUNKING.enabled,
4313
+ minimumAssetDurationSeconds: Math.max(
4314
+ 1,
4315
+ options?.minimumAssetDurationSeconds ?? DEFAULT_TRANSLATION_CHUNKING.minimumAssetDurationSeconds
4316
+ ),
4317
+ targetChunkDurationSeconds,
4318
+ maxConcurrentTranslations: Math.max(
4319
+ 1,
4320
+ options?.maxConcurrentTranslations ?? DEFAULT_TRANSLATION_CHUNKING.maxConcurrentTranslations
4321
+ ),
4322
+ maxCuesPerChunk: Math.max(
4323
+ 1,
4324
+ options?.maxCuesPerChunk ?? DEFAULT_TRANSLATION_CHUNKING.maxCuesPerChunk
4325
+ ),
4326
+ maxCueTextTokensPerChunk: Math.max(
4327
+ 1,
4328
+ options?.maxCueTextTokensPerChunk ?? DEFAULT_TRANSLATION_CHUNKING.maxCueTextTokensPerChunk
4329
+ )
4330
+ };
4331
+ }
4332
+ function aggregateTokenUsage(usages) {
4333
+ return TOKEN_USAGE_FIELDS.reduce((aggregate, field) => {
4334
+ const values = usages.map((usage) => usage[field]).filter(isDefinedTokenUsageValue);
4335
+ if (values.length > 0) {
4336
+ aggregate[field] = values.reduce((total, value) => total + value, 0);
4337
+ }
4338
+ return aggregate;
4339
+ }, {});
4340
+ }
4341
+ function createTranslationChunkRequest(id, cues, cueBlocks) {
4342
+ return {
4343
+ id,
4344
+ cueCount: cues.length,
4345
+ startTime: cues[0].startTime,
4346
+ endTime: cues[cues.length - 1].endTime,
4347
+ cues,
4348
+ cueBlocks
4349
+ };
4350
+ }
4351
+ function splitTranslationChunkRequestByBudget(id, cues, cueBlocks, maxCuesPerChunk, maxCueTextTokensPerChunk) {
4352
+ const chunks = chunkVTTCuesByBudget(cues, {
4353
+ maxCuesPerChunk,
4354
+ maxTextTokensPerChunk: maxCueTextTokensPerChunk
4355
+ });
4356
+ return chunks.map(
4357
+ (chunk, index) => createTranslationChunkRequest(
4358
+ chunks.length === 1 ? id : `${id}-part-${index}`,
4359
+ cues.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1),
4360
+ cueBlocks.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1)
4361
+ )
4362
+ );
4363
+ }
4364
+ function buildTranslationChunkRequests(vttContent, assetDurationSeconds, chunkingOptions) {
4365
+ const resolvedChunking = resolveTranslationChunkingOptions(chunkingOptions);
4366
+ const cues = parseVTTCues(vttContent);
4367
+ if (cues.length === 0) {
4368
+ return null;
4369
+ }
4370
+ const { preamble, cueBlocks } = splitVttPreambleAndCueBlocks(vttContent);
4371
+ if (cueBlocks.length !== cues.length) {
4372
+ console.warn(
4373
+ `Falling back to full-VTT caption translation because cue block count (${cueBlocks.length}) does not match parsed cue count (${cues.length}).`
4374
+ );
4375
+ return null;
4376
+ }
4377
+ if (!resolvedChunking.enabled) {
4378
+ return {
4379
+ preamble,
4380
+ chunks: [
4381
+ createTranslationChunkRequest("chunk-0", cues, cueBlocks)
4382
+ ]
4383
+ };
4384
+ }
4385
+ if (typeof assetDurationSeconds !== "number" || assetDurationSeconds < resolvedChunking.minimumAssetDurationSeconds) {
4386
+ return {
4387
+ preamble,
4388
+ chunks: [
4389
+ createTranslationChunkRequest("chunk-0", cues, cueBlocks)
4390
+ ]
4391
+ };
4392
+ }
4393
+ const targetChunkDurationSeconds = resolvedChunking.targetChunkDurationSeconds;
4394
+ const durationChunks = chunkVTTCuesByDuration(cues, {
4395
+ targetChunkDurationSeconds,
4396
+ maxChunkDurationSeconds: Math.max(targetChunkDurationSeconds, Math.round(targetChunkDurationSeconds * (7 / 6))),
4397
+ minChunkDurationSeconds: Math.max(1, Math.round(targetChunkDurationSeconds * (2 / 3)))
4398
+ });
4399
+ return {
4400
+ preamble,
4401
+ chunks: durationChunks.flatMap(
4402
+ (chunk) => splitTranslationChunkRequestByBudget(
4403
+ chunk.id,
4404
+ cues.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1),
4405
+ cueBlocks.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1),
4406
+ resolvedChunking.maxCuesPerChunk,
4407
+ resolvedChunking.maxCueTextTokensPerChunk
4408
+ )
4409
+ )
4410
+ };
4411
+ }
3885
4412
  async function fetchVttFromMux(vttUrl) {
3886
4413
  "use step";
3887
4414
  const vttResponse = await fetch(vttUrl);
@@ -3927,6 +4454,176 @@ ${vttContent}`
3927
4454
  }
3928
4455
  };
3929
4456
  }
4457
+ async function translateCueChunkWithAI({
4458
+ cues,
4459
+ fromLanguageCode,
4460
+ toLanguageCode,
4461
+ provider,
4462
+ modelId,
4463
+ credentials
4464
+ }) {
4465
+ "use step";
4466
+ const model = await createLanguageModelFromConfig(provider, modelId, credentials);
4467
+ const schema = z6.object({
4468
+ translations: z6.array(z6.string().min(1)).length(cues.length)
4469
+ });
4470
+ const cuePayload = cues.map((cue, index) => ({
4471
+ index,
4472
+ startTime: cue.startTime,
4473
+ endTime: cue.endTime,
4474
+ text: cue.text
4475
+ }));
4476
+ const response = await generateText5({
4477
+ model,
4478
+ output: Output5.object({ schema }),
4479
+ messages: [
4480
+ {
4481
+ role: "system",
4482
+ content: CUE_TRANSLATION_SYSTEM_PROMPT
4483
+ },
4484
+ {
4485
+ role: "user",
4486
+ content: `Translate from ${fromLanguageCode} to ${toLanguageCode}.
4487
+ Return exactly ${cues.length} translated cues in the same order as the input.
4488
+
4489
+ ${JSON.stringify(cuePayload, null, 2)}`
4490
+ }
4491
+ ]
4492
+ });
4493
+ return {
4494
+ translations: response.output.translations,
4495
+ usage: {
4496
+ inputTokens: response.usage.inputTokens,
4497
+ outputTokens: response.usage.outputTokens,
4498
+ totalTokens: response.usage.totalTokens,
4499
+ reasoningTokens: response.usage.reasoningTokens,
4500
+ cachedInputTokens: response.usage.cachedInputTokens
4501
+ }
4502
+ };
4503
+ }
4504
+ function splitTranslationChunkAtMidpoint(chunk) {
4505
+ const midpoint = Math.floor(chunk.cueCount / 2);
4506
+ if (midpoint <= 0 || midpoint >= chunk.cueCount) {
4507
+ throw new Error(`Cannot split chunk ${chunk.id} with cueCount=${chunk.cueCount}`);
4508
+ }
4509
+ return [
4510
+ createTranslationChunkRequest(
4511
+ `${chunk.id}-a`,
4512
+ chunk.cues.slice(0, midpoint),
4513
+ chunk.cueBlocks.slice(0, midpoint)
4514
+ ),
4515
+ createTranslationChunkRequest(
4516
+ `${chunk.id}-b`,
4517
+ chunk.cues.slice(midpoint),
4518
+ chunk.cueBlocks.slice(midpoint)
4519
+ )
4520
+ ];
4521
+ }
4522
+ async function translateChunkWithFallback({
4523
+ chunk,
4524
+ fromLanguageCode,
4525
+ toLanguageCode,
4526
+ provider,
4527
+ modelId,
4528
+ credentials
4529
+ }) {
4530
+ "use step";
4531
+ try {
4532
+ const result = await translateCueChunkWithAI({
4533
+ cues: chunk.cues,
4534
+ fromLanguageCode,
4535
+ toLanguageCode,
4536
+ provider,
4537
+ modelId,
4538
+ credentials
4539
+ });
4540
+ if (result.translations.length !== chunk.cueCount) {
4541
+ throw new TranslationChunkValidationError(
4542
+ `Chunk ${chunk.id} returned ${result.translations.length} cues, expected ${chunk.cueCount} for ${Math.round(chunk.startTime)}s-${Math.round(chunk.endTime)}s`
4543
+ );
4544
+ }
4545
+ return {
4546
+ translatedVtt: buildVttFromTranslatedCueBlocks(chunk.cueBlocks, result.translations),
4547
+ usage: result.usage
4548
+ };
4549
+ } catch (error) {
4550
+ if (!shouldSplitChunkTranslationError(error) || chunk.cueCount <= 1) {
4551
+ throw new Error(
4552
+ `Chunk ${chunk.id} failed for ${Math.round(chunk.startTime)}s-${Math.round(chunk.endTime)}s: ${error instanceof Error ? error.message : "Unknown error"}`
4553
+ );
4554
+ }
4555
+ const [leftChunk, rightChunk] = splitTranslationChunkAtMidpoint(chunk);
4556
+ const [leftResult, rightResult] = await Promise.all([
4557
+ translateChunkWithFallback({
4558
+ chunk: leftChunk,
4559
+ fromLanguageCode,
4560
+ toLanguageCode,
4561
+ provider,
4562
+ modelId,
4563
+ credentials
4564
+ }),
4565
+ translateChunkWithFallback({
4566
+ chunk: rightChunk,
4567
+ fromLanguageCode,
4568
+ toLanguageCode,
4569
+ provider,
4570
+ modelId,
4571
+ credentials
4572
+ })
4573
+ ]);
4574
+ return {
4575
+ translatedVtt: concatenateVttSegments([leftResult.translatedVtt, rightResult.translatedVtt]),
4576
+ usage: aggregateTokenUsage([leftResult.usage, rightResult.usage])
4577
+ };
4578
+ }
4579
+ }
4580
+ async function translateCaptionTrack({
4581
+ vttContent,
4582
+ assetDurationSeconds,
4583
+ fromLanguageCode,
4584
+ toLanguageCode,
4585
+ provider,
4586
+ modelId,
4587
+ credentials,
4588
+ chunking
4589
+ }) {
4590
+ "use step";
4591
+ const chunkPlan = buildTranslationChunkRequests(vttContent, assetDurationSeconds, chunking);
4592
+ if (!chunkPlan) {
4593
+ return translateVttWithAI({
4594
+ vttContent,
4595
+ fromLanguageCode,
4596
+ toLanguageCode,
4597
+ provider,
4598
+ modelId,
4599
+ credentials
4600
+ });
4601
+ }
4602
+ const resolvedChunking = resolveTranslationChunkingOptions(chunking);
4603
+ const translatedSegments = [];
4604
+ const usageByChunk = [];
4605
+ for (let index = 0; index < chunkPlan.chunks.length; index += resolvedChunking.maxConcurrentTranslations) {
4606
+ const batch = chunkPlan.chunks.slice(index, index + resolvedChunking.maxConcurrentTranslations);
4607
+ const batchResults = await Promise.all(
4608
+ batch.map(
4609
+ (chunk) => translateChunkWithFallback({
4610
+ chunk,
4611
+ fromLanguageCode,
4612
+ toLanguageCode,
4613
+ provider,
4614
+ modelId,
4615
+ credentials
4616
+ })
4617
+ )
4618
+ );
4619
+ translatedSegments.push(...batchResults.map((result) => result.translatedVtt));
4620
+ usageByChunk.push(...batchResults.map((result) => result.usage));
4621
+ }
4622
+ return {
4623
+ translatedVtt: concatenateVttSegments(translatedSegments, chunkPlan.preamble),
4624
+ usage: aggregateTokenUsage(usageByChunk)
4625
+ };
4626
+ }
3930
4627
  async function uploadVttToS3({
3931
4628
  translatedVtt,
3932
4629
  assetId,
@@ -3987,7 +4684,8 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
3987
4684
  s3Bucket: providedS3Bucket,
3988
4685
  uploadToMux: uploadToMuxOption,
3989
4686
  storageAdapter,
3990
- credentials: providedCredentials
4687
+ credentials: providedCredentials,
4688
+ chunking
3991
4689
  } = options;
3992
4690
  const credentials = providedCredentials;
3993
4691
  const effectiveStorageAdapter = storageAdapter;
@@ -4048,13 +4746,15 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
4048
4746
  let translatedVtt;
4049
4747
  let usage;
4050
4748
  try {
4051
- const result = await translateVttWithAI({
4749
+ const result = await translateCaptionTrack({
4052
4750
  vttContent,
4751
+ assetDurationSeconds,
4053
4752
  fromLanguageCode,
4054
4753
  toLanguageCode,
4055
4754
  provider: modelConfig.provider,
4056
4755
  modelId: modelConfig.modelId,
4057
- credentials
4756
+ credentials,
4757
+ chunking
4058
4758
  });
4059
4759
  translatedVtt = result.translatedVtt;
4060
4760
  usage = result.usage;
@@ -4127,6 +4827,7 @@ export {
4127
4827
  HIVE_SEXUAL_CATEGORIES,
4128
4828
  HIVE_VIOLENCE_CATEGORIES,
4129
4829
  SUMMARY_KEYWORD_LIMIT,
4830
+ aggregateTokenUsage,
4130
4831
  askQuestions,
4131
4832
  burnedInCaptionsSchema,
4132
4833
  chapterSchema,
@@ -4138,6 +4839,7 @@ export {
4138
4839
  getSummaryAndTags,
4139
4840
  hasBurnedInCaptions,
4140
4841
  questionAnswerSchema,
4842
+ shouldSplitChunkTranslationError,
4141
4843
  summarySchema,
4142
4844
  translateAudio,
4143
4845
  translateCaptions,