@mux/ai 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -987,24 +987,82 @@ function findCaptionTrack(asset, languageCode) {
987
987
  (track) => track.text_type === "subtitles" && track.language_code === languageCode
988
988
  );
989
989
  }
990
+ function normalizeLineEndings(value) {
991
+ return value.replace(/\r\n/g, "\n");
992
+ }
993
+ function isTimingLine(line) {
994
+ return line.includes("-->");
995
+ }
996
+ function parseNumericCueIdentifier(line) {
997
+ if (!/^\d+$/.test(line)) {
998
+ return null;
999
+ }
1000
+ return Number.parseInt(line, 10);
1001
+ }
1002
+ function isLikelyTitledCueIdentifier(line) {
1003
+ return /^\d+\s+-\s+\S.*$/.test(line);
1004
+ }
1005
+ function isLikelyCueIdentifier({
1006
+ line,
1007
+ nextLine,
1008
+ previousCueIdentifier
1009
+ }) {
1010
+ if (!line || !nextLine || !isTimingLine(nextLine)) {
1011
+ return false;
1012
+ }
1013
+ const numericIdentifier = parseNumericCueIdentifier(line);
1014
+ if (numericIdentifier !== null) {
1015
+ if (previousCueIdentifier === null || previousCueIdentifier === void 0) {
1016
+ return numericIdentifier === 1;
1017
+ }
1018
+ return numericIdentifier === previousCueIdentifier + 1;
1019
+ }
1020
+ return isLikelyTitledCueIdentifier(line);
1021
+ }
1022
+ function getCueIdentifierLineIndex(lines, timingLineIndex, previousCueIdentifier) {
1023
+ const identifierIndex = timingLineIndex - 1;
1024
+ if (identifierIndex < 0) {
1025
+ return -1;
1026
+ }
1027
+ const candidate = lines[identifierIndex].trim();
1028
+ if (!candidate || isTimingLine(candidate)) {
1029
+ return -1;
1030
+ }
1031
+ return isLikelyCueIdentifier({
1032
+ line: candidate,
1033
+ nextLine: lines[timingLineIndex]?.trim(),
1034
+ previousCueIdentifier
1035
+ }) ? identifierIndex : -1;
1036
+ }
990
1037
  function extractTextFromVTT(vttContent) {
991
1038
  if (!vttContent.trim()) {
992
1039
  return "";
993
1040
  }
994
1041
  const lines = vttContent.split("\n");
995
1042
  const textLines = [];
1043
+ let previousCueIdentifier = null;
1044
+ let isInsideNoteBlock = false;
996
1045
  for (let i = 0; i < lines.length; i++) {
997
1046
  const line = lines[i].trim();
998
- if (!line)
1047
+ const nextLine = lines[i + 1]?.trim();
1048
+ if (!line) {
1049
+ isInsideNoteBlock = false;
1050
+ continue;
1051
+ }
1052
+ if (isInsideNoteBlock)
999
1053
  continue;
1000
1054
  if (line === "WEBVTT")
1001
1055
  continue;
1002
- if (line.startsWith("NOTE "))
1056
+ if (line === "NOTE" || line.startsWith("NOTE ")) {
1057
+ isInsideNoteBlock = true;
1003
1058
  continue;
1004
- if (line.includes("-->"))
1059
+ }
1060
+ if (isTimingLine(line))
1005
1061
  continue;
1006
- if (/^[\w-]+$/.test(line) && !line.includes(" "))
1062
+ if (isLikelyCueIdentifier({ line, nextLine, previousCueIdentifier })) {
1063
+ previousCueIdentifier = parseNumericCueIdentifier(line);
1007
1064
  continue;
1065
+ }
1008
1066
  if (line.startsWith("STYLE") || line.startsWith("REGION"))
1009
1067
  continue;
1010
1068
  const cleanLine = line.replace(/<[^>]*>/g, "").trim();
@@ -1053,20 +1111,34 @@ function parseVTTCues(vttContent) {
1053
1111
  return [];
1054
1112
  const lines = vttContent.split("\n");
1055
1113
  const cues = [];
1114
+ let previousCueIdentifier = null;
1056
1115
  for (let i = 0; i < lines.length; i++) {
1057
1116
  const line = lines[i].trim();
1058
- if (line.includes("-->")) {
1117
+ if (isTimingLine(line)) {
1059
1118
  const [startStr, endStr] = line.split(" --> ").map((s) => s.trim());
1060
1119
  const startTime = vttTimestampToSeconds(startStr);
1061
1120
  const endTime = vttTimestampToSeconds(endStr.split(" ")[0]);
1062
- const textLines = [];
1121
+ const currentCueIdentifierLine = lines[i - 1]?.trim() ?? "";
1122
+ const currentCueIdentifier = isLikelyCueIdentifier({
1123
+ line: currentCueIdentifierLine,
1124
+ nextLine: line,
1125
+ previousCueIdentifier
1126
+ }) ? parseNumericCueIdentifier(currentCueIdentifierLine) : null;
1127
+ const rawTextLines = [];
1063
1128
  let j = i + 1;
1064
- while (j < lines.length && lines[j].trim() && !lines[j].includes("-->")) {
1065
- const cleanLine = lines[j].trim().replace(/<[^>]*>/g, "");
1066
- if (cleanLine)
1067
- textLines.push(cleanLine);
1129
+ while (j < lines.length && lines[j].trim() && !isTimingLine(lines[j].trim())) {
1130
+ rawTextLines.push(lines[j].trim());
1068
1131
  j++;
1069
1132
  }
1133
+ const trailingNumericLine = parseNumericCueIdentifier(rawTextLines.at(-1) ?? "");
1134
+ if (trailingNumericLine !== null && isLikelyCueIdentifier({
1135
+ line: rawTextLines.at(-1) ?? "",
1136
+ nextLine: lines[j]?.trim(),
1137
+ previousCueIdentifier: currentCueIdentifier
1138
+ }) && rawTextLines.length > 1) {
1139
+ rawTextLines.pop();
1140
+ }
1141
+ const textLines = rawTextLines.map((textLine) => textLine.replace(/<[^>]*>/g, "")).filter(Boolean);
1070
1142
  if (textLines.length > 0) {
1071
1143
  cues.push({
1072
1144
  startTime,
@@ -1074,10 +1146,102 @@ function parseVTTCues(vttContent) {
1074
1146
  text: textLines.join(" ")
1075
1147
  });
1076
1148
  }
1149
+ previousCueIdentifier = currentCueIdentifier;
1077
1150
  }
1078
1151
  }
1079
1152
  return cues;
1080
1153
  }
1154
+ function splitVttPreambleAndCueBlocks(vttContent) {
1155
+ const normalizedContent = normalizeLineEndings(vttContent).trim();
1156
+ if (!normalizedContent) {
1157
+ return {
1158
+ preamble: "WEBVTT",
1159
+ cueBlocks: []
1160
+ };
1161
+ }
1162
+ const rawBlocks = normalizedContent.split(/\n{2,}/).map((block) => block.trim()).filter(Boolean);
1163
+ const cueBlockStartIndex = rawBlocks.findIndex((block) => block.includes("-->"));
1164
+ if (cueBlockStartIndex === -1) {
1165
+ return {
1166
+ preamble: normalizedContent.startsWith("WEBVTT") ? normalizedContent : `WEBVTT
1167
+
1168
+ ${normalizedContent}`,
1169
+ cueBlocks: []
1170
+ };
1171
+ }
1172
+ const hasMergedCueBlocks = rawBlocks.slice(cueBlockStartIndex).some((block) => (block.match(/-->/g) ?? []).length > 1);
1173
+ if (hasMergedCueBlocks) {
1174
+ const lines = normalizedContent.split("\n");
1175
+ const timingLineIndices = lines.map((line, index) => isTimingLine(line.trim()) ? index : -1).filter((index) => index >= 0);
1176
+ let previousCueIdentifier = null;
1177
+ const firstCueStartIndex = getCueIdentifierLineIndex(lines, timingLineIndices[0], previousCueIdentifier);
1178
+ const preambleEndIndex = firstCueStartIndex >= 0 ? firstCueStartIndex : timingLineIndices[0];
1179
+ const preamble2 = lines.slice(0, preambleEndIndex).join("\n").trim() || "WEBVTT";
1180
+ const cueBlocks2 = timingLineIndices.map((timingLineIndex, index) => {
1181
+ const cueIdentifierLineIndex = getCueIdentifierLineIndex(lines, timingLineIndex, previousCueIdentifier);
1182
+ const cueStartIndex = cueIdentifierLineIndex >= 0 ? cueIdentifierLineIndex : timingLineIndex;
1183
+ const currentCueIdentifier = cueIdentifierLineIndex >= 0 ? parseNumericCueIdentifier(lines[cueIdentifierLineIndex].trim()) : null;
1184
+ const nextTimingLineIndex = timingLineIndices[index + 1] ?? lines.length;
1185
+ let cueEndIndex = nextTimingLineIndex - 1;
1186
+ while (cueEndIndex > timingLineIndex && !lines[cueEndIndex].trim()) {
1187
+ cueEndIndex--;
1188
+ }
1189
+ const nextCueIdentifierLineIndex = index < timingLineIndices.length - 1 ? getCueIdentifierLineIndex(lines, nextTimingLineIndex, currentCueIdentifier) : -1;
1190
+ if (nextCueIdentifierLineIndex === cueEndIndex) {
1191
+ cueEndIndex--;
1192
+ }
1193
+ while (cueEndIndex > timingLineIndex && !lines[cueEndIndex].trim()) {
1194
+ cueEndIndex--;
1195
+ }
1196
+ previousCueIdentifier = currentCueIdentifier;
1197
+ return lines.slice(cueStartIndex, cueEndIndex + 1).join("\n").trim();
1198
+ });
1199
+ return {
1200
+ preamble: preamble2,
1201
+ cueBlocks: cueBlocks2
1202
+ };
1203
+ }
1204
+ const preambleBlocks = rawBlocks.slice(0, cueBlockStartIndex);
1205
+ const cueBlocks = rawBlocks.slice(cueBlockStartIndex);
1206
+ const preamble = preambleBlocks.length > 0 ? preambleBlocks.join("\n\n") : "WEBVTT";
1207
+ return {
1208
+ preamble,
1209
+ cueBlocks
1210
+ };
1211
+ }
1212
+ function buildVttFromCueBlocks(cueBlocks, preamble = "WEBVTT") {
1213
+ if (cueBlocks.length === 0) {
1214
+ return `${preamble.trim()}
1215
+ `;
1216
+ }
1217
+ return `${preamble.trim()}
1218
+
1219
+ ${cueBlocks.map((block) => block.trim()).join("\n\n")}
1220
+ `;
1221
+ }
1222
+ function replaceCueText(cueBlock, translatedText) {
1223
+ const lines = normalizeLineEndings(cueBlock).split("\n").map((line) => line.trim()).filter(Boolean);
1224
+ const timingLineIndex = lines.findIndex((line) => line.includes("-->"));
1225
+ if (timingLineIndex === -1) {
1226
+ throw new Error("Cue block is missing a timestamp line");
1227
+ }
1228
+ const headerLines = lines.slice(0, timingLineIndex + 1);
1229
+ const translatedLines = normalizeLineEndings(translatedText).split("\n").map((line) => line.trim()).filter(Boolean);
1230
+ return [...headerLines, ...translatedLines].join("\n");
1231
+ }
1232
+ function buildVttFromTranslatedCueBlocks(cueBlocks, translatedTexts, preamble = "WEBVTT") {
1233
+ if (cueBlocks.length !== translatedTexts.length) {
1234
+ throw new Error(`Expected ${cueBlocks.length} translated cues, received ${translatedTexts.length}`);
1235
+ }
1236
+ return buildVttFromCueBlocks(
1237
+ cueBlocks.map((cueBlock, index) => replaceCueText(cueBlock, translatedTexts[index])),
1238
+ preamble
1239
+ );
1240
+ }
1241
+ function concatenateVttSegments(segments, preamble = "WEBVTT") {
1242
+ const cueBlocks = segments.flatMap((segment) => splitVttPreambleAndCueBlocks(segment).cueBlocks);
1243
+ return buildVttFromCueBlocks(cueBlocks, preamble);
1244
+ }
1081
1245
  async function buildTranscriptUrl(playbackId, trackId, shouldSign = false, credentials) {
1082
1246
  "use step";
1083
1247
  const baseUrl = `https://stream.mux.com/${playbackId}/text/${trackId}.vtt`;
@@ -2016,6 +2180,14 @@ async function generateChapters(assetId, languageCode, options = {}) {
2016
2180
  import { embed } from "ai";
2017
2181
 
2018
2182
  // src/primitives/text-chunking.ts
2183
+ var DEFAULT_MIN_CHUNK_DURATION_RATIO = 2 / 3;
2184
+ var DEFAULT_BOUNDARY_LOOKAHEAD_CUES = 12;
2185
+ var DEFAULT_BOUNDARY_PAUSE_SECONDS = 1.25;
2186
+ var STRONG_BOUNDARY_SCORE = 4;
2187
+ var PREFERRED_BOUNDARY_WINDOW_SECONDS = 5 * 60;
2188
+ var SENTENCE_BOUNDARY_REGEX = /[.!?]["')\]]*$/;
2189
+ var CLAUSE_BOUNDARY_REGEX = /[,;:]["')\]]*$/;
2190
+ var NEXT_SENTENCE_START_REGEX = /^[A-Z0-9"'([{]/;
2019
2191
  function estimateTokenCount(text) {
2020
2192
  const words = text.trim().split(/\s+/).length;
2021
2193
  return Math.ceil(words / 0.75);
@@ -2088,6 +2260,151 @@ function chunkVTTCues(cues, maxTokens, overlapCues = 2) {
2088
2260
  }
2089
2261
  return chunks;
2090
2262
  }
2263
+ function scoreCueBoundary(cues, index, boundaryPauseSeconds) {
2264
+ const cue = cues[index];
2265
+ const nextCue = cues[index + 1];
2266
+ if (!nextCue) {
2267
+ return Number.POSITIVE_INFINITY;
2268
+ }
2269
+ const trimmedText = cue.text.trim();
2270
+ let score = 0;
2271
+ if (SENTENCE_BOUNDARY_REGEX.test(trimmedText)) {
2272
+ score += 4;
2273
+ } else if (CLAUSE_BOUNDARY_REGEX.test(trimmedText)) {
2274
+ score += 2;
2275
+ }
2276
+ if (nextCue.startTime - cue.endTime >= boundaryPauseSeconds) {
2277
+ score += 2;
2278
+ }
2279
+ if (NEXT_SENTENCE_START_REGEX.test(nextCue.text.trim())) {
2280
+ score += 1;
2281
+ }
2282
+ return score;
2283
+ }
2284
+ function chunkVTTCuesByBudget(cues, options) {
2285
+ if (cues.length === 0) {
2286
+ return [];
2287
+ }
2288
+ const maxCuesPerChunk = Math.max(1, options.maxCuesPerChunk);
2289
+ let maxTextTokensPerChunk = Number.POSITIVE_INFINITY;
2290
+ if (options.maxTextTokensPerChunk) {
2291
+ maxTextTokensPerChunk = Math.max(1, options.maxTextTokensPerChunk);
2292
+ }
2293
+ const chunks = [];
2294
+ let chunkIndex = 0;
2295
+ let cueStartIndex = 0;
2296
+ let currentTokenCount = 0;
2297
+ for (let cueIndex = 0; cueIndex < cues.length; cueIndex++) {
2298
+ const cue = cues[cueIndex];
2299
+ const cueTokenCount = estimateTokenCount(cue.text);
2300
+ const currentCueCount = cueIndex - cueStartIndex;
2301
+ const wouldExceedCueCount = currentCueCount >= maxCuesPerChunk;
2302
+ const wouldExceedTokenCount = currentCueCount > 0 && currentTokenCount + cueTokenCount > maxTextTokensPerChunk;
2303
+ if (wouldExceedCueCount || wouldExceedTokenCount) {
2304
+ chunks.push({
2305
+ id: `chunk-${chunkIndex}`,
2306
+ cueStartIndex,
2307
+ cueEndIndex: cueIndex - 1,
2308
+ cueCount: cueIndex - cueStartIndex,
2309
+ startTime: cues[cueStartIndex].startTime,
2310
+ endTime: cues[cueIndex - 1].endTime
2311
+ });
2312
+ cueStartIndex = cueIndex;
2313
+ currentTokenCount = 0;
2314
+ chunkIndex++;
2315
+ }
2316
+ currentTokenCount += cueTokenCount;
2317
+ }
2318
+ chunks.push({
2319
+ id: `chunk-${chunkIndex}`,
2320
+ cueStartIndex,
2321
+ cueEndIndex: cues.length - 1,
2322
+ cueCount: cues.length - cueStartIndex,
2323
+ startTime: cues[cueStartIndex].startTime,
2324
+ endTime: cues[cues.length - 1].endTime
2325
+ });
2326
+ return chunks;
2327
+ }
2328
+ function chunkVTTCuesByDuration(cues, options) {
2329
+ if (cues.length === 0) {
2330
+ return [];
2331
+ }
2332
+ const targetChunkDurationSeconds = Math.max(1, options.targetChunkDurationSeconds);
2333
+ const maxChunkDurationSeconds = Math.max(targetChunkDurationSeconds, options.maxChunkDurationSeconds);
2334
+ const minChunkDurationSeconds = Math.min(
2335
+ targetChunkDurationSeconds,
2336
+ Math.max(
2337
+ 1,
2338
+ options.minChunkDurationSeconds ?? Math.floor(targetChunkDurationSeconds * DEFAULT_MIN_CHUNK_DURATION_RATIO)
2339
+ )
2340
+ );
2341
+ const boundaryLookaheadCues = Math.max(1, options.boundaryLookaheadCues ?? DEFAULT_BOUNDARY_LOOKAHEAD_CUES);
2342
+ const boundaryPauseSeconds = options.boundaryPauseSeconds ?? DEFAULT_BOUNDARY_PAUSE_SECONDS;
2343
+ const preferredBoundaryStartSeconds = Math.max(
2344
+ minChunkDurationSeconds,
2345
+ targetChunkDurationSeconds - Math.min(PREFERRED_BOUNDARY_WINDOW_SECONDS, targetChunkDurationSeconds / 6)
2346
+ );
2347
+ const chunks = [];
2348
+ let chunkIndex = 0;
2349
+ let cueStartIndex = 0;
2350
+ while (cueStartIndex < cues.length) {
2351
+ const chunkStartTime = cues[cueStartIndex].startTime;
2352
+ let cueEndIndex = cueStartIndex;
2353
+ let bestBoundaryIndex = -1;
2354
+ let bestBoundaryScore = -1;
2355
+ let bestPreferredBoundaryIndex = -1;
2356
+ let bestPreferredBoundaryScore = -1;
2357
+ while (cueEndIndex < cues.length) {
2358
+ const cue = cues[cueEndIndex];
2359
+ const currentDuration = cue.endTime - chunkStartTime;
2360
+ if (currentDuration >= minChunkDurationSeconds) {
2361
+ const boundaryScore = scoreCueBoundary(cues, cueEndIndex, boundaryPauseSeconds);
2362
+ if (boundaryScore >= bestBoundaryScore) {
2363
+ bestBoundaryIndex = cueEndIndex;
2364
+ bestBoundaryScore = boundaryScore;
2365
+ }
2366
+ if (currentDuration >= preferredBoundaryStartSeconds && boundaryScore >= bestPreferredBoundaryScore) {
2367
+ bestPreferredBoundaryIndex = cueEndIndex;
2368
+ bestPreferredBoundaryScore = boundaryScore;
2369
+ }
2370
+ }
2371
+ const nextCue = cues[cueEndIndex + 1];
2372
+ if (!nextCue) {
2373
+ break;
2374
+ }
2375
+ const nextDuration = nextCue.endTime - chunkStartTime;
2376
+ const lookaheadExceeded = cueEndIndex - cueStartIndex >= boundaryLookaheadCues;
2377
+ const preferredBoundaryIndex = bestPreferredBoundaryIndex >= cueStartIndex ? bestPreferredBoundaryIndex : bestBoundaryIndex;
2378
+ const preferredBoundaryScore = bestPreferredBoundaryIndex >= cueStartIndex ? bestPreferredBoundaryScore : bestBoundaryScore;
2379
+ if (currentDuration >= targetChunkDurationSeconds) {
2380
+ if (preferredBoundaryIndex >= cueStartIndex && preferredBoundaryScore >= STRONG_BOUNDARY_SCORE) {
2381
+ cueEndIndex = preferredBoundaryIndex;
2382
+ break;
2383
+ }
2384
+ if (nextDuration > maxChunkDurationSeconds || lookaheadExceeded) {
2385
+ cueEndIndex = preferredBoundaryIndex >= cueStartIndex ? preferredBoundaryIndex : cueEndIndex;
2386
+ break;
2387
+ }
2388
+ }
2389
+ if (nextDuration > maxChunkDurationSeconds) {
2390
+ cueEndIndex = preferredBoundaryIndex >= cueStartIndex ? preferredBoundaryIndex : cueEndIndex;
2391
+ break;
2392
+ }
2393
+ cueEndIndex++;
2394
+ }
2395
+ chunks.push({
2396
+ id: `chunk-${chunkIndex}`,
2397
+ cueStartIndex,
2398
+ cueEndIndex,
2399
+ cueCount: cueEndIndex - cueStartIndex + 1,
2400
+ startTime: cues[cueStartIndex].startTime,
2401
+ endTime: cues[cueEndIndex].endTime
2402
+ });
2403
+ cueStartIndex = cueEndIndex + 1;
2404
+ chunkIndex++;
2405
+ }
2406
+ return chunks;
2407
+ }
2091
2408
  function chunkText(text, strategy) {
2092
2409
  switch (strategy.type) {
2093
2410
  case "token": {
@@ -2343,10 +2660,8 @@ async function getThumbnailUrls(playbackId, duration, options = {}) {
2343
2660
  }
2344
2661
  const baseUrl = getMuxThumbnailBaseUrl(playbackId);
2345
2662
  const urlPromises = timestamps.map(async (time) => {
2346
- if (shouldSign) {
2347
- return signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials);
2348
- }
2349
- return `${baseUrl}?time=${time}&width=${width}`;
2663
+ const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
2664
+ return { url, time };
2350
2665
  });
2351
2666
  return Promise.all(urlPromises);
2352
2667
  }
@@ -2420,6 +2735,7 @@ async function moderateImageWithOpenAI(entry) {
2420
2735
  const categoryScores = json.results?.[0]?.category_scores || {};
2421
2736
  return {
2422
2737
  url: entry.url,
2738
+ time: entry.time,
2423
2739
  sexual: categoryScores.sexual || 0,
2424
2740
  violence: categoryScores.violence || 0,
2425
2741
  error: false
@@ -2428,6 +2744,7 @@ async function moderateImageWithOpenAI(entry) {
2428
2744
  console.error("OpenAI moderation failed:", error);
2429
2745
  return {
2430
2746
  url: entry.url,
2747
+ time: entry.time,
2431
2748
  sexual: 0,
2432
2749
  violence: 0,
2433
2750
  error: true,
@@ -2435,11 +2752,13 @@ async function moderateImageWithOpenAI(entry) {
2435
2752
  };
2436
2753
  }
2437
2754
  }
2438
- async function requestOpenAIModeration(imageUrls, model, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
2755
+ async function requestOpenAIModeration(images, model, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
2439
2756
  "use step";
2757
+ const imageUrls = images.map((img) => img.url);
2758
+ const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
2440
2759
  const targetUrls = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map(
2441
- (img) => ({ url: img.url, image: img.base64Data, model, credentials })
2442
- ) : imageUrls.map((url) => ({ url, image: url, model, credentials }));
2760
+ (img) => ({ url: img.url, time: timeByUrl.get(img.url), image: img.base64Data, model, credentials })
2761
+ ) : images.map((img) => ({ url: img.url, time: img.time, image: img.url, model, credentials }));
2443
2762
  return processConcurrently(targetUrls, moderateImageWithOpenAI, maxConcurrent);
2444
2763
  }
2445
2764
  async function requestOpenAITextModeration(text, model, url, credentials) {
@@ -2584,6 +2903,7 @@ async function moderateImageWithHive(entry) {
2584
2903
  const violence = getHiveCategoryScores(classes, HIVE_VIOLENCE_CATEGORIES);
2585
2904
  return {
2586
2905
  url: entry.url,
2906
+ time: entry.time,
2587
2907
  sexual,
2588
2908
  violence,
2589
2909
  error: false
@@ -2591,6 +2911,7 @@ async function moderateImageWithHive(entry) {
2591
2911
  } catch (error) {
2592
2912
  return {
2593
2913
  url: entry.url,
2914
+ time: entry.time,
2594
2915
  sexual: 0,
2595
2916
  violence: 0,
2596
2917
  error: true,
@@ -2598,19 +2919,23 @@ async function moderateImageWithHive(entry) {
2598
2919
  };
2599
2920
  }
2600
2921
  }
2601
- async function requestHiveModeration(imageUrls, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
2922
+ async function requestHiveModeration(images, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
2602
2923
  "use step";
2924
+ const imageUrls = images.map((img) => img.url);
2925
+ const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
2603
2926
  const targets = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map((img) => ({
2604
2927
  url: img.url,
2928
+ time: timeByUrl.get(img.url),
2605
2929
  source: {
2606
2930
  kind: "file",
2607
2931
  buffer: img.buffer,
2608
2932
  contentType: img.contentType
2609
2933
  },
2610
2934
  credentials
2611
- })) : imageUrls.map((url) => ({
2612
- url,
2613
- source: { kind: "url", value: url },
2935
+ })) : images.map((img) => ({
2936
+ url: img.url,
2937
+ time: img.time,
2938
+ source: { kind: "url", value: img.url },
2614
2939
  credentials
2615
2940
  }));
2616
2941
  return await processConcurrently(targets, moderateImageWithHive, maxConcurrent);
@@ -2621,10 +2946,8 @@ async function getThumbnailUrlsFromTimestamps(playbackId, timestampsMs, options)
2621
2946
  const baseUrl = getMuxThumbnailBaseUrl(playbackId);
2622
2947
  const urlPromises = timestampsMs.map(async (tsMs) => {
2623
2948
  const time = Number((tsMs / 1e3).toFixed(2));
2624
- if (shouldSign) {
2625
- return signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials);
2626
- }
2627
- return `${baseUrl}?time=${time}&width=${width}`;
2949
+ const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
2950
+ return { url, time };
2628
2951
  });
2629
2952
  return Promise.all(urlPromises);
2630
2953
  }
@@ -3905,12 +4228,187 @@ async function translateAudio(assetId, toLanguageCode, options = {}) {
3905
4228
  }
3906
4229
 
3907
4230
  // src/workflows/translate-captions.ts
3908
- import { generateText as generateText5, Output as Output5 } from "ai";
4231
+ import {
4232
+ APICallError,
4233
+ generateText as generateText5,
4234
+ NoObjectGeneratedError,
4235
+ Output as Output5,
4236
+ RetryError,
4237
+ TypeValidationError
4238
+ } from "ai";
4239
+ import dedent5 from "dedent";
3909
4240
  import { z as z6 } from "zod";
3910
4241
  var translationSchema = z6.object({
3911
4242
  translation: z6.string()
3912
4243
  });
3913
- var SYSTEM_PROMPT4 = 'You are a subtitle translation expert. Translate VTT subtitle files to the target language specified by the user. Preserve all timestamps and VTT formatting exactly as they appear. Return JSON with a single key "translation" containing the translated VTT content.';
4244
+ var SYSTEM_PROMPT4 = dedent5`
4245
+ You are a subtitle translation expert. Translate VTT subtitle files to the target language specified by the user.
4246
+ You may receive either a full VTT file or a chunk from a larger VTT.
4247
+ Preserve all timestamps, cue ordering, and VTT formatting exactly as they appear.
4248
+ Return JSON with a single key "translation" containing the translated VTT content.
4249
+ `;
4250
+ var CUE_TRANSLATION_SYSTEM_PROMPT = dedent5`
4251
+ You are a subtitle translation expert.
4252
+ You will receive a sequence of subtitle cues extracted from a VTT file.
4253
+ Translate the cues to the requested target language while preserving their original order.
4254
+ Treat the cue list as continuous context so the translation reads naturally across adjacent lines.
4255
+ Return JSON with a single key "translations" containing exactly one translated string for each input cue.
4256
+ Do not merge, split, omit, reorder, or add cues.
4257
+ `;
4258
+ var DEFAULT_TRANSLATION_CHUNKING = {
4259
+ enabled: true,
4260
+ minimumAssetDurationSeconds: 30 * 60,
4261
+ targetChunkDurationSeconds: 30 * 60,
4262
+ maxConcurrentTranslations: 4,
4263
+ maxCuesPerChunk: 80,
4264
+ maxCueTextTokensPerChunk: 2e3
4265
+ };
4266
+ var TOKEN_USAGE_FIELDS = [
4267
+ "inputTokens",
4268
+ "outputTokens",
4269
+ "totalTokens",
4270
+ "reasoningTokens",
4271
+ "cachedInputTokens"
4272
+ ];
4273
+ var TranslationChunkValidationError = class extends Error {
4274
+ constructor(message) {
4275
+ super(message);
4276
+ this.name = "TranslationChunkValidationError";
4277
+ }
4278
+ };
4279
+ function isTranslationChunkValidationError(error) {
4280
+ return error instanceof TranslationChunkValidationError;
4281
+ }
4282
+ function isProviderServiceError(error) {
4283
+ if (!error) {
4284
+ return false;
4285
+ }
4286
+ if (RetryError.isInstance(error)) {
4287
+ return isProviderServiceError(error.lastError);
4288
+ }
4289
+ if (APICallError.isInstance(error)) {
4290
+ return true;
4291
+ }
4292
+ if (error instanceof Error && "cause" in error) {
4293
+ return isProviderServiceError(error.cause);
4294
+ }
4295
+ return false;
4296
+ }
4297
+ function shouldSplitChunkTranslationError(error) {
4298
+ if (isProviderServiceError(error)) {
4299
+ return false;
4300
+ }
4301
+ return NoObjectGeneratedError.isInstance(error) || TypeValidationError.isInstance(error) || isTranslationChunkValidationError(error);
4302
+ }
4303
+ function isDefinedTokenUsageValue(value) {
4304
+ return typeof value === "number";
4305
+ }
4306
+ function resolveTranslationChunkingOptions(options) {
4307
+ const targetChunkDurationSeconds = Math.max(
4308
+ 1,
4309
+ options?.targetChunkDurationSeconds ?? DEFAULT_TRANSLATION_CHUNKING.targetChunkDurationSeconds
4310
+ );
4311
+ return {
4312
+ enabled: options?.enabled ?? DEFAULT_TRANSLATION_CHUNKING.enabled,
4313
+ minimumAssetDurationSeconds: Math.max(
4314
+ 1,
4315
+ options?.minimumAssetDurationSeconds ?? DEFAULT_TRANSLATION_CHUNKING.minimumAssetDurationSeconds
4316
+ ),
4317
+ targetChunkDurationSeconds,
4318
+ maxConcurrentTranslations: Math.max(
4319
+ 1,
4320
+ options?.maxConcurrentTranslations ?? DEFAULT_TRANSLATION_CHUNKING.maxConcurrentTranslations
4321
+ ),
4322
+ maxCuesPerChunk: Math.max(
4323
+ 1,
4324
+ options?.maxCuesPerChunk ?? DEFAULT_TRANSLATION_CHUNKING.maxCuesPerChunk
4325
+ ),
4326
+ maxCueTextTokensPerChunk: Math.max(
4327
+ 1,
4328
+ options?.maxCueTextTokensPerChunk ?? DEFAULT_TRANSLATION_CHUNKING.maxCueTextTokensPerChunk
4329
+ )
4330
+ };
4331
+ }
4332
+ function aggregateTokenUsage(usages) {
4333
+ return TOKEN_USAGE_FIELDS.reduce((aggregate, field) => {
4334
+ const values = usages.map((usage) => usage[field]).filter(isDefinedTokenUsageValue);
4335
+ if (values.length > 0) {
4336
+ aggregate[field] = values.reduce((total, value) => total + value, 0);
4337
+ }
4338
+ return aggregate;
4339
+ }, {});
4340
+ }
4341
+ function createTranslationChunkRequest(id, cues, cueBlocks) {
4342
+ return {
4343
+ id,
4344
+ cueCount: cues.length,
4345
+ startTime: cues[0].startTime,
4346
+ endTime: cues[cues.length - 1].endTime,
4347
+ cues,
4348
+ cueBlocks
4349
+ };
4350
+ }
4351
+ function splitTranslationChunkRequestByBudget(id, cues, cueBlocks, maxCuesPerChunk, maxCueTextTokensPerChunk) {
4352
+ const chunks = chunkVTTCuesByBudget(cues, {
4353
+ maxCuesPerChunk,
4354
+ maxTextTokensPerChunk: maxCueTextTokensPerChunk
4355
+ });
4356
+ return chunks.map(
4357
+ (chunk, index) => createTranslationChunkRequest(
4358
+ chunks.length === 1 ? id : `${id}-part-${index}`,
4359
+ cues.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1),
4360
+ cueBlocks.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1)
4361
+ )
4362
+ );
4363
+ }
4364
+ function buildTranslationChunkRequests(vttContent, assetDurationSeconds, chunkingOptions) {
4365
+ const resolvedChunking = resolveTranslationChunkingOptions(chunkingOptions);
4366
+ const cues = parseVTTCues(vttContent);
4367
+ if (cues.length === 0) {
4368
+ return null;
4369
+ }
4370
+ const { preamble, cueBlocks } = splitVttPreambleAndCueBlocks(vttContent);
4371
+ if (cueBlocks.length !== cues.length) {
4372
+ console.warn(
4373
+ `Falling back to full-VTT caption translation because cue block count (${cueBlocks.length}) does not match parsed cue count (${cues.length}).`
4374
+ );
4375
+ return null;
4376
+ }
4377
+ if (!resolvedChunking.enabled) {
4378
+ return {
4379
+ preamble,
4380
+ chunks: [
4381
+ createTranslationChunkRequest("chunk-0", cues, cueBlocks)
4382
+ ]
4383
+ };
4384
+ }
4385
+ if (typeof assetDurationSeconds !== "number" || assetDurationSeconds < resolvedChunking.minimumAssetDurationSeconds) {
4386
+ return {
4387
+ preamble,
4388
+ chunks: [
4389
+ createTranslationChunkRequest("chunk-0", cues, cueBlocks)
4390
+ ]
4391
+ };
4392
+ }
4393
+ const targetChunkDurationSeconds = resolvedChunking.targetChunkDurationSeconds;
4394
+ const durationChunks = chunkVTTCuesByDuration(cues, {
4395
+ targetChunkDurationSeconds,
4396
+ maxChunkDurationSeconds: Math.max(targetChunkDurationSeconds, Math.round(targetChunkDurationSeconds * (7 / 6))),
4397
+ minChunkDurationSeconds: Math.max(1, Math.round(targetChunkDurationSeconds * (2 / 3)))
4398
+ });
4399
+ return {
4400
+ preamble,
4401
+ chunks: durationChunks.flatMap(
4402
+ (chunk) => splitTranslationChunkRequestByBudget(
4403
+ chunk.id,
4404
+ cues.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1),
4405
+ cueBlocks.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1),
4406
+ resolvedChunking.maxCuesPerChunk,
4407
+ resolvedChunking.maxCueTextTokensPerChunk
4408
+ )
4409
+ )
4410
+ };
4411
+ }
3914
4412
  async function fetchVttFromMux(vttUrl) {
3915
4413
  "use step";
3916
4414
  const vttResponse = await fetch(vttUrl);
@@ -3956,6 +4454,176 @@ ${vttContent}`
3956
4454
  }
3957
4455
  };
3958
4456
  }
4457
/**
 * Translates one chunk of VTT cues with an LLM via structured output.
 *
 * @param {object} args
 * @param {Array<{startTime: *, endTime: *, text: string}>} args.cues - Cues to translate.
 * @param {string} args.fromLanguageCode - Source language code.
 * @param {string} args.toLanguageCode - Target language code.
 * @param {string} args.provider - Model provider identifier.
 * @param {string} args.modelId - Model identifier.
 * @param {object} args.credentials - Provider credentials.
 * @returns {Promise<{translations: string[], usage: object}>} Translated cue texts
 *   (same order as input) plus token-usage accounting from the model response.
 */
async function translateCueChunkWithAI({
  cues,
  fromLanguageCode,
  toLanguageCode,
  provider,
  modelId,
  credentials
}) {
  "use step";
  const model = await createLanguageModelFromConfig(provider, modelId, credentials);
  // Schema forces the model to return exactly one non-empty string per input cue,
  // so translations can be zipped back onto cue blocks by index.
  const schema = z6.object({
    translations: z6.array(z6.string().min(1)).length(cues.length)
  });
  // Include timing alongside text so the model has context, but only the text
  // is expected back.
  const cuePayload = cues.map((cue, index) => ({
    index,
    startTime: cue.startTime,
    endTime: cue.endTime,
    text: cue.text
  }));
  const response = await generateText5({
    model,
    output: Output5.object({ schema }),
    messages: [
      {
        role: "system",
        content: CUE_TRANSLATION_SYSTEM_PROMPT
      },
      {
        role: "user",
        content: `Translate from ${fromLanguageCode} to ${toLanguageCode}.
Return exactly ${cues.length} translated cues in the same order as the input.

${JSON.stringify(cuePayload, null, 2)}`
      }
    ]
  });
  return {
    translations: response.output.translations,
    usage: {
      inputTokens: response.usage.inputTokens,
      outputTokens: response.usage.outputTokens,
      totalTokens: response.usage.totalTokens,
      reasoningTokens: response.usage.reasoningTokens,
      cachedInputTokens: response.usage.cachedInputTokens
    }
  };
}
4504
/**
 * Splits a translation chunk into two halves at its cue midpoint.
 *
 * @param {object} chunk - Chunk request with `id`, `cueCount`, `cues`, `cueBlocks`.
 * @returns {[object, object]} Two new chunk requests, suffixed `-a` and `-b`.
 * @throws {Error} When the chunk has fewer than two cues and cannot be split.
 */
function splitTranslationChunkAtMidpoint(chunk) {
  const splitIndex = Math.floor(chunk.cueCount / 2);
  const canSplit = splitIndex > 0 && splitIndex < chunk.cueCount;
  if (!canSplit) {
    throw new Error(`Cannot split chunk ${chunk.id} with cueCount=${chunk.cueCount}`);
  }
  const firstHalf = createTranslationChunkRequest(
    `${chunk.id}-a`,
    chunk.cues.slice(0, splitIndex),
    chunk.cueBlocks.slice(0, splitIndex)
  );
  const secondHalf = createTranslationChunkRequest(
    `${chunk.id}-b`,
    chunk.cues.slice(splitIndex),
    chunk.cueBlocks.slice(splitIndex)
  );
  return [firstHalf, secondHalf];
}
4522
/**
 * Translates one chunk, recursively splitting it in half and retrying when the
 * failure is a splittable validation/size error.
 *
 * @param {object} args
 * @param {object} args.chunk - Chunk request (`id`, `cues`, `cueBlocks`, `cueCount`,
 *   `startTime`, `endTime`).
 * @param {string} args.fromLanguageCode - Source language code.
 * @param {string} args.toLanguageCode - Target language code.
 * @param {string} args.provider - Model provider identifier.
 * @param {string} args.modelId - Model identifier.
 * @param {object} args.credentials - Provider credentials.
 * @returns {Promise<{translatedVtt: string, usage: object}>} Translated VTT segment
 *   for this chunk plus aggregated token usage.
 * @throws {Error} When translation fails and the chunk cannot be split further;
 *   the original failure is attached as `cause`.
 */
async function translateChunkWithFallback({
  chunk,
  fromLanguageCode,
  toLanguageCode,
  provider,
  modelId,
  credentials
}) {
  "use step";
  try {
    const result = await translateCueChunkWithAI({
      cues: chunk.cues,
      fromLanguageCode,
      toLanguageCode,
      provider,
      modelId,
      credentials
    });
    // Defensive re-check: the schema should already enforce the count, but a
    // mismatch here would silently misalign translations with cue blocks.
    if (result.translations.length !== chunk.cueCount) {
      throw new TranslationChunkValidationError(
        `Chunk ${chunk.id} returned ${result.translations.length} cues, expected ${chunk.cueCount} for ${Math.round(chunk.startTime)}s-${Math.round(chunk.endTime)}s`
      );
    }
    return {
      translatedVtt: buildVttFromTranslatedCueBlocks(chunk.cueBlocks, result.translations),
      usage: result.usage
    };
  } catch (error) {
    if (!shouldSplitChunkTranslationError(error) || chunk.cueCount <= 1) {
      // Preserve the original error (stack, metadata) via `cause` instead of
      // flattening it into the message string only.
      throw new Error(
        `Chunk ${chunk.id} failed for ${Math.round(chunk.startTime)}s-${Math.round(chunk.endTime)}s: ${error instanceof Error ? error.message : "Unknown error"}`,
        { cause: error }
      );
    }
    // Split in half and translate both halves in parallel; recursion bottoms out
    // at cueCount <= 1 above.
    const [leftChunk, rightChunk] = splitTranslationChunkAtMidpoint(chunk);
    const [leftResult, rightResult] = await Promise.all([
      translateChunkWithFallback({
        chunk: leftChunk,
        fromLanguageCode,
        toLanguageCode,
        provider,
        modelId,
        credentials
      }),
      translateChunkWithFallback({
        chunk: rightChunk,
        fromLanguageCode,
        toLanguageCode,
        provider,
        modelId,
        credentials
      })
    ]);
    return {
      translatedVtt: concatenateVttSegments([leftResult.translatedVtt, rightResult.translatedVtt]),
      usage: aggregateTokenUsage([leftResult.usage, rightResult.usage])
    };
  }
}
4580
/**
 * Translates a full caption track, chunking it when a chunk plan is available
 * and falling back to single-request full-VTT translation otherwise.
 *
 * @param {object} args
 * @param {string} args.vttContent - Full WebVTT document to translate.
 * @param {number|undefined} args.assetDurationSeconds - Asset duration, when known.
 * @param {string} args.fromLanguageCode - Source language code.
 * @param {string} args.toLanguageCode - Target language code.
 * @param {string} args.provider - Model provider identifier.
 * @param {string} args.modelId - Model identifier.
 * @param {object} args.credentials - Provider credentials.
 * @param {object|undefined} args.chunking - Chunking option overrides.
 * @returns {Promise<{translatedVtt: string, usage: object}>} Reassembled translated
 *   VTT plus aggregated token usage across all chunks.
 */
async function translateCaptionTrack({
  vttContent,
  assetDurationSeconds,
  fromLanguageCode,
  toLanguageCode,
  provider,
  modelId,
  credentials,
  chunking
}) {
  "use step";
  const chunkPlan = buildTranslationChunkRequests(vttContent, assetDurationSeconds, chunking);
  if (chunkPlan === null) {
    // No usable chunk plan — translate the whole document in one request.
    return translateVttWithAI({
      vttContent,
      fromLanguageCode,
      toLanguageCode,
      provider,
      modelId,
      credentials
    });
  }
  const { maxConcurrentTranslations } = resolveTranslationChunkingOptions(chunking);
  const translatedSegments = [];
  const usageByChunk = [];
  // Process chunks in fixed-size batches so at most `maxConcurrentTranslations`
  // model calls are in flight at once.
  let cursor = 0;
  while (cursor < chunkPlan.chunks.length) {
    const batch = chunkPlan.chunks.slice(cursor, cursor + maxConcurrentTranslations);
    const batchResults = await Promise.all(
      batch.map(
        (chunk) => translateChunkWithFallback({
          chunk,
          fromLanguageCode,
          toLanguageCode,
          provider,
          modelId,
          credentials
        })
      )
    );
    for (const result of batchResults) {
      translatedSegments.push(result.translatedVtt);
      usageByChunk.push(result.usage);
    }
    cursor += maxConcurrentTranslations;
  }
  return {
    translatedVtt: concatenateVttSegments(translatedSegments, chunkPlan.preamble),
    usage: aggregateTokenUsage(usageByChunk)
  };
}
3959
4627
  async function uploadVttToS3({
3960
4628
  translatedVtt,
3961
4629
  assetId,
@@ -4016,7 +4684,8 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
4016
4684
  s3Bucket: providedS3Bucket,
4017
4685
  uploadToMux: uploadToMuxOption,
4018
4686
  storageAdapter,
4019
- credentials: providedCredentials
4687
+ credentials: providedCredentials,
4688
+ chunking
4020
4689
  } = options;
4021
4690
  const credentials = providedCredentials;
4022
4691
  const effectiveStorageAdapter = storageAdapter;
@@ -4077,13 +4746,15 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
4077
4746
  let translatedVtt;
4078
4747
  let usage;
4079
4748
  try {
4080
- const result = await translateVttWithAI({
4749
+ const result = await translateCaptionTrack({
4081
4750
  vttContent,
4751
+ assetDurationSeconds,
4082
4752
  fromLanguageCode,
4083
4753
  toLanguageCode,
4084
4754
  provider: modelConfig.provider,
4085
4755
  modelId: modelConfig.modelId,
4086
- credentials
4756
+ credentials,
4757
+ chunking
4087
4758
  });
4088
4759
  translatedVtt = result.translatedVtt;
4089
4760
  usage = result.usage;
@@ -4156,6 +4827,7 @@ export {
4156
4827
  HIVE_SEXUAL_CATEGORIES,
4157
4828
  HIVE_VIOLENCE_CATEGORIES,
4158
4829
  SUMMARY_KEYWORD_LIMIT,
4830
+ aggregateTokenUsage,
4159
4831
  askQuestions,
4160
4832
  burnedInCaptionsSchema,
4161
4833
  chapterSchema,
@@ -4167,6 +4839,7 @@ export {
4167
4839
  getSummaryAndTags,
4168
4840
  hasBurnedInCaptions,
4169
4841
  questionAnswerSchema,
4842
+ shouldSplitChunkTranslationError,
4170
4843
  summarySchema,
4171
4844
  translateAudio,
4172
4845
  translateCaptions,