@mux/ai 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -987,24 +987,82 @@ function findCaptionTrack(asset, languageCode) {
987
987
  (track) => track.text_type === "subtitles" && track.language_code === languageCode
988
988
  );
989
989
  }
990
/**
 * Normalizes line terminators to LF.
 *
 * The WebVTT spec permits CRLF, CR, or LF as line terminators; the previous
 * implementation only rewrote CRLF, leaving lone CR terminators embedded in
 * "lines" after a later split("\n").
 *
 * @param {string} value - Raw text (typically VTT content).
 * @returns {string} Text with every CRLF or lone CR replaced by LF.
 */
function normalizeLineEndings(value) {
  return value.replace(/\r\n?/g, "\n");
}
993
/**
 * True when a (trimmed) line is a WebVTT cue timing line, identified by the
 * "-->" arrow separator between start and end timestamps.
 * @param {string} line
 * @returns {boolean}
 */
function isTimingLine(line) {
  return /-->/.test(line);
}
996
/**
 * Parses a purely numeric cue identifier line.
 * @param {string} line - A trimmed candidate line.
 * @returns {number|null} The integer value, or null if the line is not all digits.
 */
function parseNumericCueIdentifier(line) {
  const isAllDigits = /^\d+$/.test(line);
  return isAllDigits ? Number.parseInt(line, 10) : null;
}
1002
/**
 * True for identifier lines of the form "<number> - <title>",
 * e.g. "12 - Chapter Two": digits, whitespace-wrapped dash, then content.
 * @param {string} line
 * @returns {boolean}
 */
function isLikelyTitledCueIdentifier(line) {
  const titledIdentifierPattern = /^\d+\s+-\s+\S.*$/;
  return titledIdentifierPattern.test(line);
}
1005
/**
 * Heuristic: does `line` look like a cue identifier for the timing line that
 * follows it? Numeric identifiers must continue the running sequence (1 when
 * no previous identifier is known, previous + 1 otherwise); otherwise the
 * "<number> - <title>" form is accepted.
 *
 * @param {{line: string, nextLine: string|undefined, previousCueIdentifier: number|null|undefined}} args
 * @returns {boolean}
 */
function isLikelyCueIdentifier({
  line,
  nextLine,
  previousCueIdentifier
}) {
  const precedesTimingLine = Boolean(line) && Boolean(nextLine) && isTimingLine(nextLine);
  if (!precedesTimingLine) {
    return false;
  }
  const numericIdentifier = parseNumericCueIdentifier(line);
  if (numericIdentifier === null) {
    return isLikelyTitledCueIdentifier(line);
  }
  // == null matches both null and undefined, as the original did.
  const expectedIdentifier = previousCueIdentifier == null ? 1 : previousCueIdentifier + 1;
  return numericIdentifier === expectedIdentifier;
}
1022
/**
 * Returns the index of the cue-identifier line that sits directly above the
 * timing line at `timingLineIndex`, or -1 when there is none (start of file,
 * blank line, another timing line, or a line that fails the identifier
 * heuristic).
 *
 * @param {string[]} lines - All lines of the VTT document.
 * @param {number} timingLineIndex - Index of a known timing line.
 * @param {number|null} previousCueIdentifier - Last numeric identifier seen.
 * @returns {number} Identifier line index, or -1.
 */
function getCueIdentifierLineIndex(lines, timingLineIndex, previousCueIdentifier) {
  const candidateIndex = timingLineIndex - 1;
  if (candidateIndex < 0) {
    return -1;
  }
  const candidateLine = lines[candidateIndex].trim();
  if (!candidateLine || isTimingLine(candidateLine)) {
    return -1;
  }
  const looksLikeIdentifier = isLikelyCueIdentifier({
    line: candidateLine,
    nextLine: lines[timingLineIndex]?.trim(),
    previousCueIdentifier
  });
  return looksLikeIdentifier ? candidateIndex : -1;
}
990
1037
  function extractTextFromVTT(vttContent) {
991
1038
  if (!vttContent.trim()) {
992
1039
  return "";
993
1040
  }
994
1041
  const lines = vttContent.split("\n");
995
1042
  const textLines = [];
1043
+ let previousCueIdentifier = null;
1044
+ let isInsideNoteBlock = false;
996
1045
  for (let i = 0; i < lines.length; i++) {
997
1046
  const line = lines[i].trim();
998
- if (!line)
1047
+ const nextLine = lines[i + 1]?.trim();
1048
+ if (!line) {
1049
+ isInsideNoteBlock = false;
1050
+ continue;
1051
+ }
1052
+ if (isInsideNoteBlock)
999
1053
  continue;
1000
1054
  if (line === "WEBVTT")
1001
1055
  continue;
1002
- if (line.startsWith("NOTE "))
1056
+ if (line === "NOTE" || line.startsWith("NOTE ")) {
1057
+ isInsideNoteBlock = true;
1003
1058
  continue;
1004
- if (line.includes("-->"))
1059
+ }
1060
+ if (isTimingLine(line))
1005
1061
  continue;
1006
- if (/^[\w-]+$/.test(line) && !line.includes(" "))
1062
+ if (isLikelyCueIdentifier({ line, nextLine, previousCueIdentifier })) {
1063
+ previousCueIdentifier = parseNumericCueIdentifier(line);
1007
1064
  continue;
1065
+ }
1008
1066
  if (line.startsWith("STYLE") || line.startsWith("REGION"))
1009
1067
  continue;
1010
1068
  const cleanLine = line.replace(/<[^>]*>/g, "").trim();
@@ -1053,20 +1111,34 @@ function parseVTTCues(vttContent) {
1053
1111
  return [];
1054
1112
  const lines = vttContent.split("\n");
1055
1113
  const cues = [];
1114
+ let previousCueIdentifier = null;
1056
1115
  for (let i = 0; i < lines.length; i++) {
1057
1116
  const line = lines[i].trim();
1058
- if (line.includes("-->")) {
1117
+ if (isTimingLine(line)) {
1059
1118
  const [startStr, endStr] = line.split(" --> ").map((s) => s.trim());
1060
1119
  const startTime = vttTimestampToSeconds(startStr);
1061
1120
  const endTime = vttTimestampToSeconds(endStr.split(" ")[0]);
1062
- const textLines = [];
1121
+ const currentCueIdentifierLine = lines[i - 1]?.trim() ?? "";
1122
+ const currentCueIdentifier = isLikelyCueIdentifier({
1123
+ line: currentCueIdentifierLine,
1124
+ nextLine: line,
1125
+ previousCueIdentifier
1126
+ }) ? parseNumericCueIdentifier(currentCueIdentifierLine) : null;
1127
+ const rawTextLines = [];
1063
1128
  let j = i + 1;
1064
- while (j < lines.length && lines[j].trim() && !lines[j].includes("-->")) {
1065
- const cleanLine = lines[j].trim().replace(/<[^>]*>/g, "");
1066
- if (cleanLine)
1067
- textLines.push(cleanLine);
1129
+ while (j < lines.length && lines[j].trim() && !isTimingLine(lines[j].trim())) {
1130
+ rawTextLines.push(lines[j].trim());
1068
1131
  j++;
1069
1132
  }
1133
+ const trailingNumericLine = parseNumericCueIdentifier(rawTextLines.at(-1) ?? "");
1134
+ if (trailingNumericLine !== null && isLikelyCueIdentifier({
1135
+ line: rawTextLines.at(-1) ?? "",
1136
+ nextLine: lines[j]?.trim(),
1137
+ previousCueIdentifier: currentCueIdentifier
1138
+ }) && rawTextLines.length > 1) {
1139
+ rawTextLines.pop();
1140
+ }
1141
+ const textLines = rawTextLines.map((textLine) => textLine.replace(/<[^>]*>/g, "")).filter(Boolean);
1070
1142
  if (textLines.length > 0) {
1071
1143
  cues.push({
1072
1144
  startTime,
@@ -1074,10 +1146,102 @@ function parseVTTCues(vttContent) {
1074
1146
  text: textLines.join(" ")
1075
1147
  });
1076
1148
  }
1149
+ previousCueIdentifier = currentCueIdentifier;
1077
1150
  }
1078
1151
  }
1079
1152
  return cues;
1080
1153
  }
1154
/**
 * Splits raw VTT content into a preamble (header, NOTE/STYLE blocks, etc.)
 * and an array of cue blocks (identifier + timing line + text).
 *
 * Fast path: blocks separated by blank lines, one timing line per block.
 * Slow path: when a blank-line-delimited block contains multiple timing lines
 * ("merged" cues), the content is re-segmented line-by-line around each
 * timing line instead.
 *
 * @param {string} vttContent - Raw VTT text (any line-ending style).
 * @returns {{preamble: string, cueBlocks: string[]}}
 */
function splitVttPreambleAndCueBlocks(vttContent) {
  const normalizedContent = normalizeLineEndings(vttContent).trim();
  if (!normalizedContent) {
    // Empty input still yields a valid header so callers can rebuild a file.
    return {
      preamble: "WEBVTT",
      cueBlocks: []
    };
  }
  // Blank-line-delimited blocks, trimmed, with empty blocks dropped.
  const rawBlocks = normalizedContent.split(/\n{2,}/).map((block) => block.trim()).filter(Boolean);
  const cueBlockStartIndex = rawBlocks.findIndex((block) => block.includes("-->"));
  if (cueBlockStartIndex === -1) {
    // No cues at all: everything is preamble; prepend WEBVTT if missing.
    return {
      preamble: normalizedContent.startsWith("WEBVTT") ? normalizedContent : `WEBVTT

${normalizedContent}`,
      cueBlocks: []
    };
  }
  // A block with more than one "-->" means cues were not blank-line separated.
  const hasMergedCueBlocks = rawBlocks.slice(cueBlockStartIndex).some((block) => (block.match(/-->/g) ?? []).length > 1);
  if (hasMergedCueBlocks) {
    const lines = normalizedContent.split("\n");
    const timingLineIndices = lines.map((line, index) => isTimingLine(line.trim()) ? index : -1).filter((index) => index >= 0);
    let previousCueIdentifier = null;
    // Preamble ends where the first cue (identifier line if present) begins.
    const firstCueStartIndex = getCueIdentifierLineIndex(lines, timingLineIndices[0], previousCueIdentifier);
    const preambleEndIndex = firstCueStartIndex >= 0 ? firstCueStartIndex : timingLineIndices[0];
    const preamble2 = lines.slice(0, preambleEndIndex).join("\n").trim() || "WEBVTT";
    const cueBlocks2 = timingLineIndices.map((timingLineIndex, index) => {
      const cueIdentifierLineIndex = getCueIdentifierLineIndex(lines, timingLineIndex, previousCueIdentifier);
      const cueStartIndex = cueIdentifierLineIndex >= 0 ? cueIdentifierLineIndex : timingLineIndex;
      const currentCueIdentifier = cueIdentifierLineIndex >= 0 ? parseNumericCueIdentifier(lines[cueIdentifierLineIndex].trim()) : null;
      const nextTimingLineIndex = timingLineIndices[index + 1] ?? lines.length;
      let cueEndIndex = nextTimingLineIndex - 1;
      // Drop trailing blank lines from this cue's span.
      while (cueEndIndex > timingLineIndex && !lines[cueEndIndex].trim()) {
        cueEndIndex--;
      }
      // If the last line is actually the NEXT cue's identifier, exclude it.
      const nextCueIdentifierLineIndex = index < timingLineIndices.length - 1 ? getCueIdentifierLineIndex(lines, nextTimingLineIndex, currentCueIdentifier) : -1;
      if (nextCueIdentifierLineIndex === cueEndIndex) {
        cueEndIndex--;
      }
      // Re-trim blanks that may now be exposed at the end.
      while (cueEndIndex > timingLineIndex && !lines[cueEndIndex].trim()) {
        cueEndIndex--;
      }
      previousCueIdentifier = currentCueIdentifier;
      return lines.slice(cueStartIndex, cueEndIndex + 1).join("\n").trim();
    });
    return {
      preamble: preamble2,
      cueBlocks: cueBlocks2
    };
  }
  const preambleBlocks = rawBlocks.slice(0, cueBlockStartIndex);
  const cueBlocks = rawBlocks.slice(cueBlockStartIndex);
  const preamble = preambleBlocks.length > 0 ? preambleBlocks.join("\n\n") : "WEBVTT";
  return {
    preamble,
    cueBlocks
  };
}
1212
/**
 * Reassembles a VTT document from a preamble and cue blocks: preamble,
 * blank line, cue blocks separated by blank lines, trailing newline.
 * @param {string[]} cueBlocks
 * @param {string} [preamble="WEBVTT"]
 * @returns {string}
 */
function buildVttFromCueBlocks(cueBlocks, preamble = "WEBVTT") {
  const header = preamble.trim();
  if (cueBlocks.length === 0) {
    return header + "\n";
  }
  const body = cueBlocks.map((block) => block.trim()).join("\n\n");
  return header + "\n\n" + body + "\n";
}
1222
/**
 * Replaces the text payload of a single cue block while keeping its header
 * (optional identifier line plus the timing line) intact.
 *
 * @param {string} cueBlock - One cue block (identifier?, timing line, text).
 * @param {string} translatedText - Replacement payload (may be multi-line).
 * @returns {string} The rebuilt cue block.
 * @throws {Error} When the block has no timing line.
 */
function replaceCueText(cueBlock, translatedText) {
  const toTrimmedLines = (text) => normalizeLineEndings(text).split("\n").map((line) => line.trim()).filter(Boolean);
  const cueLines = toTrimmedLines(cueBlock);
  const timingLineIndex = cueLines.findIndex((line) => line.includes("-->"));
  if (timingLineIndex === -1) {
    throw new Error("Cue block is missing a timestamp line");
  }
  const header = cueLines.slice(0, timingLineIndex + 1);
  const payload = toTrimmedLines(translatedText);
  return header.concat(payload).join("\n");
}
1232
/**
 * Rebuilds a full VTT document by pairing each original cue block with its
 * translated text (positionally) and reassembling with the given preamble.
 *
 * @param {string[]} cueBlocks
 * @param {string[]} translatedTexts - Must be the same length as cueBlocks.
 * @param {string} [preamble="WEBVTT"]
 * @returns {string}
 * @throws {Error} On a length mismatch between blocks and translations.
 */
function buildVttFromTranslatedCueBlocks(cueBlocks, translatedTexts, preamble = "WEBVTT") {
  if (cueBlocks.length !== translatedTexts.length) {
    throw new Error(`Expected ${cueBlocks.length} translated cues, received ${translatedTexts.length}`);
  }
  const updatedBlocks = cueBlocks.map((cueBlock, index) => replaceCueText(cueBlock, translatedTexts[index]));
  return buildVttFromCueBlocks(updatedBlocks, preamble);
}
1241
/**
 * Concatenates several VTT documents into one: each segment's cue blocks are
 * collected in order (segment preambles are discarded) under a single preamble.
 * @param {string[]} segments - Complete VTT documents.
 * @param {string} [preamble="WEBVTT"]
 * @returns {string}
 */
function concatenateVttSegments(segments, preamble = "WEBVTT") {
  const allCueBlocks = [];
  for (const segment of segments) {
    allCueBlocks.push(...splitVttPreambleAndCueBlocks(segment).cueBlocks);
  }
  return buildVttFromCueBlocks(allCueBlocks, preamble);
}
1081
1245
  async function buildTranscriptUrl(playbackId, trackId, shouldSign = false, credentials) {
1082
1246
  "use step";
1083
1247
  const baseUrl = `https://stream.mux.com/${playbackId}/text/${trackId}.vtt`;
@@ -2012,128 +2176,916 @@ async function generateChapters(assetId, languageCode, options = {}) {
2012
2176
  };
2013
2177
  }
2014
2178
 
2015
- // src/workflows/embeddings.ts
2016
- import { embed } from "ai";
2179
+ // src/workflows/edit-captions.ts
2180
+ import { generateText as generateText4, Output as Output4 } from "ai";
2181
+ import dedent4 from "dedent";
2182
+ import { z as z5 } from "zod";
2017
2183
 
2018
- // src/primitives/text-chunking.ts
2019
- function estimateTokenCount(text) {
2020
- const words = text.trim().split(/\s+/).length;
2021
- return Math.ceil(words / 0.75);
2184
// src/lib/mux-tracks.ts
/**
 * Downloads a VTT caption file and returns its body as text.
 * @param {string} vttUrl - Fully qualified (possibly signed) URL to the .vtt file.
 * @returns {Promise<string>} Raw VTT content.
 * @throws {Error} When the HTTP response is not ok.
 */
async function fetchVttFromMux(vttUrl) {
  "use step"; // workflow step boundary directive
  const vttResponse = await fetch(vttUrl);
  if (!vttResponse.ok) {
    throw new Error(`Failed to fetch VTT file: ${vttResponse.statusText}`);
  }
  return vttResponse.text();
}
2023
- function chunkByTokens(text, maxTokens, overlapTokens = 0) {
2024
- if (!text.trim()) {
2025
- return [];
2193
/**
 * Creates a new subtitles text track on a Mux asset, ingesting the VTT from
 * a (presigned) URL.
 *
 * @param {string} assetId - Mux asset ID.
 * @param {string} languageCode - BCP 47 language code for the track.
 * @param {string} trackName - Human-readable track name.
 * @param {string} presignedUrl - URL Mux can fetch the VTT from.
 * @param {*} credentials - Passed to resolveMuxClient to build the API client.
 * @returns {Promise<string>} The created track's ID.
 * @throws {Error} If Mux returns no track ID.
 */
async function createTextTrackOnMux(assetId, languageCode, trackName, presignedUrl, credentials) {
  "use step"; // workflow step boundary directive
  const muxClient = await resolveMuxClient(credentials);
  const mux = await muxClient.createClient();
  const trackResponse = await mux.video.assets.createTrack(assetId, {
    type: "text",
    text_type: "subtitles",
    language_code: languageCode,
    name: trackName,
    url: presignedUrl
  });
  if (!trackResponse.id) {
    throw new Error("Failed to create text track: no track ID returned from Mux");
  }
  return trackResponse.id;
}
2209
+
2210
// src/lib/s3-sigv4.ts
// AWS Signature Version 4 constants for the S3 service.
var AWS4_ALGORITHM = "AWS4-HMAC-SHA256";
var AWS4_REQUEST_TERMINATOR = "aws4_request";
var AWS4_SERVICE = "s3";
// Optional endpoint-host allowlist, parsed once at module load from the
// environment (comma-separated; "*." wildcard prefixes supported).
var S3_ALLOWED_ENDPOINT_PATTERNS = parseEndpointAllowlist(
  env_default.S3_ALLOWED_ENDPOINT_HOSTS
);
2217
+ function getCrypto() {
2218
+ const webCrypto = globalThis.crypto;
2219
+ if (!webCrypto?.subtle) {
2220
+ throw new Error("Web Crypto API is required for S3 signing.");
2050
2221
  }
2051
- return chunks;
2222
+ return webCrypto;
2052
2223
  }
2053
- function createChunkFromCues(cues, index) {
2054
- const text = cues.map((c) => c.text).join(" ");
2055
- return {
2056
- id: `chunk-${index}`,
2057
- text,
2058
- tokenCount: estimateTokenCount(text),
2059
- startTime: cues[0].startTime,
2060
- endTime: cues[cues.length - 1].endTime
2061
- };
2224
// Shared UTF-8 encoder for all signing helpers.
var textEncoder = new TextEncoder();
/**
 * Coerces a value to bytes: strings are UTF-8 encoded, byte arrays pass through.
 * @param {string|Uint8Array} value
 * @returns {Uint8Array}
 */
function toBytes(value) {
  if (typeof value === "string") {
    return textEncoder.encode(value);
  }
  return value;
}
2063
- function chunkVTTCues(cues, maxTokens, overlapCues = 2) {
2064
- if (cues.length === 0)
2065
- return [];
2066
- const chunks = [];
2067
- let currentCues = [];
2068
- let currentTokens = 0;
2069
- let chunkIndex = 0;
2070
- for (let i = 0; i < cues.length; i++) {
2071
- const cue = cues[i];
2072
- const cueTokens = estimateTokenCount(cue.text);
2073
- if (currentTokens + cueTokens > maxTokens && currentCues.length > 0) {
2074
- chunks.push(createChunkFromCues(currentCues, chunkIndex));
2075
- chunkIndex++;
2076
- const overlapStart = Math.max(0, currentCues.length - overlapCues);
2077
- currentCues = currentCues.slice(overlapStart);
2078
- currentTokens = currentCues.reduce(
2079
- (sum, c) => sum + estimateTokenCount(c.text),
2080
- 0
2081
- );
2082
- }
2083
- currentCues.push(cue);
2084
- currentTokens += cueTokens;
2228
/**
 * Renders a byte array as lowercase hex, two digits per byte.
 * @param {Uint8Array} bytes
 * @returns {string}
 */
function bytesToHex(bytes) {
  let hex = "";
  for (const byte of bytes) {
    hex += byte.toString(16).padStart(2, "0");
  }
  return hex;
}
2231
/**
 * SHA-256 digest of a string or byte array, returned as lowercase hex.
 * @param {string|Uint8Array} value
 * @returns {Promise<string>}
 */
async function sha256Hex(value) {
  const digest = await getCrypto().subtle.digest("SHA-256", toBytes(value));
  return bytesToHex(new Uint8Array(digest));
}
2235
/**
 * Raw HMAC-SHA256 of a string message under a byte key.
 * @param {Uint8Array} key - HMAC key bytes.
 * @param {string} value - Message; UTF-8 encoded before signing.
 * @returns {Promise<Uint8Array>} MAC bytes.
 */
async function hmacSha256Raw(key, value) {
  const cryptoKey = await getCrypto().subtle.importKey(
    "raw",
    key,
    { name: "HMAC", hash: "SHA-256" },
    false, // key is not extractable
    ["sign"]
  );
  const signature = await getCrypto().subtle.sign("HMAC", cryptoKey, textEncoder.encode(value));
  return new Uint8Array(signature);
}
2246
/**
 * Derives the SigV4 signing key via the standard HMAC chain:
 * "AWS4" + secret -> date -> region -> service -> "aws4_request".
 * @param {string} secretAccessKey
 * @param {string} shortDate - YYYYMMDD.
 * @param {string} region
 * @returns {Promise<Uint8Array>}
 */
async function deriveSigningKey(secretAccessKey, shortDate, region) {
  const kDate = await hmacSha256Raw(textEncoder.encode(`AWS4${secretAccessKey}`), shortDate);
  const kRegion = await hmacSha256Raw(kDate, region);
  const kService = await hmacSha256Raw(kRegion, AWS4_SERVICE);
  return hmacSha256Raw(kService, AWS4_REQUEST_TERMINATOR);
}
2252
/**
 * Formats a date into the SigV4 timestamp pair.
 * "2024-01-02T03:04:05.678Z" -> amzDate "20240102T030405Z", shortDate "20240102".
 * @param {Date} [date=new Date()]
 * @returns {{amzDate: string, shortDate: string}}
 */
function formatAmzDate(date = new Date()) {
  const compactIso = date.toISOString().replace(/[-:]/g, "");
  return {
    amzDate: `${compactIso.slice(0, 15)}Z`,
    shortDate: compactIso.slice(0, 8)
  };
}
2258
/**
 * Strict RFC 3986 percent-encoding: like encodeURIComponent, but also escapes
 * the five characters it leaves alone (! ' ( ) *), as SigV4 requires.
 * @param {string} value
 * @returns {string}
 */
function encodeRFC3986(value) {
  const extraEscapes = { "!": "%21", "'": "%27", "(": "%28", ")": "%29", "*": "%2A" };
  return encodeURIComponent(value).replace(/[!'()*]/g, (char) => extraEscapes[char]);
}
2261
/**
 * Percent-encodes a path segment-by-segment, preserving "/" separators.
 * @param {string} path
 * @returns {string}
 */
function encodePath(path) {
  const segments = path.split("/");
  return segments.map((segment) => encodeRFC3986(segment)).join("/");
}
2264
/**
 * Parses and validates an S3 endpoint string into a URL.
 * Rejects unparseable endpoints and any with query/hash parts, then applies
 * the HTTPS + allowlist policy via enforceEndpointPolicy.
 * @param {string} endpoint
 * @returns {URL}
 * @throws {Error} On invalid, query-bearing, or policy-violating endpoints.
 */
function normalizeEndpoint(endpoint) {
  let url;
  try {
    url = new URL(endpoint);
  } catch {
    throw new Error(`Invalid S3 endpoint: ${endpoint}`);
  }
  if (url.search || url.hash) {
    // Extra URL parts would corrupt the canonical request we sign later.
    throw new Error("S3 endpoint must not include query params or hash fragments.");
  }
  enforceEndpointPolicy(url);
  return url;
}
2091
- function chunkText(text, strategy) {
2092
- switch (strategy.type) {
2093
- case "token": {
2094
- return chunkByTokens(text, strategy.maxTokens, strategy.overlap ?? 0);
2095
- }
2096
- default: {
2097
- const exhaustiveCheck = strategy;
2098
- throw new Error(`Unsupported chunking strategy: ${exhaustiveCheck}`);
2099
- }
2277
/**
 * Parses a comma-separated host allowlist into normalized (trimmed,
 * lowercased) patterns; blank entries are dropped.
 * @param {string|undefined} allowlist
 * @returns {string[]}
 */
function parseEndpointAllowlist(allowlist) {
  if (!allowlist) {
    return [];
  }
  const patterns = [];
  for (const entry of allowlist.split(",")) {
    const normalized = entry.trim().toLowerCase();
    if (normalized) {
      patterns.push(normalized);
    }
  }
  return patterns;
}
2102
-
2103
- // src/workflows/embeddings.ts
2104
- function averageEmbeddings(embeddings) {
2105
- if (embeddings.length === 0) {
2106
- return [];
2283
/**
 * Matches a hostname against an allowlist pattern. "*.example.com" matches any
 * subdomain (e.g. "a.example.com") but not the bare apex; all other patterns
 * require exact equality.
 * @param {string} hostname - Lowercased hostname.
 * @param {string} pattern - Lowercased pattern, optionally "*."-prefixed.
 * @returns {boolean}
 */
function hostnameMatchesPattern(hostname, pattern) {
  if (!pattern.startsWith("*.")) {
    return hostname === pattern;
  }
  const suffix = pattern.slice(1); // ".example.com"
  return hostname.length > suffix.length && hostname.endsWith(suffix);
}
2290
/**
 * Enforces the S3 endpoint security policy: HTTPS only, and — when an
 * allowlist is configured via S3_ALLOWED_ENDPOINT_HOSTS — the hostname must
 * match at least one allowlist pattern.
 * @param {URL} url - Parsed endpoint.
 * @throws {Error} On non-HTTPS or non-allowlisted hosts.
 */
function enforceEndpointPolicy(url) {
  const hostname = url.hostname.toLowerCase();
  if (url.protocol !== "https:") {
    throw new Error(
      `Insecure S3 endpoint protocol "${url.protocol}" is not allowed. Use HTTPS.`
    );
  }
  // An empty allowlist means "allow any HTTPS host".
  if (S3_ALLOWED_ENDPOINT_PATTERNS.length > 0 && !S3_ALLOWED_ENDPOINT_PATTERNS.some((pattern) => hostnameMatchesPattern(hostname, pattern))) {
    throw new Error(
      `S3 endpoint host "${hostname}" is not in S3_ALLOWED_ENDPOINT_HOSTS.`
    );
  }
}
2120
- async function generateSingleChunkEmbedding({
2121
- chunk,
2122
- provider,
2123
- modelId,
2124
- credentials
2125
- }) {
2126
- "use step";
2127
- const model = await createEmbeddingModelFromConfig(provider, modelId, credentials);
2128
- const response = await withRetry(
2129
- () => embed({
2130
- model,
2131
- value: chunk.text
2132
- })
2133
- );
2134
- return {
2135
- chunkId: chunk.id,
2136
- embedding: response.embedding,
2303
/**
 * Builds the path-style canonical URI: <endpoint-path>/<bucket>/<key>,
 * each piece RFC 3986 encoded (trailing slashes on the endpoint path are
 * stripped first).
 * @param {URL} endpoint
 * @param {string} bucket
 * @param {string} key
 * @returns {string}
 */
function buildCanonicalUri(endpoint, bucket, key) {
  const basePath = endpoint.pathname === "/" ? "" : encodePath(endpoint.pathname.replace(/\/+$/, ""));
  return `${basePath}/${encodeRFC3986(bucket)}/${encodePath(key)}`;
}
2309
/**
 * Builds the SigV4 canonical query string: parameter names sorted and each
 * key/value RFC 3986 encoded.
 *
 * Bug fix: the previous `localeCompare` sort is locale-sensitive, while SigV4
 * mandates strict code-point ordering of parameter names; in locales where
 * case ordering differs, mixed-case names could sort differently from what
 * AWS computes, invalidating the signature. A plain relational comparison
 * gives the required code-unit (ASCII-safe) order.
 *
 * @param {Record<string, string>} params
 * @returns {string}
 */
function buildCanonicalQuery(params) {
  return Object.entries(params)
    .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0))
    .map(([key, value]) => `${encodeRFC3986(key)}=${encodeRFC3986(value)}`)
    .join("&");
}
2312
/**
 * Signs an arbitrary string with the derived SigV4 key for the given
 * date/region and returns the hex signature.
 * @param {string} secretAccessKey
 * @param {string} shortDate - YYYYMMDD.
 * @param {string} region
 * @param {string} value - The string-to-sign.
 * @returns {Promise<string>} Hex-encoded signature.
 */
async function signString(secretAccessKey, shortDate, region, value) {
  const signingKey = await deriveSigningKey(secretAccessKey, shortDate, region);
  const signatureBytes = await hmacSha256Raw(signingKey, value);
  return bytesToHex(signatureBytes);
}
2317
/**
 * Builds the SigV4 credential scope: date/region/service/terminator.
 * @param {string} shortDate - YYYYMMDD.
 * @param {string} region
 * @returns {string}
 */
function buildCredentialScope(shortDate, region) {
  return [shortDate, region, AWS4_SERVICE, AWS4_REQUEST_TERMINATOR].join("/");
}
2320
/**
 * Uploads an object to S3 with a self-contained SigV4 header-signed PUT
 * (no AWS SDK; Web Crypto only).
 *
 * @param {{accessKeyId: string, secretAccessKey: string, endpoint: string,
 *          region: string, bucket: string, key: string,
 *          body: string|Uint8Array, contentType?: string}} args
 * @returns {Promise<void>}
 * @throws {Error} On endpoint policy violations or a non-2xx PUT response.
 */
async function putObjectToS3({
  accessKeyId,
  secretAccessKey,
  endpoint,
  region,
  bucket,
  key,
  body,
  contentType
}) {
  const resolvedEndpoint = normalizeEndpoint(endpoint);
  const canonicalUri = buildCanonicalUri(resolvedEndpoint, bucket, key);
  const host = resolvedEndpoint.host;
  const normalizedContentType = contentType?.trim();
  const { amzDate, shortDate } = formatAmzDate();
  // Signed payload hash; also sent as x-amz-content-sha256.
  const payloadHash = await sha256Hex(body);
  // Headers included in the signature; content-type only when provided.
  // NOTE(review): localeCompare is locale-sensitive, but these fixed
  // lowercase ASCII names sort identically in any locale.
  const signingHeaders = [
    ["host", host],
    ["x-amz-content-sha256", payloadHash],
    ["x-amz-date", amzDate],
    ...normalizedContentType ? [["content-type", normalizedContentType]] : []
  ].sort(([a], [b]) => a.localeCompare(b));
  const canonicalHeaders = signingHeaders.map(([name, value]) => `${name}:${value}`).join("\n");
  const signedHeaders = signingHeaders.map(([name]) => name).join(";");
  // Canonical request: method, URI, (empty) query, headers + blank line,
  // signed header list, payload hash.
  const canonicalRequest = [
    "PUT",
    canonicalUri,
    "",
    `${canonicalHeaders}
`,
    signedHeaders,
    payloadHash
  ].join("\n");
  const credentialScope = buildCredentialScope(shortDate, region);
  const stringToSign = [
    AWS4_ALGORITHM,
    amzDate,
    credentialScope,
    await sha256Hex(canonicalRequest)
  ].join("\n");
  const signature = await signString(secretAccessKey, shortDate, region, stringToSign);
  const authorization = `${AWS4_ALGORITHM} Credential=${accessKeyId}/${credentialScope}, SignedHeaders=${signedHeaders}, Signature=${signature}`;
  const requestUrl = `${resolvedEndpoint.origin}${canonicalUri}`;
  const response = await fetch(requestUrl, {
    method: "PUT",
    headers: {
      "Authorization": authorization,
      "x-amz-content-sha256": payloadHash,
      "x-amz-date": amzDate,
      ...normalizedContentType ? { "content-type": normalizedContentType } : {}
    },
    body
  });
  if (!response.ok) {
    // Include the S3 error body when it can be read; ignore read failures.
    const errorBody = await response.text().catch(() => "");
    const detail = errorBody ? ` ${errorBody}` : "";
    throw new Error(`S3 PUT failed (${response.status} ${response.statusText}).${detail}`);
  }
}
2379
/**
 * Creates a SigV4 query-string-presigned GET URL for an S3 object
 * (UNSIGNED-PAYLOAD, host-only signed headers).
 *
 * @param {{accessKeyId: string, secretAccessKey: string, endpoint: string,
 *          region: string, bucket: string, key: string,
 *          expiresInSeconds?: number}} args - Expiry defaults to 1 hour.
 * @returns {Promise<string>} The presigned URL.
 * @throws {Error} On endpoint policy violations.
 */
async function createPresignedGetUrl({
  accessKeyId,
  secretAccessKey,
  endpoint,
  region,
  bucket,
  key,
  expiresInSeconds = 3600
}) {
  const resolvedEndpoint = normalizeEndpoint(endpoint);
  const canonicalUri = buildCanonicalUri(resolvedEndpoint, bucket, key);
  const host = resolvedEndpoint.host;
  const { amzDate, shortDate } = formatAmzDate();
  const credentialScope = buildCredentialScope(shortDate, region);
  // Presigned URLs sign only the Host header.
  const signedHeaders = "host";
  const queryParams = {
    "X-Amz-Algorithm": AWS4_ALGORITHM,
    "X-Amz-Credential": `${accessKeyId}/${credentialScope}`,
    "X-Amz-Date": amzDate,
    "X-Amz-Expires": `${expiresInSeconds}`,
    "X-Amz-SignedHeaders": signedHeaders
  };
  const canonicalQuery = buildCanonicalQuery(queryParams);
  // Canonical request: method, URI, query, host header + blank line,
  // signed header list, and the UNSIGNED-PAYLOAD sentinel.
  const canonicalRequest = [
    "GET",
    canonicalUri,
    canonicalQuery,
    `host:${host}
`,
    signedHeaders,
    "UNSIGNED-PAYLOAD"
  ].join("\n");
  const stringToSign = [
    AWS4_ALGORITHM,
    amzDate,
    credentialScope,
    await sha256Hex(canonicalRequest)
  ].join("\n");
  const signature = await signString(secretAccessKey, shortDate, region, stringToSign);
  const queryWithSignature = `${canonicalQuery}&X-Amz-Signature=${signature}`;
  return `${resolvedEndpoint.origin}${canonicalUri}?${queryWithSignature}`;
}
2421
+
2422
// src/lib/storage-adapter.ts
/**
 * Asserts that both S3 credentials are present, returning them as a pair.
 * @param {string|undefined} accessKeyId
 * @param {string|undefined} secretAccessKey
 * @returns {{accessKeyId: string, secretAccessKey: string}}
 * @throws {Error} When either credential is missing or empty.
 */
function requireCredentials(accessKeyId, secretAccessKey) {
  const missingCredential = !accessKeyId || !secretAccessKey;
  if (missingCredential) {
    throw new Error(
      "S3 credentials are required for default storage operations. Provide S3_ACCESS_KEY_ID and S3_SECRET_ACCESS_KEY or pass options.storageAdapter."
    );
  }
  return { accessKeyId, secretAccessKey };
}
2431
/**
 * Stores an object via the caller-provided storage adapter when one exists;
 * otherwise falls back to the built-in SigV4 S3 PUT (which requires
 * credentials on the input).
 * @param {object} input - putObject parameters (endpoint, bucket, key, body, ...).
 * @param {{putObject: Function}|undefined} adapter - Optional custom backend.
 * @returns {Promise<void>}
 */
async function putObjectWithStorageAdapter(input, adapter) {
  if (adapter) {
    await adapter.putObject(input);
    return;
  }
  const { accessKeyId, secretAccessKey } = requireCredentials(input.accessKeyId, input.secretAccessKey);
  await putObjectToS3({
    accessKeyId,
    secretAccessKey,
    endpoint: input.endpoint,
    region: input.region,
    bucket: input.bucket,
    key: input.key,
    body: input.body,
    contentType: input.contentType
  });
}
2448
/**
 * Produces a presigned GET URL via the caller-provided storage adapter when
 * one exists; otherwise falls back to the built-in SigV4 presigner (which
 * requires credentials on the input).
 * @param {object} input - Presign parameters (endpoint, bucket, key, expiry, ...).
 * @param {{createPresignedGetUrl: Function}|undefined} adapter
 * @returns {Promise<string>}
 */
async function createPresignedGetUrlWithStorageAdapter(input, adapter) {
  if (adapter) {
    return adapter.createPresignedGetUrl(input);
  }
  const { accessKeyId, secretAccessKey } = requireCredentials(input.accessKeyId, input.secretAccessKey);
  return createPresignedGetUrl({
    accessKeyId,
    secretAccessKey,
    endpoint: input.endpoint,
    region: input.region,
    bucket: input.bucket,
    key: input.key,
    expiresInSeconds: input.expiresInSeconds
  });
}
2463
+
2464
// src/workflows/edit-captions.ts
// Structured-output schema for the profanity-detection model call: a flat
// list of verbatim profane words/phrases, one entry per distinct form.
var profanityDetectionSchema = z5.object({
  profanity: z5.array(z5.string()).describe(
    "Unique profane words or short phrases exactly as they appear in the transcript text. Include each distinct form only once (e.g., if 'fuck' and 'fucking' both appear, list both)."
  )
});
// System prompt for the moderation model (dedent-normalized template).
var SYSTEM_PROMPT3 = dedent4`
You are a content moderation assistant. Your task is to identify profane, vulgar, or obscene
words and phrases in subtitle text. Return ONLY the exact profane words or phrases as they appear
in the text. Do not modify, censor, or paraphrase them. Do not include words that are merely
informal or slang but not profane. Focus on words that would be bleeped on broadcast television.`;
2475
/**
 * Walks a raw VTT string line-by-line and applies `transform(line, cueStartTime)`
 * to cue-text lines only; timing lines, blank lines, and everything outside a
 * cue (preamble, identifier lines after a blank) pass through unchanged.
 *
 * @param {string} rawVtt - Full VTT document.
 * @param {(line: string, cueStartTime: number) => string} transform
 * @returns {string} The rebuilt document.
 */
function transformCueText(rawVtt, transform) {
  const lines = rawVtt.split("\n");
  let inCueText = false;
  let currentCueStartTime = 0;
  const transformed = lines.map((line) => {
    if (line.includes("-->")) {
      // Entering a cue: remember its start time for the transform callback.
      const startTimestamp = line.split("-->")[0].trim();
      currentCueStartTime = vttTimestampToSeconds(startTimestamp);
      inCueText = true;
      return line;
    }
    if (line.trim() === "") {
      // A blank line terminates the current cue's text.
      inCueText = false;
      return line;
    }
    if (inCueText) {
      return transform(line, currentCueStartTime);
    }
    return line;
  });
  return transformed.join("\n");
}
2497
/**
 * Builds a case-insensitive, word-boundary-anchored alternation regex from a
 * word list. Longer entries come first so multi-word phrases win over their
 * substrings; regex metacharacters are escaped.
 * @param {string[]} words
 * @returns {RegExp|null} null when no non-empty words remain.
 */
function buildReplacementRegex(words) {
  const candidates = words.filter((word) => word.length > 0);
  if (candidates.length === 0) {
    return null;
  }
  const alternation = candidates
    .sort((a, b) => b.length - a.length)
    .map((word) => word.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
    .join("|");
  return new RegExp(`\\b(?:${alternation})\\b`, "gi");
}
2506
/**
 * Returns the match-replacement function for a censoring mode:
 * "blank" -> "[____]" (underscores per character), "remove" -> empty string,
 * "mask" -> "????" (one per character).
 * @param {"blank"|"remove"|"mask"} mode
 * @returns {(match: string) => string}
 */
function createReplacer(mode) {
  const replacers = {
    blank: (match) => `[${"_".repeat(match.length)}]`,
    remove: () => "",
    mask: (match) => "?".repeat(match.length)
  };
  return replacers[mode];
}
2516
/**
 * Censors the given profanity list within a VTT document's cue text,
 * recording each replacement with the owning cue's start time.
 * @param {string} rawVtt
 * @param {string[]} profanity - Words/phrases to censor.
 * @param {"blank"|"remove"|"mask"} mode
 * @returns {{censoredVtt: string, replacements: Array<{cueStartTime: number, before: string, after: string}>}}
 */
function censorVttContent(rawVtt, profanity, mode) {
  const regex = profanity.length === 0 ? null : buildReplacementRegex(profanity);
  if (!regex) {
    // Nothing to censor: return the input untouched.
    return { censoredVtt: rawVtt, replacements: [] };
  }
  const replacer = createReplacer(mode);
  const replacementRecords = [];
  const censoredVtt = transformCueText(rawVtt, (line, cueStartTime) =>
    line.replace(regex, (match) => {
      const after = replacer(match);
      replacementRecords.push({ cueStartTime, before: match, after });
      return after;
    })
  );
  return { censoredVtt, replacements: replacementRecords };
}
2535
/**
 * Merges AI-detected words with a mandatory "always censor" list
 * (case-insensitively deduplicated, detected entries first) and then removes
 * anything on the "never censor" list.
 * @param {string[]} detected
 * @param {string[]} alwaysCensor
 * @param {string[]} neverCensor
 * @returns {string[]}
 */
function applyOverrideLists(detected, alwaysCensor, neverCensor) {
  const seenLower = new Set(detected.map((word) => word.toLowerCase()));
  const combined = detected.slice();
  for (const word of alwaysCensor) {
    const lowered = word.toLowerCase();
    if (seenLower.has(lowered)) {
      continue;
    }
    seenLower.add(lowered);
    combined.push(word);
  }
  const exclusions = new Set(neverCensor.map((word) => word.toLowerCase()));
  return combined.filter((word) => !exclusions.has(word.toLowerCase()));
}
2548
/**
 * Applies literal find/replace rules (whole-word, case-sensitive) to a VTT
 * document's cue text, recording each substitution with its cue start time.
 * Rules with an empty `find` are ignored.
 * @param {string} rawVtt
 * @param {Array<{find: string, replace: string}>} replacements
 * @returns {{editedVtt: string, replacements: Array<{cueStartTime: number, before: string, after: string}>}}
 */
function applyReplacements(rawVtt, replacements) {
  const activeRules = replacements.filter((rule) => rule.find.length > 0);
  if (activeRules.length === 0) {
    return { editedVtt: rawVtt, replacements: [] };
  }
  const appliedRecords = [];
  const editedVtt = transformCueText(rawVtt, (line, cueStartTime) => {
    let updated = line;
    for (const { find, replace } of activeRules) {
      const escapedFind = find.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
      const wordBoundaryRegex = new RegExp(`\\b${escapedFind}\\b`, "g");
      updated = updated.replace(wordBoundaryRegex, (match) => {
        appliedRecords.push({ cueStartTime, before: match, after: replace });
        return replace;
      });
    }
    return updated;
  });
  return { editedVtt, replacements: appliedRecords };
}
2568
/**
 * Asks the configured language model to list profane words/phrases found in a
 * plain-text transcript, using structured output against
 * profanityDetectionSchema.
 *
 * @param {{plainText: string, provider: *, modelId: *, credentials: *}} args
 * @returns {Promise<{profanity: string[], usage: object}>} Detected terms plus
 *   token-usage accounting from the model response.
 */
async function identifyProfanityWithAI({
  plainText,
  provider,
  modelId,
  credentials
}) {
  "use step"; // workflow step boundary directive
  const model = await createLanguageModelFromConfig(provider, modelId, credentials);
  const response = await generateText4({
    model,
    output: Output4.object({ schema: profanityDetectionSchema }),
    messages: [
      {
        role: "system",
        content: SYSTEM_PROMPT3
      },
      {
        role: "user",
        content: `Identify all profane words and phrases in the following subtitle transcript. Return each unique profane word or phrase exactly as it appears in the text.

<transcript>
${plainText}
</transcript>`
      }
    ]
  });
  return {
    profanity: response.output.profanity,
    usage: {
      inputTokens: response.usage.inputTokens,
      outputTokens: response.usage.outputTokens,
      totalTokens: response.usage.totalTokens,
      reasoningTokens: response.usage.reasoningTokens,
      cachedInputTokens: response.usage.cachedInputTokens
    }
  };
}
2605
/**
 * Uploads an edited VTT document to storage under a timestamped key and
 * returns a presigned GET URL for it (default expiry 24h). Uses the custom
 * storage adapter when provided, otherwise the built-in S3 client with
 * credentials from the environment.
 *
 * @param {{editedVtt: string, assetId: string, trackId: string,
 *          s3Endpoint: string, s3Region: string, s3Bucket: string,
 *          storageAdapter?: object, s3SignedUrlExpirySeconds?: number}} args
 * @returns {Promise<string>} Presigned GET URL for the uploaded VTT.
 */
async function uploadEditedVttToS3({
  editedVtt,
  assetId,
  trackId,
  s3Endpoint,
  s3Region,
  s3Bucket,
  storageAdapter,
  s3SignedUrlExpirySeconds
}) {
  "use step"; // workflow step boundary directive
  const s3AccessKeyId = env_default.S3_ACCESS_KEY_ID;
  const s3SecretAccessKey = env_default.S3_SECRET_ACCESS_KEY;
  // Date.now() suffix keeps repeated edits of the same track from colliding.
  const vttKey = `edited/${assetId}/${trackId}-edited-${Date.now()}.vtt`;
  await putObjectWithStorageAdapter({
    accessKeyId: s3AccessKeyId,
    secretAccessKey: s3SecretAccessKey,
    endpoint: s3Endpoint,
    region: s3Region,
    bucket: s3Bucket,
    key: vttKey,
    body: editedVtt,
    contentType: "text/vtt"
  }, storageAdapter);
  return createPresignedGetUrlWithStorageAdapter({
    accessKeyId: s3AccessKeyId,
    secretAccessKey: s3SecretAccessKey,
    endpoint: s3Endpoint,
    region: s3Region,
    bucket: s3Bucket,
    key: vttKey,
    expiresInSeconds: s3SignedUrlExpirySeconds ?? 86400
  }, storageAdapter);
}
2639
/**
 * Deletes a text track from a Mux asset (e.g. the original track after an
 * edited replacement has been uploaded).
 * @param {string} assetId
 * @param {string} trackId
 * @param {*} credentials - Passed to resolveMuxClient.
 * @returns {Promise<void>}
 */
async function deleteTrackOnMux(assetId, trackId, credentials) {
  "use step"; // workflow step boundary directive
  const muxClient = await resolveMuxClient(credentials);
  const mux = await muxClient.createClient();
  await mux.video.assets.deleteTrack(assetId, trackId);
}
2645
// Workflow: fetch a caption track's VTT from Mux, apply AI-driven profanity
// censoring and/or static find-and-replace edits, then (optionally) upload the
// edited VTT to S3 and attach it to the asset as a new text track.
//
// options:
//   provider / model         - language-model selection; provider is required
//                              whenever autoCensorProfanity is used
//   autoCensorProfanity      - { mode?, alwaysCensor?, neverCensor? }
//   replacements             - static replacement pairs applied to the VTT
//   deleteOriginalTrack      - defaults to true; only acted on when the edited
//                              track was uploaded successfully
//   uploadToMux              - defaults to true; when false the edited VTT is
//                              returned with no S3/Mux writes
//   s3Endpoint/s3Region/s3Bucket - storage overrides; fall back to env vars
//   trackNameSuffix          - suffix for the new track's display name
//   storageAdapter           - custom storage backend; when present, static
//                              S3 access keys are not required
//   credentials              - forwarded to every Mux / provider helper
async function editCaptions(assetId, trackId, options) {
  "use workflow"; // directive consumed by the workflow runtime
  const {
    provider,
    model,
    autoCensorProfanity: autoCensorOption,
    replacements: replacementsOption,
    deleteOriginalTrack,
    uploadToMux: uploadToMuxOption,
    s3Endpoint: providedS3Endpoint,
    s3Region: providedS3Region,
    s3Bucket: providedS3Bucket,
    trackNameSuffix,
    storageAdapter,
    credentials
  } = options;
  // At least one kind of edit must be requested, and AI censoring needs a provider.
  const hasAutoCensor = !!autoCensorOption;
  const hasReplacements = !!replacementsOption && replacementsOption.length > 0;
  if (!hasAutoCensor && !hasReplacements) {
    throw new Error("At least one of autoCensorProfanity or replacements must be provided.");
  }
  if (autoCensorOption && !provider) {
    throw new Error("provider is required when using autoCensorProfanity.");
  }
  // Both flags are opt-out: anything other than an explicit `false` enables them.
  const deleteOriginal = deleteOriginalTrack !== false;
  const uploadToMux = uploadToMuxOption !== false;
  // Storage configuration: explicit options win, then environment variables.
  const s3Endpoint = providedS3Endpoint ?? env_default.S3_ENDPOINT;
  const s3Region = providedS3Region ?? env_default.S3_REGION ?? "auto";
  const s3Bucket = providedS3Bucket ?? env_default.S3_BUCKET;
  const s3AccessKeyId = env_default.S3_ACCESS_KEY_ID;
  const s3SecretAccessKey = env_default.S3_SECRET_ACCESS_KEY;
  // Static keys are only mandatory when no storageAdapter was supplied.
  if (uploadToMux && (!s3Endpoint || !s3Bucket || !storageAdapter && (!s3AccessKeyId || !s3SecretAccessKey))) {
    throw new Error(
      "Storage configuration is required for uploading to Mux. Provide s3Endpoint and s3Bucket. If no storageAdapter is supplied, also provide s3AccessKeyId and s3SecretAccessKey in options or set S3_ENDPOINT, S3_BUCKET, S3_ACCESS_KEY_ID, and S3_SECRET_ACCESS_KEY environment variables."
    );
  }
  // Resolve asset, playback ID, and (for signed playback) signing credentials.
  const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, credentials);
  const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
  const signingContext = await resolveMuxSigningContext(credentials);
  if (policy === "signed" && !signingContext) {
    throw new Error(
      "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
    );
  }
  // The requested track must exist and be in a ready state.
  const readyTextTracks = getReadyTextTracks(assetData);
  const sourceTrack = readyTextTracks.find((t) => t.id === trackId);
  if (!sourceTrack) {
    const availableTrackIds = readyTextTracks.map((t) => t.id).filter(Boolean).join(", ");
    throw new Error(
      `Track '${trackId}' not found or not ready on asset '${assetId}'. Available track IDs: ${availableTrackIds || "none"}`
    );
  }
  // Download the current VTT payload (signed URL when the playback policy requires it).
  const vttUrl = await buildTranscriptUrl(playbackId, trackId, policy === "signed", credentials);
  let vttContent;
  try {
    vttContent = await fetchVttFromMux(vttUrl);
  } catch (error) {
    throw new Error(`Failed to fetch VTT content: ${error instanceof Error ? error.message : "Unknown error"}`);
  }
  // Edits are applied in sequence to `editedVtt`: AI censoring first, then
  // static replacements, so replacements also see the censored text.
  let editedVtt = vttContent;
  let totalReplacementCount = 0;
  let autoCensorResult;
  let usage;
  if (autoCensorOption) {
    const { mode = "blank", alwaysCensor = [], neverCensor = [] } = autoCensorOption;
    const plainText = extractTextFromVTT(vttContent);
    if (!plainText.trim()) {
      throw new Error("Track transcript is empty; nothing to censor.");
    }
    const modelConfig = resolveLanguageModelConfig({
      ...options,
      provider,
      model
    });
    let detectedProfanity;
    try {
      const result = await identifyProfanityWithAI({
        plainText,
        provider: modelConfig.provider,
        modelId: modelConfig.modelId,
        credentials
      });
      detectedProfanity = result.profanity;
      usage = result.usage;
    } catch (error) {
      throw new Error(`Failed to detect profanity with ${modelConfig.provider}: ${error instanceof Error ? error.message : "Unknown error"}`);
    }
    // User-supplied always/never lists override the model's detections.
    const finalProfanity = applyOverrideLists(detectedProfanity, alwaysCensor, neverCensor);
    const { censoredVtt, replacements: censorReplacements } = censorVttContent(editedVtt, finalProfanity, mode);
    editedVtt = censoredVtt;
    totalReplacementCount += censorReplacements.length;
    autoCensorResult = { replacements: censorReplacements };
  }
  let replacementsResult;
  if (replacementsOption && replacementsOption.length > 0) {
    const { editedVtt: afterReplacements, replacements: staticReplacements } = applyReplacements(editedVtt, replacementsOption);
    editedVtt = afterReplacements;
    totalReplacementCount += staticReplacements.length;
    replacementsResult = { replacements: staticReplacements };
  }
  // Token usage (only present when AI censoring ran) is annotated with the
  // asset duration for downstream cost reporting.
  const usageWithMetadata = usage ? {
    ...usage,
    metadata: {
      assetDurationSeconds
    }
  } : void 0;
  // Dry-run path: return the edited VTT without touching S3 or Mux.
  if (!uploadToMux) {
    return {
      assetId,
      trackId,
      originalVtt: vttContent,
      editedVtt,
      totalReplacementCount,
      autoCensorProfanity: autoCensorResult,
      replacements: replacementsResult,
      usage: usageWithMetadata
    };
  }
  // Upload the edited VTT and get a presigned URL Mux can ingest from.
  let presignedUrl;
  try {
    presignedUrl = await uploadEditedVttToS3({
      editedVtt,
      assetId,
      trackId,
      s3Endpoint,
      s3Region,
      s3Bucket,
      storageAdapter,
      s3SignedUrlExpirySeconds: options.s3SignedUrlExpirySeconds
    });
  } catch (error) {
    throw new Error(`Failed to upload VTT to S3: ${error instanceof Error ? error.message : "Unknown error"}`);
  }
  // Attach the edited VTT as a new text track. Failure here is non-fatal:
  // the caller still receives the edited VTT and presigned URL.
  let uploadedTrackId;
  try {
    const languageCode = sourceTrack.language_code || "en";
    const suffix = trackNameSuffix ?? "edited";
    const trackName = `${sourceTrack.name || "Subtitles"} (${suffix})`;
    uploadedTrackId = await createTextTrackOnMux(
      assetId,
      languageCode,
      trackName,
      presignedUrl,
      credentials
    );
  } catch (error) {
    console.warn(`Failed to add track to Mux asset: ${error instanceof Error ? error.message : "Unknown error"}`);
  }
  // Only delete the original once the replacement is known to exist.
  if (deleteOriginal && uploadedTrackId) {
    try {
      await deleteTrackOnMux(assetId, trackId, credentials);
    } catch (error) {
      console.warn(`Failed to delete original track: ${error instanceof Error ? error.message : "Unknown error"}`);
    }
  }
  return {
    assetId,
    trackId,
    originalVtt: vttContent,
    editedVtt,
    totalReplacementCount,
    autoCensorProfanity: autoCensorResult,
    replacements: replacementsResult,
    uploadedTrackId,
    presignedUrl,
    usage: usageWithMetadata
  };
}
2813
+
2814
+ // src/workflows/embeddings.ts
2815
+ import { embed } from "ai";
2816
+
2817
// src/primitives/text-chunking.ts
// Tuning constants for duration-based transcript chunking (see
// chunkVTTCuesByDuration / scoreCueBoundary below).
// A chunk must reach at least this fraction of the target duration before a
// boundary is considered (used when options.minChunkDurationSeconds is unset).
var DEFAULT_MIN_CHUNK_DURATION_RATIO = 2 / 3;
// Max number of cues to scan past the target duration looking for a better boundary.
var DEFAULT_BOUNDARY_LOOKAHEAD_CUES = 12;
// A silence of at least this many seconds between cues counts toward the boundary score.
var DEFAULT_BOUNDARY_PAUSE_SECONDS = 1.25;
// Boundary scores at or above this value end a chunk immediately once past the target.
var STRONG_BOUNDARY_SCORE = 4;
// Upper bound on how far before the target the "preferred" boundary window may start.
var PREFERRED_BOUNDARY_WINDOW_SECONDS = 5 * 60;
// Cue text ends a sentence: terminal punctuation, optionally followed by closing quotes/brackets.
var SENTENCE_BOUNDARY_REGEX = /[.!?]["')\]]*$/;
// Cue text ends a clause (comma/semicolon/colon, optionally followed by closers).
var CLAUSE_BOUNDARY_REGEX = /[,;:]["')\]]*$/;
// Next cue looks like the start of a new sentence (capital, digit, or opening punctuation).
var NEXT_SENTENCE_START_REGEX = /^[A-Z0-9"'([{]/;
2826
// Rough token estimate for LLM budgeting: assumes ~0.75 words per token.
// Fix: empty / whitespace-only text now reports 0 tokens. Previously it
// reported 2, because "".split(/\s+/) yields [""] (length 1), which then
// rounded up to ceil(1 / 0.75) = 2.
function estimateTokenCount(text) {
  const trimmed = text.trim();
  if (!trimmed) {
    return 0;
  }
  const words = trimmed.split(/\s+/).length;
  return Math.ceil(words / 0.75);
}
2830
// Splits plain text into fixed-size chunks by estimated token budget, with an
// optional overlap (in tokens) between consecutive chunks.
//
// Fixes over the original:
//   - wordsPerChunk is clamped to >= 1; maxTokens of 0 or 1 previously made it
//     0 and produced a single empty chunk.
//   - the advance step is clamped to >= 1 (and overlap to < chunk size); an
//     overlap >= chunk size previously made the step non-positive, and an
//     obscure break guard then silently dropped the rest of the text.
// For sane inputs (overlap < chunk size, maxTokens >= 2) the output is
// byte-identical to the original implementation.
function chunkByTokens(text, maxTokens, overlapTokens = 0) {
  if (!text.trim()) {
    return [];
  }
  const words = text.trim().split(/\s+/);
  // ~0.75 words per token, matching estimateTokenCount's heuristic.
  const wordsPerChunk = Math.max(1, Math.floor(maxTokens * 0.75));
  const overlapWords = Math.min(Math.floor(overlapTokens * 0.75), wordsPerChunk - 1);
  const step = Math.max(1, wordsPerChunk - overlapWords); // guaranteed forward progress
  const chunks = [];
  let chunkIndex = 0;
  for (let currentPosition = 0; currentPosition < words.length; currentPosition += step) {
    const chunkText2 = words.slice(currentPosition, currentPosition + wordsPerChunk).join(" ");
    chunks.push({
      id: `chunk-${chunkIndex}`,
      text: chunkText2,
      tokenCount: estimateTokenCount(chunkText2)
    });
    chunkIndex++;
  }
  return chunks;
}
2860
// Builds one chunk record from a contiguous run of cues: concatenates their
// text with spaces and carries over the covered [startTime, endTime] range.
// Assumes `cues` is non-empty (callers guard this).
function createChunkFromCues(cues, index) {
  const combinedText = cues.map((cue) => cue.text).join(" ");
  const firstCue = cues[0];
  const lastCue = cues[cues.length - 1];
  return {
    id: `chunk-${index}`,
    text: combinedText,
    tokenCount: estimateTokenCount(combinedText),
    startTime: firstCue.startTime,
    endTime: lastCue.endTime
  };
}
2870
// Packs VTT cues into chunks bounded by an estimated token budget. When a
// chunk fills up, the last `overlapCues` cues are carried into the next chunk
// so neighbouring chunks share context.
function chunkVTTCues(cues, maxTokens, overlapCues = 2) {
  if (cues.length === 0) {
    return [];
  }
  const chunks = [];
  let pendingCues = [];
  let pendingTokens = 0;
  let nextChunkIndex = 0;
  for (const cue of cues) {
    const cueTokens = estimateTokenCount(cue.text);
    // Flush the open chunk before this cue would push it past the budget.
    if (pendingCues.length > 0 && pendingTokens + cueTokens > maxTokens) {
      chunks.push(createChunkFromCues(pendingCues, nextChunkIndex));
      nextChunkIndex++;
      // Seed the next chunk with the trailing overlap cues and recount tokens.
      pendingCues = pendingCues.slice(Math.max(0, pendingCues.length - overlapCues));
      pendingTokens = pendingCues.reduce(
        (total, kept) => total + estimateTokenCount(kept.text),
        0
      );
    }
    pendingCues.push(cue);
    pendingTokens += cueTokens;
  }
  if (pendingCues.length > 0) {
    chunks.push(createChunkFromCues(pendingCues, nextChunkIndex));
  }
  return chunks;
}
2898
// Scores how good a chunk boundary AFTER cues[index] would be. Higher is
// better: +4 sentence-final punctuation (+2 for clause-final instead), +2 for
// a pause of at least `boundaryPauseSeconds` before the next cue, +1 when the
// next cue looks like a fresh sentence. The final cue scores Infinity since
// the transcript must end there anyway.
function scoreCueBoundary(cues, index, boundaryPauseSeconds) {
  const current = cues[index];
  const following = cues[index + 1];
  if (!following) {
    return Number.POSITIVE_INFINITY;
  }
  const currentText = current.text.trim();
  const endsSentence = SENTENCE_BOUNDARY_REGEX.test(currentText);
  const endsClause = !endsSentence && CLAUSE_BOUNDARY_REGEX.test(currentText);
  let score = endsSentence ? 4 : endsClause ? 2 : 0;
  if (following.startTime - current.endTime >= boundaryPauseSeconds) {
    score += 2;
  }
  if (NEXT_SENTENCE_START_REGEX.test(following.text.trim())) {
    score += 1;
  }
  return score;
}
2919
// Partitions cues into index-range chunks constrained by a cue-count budget
// and an optional estimated-token budget. Returns descriptors (indices, cue
// count, time range) rather than the cue text itself.
function chunkVTTCuesByBudget(cues, options) {
  if (cues.length === 0) {
    return [];
  }
  const maxCuesPerChunk = Math.max(1, options.maxCuesPerChunk);
  // A missing/zero token budget means "unlimited".
  const maxTextTokensPerChunk = options.maxTextTokensPerChunk
    ? Math.max(1, options.maxTextTokensPerChunk)
    : Number.POSITIVE_INFINITY;
  const chunks = [];
  // Records the window [startIndex, endIndex] as the next chunk descriptor.
  const flush = (startIndex, endIndex) => {
    chunks.push({
      id: `chunk-${chunks.length}`,
      cueStartIndex: startIndex,
      cueEndIndex: endIndex,
      cueCount: endIndex - startIndex + 1,
      startTime: cues[startIndex].startTime,
      endTime: cues[endIndex].endTime
    });
  };
  let windowStart = 0;
  let windowTokens = 0;
  for (let i = 0; i < cues.length; i++) {
    const cueTokens = estimateTokenCount(cues[i].text);
    const cuesInWindow = i - windowStart;
    const overCueBudget = cuesInWindow >= maxCuesPerChunk;
    const overTokenBudget = cuesInWindow > 0 && windowTokens + cueTokens > maxTextTokensPerChunk;
    if (overCueBudget || overTokenBudget) {
      flush(windowStart, i - 1);
      windowStart = i;
      windowTokens = 0;
    }
    windowTokens += cueTokens;
  }
  // The last window always holds at least one cue.
  flush(windowStart, cues.length - 1);
  return chunks;
}
2963
// Partitions cues into chunks of roughly `targetChunkDurationSeconds` each,
// preferring to break at natural boundaries (sentence ends, pauses) scored by
// scoreCueBoundary. Each chunk is at least min duration, at most max duration,
// and the scan looks at most `boundaryLookaheadCues` cues past the target
// before forcing a break. Returns index-range descriptors like
// chunkVTTCuesByBudget.
function chunkVTTCuesByDuration(cues, options) {
  if (cues.length === 0) {
    return [];
  }
  // Normalize/derive the duration constraints: 1s <= min <= target <= max.
  const targetChunkDurationSeconds = Math.max(1, options.targetChunkDurationSeconds);
  const maxChunkDurationSeconds = Math.max(targetChunkDurationSeconds, options.maxChunkDurationSeconds);
  const minChunkDurationSeconds = Math.min(
    targetChunkDurationSeconds,
    Math.max(
      1,
      options.minChunkDurationSeconds ?? Math.floor(targetChunkDurationSeconds * DEFAULT_MIN_CHUNK_DURATION_RATIO)
    )
  );
  const boundaryLookaheadCues = Math.max(1, options.boundaryLookaheadCues ?? DEFAULT_BOUNDARY_LOOKAHEAD_CUES);
  const boundaryPauseSeconds = options.boundaryPauseSeconds ?? DEFAULT_BOUNDARY_PAUSE_SECONDS;
  // Boundaries found after this elapsed duration count as "preferred" (close
  // enough to the target to be taken over earlier, possibly better-scoring ones).
  const preferredBoundaryStartSeconds = Math.max(
    minChunkDurationSeconds,
    targetChunkDurationSeconds - Math.min(PREFERRED_BOUNDARY_WINDOW_SECONDS, targetChunkDurationSeconds / 6)
  );
  const chunks = [];
  let chunkIndex = 0;
  let cueStartIndex = 0;
  while (cueStartIndex < cues.length) {
    const chunkStartTime = cues[cueStartIndex].startTime;
    let cueEndIndex = cueStartIndex;
    // Best boundary seen anywhere past the min duration, and best boundary
    // seen inside the preferred window. -1 means "none yet". Ties go to the
    // later cue (>= comparisons), favouring longer chunks.
    let bestBoundaryIndex = -1;
    let bestBoundaryScore = -1;
    let bestPreferredBoundaryIndex = -1;
    let bestPreferredBoundaryScore = -1;
    while (cueEndIndex < cues.length) {
      const cue = cues[cueEndIndex];
      const currentDuration = cue.endTime - chunkStartTime;
      // Only consider boundaries once the chunk has reached its minimum length.
      if (currentDuration >= minChunkDurationSeconds) {
        const boundaryScore = scoreCueBoundary(cues, cueEndIndex, boundaryPauseSeconds);
        if (boundaryScore >= bestBoundaryScore) {
          bestBoundaryIndex = cueEndIndex;
          bestBoundaryScore = boundaryScore;
        }
        if (currentDuration >= preferredBoundaryStartSeconds && boundaryScore >= bestPreferredBoundaryScore) {
          bestPreferredBoundaryIndex = cueEndIndex;
          bestPreferredBoundaryScore = boundaryScore;
        }
      }
      const nextCue = cues[cueEndIndex + 1];
      if (!nextCue) {
        break; // last cue: the chunk must end here
      }
      const nextDuration = nextCue.endTime - chunkStartTime;
      const lookaheadExceeded = cueEndIndex - cueStartIndex >= boundaryLookaheadCues;
      // Fall back to the overall best boundary when none landed in the
      // preferred window.
      const preferredBoundaryIndex = bestPreferredBoundaryIndex >= cueStartIndex ? bestPreferredBoundaryIndex : bestBoundaryIndex;
      const preferredBoundaryScore = bestPreferredBoundaryIndex >= cueStartIndex ? bestPreferredBoundaryScore : bestBoundaryScore;
      if (currentDuration >= targetChunkDurationSeconds) {
        // Past the target: take a strong boundary immediately...
        if (preferredBoundaryIndex >= cueStartIndex && preferredBoundaryScore >= STRONG_BOUNDARY_SCORE) {
          cueEndIndex = preferredBoundaryIndex;
          break;
        }
        // ...or force a break when continuing would bust the max duration or
        // the lookahead budget.
        if (nextDuration > maxChunkDurationSeconds || lookaheadExceeded) {
          cueEndIndex = preferredBoundaryIndex >= cueStartIndex ? preferredBoundaryIndex : cueEndIndex;
          break;
        }
      }
      // Even before the target, never let the next cue push past the max.
      if (nextDuration > maxChunkDurationSeconds) {
        cueEndIndex = preferredBoundaryIndex >= cueStartIndex ? preferredBoundaryIndex : cueEndIndex;
        break;
      }
      cueEndIndex++;
    }
    chunks.push({
      id: `chunk-${chunkIndex}`,
      cueStartIndex,
      cueEndIndex,
      cueCount: cueEndIndex - cueStartIndex + 1,
      startTime: cues[cueStartIndex].startTime,
      endTime: cues[cueEndIndex].endTime
    });
    cueStartIndex = cueEndIndex + 1;
    chunkIndex++;
  }
  return chunks;
}
3043
// Dispatches plain-text chunking by strategy. Only the "token" strategy is
// implemented; anything else throws.
function chunkText(text, strategy) {
  if (strategy.type === "token") {
    return chunkByTokens(text, strategy.maxTokens, strategy.overlap ?? 0);
  }
  // Exhaustiveness check carried over from the TypeScript source: reaching
  // here means an unknown strategy object was passed in.
  const exhaustiveCheck = strategy;
  throw new Error(`Unsupported chunking strategy: ${exhaustiveCheck}`);
}
3054
+
3055
+ // src/workflows/embeddings.ts
3056
// Element-wise mean of embedding vectors. The dimensionality is taken from
// the first vector; all vectors are assumed to share it. An empty input
// yields an empty vector.
function averageEmbeddings(embeddings) {
  if (embeddings.length === 0) {
    return [];
  }
  const dimensions = embeddings[0].length;
  const sums = new Array(dimensions).fill(0);
  for (const vector of embeddings) {
    for (let dim = 0; dim < dimensions; dim++) {
      sums[dim] += vector[dim];
    }
  }
  return sums.map((total) => total / embeddings.length);
}
3072
+ async function generateSingleChunkEmbedding({
3073
+ chunk,
3074
+ provider,
3075
+ modelId,
3076
+ credentials
3077
+ }) {
3078
+ "use step";
3079
+ const model = await createEmbeddingModelFromConfig(provider, modelId, credentials);
3080
+ const response = await withRetry(
3081
+ () => embed({
3082
+ model,
3083
+ value: chunk.text
3084
+ })
3085
+ );
3086
+ return {
3087
+ chunkId: chunk.id,
3088
+ embedding: response.embedding,
2137
3089
  metadata: {
2138
3090
  startTime: chunk.startTime,
2139
3091
  endTime: chunk.endTime,
@@ -2343,10 +3295,8 @@ async function getThumbnailUrls(playbackId, duration, options = {}) {
2343
3295
  }
2344
3296
  const baseUrl = getMuxThumbnailBaseUrl(playbackId);
2345
3297
  const urlPromises = timestamps.map(async (time) => {
2346
- if (shouldSign) {
2347
- return signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials);
2348
- }
2349
- return `${baseUrl}?time=${time}&width=${width}`;
3298
+ const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
3299
+ return { url, time };
2350
3300
  });
2351
3301
  return Promise.all(urlPromises);
2352
3302
  }
@@ -2420,6 +3370,7 @@ async function moderateImageWithOpenAI(entry) {
2420
3370
  const categoryScores = json.results?.[0]?.category_scores || {};
2421
3371
  return {
2422
3372
  url: entry.url,
3373
+ time: entry.time,
2423
3374
  sexual: categoryScores.sexual || 0,
2424
3375
  violence: categoryScores.violence || 0,
2425
3376
  error: false
@@ -2428,6 +3379,7 @@ async function moderateImageWithOpenAI(entry) {
2428
3379
  console.error("OpenAI moderation failed:", error);
2429
3380
  return {
2430
3381
  url: entry.url,
3382
+ time: entry.time,
2431
3383
  sexual: 0,
2432
3384
  violence: 0,
2433
3385
  error: true,
@@ -2435,11 +3387,13 @@ async function moderateImageWithOpenAI(entry) {
2435
3387
  };
2436
3388
  }
2437
3389
  }
2438
- async function requestOpenAIModeration(imageUrls, model, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
3390
+ async function requestOpenAIModeration(images, model, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
2439
3391
  "use step";
3392
+ const imageUrls = images.map((img) => img.url);
3393
+ const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
2440
3394
  const targetUrls = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map(
2441
- (img) => ({ url: img.url, image: img.base64Data, model, credentials })
2442
- ) : imageUrls.map((url) => ({ url, image: url, model, credentials }));
3395
+ (img) => ({ url: img.url, time: timeByUrl.get(img.url), image: img.base64Data, model, credentials })
3396
+ ) : images.map((img) => ({ url: img.url, time: img.time, image: img.url, model, credentials }));
2443
3397
  return processConcurrently(targetUrls, moderateImageWithOpenAI, maxConcurrent);
2444
3398
  }
2445
3399
  async function requestOpenAITextModeration(text, model, url, credentials) {
@@ -2584,6 +3538,7 @@ async function moderateImageWithHive(entry) {
2584
3538
  const violence = getHiveCategoryScores(classes, HIVE_VIOLENCE_CATEGORIES);
2585
3539
  return {
2586
3540
  url: entry.url,
3541
+ time: entry.time,
2587
3542
  sexual,
2588
3543
  violence,
2589
3544
  error: false
@@ -2591,6 +3546,7 @@ async function moderateImageWithHive(entry) {
2591
3546
  } catch (error) {
2592
3547
  return {
2593
3548
  url: entry.url,
3549
+ time: entry.time,
2594
3550
  sexual: 0,
2595
3551
  violence: 0,
2596
3552
  error: true,
@@ -2598,19 +3554,23 @@ async function moderateImageWithHive(entry) {
2598
3554
  };
2599
3555
  }
2600
3556
  }
2601
- async function requestHiveModeration(imageUrls, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
3557
+ async function requestHiveModeration(images, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
2602
3558
  "use step";
3559
+ const imageUrls = images.map((img) => img.url);
3560
+ const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
2603
3561
  const targets = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map((img) => ({
2604
3562
  url: img.url,
3563
+ time: timeByUrl.get(img.url),
2605
3564
  source: {
2606
3565
  kind: "file",
2607
3566
  buffer: img.buffer,
2608
3567
  contentType: img.contentType
2609
3568
  },
2610
3569
  credentials
2611
- })) : imageUrls.map((url) => ({
2612
- url,
2613
- source: { kind: "url", value: url },
3570
+ })) : images.map((img) => ({
3571
+ url: img.url,
3572
+ time: img.time,
3573
+ source: { kind: "url", value: img.url },
2614
3574
  credentials
2615
3575
  }));
2616
3576
  return await processConcurrently(targets, moderateImageWithHive, maxConcurrent);
@@ -2621,10 +3581,8 @@ async function getThumbnailUrlsFromTimestamps(playbackId, timestampsMs, options)
2621
3581
  const baseUrl = getMuxThumbnailBaseUrl(playbackId);
2622
3582
  const urlPromises = timestampsMs.map(async (tsMs) => {
2623
3583
  const time = Number((tsMs / 1e3).toFixed(2));
2624
- if (shouldSign) {
2625
- return signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials);
2626
- }
2627
- return `${baseUrl}?time=${time}&width=${width}`;
3584
+ const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
3585
+ return { url, time };
2628
3586
  });
2629
3587
  return Promise.all(urlPromises);
2630
3588
  }
@@ -2775,16 +3733,18 @@ async function getModerationScores(assetId, options = {}) {
2775
3733
  }
2776
3734
 
2777
3735
  // src/workflows/summarization.ts
2778
- import { generateText as generateText4, Output as Output4 } from "ai";
2779
- import dedent4 from "dedent";
2780
- import { z as z5 } from "zod";
2781
- var SUMMARY_KEYWORD_LIMIT = 10;
2782
- var summarySchema = z5.object({
2783
- keywords: z5.array(z5.string()),
2784
- title: z5.string(),
2785
- description: z5.string()
3736
+ import { generateText as generateText5, Output as Output5 } from "ai";
3737
+ import dedent5 from "dedent";
3738
+ import { z as z6 } from "zod";
3739
+ var DEFAULT_SUMMARY_KEYWORD_LIMIT = 10;
3740
+ var DEFAULT_TITLE_LENGTH = 10;
3741
+ var DEFAULT_DESCRIPTION_LENGTH = 50;
3742
+ var summarySchema = z6.object({
3743
+ keywords: z6.array(z6.string()),
3744
+ title: z6.string(),
3745
+ description: z6.string()
2786
3746
  }).strict();
2787
- var SUMMARY_OUTPUT = Output4.object({
3747
+ var SUMMARY_OUTPUT = Output5.object({
2788
3748
  name: "summary_metadata",
2789
3749
  description: "Structured summary with title, description, and keywords.",
2790
3750
  schema: summarySchema
@@ -2795,10 +3755,49 @@ var TONE_INSTRUCTIONS = {
2795
3755
  playful: "Channel your inner diva! Answer with maximum sass, wit, and playful attitude. Don't hold back - be cheeky, clever, and delightfully snarky. Make it pop!",
2796
3756
  professional: "Provide a professional, executive-level analysis suitable for business reporting."
2797
3757
  };
3758
+ var DESCRIPTION_LENGTH_THRESHOLD_SMALL = 25;
3759
+ var DESCRIPTION_LENGTH_THRESHOLD_LARGE = 100;
3760
+ function buildDescriptionGuidance(wordCount, contentType) {
3761
+ if (wordCount < DESCRIPTION_LENGTH_THRESHOLD_SMALL) {
3762
+ if (contentType === "video") {
3763
+ return dedent5`A brief summary of the video in approximately ${wordCount} words.
3764
+ Focus on the single most important subject or action.
3765
+ Write in present tense.`;
3766
+ }
3767
+ return dedent5`A brief summary of the audio content in approximately ${wordCount} words.
3768
+ Focus on the single most important topic or theme.
3769
+ Write in present tense.`;
3770
+ }
3771
+ if (wordCount > DESCRIPTION_LENGTH_THRESHOLD_LARGE) {
3772
+ if (contentType === "video") {
3773
+ return dedent5`A detailed summary that describes what happens across the video.
3774
+ Aim for approximately ${wordCount} words, and you may use multiple sentences.
3775
+ Be thorough: cover subjects, actions, setting, progression, and any notable details visible across frames.
3776
+ Write in present tense. Be specific about observable details rather than making assumptions.
3777
+ If the transcript provides dialogue or narration, incorporate key points but prioritize visual content.`;
3778
+ }
3779
+ return dedent5`A detailed summary that describes the audio content.
3780
+ Aim for approximately ${wordCount} words, and you may use multiple sentences.
3781
+ Be thorough: cover topics, speakers, themes, progression, and any notable insights.
3782
+ Write in present tense. Be specific about what is discussed or presented rather than making assumptions.
3783
+ Focus on the spoken content and any key insights, dialogue, or narrative elements.`;
3784
+ }
3785
+ if (contentType === "video") {
3786
+ return dedent5`A summary that describes what happens across the video.
3787
+ Aim for approximately ${wordCount} words, and you may use multiple sentences.
3788
+ Cover the main subjects, actions, setting, and any notable progression visible across frames.
3789
+ Write in present tense. Be specific about observable details rather than making assumptions.
3790
+ If the transcript provides dialogue or narration, incorporate key points but prioritize visual content.`;
3791
+ }
3792
+ return dedent5`A summary that describes the audio content.
3793
+ Aim for approximately ${wordCount} words, and you may use multiple sentences.
3794
+ Cover the main topics, speakers, themes, and any notable progression in the discussion or narration.
3795
+ Write in present tense. Be specific about what is discussed or presented rather than making assumptions.
3796
+ Focus on the spoken content and any key insights, dialogue, or narrative elements.`;
3797
+ }
2798
3798
  function createSummarizationBuilder({ titleLength, descriptionLength, tagCount } = {}) {
2799
- const titleBrevity = titleLength != null ? `Aim for approximately ${titleLength} characters.` : "Aim for brevity - typically under 10 words.";
2800
- const descConstraint = descriptionLength != null ? `approximately ${descriptionLength} characters` : "2-4 sentences";
2801
- const keywordLimit = tagCount ?? SUMMARY_KEYWORD_LIMIT;
3799
+ const titleBrevity = `Aim for approximately ${titleLength ?? DEFAULT_TITLE_LENGTH} words.`;
3800
+ const keywordLimit = tagCount ?? DEFAULT_SUMMARY_KEYWORD_LIMIT;
2802
3801
  return createPromptBuilder({
2803
3802
  template: {
2804
3803
  task: {
@@ -2807,7 +3806,7 @@ function createSummarizationBuilder({ titleLength, descriptionLength, tagCount }
2807
3806
  },
2808
3807
  title: {
2809
3808
  tag: "title_requirements",
2810
- content: dedent4`
3809
+ content: dedent5`
2811
3810
  A short, compelling headline that immediately communicates the subject or action.
2812
3811
  ${titleBrevity} Think of how a news headline or video card title would read.
2813
3812
  Start with the primary subject, action, or topic - never begin with "A video of" or similar phrasing.
@@ -2815,15 +3814,11 @@ function createSummarizationBuilder({ titleLength, descriptionLength, tagCount }
2815
3814
  },
2816
3815
  description: {
2817
3816
  tag: "description_requirements",
2818
- content: dedent4`
2819
- A concise summary (${descConstraint}) that describes what happens across the video.
2820
- Cover the main subjects, actions, setting, and any notable progression visible across frames.
2821
- Write in present tense. Be specific about observable details rather than making assumptions.
2822
- If the transcript provides dialogue or narration, incorporate key points but prioritize visual content.`
3817
+ content: buildDescriptionGuidance(descriptionLength ?? DEFAULT_DESCRIPTION_LENGTH, "video")
2823
3818
  },
2824
3819
  keywords: {
2825
3820
  tag: "keywords_requirements",
2826
- content: dedent4`
3821
+ content: dedent5`
2827
3822
  Specific, searchable terms (up to ${keywordLimit}) that capture:
2828
3823
  - Primary subjects (people, animals, objects)
2829
3824
  - Actions and activities being performed
@@ -2835,7 +3830,7 @@ function createSummarizationBuilder({ titleLength, descriptionLength, tagCount }
2835
3830
  },
2836
3831
  qualityGuidelines: {
2837
3832
  tag: "quality_guidelines",
2838
- content: dedent4`
3833
+ content: dedent5`
2839
3834
  - Examine all frames to understand the full context and progression
2840
3835
  - Be precise: "golden retriever" is better than "dog" when identifiable
2841
3836
  - Capture the narrative: what begins, develops, and concludes
@@ -2846,9 +3841,8 @@ function createSummarizationBuilder({ titleLength, descriptionLength, tagCount }
2846
3841
  });
2847
3842
  }
2848
3843
  function createAudioOnlyBuilder({ titleLength, descriptionLength, tagCount } = {}) {
2849
- const titleBrevity = titleLength != null ? `Aim for approximately ${titleLength} characters.` : "Aim for brevity - typically under 10 words.";
2850
- const descConstraint = descriptionLength != null ? `approximately ${descriptionLength} characters` : "2-4 sentences";
2851
- const keywordLimit = tagCount ?? SUMMARY_KEYWORD_LIMIT;
3844
+ const titleBrevity = `Aim for approximately ${titleLength ?? DEFAULT_TITLE_LENGTH} words.`;
3845
+ const keywordLimit = tagCount ?? DEFAULT_SUMMARY_KEYWORD_LIMIT;
2852
3846
  return createPromptBuilder({
2853
3847
  template: {
2854
3848
  task: {
@@ -2857,7 +3851,7 @@ function createAudioOnlyBuilder({ titleLength, descriptionLength, tagCount } = {
2857
3851
  },
2858
3852
  title: {
2859
3853
  tag: "title_requirements",
2860
- content: dedent4`
3854
+ content: dedent5`
2861
3855
  A short, compelling headline that immediately communicates the subject or topic.
2862
3856
  ${titleBrevity} Think of how a podcast title or audio description would read.
2863
3857
  Start with the primary subject, action, or topic - never begin with "An audio of" or similar phrasing.
@@ -2865,15 +3859,11 @@ function createAudioOnlyBuilder({ titleLength, descriptionLength, tagCount } = {
2865
3859
  },
2866
3860
  description: {
2867
3861
  tag: "description_requirements",
2868
- content: dedent4`
2869
- A concise summary (${descConstraint}) that describes the audio content.
2870
- Cover the main topics, speakers, themes, and any notable progression in the discussion or narration.
2871
- Write in present tense. Be specific about what is discussed or presented rather than making assumptions.
2872
- Focus on the spoken content and any key insights, dialogue, or narrative elements.`
3862
+ content: buildDescriptionGuidance(descriptionLength ?? DEFAULT_DESCRIPTION_LENGTH, "audio")
2873
3863
  },
2874
3864
  keywords: {
2875
3865
  tag: "keywords_requirements",
2876
- content: dedent4`
3866
+ content: dedent5`
2877
3867
  Specific, searchable terms (up to ${keywordLimit}) that capture:
2878
3868
  - Primary topics and themes
2879
3869
  - Speakers or presenters (if named)
@@ -2885,7 +3875,7 @@ function createAudioOnlyBuilder({ titleLength, descriptionLength, tagCount } = {
2885
3875
  },
2886
3876
  qualityGuidelines: {
2887
3877
  tag: "quality_guidelines",
2888
- content: dedent4`
3878
+ content: dedent5`
2889
3879
  - Analyze the full transcript to understand context and themes
2890
3880
  - Be precise: use specific terminology when mentioned
2891
3881
  - Capture the narrative: what is introduced, discussed, and concluded
@@ -2895,7 +3885,7 @@ function createAudioOnlyBuilder({ titleLength, descriptionLength, tagCount } = {
2895
3885
  sectionOrder: ["task", "title", "description", "keywords", "qualityGuidelines"]
2896
3886
  });
2897
3887
  }
2898
- var SYSTEM_PROMPT3 = dedent4`
3888
+ var SYSTEM_PROMPT4 = dedent5`
2899
3889
  <role>
2900
3890
  You are a video content analyst specializing in storyboard interpretation and multimodal analysis.
2901
3891
  </role>
@@ -2952,7 +3942,7 @@ var SYSTEM_PROMPT3 = dedent4`
2952
3942
 
2953
3943
  Write as if describing reality, not describing a recording of reality.
2954
3944
  </language_guidelines>`;
2955
- var AUDIO_ONLY_SYSTEM_PROMPT = dedent4`
3945
+ var AUDIO_ONLY_SYSTEM_PROMPT = dedent5`
2956
3946
  <role>
2957
3947
  You are an audio content analyst specializing in transcript analysis and metadata generation.
2958
3948
  </role>
@@ -3018,56 +4008,26 @@ function buildUserPrompt4({
3018
4008
  languageName
3019
4009
  }) {
3020
4010
  const contextSections = [createToneSection(TONE_INSTRUCTIONS[tone])];
3021
- if (languageName) {
3022
- contextSections.push(createLanguageSection(languageName));
3023
- }
3024
- if (transcriptText) {
3025
- const format = isCleanTranscript ? "plain text" : "WebVTT";
3026
- contextSections.push(createTranscriptSection(transcriptText, format));
3027
- }
3028
- const constraints = { titleLength, descriptionLength, tagCount };
3029
- const promptBuilder = isAudioOnly ? createAudioOnlyBuilder(constraints) : createSummarizationBuilder(constraints);
3030
- return promptBuilder.buildWithContext(promptOverrides, contextSections);
3031
- }
3032
- async function analyzeStoryboard2(imageDataUrl, provider, modelId, userPrompt, systemPrompt, credentials) {
3033
- "use step";
3034
- const model = await createLanguageModelFromConfig(provider, modelId, credentials);
3035
- const response = await generateText4({
3036
- model,
3037
- output: SUMMARY_OUTPUT,
3038
- messages: [
3039
- {
3040
- role: "system",
3041
- content: systemPrompt
3042
- },
3043
- {
3044
- role: "user",
3045
- content: [
3046
- { type: "text", text: userPrompt },
3047
- { type: "image", image: imageDataUrl }
3048
- ]
3049
- }
3050
- ]
3051
- });
3052
- if (!response.output) {
3053
- throw new Error("Summarization output missing");
4011
+ if (languageName) {
4012
+ contextSections.push(createLanguageSection(languageName));
4013
+ } else {
4014
+ contextSections.push({
4015
+ tag: "language",
4016
+ content: "Respond in English. Never switch languages to satisfy length constraints."
4017
+ });
3054
4018
  }
3055
- const parsed = summarySchema.parse(response.output);
3056
- return {
3057
- result: parsed,
3058
- usage: {
3059
- inputTokens: response.usage.inputTokens,
3060
- outputTokens: response.usage.outputTokens,
3061
- totalTokens: response.usage.totalTokens,
3062
- reasoningTokens: response.usage.reasoningTokens,
3063
- cachedInputTokens: response.usage.cachedInputTokens
3064
- }
3065
- };
4019
+ if (transcriptText) {
4020
+ const format = isCleanTranscript ? "plain text" : "WebVTT";
4021
+ contextSections.push(createTranscriptSection(transcriptText, format));
4022
+ }
4023
+ const constraints = { titleLength, descriptionLength, tagCount };
4024
+ const promptBuilder = isAudioOnly ? createAudioOnlyBuilder(constraints) : createSummarizationBuilder(constraints);
4025
+ return promptBuilder.buildWithContext(promptOverrides, contextSections);
3066
4026
  }
3067
- async function analyzeAudioOnly(provider, modelId, userPrompt, systemPrompt, credentials) {
4027
+ async function analyzeStoryboard2(imageDataUrl, provider, modelId, userPrompt, systemPrompt, credentials) {
3068
4028
  "use step";
3069
4029
  const model = await createLanguageModelFromConfig(provider, modelId, credentials);
3070
- const response = await generateText4({
4030
+ const response = await generateText5({
3071
4031
  model,
3072
4032
  output: SUMMARY_OUTPUT,
3073
4033
  messages: [
@@ -3077,430 +4037,211 @@ async function analyzeAudioOnly(provider, modelId, userPrompt, systemPrompt, cre
3077
4037
  },
3078
4038
  {
3079
4039
  role: "user",
3080
- content: userPrompt
3081
- }
3082
- ]
3083
- });
3084
- if (!response.output) {
3085
- throw new Error("Summarization output missing");
3086
- }
3087
- const parsed = summarySchema.parse(response.output);
3088
- return {
3089
- result: parsed,
3090
- usage: {
3091
- inputTokens: response.usage.inputTokens,
3092
- outputTokens: response.usage.outputTokens,
3093
- totalTokens: response.usage.totalTokens,
3094
- reasoningTokens: response.usage.reasoningTokens,
3095
- cachedInputTokens: response.usage.cachedInputTokens
3096
- }
3097
- };
3098
- }
3099
- function normalizeKeywords(keywords, limit = SUMMARY_KEYWORD_LIMIT) {
3100
- if (!Array.isArray(keywords) || keywords.length === 0) {
3101
- return [];
3102
- }
3103
- const uniqueLowercase = /* @__PURE__ */ new Set();
3104
- const normalized = [];
3105
- for (const keyword of keywords) {
3106
- const trimmed = keyword?.trim();
3107
- if (!trimmed) {
3108
- continue;
3109
- }
3110
- const lower = trimmed.toLowerCase();
3111
- if (uniqueLowercase.has(lower)) {
3112
- continue;
3113
- }
3114
- uniqueLowercase.add(lower);
3115
- normalized.push(trimmed);
3116
- if (normalized.length === limit) {
3117
- break;
3118
- }
3119
- }
3120
- return normalized;
3121
- }
3122
- async function getSummaryAndTags(assetId, options) {
3123
- "use workflow";
3124
- const {
3125
- provider = "openai",
3126
- model,
3127
- tone = "neutral",
3128
- includeTranscript = true,
3129
- cleanTranscript = true,
3130
- imageSubmissionMode = "url",
3131
- imageDownloadOptions,
3132
- promptOverrides,
3133
- credentials,
3134
- titleLength,
3135
- descriptionLength,
3136
- tagCount,
3137
- outputLanguageCode
3138
- } = options ?? {};
3139
- if (!VALID_TONES.includes(tone)) {
3140
- throw new Error(
3141
- `Invalid tone "${tone}". Valid tones are: ${VALID_TONES.join(", ")}`
3142
- );
3143
- }
3144
- const modelConfig = resolveLanguageModelConfig({
3145
- ...options,
3146
- model,
3147
- provider
3148
- });
3149
- const workflowCredentials = credentials;
3150
- const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, workflowCredentials);
3151
- const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
3152
- const isAudioOnly = isAudioOnlyAsset(assetData);
3153
- if (isAudioOnly && !includeTranscript) {
3154
- throw new Error(
3155
- "Audio-only assets require a transcript. Set includeTranscript: true and ensure the asset has a ready text track (captions/subtitles)."
3156
- );
3157
- }
3158
- const signingContext = await resolveMuxSigningContext(workflowCredentials);
3159
- if (policy === "signed" && !signingContext) {
3160
- throw new Error(
3161
- "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
3162
- );
3163
- }
3164
- const transcriptResult = includeTranscript ? await fetchTranscriptForAsset(assetData, playbackId, {
3165
- cleanTranscript,
3166
- shouldSign: policy === "signed",
3167
- credentials: workflowCredentials,
3168
- required: isAudioOnly
3169
- }) : void 0;
3170
- const transcriptText = transcriptResult?.transcriptText ?? "";
3171
- const resolvedLanguageCode = outputLanguageCode && outputLanguageCode !== "auto" ? outputLanguageCode : transcriptResult?.track?.language_code ?? getReadyTextTracks(assetData)[0]?.language_code;
3172
- const languageName = resolvedLanguageCode ? getLanguageName(resolvedLanguageCode) : void 0;
3173
- const userPrompt = buildUserPrompt4({
3174
- tone,
3175
- transcriptText,
3176
- isCleanTranscript: cleanTranscript,
3177
- promptOverrides,
3178
- isAudioOnly,
3179
- titleLength,
3180
- descriptionLength,
3181
- tagCount,
3182
- languageName
3183
- });
3184
- let analysisResponse;
3185
- let imageUrl;
3186
- const systemPrompt = isAudioOnly ? AUDIO_ONLY_SYSTEM_PROMPT : SYSTEM_PROMPT3;
3187
- try {
3188
- if (isAudioOnly) {
3189
- analysisResponse = await analyzeAudioOnly(
3190
- modelConfig.provider,
3191
- modelConfig.modelId,
3192
- userPrompt,
3193
- systemPrompt,
3194
- workflowCredentials
3195
- );
3196
- } else {
3197
- const storyboardUrl = await getStoryboardUrl(playbackId, 640, policy === "signed", workflowCredentials);
3198
- imageUrl = storyboardUrl;
3199
- if (imageSubmissionMode === "base64") {
3200
- const downloadResult = await downloadImageAsBase64(storyboardUrl, imageDownloadOptions);
3201
- analysisResponse = await analyzeStoryboard2(
3202
- downloadResult.base64Data,
3203
- modelConfig.provider,
3204
- modelConfig.modelId,
3205
- userPrompt,
3206
- systemPrompt,
3207
- workflowCredentials
3208
- );
3209
- } else {
3210
- analysisResponse = await withRetry(() => analyzeStoryboard2(
3211
- storyboardUrl,
3212
- modelConfig.provider,
3213
- modelConfig.modelId,
3214
- userPrompt,
3215
- systemPrompt,
3216
- workflowCredentials
3217
- ));
3218
- }
3219
- }
3220
- } catch (error) {
3221
- const contentType = isAudioOnly ? "audio" : "video";
3222
- throw new Error(
3223
- `Failed to analyze ${contentType} content with ${provider}: ${error instanceof Error ? error.message : "Unknown error"}`
3224
- );
3225
- }
3226
- if (!analysisResponse.result) {
3227
- throw new Error(`Failed to analyze video content for asset ${assetId}`);
3228
- }
3229
- if (!analysisResponse.result.title) {
3230
- throw new Error(`Failed to generate title for asset ${assetId}`);
3231
- }
3232
- if (!analysisResponse.result.description) {
3233
- throw new Error(`Failed to generate description for asset ${assetId}`);
3234
- }
3235
- return {
3236
- assetId,
3237
- title: analysisResponse.result.title,
3238
- description: analysisResponse.result.description,
3239
- tags: normalizeKeywords(analysisResponse.result.keywords, tagCount ?? SUMMARY_KEYWORD_LIMIT),
3240
- storyboardUrl: imageUrl,
3241
- // undefined for audio-only assets
3242
- usage: {
3243
- ...analysisResponse.usage,
3244
- metadata: {
3245
- assetDurationSeconds
3246
- }
3247
- },
3248
- transcriptText: transcriptText || void 0
3249
- };
3250
- }
3251
-
3252
- // src/lib/s3-sigv4.ts
3253
- var AWS4_ALGORITHM = "AWS4-HMAC-SHA256";
3254
- var AWS4_REQUEST_TERMINATOR = "aws4_request";
3255
- var AWS4_SERVICE = "s3";
3256
- var S3_ALLOWED_ENDPOINT_PATTERNS = parseEndpointAllowlist(
3257
- env_default.S3_ALLOWED_ENDPOINT_HOSTS
3258
- );
3259
- function getCrypto() {
3260
- const webCrypto = globalThis.crypto;
3261
- if (!webCrypto?.subtle) {
3262
- throw new Error("Web Crypto API is required for S3 signing.");
3263
- }
3264
- return webCrypto;
3265
- }
3266
- var textEncoder = new TextEncoder();
3267
- function toBytes(value) {
3268
- return typeof value === "string" ? textEncoder.encode(value) : value;
3269
- }
3270
- function bytesToHex(bytes) {
3271
- return Array.from(bytes).map((byte) => byte.toString(16).padStart(2, "0")).join("");
3272
- }
3273
- async function sha256Hex(value) {
3274
- const digest = await getCrypto().subtle.digest("SHA-256", toBytes(value));
3275
- return bytesToHex(new Uint8Array(digest));
3276
- }
3277
- async function hmacSha256Raw(key, value) {
3278
- const cryptoKey = await getCrypto().subtle.importKey(
3279
- "raw",
3280
- key,
3281
- { name: "HMAC", hash: "SHA-256" },
3282
- false,
3283
- ["sign"]
3284
- );
3285
- const signature = await getCrypto().subtle.sign("HMAC", cryptoKey, textEncoder.encode(value));
3286
- return new Uint8Array(signature);
3287
- }
3288
- async function deriveSigningKey(secretAccessKey, shortDate, region) {
3289
- const kDate = await hmacSha256Raw(textEncoder.encode(`AWS4${secretAccessKey}`), shortDate);
3290
- const kRegion = await hmacSha256Raw(kDate, region);
3291
- const kService = await hmacSha256Raw(kRegion, AWS4_SERVICE);
3292
- return hmacSha256Raw(kService, AWS4_REQUEST_TERMINATOR);
3293
- }
3294
- function formatAmzDate(date = /* @__PURE__ */ new Date()) {
3295
- const iso = date.toISOString();
3296
- const shortDate = iso.slice(0, 10).replace(/-/g, "");
3297
- const amzDate = `${iso.slice(0, 19).replace(/[-:]/g, "")}Z`;
3298
- return { amzDate, shortDate };
3299
- }
3300
- function encodeRFC3986(value) {
3301
- return encodeURIComponent(value).replace(/[!'()*]/g, (char) => `%${char.charCodeAt(0).toString(16).toUpperCase()}`);
3302
- }
3303
- function encodePath(path) {
3304
- return path.split("/").map((segment) => encodeRFC3986(segment)).join("/");
3305
- }
3306
- function normalizeEndpoint(endpoint) {
3307
- let url;
3308
- try {
3309
- url = new URL(endpoint);
3310
- } catch {
3311
- throw new Error(`Invalid S3 endpoint: ${endpoint}`);
4040
+ content: [
4041
+ { type: "text", text: userPrompt },
4042
+ { type: "image", image: imageDataUrl }
4043
+ ]
4044
+ }
4045
+ ]
4046
+ });
4047
+ if (!response.output) {
4048
+ throw new Error("Summarization output missing");
3312
4049
  }
3313
- if (url.search || url.hash) {
3314
- throw new Error("S3 endpoint must not include query params or hash fragments.");
4050
+ const parsed = summarySchema.parse(response.output);
4051
+ return {
4052
+ result: parsed,
4053
+ usage: {
4054
+ inputTokens: response.usage.inputTokens,
4055
+ outputTokens: response.usage.outputTokens,
4056
+ totalTokens: response.usage.totalTokens,
4057
+ reasoningTokens: response.usage.reasoningTokens,
4058
+ cachedInputTokens: response.usage.cachedInputTokens
4059
+ }
4060
+ };
4061
+ }
4062
+ async function analyzeAudioOnly(provider, modelId, userPrompt, systemPrompt, credentials) {
4063
+ "use step";
4064
+ const model = await createLanguageModelFromConfig(provider, modelId, credentials);
4065
+ const response = await generateText5({
4066
+ model,
4067
+ output: SUMMARY_OUTPUT,
4068
+ messages: [
4069
+ {
4070
+ role: "system",
4071
+ content: systemPrompt
4072
+ },
4073
+ {
4074
+ role: "user",
4075
+ content: userPrompt
4076
+ }
4077
+ ]
4078
+ });
4079
+ if (!response.output) {
4080
+ throw new Error("Summarization output missing");
3315
4081
  }
3316
- enforceEndpointPolicy(url);
3317
- return url;
4082
+ const parsed = summarySchema.parse(response.output);
4083
+ return {
4084
+ result: parsed,
4085
+ usage: {
4086
+ inputTokens: response.usage.inputTokens,
4087
+ outputTokens: response.usage.outputTokens,
4088
+ totalTokens: response.usage.totalTokens,
4089
+ reasoningTokens: response.usage.reasoningTokens,
4090
+ cachedInputTokens: response.usage.cachedInputTokens
4091
+ }
4092
+ };
3318
4093
  }
3319
- function parseEndpointAllowlist(allowlist) {
3320
- if (!allowlist) {
4094
+ function normalizeKeywords(keywords, limit = DEFAULT_SUMMARY_KEYWORD_LIMIT) {
4095
+ if (!Array.isArray(keywords) || keywords.length === 0) {
3321
4096
  return [];
3322
4097
  }
3323
- return allowlist.split(",").map((value) => value.trim().toLowerCase()).filter(Boolean);
3324
- }
3325
- function hostnameMatchesPattern(hostname, pattern) {
3326
- if (pattern.startsWith("*.")) {
3327
- const suffix = pattern.slice(1);
3328
- return hostname.endsWith(suffix) && hostname.length > suffix.length;
4098
+ const uniqueLowercase = /* @__PURE__ */ new Set();
4099
+ const normalized = [];
4100
+ for (const keyword of keywords) {
4101
+ const trimmed = keyword?.trim();
4102
+ if (!trimmed) {
4103
+ continue;
4104
+ }
4105
+ const lower = trimmed.toLowerCase();
4106
+ if (uniqueLowercase.has(lower)) {
4107
+ continue;
4108
+ }
4109
+ uniqueLowercase.add(lower);
4110
+ normalized.push(trimmed);
4111
+ if (normalized.length === limit) {
4112
+ break;
4113
+ }
3329
4114
  }
3330
- return hostname === pattern;
4115
+ return normalized;
3331
4116
  }
3332
- function enforceEndpointPolicy(url) {
3333
- const hostname = url.hostname.toLowerCase();
3334
- if (url.protocol !== "https:") {
4117
+ async function getSummaryAndTags(assetId, options) {
4118
+ "use workflow";
4119
+ const {
4120
+ provider = "openai",
4121
+ model,
4122
+ tone = "neutral",
4123
+ includeTranscript = true,
4124
+ cleanTranscript = true,
4125
+ imageSubmissionMode = "url",
4126
+ imageDownloadOptions,
4127
+ promptOverrides,
4128
+ credentials,
4129
+ titleLength,
4130
+ descriptionLength,
4131
+ tagCount,
4132
+ outputLanguageCode
4133
+ } = options ?? {};
4134
+ if (!VALID_TONES.includes(tone)) {
3335
4135
  throw new Error(
3336
- `Insecure S3 endpoint protocol "${url.protocol}" is not allowed. Use HTTPS.`
4136
+ `Invalid tone "${tone}". Valid tones are: ${VALID_TONES.join(", ")}`
3337
4137
  );
3338
4138
  }
3339
- if (S3_ALLOWED_ENDPOINT_PATTERNS.length > 0 && !S3_ALLOWED_ENDPOINT_PATTERNS.some((pattern) => hostnameMatchesPattern(hostname, pattern))) {
4139
+ const modelConfig = resolveLanguageModelConfig({
4140
+ ...options,
4141
+ model,
4142
+ provider
4143
+ });
4144
+ const workflowCredentials = credentials;
4145
+ const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, workflowCredentials);
4146
+ const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
4147
+ const isAudioOnly = isAudioOnlyAsset(assetData);
4148
+ if (isAudioOnly && !includeTranscript) {
3340
4149
  throw new Error(
3341
- `S3 endpoint host "${hostname}" is not in S3_ALLOWED_ENDPOINT_HOSTS.`
4150
+ "Audio-only assets require a transcript. Set includeTranscript: true and ensure the asset has a ready text track (captions/subtitles)."
3342
4151
  );
3343
4152
  }
3344
- }
3345
- function buildCanonicalUri(endpoint, bucket, key) {
3346
- const endpointPath = endpoint.pathname === "/" ? "" : encodePath(endpoint.pathname.replace(/\/+$/, ""));
3347
- const encodedBucket = encodeRFC3986(bucket);
3348
- const encodedKey = encodePath(key);
3349
- return `${endpointPath}/${encodedBucket}/${encodedKey}`;
3350
- }
3351
- function buildCanonicalQuery(params) {
3352
- return Object.entries(params).sort(([a], [b]) => a.localeCompare(b)).map(([key, value]) => `${encodeRFC3986(key)}=${encodeRFC3986(value)}`).join("&");
3353
- }
3354
- async function signString(secretAccessKey, shortDate, region, value) {
3355
- const signingKey = await deriveSigningKey(secretAccessKey, shortDate, region);
3356
- const signatureBytes = await hmacSha256Raw(signingKey, value);
3357
- return bytesToHex(signatureBytes);
3358
- }
3359
- function buildCredentialScope(shortDate, region) {
3360
- return `${shortDate}/${region}/${AWS4_SERVICE}/${AWS4_REQUEST_TERMINATOR}`;
3361
- }
3362
- async function putObjectToS3({
3363
- accessKeyId,
3364
- secretAccessKey,
3365
- endpoint,
3366
- region,
3367
- bucket,
3368
- key,
3369
- body,
3370
- contentType
3371
- }) {
3372
- const resolvedEndpoint = normalizeEndpoint(endpoint);
3373
- const canonicalUri = buildCanonicalUri(resolvedEndpoint, bucket, key);
3374
- const host = resolvedEndpoint.host;
3375
- const normalizedContentType = contentType?.trim();
3376
- const { amzDate, shortDate } = formatAmzDate();
3377
- const payloadHash = await sha256Hex(body);
3378
- const signingHeaders = [
3379
- ["host", host],
3380
- ["x-amz-content-sha256", payloadHash],
3381
- ["x-amz-date", amzDate],
3382
- ...normalizedContentType ? [["content-type", normalizedContentType]] : []
3383
- ].sort(([a], [b]) => a.localeCompare(b));
3384
- const canonicalHeaders = signingHeaders.map(([name, value]) => `${name}:${value}`).join("\n");
3385
- const signedHeaders = signingHeaders.map(([name]) => name).join(";");
3386
- const canonicalRequest = [
3387
- "PUT",
3388
- canonicalUri,
3389
- "",
3390
- `${canonicalHeaders}
3391
- `,
3392
- signedHeaders,
3393
- payloadHash
3394
- ].join("\n");
3395
- const credentialScope = buildCredentialScope(shortDate, region);
3396
- const stringToSign = [
3397
- AWS4_ALGORITHM,
3398
- amzDate,
3399
- credentialScope,
3400
- await sha256Hex(canonicalRequest)
3401
- ].join("\n");
3402
- const signature = await signString(secretAccessKey, shortDate, region, stringToSign);
3403
- const authorization = `${AWS4_ALGORITHM} Credential=${accessKeyId}/${credentialScope}, SignedHeaders=${signedHeaders}, Signature=${signature}`;
3404
- const requestUrl = `${resolvedEndpoint.origin}${canonicalUri}`;
3405
- const response = await fetch(requestUrl, {
3406
- method: "PUT",
3407
- headers: {
3408
- "Authorization": authorization,
3409
- "x-amz-content-sha256": payloadHash,
3410
- "x-amz-date": amzDate,
3411
- ...normalizedContentType ? { "content-type": normalizedContentType } : {}
3412
- },
3413
- body
3414
- });
3415
- if (!response.ok) {
3416
- const errorBody = await response.text().catch(() => "");
3417
- const detail = errorBody ? ` ${errorBody}` : "";
3418
- throw new Error(`S3 PUT failed (${response.status} ${response.statusText}).${detail}`);
4153
+ const signingContext = await resolveMuxSigningContext(workflowCredentials);
4154
+ if (policy === "signed" && !signingContext) {
4155
+ throw new Error(
4156
+ "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
4157
+ );
3419
4158
  }
3420
- }
3421
- async function createPresignedGetUrl({
3422
- accessKeyId,
3423
- secretAccessKey,
3424
- endpoint,
3425
- region,
3426
- bucket,
3427
- key,
3428
- expiresInSeconds = 3600
3429
- }) {
3430
- const resolvedEndpoint = normalizeEndpoint(endpoint);
3431
- const canonicalUri = buildCanonicalUri(resolvedEndpoint, bucket, key);
3432
- const host = resolvedEndpoint.host;
3433
- const { amzDate, shortDate } = formatAmzDate();
3434
- const credentialScope = buildCredentialScope(shortDate, region);
3435
- const signedHeaders = "host";
3436
- const queryParams = {
3437
- "X-Amz-Algorithm": AWS4_ALGORITHM,
3438
- "X-Amz-Credential": `${accessKeyId}/${credentialScope}`,
3439
- "X-Amz-Date": amzDate,
3440
- "X-Amz-Expires": `${expiresInSeconds}`,
3441
- "X-Amz-SignedHeaders": signedHeaders
3442
- };
3443
- const canonicalQuery = buildCanonicalQuery(queryParams);
3444
- const canonicalRequest = [
3445
- "GET",
3446
- canonicalUri,
3447
- canonicalQuery,
3448
- `host:${host}
3449
- `,
3450
- signedHeaders,
3451
- "UNSIGNED-PAYLOAD"
3452
- ].join("\n");
3453
- const stringToSign = [
3454
- AWS4_ALGORITHM,
3455
- amzDate,
3456
- credentialScope,
3457
- await sha256Hex(canonicalRequest)
3458
- ].join("\n");
3459
- const signature = await signString(secretAccessKey, shortDate, region, stringToSign);
3460
- const queryWithSignature = `${canonicalQuery}&X-Amz-Signature=${signature}`;
3461
- return `${resolvedEndpoint.origin}${canonicalUri}?${queryWithSignature}`;
3462
- }
3463
-
3464
- // src/lib/storage-adapter.ts
3465
- function requireCredentials(accessKeyId, secretAccessKey) {
3466
- if (!accessKeyId || !secretAccessKey) {
4159
+ const transcriptResult = includeTranscript ? await fetchTranscriptForAsset(assetData, playbackId, {
4160
+ cleanTranscript,
4161
+ shouldSign: policy === "signed",
4162
+ credentials: workflowCredentials,
4163
+ required: isAudioOnly
4164
+ }) : void 0;
4165
+ const transcriptText = transcriptResult?.transcriptText ?? "";
4166
+ const resolvedLanguageCode = outputLanguageCode && outputLanguageCode !== "auto" ? outputLanguageCode : transcriptResult?.track?.language_code ?? getReadyTextTracks(assetData)[0]?.language_code;
4167
+ const languageName = resolvedLanguageCode ? getLanguageName(resolvedLanguageCode) : void 0;
4168
+ const userPrompt = buildUserPrompt4({
4169
+ tone,
4170
+ transcriptText,
4171
+ isCleanTranscript: cleanTranscript,
4172
+ promptOverrides,
4173
+ isAudioOnly,
4174
+ titleLength,
4175
+ descriptionLength,
4176
+ tagCount,
4177
+ languageName
4178
+ });
4179
+ let analysisResponse;
4180
+ let imageUrl;
4181
+ const systemPrompt = isAudioOnly ? AUDIO_ONLY_SYSTEM_PROMPT : SYSTEM_PROMPT4;
4182
+ try {
4183
+ if (isAudioOnly) {
4184
+ analysisResponse = await analyzeAudioOnly(
4185
+ modelConfig.provider,
4186
+ modelConfig.modelId,
4187
+ userPrompt,
4188
+ systemPrompt,
4189
+ workflowCredentials
4190
+ );
4191
+ } else {
4192
+ const storyboardUrl = await getStoryboardUrl(playbackId, 640, policy === "signed", workflowCredentials);
4193
+ imageUrl = storyboardUrl;
4194
+ if (imageSubmissionMode === "base64") {
4195
+ const downloadResult = await downloadImageAsBase64(storyboardUrl, imageDownloadOptions);
4196
+ analysisResponse = await analyzeStoryboard2(
4197
+ downloadResult.base64Data,
4198
+ modelConfig.provider,
4199
+ modelConfig.modelId,
4200
+ userPrompt,
4201
+ systemPrompt,
4202
+ workflowCredentials
4203
+ );
4204
+ } else {
4205
+ analysisResponse = await withRetry(() => analyzeStoryboard2(
4206
+ storyboardUrl,
4207
+ modelConfig.provider,
4208
+ modelConfig.modelId,
4209
+ userPrompt,
4210
+ systemPrompt,
4211
+ workflowCredentials
4212
+ ));
4213
+ }
4214
+ }
4215
+ } catch (error) {
4216
+ const contentType = isAudioOnly ? "audio" : "video";
3467
4217
  throw new Error(
3468
- "S3 credentials are required for default storage operations. Provide S3_ACCESS_KEY_ID and S3_SECRET_ACCESS_KEY or pass options.storageAdapter."
4218
+ `Failed to analyze ${contentType} content with ${provider}: ${error instanceof Error ? error.message : "Unknown error"}`
3469
4219
  );
3470
4220
  }
3471
- return { accessKeyId, secretAccessKey };
3472
- }
3473
- async function putObjectWithStorageAdapter(input, adapter) {
3474
- if (adapter) {
3475
- await adapter.putObject(input);
3476
- return;
4221
+ if (!analysisResponse.result) {
4222
+ throw new Error(`Failed to analyze video content for asset ${assetId}`);
3477
4223
  }
3478
- const credentials = requireCredentials(input.accessKeyId, input.secretAccessKey);
3479
- await putObjectToS3({
3480
- accessKeyId: credentials.accessKeyId,
3481
- secretAccessKey: credentials.secretAccessKey,
3482
- endpoint: input.endpoint,
3483
- region: input.region,
3484
- bucket: input.bucket,
3485
- key: input.key,
3486
- body: input.body,
3487
- contentType: input.contentType
3488
- });
3489
- }
3490
- async function createPresignedGetUrlWithStorageAdapter(input, adapter) {
3491
- if (adapter) {
3492
- return adapter.createPresignedGetUrl(input);
4224
+ if (!analysisResponse.result.title) {
4225
+ throw new Error(`Failed to generate title for asset ${assetId}`);
3493
4226
  }
3494
- const credentials = requireCredentials(input.accessKeyId, input.secretAccessKey);
3495
- return createPresignedGetUrl({
3496
- accessKeyId: credentials.accessKeyId,
3497
- secretAccessKey: credentials.secretAccessKey,
3498
- endpoint: input.endpoint,
3499
- region: input.region,
3500
- bucket: input.bucket,
3501
- key: input.key,
3502
- expiresInSeconds: input.expiresInSeconds
3503
- });
4227
+ if (!analysisResponse.result.description) {
4228
+ throw new Error(`Failed to generate description for asset ${assetId}`);
4229
+ }
4230
+ return {
4231
+ assetId,
4232
+ title: analysisResponse.result.title,
4233
+ description: analysisResponse.result.description,
4234
+ tags: normalizeKeywords(analysisResponse.result.keywords, tagCount ?? DEFAULT_SUMMARY_KEYWORD_LIMIT),
4235
+ storyboardUrl: imageUrl,
4236
+ // undefined for audio-only assets
4237
+ usage: {
4238
+ ...analysisResponse.usage,
4239
+ metadata: {
4240
+ assetDurationSeconds
4241
+ }
4242
+ },
4243
+ transcriptText: transcriptText || void 0
4244
+ };
3504
4245
  }
3505
4246
 
3506
4247
  // src/workflows/translate-audio.ts
@@ -3679,7 +4420,8 @@ async function uploadDubbedAudioToS3({
3679
4420
  s3Endpoint,
3680
4421
  s3Region,
3681
4422
  s3Bucket,
3682
- storageAdapter
4423
+ storageAdapter,
4424
+ s3SignedUrlExpirySeconds
3683
4425
  }) {
3684
4426
  "use step";
3685
4427
  const s3AccessKeyId = env_default.S3_ACCESS_KEY_ID;
@@ -3702,10 +4444,11 @@ async function uploadDubbedAudioToS3({
3702
4444
  region: s3Region,
3703
4445
  bucket: s3Bucket,
3704
4446
  key: audioKey,
3705
- expiresInSeconds: 3600
4447
+ expiresInSeconds: s3SignedUrlExpirySeconds ?? 86400
3706
4448
  }, storageAdapter);
4449
+ const expiryHours = Math.round((s3SignedUrlExpirySeconds ?? 86400) / 3600);
3707
4450
  console.warn(`\u2705 Audio uploaded successfully to: ${audioKey}`);
3708
- console.warn(`\u{1F517} Generated presigned URL (expires in 1 hour)`);
4451
+ console.warn(`\u{1F517} Generated presigned URL (expires in ${expiryHours} hour${expiryHours === 1 ? "" : "s"})`);
3709
4452
  return presignedUrl;
3710
4453
  }
3711
4454
  async function createAudioTrackOnMux(assetId, languageCode, presignedUrl, credentials) {
@@ -3869,7 +4612,8 @@ async function translateAudio(assetId, toLanguageCode, options = {}) {
3869
4612
  s3Endpoint,
3870
4613
  s3Region,
3871
4614
  s3Bucket,
3872
- storageAdapter: effectiveStorageAdapter
4615
+ storageAdapter: effectiveStorageAdapter,
4616
+ s3SignedUrlExpirySeconds: options.s3SignedUrlExpirySeconds
3873
4617
  });
3874
4618
  } catch (error) {
3875
4619
  throw new Error(`Failed to upload audio to S3: ${error instanceof Error ? error.message : "Unknown error"}`);
@@ -3905,19 +4649,186 @@ async function translateAudio(assetId, toLanguageCode, options = {}) {
3905
4649
  }
3906
4650
 
3907
4651
  // src/workflows/translate-captions.ts
3908
- import { generateText as generateText5, Output as Output5 } from "ai";
3909
- import { z as z6 } from "zod";
3910
- var translationSchema = z6.object({
3911
- translation: z6.string()
4652
+ import {
4653
+ APICallError,
4654
+ generateText as generateText6,
4655
+ NoObjectGeneratedError,
4656
+ Output as Output6,
4657
+ RetryError,
4658
+ TypeValidationError
4659
+ } from "ai";
4660
+ import dedent6 from "dedent";
4661
+ import { z as z7 } from "zod";
4662
+ var translationSchema = z7.object({
4663
+ translation: z7.string()
3912
4664
  });
3913
- var SYSTEM_PROMPT4 = 'You are a subtitle translation expert. Translate VTT subtitle files to the target language specified by the user. Preserve all timestamps and VTT formatting exactly as they appear. Return JSON with a single key "translation" containing the translated VTT content.';
3914
- async function fetchVttFromMux(vttUrl) {
3915
- "use step";
3916
- const vttResponse = await fetch(vttUrl);
3917
- if (!vttResponse.ok) {
3918
- throw new Error(`Failed to fetch VTT file: ${vttResponse.statusText}`);
4665
+ var SYSTEM_PROMPT5 = dedent6`
4666
+ You are a subtitle translation expert. Translate VTT subtitle files to the target language specified by the user.
4667
+ You may receive either a full VTT file or a chunk from a larger VTT.
4668
+ Preserve all timestamps, cue ordering, and VTT formatting exactly as they appear.
4669
+ Return JSON with a single key "translation" containing the translated VTT content.
4670
+ `;
4671
+ var CUE_TRANSLATION_SYSTEM_PROMPT = dedent6`
4672
+ You are a subtitle translation expert.
4673
+ You will receive a sequence of subtitle cues extracted from a VTT file.
4674
+ Translate the cues to the requested target language while preserving their original order.
4675
+ Treat the cue list as continuous context so the translation reads naturally across adjacent lines.
4676
+ Return JSON with a single key "translations" containing exactly one translated string for each input cue.
4677
+ Do not merge, split, omit, reorder, or add cues.
4678
+ `;
4679
+ var DEFAULT_TRANSLATION_CHUNKING = {
4680
+ enabled: true,
4681
+ minimumAssetDurationSeconds: 30 * 60,
4682
+ targetChunkDurationSeconds: 30 * 60,
4683
+ maxConcurrentTranslations: 4,
4684
+ maxCuesPerChunk: 80,
4685
+ maxCueTextTokensPerChunk: 2e3
4686
+ };
4687
+ var TOKEN_USAGE_FIELDS = [
4688
+ "inputTokens",
4689
+ "outputTokens",
4690
+ "totalTokens",
4691
+ "reasoningTokens",
4692
+ "cachedInputTokens"
4693
+ ];
4694
+ var TranslationChunkValidationError = class extends Error {
4695
+ constructor(message) {
4696
+ super(message);
4697
+ this.name = "TranslationChunkValidationError";
3919
4698
  }
3920
- return vttResponse.text();
4699
+ };
4700
+ function isTranslationChunkValidationError(error) {
4701
+ return error instanceof TranslationChunkValidationError;
4702
+ }
4703
+ function isProviderServiceError(error) {
4704
+ if (!error) {
4705
+ return false;
4706
+ }
4707
+ if (RetryError.isInstance(error)) {
4708
+ return isProviderServiceError(error.lastError);
4709
+ }
4710
+ if (APICallError.isInstance(error)) {
4711
+ return true;
4712
+ }
4713
+ if (error instanceof Error && "cause" in error) {
4714
+ return isProviderServiceError(error.cause);
4715
+ }
4716
+ return false;
4717
+ }
4718
+ function shouldSplitChunkTranslationError(error) {
4719
+ if (isProviderServiceError(error)) {
4720
+ return false;
4721
+ }
4722
+ return NoObjectGeneratedError.isInstance(error) || TypeValidationError.isInstance(error) || isTranslationChunkValidationError(error);
4723
+ }
4724
+ function isDefinedTokenUsageValue(value) {
4725
+ return typeof value === "number";
4726
+ }
4727
+ function resolveTranslationChunkingOptions(options) {
4728
+ const targetChunkDurationSeconds = Math.max(
4729
+ 1,
4730
+ options?.targetChunkDurationSeconds ?? DEFAULT_TRANSLATION_CHUNKING.targetChunkDurationSeconds
4731
+ );
4732
+ return {
4733
+ enabled: options?.enabled ?? DEFAULT_TRANSLATION_CHUNKING.enabled,
4734
+ minimumAssetDurationSeconds: Math.max(
4735
+ 1,
4736
+ options?.minimumAssetDurationSeconds ?? DEFAULT_TRANSLATION_CHUNKING.minimumAssetDurationSeconds
4737
+ ),
4738
+ targetChunkDurationSeconds,
4739
+ maxConcurrentTranslations: Math.max(
4740
+ 1,
4741
+ options?.maxConcurrentTranslations ?? DEFAULT_TRANSLATION_CHUNKING.maxConcurrentTranslations
4742
+ ),
4743
+ maxCuesPerChunk: Math.max(
4744
+ 1,
4745
+ options?.maxCuesPerChunk ?? DEFAULT_TRANSLATION_CHUNKING.maxCuesPerChunk
4746
+ ),
4747
+ maxCueTextTokensPerChunk: Math.max(
4748
+ 1,
4749
+ options?.maxCueTextTokensPerChunk ?? DEFAULT_TRANSLATION_CHUNKING.maxCueTextTokensPerChunk
4750
+ )
4751
+ };
4752
+ }
4753
+ function aggregateTokenUsage(usages) {
4754
+ return TOKEN_USAGE_FIELDS.reduce((aggregate, field) => {
4755
+ const values = usages.map((usage) => usage[field]).filter(isDefinedTokenUsageValue);
4756
+ if (values.length > 0) {
4757
+ aggregate[field] = values.reduce((total, value) => total + value, 0);
4758
+ }
4759
+ return aggregate;
4760
+ }, {});
4761
+ }
4762
+ function createTranslationChunkRequest(id, cues, cueBlocks) {
4763
+ return {
4764
+ id,
4765
+ cueCount: cues.length,
4766
+ startTime: cues[0].startTime,
4767
+ endTime: cues[cues.length - 1].endTime,
4768
+ cues,
4769
+ cueBlocks
4770
+ };
4771
+ }
4772
+ function splitTranslationChunkRequestByBudget(id, cues, cueBlocks, maxCuesPerChunk, maxCueTextTokensPerChunk) {
4773
+ const chunks = chunkVTTCuesByBudget(cues, {
4774
+ maxCuesPerChunk,
4775
+ maxTextTokensPerChunk: maxCueTextTokensPerChunk
4776
+ });
4777
+ return chunks.map(
4778
+ (chunk, index) => createTranslationChunkRequest(
4779
+ chunks.length === 1 ? id : `${id}-part-${index}`,
4780
+ cues.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1),
4781
+ cueBlocks.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1)
4782
+ )
4783
+ );
4784
+ }
4785
+ function buildTranslationChunkRequests(vttContent, assetDurationSeconds, chunkingOptions) {
4786
+ const resolvedChunking = resolveTranslationChunkingOptions(chunkingOptions);
4787
+ const cues = parseVTTCues(vttContent);
4788
+ if (cues.length === 0) {
4789
+ return null;
4790
+ }
4791
+ const { preamble, cueBlocks } = splitVttPreambleAndCueBlocks(vttContent);
4792
+ if (cueBlocks.length !== cues.length) {
4793
+ console.warn(
4794
+ `Falling back to full-VTT caption translation because cue block count (${cueBlocks.length}) does not match parsed cue count (${cues.length}).`
4795
+ );
4796
+ return null;
4797
+ }
4798
+ if (!resolvedChunking.enabled) {
4799
+ return {
4800
+ preamble,
4801
+ chunks: [
4802
+ createTranslationChunkRequest("chunk-0", cues, cueBlocks)
4803
+ ]
4804
+ };
4805
+ }
4806
+ if (typeof assetDurationSeconds !== "number" || assetDurationSeconds < resolvedChunking.minimumAssetDurationSeconds) {
4807
+ return {
4808
+ preamble,
4809
+ chunks: [
4810
+ createTranslationChunkRequest("chunk-0", cues, cueBlocks)
4811
+ ]
4812
+ };
4813
+ }
4814
+ const targetChunkDurationSeconds = resolvedChunking.targetChunkDurationSeconds;
4815
+ const durationChunks = chunkVTTCuesByDuration(cues, {
4816
+ targetChunkDurationSeconds,
4817
+ maxChunkDurationSeconds: Math.max(targetChunkDurationSeconds, Math.round(targetChunkDurationSeconds * (7 / 6))),
4818
+ minChunkDurationSeconds: Math.max(1, Math.round(targetChunkDurationSeconds * (2 / 3)))
4819
+ });
4820
+ return {
4821
+ preamble,
4822
+ chunks: durationChunks.flatMap(
4823
+ (chunk) => splitTranslationChunkRequestByBudget(
4824
+ chunk.id,
4825
+ cues.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1),
4826
+ cueBlocks.slice(chunk.cueStartIndex, chunk.cueEndIndex + 1),
4827
+ resolvedChunking.maxCuesPerChunk,
4828
+ resolvedChunking.maxCueTextTokensPerChunk
4829
+ )
4830
+ )
4831
+ };
3921
4832
  }
3922
4833
  async function translateVttWithAI({
3923
4834
  vttContent,
@@ -3929,13 +4840,13 @@ async function translateVttWithAI({
3929
4840
  }) {
3930
4841
  "use step";
3931
4842
  const model = await createLanguageModelFromConfig(provider, modelId, credentials);
3932
- const response = await generateText5({
4843
+ const response = await generateText6({
3933
4844
  model,
3934
- output: Output5.object({ schema: translationSchema }),
4845
+ output: Output6.object({ schema: translationSchema }),
3935
4846
  messages: [
3936
4847
  {
3937
4848
  role: "system",
3938
- content: SYSTEM_PROMPT4
4849
+ content: SYSTEM_PROMPT5
3939
4850
  },
3940
4851
  {
3941
4852
  role: "user",
@@ -3956,6 +4867,176 @@ ${vttContent}`
3956
4867
  }
3957
4868
  };
3958
4869
  }
4870
+ async function translateCueChunkWithAI({
4871
+ cues,
4872
+ fromLanguageCode,
4873
+ toLanguageCode,
4874
+ provider,
4875
+ modelId,
4876
+ credentials
4877
+ }) {
4878
+ "use step";
4879
+ const model = await createLanguageModelFromConfig(provider, modelId, credentials);
4880
+ const schema = z7.object({
4881
+ translations: z7.array(z7.string().min(1)).length(cues.length)
4882
+ });
4883
+ const cuePayload = cues.map((cue, index) => ({
4884
+ index,
4885
+ startTime: cue.startTime,
4886
+ endTime: cue.endTime,
4887
+ text: cue.text
4888
+ }));
4889
+ const response = await generateText6({
4890
+ model,
4891
+ output: Output6.object({ schema }),
4892
+ messages: [
4893
+ {
4894
+ role: "system",
4895
+ content: CUE_TRANSLATION_SYSTEM_PROMPT
4896
+ },
4897
+ {
4898
+ role: "user",
4899
+ content: `Translate from ${fromLanguageCode} to ${toLanguageCode}.
4900
+ Return exactly ${cues.length} translated cues in the same order as the input.
4901
+
4902
+ ${JSON.stringify(cuePayload, null, 2)}`
4903
+ }
4904
+ ]
4905
+ });
4906
+ return {
4907
+ translations: response.output.translations,
4908
+ usage: {
4909
+ inputTokens: response.usage.inputTokens,
4910
+ outputTokens: response.usage.outputTokens,
4911
+ totalTokens: response.usage.totalTokens,
4912
+ reasoningTokens: response.usage.reasoningTokens,
4913
+ cachedInputTokens: response.usage.cachedInputTokens
4914
+ }
4915
+ };
4916
+ }
4917
+ function splitTranslationChunkAtMidpoint(chunk) {
4918
+ const midpoint = Math.floor(chunk.cueCount / 2);
4919
+ if (midpoint <= 0 || midpoint >= chunk.cueCount) {
4920
+ throw new Error(`Cannot split chunk ${chunk.id} with cueCount=${chunk.cueCount}`);
4921
+ }
4922
+ return [
4923
+ createTranslationChunkRequest(
4924
+ `${chunk.id}-a`,
4925
+ chunk.cues.slice(0, midpoint),
4926
+ chunk.cueBlocks.slice(0, midpoint)
4927
+ ),
4928
+ createTranslationChunkRequest(
4929
+ `${chunk.id}-b`,
4930
+ chunk.cues.slice(midpoint),
4931
+ chunk.cueBlocks.slice(midpoint)
4932
+ )
4933
+ ];
4934
+ }
4935
+ async function translateChunkWithFallback({
4936
+ chunk,
4937
+ fromLanguageCode,
4938
+ toLanguageCode,
4939
+ provider,
4940
+ modelId,
4941
+ credentials
4942
+ }) {
4943
+ "use step";
4944
+ try {
4945
+ const result = await translateCueChunkWithAI({
4946
+ cues: chunk.cues,
4947
+ fromLanguageCode,
4948
+ toLanguageCode,
4949
+ provider,
4950
+ modelId,
4951
+ credentials
4952
+ });
4953
+ if (result.translations.length !== chunk.cueCount) {
4954
+ throw new TranslationChunkValidationError(
4955
+ `Chunk ${chunk.id} returned ${result.translations.length} cues, expected ${chunk.cueCount} for ${Math.round(chunk.startTime)}s-${Math.round(chunk.endTime)}s`
4956
+ );
4957
+ }
4958
+ return {
4959
+ translatedVtt: buildVttFromTranslatedCueBlocks(chunk.cueBlocks, result.translations),
4960
+ usage: result.usage
4961
+ };
4962
+ } catch (error) {
4963
+ if (!shouldSplitChunkTranslationError(error) || chunk.cueCount <= 1) {
4964
+ throw new Error(
4965
+ `Chunk ${chunk.id} failed for ${Math.round(chunk.startTime)}s-${Math.round(chunk.endTime)}s: ${error instanceof Error ? error.message : "Unknown error"}`
4966
+ );
4967
+ }
4968
+ const [leftChunk, rightChunk] = splitTranslationChunkAtMidpoint(chunk);
4969
+ const [leftResult, rightResult] = await Promise.all([
4970
+ translateChunkWithFallback({
4971
+ chunk: leftChunk,
4972
+ fromLanguageCode,
4973
+ toLanguageCode,
4974
+ provider,
4975
+ modelId,
4976
+ credentials
4977
+ }),
4978
+ translateChunkWithFallback({
4979
+ chunk: rightChunk,
4980
+ fromLanguageCode,
4981
+ toLanguageCode,
4982
+ provider,
4983
+ modelId,
4984
+ credentials
4985
+ })
4986
+ ]);
4987
+ return {
4988
+ translatedVtt: concatenateVttSegments([leftResult.translatedVtt, rightResult.translatedVtt]),
4989
+ usage: aggregateTokenUsage([leftResult.usage, rightResult.usage])
4990
+ };
4991
+ }
4992
+ }
4993
+ async function translateCaptionTrack({
4994
+ vttContent,
4995
+ assetDurationSeconds,
4996
+ fromLanguageCode,
4997
+ toLanguageCode,
4998
+ provider,
4999
+ modelId,
5000
+ credentials,
5001
+ chunking
5002
+ }) {
5003
+ "use step";
5004
+ const chunkPlan = buildTranslationChunkRequests(vttContent, assetDurationSeconds, chunking);
5005
+ if (!chunkPlan) {
5006
+ return translateVttWithAI({
5007
+ vttContent,
5008
+ fromLanguageCode,
5009
+ toLanguageCode,
5010
+ provider,
5011
+ modelId,
5012
+ credentials
5013
+ });
5014
+ }
5015
+ const resolvedChunking = resolveTranslationChunkingOptions(chunking);
5016
+ const translatedSegments = [];
5017
+ const usageByChunk = [];
5018
+ for (let index = 0; index < chunkPlan.chunks.length; index += resolvedChunking.maxConcurrentTranslations) {
5019
+ const batch = chunkPlan.chunks.slice(index, index + resolvedChunking.maxConcurrentTranslations);
5020
+ const batchResults = await Promise.all(
5021
+ batch.map(
5022
+ (chunk) => translateChunkWithFallback({
5023
+ chunk,
5024
+ fromLanguageCode,
5025
+ toLanguageCode,
5026
+ provider,
5027
+ modelId,
5028
+ credentials
5029
+ })
5030
+ )
5031
+ );
5032
+ translatedSegments.push(...batchResults.map((result) => result.translatedVtt));
5033
+ usageByChunk.push(...batchResults.map((result) => result.usage));
5034
+ }
5035
+ return {
5036
+ translatedVtt: concatenateVttSegments(translatedSegments, chunkPlan.preamble),
5037
+ usage: aggregateTokenUsage(usageByChunk)
5038
+ };
5039
+ }
3959
5040
  async function uploadVttToS3({
3960
5041
  translatedVtt,
3961
5042
  assetId,
@@ -3964,7 +5045,8 @@ async function uploadVttToS3({
3964
5045
  s3Endpoint,
3965
5046
  s3Region,
3966
5047
  s3Bucket,
3967
- storageAdapter
5048
+ storageAdapter,
5049
+ s3SignedUrlExpirySeconds
3968
5050
  }) {
3969
5051
  "use step";
3970
5052
  const s3AccessKeyId = env_default.S3_ACCESS_KEY_ID;
@@ -3987,25 +5069,9 @@ async function uploadVttToS3({
3987
5069
  region: s3Region,
3988
5070
  bucket: s3Bucket,
3989
5071
  key: vttKey,
3990
- expiresInSeconds: 3600
5072
+ expiresInSeconds: s3SignedUrlExpirySeconds ?? 86400
3991
5073
  }, storageAdapter);
3992
5074
  }
3993
- async function createTextTrackOnMux(assetId, languageCode, trackName, presignedUrl, credentials) {
3994
- "use step";
3995
- const muxClient = await resolveMuxClient(credentials);
3996
- const mux = await muxClient.createClient();
3997
- const trackResponse = await mux.video.assets.createTrack(assetId, {
3998
- type: "text",
3999
- text_type: "subtitles",
4000
- language_code: languageCode,
4001
- name: trackName,
4002
- url: presignedUrl
4003
- });
4004
- if (!trackResponse.id) {
4005
- throw new Error("Failed to create text track: no track ID returned from Mux");
4006
- }
4007
- return trackResponse.id;
4008
- }
4009
5075
  async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, options) {
4010
5076
  "use workflow";
4011
5077
  const {
@@ -4016,7 +5082,8 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
4016
5082
  s3Bucket: providedS3Bucket,
4017
5083
  uploadToMux: uploadToMuxOption,
4018
5084
  storageAdapter,
4019
- credentials: providedCredentials
5085
+ credentials: providedCredentials,
5086
+ chunking
4020
5087
  } = options;
4021
5088
  const credentials = providedCredentials;
4022
5089
  const effectiveStorageAdapter = storageAdapter;
@@ -4077,13 +5144,15 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
4077
5144
  let translatedVtt;
4078
5145
  let usage;
4079
5146
  try {
4080
- const result = await translateVttWithAI({
5147
+ const result = await translateCaptionTrack({
4081
5148
  vttContent,
5149
+ assetDurationSeconds,
4082
5150
  fromLanguageCode,
4083
5151
  toLanguageCode,
4084
5152
  provider: modelConfig.provider,
4085
5153
  modelId: modelConfig.modelId,
4086
- credentials
5154
+ credentials,
5155
+ chunking
4087
5156
  });
4088
5157
  translatedVtt = result.translatedVtt;
4089
5158
  usage = result.usage;
@@ -4120,7 +5189,8 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
4120
5189
  s3Endpoint,
4121
5190
  s3Region,
4122
5191
  s3Bucket,
4123
- storageAdapter: effectiveStorageAdapter
5192
+ storageAdapter: effectiveStorageAdapter,
5193
+ s3SignedUrlExpirySeconds: options.s3SignedUrlExpirySeconds
4124
5194
  });
4125
5195
  } catch (error) {
4126
5196
  throw new Error(`Failed to upload VTT to S3: ${error instanceof Error ? error.message : "Unknown error"}`);
@@ -4153,21 +5223,33 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
4153
5223
  };
4154
5224
  }
4155
5225
  export {
5226
+ DEFAULT_DESCRIPTION_LENGTH,
5227
+ DEFAULT_SUMMARY_KEYWORD_LIMIT,
5228
+ DEFAULT_TITLE_LENGTH,
4156
5229
  HIVE_SEXUAL_CATEGORIES,
4157
5230
  HIVE_VIOLENCE_CATEGORIES,
4158
- SUMMARY_KEYWORD_LIMIT,
5231
+ aggregateTokenUsage,
5232
+ applyOverrideLists,
5233
+ applyReplacements,
4159
5234
  askQuestions,
5235
+ buildReplacementRegex,
4160
5236
  burnedInCaptionsSchema,
5237
+ censorVttContent,
4161
5238
  chapterSchema,
4162
5239
  chaptersSchema,
5240
+ createReplacer,
5241
+ editCaptions,
4163
5242
  generateChapters,
4164
5243
  generateEmbeddings,
4165
5244
  generateVideoEmbeddings,
4166
5245
  getModerationScores,
4167
5246
  getSummaryAndTags,
4168
5247
  hasBurnedInCaptions,
5248
+ profanityDetectionSchema,
4169
5249
  questionAnswerSchema,
5250
+ shouldSplitChunkTranslationError,
4170
5251
  summarySchema,
5252
+ transformCueText,
4171
5253
  translateAudio,
4172
5254
  translateCaptions,
4173
5255
  translationSchema