@mux/ai 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1304,12 +1304,14 @@ async function fetchTranscriptForAsset(asset, playbackId, options = {}) {
1304
1304
  // src/workflows/ask-questions.ts
1305
1305
  var questionAnswerSchema = z2.object({
1306
1306
  question: z2.string(),
1307
- answer: z2.string(),
1307
+ answer: z2.string().optional(),
1308
1308
  confidence: z2.number(),
1309
- reasoning: z2.string()
1309
+ reasoning: z2.string(),
1310
+ skipped: z2.boolean()
1310
1311
  });
1312
+ var SKIP_SENTINEL = "__SKIPPED__";
1311
1313
  function createAskQuestionsSchema(allowedAnswers) {
1312
- const answerSchema = z2.enum(allowedAnswers);
1314
+ const answerSchema = z2.enum([...allowedAnswers, SKIP_SENTINEL]);
1313
1315
  return z2.object({
1314
1316
  answers: z2.array(
1315
1317
  questionAnswerSchema.extend({
@@ -1365,8 +1367,32 @@ var SYSTEM_PROMPT = dedent`
1365
1367
  - Be precise: cite specific frames, objects, actions, or transcript quotes
1366
1368
  </answer_guidelines>
1367
1369
 
1370
+ <relevance_filtering>
1371
+ Before answering each question, assess whether it can be meaningfully
1372
+ answered based on the video storyboard and/or transcript. A question is
1373
+ relevant if it asks about something observable or inferable from the
1374
+ video content (visuals, audio, dialogue, setting, subjects, actions, etc.).
1375
+
1376
+ Mark a question as skipped (skipped: true) if it:
1377
+ - Is completely unrelated to video content (e.g., math, trivia, personal questions)
1378
+ - Asks about information that cannot be determined from storyboard frames or transcript
1379
+ - Is a general knowledge question with no connection to what is shown or said in the video
1380
+ - Attempts to use the system for non-video-analysis purposes
1381
+
1382
+ For skipped questions:
1383
+ - Set skipped to true
1384
+ - Set answer to "${SKIP_SENTINEL}"
1385
+ - Set confidence to 0
1386
+ - Use the reasoning field to explain why the question is not answerable
1387
+ from the video content
1388
+
1389
+ For borderline questions that are loosely related to the video content,
1390
+ still answer them but use a lower confidence score to reflect uncertainty.
1391
+ </relevance_filtering>
1392
+
1368
1393
  <constraints>
1369
- - You MUST answer every question with one of the allowed response options
1394
+ - You MUST answer every relevant question with one of the allowed response options
1395
+ - Skip irrelevant questions as described in relevance_filtering
1370
1396
  - Only describe observable evidence from frames or transcript
1371
1397
  - Do not fabricate details or make unsupported assumptions
1372
1398
  - Return structured data matching the requested schema exactly
@@ -1442,14 +1468,7 @@ async function analyzeQuestionsWithStoryboard(imageDataUrl, provider, modelId, u
1442
1468
  ]
1443
1469
  });
1444
1470
  return {
1445
- result: {
1446
- answers: response.output.answers.map((answer) => ({
1447
- ...answer,
1448
- // Strip numbering prefix (e.g., "1. " or "2. ") from questions
1449
- question: answer.question.replace(/^\d+\.\s*/, ""),
1450
- confidence: Math.min(1, Math.max(0, answer.confidence))
1451
- }))
1452
- },
1471
+ result: response.output,
1453
1472
  usage: {
1454
1473
  inputTokens: response.usage.inputTokens,
1455
1474
  outputTokens: response.usage.outputTokens,
@@ -1555,9 +1574,20 @@ async function askQuestions(assetId, questions, options) {
1555
1574
  `Expected ${questions.length} answers but received ${analysisResponse.result.answers.length}`
1556
1575
  );
1557
1576
  }
1577
+ const answers = analysisResponse.result.answers.map((raw) => {
1578
+ const isSkipped = raw.skipped || raw.answer === SKIP_SENTINEL;
1579
+ return {
1580
+ // Strip numbering prefix (e.g., "1. " or "2. ") from questions
1581
+ question: raw.question.replace(/^\d+\.\s*/, ""),
1582
+ confidence: isSkipped ? 0 : Math.min(1, Math.max(0, raw.confidence)),
1583
+ reasoning: raw.reasoning,
1584
+ skipped: isSkipped,
1585
+ ...isSkipped ? {} : { answer: raw.answer }
1586
+ };
1587
+ });
1558
1588
  return {
1559
1589
  assetId,
1560
- answers: analysisResponse.result.answers,
1590
+ answers,
1561
1591
  storyboardUrl: imageUrl,
1562
1592
  usage: {
1563
1593
  ...analysisResponse.usage,
@@ -2176,1654 +2206,2074 @@ async function generateChapters(assetId, languageCode, options = {}) {
2176
2206
  };
2177
2207
  }
2178
2208
 
2179
- // src/workflows/embeddings.ts
2180
- import { embed } from "ai";
2209
+ // src/workflows/edit-captions.ts
2210
+ import { generateText as generateText4, Output as Output4 } from "ai";
2211
+ import dedent4 from "dedent";
2212
+ import { z as z5 } from "zod";
2181
2213
 
2182
- // src/primitives/text-chunking.ts
2183
- var DEFAULT_MIN_CHUNK_DURATION_RATIO = 2 / 3;
2184
- var DEFAULT_BOUNDARY_LOOKAHEAD_CUES = 12;
2185
- var DEFAULT_BOUNDARY_PAUSE_SECONDS = 1.25;
2186
- var STRONG_BOUNDARY_SCORE = 4;
2187
- var PREFERRED_BOUNDARY_WINDOW_SECONDS = 5 * 60;
2188
- var SENTENCE_BOUNDARY_REGEX = /[.!?]["')\]]*$/;
2189
- var CLAUSE_BOUNDARY_REGEX = /[,;:]["')\]]*$/;
2190
- var NEXT_SENTENCE_START_REGEX = /^[A-Z0-9"'([{]/;
2191
- function estimateTokenCount(text) {
2192
- const words = text.trim().split(/\s+/).length;
2193
- return Math.ceil(words / 0.75);
2214
+ // src/lib/mux-tracks.ts
2215
+ async function fetchVttFromMux(vttUrl) {
2216
+ "use step";
2217
+ const vttResponse = await fetch(vttUrl);
2218
+ if (!vttResponse.ok) {
2219
+ throw new Error(`Failed to fetch VTT file: ${vttResponse.statusText}`);
2220
+ }
2221
+ return vttResponse.text();
2194
2222
  }
2195
- function chunkByTokens(text, maxTokens, overlapTokens = 0) {
2196
- if (!text.trim()) {
2197
- return [];
2223
+ async function createTextTrackOnMux(assetId, languageCode, trackName, presignedUrl, credentials) {
2224
+ "use step";
2225
+ const muxClient = await resolveMuxClient(credentials);
2226
+ const mux = await muxClient.createClient();
2227
+ const trackResponse = await mux.video.assets.createTrack(assetId, {
2228
+ type: "text",
2229
+ text_type: "subtitles",
2230
+ language_code: languageCode,
2231
+ name: trackName,
2232
+ url: presignedUrl
2233
+ });
2234
+ if (!trackResponse.id) {
2235
+ throw new Error("Failed to create text track: no track ID returned from Mux");
2198
2236
  }
2199
- const chunks = [];
2200
- const words = text.trim().split(/\s+/);
2201
- const wordsPerChunk = Math.floor(maxTokens * 0.75);
2202
- const overlapWords = Math.floor(overlapTokens * 0.75);
2203
- let chunkIndex = 0;
2204
- let currentPosition = 0;
2205
- while (currentPosition < words.length) {
2206
- const chunkWords = words.slice(
2207
- currentPosition,
2208
- currentPosition + wordsPerChunk
2209
- );
2210
- const chunkText2 = chunkWords.join(" ");
2211
- const tokenCount = estimateTokenCount(chunkText2);
2212
- chunks.push({
2213
- id: `chunk-${chunkIndex}`,
2214
- text: chunkText2,
2215
- tokenCount
2216
- });
2217
- currentPosition += wordsPerChunk - overlapWords;
2218
- chunkIndex++;
2219
- if (currentPosition <= (chunkIndex - 1) * (wordsPerChunk - overlapWords)) {
2220
- break;
2221
- }
2237
+ return trackResponse.id;
2238
+ }
2239
+
2240
+ // src/lib/s3-sigv4.ts
2241
+ var AWS4_ALGORITHM = "AWS4-HMAC-SHA256";
2242
+ var AWS4_REQUEST_TERMINATOR = "aws4_request";
2243
+ var AWS4_SERVICE = "s3";
2244
+ var S3_ALLOWED_ENDPOINT_PATTERNS = parseEndpointAllowlist(
2245
+ env_default.S3_ALLOWED_ENDPOINT_HOSTS
2246
+ );
2247
+ function getCrypto() {
2248
+ const webCrypto = globalThis.crypto;
2249
+ if (!webCrypto?.subtle) {
2250
+ throw new Error("Web Crypto API is required for S3 signing.");
2222
2251
  }
2223
- return chunks;
2252
+ return webCrypto;
2224
2253
  }
2225
- function createChunkFromCues(cues, index) {
2226
- const text = cues.map((c) => c.text).join(" ");
2227
- return {
2228
- id: `chunk-${index}`,
2229
- text,
2230
- tokenCount: estimateTokenCount(text),
2231
- startTime: cues[0].startTime,
2232
- endTime: cues[cues.length - 1].endTime
2233
- };
2254
+ var textEncoder = new TextEncoder();
2255
+ function toBytes(value) {
2256
+ return typeof value === "string" ? textEncoder.encode(value) : value;
2234
2257
  }
2235
- function chunkVTTCues(cues, maxTokens, overlapCues = 2) {
2236
- if (cues.length === 0)
2237
- return [];
2238
- const chunks = [];
2239
- let currentCues = [];
2240
- let currentTokens = 0;
2241
- let chunkIndex = 0;
2242
- for (let i = 0; i < cues.length; i++) {
2243
- const cue = cues[i];
2244
- const cueTokens = estimateTokenCount(cue.text);
2245
- if (currentTokens + cueTokens > maxTokens && currentCues.length > 0) {
2246
- chunks.push(createChunkFromCues(currentCues, chunkIndex));
2247
- chunkIndex++;
2248
- const overlapStart = Math.max(0, currentCues.length - overlapCues);
2249
- currentCues = currentCues.slice(overlapStart);
2250
- currentTokens = currentCues.reduce(
2251
- (sum, c) => sum + estimateTokenCount(c.text),
2252
- 0
2253
- );
2254
- }
2255
- currentCues.push(cue);
2256
- currentTokens += cueTokens;
2257
- }
2258
- if (currentCues.length > 0) {
2259
- chunks.push(createChunkFromCues(currentCues, chunkIndex));
2260
- }
2261
- return chunks;
2258
+ function bytesToHex(bytes) {
2259
+ return Array.from(bytes).map((byte) => byte.toString(16).padStart(2, "0")).join("");
2262
2260
  }
2263
- function scoreCueBoundary(cues, index, boundaryPauseSeconds) {
2264
- const cue = cues[index];
2265
- const nextCue = cues[index + 1];
2266
- if (!nextCue) {
2267
- return Number.POSITIVE_INFINITY;
2268
- }
2269
- const trimmedText = cue.text.trim();
2270
- let score = 0;
2271
- if (SENTENCE_BOUNDARY_REGEX.test(trimmedText)) {
2272
- score += 4;
2273
- } else if (CLAUSE_BOUNDARY_REGEX.test(trimmedText)) {
2274
- score += 2;
2275
- }
2276
- if (nextCue.startTime - cue.endTime >= boundaryPauseSeconds) {
2277
- score += 2;
2261
+ async function sha256Hex(value) {
2262
+ const digest = await getCrypto().subtle.digest("SHA-256", toBytes(value));
2263
+ return bytesToHex(new Uint8Array(digest));
2264
+ }
2265
+ async function hmacSha256Raw(key, value) {
2266
+ const cryptoKey = await getCrypto().subtle.importKey(
2267
+ "raw",
2268
+ key,
2269
+ { name: "HMAC", hash: "SHA-256" },
2270
+ false,
2271
+ ["sign"]
2272
+ );
2273
+ const signature = await getCrypto().subtle.sign("HMAC", cryptoKey, textEncoder.encode(value));
2274
+ return new Uint8Array(signature);
2275
+ }
2276
+ async function deriveSigningKey(secretAccessKey, shortDate, region) {
2277
+ const kDate = await hmacSha256Raw(textEncoder.encode(`AWS4${secretAccessKey}`), shortDate);
2278
+ const kRegion = await hmacSha256Raw(kDate, region);
2279
+ const kService = await hmacSha256Raw(kRegion, AWS4_SERVICE);
2280
+ return hmacSha256Raw(kService, AWS4_REQUEST_TERMINATOR);
2281
+ }
2282
+ function formatAmzDate(date = /* @__PURE__ */ new Date()) {
2283
+ const iso = date.toISOString();
2284
+ const shortDate = iso.slice(0, 10).replace(/-/g, "");
2285
+ const amzDate = `${iso.slice(0, 19).replace(/[-:]/g, "")}Z`;
2286
+ return { amzDate, shortDate };
2287
+ }
2288
+ function encodeRFC3986(value) {
2289
+ return encodeURIComponent(value).replace(/[!'()*]/g, (char) => `%${char.charCodeAt(0).toString(16).toUpperCase()}`);
2290
+ }
2291
+ function encodePath(path) {
2292
+ return path.split("/").map((segment) => encodeRFC3986(segment)).join("/");
2293
+ }
2294
+ function normalizeEndpoint(endpoint) {
2295
+ let url;
2296
+ try {
2297
+ url = new URL(endpoint);
2298
+ } catch {
2299
+ throw new Error(`Invalid S3 endpoint: ${endpoint}`);
2278
2300
  }
2279
- if (NEXT_SENTENCE_START_REGEX.test(nextCue.text.trim())) {
2280
- score += 1;
2301
+ if (url.search || url.hash) {
2302
+ throw new Error("S3 endpoint must not include query params or hash fragments.");
2281
2303
  }
2282
- return score;
2304
+ enforceEndpointPolicy(url);
2305
+ return url;
2283
2306
  }
2284
- function chunkVTTCuesByBudget(cues, options) {
2285
- if (cues.length === 0) {
2307
+ function parseEndpointAllowlist(allowlist) {
2308
+ if (!allowlist) {
2286
2309
  return [];
2287
2310
  }
2288
- const maxCuesPerChunk = Math.max(1, options.maxCuesPerChunk);
2289
- let maxTextTokensPerChunk = Number.POSITIVE_INFINITY;
2290
- if (options.maxTextTokensPerChunk) {
2291
- maxTextTokensPerChunk = Math.max(1, options.maxTextTokensPerChunk);
2292
- }
2293
- const chunks = [];
2294
- let chunkIndex = 0;
2295
- let cueStartIndex = 0;
2296
- let currentTokenCount = 0;
2297
- for (let cueIndex = 0; cueIndex < cues.length; cueIndex++) {
2298
- const cue = cues[cueIndex];
2299
- const cueTokenCount = estimateTokenCount(cue.text);
2300
- const currentCueCount = cueIndex - cueStartIndex;
2301
- const wouldExceedCueCount = currentCueCount >= maxCuesPerChunk;
2302
- const wouldExceedTokenCount = currentCueCount > 0 && currentTokenCount + cueTokenCount > maxTextTokensPerChunk;
2303
- if (wouldExceedCueCount || wouldExceedTokenCount) {
2304
- chunks.push({
2305
- id: `chunk-${chunkIndex}`,
2306
- cueStartIndex,
2307
- cueEndIndex: cueIndex - 1,
2308
- cueCount: cueIndex - cueStartIndex,
2309
- startTime: cues[cueStartIndex].startTime,
2310
- endTime: cues[cueIndex - 1].endTime
2311
- });
2312
- cueStartIndex = cueIndex;
2313
- currentTokenCount = 0;
2314
- chunkIndex++;
2315
- }
2316
- currentTokenCount += cueTokenCount;
2311
+ return allowlist.split(",").map((value) => value.trim().toLowerCase()).filter(Boolean);
2312
+ }
2313
+ function hostnameMatchesPattern(hostname, pattern) {
2314
+ if (pattern.startsWith("*.")) {
2315
+ const suffix = pattern.slice(1);
2316
+ return hostname.endsWith(suffix) && hostname.length > suffix.length;
2317
2317
  }
2318
- chunks.push({
2319
- id: `chunk-${chunkIndex}`,
2320
- cueStartIndex,
2321
- cueEndIndex: cues.length - 1,
2322
- cueCount: cues.length - cueStartIndex,
2323
- startTime: cues[cueStartIndex].startTime,
2324
- endTime: cues[cues.length - 1].endTime
2325
- });
2326
- return chunks;
2318
+ return hostname === pattern;
2327
2319
  }
2328
- function chunkVTTCuesByDuration(cues, options) {
2329
- if (cues.length === 0) {
2330
- return [];
2320
+ function enforceEndpointPolicy(url) {
2321
+ const hostname = url.hostname.toLowerCase();
2322
+ if (url.protocol !== "https:") {
2323
+ throw new Error(
2324
+ `Insecure S3 endpoint protocol "${url.protocol}" is not allowed. Use HTTPS.`
2325
+ );
2331
2326
  }
2332
- const targetChunkDurationSeconds = Math.max(1, options.targetChunkDurationSeconds);
2333
- const maxChunkDurationSeconds = Math.max(targetChunkDurationSeconds, options.maxChunkDurationSeconds);
2334
- const minChunkDurationSeconds = Math.min(
2335
- targetChunkDurationSeconds,
2336
- Math.max(
2337
- 1,
2338
- options.minChunkDurationSeconds ?? Math.floor(targetChunkDurationSeconds * DEFAULT_MIN_CHUNK_DURATION_RATIO)
2339
- )
2340
- );
2341
- const boundaryLookaheadCues = Math.max(1, options.boundaryLookaheadCues ?? DEFAULT_BOUNDARY_LOOKAHEAD_CUES);
2342
- const boundaryPauseSeconds = options.boundaryPauseSeconds ?? DEFAULT_BOUNDARY_PAUSE_SECONDS;
2343
- const preferredBoundaryStartSeconds = Math.max(
2344
- minChunkDurationSeconds,
2345
- targetChunkDurationSeconds - Math.min(PREFERRED_BOUNDARY_WINDOW_SECONDS, targetChunkDurationSeconds / 6)
2346
- );
2347
- const chunks = [];
2348
- let chunkIndex = 0;
2349
- let cueStartIndex = 0;
2350
- while (cueStartIndex < cues.length) {
2351
- const chunkStartTime = cues[cueStartIndex].startTime;
2352
- let cueEndIndex = cueStartIndex;
2353
- let bestBoundaryIndex = -1;
2354
- let bestBoundaryScore = -1;
2355
- let bestPreferredBoundaryIndex = -1;
2356
- let bestPreferredBoundaryScore = -1;
2357
- while (cueEndIndex < cues.length) {
2358
- const cue = cues[cueEndIndex];
2359
- const currentDuration = cue.endTime - chunkStartTime;
2360
- if (currentDuration >= minChunkDurationSeconds) {
2361
- const boundaryScore = scoreCueBoundary(cues, cueEndIndex, boundaryPauseSeconds);
2362
- if (boundaryScore >= bestBoundaryScore) {
2363
- bestBoundaryIndex = cueEndIndex;
2364
- bestBoundaryScore = boundaryScore;
2365
- }
2366
- if (currentDuration >= preferredBoundaryStartSeconds && boundaryScore >= bestPreferredBoundaryScore) {
2367
- bestPreferredBoundaryIndex = cueEndIndex;
2368
- bestPreferredBoundaryScore = boundaryScore;
2369
- }
2370
- }
2371
- const nextCue = cues[cueEndIndex + 1];
2372
- if (!nextCue) {
2373
- break;
2374
- }
2375
- const nextDuration = nextCue.endTime - chunkStartTime;
2376
- const lookaheadExceeded = cueEndIndex - cueStartIndex >= boundaryLookaheadCues;
2377
- const preferredBoundaryIndex = bestPreferredBoundaryIndex >= cueStartIndex ? bestPreferredBoundaryIndex : bestBoundaryIndex;
2378
- const preferredBoundaryScore = bestPreferredBoundaryIndex >= cueStartIndex ? bestPreferredBoundaryScore : bestBoundaryScore;
2379
- if (currentDuration >= targetChunkDurationSeconds) {
2380
- if (preferredBoundaryIndex >= cueStartIndex && preferredBoundaryScore >= STRONG_BOUNDARY_SCORE) {
2381
- cueEndIndex = preferredBoundaryIndex;
2382
- break;
2383
- }
2384
- if (nextDuration > maxChunkDurationSeconds || lookaheadExceeded) {
2385
- cueEndIndex = preferredBoundaryIndex >= cueStartIndex ? preferredBoundaryIndex : cueEndIndex;
2386
- break;
2387
- }
2388
- }
2389
- if (nextDuration > maxChunkDurationSeconds) {
2390
- cueEndIndex = preferredBoundaryIndex >= cueStartIndex ? preferredBoundaryIndex : cueEndIndex;
2391
- break;
2392
- }
2393
- cueEndIndex++;
2394
- }
2395
- chunks.push({
2396
- id: `chunk-${chunkIndex}`,
2397
- cueStartIndex,
2398
- cueEndIndex,
2399
- cueCount: cueEndIndex - cueStartIndex + 1,
2400
- startTime: cues[cueStartIndex].startTime,
2401
- endTime: cues[cueEndIndex].endTime
2402
- });
2403
- cueStartIndex = cueEndIndex + 1;
2404
- chunkIndex++;
2327
+ if (S3_ALLOWED_ENDPOINT_PATTERNS.length > 0 && !S3_ALLOWED_ENDPOINT_PATTERNS.some((pattern) => hostnameMatchesPattern(hostname, pattern))) {
2328
+ throw new Error(
2329
+ `S3 endpoint host "${hostname}" is not in S3_ALLOWED_ENDPOINT_HOSTS.`
2330
+ );
2405
2331
  }
2406
- return chunks;
2407
2332
  }
2408
- function chunkText(text, strategy) {
2409
- switch (strategy.type) {
2410
- case "token": {
2411
- return chunkByTokens(text, strategy.maxTokens, strategy.overlap ?? 0);
2412
- }
2413
- default: {
2414
- const exhaustiveCheck = strategy;
2415
- throw new Error(`Unsupported chunking strategy: ${exhaustiveCheck}`);
2416
- }
2417
- }
2333
+ function buildCanonicalUri(endpoint, bucket, key) {
2334
+ const endpointPath = endpoint.pathname === "/" ? "" : encodePath(endpoint.pathname.replace(/\/+$/, ""));
2335
+ const encodedBucket = encodeRFC3986(bucket);
2336
+ const encodedKey = encodePath(key);
2337
+ return `${endpointPath}/${encodedBucket}/${encodedKey}`;
2418
2338
  }
2419
-
2420
- // src/workflows/embeddings.ts
2421
- function averageEmbeddings(embeddings) {
2422
- if (embeddings.length === 0) {
2423
- return [];
2424
- }
2425
- const dimensions = embeddings[0].length;
2426
- const averaged = Array.from({ length: dimensions }, () => 0);
2427
- for (const embedding of embeddings) {
2428
- for (let i = 0; i < dimensions; i++) {
2429
- averaged[i] += embedding[i];
2430
- }
2431
- }
2432
- for (let i = 0; i < dimensions; i++) {
2433
- averaged[i] /= embeddings.length;
2339
+ function buildCanonicalQuery(params) {
2340
+ return Object.entries(params).sort(([a], [b]) => a.localeCompare(b)).map(([key, value]) => `${encodeRFC3986(key)}=${encodeRFC3986(value)}`).join("&");
2341
+ }
2342
+ async function signString(secretAccessKey, shortDate, region, value) {
2343
+ const signingKey = await deriveSigningKey(secretAccessKey, shortDate, region);
2344
+ const signatureBytes = await hmacSha256Raw(signingKey, value);
2345
+ return bytesToHex(signatureBytes);
2346
+ }
2347
+ function buildCredentialScope(shortDate, region) {
2348
+ return `${shortDate}/${region}/${AWS4_SERVICE}/${AWS4_REQUEST_TERMINATOR}`;
2349
+ }
2350
+ async function putObjectToS3({
2351
+ accessKeyId,
2352
+ secretAccessKey,
2353
+ endpoint,
2354
+ region,
2355
+ bucket,
2356
+ key,
2357
+ body,
2358
+ contentType
2359
+ }) {
2360
+ const resolvedEndpoint = normalizeEndpoint(endpoint);
2361
+ const canonicalUri = buildCanonicalUri(resolvedEndpoint, bucket, key);
2362
+ const host = resolvedEndpoint.host;
2363
+ const normalizedContentType = contentType?.trim();
2364
+ const { amzDate, shortDate } = formatAmzDate();
2365
+ const payloadHash = await sha256Hex(body);
2366
+ const signingHeaders = [
2367
+ ["host", host],
2368
+ ["x-amz-content-sha256", payloadHash],
2369
+ ["x-amz-date", amzDate],
2370
+ ...normalizedContentType ? [["content-type", normalizedContentType]] : []
2371
+ ].sort(([a], [b]) => a.localeCompare(b));
2372
+ const canonicalHeaders = signingHeaders.map(([name, value]) => `${name}:${value}`).join("\n");
2373
+ const signedHeaders = signingHeaders.map(([name]) => name).join(";");
2374
+ const canonicalRequest = [
2375
+ "PUT",
2376
+ canonicalUri,
2377
+ "",
2378
+ `${canonicalHeaders}
2379
+ `,
2380
+ signedHeaders,
2381
+ payloadHash
2382
+ ].join("\n");
2383
+ const credentialScope = buildCredentialScope(shortDate, region);
2384
+ const stringToSign = [
2385
+ AWS4_ALGORITHM,
2386
+ amzDate,
2387
+ credentialScope,
2388
+ await sha256Hex(canonicalRequest)
2389
+ ].join("\n");
2390
+ const signature = await signString(secretAccessKey, shortDate, region, stringToSign);
2391
+ const authorization = `${AWS4_ALGORITHM} Credential=${accessKeyId}/${credentialScope}, SignedHeaders=${signedHeaders}, Signature=${signature}`;
2392
+ const requestUrl = `${resolvedEndpoint.origin}${canonicalUri}`;
2393
+ const response = await fetch(requestUrl, {
2394
+ method: "PUT",
2395
+ headers: {
2396
+ "Authorization": authorization,
2397
+ "x-amz-content-sha256": payloadHash,
2398
+ "x-amz-date": amzDate,
2399
+ ...normalizedContentType ? { "content-type": normalizedContentType } : {}
2400
+ },
2401
+ body
2402
+ });
2403
+ if (!response.ok) {
2404
+ const errorBody = await response.text().catch(() => "");
2405
+ const detail = errorBody ? ` ${errorBody}` : "";
2406
+ throw new Error(`S3 PUT failed (${response.status} ${response.statusText}).${detail}`);
2434
2407
  }
2435
- return averaged;
2436
2408
  }
2437
- async function generateSingleChunkEmbedding({
2438
- chunk,
2439
- provider,
2440
- modelId,
2441
- credentials
2409
+ async function createPresignedGetUrl({
2410
+ accessKeyId,
2411
+ secretAccessKey,
2412
+ endpoint,
2413
+ region,
2414
+ bucket,
2415
+ key,
2416
+ expiresInSeconds = 3600
2442
2417
  }) {
2443
- "use step";
2444
- const model = await createEmbeddingModelFromConfig(provider, modelId, credentials);
2445
- const response = await withRetry(
2446
- () => embed({
2447
- model,
2448
- value: chunk.text
2449
- })
2450
- );
2451
- return {
2452
- chunkId: chunk.id,
2453
- embedding: response.embedding,
2454
- metadata: {
2455
- startTime: chunk.startTime,
2456
- endTime: chunk.endTime,
2457
- tokenCount: chunk.tokenCount
2458
- }
2418
+ const resolvedEndpoint = normalizeEndpoint(endpoint);
2419
+ const canonicalUri = buildCanonicalUri(resolvedEndpoint, bucket, key);
2420
+ const host = resolvedEndpoint.host;
2421
+ const { amzDate, shortDate } = formatAmzDate();
2422
+ const credentialScope = buildCredentialScope(shortDate, region);
2423
+ const signedHeaders = "host";
2424
+ const queryParams = {
2425
+ "X-Amz-Algorithm": AWS4_ALGORITHM,
2426
+ "X-Amz-Credential": `${accessKeyId}/${credentialScope}`,
2427
+ "X-Amz-Date": amzDate,
2428
+ "X-Amz-Expires": `${expiresInSeconds}`,
2429
+ "X-Amz-SignedHeaders": signedHeaders
2459
2430
  };
2431
+ const canonicalQuery = buildCanonicalQuery(queryParams);
2432
+ const canonicalRequest = [
2433
+ "GET",
2434
+ canonicalUri,
2435
+ canonicalQuery,
2436
+ `host:${host}
2437
+ `,
2438
+ signedHeaders,
2439
+ "UNSIGNED-PAYLOAD"
2440
+ ].join("\n");
2441
+ const stringToSign = [
2442
+ AWS4_ALGORITHM,
2443
+ amzDate,
2444
+ credentialScope,
2445
+ await sha256Hex(canonicalRequest)
2446
+ ].join("\n");
2447
+ const signature = await signString(secretAccessKey, shortDate, region, stringToSign);
2448
+ const queryWithSignature = `${canonicalQuery}&X-Amz-Signature=${signature}`;
2449
+ return `${resolvedEndpoint.origin}${canonicalUri}?${queryWithSignature}`;
2460
2450
  }
2461
- async function generateEmbeddingsInternal(assetId, options = {}) {
2462
- const {
2463
- provider = "openai",
2464
- model,
2465
- languageCode,
2466
- chunkingStrategy = { type: "token", maxTokens: 500, overlap: 100 },
2467
- batchSize = 5,
2468
- credentials
2469
- } = options;
2470
- const embeddingModel = resolveEmbeddingModelConfig({ ...options, provider, model });
2471
- const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, credentials);
2472
- const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
2473
- const isAudioOnly = isAudioOnlyAsset(assetData);
2474
- const signingContext = await resolveMuxSigningContext(credentials);
2475
- if (policy === "signed" && !signingContext) {
2451
+
2452
+ // src/lib/storage-adapter.ts
2453
+ function requireCredentials(accessKeyId, secretAccessKey) {
2454
+ if (!accessKeyId || !secretAccessKey) {
2476
2455
  throw new Error(
2477
- "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
2456
+ "S3 credentials are required for default storage operations. Provide S3_ACCESS_KEY_ID and S3_SECRET_ACCESS_KEY or pass options.storageAdapter."
2478
2457
  );
2479
2458
  }
2480
- const readyTextTracks = getReadyTextTracks(assetData);
2481
- const useVttChunking = chunkingStrategy.type === "vtt";
2482
- let transcriptResult = await fetchTranscriptForAsset(assetData, playbackId, {
2483
- languageCode,
2484
- cleanTranscript: !useVttChunking,
2485
- shouldSign: policy === "signed",
2486
- credentials
2459
+ return { accessKeyId, secretAccessKey };
2460
+ }
2461
+ async function putObjectWithStorageAdapter(input, adapter) {
2462
+ if (adapter) {
2463
+ await adapter.putObject(input);
2464
+ return;
2465
+ }
2466
+ const credentials = requireCredentials(input.accessKeyId, input.secretAccessKey);
2467
+ await putObjectToS3({
2468
+ accessKeyId: credentials.accessKeyId,
2469
+ secretAccessKey: credentials.secretAccessKey,
2470
+ endpoint: input.endpoint,
2471
+ region: input.region,
2472
+ bucket: input.bucket,
2473
+ key: input.key,
2474
+ body: input.body,
2475
+ contentType: input.contentType
2487
2476
  });
2488
- if (isAudioOnly && !transcriptResult.track && readyTextTracks.length === 1) {
2489
- transcriptResult = await fetchTranscriptForAsset(assetData, playbackId, {
2490
- cleanTranscript: !useVttChunking,
2491
- shouldSign: policy === "signed",
2492
- credentials
2477
+ }
2478
+ async function createPresignedGetUrlWithStorageAdapter(input, adapter) {
2479
+ if (adapter) {
2480
+ return adapter.createPresignedGetUrl(input);
2481
+ }
2482
+ const credentials = requireCredentials(input.accessKeyId, input.secretAccessKey);
2483
+ return createPresignedGetUrl({
2484
+ accessKeyId: credentials.accessKeyId,
2485
+ secretAccessKey: credentials.secretAccessKey,
2486
+ endpoint: input.endpoint,
2487
+ region: input.region,
2488
+ bucket: input.bucket,
2489
+ key: input.key,
2490
+ expiresInSeconds: input.expiresInSeconds
2491
+ });
2492
+ }
2493
+
2494
+ // src/workflows/edit-captions.ts
2495
+ var profanityDetectionSchema = z5.object({
2496
+ profanity: z5.array(z5.string()).describe(
2497
+ "Unique profane words or short phrases exactly as they appear in the transcript text. Include each distinct form only once (e.g., if 'fuck' and 'fucking' both appear, list both)."
2498
+ )
2499
+ });
2500
+ var SYSTEM_PROMPT3 = dedent4`
2501
+ You are a content moderation assistant. Your task is to identify profane, vulgar, or obscene
2502
+ words and phrases in subtitle text. Return ONLY the exact profane words or phrases as they appear
2503
+ in the text. Do not modify, censor, or paraphrase them. Do not include words that are merely
2504
+ informal or slang but not profane. Focus on words that would be bleeped on broadcast television.`;
2505
+ function transformCueText(rawVtt, transform) {
2506
+ const lines = rawVtt.split("\n");
2507
+ let inCueText = false;
2508
+ let currentCueStartTime = 0;
2509
+ const transformed = lines.map((line) => {
2510
+ if (line.includes("-->")) {
2511
+ const startTimestamp = line.split("-->")[0].trim();
2512
+ currentCueStartTime = vttTimestampToSeconds(startTimestamp);
2513
+ inCueText = true;
2514
+ return line;
2515
+ }
2516
+ if (line.trim() === "") {
2517
+ inCueText = false;
2518
+ return line;
2519
+ }
2520
+ if (inCueText) {
2521
+ return transform(line, currentCueStartTime);
2522
+ }
2523
+ return line;
2524
+ });
2525
+ return transformed.join("\n");
2526
+ }
2527
+ function buildReplacementRegex(words) {
2528
+ const filtered = words.filter((w) => w.length > 0);
2529
+ if (filtered.length === 0)
2530
+ return null;
2531
+ filtered.sort((a, b) => b.length - a.length);
2532
+ const escaped = filtered.map((w) => w.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
2533
+ const pattern = escaped.join("|");
2534
+ return new RegExp(`\\b(?:${pattern})\\b`, "gi");
2535
+ }
2536
+ function createReplacer(mode) {
2537
+ switch (mode) {
2538
+ case "blank":
2539
+ return (match) => `[${"_".repeat(match.length)}]`;
2540
+ case "remove":
2541
+ return () => "";
2542
+ case "mask":
2543
+ return (match) => "?".repeat(match.length);
2544
+ }
2545
+ }
2546
+ function censorVttContent(rawVtt, profanity, mode) {
2547
+ if (profanity.length === 0) {
2548
+ return { censoredVtt: rawVtt, replacements: [] };
2549
+ }
2550
+ const regex = buildReplacementRegex(profanity);
2551
+ if (!regex) {
2552
+ return { censoredVtt: rawVtt, replacements: [] };
2553
+ }
2554
+ const replacer = createReplacer(mode);
2555
+ const replacements = [];
2556
+ const censoredVtt = transformCueText(rawVtt, (line, cueStartTime) => {
2557
+ return line.replace(regex, (match) => {
2558
+ const after = replacer(match);
2559
+ replacements.push({ cueStartTime, before: match, after });
2560
+ return after;
2493
2561
  });
2562
+ });
2563
+ return { censoredVtt, replacements };
2564
+ }
2565
+ function applyOverrideLists(detected, alwaysCensor, neverCensor) {
2566
+ const seen = new Set(detected.map((w) => w.toLowerCase()));
2567
+ const merged = [...detected];
2568
+ for (const word of alwaysCensor) {
2569
+ const lower = word.toLowerCase();
2570
+ if (!seen.has(lower)) {
2571
+ seen.add(lower);
2572
+ merged.push(word);
2573
+ }
2494
2574
  }
2495
- if (!transcriptResult.track || !transcriptResult.transcriptText) {
2496
- const availableLanguages = readyTextTracks.map((t) => t.language_code).filter(Boolean).join(", ");
2497
- if (isAudioOnly) {
2498
- throw new Error(
2499
- `No transcript track found${languageCode ? ` for language '${languageCode}'` : ""}. Audio-only assets require a transcript. Available languages: ${availableLanguages || "none"}`
2500
- );
2575
+ const neverSet = new Set(neverCensor.map((w) => w.toLowerCase()));
2576
+ return merged.filter((w) => !neverSet.has(w.toLowerCase()));
2577
+ }
2578
function applyReplacements(rawVtt, replacements) {
  // Drop rules with an empty search term; they would match everywhere.
  const activeRules = replacements.filter((rule) => rule.find.length > 0);
  if (activeRules.length === 0) {
    return { editedVtt: rawVtt, replacements: [] };
  }
  const applied = [];
  const editedVtt = transformCueText(rawVtt, (line, cueStartTime) => {
    // Apply every rule in order to the cue line, recording each hit.
    return activeRules.reduce((text, { find, replace }) => {
      // Escape regex metacharacters so `find` is treated as a literal string.
      const literal = find.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
      const wholeWord = new RegExp(`\\b${literal}\\b`, "g");
      return text.replace(wholeWord, (match) => {
        applied.push({ cueStartTime, before: match, after: replace });
        return replace;
      });
    }, line);
  });
  return { editedVtt, replacements: applied };
}
2598
// Uses a language model to list every profane word/phrase appearing in a
// plain-text transcript. Runs as a durable workflow step ("use step").
// Returns { profanity, usage } where `profanity` is the model's structured
// output (shape defined by profanityDetectionSchema) and `usage` is the
// token accounting from the model call.
async function identifyProfanityWithAI({
  plainText,
  provider,
  modelId,
  credentials
}) {
  "use step";
  // Resolve the concrete model instance for the requested provider/model id.
  const model = await createLanguageModelFromConfig(provider, modelId, credentials);
  // Structured generation: the schema constrains the model's output shape.
  const response = await generateText4({
    model,
    output: Output4.object({ schema: profanityDetectionSchema }),
    messages: [
      {
        role: "system",
        content: SYSTEM_PROMPT3
      },
      {
        role: "user",
        content: `Identify all profane words and phrases in the following subtitle transcript. Return each unique profane word or phrase exactly as it appears in the text.

<transcript>
${plainText}
</transcript>`
      }
    ]
  });
  return {
    profanity: response.output.profanity,
    // Pass token usage through so the caller can attach billing metadata.
    usage: {
      inputTokens: response.usage.inputTokens,
      outputTokens: response.usage.outputTokens,
      totalTokens: response.usage.totalTokens,
      reasoningTokens: response.usage.reasoningTokens,
      cachedInputTokens: response.usage.cachedInputTokens
    }
  };
}
2635
async function uploadEditedVttToS3({
  editedVtt,
  assetId,
  trackId,
  s3Endpoint,
  s3Region,
  s3Bucket,
  storageAdapter,
  s3SignedUrlExpirySeconds
}) {
  "use step";
  // The PUT and the presigned GET share the same credentials/bucket/key,
  // so build those parameters once.
  const s3Base = {
    accessKeyId: env_default.S3_ACCESS_KEY_ID,
    secretAccessKey: env_default.S3_SECRET_ACCESS_KEY,
    endpoint: s3Endpoint,
    region: s3Region,
    bucket: s3Bucket,
    // Timestamped key keeps successive edits of the same track distinct.
    key: `edited/${assetId}/${trackId}-edited-${Date.now()}.vtt`
  };
  await putObjectWithStorageAdapter({
    ...s3Base,
    body: editedVtt,
    contentType: "text/vtt"
  }, storageAdapter);
  // Return a presigned GET URL (default expiry: 24 hours) that Mux can fetch.
  return createPresignedGetUrlWithStorageAdapter({
    ...s3Base,
    expiresInSeconds: s3SignedUrlExpirySeconds ?? 86400
  }, storageAdapter);
}
2669
async function deleteTrackOnMux(assetId, trackId, credentials) {
  "use step";
  // Resolve credentials, construct an authenticated client, then delete.
  const clientFactory = await resolveMuxClient(credentials);
  const client = await clientFactory.createClient();
  await client.video.assets.deleteTrack(assetId, trackId);
}
2675
// Workflow: edit an existing caption track on a Mux asset by AI profanity
// censoring and/or static find/replace rules, optionally uploading the edited
// VTT back to Mux as a new text track (via S3 presigned URL) and deleting the
// original track.
// Returns the original and edited VTT plus replacement records; when uploading,
// also the new track id and the presigned URL used.
async function editCaptions(assetId, trackId, options) {
  "use workflow";
  const {
    provider,
    model,
    autoCensorProfanity: autoCensorOption,
    replacements: replacementsOption,
    deleteOriginalTrack,
    uploadToMux: uploadToMuxOption,
    s3Endpoint: providedS3Endpoint,
    s3Region: providedS3Region,
    s3Bucket: providedS3Bucket,
    trackNameSuffix,
    storageAdapter,
    credentials
  } = options;
  // --- Option validation -------------------------------------------------
  const hasAutoCensor = !!autoCensorOption;
  const hasReplacements = !!replacementsOption && replacementsOption.length > 0;
  if (!hasAutoCensor && !hasReplacements) {
    throw new Error("At least one of autoCensorProfanity or replacements must be provided.");
  }
  if (autoCensorOption && !provider) {
    throw new Error("provider is required when using autoCensorProfanity.");
  }
  // Both flags default to true unless explicitly set to false.
  const deleteOriginal = deleteOriginalTrack !== false;
  const uploadToMux = uploadToMuxOption !== false;
  // Options take precedence over environment configuration.
  const s3Endpoint = providedS3Endpoint ?? env_default.S3_ENDPOINT;
  const s3Region = providedS3Region ?? env_default.S3_REGION ?? "auto";
  const s3Bucket = providedS3Bucket ?? env_default.S3_BUCKET;
  const s3AccessKeyId = env_default.S3_ACCESS_KEY_ID;
  const s3SecretAccessKey = env_default.S3_SECRET_ACCESS_KEY;
  // Uploading requires S3 storage; static credentials are only needed when no
  // storageAdapter is supplied.
  if (uploadToMux && (!s3Endpoint || !s3Bucket || !storageAdapter && (!s3AccessKeyId || !s3SecretAccessKey))) {
    throw new Error(
      "Storage configuration is required for uploading to Mux. Provide s3Endpoint and s3Bucket. If no storageAdapter is supplied, also provide s3AccessKeyId and s3SecretAccessKey in options or set S3_ENDPOINT, S3_BUCKET, S3_ACCESS_KEY_ID, and S3_SECRET_ACCESS_KEY environment variables."
    );
  }
  // --- Resolve asset, playback and source track --------------------------
  const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, credentials);
  const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
  const signingContext = await resolveMuxSigningContext(credentials);
  if (policy === "signed" && !signingContext) {
    throw new Error(
      "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
    );
  }
  const readyTextTracks = getReadyTextTracks(assetData);
  const sourceTrack = readyTextTracks.find((t) => t.id === trackId);
  if (!sourceTrack) {
    const availableTrackIds = readyTextTracks.map((t) => t.id).filter(Boolean).join(", ");
    throw new Error(
      `Track '${trackId}' not found or not ready on asset '${assetId}'. Available track IDs: ${availableTrackIds || "none"}`
    );
  }
  // --- Fetch the current VTT ---------------------------------------------
  const vttUrl = await buildTranscriptUrl(playbackId, trackId, policy === "signed", credentials);
  let vttContent;
  try {
    vttContent = await fetchVttFromMux(vttUrl);
  } catch (error) {
    throw new Error(`Failed to fetch VTT content: ${error instanceof Error ? error.message : "Unknown error"}`);
  }
  let editedVtt = vttContent;
  let totalReplacementCount = 0;
  let autoCensorResult;
  let usage;
  // --- Pass 1: AI profanity censoring (optional) --------------------------
  if (autoCensorOption) {
    const { mode = "blank", alwaysCensor = [], neverCensor = [] } = autoCensorOption;
    const plainText = extractTextFromVTT(vttContent);
    if (!plainText.trim()) {
      throw new Error("Track transcript is empty; nothing to censor.");
    }
    const modelConfig = resolveLanguageModelConfig({
      ...options,
      provider,
      model
    });
    let detectedProfanity;
    try {
      const result = await identifyProfanityWithAI({
        plainText,
        provider: modelConfig.provider,
        modelId: modelConfig.modelId,
        credentials
      });
      detectedProfanity = result.profanity;
      usage = result.usage;
    } catch (error) {
      throw new Error(`Failed to detect profanity with ${modelConfig.provider}: ${error instanceof Error ? error.message : "Unknown error"}`);
    }
    // User override lists are applied on top of the model's detections.
    const finalProfanity = applyOverrideLists(detectedProfanity, alwaysCensor, neverCensor);
    const { censoredVtt, replacements: censorReplacements } = censorVttContent(editedVtt, finalProfanity, mode);
    editedVtt = censoredVtt;
    totalReplacementCount += censorReplacements.length;
    autoCensorResult = { replacements: censorReplacements };
  }
  // --- Pass 2: static find/replace rules (optional, runs after censoring) --
  let replacementsResult;
  if (replacementsOption && replacementsOption.length > 0) {
    const { editedVtt: afterReplacements, replacements: staticReplacements } = applyReplacements(editedVtt, replacementsOption);
    editedVtt = afterReplacements;
    totalReplacementCount += staticReplacements.length;
    replacementsResult = { replacements: staticReplacements };
  }
  // Attach asset duration to usage for downstream billing/metrics.
  const usageWithMetadata = usage ? {
    ...usage,
    metadata: {
      assetDurationSeconds
    }
  } : void 0;
  // --- Early return when the caller only wants the edited VTT -------------
  if (!uploadToMux) {
    return {
      assetId,
      trackId,
      originalVtt: vttContent,
      editedVtt,
      totalReplacementCount,
      autoCensorProfanity: autoCensorResult,
      replacements: replacementsResult,
      usage: usageWithMetadata
    };
  }
  // --- Upload edited VTT to S3 and attach it to the asset ------------------
  let presignedUrl;
  try {
    presignedUrl = await uploadEditedVttToS3({
      editedVtt,
      assetId,
      trackId,
      s3Endpoint,
      s3Region,
      s3Bucket,
      storageAdapter,
      s3SignedUrlExpirySeconds: options.s3SignedUrlExpirySeconds
    });
  } catch (error) {
    throw new Error(`Failed to upload VTT to S3: ${error instanceof Error ? error.message : "Unknown error"}`);
  }
  let uploadedTrackId;
  try {
    const languageCode = sourceTrack.language_code || "en";
    const suffix = trackNameSuffix ?? "edited";
    const trackName = `${sourceTrack.name || "Subtitles"} (${suffix})`;
    uploadedTrackId = await createTextTrackOnMux(
      assetId,
      languageCode,
      trackName,
      presignedUrl,
      credentials
    );
  } catch (error) {
    // Best-effort: the edited VTT is still returned even if attaching fails.
    console.warn(`Failed to add track to Mux asset: ${error instanceof Error ? error.message : "Unknown error"}`);
  }
  // Only delete the original once the replacement track exists.
  if (deleteOriginal && uploadedTrackId) {
    try {
      await deleteTrackOnMux(assetId, trackId, credentials);
    } catch (error) {
      console.warn(`Failed to delete original track: ${error instanceof Error ? error.message : "Unknown error"}`);
    }
  }
  return {
    assetId,
    trackId,
    originalVtt: vttContent,
    editedVtt,
    totalReplacementCount,
    autoCensorProfanity: autoCensorResult,
    replacements: replacementsResult,
    uploadedTrackId,
    presignedUrl,
    usage: usageWithMetadata
  };
}
2564
- async function generateEmbeddings(assetId, options = {}) {
2565
- "use workflow";
2566
- return generateEmbeddingsInternal(assetId, options);
2567
- }
2568
- async function generateVideoEmbeddings(assetId, options = {}) {
2569
- "use workflow";
2570
- console.warn("generateVideoEmbeddings is deprecated. Use generateEmbeddings instead.");
2571
- return generateEmbeddingsInternal(assetId, options);
2572
- }
2573
2843
 
2574
- // src/lib/sampling-plan.ts
2575
- var DEFAULT_FPS = 30;
2576
- function roundToNearestFrameMs(tsMs, fps = DEFAULT_FPS) {
2577
- const frameMs = 1e3 / fps;
2578
- return Math.round(Math.round(tsMs / frameMs) * frameMs * 100) / 100;
2844
+ // src/workflows/embeddings.ts
2845
+ import { embed } from "ai";
2846
+
2847
+ // src/primitives/text-chunking.ts
2848
// Tunables for duration-based VTT cue chunking (see chunkVTTCuesByDuration).
// Default minimum chunk duration as a fraction of the target duration.
var DEFAULT_MIN_CHUNK_DURATION_RATIO = 2 / 3;
// How many cues past the target to scan for a better boundary.
var DEFAULT_BOUNDARY_LOOKAHEAD_CUES = 12;
// Inter-cue silence (seconds) treated as a natural pause.
var DEFAULT_BOUNDARY_PAUSE_SECONDS = 1.25;
// Boundary score at or above which a break is taken immediately.
var STRONG_BOUNDARY_SCORE = 4;
// Window (seconds) before the target where boundaries are preferred.
var PREFERRED_BOUNDARY_WINDOW_SECONDS = 5 * 60;
// Cue text ending with sentence-final punctuation (optionally followed by closers).
var SENTENCE_BOUNDARY_REGEX = /[.!?]["')\]]*$/;
// Cue text ending with clause punctuation (weaker boundary evidence).
var CLAUSE_BOUNDARY_REGEX = /[,;:]["')\]]*$/;
// Next cue opening like a new sentence (capital, digit, or opening punctuation).
var NEXT_SENTENCE_START_REGEX = /^[A-Z0-9"'([{]/;
2856
function estimateTokenCount(text) {
  // Heuristic token estimate: whitespace-delimited word count at ~0.75
  // words per token. NOTE: blank input still splits to one empty "word",
  // so it estimates 2 tokens rather than 0 — callers guard blank text.
  return Math.ceil(text.trim().split(/\s+/).length / 0.75);
}
2580
- function planSamplingTimestamps(options) {
2581
- const DEFAULT_MIN_CANDIDATES = 10;
2582
- const DEFAULT_MAX_CANDIDATES = 30;
2583
- const {
2584
- duration_sec,
2585
- min_candidates = DEFAULT_MIN_CANDIDATES,
2586
- max_candidates = DEFAULT_MAX_CANDIDATES,
2587
- trim_start_sec = 1,
2588
- trim_end_sec = 1,
2589
- fps = DEFAULT_FPS,
2590
- base_cadence_hz,
2591
- anchor_percents = [0.2, 0.5, 0.8],
2592
- anchor_window_sec = 1.5
2593
- } = options;
2594
- const usableSec = Math.max(0, duration_sec - (trim_start_sec + trim_end_sec));
2595
- if (usableSec <= 0)
2860
function chunkByTokens(text, maxTokens, overlapTokens = 0) {
  // Blank input produces no chunks.
  const normalized = text.trim();
  if (!normalized) {
    return [];
  }
  const words = normalized.split(/\s+/);
  // Convert token budgets to word counts (~0.75 words per token).
  const wordBudget = Math.floor(maxTokens * 0.75);
  const overlapWidth = Math.floor(overlapTokens * 0.75);
  const step = wordBudget - overlapWidth;
  const chunks = [];
  for (let start = 0, index = 0; start < words.length; start += step, index += 1) {
    const chunkBody = words.slice(start, start + wordBudget).join(" ");
    chunks.push({
      id: `chunk-${index}`,
      text: chunkBody,
      tokenCount: estimateTokenCount(chunkBody)
    });
    // A non-positive step would never advance: emit a single chunk and stop.
    if (step <= 0) {
      break;
    }
  }
  return chunks;
}
2633
-
2634
- // src/primitives/thumbnails.ts
2635
- async function getThumbnailUrls(playbackId, duration, options = {}) {
2636
- "use step";
2637
- const { interval = 10, width = 640, shouldSign = false, maxSamples, credentials } = options;
2638
- let timestamps = [];
2639
- if (duration <= 50) {
2640
- const spacing = duration / 6;
2641
- for (let i = 1; i <= 5; i++) {
2642
- timestamps.push(Math.round(i * spacing));
2643
- }
2644
- } else {
2645
- for (let time = 0; time < duration; time += interval) {
2646
- timestamps.push(time);
2890
function createChunkFromCues(cues, index) {
  // Build a chunk record spanning from the first cue's start time to the
  // last cue's end time, with all cue texts joined by single spaces.
  const combinedText = cues.map((cue) => cue.text).join(" ");
  const lastCue = cues[cues.length - 1];
  return {
    id: `chunk-${index}`,
    text: combinedText,
    tokenCount: estimateTokenCount(combinedText),
    startTime: cues[0].startTime,
    endTime: lastCue.endTime
  };
}
2900
function chunkVTTCues(cues, maxTokens, overlapCues = 2) {
  // Greedily pack cues into chunks up to maxTokens, carrying the last
  // `overlapCues` cues into the next chunk for context continuity.
  if (cues.length === 0) {
    return [];
  }
  const chunks = [];
  let pending = [];
  let pendingTokens = 0;
  let nextIndex = 0;
  for (const cue of cues) {
    const cueTokens = estimateTokenCount(cue.text);
    // Flush the current chunk before this cue would push it over budget.
    if (pendingTokens + cueTokens > maxTokens && pending.length > 0) {
      chunks.push(createChunkFromCues(pending, nextIndex));
      nextIndex += 1;
      // Seed the next chunk with the tail cues of the one just emitted.
      pending = pending.slice(Math.max(0, pending.length - overlapCues));
      pendingTokens = pending.reduce(
        (total, carried) => total + estimateTokenCount(carried.text),
        0
      );
    }
    pending.push(cue);
    pendingTokens += cueTokens;
  }
  if (pending.length > 0) {
    chunks.push(createChunkFromCues(pending, nextIndex));
  }
  return chunks;
}
2668
-
2669
- // src/workflows/moderation.ts
2670
- var DEFAULT_THRESHOLDS = {
2671
- sexual: 0.8,
2672
- violence: 0.8
2673
- };
2674
- var DEFAULT_PROVIDER2 = "openai";
2675
- var HIVE_ENDPOINT = "https://api.thehive.ai/api/v2/task/sync";
2676
- var HIVE_SEXUAL_CATEGORIES = [
2677
- "general_nsfw",
2678
- "yes_sexual_activity",
2679
- "yes_sex_toy",
2680
- "yes_female_nudity",
2681
- "yes_male_nudity"
2682
- ];
2683
- var HIVE_VIOLENCE_CATEGORIES = [
2684
- "gun_in_hand",
2685
- "gun_not_in_hand",
2686
- "knife_in_hand",
2687
- "very_bloody",
2688
- "other_blood",
2689
- "hanging",
2690
- "noose",
2691
- "human_corpse",
2692
- "yes_emaciated_body",
2693
- "yes_self_harm",
2694
- "garm_death_injury_or_military_conflict"
2695
- ];
2696
- async function processConcurrently(items, processor, maxConcurrent = 5) {
2697
- "use step";
2698
- const results = [];
2699
- for (let i = 0; i < items.length; i += maxConcurrent) {
2700
- const batch = items.slice(i, i + maxConcurrent);
2701
- const batchPromises = batch.map(processor);
2702
- const batchResults = await Promise.all(batchPromises);
2703
- results.push(...batchResults);
2928
+ function scoreCueBoundary(cues, index, boundaryPauseSeconds) {
2929
+ const cue = cues[index];
2930
+ const nextCue = cues[index + 1];
2931
+ if (!nextCue) {
2932
+ return Number.POSITIVE_INFINITY;
2704
2933
  }
2705
- return results;
2706
- }
2707
- async function moderateImageWithOpenAI(entry) {
2708
- "use step";
2709
- const apiKey = await getApiKeyFromEnv("openai", entry.credentials);
2710
- try {
2711
- const res = await fetch("https://api.openai.com/v1/moderations", {
2712
- method: "POST",
2713
- headers: {
2714
- "Content-Type": "application/json",
2715
- "Authorization": `Bearer ${apiKey}`
2716
- },
2717
- body: JSON.stringify({
2718
- model: entry.model,
2719
- input: [
2720
- {
2721
- type: "image_url",
2722
- image_url: {
2723
- url: entry.image
2724
- }
2725
- }
2726
- ]
2727
- })
2728
- });
2729
- const json = await res.json();
2730
- if (!res.ok) {
2731
- throw new Error(
2732
- `OpenAI moderation error: ${res.status} ${res.statusText} - ${JSON.stringify(json)}`
2733
- );
2734
- }
2735
- const categoryScores = json.results?.[0]?.category_scores || {};
2736
- return {
2737
- url: entry.url,
2738
- time: entry.time,
2739
- sexual: categoryScores.sexual || 0,
2740
- violence: categoryScores.violence || 0,
2741
- error: false
2742
- };
2743
- } catch (error) {
2744
- console.error("OpenAI moderation failed:", error);
2745
- return {
2746
- url: entry.url,
2747
- time: entry.time,
2748
- sexual: 0,
2749
- violence: 0,
2750
- error: true,
2751
- errorMessage: error instanceof Error ? error.message : String(error)
2752
- };
2934
+ const trimmedText = cue.text.trim();
2935
+ let score = 0;
2936
+ if (SENTENCE_BOUNDARY_REGEX.test(trimmedText)) {
2937
+ score += 4;
2938
+ } else if (CLAUSE_BOUNDARY_REGEX.test(trimmedText)) {
2939
+ score += 2;
2753
2940
  }
2754
- }
2755
- async function requestOpenAIModeration(images, model, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
2756
- "use step";
2757
- const imageUrls = images.map((img) => img.url);
2758
- const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
2759
- const targetUrls = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map(
2760
- (img) => ({ url: img.url, time: timeByUrl.get(img.url), image: img.base64Data, model, credentials })
2761
- ) : images.map((img) => ({ url: img.url, time: img.time, image: img.url, model, credentials }));
2762
- return processConcurrently(targetUrls, moderateImageWithOpenAI, maxConcurrent);
2763
- }
2764
- async function requestOpenAITextModeration(text, model, url, credentials) {
2765
- "use step";
2766
- const apiKey = await getApiKeyFromEnv("openai", credentials);
2767
- try {
2768
- const res = await fetch("https://api.openai.com/v1/moderations", {
2769
- method: "POST",
2770
- headers: {
2771
- "Content-Type": "application/json",
2772
- "Authorization": `Bearer ${apiKey}`
2773
- },
2774
- body: JSON.stringify({
2775
- model,
2776
- input: text
2777
- })
2778
- });
2779
- const json = await res.json();
2780
- if (!res.ok) {
2781
- throw new Error(
2782
- `OpenAI moderation error: ${res.status} ${res.statusText} - ${JSON.stringify(json)}`
2783
- );
2784
- }
2785
- const categoryScores = json.results?.[0]?.category_scores || {};
2786
- return {
2787
- url,
2788
- sexual: categoryScores.sexual || 0,
2789
- violence: categoryScores.violence || 0,
2790
- error: false
2791
- };
2792
- } catch (error) {
2793
- console.error("OpenAI text moderation failed:", error);
2794
- return {
2795
- url,
2796
- sexual: 0,
2797
- violence: 0,
2798
- error: true,
2799
- errorMessage: error instanceof Error ? error.message : String(error)
2800
- };
2941
+ if (nextCue.startTime - cue.endTime >= boundaryPauseSeconds) {
2942
+ score += 2;
2943
+ }
2944
+ if (NEXT_SENTENCE_START_REGEX.test(nextCue.text.trim())) {
2945
+ score += 1;
2801
2946
  }
2947
+ return score;
2802
2948
  }
2803
- function chunkTextByUtf16CodeUnits(text, maxUnits) {
2804
- if (!text.trim()) {
2949
function chunkVTTCuesByBudget(cues, options) {
  // Split cues into index-range chunks limited by a cue-count budget and an
  // optional token budget. Every chunk contains at least one cue.
  if (cues.length === 0) {
    return [];
  }
  const cueLimit = Math.max(1, options.maxCuesPerChunk);
  // A falsy maxTextTokensPerChunk (unset or 0) disables the token budget.
  const tokenLimit = options.maxTextTokensPerChunk
    ? Math.max(1, options.maxTextTokensPerChunk)
    : Number.POSITIVE_INFINITY;
  const chunks = [];
  const emit = (startIndex, endIndex) => {
    chunks.push({
      id: `chunk-${chunks.length}`,
      cueStartIndex: startIndex,
      cueEndIndex: endIndex,
      cueCount: endIndex - startIndex + 1,
      startTime: cues[startIndex].startTime,
      endTime: cues[endIndex].endTime
    });
  };
  let startIndex = 0;
  let tokensSoFar = 0;
  cues.forEach((cue, index) => {
    const cueTokens = estimateTokenCount(cue.text);
    const sizeSoFar = index - startIndex;
    const hitCueLimit = sizeSoFar >= cueLimit;
    // The token budget only closes a chunk that already holds a cue.
    const hitTokenLimit = sizeSoFar > 0 && tokensSoFar + cueTokens > tokenLimit;
    if (hitCueLimit || hitTokenLimit) {
      emit(startIndex, index - 1);
      startIndex = index;
      tokensSoFar = 0;
    }
    tokensSoFar += cueTokens;
  });
  // Flush the trailing chunk.
  emit(startIndex, cues.length - 1);
  return chunks;
}
2819
- async function requestOpenAITranscriptModeration(transcriptText, model, maxConcurrent = 5, credentials) {
2820
- "use step";
2821
- const chunks = chunkTextByUtf16CodeUnits(transcriptText, 1e4);
2822
- if (!chunks.length) {
2823
- return [
2824
- { url: "transcript:0", sexual: 0, violence: 0, error: true, errorMessage: "No transcript chunks to moderate" }
2825
- ];
2993
// Splits cues into chunks of roughly targetChunkDurationSeconds each,
// preferring to break at natural boundaries (sentence ends, pauses) scored
// by scoreCueBoundary, while never exceeding maxChunkDurationSeconds and
// never going below the (derived) minimum duration except for the tail.
// Returns index-range chunk records ({ cueStartIndex, cueEndIndex, ... }).
function chunkVTTCuesByDuration(cues, options) {
  if (cues.length === 0) {
    return [];
  }
  // Normalize the duration knobs: min <= target <= max, all at least 1s.
  const targetChunkDurationSeconds = Math.max(1, options.targetChunkDurationSeconds);
  const maxChunkDurationSeconds = Math.max(targetChunkDurationSeconds, options.maxChunkDurationSeconds);
  const minChunkDurationSeconds = Math.min(
    targetChunkDurationSeconds,
    Math.max(
      1,
      options.minChunkDurationSeconds ?? Math.floor(targetChunkDurationSeconds * DEFAULT_MIN_CHUNK_DURATION_RATIO)
    )
  );
  const boundaryLookaheadCues = Math.max(1, options.boundaryLookaheadCues ?? DEFAULT_BOUNDARY_LOOKAHEAD_CUES);
  const boundaryPauseSeconds = options.boundaryPauseSeconds ?? DEFAULT_BOUNDARY_PAUSE_SECONDS;
  // Boundaries at or beyond this offset into the chunk are "preferred".
  const preferredBoundaryStartSeconds = Math.max(
    minChunkDurationSeconds,
    targetChunkDurationSeconds - Math.min(PREFERRED_BOUNDARY_WINDOW_SECONDS, targetChunkDurationSeconds / 6)
  );
  const chunks = [];
  let chunkIndex = 0;
  let cueStartIndex = 0;
  while (cueStartIndex < cues.length) {
    const chunkStartTime = cues[cueStartIndex].startTime;
    let cueEndIndex = cueStartIndex;
    // Best boundary seen anywhere past the minimum duration, and best seen
    // inside the preferred window near the target duration.
    let bestBoundaryIndex = -1;
    let bestBoundaryScore = -1;
    let bestPreferredBoundaryIndex = -1;
    let bestPreferredBoundaryScore = -1;
    while (cueEndIndex < cues.length) {
      const cue = cues[cueEndIndex];
      const currentDuration = cue.endTime - chunkStartTime;
      // Track boundary candidates only once the chunk is long enough.
      if (currentDuration >= minChunkDurationSeconds) {
        const boundaryScore = scoreCueBoundary(cues, cueEndIndex, boundaryPauseSeconds);
        if (boundaryScore >= bestBoundaryScore) {
          bestBoundaryIndex = cueEndIndex;
          bestBoundaryScore = boundaryScore;
        }
        if (currentDuration >= preferredBoundaryStartSeconds && boundaryScore >= bestPreferredBoundaryScore) {
          bestPreferredBoundaryIndex = cueEndIndex;
          bestPreferredBoundaryScore = boundaryScore;
        }
      }
      const nextCue = cues[cueEndIndex + 1];
      if (!nextCue) {
        break;
      }
      const nextDuration = nextCue.endTime - chunkStartTime;
      const lookaheadExceeded = cueEndIndex - cueStartIndex >= boundaryLookaheadCues;
      // Fall back to the overall best boundary when none landed in the
      // preferred window.
      const preferredBoundaryIndex = bestPreferredBoundaryIndex >= cueStartIndex ? bestPreferredBoundaryIndex : bestBoundaryIndex;
      const preferredBoundaryScore = bestPreferredBoundaryIndex >= cueStartIndex ? bestPreferredBoundaryScore : bestBoundaryScore;
      if (currentDuration >= targetChunkDurationSeconds) {
        // Past the target: take a strong boundary immediately…
        if (preferredBoundaryIndex >= cueStartIndex && preferredBoundaryScore >= STRONG_BOUNDARY_SCORE) {
          cueEndIndex = preferredBoundaryIndex;
          break;
        }
        // …or stop when continuing would bust the max or the lookahead.
        if (nextDuration > maxChunkDurationSeconds || lookaheadExceeded) {
          cueEndIndex = preferredBoundaryIndex >= cueStartIndex ? preferredBoundaryIndex : cueEndIndex;
          break;
        }
      }
      // Below target but the next cue would exceed the hard maximum: cut now.
      if (nextDuration > maxChunkDurationSeconds) {
        cueEndIndex = preferredBoundaryIndex >= cueStartIndex ? preferredBoundaryIndex : cueEndIndex;
        break;
      }
      cueEndIndex++;
    }
    chunks.push({
      id: `chunk-${chunkIndex}`,
      cueStartIndex,
      cueEndIndex,
      cueCount: cueEndIndex - cueStartIndex + 1,
      startTime: cues[cueStartIndex].startTime,
      endTime: cues[cueEndIndex].endTime
    });
    cueStartIndex = cueEndIndex + 1;
    chunkIndex++;
  }
  return chunks;
}
2850
- async function moderateImageWithHive(entry) {
2851
- "use step";
2852
- const apiKey = await getApiKeyFromEnv("hive", entry.credentials);
2853
- try {
2854
- const formData = new FormData();
2855
- if (entry.source.kind === "url") {
2856
- formData.append("url", entry.source.value);
2857
- } else {
2858
- const extension = entry.source.contentType.split("/")[1] || "jpg";
2859
- const blob = new Blob([entry.source.buffer], {
2860
- type: entry.source.contentType
2861
- });
2862
- formData.append("media", blob, `thumbnail.${extension}`);
3073
function chunkText(text, strategy) {
  // Dispatch on strategy.type; only token-based chunking is implemented.
  if (strategy.type === "token") {
    return chunkByTokens(text, strategy.maxTokens, strategy.overlap ?? 0);
  }
  // Exhaustiveness guard (compile-time check in the TS source); any other
  // strategy type is rejected at runtime.
  const exhaustiveCheck = strategy;
  throw new Error(`Unsupported chunking strategy: ${exhaustiveCheck}`);
}
3084
+
3085
+ // src/workflows/embeddings.ts
3086
+ function averageEmbeddings(embeddings) {
3087
+ if (embeddings.length === 0) {
3088
+ return [];
3089
+ }
3090
+ const dimensions = embeddings[0].length;
3091
+ const averaged = Array.from({ length: dimensions }, () => 0);
3092
+ for (const embedding of embeddings) {
3093
+ for (let i = 0; i < dimensions; i++) {
3094
+ averaged[i] += embedding[i];
2890
3095
  }
2891
- if (json?.return_code != null && json.return_code !== 0) {
2892
- throw new Error(
2893
- `Hive API error (return_code ${json.return_code}): ${json.message || "Unknown error"}`
2894
- );
2895
- }
2896
- const classes = json?.status?.[0]?.response?.output?.[0]?.classes;
2897
- if (!Array.isArray(classes)) {
2898
- throw new TypeError(
2899
- `Unexpected Hive response structure: ${JSON.stringify(json)}`
2900
- );
2901
- }
2902
- const sexual = getHiveCategoryScores(classes, HIVE_SEXUAL_CATEGORIES);
2903
- const violence = getHiveCategoryScores(classes, HIVE_VIOLENCE_CATEGORIES);
2904
- return {
2905
- url: entry.url,
2906
- time: entry.time,
2907
- sexual,
2908
- violence,
2909
- error: false
2910
- };
2911
- } catch (error) {
2912
- return {
2913
- url: entry.url,
2914
- time: entry.time,
2915
- sexual: 0,
2916
- violence: 0,
2917
- error: true,
2918
- errorMessage: error instanceof Error ? error.message : String(error)
2919
- };
2920
3096
  }
3097
+ for (let i = 0; i < dimensions; i++) {
3098
+ averaged[i] /= embeddings.length;
3099
+ }
3100
+ return averaged;
2921
3101
  }
2922
- async function requestHiveModeration(images, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
2923
- "use step";
2924
- const imageUrls = images.map((img) => img.url);
2925
- const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
2926
- const targets = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map((img) => ({
2927
- url: img.url,
2928
- time: timeByUrl.get(img.url),
2929
- source: {
2930
- kind: "file",
2931
- buffer: img.buffer,
2932
- contentType: img.contentType
2933
- },
2934
- credentials
2935
- })) : images.map((img) => ({
2936
- url: img.url,
2937
- time: img.time,
2938
- source: { kind: "url", value: img.url },
2939
- credentials
2940
- }));
2941
- return await processConcurrently(targets, moderateImageWithHive, maxConcurrent);
2942
- }
2943
- async function getThumbnailUrlsFromTimestamps(playbackId, timestampsMs, options) {
3102
+ async function generateSingleChunkEmbedding({
3103
+ chunk,
3104
+ provider,
3105
+ modelId,
3106
+ credentials
3107
+ }) {
2944
3108
  "use step";
2945
- const { width, shouldSign, credentials } = options;
2946
- const baseUrl = getMuxThumbnailBaseUrl(playbackId);
2947
- const urlPromises = timestampsMs.map(async (tsMs) => {
2948
- const time = Number((tsMs / 1e3).toFixed(2));
2949
- const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
2950
- return { url, time };
2951
- });
2952
- return Promise.all(urlPromises);
3109
+ const model = await createEmbeddingModelFromConfig(provider, modelId, credentials);
3110
+ const response = await withRetry(
3111
+ () => embed({
3112
+ model,
3113
+ value: chunk.text
3114
+ })
3115
+ );
3116
+ return {
3117
+ chunkId: chunk.id,
3118
+ embedding: response.embedding,
3119
+ metadata: {
3120
+ startTime: chunk.startTime,
3121
+ endTime: chunk.endTime,
3122
+ tokenCount: chunk.tokenCount
3123
+ }
3124
+ };
2953
3125
  }
2954
- async function getModerationScores(assetId, options = {}) {
2955
- "use workflow";
3126
+ async function generateEmbeddingsInternal(assetId, options = {}) {
2956
3127
  const {
2957
- provider = DEFAULT_PROVIDER2,
2958
- model = provider === "openai" ? "omni-moderation-latest" : void 0,
3128
+ provider = "openai",
3129
+ model,
2959
3130
  languageCode,
2960
- thresholds = DEFAULT_THRESHOLDS,
2961
- thumbnailInterval = 10,
2962
- thumbnailWidth = 640,
2963
- maxSamples,
2964
- maxConcurrent = 5,
2965
- imageSubmissionMode = "url",
2966
- imageDownloadOptions,
2967
- credentials: providedCredentials
3131
+ chunkingStrategy = { type: "token", maxTokens: 500, overlap: 100 },
3132
+ batchSize = 5,
3133
+ credentials
2968
3134
  } = options;
2969
- const credentials = providedCredentials;
2970
- const { asset, playbackId, policy } = await getPlaybackIdForAsset(assetId, credentials);
2971
- const videoTrackDurationSeconds = getVideoTrackDurationSecondsFromAsset(asset);
2972
- const videoTrackFps = getVideoTrackMaxFrameRateFromAsset(asset);
2973
- const assetDurationSeconds = getAssetDurationSecondsFromAsset(asset);
2974
- const candidateDurations = [videoTrackDurationSeconds, assetDurationSeconds].filter(
2975
- (d) => d != null
2976
- );
2977
- const duration = candidateDurations.length > 0 ? Math.min(...candidateDurations) : 0;
2978
- const isAudioOnly = isAudioOnlyAsset(asset);
3135
+ const embeddingModel = resolveEmbeddingModelConfig({ ...options, provider, model });
3136
+ const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, credentials);
3137
+ const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
3138
+ const isAudioOnly = isAudioOnlyAsset(assetData);
2979
3139
  const signingContext = await resolveMuxSigningContext(credentials);
2980
3140
  if (policy === "signed" && !signingContext) {
2981
3141
  throw new Error(
2982
3142
  "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
2983
3143
  );
2984
3144
  }
2985
- let thumbnailScores;
2986
- let mode = "thumbnails";
2987
- let thumbnailCount;
2988
- if (isAudioOnly) {
2989
- mode = "transcript";
2990
- const readyTextTracks = getReadyTextTracks(asset);
2991
- let transcriptResult = await fetchTranscriptForAsset(asset, playbackId, {
2992
- languageCode,
2993
- cleanTranscript: true,
3145
+ const readyTextTracks = getReadyTextTracks(assetData);
3146
+ const useVttChunking = chunkingStrategy.type === "vtt";
3147
+ let transcriptResult = await fetchTranscriptForAsset(assetData, playbackId, {
3148
+ languageCode,
3149
+ cleanTranscript: !useVttChunking,
3150
+ shouldSign: policy === "signed",
3151
+ credentials
3152
+ });
3153
+ if (isAudioOnly && !transcriptResult.track && readyTextTracks.length === 1) {
3154
+ transcriptResult = await fetchTranscriptForAsset(assetData, playbackId, {
3155
+ cleanTranscript: !useVttChunking,
2994
3156
  shouldSign: policy === "signed",
2995
- credentials,
2996
- required: true
3157
+ credentials
2997
3158
  });
2998
- if (!transcriptResult.track && readyTextTracks.length === 1) {
2999
- transcriptResult = await fetchTranscriptForAsset(asset, playbackId, {
3000
- cleanTranscript: true,
3001
- shouldSign: policy === "signed",
3002
- credentials,
3003
- required: true
3004
- });
3005
- }
3006
- if (provider === "openai") {
3007
- thumbnailScores = await requestOpenAITranscriptModeration(
3008
- transcriptResult.transcriptText,
3009
- model || "omni-moderation-latest",
3010
- maxConcurrent,
3011
- credentials
3159
+ }
3160
+ if (!transcriptResult.track || !transcriptResult.transcriptText) {
3161
+ const availableLanguages = readyTextTracks.map((t) => t.language_code).filter(Boolean).join(", ");
3162
+ if (isAudioOnly) {
3163
+ throw new Error(
3164
+ `No transcript track found${languageCode ? ` for language '${languageCode}'` : ""}. Audio-only assets require a transcript. Available languages: ${availableLanguages || "none"}`
3012
3165
  );
3013
- } else if (provider === "hive") {
3014
- throw new Error("Hive does not support transcript moderation in this workflow. Use provider: 'openai' for audio-only assets.");
3015
- } else {
3016
- throw new Error(`Unsupported moderation provider: ${provider}`);
3017
3166
  }
3018
- } else {
3019
- const thumbnailUrls = maxSamples === void 0 ? (
3020
- // Generate thumbnail URLs (signed if needed) using existing interval-based logic.
3021
- await getThumbnailUrls(playbackId, duration, {
3022
- interval: thumbnailInterval,
3023
- width: thumbnailWidth,
3024
- shouldSign: policy === "signed",
3025
- credentials
3026
- })
3027
- ) : (
3028
- // In maxSamples mode, sample valid timestamps over the trimmed usable span.
3029
- // Use proportional trims (≈ duration/6, capped at 5s) to stay well inside the
3030
- // renderable range — Mux can't always serve thumbnails at the very edges.
3031
- await getThumbnailUrlsFromTimestamps(
3032
- playbackId,
3033
- planSamplingTimestamps({
3034
- duration_sec: duration,
3035
- max_candidates: maxSamples,
3036
- trim_start_sec: duration > 2 ? Math.min(5, Math.max(1, duration / 6)) : 0,
3037
- trim_end_sec: duration > 2 ? Math.min(5, Math.max(1, duration / 6)) : 0,
3038
- fps: videoTrackFps,
3039
- base_cadence_hz: thumbnailInterval > 0 ? 1 / thumbnailInterval : void 0
3040
- }),
3041
- {
3042
- width: thumbnailWidth,
3043
- shouldSign: policy === "signed",
3044
- credentials
3045
- }
3046
- )
3167
+ throw new Error(
3168
+ `No caption track found${languageCode ? ` for language '${languageCode}'` : ""}. Available languages: ${availableLanguages || "none"}`
3047
3169
  );
3048
- thumbnailCount = thumbnailUrls.length;
3049
- if (provider === "openai") {
3050
- thumbnailScores = await requestOpenAIModeration(
3051
- thumbnailUrls,
3052
- model || "omni-moderation-latest",
3053
- maxConcurrent,
3054
- imageSubmissionMode,
3055
- imageDownloadOptions,
3056
- credentials
3057
- );
3058
- } else if (provider === "hive") {
3059
- thumbnailScores = await requestHiveModeration(
3060
- thumbnailUrls,
3061
- maxConcurrent,
3062
- imageSubmissionMode,
3063
- imageDownloadOptions,
3064
- credentials
3170
+ }
3171
+ const transcriptText = transcriptResult.transcriptText;
3172
+ if (!transcriptText.trim()) {
3173
+ throw new Error("Transcript is empty");
3174
+ }
3175
+ const chunks = useVttChunking ? chunkVTTCues(
3176
+ parseVTTCues(transcriptText),
3177
+ chunkingStrategy.maxTokens,
3178
+ chunkingStrategy.overlapCues
3179
+ ) : chunkText(transcriptText, chunkingStrategy);
3180
+ if (chunks.length === 0) {
3181
+ throw new Error("No chunks generated from transcript");
3182
+ }
3183
+ const chunkEmbeddings = [];
3184
+ try {
3185
+ for (let i = 0; i < chunks.length; i += batchSize) {
3186
+ const batch = chunks.slice(i, i + batchSize);
3187
+ const batchResults = await Promise.all(
3188
+ batch.map(
3189
+ (chunk) => generateSingleChunkEmbedding({
3190
+ chunk,
3191
+ provider: embeddingModel.provider,
3192
+ modelId: embeddingModel.modelId,
3193
+ credentials
3194
+ })
3195
+ )
3065
3196
  );
3066
- } else {
3067
- throw new Error(`Unsupported moderation provider: ${provider}`);
3197
+ chunkEmbeddings.push(...batchResults);
3068
3198
  }
3069
- }
3070
- const failed = thumbnailScores.filter((s) => s.error);
3071
- if (failed.length > 0) {
3072
- const details = failed.map((s) => `${s.url}: ${s.errorMessage || "Unknown error"}`).join("; ");
3199
+ } catch (error) {
3073
3200
  throw new Error(
3074
- `Moderation failed for ${failed.length}/${thumbnailScores.length} thumbnail(s): ${details}`
3201
+ `Failed to generate embeddings with ${provider}: ${error instanceof Error ? error.message : "Unknown error"}`
3075
3202
  );
3076
3203
  }
3077
- const maxSexual = Math.max(...thumbnailScores.map((s) => s.sexual));
3078
- const maxViolence = Math.max(...thumbnailScores.map((s) => s.violence));
3079
- const finalThresholds = { ...DEFAULT_THRESHOLDS, ...thresholds };
3204
+ if (chunkEmbeddings.length === 0) {
3205
+ throw new Error("No embeddings generated");
3206
+ }
3207
+ const averagedEmbedding = averageEmbeddings(chunkEmbeddings.map((ce) => ce.embedding));
3208
+ const totalTokens = chunks.reduce((sum, chunk) => sum + chunk.tokenCount, 0);
3080
3209
  return {
3081
3210
  assetId,
3082
- mode,
3083
- isAudioOnly,
3084
- thumbnailScores,
3211
+ chunks: chunkEmbeddings,
3212
+ averagedEmbedding,
3213
+ provider,
3214
+ model: embeddingModel.modelId,
3215
+ metadata: {
3216
+ totalChunks: chunks.length,
3217
+ totalTokens,
3218
+ chunkingStrategy: JSON.stringify(chunkingStrategy),
3219
+ embeddingDimensions: chunkEmbeddings[0].embedding.length,
3220
+ generatedAt: (/* @__PURE__ */ new Date()).toISOString()
3221
+ },
3085
3222
  usage: {
3086
3223
  metadata: {
3087
- assetDurationSeconds: duration,
3088
- ...thumbnailCount === void 0 ? {} : { thumbnailCount }
3224
+ assetDurationSeconds
3089
3225
  }
3090
- },
3091
- maxScores: {
3092
- sexual: maxSexual,
3093
- violence: maxViolence
3094
- },
3095
- exceedsThreshold: maxSexual > finalThresholds.sexual || maxViolence > finalThresholds.violence,
3096
- thresholds: finalThresholds
3226
+ }
3097
3227
  };
3098
3228
  }
3229
+ async function generateEmbeddings(assetId, options = {}) {
3230
+ "use workflow";
3231
+ return generateEmbeddingsInternal(assetId, options);
3232
+ }
3233
+ async function generateVideoEmbeddings(assetId, options = {}) {
3234
+ "use workflow";
3235
+ console.warn("generateVideoEmbeddings is deprecated. Use generateEmbeddings instead.");
3236
+ return generateEmbeddingsInternal(assetId, options);
3237
+ }
3099
3238
 
3100
- // src/workflows/summarization.ts
3101
- import { generateText as generateText4, Output as Output4 } from "ai";
3102
- import dedent4 from "dedent";
3103
- import { z as z5 } from "zod";
3104
- var SUMMARY_KEYWORD_LIMIT = 10;
3105
- var summarySchema = z5.object({
3106
- keywords: z5.array(z5.string()),
3107
- title: z5.string(),
3108
- description: z5.string()
3109
- }).strict();
3110
- var SUMMARY_OUTPUT = Output4.object({
3111
- name: "summary_metadata",
3112
- description: "Structured summary with title, description, and keywords.",
3113
- schema: summarySchema
3114
- });
3115
- var VALID_TONES = ["neutral", "playful", "professional"];
3116
- var TONE_INSTRUCTIONS = {
3117
- neutral: "Provide a clear, straightforward analysis.",
3118
- playful: "Channel your inner diva! Answer with maximum sass, wit, and playful attitude. Don't hold back - be cheeky, clever, and delightfully snarky. Make it pop!",
3119
- professional: "Provide a professional, executive-level analysis suitable for business reporting."
3120
- };
3121
- function createSummarizationBuilder({ titleLength, descriptionLength, tagCount } = {}) {
3122
- const titleBrevity = titleLength != null ? `Aim for approximately ${titleLength} characters.` : "Aim for brevity - typically under 10 words.";
3123
- const descConstraint = descriptionLength != null ? `approximately ${descriptionLength} characters` : "2-4 sentences";
3124
- const keywordLimit = tagCount ?? SUMMARY_KEYWORD_LIMIT;
3125
- return createPromptBuilder({
3126
- template: {
3127
- task: {
3128
- tag: "task",
3129
- content: "Analyze the storyboard frames and generate metadata that captures the essence of the video content."
3130
- },
3131
- title: {
3132
- tag: "title_requirements",
3133
- content: dedent4`
3134
- A short, compelling headline that immediately communicates the subject or action.
3135
- ${titleBrevity} Think of how a news headline or video card title would read.
3136
- Start with the primary subject, action, or topic - never begin with "A video of" or similar phrasing.
3137
- Use active, specific language.`
3138
- },
3139
- description: {
3140
- tag: "description_requirements",
3141
- content: dedent4`
3142
- A concise summary (${descConstraint}) that describes what happens across the video.
3143
- Cover the main subjects, actions, setting, and any notable progression visible across frames.
3144
- Write in present tense. Be specific about observable details rather than making assumptions.
3145
- If the transcript provides dialogue or narration, incorporate key points but prioritize visual content.`
3146
- },
3147
- keywords: {
3148
- tag: "keywords_requirements",
3149
- content: dedent4`
3150
- Specific, searchable terms (up to ${keywordLimit}) that capture:
3151
- - Primary subjects (people, animals, objects)
3152
- - Actions and activities being performed
3153
- - Setting and environment
3154
- - Notable objects or tools
3155
- - Style or genre (if applicable)
3156
- Prefer concrete nouns and action verbs over abstract concepts.
3157
- Use lowercase. Avoid redundant or overly generic terms like "video" or "content".`
3158
- },
3159
- qualityGuidelines: {
3160
- tag: "quality_guidelines",
3161
- content: dedent4`
3162
- - Examine all frames to understand the full context and progression
3163
- - Be precise: "golden retriever" is better than "dog" when identifiable
3164
- - Capture the narrative: what begins, develops, and concludes
3165
- - Balance brevity with informativeness`
3239
+ // src/lib/sampling-plan.ts
3240
+ var DEFAULT_FPS = 30;
3241
+ function roundToNearestFrameMs(tsMs, fps = DEFAULT_FPS) {
3242
+ const frameMs = 1e3 / fps;
3243
+ return Math.round(Math.round(tsMs / frameMs) * frameMs * 100) / 100;
3244
+ }
3245
+ function planSamplingTimestamps(options) {
3246
+ const DEFAULT_MIN_CANDIDATES = 10;
3247
+ const DEFAULT_MAX_CANDIDATES = 30;
3248
+ const {
3249
+ duration_sec,
3250
+ min_candidates = DEFAULT_MIN_CANDIDATES,
3251
+ max_candidates = DEFAULT_MAX_CANDIDATES,
3252
+ trim_start_sec = 1,
3253
+ trim_end_sec = 1,
3254
+ fps = DEFAULT_FPS,
3255
+ base_cadence_hz,
3256
+ anchor_percents = [0.2, 0.5, 0.8],
3257
+ anchor_window_sec = 1.5
3258
+ } = options;
3259
+ const usableSec = Math.max(0, duration_sec - (trim_start_sec + trim_end_sec));
3260
+ if (usableSec <= 0)
3261
+ return [];
3262
+ const cadenceHz = base_cadence_hz ?? (duration_sec < 15 ? 3 : duration_sec < 60 ? 2 : duration_sec < 180 ? 1.5 : 1);
3263
+ let target = Math.round(usableSec * cadenceHz);
3264
+ target = Math.max(min_candidates, Math.min(max_candidates, target));
3265
+ const stepSec = usableSec / target;
3266
+ const t0 = trim_start_sec;
3267
+ const base = [];
3268
+ for (let i = 0; i < target; i++) {
3269
+ const tsSec = t0 + (i + 0.5) * stepSec;
3270
+ base.push(tsSec * 1e3);
3271
+ }
3272
+ const slack = Math.max(0, max_candidates - base.length);
3273
+ const extra = [];
3274
+ if (slack > 0 && anchor_percents.length > 0) {
3275
+ const perAnchor = Math.max(1, Math.min(5, Math.floor(slack / anchor_percents.length)));
3276
+ for (const p of anchor_percents) {
3277
+ const centerSec = Math.min(
3278
+ t0 + usableSec - 1e-3,
3279
+ // nudge just inside the end bound
3280
+ Math.max(t0 + 1e-3, duration_sec * p)
3281
+ // nudge just inside the start bound
3282
+ );
3283
+ const startSec = Math.max(t0, centerSec - anchor_window_sec / 2);
3284
+ const endSec = Math.min(t0 + usableSec, centerSec + anchor_window_sec / 2);
3285
+ if (endSec <= startSec)
3286
+ continue;
3287
+ const wStep = (endSec - startSec) / perAnchor;
3288
+ for (let i = 0; i < perAnchor; i++) {
3289
+ const tsSec = startSec + (i + 0.5) * wStep;
3290
+ extra.push(tsSec * 1e3);
3166
3291
  }
3167
- },
3168
- sectionOrder: ["task", "title", "description", "keywords", "qualityGuidelines"]
3169
- });
3292
+ }
3293
+ }
3294
+ const all = base.concat(extra).map((ms) => roundToNearestFrameMs(ms, fps)).filter((ms) => ms >= trim_start_sec * 1e3 && ms <= (duration_sec - trim_end_sec) * 1e3);
3295
+ const uniqSorted = Array.from(new Set(all)).sort((a, b) => a - b);
3296
+ return uniqSorted.slice(0, max_candidates);
3170
3297
  }
3171
- function createAudioOnlyBuilder({ titleLength, descriptionLength, tagCount } = {}) {
3172
- const titleBrevity = titleLength != null ? `Aim for approximately ${titleLength} characters.` : "Aim for brevity - typically under 10 words.";
3173
- const descConstraint = descriptionLength != null ? `approximately ${descriptionLength} characters` : "2-4 sentences";
3174
- const keywordLimit = tagCount ?? SUMMARY_KEYWORD_LIMIT;
3175
- return createPromptBuilder({
3176
- template: {
3177
- task: {
3178
- tag: "task",
3179
- content: "Analyze the transcript and generate metadata that captures the essence of the audio content."
3180
- },
3181
- title: {
3182
- tag: "title_requirements",
3183
- content: dedent4`
3184
- A short, compelling headline that immediately communicates the subject or topic.
3185
- ${titleBrevity} Think of how a podcast title or audio description would read.
3186
- Start with the primary subject, action, or topic - never begin with "An audio of" or similar phrasing.
3187
- Use active, specific language.`
3188
- },
3189
- description: {
3190
- tag: "description_requirements",
3191
- content: dedent4`
3192
- A concise summary (${descConstraint}) that describes the audio content.
3193
- Cover the main topics, speakers, themes, and any notable progression in the discussion or narration.
3194
- Write in present tense. Be specific about what is discussed or presented rather than making assumptions.
3195
- Focus on the spoken content and any key insights, dialogue, or narrative elements.`
3196
- },
3197
- keywords: {
3198
- tag: "keywords_requirements",
3199
- content: dedent4`
3200
- Specific, searchable terms (up to ${keywordLimit}) that capture:
3201
- - Primary topics and themes
3202
- - Speakers or presenters (if named)
3203
- - Key concepts and terminology
3204
- - Content type (interview, lecture, music, etc.)
3205
- - Genre or style (if applicable)
3206
- Prefer concrete nouns and relevant terms over abstract concepts.
3207
- Use lowercase. Avoid redundant or overly generic terms like "audio" or "content".`
3208
- },
3209
- qualityGuidelines: {
3210
- tag: "quality_guidelines",
3211
- content: dedent4`
3212
- - Analyze the full transcript to understand context and themes
3213
- - Be precise: use specific terminology when mentioned
3214
- - Capture the narrative: what is introduced, discussed, and concluded
3215
- - Balance brevity with informativeness`
3298
+
3299
+ // src/primitives/thumbnails.ts
3300
+ async function getThumbnailUrls(playbackId, duration, options = {}) {
3301
+ "use step";
3302
+ const { interval = 10, width = 640, shouldSign = false, maxSamples, credentials } = options;
3303
+ let timestamps = [];
3304
+ if (duration <= 50) {
3305
+ const spacing = duration / 6;
3306
+ for (let i = 1; i <= 5; i++) {
3307
+ timestamps.push(Math.round(i * spacing));
3308
+ }
3309
+ } else {
3310
+ for (let time = 0; time < duration; time += interval) {
3311
+ timestamps.push(time);
3312
+ }
3313
+ }
3314
+ if (maxSamples !== void 0 && timestamps.length > maxSamples) {
3315
+ const newTimestamps = [];
3316
+ newTimestamps.push(0);
3317
+ if (maxSamples >= 2) {
3318
+ const spacing = duration / (maxSamples - 1);
3319
+ for (let i = 1; i < maxSamples - 1; i++) {
3320
+ newTimestamps.push(spacing * i);
3216
3321
  }
3217
- },
3218
- sectionOrder: ["task", "title", "description", "keywords", "qualityGuidelines"]
3322
+ newTimestamps.push(duration);
3323
+ }
3324
+ timestamps = newTimestamps;
3325
+ }
3326
+ const baseUrl = getMuxThumbnailBaseUrl(playbackId);
3327
+ const urlPromises = timestamps.map(async (time) => {
3328
+ const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
3329
+ return { url, time };
3219
3330
  });
3331
+ return Promise.all(urlPromises);
3220
3332
  }
3221
- var SYSTEM_PROMPT3 = dedent4`
3222
- <role>
3223
- You are a video content analyst specializing in storyboard interpretation and multimodal analysis.
3224
- </role>
3225
-
3226
- <context>
3227
- You receive storyboard images containing multiple sequential frames extracted from a video.
3228
- These frames are arranged in a grid and represent the visual progression of the content over time.
3229
- Read frames left-to-right, top-to-bottom to understand the temporal sequence.
3230
- </context>
3231
-
3232
- <transcript_guidance>
3233
- When a transcript is provided alongside the storyboard:
3234
- - Use it to understand spoken content, dialogue, narration, and audio context
3235
- - Correlate transcript content with visual frames to build a complete picture
3236
- - Extract key terminology, names, and specific language used by speakers
3237
- - Let the transcript inform keyword selection, especially for topics not visually obvious
3238
- - Prioritize visual content for the description, but enrich it with transcript insights
3239
- - If transcript and visuals conflict, trust the visual evidence
3240
- </transcript_guidance>
3241
-
3242
- <capabilities>
3243
- - Extract meaning from visual sequences
3244
- - Identify subjects, actions, settings, and narrative arcs
3245
- - Generate accurate, searchable metadata
3246
- - Synthesize visual and transcript information when provided
3247
- </capabilities>
3248
-
3249
- <constraints>
3250
- - Only describe what is clearly observable in the frames or explicitly stated in the transcript
3251
- - Do not fabricate details or make unsupported assumptions
3252
- - Return structured data matching the requested schema
3253
- - Output only the JSON object; no markdown or extra text
3254
- - When a <language> section is provided, all output text MUST be written in that language
3255
- </constraints>
3256
-
3257
- <tone_guidance>
3258
- Pay special attention to the <tone> section and lean heavily into those instructions.
3259
- Adapt your entire analysis and writing style to match the specified tone - this should influence
3260
- your word choice, personality, formality level, and overall presentation of the content.
3261
- The tone instructions are not suggestions but core requirements for how you should express yourself.
3262
- </tone_guidance>
3263
-
3264
- <language_guidelines>
3265
- AVOID these meta-descriptive phrases that reference the medium rather than the content:
3266
- - "The image shows..." / "The storyboard shows..."
3267
- - "In this video..." / "This video features..."
3268
- - "The frames depict..." / "The footage shows..."
3269
- - "We can see..." / "You can see..."
3270
- - "The clip shows..." / "The scene shows..."
3271
-
3272
- INSTEAD, describe the content directly:
3273
- - BAD: "The video shows a chef preparing a meal"
3274
- - GOOD: "A chef prepares a meal in a professional kitchen"
3275
3333
 
3276
- Write as if describing reality, not describing a recording of reality.
3277
- </language_guidelines>`;
3278
- var AUDIO_ONLY_SYSTEM_PROMPT = dedent4`
3279
- <role>
3280
- You are an audio content analyst specializing in transcript analysis and metadata generation.
3281
- </role>
3282
-
3283
- <context>
3284
- You receive transcript text from audio-only content (podcasts, audiobooks, music, etc.).
3285
- Your task is to analyze the spoken/audio content and generate accurate, searchable metadata.
3286
- </context>
3287
-
3288
- <transcript_guidance>
3289
- - Carefully analyze the entire transcript to understand themes, topics, and key points
3290
- - Extract key terminology, names, concepts, and specific language used
3291
- - Identify the content type (interview, lecture, music, narration, etc.)
3292
- - Note the tone, style, and any distinctive characteristics of the audio
3293
- - Consider the intended audience and context based on language and content
3294
- </transcript_guidance>
3295
-
3296
- <capabilities>
3297
- - Extract meaning and themes from spoken/audio content
3298
- - Identify subjects, topics, speakers, and narrative structure
3299
- - Generate accurate, searchable metadata from audio-based content
3300
- - Understand context and intent from transcript alone
3301
- </capabilities>
3302
-
3303
- <constraints>
3304
- - Only describe what is explicitly stated or strongly implied in the transcript
3305
- - Do not fabricate details or make unsupported assumptions
3306
- - Return structured data matching the requested schema
3307
- - Focus entirely on audio/spoken content - there are no visual elements
3308
- - Output only the JSON object; no markdown or extra text
3309
- - When a <language> section is provided, all output text MUST be written in that language
3310
- </constraints>
3311
-
3312
- <tone_guidance>
3313
- Pay special attention to the <tone> section and lean heavily into those instructions.
3314
- Adapt your entire analysis and writing style to match the specified tone - this should influence
3315
- your word choice, personality, formality level, and overall presentation of the content.
3316
- The tone instructions are not suggestions but core requirements for how you should express yourself.
3317
- </tone_guidance>
3318
-
3319
- <language_guidelines>
3320
- AVOID these meta-descriptive phrases that reference the medium rather than the content:
3321
- - "The audio shows..." / "The transcript shows..."
3322
- - "In this recording..." / "This audio features..."
3323
- - "The speaker says..." / "We can hear..."
3324
- - "The clip contains..." / "The recording shows..."
3325
-
3326
- INSTEAD, describe the content directly:
3327
- - BAD: "The audio features a discussion about climate change"
3328
- - GOOD: "A panel discusses climate change impacts and solutions"
3329
-
3330
- Write as if describing reality, not describing a recording of reality.
3331
- </language_guidelines>`;
3332
- function buildUserPrompt4({
3333
- tone,
3334
- transcriptText,
3335
- isCleanTranscript = true,
3336
- promptOverrides,
3337
- isAudioOnly = false,
3338
- titleLength,
3339
- descriptionLength,
3340
- tagCount,
3341
- languageName
3342
- }) {
3343
- const contextSections = [createToneSection(TONE_INSTRUCTIONS[tone])];
3344
- if (languageName) {
3345
- contextSections.push(createLanguageSection(languageName));
3346
- }
3347
- if (transcriptText) {
3348
- const format = isCleanTranscript ? "plain text" : "WebVTT";
3349
- contextSections.push(createTranscriptSection(transcriptText, format));
3334
+ // src/workflows/moderation.ts
3335
+ var DEFAULT_THRESHOLDS = {
3336
+ sexual: 0.8,
3337
+ violence: 0.8
3338
+ };
3339
+ var DEFAULT_PROVIDER2 = "openai";
3340
+ var HIVE_ENDPOINT = "https://api.thehive.ai/api/v2/task/sync";
3341
+ var HIVE_SEXUAL_CATEGORIES = [
3342
+ "general_nsfw",
3343
+ "yes_sexual_activity",
3344
+ "yes_sex_toy",
3345
+ "yes_female_nudity",
3346
+ "yes_male_nudity"
3347
+ ];
3348
+ var HIVE_VIOLENCE_CATEGORIES = [
3349
+ "gun_in_hand",
3350
+ "gun_not_in_hand",
3351
+ "knife_in_hand",
3352
+ "very_bloody",
3353
+ "other_blood",
3354
+ "hanging",
3355
+ "noose",
3356
+ "human_corpse",
3357
+ "yes_emaciated_body",
3358
+ "yes_self_harm",
3359
+ "garm_death_injury_or_military_conflict"
3360
+ ];
3361
+ async function processConcurrently(items, processor, maxConcurrent = 5) {
3362
+ "use step";
3363
+ const results = [];
3364
+ for (let i = 0; i < items.length; i += maxConcurrent) {
3365
+ const batch = items.slice(i, i + maxConcurrent);
3366
+ const batchPromises = batch.map(processor);
3367
+ const batchResults = await Promise.all(batchPromises);
3368
+ results.push(...batchResults);
3350
3369
  }
3351
- const constraints = { titleLength, descriptionLength, tagCount };
3352
- const promptBuilder = isAudioOnly ? createAudioOnlyBuilder(constraints) : createSummarizationBuilder(constraints);
3353
- return promptBuilder.buildWithContext(promptOverrides, contextSections);
3370
+ return results;
3354
3371
  }
3355
- async function analyzeStoryboard2(imageDataUrl, provider, modelId, userPrompt, systemPrompt, credentials) {
3372
+ async function moderateImageWithOpenAI(entry) {
3356
3373
  "use step";
3357
- const model = await createLanguageModelFromConfig(provider, modelId, credentials);
3358
- const response = await generateText4({
3359
- model,
3360
- output: SUMMARY_OUTPUT,
3361
- messages: [
3362
- {
3363
- role: "system",
3364
- content: systemPrompt
3374
+ const apiKey = await getApiKeyFromEnv("openai", entry.credentials);
3375
+ try {
3376
+ const res = await fetch("https://api.openai.com/v1/moderations", {
3377
+ method: "POST",
3378
+ headers: {
3379
+ "Content-Type": "application/json",
3380
+ "Authorization": `Bearer ${apiKey}`
3365
3381
  },
3366
- {
3367
- role: "user",
3368
- content: [
3369
- { type: "text", text: userPrompt },
3370
- { type: "image", image: imageDataUrl }
3382
+ body: JSON.stringify({
3383
+ model: entry.model,
3384
+ input: [
3385
+ {
3386
+ type: "image_url",
3387
+ image_url: {
3388
+ url: entry.image
3389
+ }
3390
+ }
3371
3391
  ]
3372
- }
3373
- ]
3374
- });
3375
- if (!response.output) {
3376
- throw new Error("Summarization output missing");
3377
- }
3378
- const parsed = summarySchema.parse(response.output);
3379
- return {
3380
- result: parsed,
3381
- usage: {
3382
- inputTokens: response.usage.inputTokens,
3383
- outputTokens: response.usage.outputTokens,
3384
- totalTokens: response.usage.totalTokens,
3385
- reasoningTokens: response.usage.reasoningTokens,
3386
- cachedInputTokens: response.usage.cachedInputTokens
3392
+ })
3393
+ });
3394
+ const json = await res.json();
3395
+ if (!res.ok) {
3396
+ throw new Error(
3397
+ `OpenAI moderation error: ${res.status} ${res.statusText} - ${JSON.stringify(json)}`
3398
+ );
3387
3399
  }
3388
- };
3400
+ const categoryScores = json.results?.[0]?.category_scores || {};
3401
+ return {
3402
+ url: entry.url,
3403
+ time: entry.time,
3404
+ sexual: categoryScores.sexual || 0,
3405
+ violence: categoryScores.violence || 0,
3406
+ error: false
3407
+ };
3408
+ } catch (error) {
3409
+ console.error("OpenAI moderation failed:", error);
3410
+ return {
3411
+ url: entry.url,
3412
+ time: entry.time,
3413
+ sexual: 0,
3414
+ violence: 0,
3415
+ error: true,
3416
+ errorMessage: error instanceof Error ? error.message : String(error)
3417
+ };
3418
+ }
3389
3419
  }
3390
- async function analyzeAudioOnly(provider, modelId, userPrompt, systemPrompt, credentials) {
3420
+ async function requestOpenAIModeration(images, model, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
3391
3421
  "use step";
3392
- const model = await createLanguageModelFromConfig(provider, modelId, credentials);
3393
- const response = await generateText4({
3394
- model,
3395
- output: SUMMARY_OUTPUT,
3396
- messages: [
3397
- {
3398
- role: "system",
3399
- content: systemPrompt
3422
+ const imageUrls = images.map((img) => img.url);
3423
+ const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
3424
+ const targetUrls = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map(
3425
+ (img) => ({ url: img.url, time: timeByUrl.get(img.url), image: img.base64Data, model, credentials })
3426
+ ) : images.map((img) => ({ url: img.url, time: img.time, image: img.url, model, credentials }));
3427
+ return processConcurrently(targetUrls, moderateImageWithOpenAI, maxConcurrent);
3428
+ }
3429
+ async function requestOpenAITextModeration(text, model, url, credentials) {
3430
+ "use step";
3431
+ const apiKey = await getApiKeyFromEnv("openai", credentials);
3432
+ try {
3433
+ const res = await fetch("https://api.openai.com/v1/moderations", {
3434
+ method: "POST",
3435
+ headers: {
3436
+ "Content-Type": "application/json",
3437
+ "Authorization": `Bearer ${apiKey}`
3400
3438
  },
3401
- {
3402
- role: "user",
3403
- content: userPrompt
3404
- }
3405
- ]
3406
- });
3407
- if (!response.output) {
3408
- throw new Error("Summarization output missing");
3409
- }
3410
- const parsed = summarySchema.parse(response.output);
3411
- return {
3412
- result: parsed,
3413
- usage: {
3414
- inputTokens: response.usage.inputTokens,
3415
- outputTokens: response.usage.outputTokens,
3416
- totalTokens: response.usage.totalTokens,
3417
- reasoningTokens: response.usage.reasoningTokens,
3418
- cachedInputTokens: response.usage.cachedInputTokens
3439
+ body: JSON.stringify({
3440
+ model,
3441
+ input: text
3442
+ })
3443
+ });
3444
+ const json = await res.json();
3445
+ if (!res.ok) {
3446
+ throw new Error(
3447
+ `OpenAI moderation error: ${res.status} ${res.statusText} - ${JSON.stringify(json)}`
3448
+ );
3419
3449
  }
3420
- };
3450
+ const categoryScores = json.results?.[0]?.category_scores || {};
3451
+ return {
3452
+ url,
3453
+ sexual: categoryScores.sexual || 0,
3454
+ violence: categoryScores.violence || 0,
3455
+ error: false
3456
+ };
3457
+ } catch (error) {
3458
+ console.error("OpenAI text moderation failed:", error);
3459
+ return {
3460
+ url,
3461
+ sexual: 0,
3462
+ violence: 0,
3463
+ error: true,
3464
+ errorMessage: error instanceof Error ? error.message : String(error)
3465
+ };
3466
+ }
3421
3467
  }
3422
- function normalizeKeywords(keywords, limit = SUMMARY_KEYWORD_LIMIT) {
3423
- if (!Array.isArray(keywords) || keywords.length === 0) {
3468
+ function chunkTextByUtf16CodeUnits(text, maxUnits) {
3469
+ if (!text.trim()) {
3424
3470
  return [];
3425
3471
  }
3426
- const uniqueLowercase = /* @__PURE__ */ new Set();
3427
- const normalized = [];
3428
- for (const keyword of keywords) {
3429
- const trimmed = keyword?.trim();
3430
- if (!trimmed) {
3431
- continue;
3472
+ if (text.length <= maxUnits) {
3473
+ return [text];
3474
+ }
3475
+ const chunks = [];
3476
+ for (let i = 0; i < text.length; i += maxUnits) {
3477
+ const chunk = text.slice(i, i + maxUnits).trim();
3478
+ if (chunk) {
3479
+ chunks.push(chunk);
3432
3480
  }
3433
- const lower = trimmed.toLowerCase();
3434
- if (uniqueLowercase.has(lower)) {
3435
- continue;
3481
+ }
3482
+ return chunks;
3483
+ }
3484
+ async function requestOpenAITranscriptModeration(transcriptText, model, maxConcurrent = 5, credentials) {
3485
+ "use step";
3486
+ const chunks = chunkTextByUtf16CodeUnits(transcriptText, 1e4);
3487
+ if (!chunks.length) {
3488
+ return [
3489
+ { url: "transcript:0", sexual: 0, violence: 0, error: true, errorMessage: "No transcript chunks to moderate" }
3490
+ ];
3491
+ }
3492
+ const targets = chunks.map((chunk, idx) => ({
3493
+ chunk,
3494
+ url: `transcript:${idx}`
3495
+ }));
3496
+ return processConcurrently(
3497
+ targets,
3498
+ async (entry) => requestOpenAITextModeration(entry.chunk, model, entry.url, credentials),
3499
+ maxConcurrent
3500
+ );
3501
+ }
3502
+ function getHiveCategoryScores(classes, categoryNames) {
3503
+ const scoreMap = Object.fromEntries(
3504
+ classes.map((c) => [c.class, c.score])
3505
+ );
3506
+ const missingCategories = categoryNames.filter((category) => !(category in scoreMap));
3507
+ if (missingCategories.length > 0) {
3508
+ console.warn(
3509
+ `Hive response missing expected categories: ${missingCategories.join(", ")}`
3510
+ );
3511
+ }
3512
+ const scores = categoryNames.map((category) => scoreMap[category] || 0);
3513
+ return Math.max(...scores, 0);
3514
+ }
3515
+ async function moderateImageWithHive(entry) {
3516
+ "use step";
3517
+ const apiKey = await getApiKeyFromEnv("hive", entry.credentials);
3518
+ try {
3519
+ const formData = new FormData();
3520
+ if (entry.source.kind === "url") {
3521
+ formData.append("url", entry.source.value);
3522
+ } else {
3523
+ const extension = entry.source.contentType.split("/")[1] || "jpg";
3524
+ const blob = new Blob([entry.source.buffer], {
3525
+ type: entry.source.contentType
3526
+ });
3527
+ formData.append("media", blob, `thumbnail.${extension}`);
3436
3528
  }
3437
- uniqueLowercase.add(lower);
3438
- normalized.push(trimmed);
3439
- if (normalized.length === limit) {
3440
- break;
3529
+ const controller = new AbortController();
3530
+ const timeout = setTimeout(() => controller.abort(), 15e3);
3531
+ let res;
3532
+ try {
3533
+ res = await fetch(HIVE_ENDPOINT, {
3534
+ method: "POST",
3535
+ headers: {
3536
+ Accept: "application/json",
3537
+ Authorization: `Token ${apiKey}`
3538
+ },
3539
+ body: formData,
3540
+ signal: controller.signal
3541
+ });
3542
+ } catch (err) {
3543
+ if (err?.name === "AbortError") {
3544
+ throw new Error("Hive request timed out after 15s");
3545
+ }
3546
+ throw err;
3547
+ } finally {
3548
+ clearTimeout(timeout);
3549
+ }
3550
+ const json = await res.json().catch(() => void 0);
3551
+ if (!res.ok) {
3552
+ throw new Error(
3553
+ `Hive moderation error: ${res.status} ${res.statusText} - ${JSON.stringify(json)}`
3554
+ );
3555
+ }
3556
+ if (json?.return_code != null && json.return_code !== 0) {
3557
+ throw new Error(
3558
+ `Hive API error (return_code ${json.return_code}): ${json.message || "Unknown error"}`
3559
+ );
3560
+ }
3561
+ const classes = json?.status?.[0]?.response?.output?.[0]?.classes;
3562
+ if (!Array.isArray(classes)) {
3563
+ throw new TypeError(
3564
+ `Unexpected Hive response structure: ${JSON.stringify(json)}`
3565
+ );
3441
3566
  }
3567
+ const sexual = getHiveCategoryScores(classes, HIVE_SEXUAL_CATEGORIES);
3568
+ const violence = getHiveCategoryScores(classes, HIVE_VIOLENCE_CATEGORIES);
3569
+ return {
3570
+ url: entry.url,
3571
+ time: entry.time,
3572
+ sexual,
3573
+ violence,
3574
+ error: false
3575
+ };
3576
+ } catch (error) {
3577
+ return {
3578
+ url: entry.url,
3579
+ time: entry.time,
3580
+ sexual: 0,
3581
+ violence: 0,
3582
+ error: true,
3583
+ errorMessage: error instanceof Error ? error.message : String(error)
3584
+ };
3442
3585
  }
3443
- return normalized;
3444
3586
  }
3445
- async function getSummaryAndTags(assetId, options) {
3587
+ async function requestHiveModeration(images, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
3588
+ "use step";
3589
+ const imageUrls = images.map((img) => img.url);
3590
+ const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
3591
+ const targets = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map((img) => ({
3592
+ url: img.url,
3593
+ time: timeByUrl.get(img.url),
3594
+ source: {
3595
+ kind: "file",
3596
+ buffer: img.buffer,
3597
+ contentType: img.contentType
3598
+ },
3599
+ credentials
3600
+ })) : images.map((img) => ({
3601
+ url: img.url,
3602
+ time: img.time,
3603
+ source: { kind: "url", value: img.url },
3604
+ credentials
3605
+ }));
3606
+ return await processConcurrently(targets, moderateImageWithHive, maxConcurrent);
3607
+ }
3608
+ async function getThumbnailUrlsFromTimestamps(playbackId, timestampsMs, options) {
3609
+ "use step";
3610
+ const { width, shouldSign, credentials } = options;
3611
+ const baseUrl = getMuxThumbnailBaseUrl(playbackId);
3612
+ const urlPromises = timestampsMs.map(async (tsMs) => {
3613
+ const time = Number((tsMs / 1e3).toFixed(2));
3614
+ const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
3615
+ return { url, time };
3616
+ });
3617
+ return Promise.all(urlPromises);
3618
+ }
3619
+ async function getModerationScores(assetId, options = {}) {
3446
3620
  "use workflow";
3447
3621
  const {
3448
- provider = "openai",
3449
- model,
3450
- tone = "neutral",
3451
- includeTranscript = true,
3452
- cleanTranscript = true,
3622
+ provider = DEFAULT_PROVIDER2,
3623
+ model = provider === "openai" ? "omni-moderation-latest" : void 0,
3624
+ languageCode,
3625
+ thresholds = DEFAULT_THRESHOLDS,
3626
+ thumbnailInterval = 10,
3627
+ thumbnailWidth = 640,
3628
+ maxSamples,
3629
+ maxConcurrent = 5,
3453
3630
  imageSubmissionMode = "url",
3454
3631
  imageDownloadOptions,
3455
- promptOverrides,
3456
- credentials,
3457
- titleLength,
3458
- descriptionLength,
3459
- tagCount,
3460
- outputLanguageCode
3461
- } = options ?? {};
3462
- if (!VALID_TONES.includes(tone)) {
3463
- throw new Error(
3464
- `Invalid tone "${tone}". Valid tones are: ${VALID_TONES.join(", ")}`
3465
- );
3466
- }
3467
- const modelConfig = resolveLanguageModelConfig({
3468
- ...options,
3469
- model,
3470
- provider
3471
- });
3472
- const workflowCredentials = credentials;
3473
- const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, workflowCredentials);
3474
- const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
3475
- const isAudioOnly = isAudioOnlyAsset(assetData);
3476
- if (isAudioOnly && !includeTranscript) {
3477
- throw new Error(
3478
- "Audio-only assets require a transcript. Set includeTranscript: true and ensure the asset has a ready text track (captions/subtitles)."
3479
- );
3480
- }
3481
- const signingContext = await resolveMuxSigningContext(workflowCredentials);
3632
+ credentials: providedCredentials
3633
+ } = options;
3634
+ const credentials = providedCredentials;
3635
+ const { asset, playbackId, policy } = await getPlaybackIdForAsset(assetId, credentials);
3636
+ const videoTrackDurationSeconds = getVideoTrackDurationSecondsFromAsset(asset);
3637
+ const videoTrackFps = getVideoTrackMaxFrameRateFromAsset(asset);
3638
+ const assetDurationSeconds = getAssetDurationSecondsFromAsset(asset);
3639
+ const candidateDurations = [videoTrackDurationSeconds, assetDurationSeconds].filter(
3640
+ (d) => d != null
3641
+ );
3642
+ const duration = candidateDurations.length > 0 ? Math.min(...candidateDurations) : 0;
3643
+ const isAudioOnly = isAudioOnlyAsset(asset);
3644
+ const signingContext = await resolveMuxSigningContext(credentials);
3482
3645
  if (policy === "signed" && !signingContext) {
3483
3646
  throw new Error(
3484
3647
  "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
3485
3648
  );
3486
3649
  }
3487
- const transcriptResult = includeTranscript ? await fetchTranscriptForAsset(assetData, playbackId, {
3488
- cleanTranscript,
3489
- shouldSign: policy === "signed",
3490
- credentials: workflowCredentials,
3491
- required: isAudioOnly
3492
- }) : void 0;
3493
- const transcriptText = transcriptResult?.transcriptText ?? "";
3494
- const resolvedLanguageCode = outputLanguageCode && outputLanguageCode !== "auto" ? outputLanguageCode : transcriptResult?.track?.language_code ?? getReadyTextTracks(assetData)[0]?.language_code;
3495
- const languageName = resolvedLanguageCode ? getLanguageName(resolvedLanguageCode) : void 0;
3496
- const userPrompt = buildUserPrompt4({
3497
- tone,
3498
- transcriptText,
3499
- isCleanTranscript: cleanTranscript,
3500
- promptOverrides,
3501
- isAudioOnly,
3502
- titleLength,
3503
- descriptionLength,
3504
- tagCount,
3505
- languageName
3506
- });
3507
- let analysisResponse;
3508
- let imageUrl;
3509
- const systemPrompt = isAudioOnly ? AUDIO_ONLY_SYSTEM_PROMPT : SYSTEM_PROMPT3;
3510
- try {
3511
- if (isAudioOnly) {
3512
- analysisResponse = await analyzeAudioOnly(
3513
- modelConfig.provider,
3514
- modelConfig.modelId,
3515
- userPrompt,
3516
- systemPrompt,
3517
- workflowCredentials
3518
- );
3519
- } else {
3520
- const storyboardUrl = await getStoryboardUrl(playbackId, 640, policy === "signed", workflowCredentials);
3521
- imageUrl = storyboardUrl;
3522
- if (imageSubmissionMode === "base64") {
3523
- const downloadResult = await downloadImageAsBase64(storyboardUrl, imageDownloadOptions);
3524
- analysisResponse = await analyzeStoryboard2(
3525
- downloadResult.base64Data,
3526
- modelConfig.provider,
3527
- modelConfig.modelId,
3528
- userPrompt,
3529
- systemPrompt,
3530
- workflowCredentials
3531
- );
3532
- } else {
3533
- analysisResponse = await withRetry(() => analyzeStoryboard2(
3534
- storyboardUrl,
3535
- modelConfig.provider,
3536
- modelConfig.modelId,
3537
- userPrompt,
3538
- systemPrompt,
3539
- workflowCredentials
3540
- ));
3541
- }
3650
+ let thumbnailScores;
3651
+ let mode = "thumbnails";
3652
+ let thumbnailCount;
3653
+ if (isAudioOnly) {
3654
+ mode = "transcript";
3655
+ const readyTextTracks = getReadyTextTracks(asset);
3656
+ let transcriptResult = await fetchTranscriptForAsset(asset, playbackId, {
3657
+ languageCode,
3658
+ cleanTranscript: true,
3659
+ shouldSign: policy === "signed",
3660
+ credentials,
3661
+ required: true
3662
+ });
3663
+ if (!transcriptResult.track && readyTextTracks.length === 1) {
3664
+ transcriptResult = await fetchTranscriptForAsset(asset, playbackId, {
3665
+ cleanTranscript: true,
3666
+ shouldSign: policy === "signed",
3667
+ credentials,
3668
+ required: true
3669
+ });
3542
3670
  }
3543
- } catch (error) {
3544
- const contentType = isAudioOnly ? "audio" : "video";
3545
- throw new Error(
3546
- `Failed to analyze ${contentType} content with ${provider}: ${error instanceof Error ? error.message : "Unknown error"}`
3671
+ if (provider === "openai") {
3672
+ thumbnailScores = await requestOpenAITranscriptModeration(
3673
+ transcriptResult.transcriptText,
3674
+ model || "omni-moderation-latest",
3675
+ maxConcurrent,
3676
+ credentials
3677
+ );
3678
+ } else if (provider === "hive") {
3679
+ throw new Error("Hive does not support transcript moderation in this workflow. Use provider: 'openai' for audio-only assets.");
3680
+ } else {
3681
+ throw new Error(`Unsupported moderation provider: ${provider}`);
3682
+ }
3683
+ } else {
3684
+ const thumbnailUrls = maxSamples === void 0 ? (
3685
+ // Generate thumbnail URLs (signed if needed) using existing interval-based logic.
3686
+ await getThumbnailUrls(playbackId, duration, {
3687
+ interval: thumbnailInterval,
3688
+ width: thumbnailWidth,
3689
+ shouldSign: policy === "signed",
3690
+ credentials
3691
+ })
3692
+ ) : (
3693
+ // In maxSamples mode, sample valid timestamps over the trimmed usable span.
3694
+ // Use proportional trims (≈ duration/6, capped at 5s) to stay well inside the
3695
+ // renderable range — Mux can't always serve thumbnails at the very edges.
3696
+ await getThumbnailUrlsFromTimestamps(
3697
+ playbackId,
3698
+ planSamplingTimestamps({
3699
+ duration_sec: duration,
3700
+ max_candidates: maxSamples,
3701
+ trim_start_sec: duration > 2 ? Math.min(5, Math.max(1, duration / 6)) : 0,
3702
+ trim_end_sec: duration > 2 ? Math.min(5, Math.max(1, duration / 6)) : 0,
3703
+ fps: videoTrackFps,
3704
+ base_cadence_hz: thumbnailInterval > 0 ? 1 / thumbnailInterval : void 0
3705
+ }),
3706
+ {
3707
+ width: thumbnailWidth,
3708
+ shouldSign: policy === "signed",
3709
+ credentials
3710
+ }
3711
+ )
3547
3712
  );
3713
+ thumbnailCount = thumbnailUrls.length;
3714
+ if (provider === "openai") {
3715
+ thumbnailScores = await requestOpenAIModeration(
3716
+ thumbnailUrls,
3717
+ model || "omni-moderation-latest",
3718
+ maxConcurrent,
3719
+ imageSubmissionMode,
3720
+ imageDownloadOptions,
3721
+ credentials
3722
+ );
3723
+ } else if (provider === "hive") {
3724
+ thumbnailScores = await requestHiveModeration(
3725
+ thumbnailUrls,
3726
+ maxConcurrent,
3727
+ imageSubmissionMode,
3728
+ imageDownloadOptions,
3729
+ credentials
3730
+ );
3731
+ } else {
3732
+ throw new Error(`Unsupported moderation provider: ${provider}`);
3733
+ }
3548
3734
  }
3549
- if (!analysisResponse.result) {
3550
- throw new Error(`Failed to analyze video content for asset ${assetId}`);
3551
- }
3552
- if (!analysisResponse.result.title) {
3553
- throw new Error(`Failed to generate title for asset ${assetId}`);
3554
- }
3555
- if (!analysisResponse.result.description) {
3556
- throw new Error(`Failed to generate description for asset ${assetId}`);
3735
+ const failed = thumbnailScores.filter((s) => s.error);
3736
+ if (failed.length > 0) {
3737
+ const details = failed.map((s) => `${s.url}: ${s.errorMessage || "Unknown error"}`).join("; ");
3738
+ throw new Error(
3739
+ `Moderation failed for ${failed.length}/${thumbnailScores.length} thumbnail(s): ${details}`
3740
+ );
3557
3741
  }
3742
+ const maxSexual = Math.max(...thumbnailScores.map((s) => s.sexual));
3743
+ const maxViolence = Math.max(...thumbnailScores.map((s) => s.violence));
3744
+ const finalThresholds = { ...DEFAULT_THRESHOLDS, ...thresholds };
3558
3745
  return {
3559
3746
  assetId,
3560
- title: analysisResponse.result.title,
3561
- description: analysisResponse.result.description,
3562
- tags: normalizeKeywords(analysisResponse.result.keywords, tagCount ?? SUMMARY_KEYWORD_LIMIT),
3563
- storyboardUrl: imageUrl,
3564
- // undefined for audio-only assets
3747
+ mode,
3748
+ isAudioOnly,
3749
+ thumbnailScores,
3565
3750
  usage: {
3566
- ...analysisResponse.usage,
3567
3751
  metadata: {
3568
- assetDurationSeconds
3752
+ assetDurationSeconds: duration,
3753
+ ...thumbnailCount === void 0 ? {} : { thumbnailCount }
3569
3754
  }
3570
3755
  },
3571
- transcriptText: transcriptText || void 0
3756
+ maxScores: {
3757
+ sexual: maxSexual,
3758
+ violence: maxViolence
3759
+ },
3760
+ exceedsThreshold: maxSexual > finalThresholds.sexual || maxViolence > finalThresholds.violence,
3761
+ thresholds: finalThresholds
3572
3762
  };
3573
3763
  }
3574
3764
 
3575
- // src/lib/s3-sigv4.ts
3576
- var AWS4_ALGORITHM = "AWS4-HMAC-SHA256";
3577
- var AWS4_REQUEST_TERMINATOR = "aws4_request";
3578
- var AWS4_SERVICE = "s3";
3579
- var S3_ALLOWED_ENDPOINT_PATTERNS = parseEndpointAllowlist(
3580
- env_default.S3_ALLOWED_ENDPOINT_HOSTS
3581
- );
3582
- function getCrypto() {
3583
- const webCrypto = globalThis.crypto;
3584
- if (!webCrypto?.subtle) {
3585
- throw new Error("Web Crypto API is required for S3 signing.");
3586
- }
3587
- return webCrypto;
3588
- }
3589
- var textEncoder = new TextEncoder();
3590
- function toBytes(value) {
3591
- return typeof value === "string" ? textEncoder.encode(value) : value;
3592
- }
3593
- function bytesToHex(bytes) {
3594
- return Array.from(bytes).map((byte) => byte.toString(16).padStart(2, "0")).join("");
3595
- }
3596
- async function sha256Hex(value) {
3597
- const digest = await getCrypto().subtle.digest("SHA-256", toBytes(value));
3598
- return bytesToHex(new Uint8Array(digest));
3765
+ // src/workflows/summarization.ts
3766
+ import { generateText as generateText5, Output as Output5 } from "ai";
3767
+ import dedent5 from "dedent";
3768
+ import { z as z6 } from "zod";
3769
+ var DEFAULT_SUMMARY_KEYWORD_LIMIT = 10;
3770
+ var DEFAULT_TITLE_LENGTH = 10;
3771
+ var DEFAULT_DESCRIPTION_LENGTH = 50;
3772
+ var summarySchema = z6.object({
3773
+ keywords: z6.array(z6.string()),
3774
+ title: z6.string(),
3775
+ description: z6.string()
3776
+ }).strict();
3777
+ var SUMMARY_OUTPUT = Output5.object({
3778
+ name: "summary_metadata",
3779
+ description: "Structured summary with title, description, and keywords.",
3780
+ schema: summarySchema
3781
+ });
3782
+ var VALID_TONES = ["neutral", "playful", "professional"];
3783
+ var TONE_INSTRUCTIONS = {
3784
+ neutral: "Provide a clear, straightforward analysis.",
3785
+ playful: "Channel your inner diva! Answer with maximum sass, wit, and playful attitude. Don't hold back - be cheeky, clever, and delightfully snarky. Make it pop!",
3786
+ professional: "Provide a professional, executive-level analysis suitable for business reporting."
3787
+ };
3788
+ var DESCRIPTION_LENGTH_THRESHOLD_SMALL = 25;
3789
+ var DESCRIPTION_LENGTH_THRESHOLD_LARGE = 100;
3790
+ function buildDescriptionGuidance(wordCount, contentType) {
3791
+ if (wordCount < DESCRIPTION_LENGTH_THRESHOLD_SMALL) {
3792
+ if (contentType === "video") {
3793
+ return dedent5`A brief summary of the video in no more than ${wordCount} words. Shorter is fine.
3794
+ Focus on the single most important subject or action.
3795
+ Write in present tense.`;
3796
+ }
3797
+ return dedent5`A brief summary of the audio content in no more than ${wordCount} words. Shorter is fine.
3798
+ Focus on the single most important topic or theme.
3799
+ Write in present tense.`;
3800
+ }
3801
+ if (wordCount > DESCRIPTION_LENGTH_THRESHOLD_LARGE) {
3802
+ if (contentType === "video") {
3803
+ return dedent5`A detailed summary that describes what happens across the video.
3804
+ Never exceed ${wordCount} words, but shorter is perfectly fine. You may use multiple sentences.
3805
+ Be thorough: cover subjects, actions, setting, progression, and any notable details visible across frames.
3806
+ Write in present tense. Be specific about observable details rather than making assumptions.
3807
+ If the transcript provides dialogue or narration, incorporate key points but prioritize visual content.`;
3808
+ }
3809
+ return dedent5`A detailed summary that describes the audio content.
3810
+ Never exceed ${wordCount} words, but shorter is perfectly fine. You may use multiple sentences.
3811
+ Be thorough: cover topics, speakers, themes, progression, and any notable insights.
3812
+ Write in present tense. Be specific about what is discussed or presented rather than making assumptions.
3813
+ Focus on the spoken content and any key insights, dialogue, or narrative elements.`;
3814
+ }
3815
+ if (contentType === "video") {
3816
+ return dedent5`A summary that describes what happens across the video.
3817
+ Never exceed ${wordCount} words, but shorter is perfectly fine. You may use multiple sentences.
3818
+ Cover the main subjects, actions, setting, and any notable progression visible across frames.
3819
+ Write in present tense. Be specific about observable details rather than making assumptions.
3820
+ If the transcript provides dialogue or narration, incorporate key points but prioritize visual content.`;
3821
+ }
3822
+ return dedent5`A summary that describes the audio content.
3823
+ Never exceed ${wordCount} words, but shorter is perfectly fine. You may use multiple sentences.
3824
+ Cover the main topics, speakers, themes, and any notable progression in the discussion or narration.
3825
+ Write in present tense. Be specific about what is discussed or presented rather than making assumptions.
3826
+ Focus on the spoken content and any key insights, dialogue, or narrative elements.`;
3599
3827
  }
3600
- async function hmacSha256Raw(key, value) {
3601
- const cryptoKey = await getCrypto().subtle.importKey(
3602
- "raw",
3603
- key,
3604
- { name: "HMAC", hash: "SHA-256" },
3605
- false,
3606
- ["sign"]
3607
- );
3608
- const signature = await getCrypto().subtle.sign("HMAC", cryptoKey, textEncoder.encode(value));
3609
- return new Uint8Array(signature);
3828
+ function createSummarizationBuilder({ titleLength, descriptionLength, tagCount } = {}) {
3829
+ const titleLimit = titleLength ?? DEFAULT_TITLE_LENGTH;
3830
+ const keywordLimit = tagCount ?? DEFAULT_SUMMARY_KEYWORD_LIMIT;
3831
+ return createPromptBuilder({
3832
+ template: {
3833
+ task: {
3834
+ tag: "task",
3835
+ content: "Analyze the storyboard frames and generate metadata that captures the essence of the video content."
3836
+ },
3837
+ title: {
3838
+ tag: "title_requirements",
3839
+ content: dedent5`
3840
+ A concise, label-style title — not a sentence or description.
3841
+ Never exceed ${titleLimit} words, but shorter is better.
3842
+ Think of how a video card title, playlist entry, or file name would read — e.g. "Predator: Badlands Trailer" or "Chef Prepares Holiday Feast".
3843
+ Start with the primary subject or topic. Never begin with "A video of" or similar phrasing.
3844
+ Use specific nouns over lengthy descriptions. Avoid clauses, conjunctions, or narrative structure.`
3845
+ },
3846
+ description: {
3847
+ tag: "description_requirements",
3848
+ content: buildDescriptionGuidance(descriptionLength ?? DEFAULT_DESCRIPTION_LENGTH, "video")
3849
+ },
3850
+ keywords: {
3851
+ tag: "keywords_requirements",
3852
+ content: dedent5`
3853
+ Specific, searchable terms (up to ${keywordLimit}) that capture:
3854
+ - Primary subjects (people, animals, objects)
3855
+ - Actions and activities being performed
3856
+ - Setting and environment
3857
+ - Notable objects or tools
3858
+ - Style or genre (if applicable)
3859
+ Prefer concrete nouns and action verbs over abstract concepts.
3860
+ Use lowercase. Avoid redundant or overly generic terms like "video" or "content".`
3861
+ },
3862
+ qualityGuidelines: {
3863
+ tag: "quality_guidelines",
3864
+ content: dedent5`
3865
+ - Examine all frames to understand the full context and progression
3866
+ - Be precise: "golden retriever" is better than "dog" when identifiable
3867
+ - Capture the narrative: what begins, develops, and concludes
3868
+ - Balance brevity with informativeness`
3869
+ }
3870
+ },
3871
+ sectionOrder: ["task", "title", "description", "keywords", "qualityGuidelines"]
3872
+ });
3610
3873
  }
3611
- async function deriveSigningKey(secretAccessKey, shortDate, region) {
3612
- const kDate = await hmacSha256Raw(textEncoder.encode(`AWS4${secretAccessKey}`), shortDate);
3613
- const kRegion = await hmacSha256Raw(kDate, region);
3614
- const kService = await hmacSha256Raw(kRegion, AWS4_SERVICE);
3615
- return hmacSha256Raw(kService, AWS4_REQUEST_TERMINATOR);
3874
+ function createAudioOnlyBuilder({ titleLength, descriptionLength, tagCount } = {}) {
3875
+ const titleLimit = titleLength ?? DEFAULT_TITLE_LENGTH;
3876
+ const keywordLimit = tagCount ?? DEFAULT_SUMMARY_KEYWORD_LIMIT;
3877
+ return createPromptBuilder({
3878
+ template: {
3879
+ task: {
3880
+ tag: "task",
3881
+ content: "Analyze the transcript and generate metadata that captures the essence of the audio content."
3882
+ },
3883
+ title: {
3884
+ tag: "title_requirements",
3885
+ content: dedent5`
3886
+ A concise, label-style title — not a sentence or description.
3887
+ Never exceed ${titleLimit} words, but shorter is better.
3888
+ Think of how a podcast episode title or playlist entry would read — e.g. "Weekly News Roundup" or "Interview with Dr. Smith".
3889
+ Start with the primary subject or topic. Never begin with "An audio of" or similar phrasing.
3890
+ Use specific nouns over lengthy descriptions. Avoid clauses, conjunctions, or narrative structure.`
3891
+ },
3892
+ description: {
3893
+ tag: "description_requirements",
3894
+ content: buildDescriptionGuidance(descriptionLength ?? DEFAULT_DESCRIPTION_LENGTH, "audio")
3895
+ },
3896
+ keywords: {
3897
+ tag: "keywords_requirements",
3898
+ content: dedent5`
3899
+ Specific, searchable terms (up to ${keywordLimit}) that capture:
3900
+ - Primary topics and themes
3901
+ - Speakers or presenters (if named)
3902
+ - Key concepts and terminology
3903
+ - Content type (interview, lecture, music, etc.)
3904
+ - Genre or style (if applicable)
3905
+ Prefer concrete nouns and relevant terms over abstract concepts.
3906
+ Use lowercase. Avoid redundant or overly generic terms like "audio" or "content".`
3907
+ },
3908
+ qualityGuidelines: {
3909
+ tag: "quality_guidelines",
3910
+ content: dedent5`
3911
+ - Analyze the full transcript to understand context and themes
3912
+ - Be precise: use specific terminology when mentioned
3913
+ - Capture the narrative: what is introduced, discussed, and concluded
3914
+ - Balance brevity with informativeness`
3915
+ }
3916
+ },
3917
+ sectionOrder: ["task", "title", "description", "keywords", "qualityGuidelines"]
3918
+ });
3616
3919
  }
3617
- function formatAmzDate(date = /* @__PURE__ */ new Date()) {
3618
- const iso = date.toISOString();
3619
- const shortDate = iso.slice(0, 10).replace(/-/g, "");
3620
- const amzDate = `${iso.slice(0, 19).replace(/[-:]/g, "")}Z`;
3621
- return { amzDate, shortDate };
3622
- }
3623
- function encodeRFC3986(value) {
3624
- return encodeURIComponent(value).replace(/[!'()*]/g, (char) => `%${char.charCodeAt(0).toString(16).toUpperCase()}`);
3625
- }
3626
- function encodePath(path) {
3627
- return path.split("/").map((segment) => encodeRFC3986(segment)).join("/");
3628
- }
3629
- function normalizeEndpoint(endpoint) {
3630
- let url;
3631
- try {
3632
- url = new URL(endpoint);
3633
- } catch {
3634
- throw new Error(`Invalid S3 endpoint: ${endpoint}`);
3635
- }
3636
- if (url.search || url.hash) {
3637
- throw new Error("S3 endpoint must not include query params or hash fragments.");
3638
- }
3639
- enforceEndpointPolicy(url);
3640
- return url;
3641
- }
3642
- function parseEndpointAllowlist(allowlist) {
3643
- if (!allowlist) {
3644
- return [];
3645
- }
3646
- return allowlist.split(",").map((value) => value.trim().toLowerCase()).filter(Boolean);
3647
- }
3648
- function hostnameMatchesPattern(hostname, pattern) {
3649
- if (pattern.startsWith("*.")) {
3650
- const suffix = pattern.slice(1);
3651
- return hostname.endsWith(suffix) && hostname.length > suffix.length;
3652
- }
3653
- return hostname === pattern;
3654
- }
3655
- function enforceEndpointPolicy(url) {
3656
- const hostname = url.hostname.toLowerCase();
3657
- if (url.protocol !== "https:") {
3658
- throw new Error(
3659
- `Insecure S3 endpoint protocol "${url.protocol}" is not allowed. Use HTTPS.`
3660
- );
3920
+ var SYSTEM_PROMPT4 = dedent5`
3921
+ <role>
3922
+ You are a video content analyst specializing in storyboard interpretation and multimodal analysis.
3923
+ </role>
3924
+
3925
+ <context>
3926
+ You receive storyboard images containing multiple sequential frames extracted from a video.
3927
+ These frames are arranged in a grid and represent the visual progression of the content over time.
3928
+ Read frames left-to-right, top-to-bottom to understand the temporal sequence.
3929
+ </context>
3930
+
3931
+ <transcript_guidance>
3932
+ When a transcript is provided alongside the storyboard:
3933
+ - Use it to understand spoken content, dialogue, narration, and audio context
3934
+ - Correlate transcript content with visual frames to build a complete picture
3935
+ - Extract key terminology, names, and specific language used by speakers
3936
+ - Let the transcript inform keyword selection, especially for topics not visually obvious
3937
+ - Prioritize visual content for the description, but enrich it with transcript insights
3938
+ - If transcript and visuals conflict, trust the visual evidence
3939
+ </transcript_guidance>
3940
+
3941
+ <capabilities>
3942
+ - Extract meaning from visual sequences
3943
+ - Identify subjects, actions, settings, and narrative arcs
3944
+ - Generate accurate, searchable metadata
3945
+ - Synthesize visual and transcript information when provided
3946
+ </capabilities>
3947
+
3948
+ <constraints>
3949
+ - Only describe what is clearly observable in the frames or explicitly stated in the transcript
3950
+ - Do not fabricate details or make unsupported assumptions
3951
+ - Return structured data matching the requested schema
3952
+ - Output only the JSON object; no markdown or extra text
3953
+ - When a <language> section is provided, all output text MUST be written in that language
3954
+ </constraints>
3955
+
3956
+ <tone_guidance>
3957
+ Pay special attention to the <tone> section and lean heavily into those instructions.
3958
+ Adapt your entire analysis and writing style to match the specified tone - this should influence
3959
+ your word choice, personality, formality level, and overall presentation of the content.
3960
+ The tone instructions are not suggestions but core requirements for how you should express yourself.
3961
+ </tone_guidance>
3962
+
3963
+ <language_guidelines>
3964
+ AVOID these meta-descriptive phrases that reference the medium rather than the content:
3965
+ - "The image shows..." / "The storyboard shows..."
3966
+ - "In this video..." / "This video features..."
3967
+ - "The frames depict..." / "The footage shows..."
3968
+ - "We can see..." / "You can see..."
3969
+ - "The clip shows..." / "The scene shows..."
3970
+
3971
+ INSTEAD, describe the content directly:
3972
+ - BAD: "The video shows a chef preparing a meal"
3973
+ - GOOD: "A chef prepares a meal in a professional kitchen"
3974
+
3975
+ Write as if describing reality, not describing a recording of reality.
3976
+ </language_guidelines>`;
3977
+ var AUDIO_ONLY_SYSTEM_PROMPT = dedent5`
3978
+ <role>
3979
+ You are an audio content analyst specializing in transcript analysis and metadata generation.
3980
+ </role>
3981
+
3982
+ <context>
3983
+ You receive transcript text from audio-only content (podcasts, audiobooks, music, etc.).
3984
+ Your task is to analyze the spoken/audio content and generate accurate, searchable metadata.
3985
+ </context>
3986
+
3987
+ <transcript_guidance>
3988
+ - Carefully analyze the entire transcript to understand themes, topics, and key points
3989
+ - Extract key terminology, names, concepts, and specific language used
3990
+ - Identify the content type (interview, lecture, music, narration, etc.)
3991
+ - Note the tone, style, and any distinctive characteristics of the audio
3992
+ - Consider the intended audience and context based on language and content
3993
+ </transcript_guidance>
3994
+
3995
+ <capabilities>
3996
+ - Extract meaning and themes from spoken/audio content
3997
+ - Identify subjects, topics, speakers, and narrative structure
3998
+ - Generate accurate, searchable metadata from audio-based content
3999
+ - Understand context and intent from transcript alone
4000
+ </capabilities>
4001
+
4002
+ <constraints>
4003
+ - Only describe what is explicitly stated or strongly implied in the transcript
4004
+ - Do not fabricate details or make unsupported assumptions
4005
+ - Return structured data matching the requested schema
4006
+ - Focus entirely on audio/spoken content - there are no visual elements
4007
+ - Output only the JSON object; no markdown or extra text
4008
+ - When a <language> section is provided, all output text MUST be written in that language
4009
+ </constraints>
4010
+
4011
+ <tone_guidance>
4012
+ Pay special attention to the <tone> section and lean heavily into those instructions.
4013
+ Adapt your entire analysis and writing style to match the specified tone - this should influence
4014
+ your word choice, personality, formality level, and overall presentation of the content.
4015
+ The tone instructions are not suggestions but core requirements for how you should express yourself.
4016
+ </tone_guidance>
4017
+
4018
+ <language_guidelines>
4019
+ AVOID these meta-descriptive phrases that reference the medium rather than the content:
4020
+ - "The audio shows..." / "The transcript shows..."
4021
+ - "In this recording..." / "This audio features..."
4022
+ - "The speaker says..." / "We can hear..."
4023
+ - "The clip contains..." / "The recording shows..."
4024
+
4025
+ INSTEAD, describe the content directly:
4026
+ - BAD: "The audio features a discussion about climate change"
4027
+ - GOOD: "A panel discusses climate change impacts and solutions"
4028
+
4029
+ Write as if describing reality, not describing a recording of reality.
4030
+ </language_guidelines>`;
4031
+ function buildUserPrompt4({
4032
+ tone,
4033
+ transcriptText,
4034
+ isCleanTranscript = true,
4035
+ promptOverrides,
4036
+ isAudioOnly = false,
4037
+ titleLength,
4038
+ descriptionLength,
4039
+ tagCount,
4040
+ languageName
4041
+ }) {
4042
+ const contextSections = [createToneSection(TONE_INSTRUCTIONS[tone])];
4043
+ if (languageName) {
4044
+ contextSections.push(createLanguageSection(languageName));
4045
+ } else {
4046
+ contextSections.push({
4047
+ tag: "language",
4048
+ content: "Respond in English. Never switch languages to satisfy length constraints."
4049
+ });
3661
4050
  }
3662
- if (S3_ALLOWED_ENDPOINT_PATTERNS.length > 0 && !S3_ALLOWED_ENDPOINT_PATTERNS.some((pattern) => hostnameMatchesPattern(hostname, pattern))) {
3663
- throw new Error(
3664
- `S3 endpoint host "${hostname}" is not in S3_ALLOWED_ENDPOINT_HOSTS.`
3665
- );
4051
+ if (transcriptText) {
4052
+ const format = isCleanTranscript ? "plain text" : "WebVTT";
4053
+ contextSections.push(createTranscriptSection(transcriptText, format));
3666
4054
  }
4055
+ const constraints = { titleLength, descriptionLength, tagCount };
4056
+ const promptBuilder = isAudioOnly ? createAudioOnlyBuilder(constraints) : createSummarizationBuilder(constraints);
4057
+ return promptBuilder.buildWithContext(promptOverrides, contextSections);
3667
4058
  }
3668
- function buildCanonicalUri(endpoint, bucket, key) {
3669
- const endpointPath = endpoint.pathname === "/" ? "" : encodePath(endpoint.pathname.replace(/\/+$/, ""));
3670
- const encodedBucket = encodeRFC3986(bucket);
3671
- const encodedKey = encodePath(key);
3672
- return `${endpointPath}/${encodedBucket}/${encodedKey}`;
3673
- }
3674
- function buildCanonicalQuery(params) {
3675
- return Object.entries(params).sort(([a], [b]) => a.localeCompare(b)).map(([key, value]) => `${encodeRFC3986(key)}=${encodeRFC3986(value)}`).join("&");
3676
- }
3677
- async function signString(secretAccessKey, shortDate, region, value) {
3678
- const signingKey = await deriveSigningKey(secretAccessKey, shortDate, region);
3679
- const signatureBytes = await hmacSha256Raw(signingKey, value);
3680
- return bytesToHex(signatureBytes);
3681
- }
3682
- function buildCredentialScope(shortDate, region) {
3683
- return `${shortDate}/${region}/${AWS4_SERVICE}/${AWS4_REQUEST_TERMINATOR}`;
3684
- }
3685
- async function putObjectToS3({
3686
- accessKeyId,
3687
- secretAccessKey,
3688
- endpoint,
3689
- region,
3690
- bucket,
3691
- key,
3692
- body,
3693
- contentType
3694
- }) {
3695
- const resolvedEndpoint = normalizeEndpoint(endpoint);
3696
- const canonicalUri = buildCanonicalUri(resolvedEndpoint, bucket, key);
3697
- const host = resolvedEndpoint.host;
3698
- const normalizedContentType = contentType?.trim();
3699
- const { amzDate, shortDate } = formatAmzDate();
3700
- const payloadHash = await sha256Hex(body);
3701
- const signingHeaders = [
3702
- ["host", host],
3703
- ["x-amz-content-sha256", payloadHash],
3704
- ["x-amz-date", amzDate],
3705
- ...normalizedContentType ? [["content-type", normalizedContentType]] : []
3706
- ].sort(([a], [b]) => a.localeCompare(b));
3707
- const canonicalHeaders = signingHeaders.map(([name, value]) => `${name}:${value}`).join("\n");
3708
- const signedHeaders = signingHeaders.map(([name]) => name).join(";");
3709
- const canonicalRequest = [
3710
- "PUT",
3711
- canonicalUri,
3712
- "",
3713
- `${canonicalHeaders}
3714
- `,
3715
- signedHeaders,
3716
- payloadHash
3717
- ].join("\n");
3718
- const credentialScope = buildCredentialScope(shortDate, region);
3719
- const stringToSign = [
3720
- AWS4_ALGORITHM,
3721
- amzDate,
3722
- credentialScope,
3723
- await sha256Hex(canonicalRequest)
3724
- ].join("\n");
3725
- const signature = await signString(secretAccessKey, shortDate, region, stringToSign);
3726
- const authorization = `${AWS4_ALGORITHM} Credential=${accessKeyId}/${credentialScope}, SignedHeaders=${signedHeaders}, Signature=${signature}`;
3727
- const requestUrl = `${resolvedEndpoint.origin}${canonicalUri}`;
3728
- const response = await fetch(requestUrl, {
3729
- method: "PUT",
3730
- headers: {
3731
- "Authorization": authorization,
3732
- "x-amz-content-sha256": payloadHash,
3733
- "x-amz-date": amzDate,
3734
- ...normalizedContentType ? { "content-type": normalizedContentType } : {}
3735
- },
3736
- body
4059
+ async function analyzeStoryboard2(imageDataUrl, provider, modelId, userPrompt, systemPrompt, credentials) {
4060
+ "use step";
4061
+ const model = await createLanguageModelFromConfig(provider, modelId, credentials);
4062
+ const response = await generateText5({
4063
+ model,
4064
+ output: SUMMARY_OUTPUT,
4065
+ messages: [
4066
+ {
4067
+ role: "system",
4068
+ content: systemPrompt
4069
+ },
4070
+ {
4071
+ role: "user",
4072
+ content: [
4073
+ { type: "text", text: userPrompt },
4074
+ { type: "image", image: imageDataUrl }
4075
+ ]
4076
+ }
4077
+ ]
3737
4078
  });
3738
- if (!response.ok) {
3739
- const errorBody = await response.text().catch(() => "");
3740
- const detail = errorBody ? ` ${errorBody}` : "";
3741
- throw new Error(`S3 PUT failed (${response.status} ${response.statusText}).${detail}`);
4079
+ if (!response.output) {
4080
+ throw new Error("Summarization output missing");
3742
4081
  }
3743
- }
3744
- async function createPresignedGetUrl({
3745
- accessKeyId,
3746
- secretAccessKey,
3747
- endpoint,
3748
- region,
3749
- bucket,
3750
- key,
3751
- expiresInSeconds = 3600
3752
- }) {
3753
- const resolvedEndpoint = normalizeEndpoint(endpoint);
3754
- const canonicalUri = buildCanonicalUri(resolvedEndpoint, bucket, key);
3755
- const host = resolvedEndpoint.host;
3756
- const { amzDate, shortDate } = formatAmzDate();
3757
- const credentialScope = buildCredentialScope(shortDate, region);
3758
- const signedHeaders = "host";
3759
- const queryParams = {
3760
- "X-Amz-Algorithm": AWS4_ALGORITHM,
3761
- "X-Amz-Credential": `${accessKeyId}/${credentialScope}`,
3762
- "X-Amz-Date": amzDate,
3763
- "X-Amz-Expires": `${expiresInSeconds}`,
3764
- "X-Amz-SignedHeaders": signedHeaders
4082
+ const parsed = summarySchema.parse(response.output);
4083
+ return {
4084
+ result: parsed,
4085
+ usage: {
4086
+ inputTokens: response.usage.inputTokens,
4087
+ outputTokens: response.usage.outputTokens,
4088
+ totalTokens: response.usage.totalTokens,
4089
+ reasoningTokens: response.usage.reasoningTokens,
4090
+ cachedInputTokens: response.usage.cachedInputTokens
4091
+ }
4092
+ };
4093
+ }
4094
+ async function analyzeAudioOnly(provider, modelId, userPrompt, systemPrompt, credentials) {
4095
+ "use step";
4096
+ const model = await createLanguageModelFromConfig(provider, modelId, credentials);
4097
+ const response = await generateText5({
4098
+ model,
4099
+ output: SUMMARY_OUTPUT,
4100
+ messages: [
4101
+ {
4102
+ role: "system",
4103
+ content: systemPrompt
4104
+ },
4105
+ {
4106
+ role: "user",
4107
+ content: userPrompt
4108
+ }
4109
+ ]
4110
+ });
4111
+ if (!response.output) {
4112
+ throw new Error("Summarization output missing");
4113
+ }
4114
+ const parsed = summarySchema.parse(response.output);
4115
+ return {
4116
+ result: parsed,
4117
+ usage: {
4118
+ inputTokens: response.usage.inputTokens,
4119
+ outputTokens: response.usage.outputTokens,
4120
+ totalTokens: response.usage.totalTokens,
4121
+ reasoningTokens: response.usage.reasoningTokens,
4122
+ cachedInputTokens: response.usage.cachedInputTokens
4123
+ }
3765
4124
  };
3766
- const canonicalQuery = buildCanonicalQuery(queryParams);
3767
- const canonicalRequest = [
3768
- "GET",
3769
- canonicalUri,
3770
- canonicalQuery,
3771
- `host:${host}
3772
- `,
3773
- signedHeaders,
3774
- "UNSIGNED-PAYLOAD"
3775
- ].join("\n");
3776
- const stringToSign = [
3777
- AWS4_ALGORITHM,
3778
- amzDate,
3779
- credentialScope,
3780
- await sha256Hex(canonicalRequest)
3781
- ].join("\n");
3782
- const signature = await signString(secretAccessKey, shortDate, region, stringToSign);
3783
- const queryWithSignature = `${canonicalQuery}&X-Amz-Signature=${signature}`;
3784
- return `${resolvedEndpoint.origin}${canonicalUri}?${queryWithSignature}`;
3785
4125
  }
3786
-
3787
- // src/lib/storage-adapter.ts
3788
- function requireCredentials(accessKeyId, secretAccessKey) {
3789
- if (!accessKeyId || !secretAccessKey) {
3790
- throw new Error(
3791
- "S3 credentials are required for default storage operations. Provide S3_ACCESS_KEY_ID and S3_SECRET_ACCESS_KEY or pass options.storageAdapter."
3792
- );
4126
+ function normalizeKeywords(keywords, limit = DEFAULT_SUMMARY_KEYWORD_LIMIT) {
4127
+ if (!Array.isArray(keywords) || keywords.length === 0) {
4128
+ return [];
3793
4129
  }
3794
- return { accessKeyId, secretAccessKey };
4130
+ const uniqueLowercase = /* @__PURE__ */ new Set();
4131
+ const normalized = [];
4132
+ for (const keyword of keywords) {
4133
+ const trimmed = keyword?.trim();
4134
+ if (!trimmed) {
4135
+ continue;
4136
+ }
4137
+ const lower = trimmed.toLowerCase();
4138
+ if (uniqueLowercase.has(lower)) {
4139
+ continue;
4140
+ }
4141
+ uniqueLowercase.add(lower);
4142
+ normalized.push(trimmed);
4143
+ if (normalized.length === limit) {
4144
+ break;
4145
+ }
4146
+ }
4147
+ return normalized;
3795
4148
  }
3796
- async function putObjectWithStorageAdapter(input, adapter) {
3797
- if (adapter) {
3798
- await adapter.putObject(input);
3799
- return;
4149
+ async function getSummaryAndTags(assetId, options) {
4150
+ "use workflow";
4151
+ const {
4152
+ provider = "openai",
4153
+ model,
4154
+ tone = "neutral",
4155
+ includeTranscript = true,
4156
+ cleanTranscript = true,
4157
+ imageSubmissionMode = "url",
4158
+ imageDownloadOptions,
4159
+ promptOverrides,
4160
+ credentials,
4161
+ titleLength,
4162
+ descriptionLength,
4163
+ tagCount,
4164
+ outputLanguageCode
4165
+ } = options ?? {};
4166
+ if (!VALID_TONES.includes(tone)) {
4167
+ throw new Error(
4168
+ `Invalid tone "${tone}". Valid tones are: ${VALID_TONES.join(", ")}`
4169
+ );
3800
4170
  }
3801
- const credentials = requireCredentials(input.accessKeyId, input.secretAccessKey);
3802
- await putObjectToS3({
3803
- accessKeyId: credentials.accessKeyId,
3804
- secretAccessKey: credentials.secretAccessKey,
3805
- endpoint: input.endpoint,
3806
- region: input.region,
3807
- bucket: input.bucket,
3808
- key: input.key,
3809
- body: input.body,
3810
- contentType: input.contentType
4171
+ const modelConfig = resolveLanguageModelConfig({
4172
+ ...options,
4173
+ model,
4174
+ provider
3811
4175
  });
3812
- }
3813
- async function createPresignedGetUrlWithStorageAdapter(input, adapter) {
3814
- if (adapter) {
3815
- return adapter.createPresignedGetUrl(input);
4176
+ const workflowCredentials = credentials;
4177
+ const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, workflowCredentials);
4178
+ const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
4179
+ const isAudioOnly = isAudioOnlyAsset(assetData);
4180
+ if (isAudioOnly && !includeTranscript) {
4181
+ throw new Error(
4182
+ "Audio-only assets require a transcript. Set includeTranscript: true and ensure the asset has a ready text track (captions/subtitles)."
4183
+ );
3816
4184
  }
3817
- const credentials = requireCredentials(input.accessKeyId, input.secretAccessKey);
3818
- return createPresignedGetUrl({
3819
- accessKeyId: credentials.accessKeyId,
3820
- secretAccessKey: credentials.secretAccessKey,
3821
- endpoint: input.endpoint,
3822
- region: input.region,
3823
- bucket: input.bucket,
3824
- key: input.key,
3825
- expiresInSeconds: input.expiresInSeconds
4185
+ const signingContext = await resolveMuxSigningContext(workflowCredentials);
4186
+ if (policy === "signed" && !signingContext) {
4187
+ throw new Error(
4188
+ "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
4189
+ );
4190
+ }
4191
+ const transcriptResult = includeTranscript ? await fetchTranscriptForAsset(assetData, playbackId, {
4192
+ cleanTranscript,
4193
+ shouldSign: policy === "signed",
4194
+ credentials: workflowCredentials,
4195
+ required: isAudioOnly
4196
+ }) : void 0;
4197
+ const transcriptText = transcriptResult?.transcriptText ?? "";
4198
+ const resolvedLanguageCode = outputLanguageCode && outputLanguageCode !== "auto" ? outputLanguageCode : transcriptResult?.track?.language_code ?? getReadyTextTracks(assetData)[0]?.language_code;
4199
+ const languageName = resolvedLanguageCode ? getLanguageName(resolvedLanguageCode) : void 0;
4200
+ const userPrompt = buildUserPrompt4({
4201
+ tone,
4202
+ transcriptText,
4203
+ isCleanTranscript: cleanTranscript,
4204
+ promptOverrides,
4205
+ isAudioOnly,
4206
+ titleLength,
4207
+ descriptionLength,
4208
+ tagCount,
4209
+ languageName
3826
4210
  });
4211
+ let analysisResponse;
4212
+ let imageUrl;
4213
+ const systemPrompt = isAudioOnly ? AUDIO_ONLY_SYSTEM_PROMPT : SYSTEM_PROMPT4;
4214
+ try {
4215
+ if (isAudioOnly) {
4216
+ analysisResponse = await analyzeAudioOnly(
4217
+ modelConfig.provider,
4218
+ modelConfig.modelId,
4219
+ userPrompt,
4220
+ systemPrompt,
4221
+ workflowCredentials
4222
+ );
4223
+ } else {
4224
+ const storyboardUrl = await getStoryboardUrl(playbackId, 640, policy === "signed", workflowCredentials);
4225
+ imageUrl = storyboardUrl;
4226
+ if (imageSubmissionMode === "base64") {
4227
+ const downloadResult = await downloadImageAsBase64(storyboardUrl, imageDownloadOptions);
4228
+ analysisResponse = await analyzeStoryboard2(
4229
+ downloadResult.base64Data,
4230
+ modelConfig.provider,
4231
+ modelConfig.modelId,
4232
+ userPrompt,
4233
+ systemPrompt,
4234
+ workflowCredentials
4235
+ );
4236
+ } else {
4237
+ analysisResponse = await withRetry(() => analyzeStoryboard2(
4238
+ storyboardUrl,
4239
+ modelConfig.provider,
4240
+ modelConfig.modelId,
4241
+ userPrompt,
4242
+ systemPrompt,
4243
+ workflowCredentials
4244
+ ));
4245
+ }
4246
+ }
4247
+ } catch (error) {
4248
+ const contentType = isAudioOnly ? "audio" : "video";
4249
+ throw new Error(
4250
+ `Failed to analyze ${contentType} content with ${provider}: ${error instanceof Error ? error.message : "Unknown error"}`
4251
+ );
4252
+ }
4253
+ if (!analysisResponse.result) {
4254
+ throw new Error(`Failed to analyze video content for asset ${assetId}`);
4255
+ }
4256
+ if (!analysisResponse.result.title) {
4257
+ throw new Error(`Failed to generate title for asset ${assetId}`);
4258
+ }
4259
+ if (!analysisResponse.result.description) {
4260
+ throw new Error(`Failed to generate description for asset ${assetId}`);
4261
+ }
4262
+ return {
4263
+ assetId,
4264
+ title: analysisResponse.result.title,
4265
+ description: analysisResponse.result.description,
4266
+ tags: normalizeKeywords(analysisResponse.result.keywords, tagCount ?? DEFAULT_SUMMARY_KEYWORD_LIMIT),
4267
+ storyboardUrl: imageUrl,
4268
+ // undefined for audio-only assets
4269
+ usage: {
4270
+ ...analysisResponse.usage,
4271
+ metadata: {
4272
+ assetDurationSeconds
4273
+ }
4274
+ },
4275
+ transcriptText: transcriptText || void 0
4276
+ };
3827
4277
  }
3828
4278
 
3829
4279
  // src/workflows/translate-audio.ts
@@ -4002,7 +4452,8 @@ async function uploadDubbedAudioToS3({
4002
4452
  s3Endpoint,
4003
4453
  s3Region,
4004
4454
  s3Bucket,
4005
- storageAdapter
4455
+ storageAdapter,
4456
+ s3SignedUrlExpirySeconds
4006
4457
  }) {
4007
4458
  "use step";
4008
4459
  const s3AccessKeyId = env_default.S3_ACCESS_KEY_ID;
@@ -4025,10 +4476,11 @@ async function uploadDubbedAudioToS3({
4025
4476
  region: s3Region,
4026
4477
  bucket: s3Bucket,
4027
4478
  key: audioKey,
4028
- expiresInSeconds: 3600
4479
+ expiresInSeconds: s3SignedUrlExpirySeconds ?? 86400
4029
4480
  }, storageAdapter);
4481
+ const expiryHours = Math.round((s3SignedUrlExpirySeconds ?? 86400) / 3600);
4030
4482
  console.warn(`\u2705 Audio uploaded successfully to: ${audioKey}`);
4031
- console.warn(`\u{1F517} Generated presigned URL (expires in 1 hour)`);
4483
+ console.warn(`\u{1F517} Generated presigned URL (expires in ${expiryHours} hour${expiryHours === 1 ? "" : "s"})`);
4032
4484
  return presignedUrl;
4033
4485
  }
4034
4486
  async function createAudioTrackOnMux(assetId, languageCode, presignedUrl, credentials) {
@@ -4192,7 +4644,8 @@ async function translateAudio(assetId, toLanguageCode, options = {}) {
4192
4644
  s3Endpoint,
4193
4645
  s3Region,
4194
4646
  s3Bucket,
4195
- storageAdapter: effectiveStorageAdapter
4647
+ storageAdapter: effectiveStorageAdapter,
4648
+ s3SignedUrlExpirySeconds: options.s3SignedUrlExpirySeconds
4196
4649
  });
4197
4650
  } catch (error) {
4198
4651
  throw new Error(`Failed to upload audio to S3: ${error instanceof Error ? error.message : "Unknown error"}`);
@@ -4230,24 +4683,24 @@ async function translateAudio(assetId, toLanguageCode, options = {}) {
4230
4683
  // src/workflows/translate-captions.ts
4231
4684
  import {
4232
4685
  APICallError,
4233
- generateText as generateText5,
4686
+ generateText as generateText6,
4234
4687
  NoObjectGeneratedError,
4235
- Output as Output5,
4688
+ Output as Output6,
4236
4689
  RetryError,
4237
4690
  TypeValidationError
4238
4691
  } from "ai";
4239
- import dedent5 from "dedent";
4240
- import { z as z6 } from "zod";
4241
- var translationSchema = z6.object({
4242
- translation: z6.string()
4692
+ import dedent6 from "dedent";
4693
+ import { z as z7 } from "zod";
4694
+ var translationSchema = z7.object({
4695
+ translation: z7.string()
4243
4696
  });
4244
- var SYSTEM_PROMPT4 = dedent5`
4697
+ var SYSTEM_PROMPT5 = dedent6`
4245
4698
  You are a subtitle translation expert. Translate VTT subtitle files to the target language specified by the user.
4246
4699
  You may receive either a full VTT file or a chunk from a larger VTT.
4247
4700
  Preserve all timestamps, cue ordering, and VTT formatting exactly as they appear.
4248
4701
  Return JSON with a single key "translation" containing the translated VTT content.
4249
4702
  `;
4250
- var CUE_TRANSLATION_SYSTEM_PROMPT = dedent5`
4703
+ var CUE_TRANSLATION_SYSTEM_PROMPT = dedent6`
4251
4704
  You are a subtitle translation expert.
4252
4705
  You will receive a sequence of subtitle cues extracted from a VTT file.
4253
4706
  Translate the cues to the requested target language while preserving their original order.
@@ -4409,14 +4862,6 @@ function buildTranslationChunkRequests(vttContent, assetDurationSeconds, chunkin
4409
4862
  )
4410
4863
  };
4411
4864
  }
4412
- async function fetchVttFromMux(vttUrl) {
4413
- "use step";
4414
- const vttResponse = await fetch(vttUrl);
4415
- if (!vttResponse.ok) {
4416
- throw new Error(`Failed to fetch VTT file: ${vttResponse.statusText}`);
4417
- }
4418
- return vttResponse.text();
4419
- }
4420
4865
  async function translateVttWithAI({
4421
4866
  vttContent,
4422
4867
  fromLanguageCode,
@@ -4427,13 +4872,13 @@ async function translateVttWithAI({
4427
4872
  }) {
4428
4873
  "use step";
4429
4874
  const model = await createLanguageModelFromConfig(provider, modelId, credentials);
4430
- const response = await generateText5({
4875
+ const response = await generateText6({
4431
4876
  model,
4432
- output: Output5.object({ schema: translationSchema }),
4877
+ output: Output6.object({ schema: translationSchema }),
4433
4878
  messages: [
4434
4879
  {
4435
4880
  role: "system",
4436
- content: SYSTEM_PROMPT4
4881
+ content: SYSTEM_PROMPT5
4437
4882
  },
4438
4883
  {
4439
4884
  role: "user",
@@ -4464,8 +4909,8 @@ async function translateCueChunkWithAI({
4464
4909
  }) {
4465
4910
  "use step";
4466
4911
  const model = await createLanguageModelFromConfig(provider, modelId, credentials);
4467
- const schema = z6.object({
4468
- translations: z6.array(z6.string().min(1)).length(cues.length)
4912
+ const schema = z7.object({
4913
+ translations: z7.array(z7.string().min(1)).length(cues.length)
4469
4914
  });
4470
4915
  const cuePayload = cues.map((cue, index) => ({
4471
4916
  index,
@@ -4473,9 +4918,9 @@ async function translateCueChunkWithAI({
4473
4918
  endTime: cue.endTime,
4474
4919
  text: cue.text
4475
4920
  }));
4476
- const response = await generateText5({
4921
+ const response = await generateText6({
4477
4922
  model,
4478
- output: Output5.object({ schema }),
4923
+ output: Output6.object({ schema }),
4479
4924
  messages: [
4480
4925
  {
4481
4926
  role: "system",
@@ -4632,7 +5077,8 @@ async function uploadVttToS3({
4632
5077
  s3Endpoint,
4633
5078
  s3Region,
4634
5079
  s3Bucket,
4635
- storageAdapter
5080
+ storageAdapter,
5081
+ s3SignedUrlExpirySeconds
4636
5082
  }) {
4637
5083
  "use step";
4638
5084
  const s3AccessKeyId = env_default.S3_ACCESS_KEY_ID;
@@ -4655,25 +5101,9 @@ async function uploadVttToS3({
4655
5101
  region: s3Region,
4656
5102
  bucket: s3Bucket,
4657
5103
  key: vttKey,
4658
- expiresInSeconds: 3600
5104
+ expiresInSeconds: s3SignedUrlExpirySeconds ?? 86400
4659
5105
  }, storageAdapter);
4660
5106
  }
4661
- async function createTextTrackOnMux(assetId, languageCode, trackName, presignedUrl, credentials) {
4662
- "use step";
4663
- const muxClient = await resolveMuxClient(credentials);
4664
- const mux = await muxClient.createClient();
4665
- const trackResponse = await mux.video.assets.createTrack(assetId, {
4666
- type: "text",
4667
- text_type: "subtitles",
4668
- language_code: languageCode,
4669
- name: trackName,
4670
- url: presignedUrl
4671
- });
4672
- if (!trackResponse.id) {
4673
- throw new Error("Failed to create text track: no track ID returned from Mux");
4674
- }
4675
- return trackResponse.id;
4676
- }
4677
5107
  async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, options) {
4678
5108
  "use workflow";
4679
5109
  const {
@@ -4791,7 +5221,8 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
4791
5221
  s3Endpoint,
4792
5222
  s3Region,
4793
5223
  s3Bucket,
4794
- storageAdapter: effectiveStorageAdapter
5224
+ storageAdapter: effectiveStorageAdapter,
5225
+ s3SignedUrlExpirySeconds: options.s3SignedUrlExpirySeconds
4795
5226
  });
4796
5227
  } catch (error) {
4797
5228
  throw new Error(`Failed to upload VTT to S3: ${error instanceof Error ? error.message : "Unknown error"}`);
@@ -4824,23 +5255,33 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
4824
5255
  };
4825
5256
  }
4826
5257
  export {
5258
+ DEFAULT_DESCRIPTION_LENGTH,
5259
+ DEFAULT_SUMMARY_KEYWORD_LIMIT,
5260
+ DEFAULT_TITLE_LENGTH,
4827
5261
  HIVE_SEXUAL_CATEGORIES,
4828
5262
  HIVE_VIOLENCE_CATEGORIES,
4829
- SUMMARY_KEYWORD_LIMIT,
4830
5263
  aggregateTokenUsage,
5264
+ applyOverrideLists,
5265
+ applyReplacements,
4831
5266
  askQuestions,
5267
+ buildReplacementRegex,
4832
5268
  burnedInCaptionsSchema,
5269
+ censorVttContent,
4833
5270
  chapterSchema,
4834
5271
  chaptersSchema,
5272
+ createReplacer,
5273
+ editCaptions,
4835
5274
  generateChapters,
4836
5275
  generateEmbeddings,
4837
5276
  generateVideoEmbeddings,
4838
5277
  getModerationScores,
4839
5278
  getSummaryAndTags,
4840
5279
  hasBurnedInCaptions,
5280
+ profanityDetectionSchema,
4841
5281
  questionAnswerSchema,
4842
5282
  shouldSplitChunkTranslationError,
4843
5283
  summarySchema,
5284
+ transformCueText,
4844
5285
  translateAudio,
4845
5286
  translateCaptions,
4846
5287
  translationSchema