@mux/ai 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2176,1654 +2176,2072 @@ async function generateChapters(assetId, languageCode, options = {}) {
2176
2176
  };
2177
2177
  }
2178
2178
 
2179
- // src/workflows/embeddings.ts
2180
- import { embed } from "ai";
2179
+ // src/workflows/edit-captions.ts
2180
+ import { generateText as generateText4, Output as Output4 } from "ai";
2181
+ import dedent4 from "dedent";
2182
+ import { z as z5 } from "zod";
2181
2183
 
2182
- // src/primitives/text-chunking.ts
2183
- var DEFAULT_MIN_CHUNK_DURATION_RATIO = 2 / 3;
2184
- var DEFAULT_BOUNDARY_LOOKAHEAD_CUES = 12;
2185
- var DEFAULT_BOUNDARY_PAUSE_SECONDS = 1.25;
2186
- var STRONG_BOUNDARY_SCORE = 4;
2187
- var PREFERRED_BOUNDARY_WINDOW_SECONDS = 5 * 60;
2188
- var SENTENCE_BOUNDARY_REGEX = /[.!?]["')\]]*$/;
2189
- var CLAUSE_BOUNDARY_REGEX = /[,;:]["')\]]*$/;
2190
- var NEXT_SENTENCE_START_REGEX = /^[A-Z0-9"'([{]/;
2191
- function estimateTokenCount(text) {
2192
- const words = text.trim().split(/\s+/).length;
2193
- return Math.ceil(words / 0.75);
2184
+ // src/lib/mux-tracks.ts
2185
/**
 * Downloads a WebVTT file and returns its body as text.
 *
 * @param {string} vttUrl - URL of the VTT file to fetch.
 * @returns {Promise<string>} The response body as text.
 * @throws {Error} When the response is not OK; the message includes the HTTP status.
 */
async function fetchVttFromMux(vttUrl) {
  "use step";
  const vttResponse = await fetch(vttUrl);
  if (!vttResponse.ok) {
    // statusText is empty on HTTP/2 responses, so always report the numeric status too.
    const reason = `${vttResponse.status} ${vttResponse.statusText}`.trim();
    throw new Error(`Failed to fetch VTT file: ${reason}`);
  }
  return vttResponse.text();
}
2195
- function chunkByTokens(text, maxTokens, overlapTokens = 0) {
2196
- if (!text.trim()) {
2197
- return [];
2193
// Creates a text track (type "text", text_type "subtitles") on the given asset through
// mux.video.assets.createTrack, using presignedUrl as the track source, and returns the
// id of the created track. Throws when the response carries no id.
// Rows prefixed with "-" inside this span are the removed chunkByTokens body from the old version.
+ async function createTextTrackOnMux(assetId, languageCode, trackName, presignedUrl, credentials) {
2194
+ "use step";
2195
+ const muxClient = await resolveMuxClient(credentials);
2196
+ const mux = await muxClient.createClient();
2197
+ const trackResponse = await mux.video.assets.createTrack(assetId, {
2198
+ type: "text",
2199
+ text_type: "subtitles",
2200
+ language_code: languageCode,
2201
+ name: trackName,
2202
+ url: presignedUrl
2203
+ });
2204
+ if (!trackResponse.id) {
2205
+ throw new Error("Failed to create text track: no track ID returned from Mux");
2198
2206
  }
2199
- const chunks = [];
2200
- const words = text.trim().split(/\s+/);
2201
- const wordsPerChunk = Math.floor(maxTokens * 0.75);
2202
- const overlapWords = Math.floor(overlapTokens * 0.75);
2203
- let chunkIndex = 0;
2204
- let currentPosition = 0;
2205
- while (currentPosition < words.length) {
2206
- const chunkWords = words.slice(
2207
- currentPosition,
2208
- currentPosition + wordsPerChunk
2209
- );
2210
- const chunkText2 = chunkWords.join(" ");
2211
- const tokenCount = estimateTokenCount(chunkText2);
2212
- chunks.push({
2213
- id: `chunk-${chunkIndex}`,
2214
- text: chunkText2,
2215
- tokenCount
2216
- });
2217
- currentPosition += wordsPerChunk - overlapWords;
2218
- chunkIndex++;
2219
- if (currentPosition <= (chunkIndex - 1) * (wordsPerChunk - overlapWords)) {
2220
- break;
2221
- }
2207
+ return trackResponse.id;
2208
+ }
2209
+
2210
+ // src/lib/s3-sigv4.ts
2211
// SigV4 constants: algorithm identifier, credential-scope terminator, and the service name
// used in the credential scope and signing-key derivation.
+ var AWS4_ALGORITHM = "AWS4-HMAC-SHA256";
2212
+ var AWS4_REQUEST_TERMINATOR = "aws4_request";
2213
+ var AWS4_SERVICE = "s3";
2214
// Host patterns parsed once from env_default.S3_ALLOWED_ENDPOINT_HOSTS; an empty list
// disables the host check in enforceEndpointPolicy.
+ var S3_ALLOWED_ENDPOINT_PATTERNS = parseEndpointAllowlist(
2215
+ env_default.S3_ALLOWED_ENDPOINT_HOSTS
2216
+ );
2217
// Returns globalThis.crypto, throwing when it (or its `subtle` interface) is unavailable.
+ function getCrypto() {
2218
+ const webCrypto = globalThis.crypto;
2219
+ if (!webCrypto?.subtle) {
2220
+ throw new Error("Web Crypto API is required for S3 signing.");
2222
2221
  }
2223
- return chunks;
2222
+ return webCrypto;
2224
2223
  }
2225
- function createChunkFromCues(cues, index) {
2226
- const text = cues.map((c) => c.text).join(" ");
2227
- return {
2228
- id: `chunk-${index}`,
2229
- text,
2230
- tokenCount: estimateTokenCount(text),
2231
- startTime: cues[0].startTime,
2232
- endTime: cues[cues.length - 1].endTime
2233
- };
2224
// Shared encoder used to turn strings into UTF-8 bytes for hashing and signing.
+ var textEncoder = new TextEncoder();
2225
// Encodes string input as UTF-8 bytes; any non-string value is returned unchanged.
+ function toBytes(value) {
2226
+ return typeof value === "string" ? textEncoder.encode(value) : value;
2234
2227
  }
2235
- function chunkVTTCues(cues, maxTokens, overlapCues = 2) {
2236
- if (cues.length === 0)
2237
- return [];
2238
- const chunks = [];
2239
- let currentCues = [];
2240
- let currentTokens = 0;
2241
- let chunkIndex = 0;
2242
- for (let i = 0; i < cues.length; i++) {
2243
- const cue = cues[i];
2244
- const cueTokens = estimateTokenCount(cue.text);
2245
- if (currentTokens + cueTokens > maxTokens && currentCues.length > 0) {
2246
- chunks.push(createChunkFromCues(currentCues, chunkIndex));
2247
- chunkIndex++;
2248
- const overlapStart = Math.max(0, currentCues.length - overlapCues);
2249
- currentCues = currentCues.slice(overlapStart);
2250
- currentTokens = currentCues.reduce(
2251
- (sum, c) => sum + estimateTokenCount(c.text),
2252
- 0
2253
- );
2254
- }
2255
- currentCues.push(cue);
2256
- currentTokens += cueTokens;
2257
- }
2258
- if (currentCues.length > 0) {
2259
- chunks.push(createChunkFromCues(currentCues, chunkIndex));
2260
- }
2261
- return chunks;
2228
// Renders an iterable of byte values as a lowercase hex string, two digits per byte.
+ function bytesToHex(bytes) {
2229
+ return Array.from(bytes).map((byte) => byte.toString(16).padStart(2, "0")).join("");
2262
2230
  }
2263
- function scoreCueBoundary(cues, index, boundaryPauseSeconds) {
2264
- const cue = cues[index];
2265
- const nextCue = cues[index + 1];
2266
- if (!nextCue) {
2267
- return Number.POSITIVE_INFINITY;
2268
- }
2269
- const trimmedText = cue.text.trim();
2270
- let score = 0;
2271
- if (SENTENCE_BOUNDARY_REGEX.test(trimmedText)) {
2272
- score += 4;
2273
- } else if (CLAUSE_BOUNDARY_REGEX.test(trimmedText)) {
2274
- score += 2;
2275
- }
2276
- if (nextCue.startTime - cue.endTime >= boundaryPauseSeconds) {
2277
- score += 2;
2231
// Computes the SHA-256 digest of `value` (strings are UTF-8 encoded by toBytes) and
// resolves to the digest as a hex string.
+ async function sha256Hex(value) {
2232
+ const digest = await getCrypto().subtle.digest("SHA-256", toBytes(value));
2233
+ return bytesToHex(new Uint8Array(digest));
2234
+ }
2235
// HMAC-SHA256 of the string `value` under the raw key bytes `key`.
// The key is imported as a non-extractable, sign-only key; resolves to the signature
// as a Uint8Array so it can be fed back in as the key of the next HMAC round.
+ async function hmacSha256Raw(key, value) {
2236
+ const cryptoKey = await getCrypto().subtle.importKey(
2237
+ "raw",
2238
+ key,
2239
+ { name: "HMAC", hash: "SHA-256" },
2240
+ false,
2241
+ ["sign"]
2242
+ );
2243
+ const signature = await getCrypto().subtle.sign("HMAC", cryptoKey, textEncoder.encode(value));
2244
+ return new Uint8Array(signature);
2245
+ }
2246
// Derives the signing key by chaining four HMAC rounds: "AWS4" + secret over the short
// date, then the region, then AWS4_SERVICE, then AWS4_REQUEST_TERMINATOR.
// Resolves to the final key bytes.
+ async function deriveSigningKey(secretAccessKey, shortDate, region) {
2247
+ const kDate = await hmacSha256Raw(textEncoder.encode(`AWS4${secretAccessKey}`), shortDate);
2248
+ const kRegion = await hmacSha256Raw(kDate, region);
2249
+ const kService = await hmacSha256Raw(kRegion, AWS4_SERVICE);
2250
+ return hmacSha256Raw(kService, AWS4_REQUEST_TERMINATOR);
2251
+ }
2252
// Formats a Date (default: now) from its ISO-8601 UTC string into the two timestamp
// shapes used for signing: amzDate "YYYYMMDDTHHMMSSZ" and shortDate "YYYYMMDD".
+ function formatAmzDate(date = /* @__PURE__ */ new Date()) {
2253
+ const iso = date.toISOString();
2254
+ const shortDate = iso.slice(0, 10).replace(/-/g, "");
2255
+ const amzDate = `${iso.slice(0, 19).replace(/[-:]/g, "")}Z`;
2256
+ return { amzDate, shortDate };
2257
+ }
2258
// encodeURIComponent plus percent-encoding of the characters it leaves alone: ! ' ( ) *
// (each becomes an uppercase-hex escape such as %21).
+ function encodeRFC3986(value) {
2259
+ return encodeURIComponent(value).replace(/[!'()*]/g, (char) => `%${char.charCodeAt(0).toString(16).toUpperCase()}`);
2260
+ }
2261
// Encodes each "/"-separated segment with encodeRFC3986 while keeping the slashes themselves.
+ function encodePath(path) {
2262
+ return path.split("/").map((segment) => encodeRFC3986(segment)).join("/");
2263
+ }
2264
// Parses `endpoint` into a URL and validates it: throws on unparsable input, on any query
// string or hash fragment, and on whatever enforceEndpointPolicy rejects. Returns the URL.
// Rows prefixed with "-" inside this span are removed scoreCueBoundary rows from the old version.
+ function normalizeEndpoint(endpoint) {
2265
+ let url;
2266
+ try {
2267
+ url = new URL(endpoint);
2268
+ } catch {
2269
+ throw new Error(`Invalid S3 endpoint: ${endpoint}`);
2278
2270
  }
2279
- if (NEXT_SENTENCE_START_REGEX.test(nextCue.text.trim())) {
2280
- score += 1;
2271
+ if (url.search || url.hash) {
2272
+ throw new Error("S3 endpoint must not include query params or hash fragments.");
2281
2273
  }
2282
- return score;
2274
+ enforceEndpointPolicy(url);
2275
+ return url;
2283
2276
  }
2284
- function chunkVTTCuesByBudget(cues, options) {
2285
- if (cues.length === 0) {
2277
// Splits a comma-separated allowlist string into trimmed, lowercased, non-empty patterns.
// A falsy allowlist yields an empty array.
// Rows prefixed with "-" inside this span are removed chunkVTTCuesByBudget rows from the old version.
+ function parseEndpointAllowlist(allowlist) {
2278
+ if (!allowlist) {
2286
2279
  return [];
2287
2280
  }
2288
- const maxCuesPerChunk = Math.max(1, options.maxCuesPerChunk);
2289
- let maxTextTokensPerChunk = Number.POSITIVE_INFINITY;
2290
- if (options.maxTextTokensPerChunk) {
2291
- maxTextTokensPerChunk = Math.max(1, options.maxTextTokensPerChunk);
2292
- }
2293
- const chunks = [];
2294
- let chunkIndex = 0;
2295
- let cueStartIndex = 0;
2296
- let currentTokenCount = 0;
2297
- for (let cueIndex = 0; cueIndex < cues.length; cueIndex++) {
2298
- const cue = cues[cueIndex];
2299
- const cueTokenCount = estimateTokenCount(cue.text);
2300
- const currentCueCount = cueIndex - cueStartIndex;
2301
- const wouldExceedCueCount = currentCueCount >= maxCuesPerChunk;
2302
- const wouldExceedTokenCount = currentCueCount > 0 && currentTokenCount + cueTokenCount > maxTextTokensPerChunk;
2303
- if (wouldExceedCueCount || wouldExceedTokenCount) {
2304
- chunks.push({
2305
- id: `chunk-${chunkIndex}`,
2306
- cueStartIndex,
2307
- cueEndIndex: cueIndex - 1,
2308
- cueCount: cueIndex - cueStartIndex,
2309
- startTime: cues[cueStartIndex].startTime,
2310
- endTime: cues[cueIndex - 1].endTime
2311
- });
2312
- cueStartIndex = cueIndex;
2313
- currentTokenCount = 0;
2314
- chunkIndex++;
2315
- }
2316
- currentTokenCount += cueTokenCount;
2281
+ return allowlist.split(",").map((value) => value.trim().toLowerCase()).filter(Boolean);
2282
+ }
2283
// Tests a hostname against one allowlist pattern.
// "*.example.com" matches any hostname ending in ".example.com" that has at least one
// character before that suffix (so the bare "example.com" does not match).
// Any other pattern must equal the hostname exactly.
+ function hostnameMatchesPattern(hostname, pattern) {
2284
+ if (pattern.startsWith("*.")) {
2285
// slice(1) drops only the "*", keeping the leading dot in the suffix.
+ const suffix = pattern.slice(1);
2286
+ return hostname.endsWith(suffix) && hostname.length > suffix.length;
2317
2287
  }
2318
- chunks.push({
2319
- id: `chunk-${chunkIndex}`,
2320
- cueStartIndex,
2321
- cueEndIndex: cues.length - 1,
2322
- cueCount: cues.length - cueStartIndex,
2323
- startTime: cues[cueStartIndex].startTime,
2324
- endTime: cues[cues.length - 1].endTime
2325
- });
2326
- return chunks;
2288
+ return hostname === pattern;
2327
2289
  }
2328
- function chunkVTTCuesByDuration(cues, options) {
2329
- if (cues.length === 0) {
2330
- return [];
2290
// Validates a parsed endpoint URL: the protocol must be "https:", and when
// S3_ALLOWED_ENDPOINT_PATTERNS is non-empty the lowercased hostname must match at least
// one pattern. Throws on violation; returns nothing.
// Rows prefixed with "-" inside this span are the removed chunkVTTCuesByDuration body from the old version.
+ function enforceEndpointPolicy(url) {
2291
+ const hostname = url.hostname.toLowerCase();
2292
+ if (url.protocol !== "https:") {
2293
+ throw new Error(
2294
+ `Insecure S3 endpoint protocol "${url.protocol}" is not allowed. Use HTTPS.`
2295
+ );
2331
2296
  }
2332
- const targetChunkDurationSeconds = Math.max(1, options.targetChunkDurationSeconds);
2333
- const maxChunkDurationSeconds = Math.max(targetChunkDurationSeconds, options.maxChunkDurationSeconds);
2334
- const minChunkDurationSeconds = Math.min(
2335
- targetChunkDurationSeconds,
2336
- Math.max(
2337
- 1,
2338
- options.minChunkDurationSeconds ?? Math.floor(targetChunkDurationSeconds * DEFAULT_MIN_CHUNK_DURATION_RATIO)
2339
- )
2340
- );
2341
- const boundaryLookaheadCues = Math.max(1, options.boundaryLookaheadCues ?? DEFAULT_BOUNDARY_LOOKAHEAD_CUES);
2342
- const boundaryPauseSeconds = options.boundaryPauseSeconds ?? DEFAULT_BOUNDARY_PAUSE_SECONDS;
2343
- const preferredBoundaryStartSeconds = Math.max(
2344
- minChunkDurationSeconds,
2345
- targetChunkDurationSeconds - Math.min(PREFERRED_BOUNDARY_WINDOW_SECONDS, targetChunkDurationSeconds / 6)
2346
- );
2347
- const chunks = [];
2348
- let chunkIndex = 0;
2349
- let cueStartIndex = 0;
2350
- while (cueStartIndex < cues.length) {
2351
- const chunkStartTime = cues[cueStartIndex].startTime;
2352
- let cueEndIndex = cueStartIndex;
2353
- let bestBoundaryIndex = -1;
2354
- let bestBoundaryScore = -1;
2355
- let bestPreferredBoundaryIndex = -1;
2356
- let bestPreferredBoundaryScore = -1;
2357
- while (cueEndIndex < cues.length) {
2358
- const cue = cues[cueEndIndex];
2359
- const currentDuration = cue.endTime - chunkStartTime;
2360
- if (currentDuration >= minChunkDurationSeconds) {
2361
- const boundaryScore = scoreCueBoundary(cues, cueEndIndex, boundaryPauseSeconds);
2362
- if (boundaryScore >= bestBoundaryScore) {
2363
- bestBoundaryIndex = cueEndIndex;
2364
- bestBoundaryScore = boundaryScore;
2365
- }
2366
- if (currentDuration >= preferredBoundaryStartSeconds && boundaryScore >= bestPreferredBoundaryScore) {
2367
- bestPreferredBoundaryIndex = cueEndIndex;
2368
- bestPreferredBoundaryScore = boundaryScore;
2369
- }
2370
- }
2371
- const nextCue = cues[cueEndIndex + 1];
2372
- if (!nextCue) {
2373
- break;
2374
- }
2375
- const nextDuration = nextCue.endTime - chunkStartTime;
2376
- const lookaheadExceeded = cueEndIndex - cueStartIndex >= boundaryLookaheadCues;
2377
- const preferredBoundaryIndex = bestPreferredBoundaryIndex >= cueStartIndex ? bestPreferredBoundaryIndex : bestBoundaryIndex;
2378
- const preferredBoundaryScore = bestPreferredBoundaryIndex >= cueStartIndex ? bestPreferredBoundaryScore : bestBoundaryScore;
2379
- if (currentDuration >= targetChunkDurationSeconds) {
2380
- if (preferredBoundaryIndex >= cueStartIndex && preferredBoundaryScore >= STRONG_BOUNDARY_SCORE) {
2381
- cueEndIndex = preferredBoundaryIndex;
2382
- break;
2383
- }
2384
- if (nextDuration > maxChunkDurationSeconds || lookaheadExceeded) {
2385
- cueEndIndex = preferredBoundaryIndex >= cueStartIndex ? preferredBoundaryIndex : cueEndIndex;
2386
- break;
2387
- }
2388
- }
2389
- if (nextDuration > maxChunkDurationSeconds) {
2390
- cueEndIndex = preferredBoundaryIndex >= cueStartIndex ? preferredBoundaryIndex : cueEndIndex;
2391
- break;
2392
- }
2393
- cueEndIndex++;
2394
- }
2395
- chunks.push({
2396
- id: `chunk-${chunkIndex}`,
2397
- cueStartIndex,
2398
- cueEndIndex,
2399
- cueCount: cueEndIndex - cueStartIndex + 1,
2400
- startTime: cues[cueStartIndex].startTime,
2401
- endTime: cues[cueEndIndex].endTime
2402
- });
2403
- cueStartIndex = cueEndIndex + 1;
2404
- chunkIndex++;
2297
+ if (S3_ALLOWED_ENDPOINT_PATTERNS.length > 0 && !S3_ALLOWED_ENDPOINT_PATTERNS.some((pattern) => hostnameMatchesPattern(hostname, pattern))) {
2298
+ throw new Error(
2299
+ `S3 endpoint host "${hostname}" is not in S3_ALLOWED_ENDPOINT_HOSTS.`
2300
+ );
2405
2301
  }
2406
- return chunks;
2407
2302
  }
2408
- function chunkText(text, strategy) {
2409
- switch (strategy.type) {
2410
- case "token": {
2411
- return chunkByTokens(text, strategy.maxTokens, strategy.overlap ?? 0);
2412
- }
2413
- default: {
2414
- const exhaustiveCheck = strategy;
2415
- throw new Error(`Unsupported chunking strategy: ${exhaustiveCheck}`);
2416
- }
2417
- }
2303
/**
 * Builds the path-style canonical URI `<endpoint path>/<bucket>/<key>`.
 *
 * Every path segment is percent-encoded exactly once. `URL.pathname` is already
 * percent-encoded by the URL parser, so endpoint segments are decoded before being
 * re-encoded; otherwise "%20" would turn into "%2520" and address a different path.
 *
 * @param {URL} endpoint - Parsed endpoint URL; trailing slashes of its path are ignored.
 * @param {string} bucket - Bucket name (encoded as a single segment).
 * @param {string} key - Object key; "/" separators are preserved.
 * @returns {string} Encoded canonical URI starting with "/".
 */
function buildCanonicalUri(endpoint, bucket, key) {
  // Malformed escapes (e.g. "%zz") are kept verbatim instead of throwing.
  const decodeSegment = (segment) => {
    try {
      return decodeURIComponent(segment);
    } catch {
      return segment;
    }
  };
  const trimmedPath = endpoint.pathname.replace(/\/+$/, "");
  const endpointPath = trimmedPath
    .split("/")
    .map((segment) => encodeRFC3986(decodeSegment(segment)))
    .join("/");
  const encodedBucket = encodeRFC3986(bucket);
  const encodedKey = encodePath(key);
  return `${endpointPath}/${encodedBucket}/${encodedKey}`;
}
2419
-
2420
- // src/workflows/embeddings.ts
2421
- function averageEmbeddings(embeddings) {
2422
- if (embeddings.length === 0) {
2423
- return [];
2424
- }
2425
- const dimensions = embeddings[0].length;
2426
- const averaged = Array.from({ length: dimensions }, () => 0);
2427
- for (const embedding of embeddings) {
2428
- for (let i = 0; i < dimensions; i++) {
2429
- averaged[i] += embedding[i];
2430
- }
2431
- }
2432
- for (let i = 0; i < dimensions; i++) {
2433
- averaged[i] /= embeddings.length;
2309
/**
 * Builds a SigV4 canonical query string from a flat params object.
 *
 * Names and values are RFC 3986 encoded, then pairs are sorted by the encoded name in
 * code-point order. A locale-aware comparison is not used because it can order
 * mixed-case names differently from the order the signature calculation requires.
 *
 * @param {Record<string, string>} params - Query parameters (one value per name).
 * @returns {string} `name=value` pairs joined with "&"; empty string for no params.
 */
function buildCanonicalQuery(params) {
  return Object.entries(params)
    .map(([key, value]) => [encodeRFC3986(key), encodeRFC3986(value)])
    // Encoded names are ASCII, so code-unit comparison equals byte order.
    .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0))
    .map(([key, value]) => `${key}=${value}`)
    .join("&");
}
2312
// Derives the signing key for (secret, shortDate, region), HMACs `value` with it, and
// resolves to the signature as a hex string.
+ async function signString(secretAccessKey, shortDate, region, value) {
2313
+ const signingKey = await deriveSigningKey(secretAccessKey, shortDate, region);
2314
+ const signatureBytes = await hmacSha256Raw(signingKey, value);
2315
+ return bytesToHex(signatureBytes);
2316
+ }
2317
// Returns the credential scope string "<shortDate>/<region>/<service>/<terminator>".
+ function buildCredentialScope(shortDate, region) {
2318
+ return `${shortDate}/${region}/${AWS4_SERVICE}/${AWS4_REQUEST_TERMINATOR}`;
2319
+ }
2320
// Uploads `body` with a header-signed (Authorization header) PUT to
// <endpoint origin>/<endpoint path>/<bucket>/<key>.
// Signed headers are host, x-amz-content-sha256, x-amz-date and, when a non-empty trimmed
// contentType is given, content-type. The payload hash is the SHA-256 of `body`.
// Throws when the endpoint fails normalizeEndpoint or when the response is not OK; the
// error then includes the status and any response body text. Resolves to undefined.
+ async function putObjectToS3({
2321
+ accessKeyId,
2322
+ secretAccessKey,
2323
+ endpoint,
2324
+ region,
2325
+ bucket,
2326
+ key,
2327
+ body,
2328
+ contentType
2329
+ }) {
2330
+ const resolvedEndpoint = normalizeEndpoint(endpoint);
2331
+ const canonicalUri = buildCanonicalUri(resolvedEndpoint, bucket, key);
2332
+ const host = resolvedEndpoint.host;
2333
+ const normalizedContentType = contentType?.trim();
2334
+ const { amzDate, shortDate } = formatAmzDate();
2335
+ const payloadHash = await sha256Hex(body);
2336
// Header list is sorted by name so canonicalHeaders and signedHeaders share one order.
+ const signingHeaders = [
2337
+ ["host", host],
2338
+ ["x-amz-content-sha256", payloadHash],
2339
+ ["x-amz-date", amzDate],
2340
+ ...normalizedContentType ? [["content-type", normalizedContentType]] : []
2341
+ ].sort(([a], [b]) => a.localeCompare(b));
2342
+ const canonicalHeaders = signingHeaders.map(([name, value]) => `${name}:${value}`).join("\n");
2343
+ const signedHeaders = signingHeaders.map(([name]) => name).join(";");
2344
// Canonical request parts: method, URI, (empty) query string, headers block, signed header names, payload hash.
+ const canonicalRequest = [
2345
+ "PUT",
2346
+ canonicalUri,
2347
+ "",
2348
+ `${canonicalHeaders}
2349
+ `,
2350
+ signedHeaders,
2351
+ payloadHash
2352
+ ].join("\n");
2353
+ const credentialScope = buildCredentialScope(shortDate, region);
2354
+ const stringToSign = [
2355
+ AWS4_ALGORITHM,
2356
+ amzDate,
2357
+ credentialScope,
2358
+ await sha256Hex(canonicalRequest)
2359
+ ].join("\n");
2360
+ const signature = await signString(secretAccessKey, shortDate, region, stringToSign);
2361
+ const authorization = `${AWS4_ALGORITHM} Credential=${accessKeyId}/${credentialScope}, SignedHeaders=${signedHeaders}, Signature=${signature}`;
2362
+ const requestUrl = `${resolvedEndpoint.origin}${canonicalUri}`;
2363
+ const response = await fetch(requestUrl, {
2364
+ method: "PUT",
2365
+ headers: {
2366
+ "Authorization": authorization,
2367
+ "x-amz-content-sha256": payloadHash,
2368
+ "x-amz-date": amzDate,
2369
+ ...normalizedContentType ? { "content-type": normalizedContentType } : {}
2370
+ },
2371
+ body
2372
+ });
2373
+ if (!response.ok) {
2374
// Reading the error body is best-effort; a failed read falls back to an empty detail.
+ const errorBody = await response.text().catch(() => "");
2375
+ const detail = errorBody ? ` ${errorBody}` : "";
2376
+ throw new Error(`S3 PUT failed (${response.status} ${response.statusText}).${detail}`);
2434
2377
  }
2435
- return averaged;
2436
2378
  }
2437
- async function generateSingleChunkEmbedding({
2438
- chunk,
2439
- provider,
2440
- modelId,
2441
- credentials
2379
/**
 * Creates a SigV4 query-string presigned GET URL for
 * `<endpoint>/<bucket>/<key>`, signing only the `host` header with an
 * UNSIGNED-PAYLOAD body hash.
 *
 * @param {object} input
 * @param {string} input.accessKeyId - Access key placed in X-Amz-Credential.
 * @param {string} input.secretAccessKey - Secret used to derive the signing key.
 * @param {string} input.endpoint - Endpoint URL; validated by normalizeEndpoint.
 * @param {string} input.region - Region used in the credential scope.
 * @param {string} input.bucket - Bucket name.
 * @param {string} input.key - Object key.
 * @param {number} [input.expiresInSeconds=3600] - Lifetime of the URL in seconds.
 * @returns {Promise<string>} The presigned URL.
 * @throws {Error} When expiresInSeconds is not a whole number between 1 and 604800,
 *   or when the endpoint is rejected by normalizeEndpoint.
 */
async function createPresignedGetUrl({
  accessKeyId,
  secretAccessKey,
  endpoint,
  region,
  bucket,
  key,
  expiresInSeconds = 3600
}) {
  // SigV4 presigned URLs accept only whole seconds up to 7 days; fail here rather than
  // hand out a URL that is rejected when it is eventually fetched.
  const maxExpirySeconds = 604800;
  const expires = Number(expiresInSeconds);
  if (!Number.isInteger(expires) || expires < 1 || expires > maxExpirySeconds) {
    throw new Error(
      `Invalid expiresInSeconds "${expiresInSeconds}": expected a whole number of seconds between 1 and ${maxExpirySeconds}.`
    );
  }
  const resolvedEndpoint = normalizeEndpoint(endpoint);
  const canonicalUri = buildCanonicalUri(resolvedEndpoint, bucket, key);
  const host = resolvedEndpoint.host;
  const { amzDate, shortDate } = formatAmzDate();
  const credentialScope = buildCredentialScope(shortDate, region);
  const signedHeaders = "host";
  const queryParams = {
    "X-Amz-Algorithm": AWS4_ALGORITHM,
    "X-Amz-Credential": `${accessKeyId}/${credentialScope}`,
    "X-Amz-Date": amzDate,
    "X-Amz-Expires": `${expires}`,
    "X-Amz-SignedHeaders": signedHeaders
  };
  const canonicalQuery = buildCanonicalQuery(queryParams);
  // Canonical request parts: method, URI, query, headers block, signed header names, payload hash.
  const canonicalRequest = [
    "GET",
    canonicalUri,
    canonicalQuery,
    `host:${host}\n`,
    signedHeaders,
    "UNSIGNED-PAYLOAD"
  ].join("\n");
  const stringToSign = [
    AWS4_ALGORITHM,
    amzDate,
    credentialScope,
    await sha256Hex(canonicalRequest)
  ].join("\n");
  const signature = await signString(secretAccessKey, shortDate, region, stringToSign);
  // The signature is appended last and is not part of the signed query string.
  const queryWithSignature = `${canonicalQuery}&X-Amz-Signature=${signature}`;
  return `${resolvedEndpoint.origin}${canonicalUri}?${queryWithSignature}`;
}
2461
- async function generateEmbeddingsInternal(assetId, options = {}) {
2462
- const {
2463
- provider = "openai",
2464
- model,
2465
- languageCode,
2466
- chunkingStrategy = { type: "token", maxTokens: 500, overlap: 100 },
2467
- batchSize = 5,
2468
- credentials
2469
- } = options;
2470
- const embeddingModel = resolveEmbeddingModelConfig({ ...options, provider, model });
2471
- const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, credentials);
2472
- const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
2473
- const isAudioOnly = isAudioOnlyAsset(assetData);
2474
- const signingContext = await resolveMuxSigningContext(credentials);
2475
- if (policy === "signed" && !signingContext) {
2421
+
2422
+ // src/lib/storage-adapter.ts
2423
// Returns { accessKeyId, secretAccessKey } when both are truthy; otherwise throws an error
// that names the S3_ACCESS_KEY_ID / S3_SECRET_ACCESS_KEY settings and options.storageAdapter.
// Rows prefixed with "-" inside this span are removed generateEmbeddingsInternal rows from the old version.
+ function requireCredentials(accessKeyId, secretAccessKey) {
2424
+ if (!accessKeyId || !secretAccessKey) {
2476
2425
  throw new Error(
2477
- "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
2426
+ "S3 credentials are required for default storage operations. Provide S3_ACCESS_KEY_ID and S3_SECRET_ACCESS_KEY or pass options.storageAdapter."
2478
2427
  );
2479
2428
  }
2480
- const readyTextTracks = getReadyTextTracks(assetData);
2481
- const useVttChunking = chunkingStrategy.type === "vtt";
2482
- let transcriptResult = await fetchTranscriptForAsset(assetData, playbackId, {
2483
- languageCode,
2484
- cleanTranscript: !useVttChunking,
2485
- shouldSign: policy === "signed",
2486
- credentials
2487
- });
2488
- if (isAudioOnly && !transcriptResult.track && readyTextTracks.length === 1) {
2489
- transcriptResult = await fetchTranscriptForAsset(assetData, playbackId, {
2490
- cleanTranscript: !useVttChunking,
2491
- shouldSign: policy === "signed",
2492
- credentials
2493
- });
2494
- }
2495
- if (!transcriptResult.track || !transcriptResult.transcriptText) {
2496
- const availableLanguages = readyTextTracks.map((t) => t.language_code).filter(Boolean).join(", ");
2497
- if (isAudioOnly) {
2498
- throw new Error(
2499
- `No transcript track found${languageCode ? ` for language '${languageCode}'` : ""}. Audio-only assets require a transcript. Available languages: ${availableLanguages || "none"}`
2500
- );
2429
+ return { accessKeyId, secretAccessKey };
2430
+ }
2431
// Stores an object through a custom adapter when one is supplied (adapter.putObject receives
// the whole input); otherwise checks the input's credentials with requireCredentials and
// uploads via putObjectToS3. Resolves to undefined in both paths.
+ async function putObjectWithStorageAdapter(input, adapter) {
2432
+ if (adapter) {
2433
+ await adapter.putObject(input);
2434
+ return;
2435
+ }
2436
+ const credentials = requireCredentials(input.accessKeyId, input.secretAccessKey);
2437
+ await putObjectToS3({
2438
+ accessKeyId: credentials.accessKeyId,
2439
+ secretAccessKey: credentials.secretAccessKey,
2440
+ endpoint: input.endpoint,
2441
+ region: input.region,
2442
+ bucket: input.bucket,
2443
+ key: input.key,
2444
+ body: input.body,
2445
+ contentType: input.contentType
2446
+ });
2447
+ }
2448
// Returns a presigned GET URL from the custom adapter when one is supplied
// (adapter.createPresignedGetUrl receives the whole input); otherwise checks the input's
// credentials with requireCredentials and delegates to createPresignedGetUrl.
+ async function createPresignedGetUrlWithStorageAdapter(input, adapter) {
2449
+ if (adapter) {
2450
+ return adapter.createPresignedGetUrl(input);
2451
+ }
2452
+ const credentials = requireCredentials(input.accessKeyId, input.secretAccessKey);
2453
+ return createPresignedGetUrl({
2454
+ accessKeyId: credentials.accessKeyId,
2455
+ secretAccessKey: credentials.secretAccessKey,
2456
+ endpoint: input.endpoint,
2457
+ region: input.region,
2458
+ bucket: input.bucket,
2459
+ key: input.key,
2460
+ expiresInSeconds: input.expiresInSeconds
2461
+ });
2462
+ }
2463
+
2464
+ // src/workflows/edit-captions.ts
2465
// Structured-output schema for the profanity detection call: an object with one field,
// `profanity`, an array of strings.
+ var profanityDetectionSchema = z5.object({
2466
+ profanity: z5.array(z5.string()).describe(
2467
+ "Unique profane words or short phrases exactly as they appear in the transcript text. Include each distinct form only once (e.g., if 'fuck' and 'fucking' both appear, list both)."
2468
+ )
2469
+ });
2470
// System prompt sent with the profanity detection request (see identifyProfanityWithAI).
+ var SYSTEM_PROMPT3 = dedent4`
2471
+ You are a content moderation assistant. Your task is to identify profane, vulgar, or obscene
2472
+ words and phrases in subtitle text. Return ONLY the exact profane words or phrases as they appear
2473
+ in the text. Do not modify, censor, or paraphrase them. Do not include words that are merely
2474
+ informal or slang but not profane. Focus on words that would be bleeped on broadcast television.`;
2475
// Applies `transform(line, cueStartTime)` to every cue text line of a VTT document and
// returns the rebuilt document. Works line by line on "\n"-split input:
//   - a line containing "-->" starts a cue; its start timestamp is converted with
//     vttTimestampToSeconds and the line itself is kept unchanged;
//   - a blank (whitespace-only) line ends the current cue;
//   - lines in between are cue text and are passed to `transform`;
//   - all other lines (anything before the first timing line or after a blank line) are kept as-is.
+ function transformCueText(rawVtt, transform) {
2476
+ const lines = rawVtt.split("\n");
2477
+ let inCueText = false;
2478
+ let currentCueStartTime = 0;
2479
+ const transformed = lines.map((line) => {
2480
+ if (line.includes("-->")) {
2481
+ const startTimestamp = line.split("-->")[0].trim();
2482
+ currentCueStartTime = vttTimestampToSeconds(startTimestamp);
2483
+ inCueText = true;
2484
+ return line;
2485
+ }
2486
+ if (line.trim() === "") {
2487
+ inCueText = false;
2488
+ return line;
2489
+ }
2490
+ if (inCueText) {
2491
+ return transform(line, currentCueStartTime);
2492
+ }
2493
+ return line;
2494
+ });
2495
+ return transformed.join("\n");
2496
+ }
2497
/**
 * Builds one case-insensitive, global regex that matches any of the given words or
 * phrases as whole terms.
 *
 * Term boundaries are Unicode-aware lookarounds rather than `\b`: `\b` only understands
 * ASCII word characters, so it never matches terms that begin or end with punctuation
 * (e.g. "a$$") and it wrongly matches inside words continued by a non-ASCII letter.
 *
 * @param {string[]} words - Terms to match; empty strings are ignored.
 * @returns {RegExp | null} The combined regex, or null when no usable term remains.
 */
function buildReplacementRegex(words) {
  const filtered = words.filter((w) => w.length > 0);
  if (filtered.length === 0) {
    return null;
  }
  // Longest first so "fucking" wins over "fuck" at the same position.
  filtered.sort((a, b) => b.length - a.length);
  const escaped = filtered.map((w) => w.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
  const pattern = escaped.join("|");
  const wordChar = "[\\p{L}\\p{M}\\p{N}_]";
  return new RegExp(`(?<!${wordChar})(?:${pattern})(?!${wordChar})`, "giu");
}
2506
/**
 * Returns the function that turns a matched term into its censored form.
 *
 * - "blank":  "[" + one underscore per matched character + "]"
 * - "remove": empty string
 * - "mask":   one "?" per matched character
 *
 * @param {"blank" | "remove" | "mask"} mode - Censoring style.
 * @returns {(match: string) => string} Replacement function for the given mode.
 * @throws {Error} When mode is not one of the supported values, instead of returning
 *   undefined and failing later with "replacer is not a function".
 */
function createReplacer(mode) {
  switch (mode) {
    case "blank":
      return (match) => `[${"_".repeat(match.length)}]`;
    case "remove":
      return () => "";
    case "mask":
      return (match) => "?".repeat(match.length);
    default:
      throw new Error(`Unsupported censor mode: ${String(mode)}`);
  }
}
2516
// Censors every occurrence of the given profanity terms inside cue text lines.
// Returns { censoredVtt, replacements }, where each replacement records the cue start time,
// the matched text (`before`) and what it was replaced with (`after`).
// When the term list is empty, or buildReplacementRegex yields no regex, the VTT is
// returned unchanged with an empty replacements list.
+ function censorVttContent(rawVtt, profanity, mode) {
2517
+ if (profanity.length === 0) {
2518
+ return { censoredVtt: rawVtt, replacements: [] };
2519
+ }
2520
+ const regex = buildReplacementRegex(profanity);
2521
+ if (!regex) {
2522
+ return { censoredVtt: rawVtt, replacements: [] };
2523
+ }
2524
+ const replacer = createReplacer(mode);
2525
+ const replacements = [];
2526
+ const censoredVtt = transformCueText(rawVtt, (line, cueStartTime) => {
2527
+ return line.replace(regex, (match) => {
2528
+ const after = replacer(match);
2529
+ replacements.push({ cueStartTime, before: match, after });
2530
+ return after;
2531
+ });
2532
+ });
2533
+ return { censoredVtt, replacements };
2534
+ }
2535
// Merges the detected terms with the override lists and returns a new array:
// alwaysCensor entries are appended unless already present (compared case-insensitively),
// then every term that matches a neverCensor entry (case-insensitively, whole-string) is dropped.
// neverCensor is applied last, so it also removes terms that alwaysCensor added.
+ function applyOverrideLists(detected, alwaysCensor, neverCensor) {
2536
+ const seen = new Set(detected.map((w) => w.toLowerCase()));
2537
+ const merged = [...detected];
2538
+ for (const word of alwaysCensor) {
2539
+ const lower = word.toLowerCase();
2540
+ if (!seen.has(lower)) {
2541
+ seen.add(lower);
2542
+ merged.push(word);
2543
+ }
2544
+ }
2545
+ const neverSet = new Set(neverCensor.map((w) => w.toLowerCase()));
2546
+ return merged.filter((w) => !neverSet.has(w.toLowerCase()));
2547
+ }
2548
/**
 * Applies case-sensitive whole-term find/replace rules to the cue text of a VTT document.
 *
 * Rules run in the order given, each over the output of the previous one. Term
 * boundaries are Unicode-aware lookarounds rather than `\b`, so terms that begin or end
 * with punctuation (e.g. "C++") are matched and terms are not matched inside words
 * continued by a non-ASCII letter. Each rule's regex is compiled once, not per cue line.
 *
 * @param {string} rawVtt - Source VTT text.
 * @param {{ find: string, replace: string }[]} replacements - Rules; empty `find` is ignored.
 * @returns {{ editedVtt: string, replacements: { cueStartTime: number, before: string, after: string }[] }}
 *   The edited VTT and one record per replacement performed.
 */
function applyReplacements(rawVtt, replacements) {
  const wordChar = "[\\p{L}\\p{M}\\p{N}_]";
  const rules = replacements
    .filter((r) => r.find.length > 0)
    .map(({ find, replace }) => {
      const escaped = find.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
      return {
        regex: new RegExp(`(?<!${wordChar})${escaped}(?!${wordChar})`, "gu"),
        replace
      };
    });
  if (rules.length === 0) {
    return { editedVtt: rawVtt, replacements: [] };
  }
  const records = [];
  const editedVtt = transformCueText(rawVtt, (line, cueStartTime) => {
    let result = line;
    for (const { regex, replace } of rules) {
      // A function replacer keeps "$" sequences in `replace` literal.
      result = result.replace(regex, (match) => {
        records.push({ cueStartTime, before: match, after: replace });
        return replace;
      });
    }
    return result;
  });
  return { editedVtt, replacements: records };
}
2568
// Asks a language model to list the profane words/phrases found in `plainText`.
// Builds the model with createLanguageModelFromConfig, requests structured output that
// follows profanityDetectionSchema, and sends SYSTEM_PROMPT3 plus a user message that wraps
// the transcript in <transcript> tags.
// Resolves to { profanity, usage }, where usage copies the token counters reported on the response.
+ async function identifyProfanityWithAI({
2569
+ plainText,
2570
+ provider,
2571
+ modelId,
2572
+ credentials
2573
+ }) {
2574
+ "use step";
2575
+ const model = await createLanguageModelFromConfig(provider, modelId, credentials);
2576
+ const response = await generateText4({
2577
+ model,
2578
+ output: Output4.object({ schema: profanityDetectionSchema }),
2579
+ messages: [
2580
+ {
2581
+ role: "system",
2582
+ content: SYSTEM_PROMPT3
2583
+ },
2584
+ {
2585
+ role: "user",
2586
+ content: `Identify all profane words and phrases in the following subtitle transcript. Return each unique profane word or phrase exactly as it appears in the text.
2587
+
2588
+ <transcript>
2589
+ ${plainText}
2590
+ </transcript>`
2591
+ }
2592
+ ]
2593
+ });
2594
+ return {
2595
+ profanity: response.output.profanity,
2596
+ usage: {
2597
+ inputTokens: response.usage.inputTokens,
2598
+ outputTokens: response.usage.outputTokens,
2599
+ totalTokens: response.usage.totalTokens,
2600
+ reasoningTokens: response.usage.reasoningTokens,
2601
+ cachedInputTokens: response.usage.cachedInputTokens
2501
2602
  }
2603
+ };
2604
+ }
2605
// Uploads the edited VTT to storage and resolves to a presigned GET URL for it.
// The object key is "edited/<assetId>/<trackId>-edited-<Date.now()>.vtt" and the upload uses
// content type "text/vtt". Credentials are read from env_default.S3_ACCESS_KEY_ID and
// env_default.S3_SECRET_ACCESS_KEY; both calls go through storageAdapter when one is given.
// The URL lifetime is s3SignedUrlExpirySeconds, falling back to 86400 seconds when nullish.
+ async function uploadEditedVttToS3({
2606
+ editedVtt,
2607
+ assetId,
2608
+ trackId,
2609
+ s3Endpoint,
2610
+ s3Region,
2611
+ s3Bucket,
2612
+ storageAdapter,
2613
+ s3SignedUrlExpirySeconds
2614
+ }) {
2615
+ "use step";
2616
+ const s3AccessKeyId = env_default.S3_ACCESS_KEY_ID;
2617
+ const s3SecretAccessKey = env_default.S3_SECRET_ACCESS_KEY;
2618
+ const vttKey = `edited/${assetId}/${trackId}-edited-${Date.now()}.vtt`;
2619
+ await putObjectWithStorageAdapter({
2620
+ accessKeyId: s3AccessKeyId,
2621
+ secretAccessKey: s3SecretAccessKey,
2622
+ endpoint: s3Endpoint,
2623
+ region: s3Region,
2624
+ bucket: s3Bucket,
2625
+ key: vttKey,
2626
+ body: editedVtt,
2627
+ contentType: "text/vtt"
2628
+ }, storageAdapter);
2629
+ return createPresignedGetUrlWithStorageAdapter({
2630
+ accessKeyId: s3AccessKeyId,
2631
+ secretAccessKey: s3SecretAccessKey,
2632
+ endpoint: s3Endpoint,
2633
+ region: s3Region,
2634
+ bucket: s3Bucket,
2635
+ key: vttKey,
2636
+ expiresInSeconds: s3SignedUrlExpirySeconds ?? 86400
2637
+ }, storageAdapter);
2638
+ }
2639
// Deletes the given track from the asset via mux.video.assets.deleteTrack.
// Resolves to undefined; any error from the client call propagates to the caller.
+ async function deleteTrackOnMux(assetId, trackId, credentials) {
2640
+ "use step";
2641
+ const muxClient = await resolveMuxClient(credentials);
2642
+ const mux = await muxClient.createClient();
2643
+ await mux.video.assets.deleteTrack(assetId, trackId);
2644
+ }
2645
+ async function editCaptions(assetId, trackId, options) {
2646
+ "use workflow";
2647
+ const {
2648
+ provider,
2649
+ model,
2650
+ autoCensorProfanity: autoCensorOption,
2651
+ replacements: replacementsOption,
2652
+ deleteOriginalTrack,
2653
+ uploadToMux: uploadToMuxOption,
2654
+ s3Endpoint: providedS3Endpoint,
2655
+ s3Region: providedS3Region,
2656
+ s3Bucket: providedS3Bucket,
2657
+ trackNameSuffix,
2658
+ storageAdapter,
2659
+ credentials
2660
+ } = options;
2661
+ const hasAutoCensor = !!autoCensorOption;
2662
+ const hasReplacements = !!replacementsOption && replacementsOption.length > 0;
2663
+ if (!hasAutoCensor && !hasReplacements) {
2664
+ throw new Error("At least one of autoCensorProfanity or replacements must be provided.");
2665
+ }
2666
+ if (autoCensorOption && !provider) {
2667
+ throw new Error("provider is required when using autoCensorProfanity.");
2668
+ }
2669
+ const deleteOriginal = deleteOriginalTrack !== false;
2670
+ const uploadToMux = uploadToMuxOption !== false;
2671
+ const s3Endpoint = providedS3Endpoint ?? env_default.S3_ENDPOINT;
2672
+ const s3Region = providedS3Region ?? env_default.S3_REGION ?? "auto";
2673
+ const s3Bucket = providedS3Bucket ?? env_default.S3_BUCKET;
2674
+ const s3AccessKeyId = env_default.S3_ACCESS_KEY_ID;
2675
+ const s3SecretAccessKey = env_default.S3_SECRET_ACCESS_KEY;
2676
+ if (uploadToMux && (!s3Endpoint || !s3Bucket || !storageAdapter && (!s3AccessKeyId || !s3SecretAccessKey))) {
2502
2677
  throw new Error(
2503
- `No caption track found${languageCode ? ` for language '${languageCode}'` : ""}. Available languages: ${availableLanguages || "none"}`
2678
+ "Storage configuration is required for uploading to Mux. Provide s3Endpoint and s3Bucket. If no storageAdapter is supplied, also provide s3AccessKeyId and s3SecretAccessKey in options or set S3_ENDPOINT, S3_BUCKET, S3_ACCESS_KEY_ID, and S3_SECRET_ACCESS_KEY environment variables."
2504
2679
  );
2505
2680
  }
2506
- const transcriptText = transcriptResult.transcriptText;
2507
- if (!transcriptText.trim()) {
2508
- throw new Error("Transcript is empty");
2681
+ const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, credentials);
2682
+ const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
2683
+ const signingContext = await resolveMuxSigningContext(credentials);
2684
+ if (policy === "signed" && !signingContext) {
2685
+ throw new Error(
2686
+ "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
2687
+ );
2509
2688
  }
2510
- const chunks = useVttChunking ? chunkVTTCues(
2511
- parseVTTCues(transcriptText),
2512
- chunkingStrategy.maxTokens,
2513
- chunkingStrategy.overlapCues
2514
- ) : chunkText(transcriptText, chunkingStrategy);
2515
- if (chunks.length === 0) {
2516
- throw new Error("No chunks generated from transcript");
2689
+ const readyTextTracks = getReadyTextTracks(assetData);
2690
+ const sourceTrack = readyTextTracks.find((t) => t.id === trackId);
2691
+ if (!sourceTrack) {
2692
+ const availableTrackIds = readyTextTracks.map((t) => t.id).filter(Boolean).join(", ");
2693
+ throw new Error(
2694
+ `Track '${trackId}' not found or not ready on asset '${assetId}'. Available track IDs: ${availableTrackIds || "none"}`
2695
+ );
2517
2696
  }
2518
- const chunkEmbeddings = [];
2697
+ const vttUrl = await buildTranscriptUrl(playbackId, trackId, policy === "signed", credentials);
2698
+ let vttContent;
2519
2699
  try {
2520
- for (let i = 0; i < chunks.length; i += batchSize) {
2521
- const batch = chunks.slice(i, i + batchSize);
2522
- const batchResults = await Promise.all(
2523
- batch.map(
2524
- (chunk) => generateSingleChunkEmbedding({
2525
- chunk,
2526
- provider: embeddingModel.provider,
2527
- modelId: embeddingModel.modelId,
2528
- credentials
2529
- })
2530
- )
2531
- );
2532
- chunkEmbeddings.push(...batchResults);
2700
+ vttContent = await fetchVttFromMux(vttUrl);
2701
+ } catch (error) {
2702
+ throw new Error(`Failed to fetch VTT content: ${error instanceof Error ? error.message : "Unknown error"}`);
2703
+ }
2704
+ let editedVtt = vttContent;
2705
+ let totalReplacementCount = 0;
2706
+ let autoCensorResult;
2707
+ let usage;
2708
+ if (autoCensorOption) {
2709
+ const { mode = "blank", alwaysCensor = [], neverCensor = [] } = autoCensorOption;
2710
+ const plainText = extractTextFromVTT(vttContent);
2711
+ if (!plainText.trim()) {
2712
+ throw new Error("Track transcript is empty; nothing to censor.");
2533
2713
  }
2714
+ const modelConfig = resolveLanguageModelConfig({
2715
+ ...options,
2716
+ provider,
2717
+ model
2718
+ });
2719
+ let detectedProfanity;
2720
+ try {
2721
+ const result = await identifyProfanityWithAI({
2722
+ plainText,
2723
+ provider: modelConfig.provider,
2724
+ modelId: modelConfig.modelId,
2725
+ credentials
2726
+ });
2727
+ detectedProfanity = result.profanity;
2728
+ usage = result.usage;
2729
+ } catch (error) {
2730
+ throw new Error(`Failed to detect profanity with ${modelConfig.provider}: ${error instanceof Error ? error.message : "Unknown error"}`);
2731
+ }
2732
+ const finalProfanity = applyOverrideLists(detectedProfanity, alwaysCensor, neverCensor);
2733
+ const { censoredVtt, replacements: censorReplacements } = censorVttContent(editedVtt, finalProfanity, mode);
2734
+ editedVtt = censoredVtt;
2735
+ totalReplacementCount += censorReplacements.length;
2736
+ autoCensorResult = { replacements: censorReplacements };
2737
+ }
2738
+ let replacementsResult;
2739
+ if (replacementsOption && replacementsOption.length > 0) {
2740
+ const { editedVtt: afterReplacements, replacements: staticReplacements } = applyReplacements(editedVtt, replacementsOption);
2741
+ editedVtt = afterReplacements;
2742
+ totalReplacementCount += staticReplacements.length;
2743
+ replacementsResult = { replacements: staticReplacements };
2744
+ }
2745
+ const usageWithMetadata = usage ? {
2746
+ ...usage,
2747
+ metadata: {
2748
+ assetDurationSeconds
2749
+ }
2750
+ } : void 0;
2751
+ if (!uploadToMux) {
2752
+ return {
2753
+ assetId,
2754
+ trackId,
2755
+ originalVtt: vttContent,
2756
+ editedVtt,
2757
+ totalReplacementCount,
2758
+ autoCensorProfanity: autoCensorResult,
2759
+ replacements: replacementsResult,
2760
+ usage: usageWithMetadata
2761
+ };
2762
+ }
2763
+ let presignedUrl;
2764
+ try {
2765
+ presignedUrl = await uploadEditedVttToS3({
2766
+ editedVtt,
2767
+ assetId,
2768
+ trackId,
2769
+ s3Endpoint,
2770
+ s3Region,
2771
+ s3Bucket,
2772
+ storageAdapter,
2773
+ s3SignedUrlExpirySeconds: options.s3SignedUrlExpirySeconds
2774
+ });
2534
2775
  } catch (error) {
2535
- throw new Error(
2536
- `Failed to generate embeddings with ${provider}: ${error instanceof Error ? error.message : "Unknown error"}`
2776
+ throw new Error(`Failed to upload VTT to S3: ${error instanceof Error ? error.message : "Unknown error"}`);
2777
+ }
2778
+ let uploadedTrackId;
2779
+ try {
2780
+ const languageCode = sourceTrack.language_code || "en";
2781
+ const suffix = trackNameSuffix ?? "edited";
2782
+ const trackName = `${sourceTrack.name || "Subtitles"} (${suffix})`;
2783
+ uploadedTrackId = await createTextTrackOnMux(
2784
+ assetId,
2785
+ languageCode,
2786
+ trackName,
2787
+ presignedUrl,
2788
+ credentials
2537
2789
  );
2790
+ } catch (error) {
2791
+ console.warn(`Failed to add track to Mux asset: ${error instanceof Error ? error.message : "Unknown error"}`);
2538
2792
  }
2539
- if (chunkEmbeddings.length === 0) {
2540
- throw new Error("No embeddings generated");
2793
+ if (deleteOriginal && uploadedTrackId) {
2794
+ try {
2795
+ await deleteTrackOnMux(assetId, trackId, credentials);
2796
+ } catch (error) {
2797
+ console.warn(`Failed to delete original track: ${error instanceof Error ? error.message : "Unknown error"}`);
2798
+ }
2541
2799
  }
2542
- const averagedEmbedding = averageEmbeddings(chunkEmbeddings.map((ce) => ce.embedding));
2543
- const totalTokens = chunks.reduce((sum, chunk) => sum + chunk.tokenCount, 0);
2544
2800
  return {
2545
2801
  assetId,
2546
- chunks: chunkEmbeddings,
2547
- averagedEmbedding,
2548
- provider,
2549
- model: embeddingModel.modelId,
2550
- metadata: {
2551
- totalChunks: chunks.length,
2552
- totalTokens,
2553
- chunkingStrategy: JSON.stringify(chunkingStrategy),
2554
- embeddingDimensions: chunkEmbeddings[0].embedding.length,
2555
- generatedAt: (/* @__PURE__ */ new Date()).toISOString()
2556
- },
2557
- usage: {
2558
- metadata: {
2559
- assetDurationSeconds
2560
- }
2561
- }
2802
+ trackId,
2803
+ originalVtt: vttContent,
2804
+ editedVtt,
2805
+ totalReplacementCount,
2806
+ autoCensorProfanity: autoCensorResult,
2807
+ replacements: replacementsResult,
2808
+ uploadedTrackId,
2809
+ presignedUrl,
2810
+ usage: usageWithMetadata
2562
2811
  };
2563
2812
  }
2564
- async function generateEmbeddings(assetId, options = {}) {
2565
- "use workflow";
2566
- return generateEmbeddingsInternal(assetId, options);
2567
- }
2568
- async function generateVideoEmbeddings(assetId, options = {}) {
2569
- "use workflow";
2570
- console.warn("generateVideoEmbeddings is deprecated. Use generateEmbeddings instead.");
2571
- return generateEmbeddingsInternal(assetId, options);
2572
- }
2573
2813
 
2574
- // src/lib/sampling-plan.ts
2575
- var DEFAULT_FPS = 30;
2576
- function roundToNearestFrameMs(tsMs, fps = DEFAULT_FPS) {
2577
- const frameMs = 1e3 / fps;
2578
- return Math.round(Math.round(tsMs / frameMs) * frameMs * 100) / 100;
2814
+ // src/workflows/embeddings.ts
2815
+ import { embed } from "ai";
2816
+
2817
+ // src/primitives/text-chunking.ts
2818
+ var DEFAULT_MIN_CHUNK_DURATION_RATIO = 2 / 3;
2819
+ var DEFAULT_BOUNDARY_LOOKAHEAD_CUES = 12;
2820
+ var DEFAULT_BOUNDARY_PAUSE_SECONDS = 1.25;
2821
+ var STRONG_BOUNDARY_SCORE = 4;
2822
+ var PREFERRED_BOUNDARY_WINDOW_SECONDS = 5 * 60;
2823
+ var SENTENCE_BOUNDARY_REGEX = /[.!?]["')\]]*$/;
2824
+ var CLAUSE_BOUNDARY_REGEX = /[,;:]["')\]]*$/;
2825
+ var NEXT_SENTENCE_START_REGEX = /^[A-Z0-9"'([{]/;
2826
+ function estimateTokenCount(text) {
2827
+ const words = text.trim().split(/\s+/).length;
2828
+ return Math.ceil(words / 0.75);
2579
2829
  }
2580
- function planSamplingTimestamps(options) {
2581
- const DEFAULT_MIN_CANDIDATES = 10;
2582
- const DEFAULT_MAX_CANDIDATES = 30;
2583
- const {
2584
- duration_sec,
2585
- min_candidates = DEFAULT_MIN_CANDIDATES,
2586
- max_candidates = DEFAULT_MAX_CANDIDATES,
2587
- trim_start_sec = 1,
2588
- trim_end_sec = 1,
2589
- fps = DEFAULT_FPS,
2590
- base_cadence_hz,
2591
- anchor_percents = [0.2, 0.5, 0.8],
2592
- anchor_window_sec = 1.5
2593
- } = options;
2594
- const usableSec = Math.max(0, duration_sec - (trim_start_sec + trim_end_sec));
2595
- if (usableSec <= 0)
2830
+ function chunkByTokens(text, maxTokens, overlapTokens = 0) {
2831
+ if (!text.trim()) {
2596
2832
  return [];
2597
- const cadenceHz = base_cadence_hz ?? (duration_sec < 15 ? 3 : duration_sec < 60 ? 2 : duration_sec < 180 ? 1.5 : 1);
2598
- let target = Math.round(usableSec * cadenceHz);
2599
- target = Math.max(min_candidates, Math.min(max_candidates, target));
2600
- const stepSec = usableSec / target;
2601
- const t0 = trim_start_sec;
2602
- const base = [];
2603
- for (let i = 0; i < target; i++) {
2604
- const tsSec = t0 + (i + 0.5) * stepSec;
2605
- base.push(tsSec * 1e3);
2606
2833
  }
2607
- const slack = Math.max(0, max_candidates - base.length);
2608
- const extra = [];
2609
- if (slack > 0 && anchor_percents.length > 0) {
2610
- const perAnchor = Math.max(1, Math.min(5, Math.floor(slack / anchor_percents.length)));
2611
- for (const p of anchor_percents) {
2612
- const centerSec = Math.min(
2613
- t0 + usableSec - 1e-3,
2614
- // nudge just inside the end bound
2615
- Math.max(t0 + 1e-3, duration_sec * p)
2616
- // nudge just inside the start bound
2617
- );
2618
- const startSec = Math.max(t0, centerSec - anchor_window_sec / 2);
2619
- const endSec = Math.min(t0 + usableSec, centerSec + anchor_window_sec / 2);
2620
- if (endSec <= startSec)
2621
- continue;
2622
- const wStep = (endSec - startSec) / perAnchor;
2623
- for (let i = 0; i < perAnchor; i++) {
2624
- const tsSec = startSec + (i + 0.5) * wStep;
2625
- extra.push(tsSec * 1e3);
2626
- }
2834
+ const chunks = [];
2835
+ const words = text.trim().split(/\s+/);
2836
+ const wordsPerChunk = Math.floor(maxTokens * 0.75);
2837
+ const overlapWords = Math.floor(overlapTokens * 0.75);
2838
+ let chunkIndex = 0;
2839
+ let currentPosition = 0;
2840
+ while (currentPosition < words.length) {
2841
+ const chunkWords = words.slice(
2842
+ currentPosition,
2843
+ currentPosition + wordsPerChunk
2844
+ );
2845
+ const chunkText2 = chunkWords.join(" ");
2846
+ const tokenCount = estimateTokenCount(chunkText2);
2847
+ chunks.push({
2848
+ id: `chunk-${chunkIndex}`,
2849
+ text: chunkText2,
2850
+ tokenCount
2851
+ });
2852
+ currentPosition += wordsPerChunk - overlapWords;
2853
+ chunkIndex++;
2854
+ if (currentPosition <= (chunkIndex - 1) * (wordsPerChunk - overlapWords)) {
2855
+ break;
2627
2856
  }
2628
2857
  }
2629
- const all = base.concat(extra).map((ms) => roundToNearestFrameMs(ms, fps)).filter((ms) => ms >= trim_start_sec * 1e3 && ms <= (duration_sec - trim_end_sec) * 1e3);
2630
- const uniqSorted = Array.from(new Set(all)).sort((a, b) => a - b);
2631
- return uniqSorted.slice(0, max_candidates);
2858
+ return chunks;
2632
2859
  }
2633
-
2634
- // src/primitives/thumbnails.ts
2635
- async function getThumbnailUrls(playbackId, duration, options = {}) {
2636
- "use step";
2637
- const { interval = 10, width = 640, shouldSign = false, maxSamples, credentials } = options;
2638
- let timestamps = [];
2639
- if (duration <= 50) {
2640
- const spacing = duration / 6;
2641
- for (let i = 1; i <= 5; i++) {
2642
- timestamps.push(Math.round(i * spacing));
2643
- }
2644
- } else {
2645
- for (let time = 0; time < duration; time += interval) {
2646
- timestamps.push(time);
2860
+ function createChunkFromCues(cues, index) {
2861
+ const text = cues.map((c) => c.text).join(" ");
2862
+ return {
2863
+ id: `chunk-${index}`,
2864
+ text,
2865
+ tokenCount: estimateTokenCount(text),
2866
+ startTime: cues[0].startTime,
2867
+ endTime: cues[cues.length - 1].endTime
2868
+ };
2869
+ }
2870
+ function chunkVTTCues(cues, maxTokens, overlapCues = 2) {
2871
+ if (cues.length === 0)
2872
+ return [];
2873
+ const chunks = [];
2874
+ let currentCues = [];
2875
+ let currentTokens = 0;
2876
+ let chunkIndex = 0;
2877
+ for (let i = 0; i < cues.length; i++) {
2878
+ const cue = cues[i];
2879
+ const cueTokens = estimateTokenCount(cue.text);
2880
+ if (currentTokens + cueTokens > maxTokens && currentCues.length > 0) {
2881
+ chunks.push(createChunkFromCues(currentCues, chunkIndex));
2882
+ chunkIndex++;
2883
+ const overlapStart = Math.max(0, currentCues.length - overlapCues);
2884
+ currentCues = currentCues.slice(overlapStart);
2885
+ currentTokens = currentCues.reduce(
2886
+ (sum, c) => sum + estimateTokenCount(c.text),
2887
+ 0
2888
+ );
2647
2889
  }
2890
+ currentCues.push(cue);
2891
+ currentTokens += cueTokens;
2648
2892
  }
2649
- if (maxSamples !== void 0 && timestamps.length > maxSamples) {
2650
- const newTimestamps = [];
2651
- newTimestamps.push(0);
2652
- if (maxSamples >= 2) {
2653
- const spacing = duration / (maxSamples - 1);
2654
- for (let i = 1; i < maxSamples - 1; i++) {
2655
- newTimestamps.push(spacing * i);
2656
- }
2657
- newTimestamps.push(duration);
2893
+ if (currentCues.length > 0) {
2894
+ chunks.push(createChunkFromCues(currentCues, chunkIndex));
2895
+ }
2896
+ return chunks;
2897
+ }
2898
+ function scoreCueBoundary(cues, index, boundaryPauseSeconds) {
2899
+ const cue = cues[index];
2900
+ const nextCue = cues[index + 1];
2901
+ if (!nextCue) {
2902
+ return Number.POSITIVE_INFINITY;
2903
+ }
2904
+ const trimmedText = cue.text.trim();
2905
+ let score = 0;
2906
+ if (SENTENCE_BOUNDARY_REGEX.test(trimmedText)) {
2907
+ score += 4;
2908
+ } else if (CLAUSE_BOUNDARY_REGEX.test(trimmedText)) {
2909
+ score += 2;
2910
+ }
2911
+ if (nextCue.startTime - cue.endTime >= boundaryPauseSeconds) {
2912
+ score += 2;
2913
+ }
2914
+ if (NEXT_SENTENCE_START_REGEX.test(nextCue.text.trim())) {
2915
+ score += 1;
2916
+ }
2917
+ return score;
2918
+ }
2919
+ function chunkVTTCuesByBudget(cues, options) {
2920
+ if (cues.length === 0) {
2921
+ return [];
2922
+ }
2923
+ const maxCuesPerChunk = Math.max(1, options.maxCuesPerChunk);
2924
+ let maxTextTokensPerChunk = Number.POSITIVE_INFINITY;
2925
+ if (options.maxTextTokensPerChunk) {
2926
+ maxTextTokensPerChunk = Math.max(1, options.maxTextTokensPerChunk);
2927
+ }
2928
+ const chunks = [];
2929
+ let chunkIndex = 0;
2930
+ let cueStartIndex = 0;
2931
+ let currentTokenCount = 0;
2932
+ for (let cueIndex = 0; cueIndex < cues.length; cueIndex++) {
2933
+ const cue = cues[cueIndex];
2934
+ const cueTokenCount = estimateTokenCount(cue.text);
2935
+ const currentCueCount = cueIndex - cueStartIndex;
2936
+ const wouldExceedCueCount = currentCueCount >= maxCuesPerChunk;
2937
+ const wouldExceedTokenCount = currentCueCount > 0 && currentTokenCount + cueTokenCount > maxTextTokensPerChunk;
2938
+ if (wouldExceedCueCount || wouldExceedTokenCount) {
2939
+ chunks.push({
2940
+ id: `chunk-${chunkIndex}`,
2941
+ cueStartIndex,
2942
+ cueEndIndex: cueIndex - 1,
2943
+ cueCount: cueIndex - cueStartIndex,
2944
+ startTime: cues[cueStartIndex].startTime,
2945
+ endTime: cues[cueIndex - 1].endTime
2946
+ });
2947
+ cueStartIndex = cueIndex;
2948
+ currentTokenCount = 0;
2949
+ chunkIndex++;
2658
2950
  }
2659
- timestamps = newTimestamps;
2951
+ currentTokenCount += cueTokenCount;
2660
2952
  }
2661
- const baseUrl = getMuxThumbnailBaseUrl(playbackId);
2662
- const urlPromises = timestamps.map(async (time) => {
2663
- const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
2664
- return { url, time };
2953
+ chunks.push({
2954
+ id: `chunk-${chunkIndex}`,
2955
+ cueStartIndex,
2956
+ cueEndIndex: cues.length - 1,
2957
+ cueCount: cues.length - cueStartIndex,
2958
+ startTime: cues[cueStartIndex].startTime,
2959
+ endTime: cues[cues.length - 1].endTime
2665
2960
  });
2666
- return Promise.all(urlPromises);
2961
+ return chunks;
2667
2962
  }
2668
-
2669
- // src/workflows/moderation.ts
2670
- var DEFAULT_THRESHOLDS = {
2671
- sexual: 0.8,
2672
- violence: 0.8
2673
- };
2674
- var DEFAULT_PROVIDER2 = "openai";
2675
- var HIVE_ENDPOINT = "https://api.thehive.ai/api/v2/task/sync";
2676
- var HIVE_SEXUAL_CATEGORIES = [
2677
- "general_nsfw",
2678
- "yes_sexual_activity",
2679
- "yes_sex_toy",
2680
- "yes_female_nudity",
2681
- "yes_male_nudity"
2682
- ];
2683
- var HIVE_VIOLENCE_CATEGORIES = [
2684
- "gun_in_hand",
2685
- "gun_not_in_hand",
2686
- "knife_in_hand",
2687
- "very_bloody",
2688
- "other_blood",
2689
- "hanging",
2690
- "noose",
2691
- "human_corpse",
2692
- "yes_emaciated_body",
2693
- "yes_self_harm",
2694
- "garm_death_injury_or_military_conflict"
2695
- ];
2696
- async function processConcurrently(items, processor, maxConcurrent = 5) {
2697
- "use step";
2698
- const results = [];
2699
- for (let i = 0; i < items.length; i += maxConcurrent) {
2700
- const batch = items.slice(i, i + maxConcurrent);
2701
- const batchPromises = batch.map(processor);
2702
- const batchResults = await Promise.all(batchPromises);
2703
- results.push(...batchResults);
2963
+ function chunkVTTCuesByDuration(cues, options) {
2964
+ if (cues.length === 0) {
2965
+ return [];
2704
2966
  }
2705
- return results;
2706
- }
2707
- async function moderateImageWithOpenAI(entry) {
2708
- "use step";
2709
- const apiKey = await getApiKeyFromEnv("openai", entry.credentials);
2710
- try {
2711
- const res = await fetch("https://api.openai.com/v1/moderations", {
2712
- method: "POST",
2713
- headers: {
2714
- "Content-Type": "application/json",
2715
- "Authorization": `Bearer ${apiKey}`
2716
- },
2717
- body: JSON.stringify({
2718
- model: entry.model,
2719
- input: [
2720
- {
2721
- type: "image_url",
2722
- image_url: {
2723
- url: entry.image
2724
- }
2725
- }
2726
- ]
2727
- })
2728
- });
2729
- const json = await res.json();
2730
- if (!res.ok) {
2731
- throw new Error(
2732
- `OpenAI moderation error: ${res.status} ${res.statusText} - ${JSON.stringify(json)}`
2733
- );
2967
+ const targetChunkDurationSeconds = Math.max(1, options.targetChunkDurationSeconds);
2968
+ const maxChunkDurationSeconds = Math.max(targetChunkDurationSeconds, options.maxChunkDurationSeconds);
2969
+ const minChunkDurationSeconds = Math.min(
2970
+ targetChunkDurationSeconds,
2971
+ Math.max(
2972
+ 1,
2973
+ options.minChunkDurationSeconds ?? Math.floor(targetChunkDurationSeconds * DEFAULT_MIN_CHUNK_DURATION_RATIO)
2974
+ )
2975
+ );
2976
+ const boundaryLookaheadCues = Math.max(1, options.boundaryLookaheadCues ?? DEFAULT_BOUNDARY_LOOKAHEAD_CUES);
2977
+ const boundaryPauseSeconds = options.boundaryPauseSeconds ?? DEFAULT_BOUNDARY_PAUSE_SECONDS;
2978
+ const preferredBoundaryStartSeconds = Math.max(
2979
+ minChunkDurationSeconds,
2980
+ targetChunkDurationSeconds - Math.min(PREFERRED_BOUNDARY_WINDOW_SECONDS, targetChunkDurationSeconds / 6)
2981
+ );
2982
+ const chunks = [];
2983
+ let chunkIndex = 0;
2984
+ let cueStartIndex = 0;
2985
+ while (cueStartIndex < cues.length) {
2986
+ const chunkStartTime = cues[cueStartIndex].startTime;
2987
+ let cueEndIndex = cueStartIndex;
2988
+ let bestBoundaryIndex = -1;
2989
+ let bestBoundaryScore = -1;
2990
+ let bestPreferredBoundaryIndex = -1;
2991
+ let bestPreferredBoundaryScore = -1;
2992
+ while (cueEndIndex < cues.length) {
2993
+ const cue = cues[cueEndIndex];
2994
+ const currentDuration = cue.endTime - chunkStartTime;
2995
+ if (currentDuration >= minChunkDurationSeconds) {
2996
+ const boundaryScore = scoreCueBoundary(cues, cueEndIndex, boundaryPauseSeconds);
2997
+ if (boundaryScore >= bestBoundaryScore) {
2998
+ bestBoundaryIndex = cueEndIndex;
2999
+ bestBoundaryScore = boundaryScore;
3000
+ }
3001
+ if (currentDuration >= preferredBoundaryStartSeconds && boundaryScore >= bestPreferredBoundaryScore) {
3002
+ bestPreferredBoundaryIndex = cueEndIndex;
3003
+ bestPreferredBoundaryScore = boundaryScore;
3004
+ }
3005
+ }
3006
+ const nextCue = cues[cueEndIndex + 1];
3007
+ if (!nextCue) {
3008
+ break;
3009
+ }
3010
+ const nextDuration = nextCue.endTime - chunkStartTime;
3011
+ const lookaheadExceeded = cueEndIndex - cueStartIndex >= boundaryLookaheadCues;
3012
+ const preferredBoundaryIndex = bestPreferredBoundaryIndex >= cueStartIndex ? bestPreferredBoundaryIndex : bestBoundaryIndex;
3013
+ const preferredBoundaryScore = bestPreferredBoundaryIndex >= cueStartIndex ? bestPreferredBoundaryScore : bestBoundaryScore;
3014
+ if (currentDuration >= targetChunkDurationSeconds) {
3015
+ if (preferredBoundaryIndex >= cueStartIndex && preferredBoundaryScore >= STRONG_BOUNDARY_SCORE) {
3016
+ cueEndIndex = preferredBoundaryIndex;
3017
+ break;
3018
+ }
3019
+ if (nextDuration > maxChunkDurationSeconds || lookaheadExceeded) {
3020
+ cueEndIndex = preferredBoundaryIndex >= cueStartIndex ? preferredBoundaryIndex : cueEndIndex;
3021
+ break;
3022
+ }
3023
+ }
3024
+ if (nextDuration > maxChunkDurationSeconds) {
3025
+ cueEndIndex = preferredBoundaryIndex >= cueStartIndex ? preferredBoundaryIndex : cueEndIndex;
3026
+ break;
3027
+ }
3028
+ cueEndIndex++;
2734
3029
  }
2735
- const categoryScores = json.results?.[0]?.category_scores || {};
2736
- return {
2737
- url: entry.url,
2738
- time: entry.time,
2739
- sexual: categoryScores.sexual || 0,
2740
- violence: categoryScores.violence || 0,
2741
- error: false
2742
- };
2743
- } catch (error) {
2744
- console.error("OpenAI moderation failed:", error);
2745
- return {
2746
- url: entry.url,
2747
- time: entry.time,
2748
- sexual: 0,
2749
- violence: 0,
2750
- error: true,
2751
- errorMessage: error instanceof Error ? error.message : String(error)
2752
- };
3030
+ chunks.push({
3031
+ id: `chunk-${chunkIndex}`,
3032
+ cueStartIndex,
3033
+ cueEndIndex,
3034
+ cueCount: cueEndIndex - cueStartIndex + 1,
3035
+ startTime: cues[cueStartIndex].startTime,
3036
+ endTime: cues[cueEndIndex].endTime
3037
+ });
3038
+ cueStartIndex = cueEndIndex + 1;
3039
+ chunkIndex++;
2753
3040
  }
3041
+ return chunks;
2754
3042
  }
2755
- async function requestOpenAIModeration(images, model, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
2756
- "use step";
2757
- const imageUrls = images.map((img) => img.url);
2758
- const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
2759
- const targetUrls = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map(
2760
- (img) => ({ url: img.url, time: timeByUrl.get(img.url), image: img.base64Data, model, credentials })
2761
- ) : images.map((img) => ({ url: img.url, time: img.time, image: img.url, model, credentials }));
2762
- return processConcurrently(targetUrls, moderateImageWithOpenAI, maxConcurrent);
2763
- }
2764
- async function requestOpenAITextModeration(text, model, url, credentials) {
2765
- "use step";
2766
- const apiKey = await getApiKeyFromEnv("openai", credentials);
2767
- try {
2768
- const res = await fetch("https://api.openai.com/v1/moderations", {
2769
- method: "POST",
2770
- headers: {
2771
- "Content-Type": "application/json",
2772
- "Authorization": `Bearer ${apiKey}`
2773
- },
2774
- body: JSON.stringify({
2775
- model,
2776
- input: text
2777
- })
2778
- });
2779
- const json = await res.json();
2780
- if (!res.ok) {
2781
- throw new Error(
2782
- `OpenAI moderation error: ${res.status} ${res.statusText} - ${JSON.stringify(json)}`
2783
- );
3043
+ function chunkText(text, strategy) {
3044
+ switch (strategy.type) {
3045
+ case "token": {
3046
+ return chunkByTokens(text, strategy.maxTokens, strategy.overlap ?? 0);
3047
+ }
3048
+ default: {
3049
+ const exhaustiveCheck = strategy;
3050
+ throw new Error(`Unsupported chunking strategy: ${exhaustiveCheck}`);
2784
3051
  }
2785
- const categoryScores = json.results?.[0]?.category_scores || {};
2786
- return {
2787
- url,
2788
- sexual: categoryScores.sexual || 0,
2789
- violence: categoryScores.violence || 0,
2790
- error: false
2791
- };
2792
- } catch (error) {
2793
- console.error("OpenAI text moderation failed:", error);
2794
- return {
2795
- url,
2796
- sexual: 0,
2797
- violence: 0,
2798
- error: true,
2799
- errorMessage: error instanceof Error ? error.message : String(error)
2800
- };
2801
3052
  }
2802
3053
  }
2803
- function chunkTextByUtf16CodeUnits(text, maxUnits) {
2804
- if (!text.trim()) {
3054
+
3055
+ // src/workflows/embeddings.ts
3056
+ function averageEmbeddings(embeddings) {
3057
+ if (embeddings.length === 0) {
2805
3058
  return [];
2806
3059
  }
2807
- if (text.length <= maxUnits) {
2808
- return [text];
2809
- }
2810
- const chunks = [];
2811
- for (let i = 0; i < text.length; i += maxUnits) {
2812
- const chunk = text.slice(i, i + maxUnits).trim();
2813
- if (chunk) {
2814
- chunks.push(chunk);
3060
+ const dimensions = embeddings[0].length;
3061
+ const averaged = Array.from({ length: dimensions }, () => 0);
3062
+ for (const embedding of embeddings) {
3063
+ for (let i = 0; i < dimensions; i++) {
3064
+ averaged[i] += embedding[i];
2815
3065
  }
2816
3066
  }
2817
- return chunks;
2818
- }
2819
- async function requestOpenAITranscriptModeration(transcriptText, model, maxConcurrent = 5, credentials) {
2820
- "use step";
2821
- const chunks = chunkTextByUtf16CodeUnits(transcriptText, 1e4);
2822
- if (!chunks.length) {
2823
- return [
2824
- { url: "transcript:0", sexual: 0, violence: 0, error: true, errorMessage: "No transcript chunks to moderate" }
2825
- ];
3067
+ for (let i = 0; i < dimensions; i++) {
3068
+ averaged[i] /= embeddings.length;
2826
3069
  }
2827
- const targets = chunks.map((chunk, idx) => ({
2828
- chunk,
2829
- url: `transcript:${idx}`
2830
- }));
2831
- return processConcurrently(
2832
- targets,
2833
- async (entry) => requestOpenAITextModeration(entry.chunk, model, entry.url, credentials),
2834
- maxConcurrent
2835
- );
3070
+ return averaged;
2836
3071
  }
2837
- function getHiveCategoryScores(classes, categoryNames) {
2838
- const scoreMap = Object.fromEntries(
2839
- classes.map((c) => [c.class, c.score])
3072
+ async function generateSingleChunkEmbedding({
3073
+ chunk,
3074
+ provider,
3075
+ modelId,
3076
+ credentials
3077
+ }) {
3078
+ "use step";
3079
+ const model = await createEmbeddingModelFromConfig(provider, modelId, credentials);
3080
+ const response = await withRetry(
3081
+ () => embed({
3082
+ model,
3083
+ value: chunk.text
3084
+ })
2840
3085
  );
2841
- const missingCategories = categoryNames.filter((category) => !(category in scoreMap));
2842
- if (missingCategories.length > 0) {
2843
- console.warn(
2844
- `Hive response missing expected categories: ${missingCategories.join(", ")}`
2845
- );
2846
- }
2847
- const scores = categoryNames.map((category) => scoreMap[category] || 0);
2848
- return Math.max(...scores, 0);
3086
+ return {
3087
+ chunkId: chunk.id,
3088
+ embedding: response.embedding,
3089
+ metadata: {
3090
+ startTime: chunk.startTime,
3091
+ endTime: chunk.endTime,
3092
+ tokenCount: chunk.tokenCount
3093
+ }
3094
+ };
2849
3095
  }
2850
- async function moderateImageWithHive(entry) {
2851
- "use step";
2852
- const apiKey = await getApiKeyFromEnv("hive", entry.credentials);
2853
- try {
2854
- const formData = new FormData();
2855
- if (entry.source.kind === "url") {
2856
- formData.append("url", entry.source.value);
2857
- } else {
2858
- const extension = entry.source.contentType.split("/")[1] || "jpg";
2859
- const blob = new Blob([entry.source.buffer], {
2860
- type: entry.source.contentType
2861
- });
2862
- formData.append("media", blob, `thumbnail.${extension}`);
2863
- }
2864
- const controller = new AbortController();
2865
- const timeout = setTimeout(() => controller.abort(), 15e3);
2866
- let res;
2867
- try {
2868
- res = await fetch(HIVE_ENDPOINT, {
2869
- method: "POST",
2870
- headers: {
2871
- Accept: "application/json",
2872
- Authorization: `Token ${apiKey}`
2873
- },
2874
- body: formData,
2875
- signal: controller.signal
2876
- });
2877
- } catch (err) {
2878
- if (err?.name === "AbortError") {
2879
- throw new Error("Hive request timed out after 15s");
2880
- }
2881
- throw err;
2882
- } finally {
2883
- clearTimeout(timeout);
2884
- }
2885
- const json = await res.json().catch(() => void 0);
2886
- if (!res.ok) {
2887
- throw new Error(
2888
- `Hive moderation error: ${res.status} ${res.statusText} - ${JSON.stringify(json)}`
2889
- );
2890
- }
2891
- if (json?.return_code != null && json.return_code !== 0) {
2892
- throw new Error(
2893
- `Hive API error (return_code ${json.return_code}): ${json.message || "Unknown error"}`
2894
- );
2895
- }
2896
- const classes = json?.status?.[0]?.response?.output?.[0]?.classes;
2897
- if (!Array.isArray(classes)) {
2898
- throw new TypeError(
2899
- `Unexpected Hive response structure: ${JSON.stringify(json)}`
2900
- );
2901
- }
2902
- const sexual = getHiveCategoryScores(classes, HIVE_SEXUAL_CATEGORIES);
2903
- const violence = getHiveCategoryScores(classes, HIVE_VIOLENCE_CATEGORIES);
2904
- return {
2905
- url: entry.url,
2906
- time: entry.time,
2907
- sexual,
2908
- violence,
2909
- error: false
2910
- };
2911
- } catch (error) {
2912
- return {
2913
- url: entry.url,
2914
- time: entry.time,
2915
- sexual: 0,
2916
- violence: 0,
2917
- error: true,
2918
- errorMessage: error instanceof Error ? error.message : String(error)
2919
- };
2920
- }
2921
- }
2922
- async function requestHiveModeration(images, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
2923
- "use step";
2924
- const imageUrls = images.map((img) => img.url);
2925
- const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
2926
- const targets = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map((img) => ({
2927
- url: img.url,
2928
- time: timeByUrl.get(img.url),
2929
- source: {
2930
- kind: "file",
2931
- buffer: img.buffer,
2932
- contentType: img.contentType
2933
- },
2934
- credentials
2935
- })) : images.map((img) => ({
2936
- url: img.url,
2937
- time: img.time,
2938
- source: { kind: "url", value: img.url },
2939
- credentials
2940
- }));
2941
- return await processConcurrently(targets, moderateImageWithHive, maxConcurrent);
2942
- }
2943
- async function getThumbnailUrlsFromTimestamps(playbackId, timestampsMs, options) {
2944
- "use step";
2945
- const { width, shouldSign, credentials } = options;
2946
- const baseUrl = getMuxThumbnailBaseUrl(playbackId);
2947
- const urlPromises = timestampsMs.map(async (tsMs) => {
2948
- const time = Number((tsMs / 1e3).toFixed(2));
2949
- const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
2950
- return { url, time };
2951
- });
2952
- return Promise.all(urlPromises);
2953
- }
2954
- async function getModerationScores(assetId, options = {}) {
2955
- "use workflow";
3096
+ async function generateEmbeddingsInternal(assetId, options = {}) {
2956
3097
  const {
2957
- provider = DEFAULT_PROVIDER2,
2958
- model = provider === "openai" ? "omni-moderation-latest" : void 0,
3098
+ provider = "openai",
3099
+ model,
2959
3100
  languageCode,
2960
- thresholds = DEFAULT_THRESHOLDS,
2961
- thumbnailInterval = 10,
2962
- thumbnailWidth = 640,
2963
- maxSamples,
2964
- maxConcurrent = 5,
2965
- imageSubmissionMode = "url",
2966
- imageDownloadOptions,
2967
- credentials: providedCredentials
3101
+ chunkingStrategy = { type: "token", maxTokens: 500, overlap: 100 },
3102
+ batchSize = 5,
3103
+ credentials
2968
3104
  } = options;
2969
- const credentials = providedCredentials;
2970
- const { asset, playbackId, policy } = await getPlaybackIdForAsset(assetId, credentials);
2971
- const videoTrackDurationSeconds = getVideoTrackDurationSecondsFromAsset(asset);
2972
- const videoTrackFps = getVideoTrackMaxFrameRateFromAsset(asset);
2973
- const assetDurationSeconds = getAssetDurationSecondsFromAsset(asset);
2974
- const candidateDurations = [videoTrackDurationSeconds, assetDurationSeconds].filter(
2975
- (d) => d != null
2976
- );
2977
- const duration = candidateDurations.length > 0 ? Math.min(...candidateDurations) : 0;
2978
- const isAudioOnly = isAudioOnlyAsset(asset);
3105
+ const embeddingModel = resolveEmbeddingModelConfig({ ...options, provider, model });
3106
+ const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, credentials);
3107
+ const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
3108
+ const isAudioOnly = isAudioOnlyAsset(assetData);
2979
3109
  const signingContext = await resolveMuxSigningContext(credentials);
2980
3110
  if (policy === "signed" && !signingContext) {
2981
3111
  throw new Error(
2982
3112
  "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
2983
3113
  );
2984
3114
  }
2985
- let thumbnailScores;
2986
- let mode = "thumbnails";
2987
- let thumbnailCount;
2988
- if (isAudioOnly) {
2989
- mode = "transcript";
2990
- const readyTextTracks = getReadyTextTracks(asset);
2991
- let transcriptResult = await fetchTranscriptForAsset(asset, playbackId, {
2992
- languageCode,
2993
- cleanTranscript: true,
3115
+ const readyTextTracks = getReadyTextTracks(assetData);
3116
+ const useVttChunking = chunkingStrategy.type === "vtt";
3117
+ let transcriptResult = await fetchTranscriptForAsset(assetData, playbackId, {
3118
+ languageCode,
3119
+ cleanTranscript: !useVttChunking,
3120
+ shouldSign: policy === "signed",
3121
+ credentials
3122
+ });
3123
+ if (isAudioOnly && !transcriptResult.track && readyTextTracks.length === 1) {
3124
+ transcriptResult = await fetchTranscriptForAsset(assetData, playbackId, {
3125
+ cleanTranscript: !useVttChunking,
2994
3126
  shouldSign: policy === "signed",
2995
- credentials,
2996
- required: true
3127
+ credentials
2997
3128
  });
2998
- if (!transcriptResult.track && readyTextTracks.length === 1) {
2999
- transcriptResult = await fetchTranscriptForAsset(asset, playbackId, {
3000
- cleanTranscript: true,
3001
- shouldSign: policy === "signed",
3002
- credentials,
3003
- required: true
3004
- });
3005
- }
3006
- if (provider === "openai") {
3007
- thumbnailScores = await requestOpenAITranscriptModeration(
3008
- transcriptResult.transcriptText,
3009
- model || "omni-moderation-latest",
3010
- maxConcurrent,
3011
- credentials
3129
+ }
3130
+ if (!transcriptResult.track || !transcriptResult.transcriptText) {
3131
+ const availableLanguages = readyTextTracks.map((t) => t.language_code).filter(Boolean).join(", ");
3132
+ if (isAudioOnly) {
3133
+ throw new Error(
3134
+ `No transcript track found${languageCode ? ` for language '${languageCode}'` : ""}. Audio-only assets require a transcript. Available languages: ${availableLanguages || "none"}`
3012
3135
  );
3013
- } else if (provider === "hive") {
3014
- throw new Error("Hive does not support transcript moderation in this workflow. Use provider: 'openai' for audio-only assets.");
3015
- } else {
3016
- throw new Error(`Unsupported moderation provider: ${provider}`);
3017
3136
  }
3018
- } else {
3019
- const thumbnailUrls = maxSamples === void 0 ? (
3020
- // Generate thumbnail URLs (signed if needed) using existing interval-based logic.
3021
- await getThumbnailUrls(playbackId, duration, {
3022
- interval: thumbnailInterval,
3023
- width: thumbnailWidth,
3024
- shouldSign: policy === "signed",
3025
- credentials
3026
- })
3027
- ) : (
3028
- // In maxSamples mode, sample valid timestamps over the trimmed usable span.
3029
- // Use proportional trims (≈ duration/6, capped at 5s) to stay well inside the
3030
- // renderable range — Mux can't always serve thumbnails at the very edges.
3031
- await getThumbnailUrlsFromTimestamps(
3032
- playbackId,
3033
- planSamplingTimestamps({
3034
- duration_sec: duration,
3035
- max_candidates: maxSamples,
3036
- trim_start_sec: duration > 2 ? Math.min(5, Math.max(1, duration / 6)) : 0,
3037
- trim_end_sec: duration > 2 ? Math.min(5, Math.max(1, duration / 6)) : 0,
3038
- fps: videoTrackFps,
3039
- base_cadence_hz: thumbnailInterval > 0 ? 1 / thumbnailInterval : void 0
3040
- }),
3041
- {
3042
- width: thumbnailWidth,
3043
- shouldSign: policy === "signed",
3044
- credentials
3045
- }
3046
- )
3137
+ throw new Error(
3138
+ `No caption track found${languageCode ? ` for language '${languageCode}'` : ""}. Available languages: ${availableLanguages || "none"}`
3047
3139
  );
3048
- thumbnailCount = thumbnailUrls.length;
3049
- if (provider === "openai") {
3050
- thumbnailScores = await requestOpenAIModeration(
3051
- thumbnailUrls,
3052
- model || "omni-moderation-latest",
3053
- maxConcurrent,
3054
- imageSubmissionMode,
3055
- imageDownloadOptions,
3056
- credentials
3057
- );
3058
- } else if (provider === "hive") {
3059
- thumbnailScores = await requestHiveModeration(
3060
- thumbnailUrls,
3061
- maxConcurrent,
3062
- imageSubmissionMode,
3063
- imageDownloadOptions,
3064
- credentials
3140
+ }
3141
+ const transcriptText = transcriptResult.transcriptText;
3142
+ if (!transcriptText.trim()) {
3143
+ throw new Error("Transcript is empty");
3144
+ }
3145
+ const chunks = useVttChunking ? chunkVTTCues(
3146
+ parseVTTCues(transcriptText),
3147
+ chunkingStrategy.maxTokens,
3148
+ chunkingStrategy.overlapCues
3149
+ ) : chunkText(transcriptText, chunkingStrategy);
3150
+ if (chunks.length === 0) {
3151
+ throw new Error("No chunks generated from transcript");
3152
+ }
3153
+ const chunkEmbeddings = [];
3154
+ try {
3155
+ for (let i = 0; i < chunks.length; i += batchSize) {
3156
+ const batch = chunks.slice(i, i + batchSize);
3157
+ const batchResults = await Promise.all(
3158
+ batch.map(
3159
+ (chunk) => generateSingleChunkEmbedding({
3160
+ chunk,
3161
+ provider: embeddingModel.provider,
3162
+ modelId: embeddingModel.modelId,
3163
+ credentials
3164
+ })
3165
+ )
3065
3166
  );
3066
- } else {
3067
- throw new Error(`Unsupported moderation provider: ${provider}`);
3167
+ chunkEmbeddings.push(...batchResults);
3068
3168
  }
3069
- }
3070
- const failed = thumbnailScores.filter((s) => s.error);
3071
- if (failed.length > 0) {
3072
- const details = failed.map((s) => `${s.url}: ${s.errorMessage || "Unknown error"}`).join("; ");
3169
+ } catch (error) {
3073
3170
  throw new Error(
3074
- `Moderation failed for ${failed.length}/${thumbnailScores.length} thumbnail(s): ${details}`
3171
+ `Failed to generate embeddings with ${provider}: ${error instanceof Error ? error.message : "Unknown error"}`
3075
3172
  );
3076
3173
  }
3077
- const maxSexual = Math.max(...thumbnailScores.map((s) => s.sexual));
3078
- const maxViolence = Math.max(...thumbnailScores.map((s) => s.violence));
3079
- const finalThresholds = { ...DEFAULT_THRESHOLDS, ...thresholds };
3174
+ if (chunkEmbeddings.length === 0) {
3175
+ throw new Error("No embeddings generated");
3176
+ }
3177
+ const averagedEmbedding = averageEmbeddings(chunkEmbeddings.map((ce) => ce.embedding));
3178
+ const totalTokens = chunks.reduce((sum, chunk) => sum + chunk.tokenCount, 0);
3080
3179
  return {
3081
3180
  assetId,
3082
- mode,
3083
- isAudioOnly,
3084
- thumbnailScores,
3181
+ chunks: chunkEmbeddings,
3182
+ averagedEmbedding,
3183
+ provider,
3184
+ model: embeddingModel.modelId,
3185
+ metadata: {
3186
+ totalChunks: chunks.length,
3187
+ totalTokens,
3188
+ chunkingStrategy: JSON.stringify(chunkingStrategy),
3189
+ embeddingDimensions: chunkEmbeddings[0].embedding.length,
3190
+ generatedAt: (/* @__PURE__ */ new Date()).toISOString()
3191
+ },
3085
3192
  usage: {
3086
3193
  metadata: {
3087
- assetDurationSeconds: duration,
3088
- ...thumbnailCount === void 0 ? {} : { thumbnailCount }
3194
+ assetDurationSeconds
3089
3195
  }
3090
- },
3091
- maxScores: {
3092
- sexual: maxSexual,
3093
- violence: maxViolence
3094
- },
3095
- exceedsThreshold: maxSexual > finalThresholds.sexual || maxViolence > finalThresholds.violence,
3096
- thresholds: finalThresholds
3196
+ }
3097
3197
  };
3098
3198
  }
3199
+ async function generateEmbeddings(assetId, options = {}) {
3200
+ "use workflow";
3201
+ return generateEmbeddingsInternal(assetId, options);
3202
+ }
3203
+ async function generateVideoEmbeddings(assetId, options = {}) {
3204
+ "use workflow";
3205
+ console.warn("generateVideoEmbeddings is deprecated. Use generateEmbeddings instead.");
3206
+ return generateEmbeddingsInternal(assetId, options);
3207
+ }
3099
3208
 
3100
- // src/workflows/summarization.ts
3101
- import { generateText as generateText4, Output as Output4 } from "ai";
3102
- import dedent4 from "dedent";
3103
- import { z as z5 } from "zod";
3104
- var SUMMARY_KEYWORD_LIMIT = 10;
3105
- var summarySchema = z5.object({
3106
- keywords: z5.array(z5.string()),
3107
- title: z5.string(),
3108
- description: z5.string()
3109
- }).strict();
3110
- var SUMMARY_OUTPUT = Output4.object({
3111
- name: "summary_metadata",
3112
- description: "Structured summary with title, description, and keywords.",
3113
- schema: summarySchema
3114
- });
3115
- var VALID_TONES = ["neutral", "playful", "professional"];
3116
- var TONE_INSTRUCTIONS = {
3117
- neutral: "Provide a clear, straightforward analysis.",
3118
- playful: "Channel your inner diva! Answer with maximum sass, wit, and playful attitude. Don't hold back - be cheeky, clever, and delightfully snarky. Make it pop!",
3119
- professional: "Provide a professional, executive-level analysis suitable for business reporting."
3120
- };
3121
- function createSummarizationBuilder({ titleLength, descriptionLength, tagCount } = {}) {
3122
- const titleBrevity = titleLength != null ? `Aim for approximately ${titleLength} characters.` : "Aim for brevity - typically under 10 words.";
3123
- const descConstraint = descriptionLength != null ? `approximately ${descriptionLength} characters` : "2-4 sentences";
3124
- const keywordLimit = tagCount ?? SUMMARY_KEYWORD_LIMIT;
3125
- return createPromptBuilder({
3126
- template: {
3127
- task: {
3128
- tag: "task",
3129
- content: "Analyze the storyboard frames and generate metadata that captures the essence of the video content."
3130
- },
3131
- title: {
3132
- tag: "title_requirements",
3133
- content: dedent4`
3134
- A short, compelling headline that immediately communicates the subject or action.
3135
- ${titleBrevity} Think of how a news headline or video card title would read.
3136
- Start with the primary subject, action, or topic - never begin with "A video of" or similar phrasing.
3137
- Use active, specific language.`
3138
- },
3139
- description: {
3140
- tag: "description_requirements",
3141
- content: dedent4`
3142
- A concise summary (${descConstraint}) that describes what happens across the video.
3143
- Cover the main subjects, actions, setting, and any notable progression visible across frames.
3144
- Write in present tense. Be specific about observable details rather than making assumptions.
3145
- If the transcript provides dialogue or narration, incorporate key points but prioritize visual content.`
3146
- },
3147
- keywords: {
3148
- tag: "keywords_requirements",
3149
- content: dedent4`
3150
- Specific, searchable terms (up to ${keywordLimit}) that capture:
3151
- - Primary subjects (people, animals, objects)
3152
- - Actions and activities being performed
3153
- - Setting and environment
3154
- - Notable objects or tools
3155
- - Style or genre (if applicable)
3156
- Prefer concrete nouns and action verbs over abstract concepts.
3157
- Use lowercase. Avoid redundant or overly generic terms like "video" or "content".`
3158
- },
3159
- qualityGuidelines: {
3160
- tag: "quality_guidelines",
3161
- content: dedent4`
3162
- - Examine all frames to understand the full context and progression
3163
- - Be precise: "golden retriever" is better than "dog" when identifiable
3164
- - Capture the narrative: what begins, develops, and concludes
3165
- - Balance brevity with informativeness`
3209
+ // src/lib/sampling-plan.ts
3210
+ var DEFAULT_FPS = 30;
3211
+ function roundToNearestFrameMs(tsMs, fps = DEFAULT_FPS) {
3212
+ const frameMs = 1e3 / fps;
3213
+ return Math.round(Math.round(tsMs / frameMs) * frameMs * 100) / 100;
3214
+ }
3215
+ function planSamplingTimestamps(options) {
3216
+ const DEFAULT_MIN_CANDIDATES = 10;
3217
+ const DEFAULT_MAX_CANDIDATES = 30;
3218
+ const {
3219
+ duration_sec,
3220
+ min_candidates = DEFAULT_MIN_CANDIDATES,
3221
+ max_candidates = DEFAULT_MAX_CANDIDATES,
3222
+ trim_start_sec = 1,
3223
+ trim_end_sec = 1,
3224
+ fps = DEFAULT_FPS,
3225
+ base_cadence_hz,
3226
+ anchor_percents = [0.2, 0.5, 0.8],
3227
+ anchor_window_sec = 1.5
3228
+ } = options;
3229
+ const usableSec = Math.max(0, duration_sec - (trim_start_sec + trim_end_sec));
3230
+ if (usableSec <= 0)
3231
+ return [];
3232
+ const cadenceHz = base_cadence_hz ?? (duration_sec < 15 ? 3 : duration_sec < 60 ? 2 : duration_sec < 180 ? 1.5 : 1);
3233
+ let target = Math.round(usableSec * cadenceHz);
3234
+ target = Math.max(min_candidates, Math.min(max_candidates, target));
3235
+ const stepSec = usableSec / target;
3236
+ const t0 = trim_start_sec;
3237
+ const base = [];
3238
+ for (let i = 0; i < target; i++) {
3239
+ const tsSec = t0 + (i + 0.5) * stepSec;
3240
+ base.push(tsSec * 1e3);
3241
+ }
3242
+ const slack = Math.max(0, max_candidates - base.length);
3243
+ const extra = [];
3244
+ if (slack > 0 && anchor_percents.length > 0) {
3245
+ const perAnchor = Math.max(1, Math.min(5, Math.floor(slack / anchor_percents.length)));
3246
+ for (const p of anchor_percents) {
3247
+ const centerSec = Math.min(
3248
+ t0 + usableSec - 1e-3,
3249
+ // nudge just inside the end bound
3250
+ Math.max(t0 + 1e-3, duration_sec * p)
3251
+ // nudge just inside the start bound
3252
+ );
3253
+ const startSec = Math.max(t0, centerSec - anchor_window_sec / 2);
3254
+ const endSec = Math.min(t0 + usableSec, centerSec + anchor_window_sec / 2);
3255
+ if (endSec <= startSec)
3256
+ continue;
3257
+ const wStep = (endSec - startSec) / perAnchor;
3258
+ for (let i = 0; i < perAnchor; i++) {
3259
+ const tsSec = startSec + (i + 0.5) * wStep;
3260
+ extra.push(tsSec * 1e3);
3166
3261
  }
3167
- },
3168
- sectionOrder: ["task", "title", "description", "keywords", "qualityGuidelines"]
3169
- });
3262
+ }
3263
+ }
3264
+ const all = base.concat(extra).map((ms) => roundToNearestFrameMs(ms, fps)).filter((ms) => ms >= trim_start_sec * 1e3 && ms <= (duration_sec - trim_end_sec) * 1e3);
3265
+ const uniqSorted = Array.from(new Set(all)).sort((a, b) => a - b);
3266
+ return uniqSorted.slice(0, max_candidates);
3170
3267
  }
3171
- function createAudioOnlyBuilder({ titleLength, descriptionLength, tagCount } = {}) {
3172
- const titleBrevity = titleLength != null ? `Aim for approximately ${titleLength} characters.` : "Aim for brevity - typically under 10 words.";
3173
- const descConstraint = descriptionLength != null ? `approximately ${descriptionLength} characters` : "2-4 sentences";
3174
- const keywordLimit = tagCount ?? SUMMARY_KEYWORD_LIMIT;
3175
- return createPromptBuilder({
3176
- template: {
3177
- task: {
3178
- tag: "task",
3179
- content: "Analyze the transcript and generate metadata that captures the essence of the audio content."
3180
- },
3181
- title: {
3182
- tag: "title_requirements",
3183
- content: dedent4`
3184
- A short, compelling headline that immediately communicates the subject or topic.
3185
- ${titleBrevity} Think of how a podcast title or audio description would read.
3186
- Start with the primary subject, action, or topic - never begin with "An audio of" or similar phrasing.
3187
- Use active, specific language.`
3188
- },
3189
- description: {
3190
- tag: "description_requirements",
3191
- content: dedent4`
3192
- A concise summary (${descConstraint}) that describes the audio content.
3193
- Cover the main topics, speakers, themes, and any notable progression in the discussion or narration.
3194
- Write in present tense. Be specific about what is discussed or presented rather than making assumptions.
3195
- Focus on the spoken content and any key insights, dialogue, or narrative elements.`
3196
- },
3197
- keywords: {
3198
- tag: "keywords_requirements",
3199
- content: dedent4`
3200
- Specific, searchable terms (up to ${keywordLimit}) that capture:
3201
- - Primary topics and themes
3202
- - Speakers or presenters (if named)
3203
- - Key concepts and terminology
3204
- - Content type (interview, lecture, music, etc.)
3205
- - Genre or style (if applicable)
3206
- Prefer concrete nouns and relevant terms over abstract concepts.
3207
- Use lowercase. Avoid redundant or overly generic terms like "audio" or "content".`
3208
- },
3209
- qualityGuidelines: {
3210
- tag: "quality_guidelines",
3211
- content: dedent4`
3212
- - Analyze the full transcript to understand context and themes
3213
- - Be precise: use specific terminology when mentioned
3214
- - Capture the narrative: what is introduced, discussed, and concluded
3215
- - Balance brevity with informativeness`
3268
+
3269
+ // src/primitives/thumbnails.ts
3270
+ async function getThumbnailUrls(playbackId, duration, options = {}) {
3271
+ "use step";
3272
+ const { interval = 10, width = 640, shouldSign = false, maxSamples, credentials } = options;
3273
+ let timestamps = [];
3274
+ if (duration <= 50) {
3275
+ const spacing = duration / 6;
3276
+ for (let i = 1; i <= 5; i++) {
3277
+ timestamps.push(Math.round(i * spacing));
3278
+ }
3279
+ } else {
3280
+ for (let time = 0; time < duration; time += interval) {
3281
+ timestamps.push(time);
3282
+ }
3283
+ }
3284
+ if (maxSamples !== void 0 && timestamps.length > maxSamples) {
3285
+ const newTimestamps = [];
3286
+ newTimestamps.push(0);
3287
+ if (maxSamples >= 2) {
3288
+ const spacing = duration / (maxSamples - 1);
3289
+ for (let i = 1; i < maxSamples - 1; i++) {
3290
+ newTimestamps.push(spacing * i);
3216
3291
  }
3217
- },
3218
- sectionOrder: ["task", "title", "description", "keywords", "qualityGuidelines"]
3292
+ newTimestamps.push(duration);
3293
+ }
3294
+ timestamps = newTimestamps;
3295
+ }
3296
+ const baseUrl = getMuxThumbnailBaseUrl(playbackId);
3297
+ const urlPromises = timestamps.map(async (time) => {
3298
+ const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
3299
+ return { url, time };
3219
3300
  });
3301
+ return Promise.all(urlPromises);
3220
3302
  }
3221
- var SYSTEM_PROMPT3 = dedent4`
3222
- <role>
3223
- You are a video content analyst specializing in storyboard interpretation and multimodal analysis.
3224
- </role>
3225
3303
 
3226
- <context>
3227
- You receive storyboard images containing multiple sequential frames extracted from a video.
3228
- These frames are arranged in a grid and represent the visual progression of the content over time.
3229
- Read frames left-to-right, top-to-bottom to understand the temporal sequence.
3230
- </context>
3231
-
3232
- <transcript_guidance>
3233
- When a transcript is provided alongside the storyboard:
3234
- - Use it to understand spoken content, dialogue, narration, and audio context
3235
- - Correlate transcript content with visual frames to build a complete picture
3236
- - Extract key terminology, names, and specific language used by speakers
3237
- - Let the transcript inform keyword selection, especially for topics not visually obvious
3238
- - Prioritize visual content for the description, but enrich it with transcript insights
3239
- - If transcript and visuals conflict, trust the visual evidence
3240
- </transcript_guidance>
3241
-
3242
- <capabilities>
3243
- - Extract meaning from visual sequences
3244
- - Identify subjects, actions, settings, and narrative arcs
3245
- - Generate accurate, searchable metadata
3246
- - Synthesize visual and transcript information when provided
3247
- </capabilities>
3248
-
3249
- <constraints>
3250
- - Only describe what is clearly observable in the frames or explicitly stated in the transcript
3251
- - Do not fabricate details or make unsupported assumptions
3252
- - Return structured data matching the requested schema
3253
- - Output only the JSON object; no markdown or extra text
3254
- - When a <language> section is provided, all output text MUST be written in that language
3255
- </constraints>
3256
-
3257
- <tone_guidance>
3258
- Pay special attention to the <tone> section and lean heavily into those instructions.
3259
- Adapt your entire analysis and writing style to match the specified tone - this should influence
3260
- your word choice, personality, formality level, and overall presentation of the content.
3261
- The tone instructions are not suggestions but core requirements for how you should express yourself.
3262
- </tone_guidance>
3263
-
3264
- <language_guidelines>
3265
- AVOID these meta-descriptive phrases that reference the medium rather than the content:
3266
- - "The image shows..." / "The storyboard shows..."
3267
- - "In this video..." / "This video features..."
3268
- - "The frames depict..." / "The footage shows..."
3269
- - "We can see..." / "You can see..."
3270
- - "The clip shows..." / "The scene shows..."
3271
-
3272
- INSTEAD, describe the content directly:
3273
- - BAD: "The video shows a chef preparing a meal"
3274
- - GOOD: "A chef prepares a meal in a professional kitchen"
3275
-
3276
- Write as if describing reality, not describing a recording of reality.
3277
- </language_guidelines>`;
3278
- var AUDIO_ONLY_SYSTEM_PROMPT = dedent4`
3279
- <role>
3280
- You are an audio content analyst specializing in transcript analysis and metadata generation.
3281
- </role>
3282
-
3283
- <context>
3284
- You receive transcript text from audio-only content (podcasts, audiobooks, music, etc.).
3285
- Your task is to analyze the spoken/audio content and generate accurate, searchable metadata.
3286
- </context>
3287
-
3288
- <transcript_guidance>
3289
- - Carefully analyze the entire transcript to understand themes, topics, and key points
3290
- - Extract key terminology, names, concepts, and specific language used
3291
- - Identify the content type (interview, lecture, music, narration, etc.)
3292
- - Note the tone, style, and any distinctive characteristics of the audio
3293
- - Consider the intended audience and context based on language and content
3294
- </transcript_guidance>
3295
-
3296
- <capabilities>
3297
- - Extract meaning and themes from spoken/audio content
3298
- - Identify subjects, topics, speakers, and narrative structure
3299
- - Generate accurate, searchable metadata from audio-based content
3300
- - Understand context and intent from transcript alone
3301
- </capabilities>
3302
-
3303
- <constraints>
3304
- - Only describe what is explicitly stated or strongly implied in the transcript
3305
- - Do not fabricate details or make unsupported assumptions
3306
- - Return structured data matching the requested schema
3307
- - Focus entirely on audio/spoken content - there are no visual elements
3308
- - Output only the JSON object; no markdown or extra text
3309
- - When a <language> section is provided, all output text MUST be written in that language
3310
- </constraints>
3311
-
3312
- <tone_guidance>
3313
- Pay special attention to the <tone> section and lean heavily into those instructions.
3314
- Adapt your entire analysis and writing style to match the specified tone - this should influence
3315
- your word choice, personality, formality level, and overall presentation of the content.
3316
- The tone instructions are not suggestions but core requirements for how you should express yourself.
3317
- </tone_guidance>
3318
-
3319
- <language_guidelines>
3320
- AVOID these meta-descriptive phrases that reference the medium rather than the content:
3321
- - "The audio shows..." / "The transcript shows..."
3322
- - "In this recording..." / "This audio features..."
3323
- - "The speaker says..." / "We can hear..."
3324
- - "The clip contains..." / "The recording shows..."
3325
-
3326
- INSTEAD, describe the content directly:
3327
- - BAD: "The audio features a discussion about climate change"
3328
- - GOOD: "A panel discusses climate change impacts and solutions"
3329
-
3330
- Write as if describing reality, not describing a recording of reality.
3331
- </language_guidelines>`;
3332
- function buildUserPrompt4({
3333
- tone,
3334
- transcriptText,
3335
- isCleanTranscript = true,
3336
- promptOverrides,
3337
- isAudioOnly = false,
3338
- titleLength,
3339
- descriptionLength,
3340
- tagCount,
3341
- languageName
3342
- }) {
3343
- const contextSections = [createToneSection(TONE_INSTRUCTIONS[tone])];
3344
- if (languageName) {
3345
- contextSections.push(createLanguageSection(languageName));
3346
- }
3347
- if (transcriptText) {
3348
- const format = isCleanTranscript ? "plain text" : "WebVTT";
3349
- contextSections.push(createTranscriptSection(transcriptText, format));
3304
+ // src/workflows/moderation.ts
3305
+ var DEFAULT_THRESHOLDS = {
3306
+ sexual: 0.8,
3307
+ violence: 0.8
3308
+ };
3309
+ var DEFAULT_PROVIDER2 = "openai";
3310
+ var HIVE_ENDPOINT = "https://api.thehive.ai/api/v2/task/sync";
3311
+ var HIVE_SEXUAL_CATEGORIES = [
3312
+ "general_nsfw",
3313
+ "yes_sexual_activity",
3314
+ "yes_sex_toy",
3315
+ "yes_female_nudity",
3316
+ "yes_male_nudity"
3317
+ ];
3318
+ var HIVE_VIOLENCE_CATEGORIES = [
3319
+ "gun_in_hand",
3320
+ "gun_not_in_hand",
3321
+ "knife_in_hand",
3322
+ "very_bloody",
3323
+ "other_blood",
3324
+ "hanging",
3325
+ "noose",
3326
+ "human_corpse",
3327
+ "yes_emaciated_body",
3328
+ "yes_self_harm",
3329
+ "garm_death_injury_or_military_conflict"
3330
+ ];
3331
+ async function processConcurrently(items, processor, maxConcurrent = 5) {
3332
+ "use step";
3333
+ const results = [];
3334
+ for (let i = 0; i < items.length; i += maxConcurrent) {
3335
+ const batch = items.slice(i, i + maxConcurrent);
3336
+ const batchPromises = batch.map(processor);
3337
+ const batchResults = await Promise.all(batchPromises);
3338
+ results.push(...batchResults);
3350
3339
  }
3351
- const constraints = { titleLength, descriptionLength, tagCount };
3352
- const promptBuilder = isAudioOnly ? createAudioOnlyBuilder(constraints) : createSummarizationBuilder(constraints);
3353
- return promptBuilder.buildWithContext(promptOverrides, contextSections);
3340
+ return results;
3354
3341
  }
3355
- async function analyzeStoryboard2(imageDataUrl, provider, modelId, userPrompt, systemPrompt, credentials) {
3342
+ async function moderateImageWithOpenAI(entry) {
3356
3343
  "use step";
3357
- const model = await createLanguageModelFromConfig(provider, modelId, credentials);
3358
- const response = await generateText4({
3359
- model,
3360
- output: SUMMARY_OUTPUT,
3361
- messages: [
3362
- {
3363
- role: "system",
3364
- content: systemPrompt
3344
+ const apiKey = await getApiKeyFromEnv("openai", entry.credentials);
3345
+ try {
3346
+ const res = await fetch("https://api.openai.com/v1/moderations", {
3347
+ method: "POST",
3348
+ headers: {
3349
+ "Content-Type": "application/json",
3350
+ "Authorization": `Bearer ${apiKey}`
3365
3351
  },
3366
- {
3367
- role: "user",
3368
- content: [
3369
- { type: "text", text: userPrompt },
3370
- { type: "image", image: imageDataUrl }
3352
+ body: JSON.stringify({
3353
+ model: entry.model,
3354
+ input: [
3355
+ {
3356
+ type: "image_url",
3357
+ image_url: {
3358
+ url: entry.image
3359
+ }
3360
+ }
3371
3361
  ]
3372
- }
3373
- ]
3374
- });
3375
- if (!response.output) {
3376
- throw new Error("Summarization output missing");
3377
- }
3378
- const parsed = summarySchema.parse(response.output);
3379
- return {
3380
- result: parsed,
3381
- usage: {
3382
- inputTokens: response.usage.inputTokens,
3383
- outputTokens: response.usage.outputTokens,
3384
- totalTokens: response.usage.totalTokens,
3385
- reasoningTokens: response.usage.reasoningTokens,
3386
- cachedInputTokens: response.usage.cachedInputTokens
3362
+ })
3363
+ });
3364
+ const json = await res.json();
3365
+ if (!res.ok) {
3366
+ throw new Error(
3367
+ `OpenAI moderation error: ${res.status} ${res.statusText} - ${JSON.stringify(json)}`
3368
+ );
3387
3369
  }
3388
- };
3370
+ const categoryScores = json.results?.[0]?.category_scores || {};
3371
+ return {
3372
+ url: entry.url,
3373
+ time: entry.time,
3374
+ sexual: categoryScores.sexual || 0,
3375
+ violence: categoryScores.violence || 0,
3376
+ error: false
3377
+ };
3378
+ } catch (error) {
3379
+ console.error("OpenAI moderation failed:", error);
3380
+ return {
3381
+ url: entry.url,
3382
+ time: entry.time,
3383
+ sexual: 0,
3384
+ violence: 0,
3385
+ error: true,
3386
+ errorMessage: error instanceof Error ? error.message : String(error)
3387
+ };
3388
+ }
3389
3389
  }
3390
- async function analyzeAudioOnly(provider, modelId, userPrompt, systemPrompt, credentials) {
3390
+ async function requestOpenAIModeration(images, model, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
3391
3391
  "use step";
3392
- const model = await createLanguageModelFromConfig(provider, modelId, credentials);
3393
- const response = await generateText4({
3394
- model,
3395
- output: SUMMARY_OUTPUT,
3396
- messages: [
3397
- {
3398
- role: "system",
3399
- content: systemPrompt
3400
- },
3401
- {
3402
- role: "user",
3403
- content: userPrompt
3404
- }
3405
- ]
3406
- });
3407
- if (!response.output) {
3408
- throw new Error("Summarization output missing");
3409
- }
3410
- const parsed = summarySchema.parse(response.output);
3411
- return {
3412
- result: parsed,
3413
- usage: {
3414
- inputTokens: response.usage.inputTokens,
3415
- outputTokens: response.usage.outputTokens,
3416
- totalTokens: response.usage.totalTokens,
3417
- reasoningTokens: response.usage.reasoningTokens,
3418
- cachedInputTokens: response.usage.cachedInputTokens
3419
- }
3420
- };
3392
+ const imageUrls = images.map((img) => img.url);
3393
+ const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
3394
+ const targetUrls = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map(
3395
+ (img) => ({ url: img.url, time: timeByUrl.get(img.url), image: img.base64Data, model, credentials })
3396
+ ) : images.map((img) => ({ url: img.url, time: img.time, image: img.url, model, credentials }));
3397
+ return processConcurrently(targetUrls, moderateImageWithOpenAI, maxConcurrent);
3421
3398
  }
3422
- function normalizeKeywords(keywords, limit = SUMMARY_KEYWORD_LIMIT) {
3423
- if (!Array.isArray(keywords) || keywords.length === 0) {
3424
- return [];
3425
- }
3426
- const uniqueLowercase = /* @__PURE__ */ new Set();
3427
- const normalized = [];
3428
- for (const keyword of keywords) {
3429
- const trimmed = keyword?.trim();
3430
- if (!trimmed) {
3431
- continue;
3432
- }
3433
- const lower = trimmed.toLowerCase();
3434
- if (uniqueLowercase.has(lower)) {
3435
- continue;
3436
- }
3437
- uniqueLowercase.add(lower);
3438
- normalized.push(trimmed);
3439
- if (normalized.length === limit) {
3440
- break;
3441
- }
3442
- }
3443
- return normalized;
3444
- }
3445
- async function getSummaryAndTags(assetId, options) {
3446
- "use workflow";
3447
- const {
3448
- provider = "openai",
3449
- model,
3450
- tone = "neutral",
3451
- includeTranscript = true,
3452
- cleanTranscript = true,
3453
- imageSubmissionMode = "url",
3454
- imageDownloadOptions,
3455
- promptOverrides,
3456
- credentials,
3457
- titleLength,
3458
- descriptionLength,
3459
- tagCount,
3460
- outputLanguageCode
3461
- } = options ?? {};
3462
- if (!VALID_TONES.includes(tone)) {
3463
- throw new Error(
3464
- `Invalid tone "${tone}". Valid tones are: ${VALID_TONES.join(", ")}`
3465
- );
3466
- }
3467
- const modelConfig = resolveLanguageModelConfig({
3468
- ...options,
3469
- model,
3470
- provider
3471
- });
3472
- const workflowCredentials = credentials;
3473
- const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, workflowCredentials);
3474
- const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
3475
- const isAudioOnly = isAudioOnlyAsset(assetData);
3476
- if (isAudioOnly && !includeTranscript) {
3477
- throw new Error(
3478
- "Audio-only assets require a transcript. Set includeTranscript: true and ensure the asset has a ready text track (captions/subtitles)."
3479
- );
3480
- }
3481
- const signingContext = await resolveMuxSigningContext(workflowCredentials);
3482
- if (policy === "signed" && !signingContext) {
3483
- throw new Error(
3484
- "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
3485
- );
3486
- }
3487
- const transcriptResult = includeTranscript ? await fetchTranscriptForAsset(assetData, playbackId, {
3488
- cleanTranscript,
3489
- shouldSign: policy === "signed",
3490
- credentials: workflowCredentials,
3491
- required: isAudioOnly
3492
- }) : void 0;
3493
- const transcriptText = transcriptResult?.transcriptText ?? "";
3494
- const resolvedLanguageCode = outputLanguageCode && outputLanguageCode !== "auto" ? outputLanguageCode : transcriptResult?.track?.language_code ?? getReadyTextTracks(assetData)[0]?.language_code;
3495
- const languageName = resolvedLanguageCode ? getLanguageName(resolvedLanguageCode) : void 0;
3496
- const userPrompt = buildUserPrompt4({
3497
- tone,
3498
- transcriptText,
3499
- isCleanTranscript: cleanTranscript,
3500
- promptOverrides,
3501
- isAudioOnly,
3502
- titleLength,
3503
- descriptionLength,
3504
- tagCount,
3505
- languageName
3506
- });
3507
- let analysisResponse;
3508
- let imageUrl;
3509
- const systemPrompt = isAudioOnly ? AUDIO_ONLY_SYSTEM_PROMPT : SYSTEM_PROMPT3;
3399
/**
 * Scores one piece of text with the OpenAI moderation endpoint.
 *
 * Request and response problems are reported through the returned object
 * (`error: true` plus `errorMessage`) instead of being thrown. Resolving the
 * API key happens before the `try`, so a failure there still rejects.
 *
 * @param {string} text - Text to moderate.
 * @param {string} model - Moderation model id sent in the request body.
 * @param {string} url - Identifier echoed back on the result (e.g. `transcript:3`).
 * @param {object} [credentials] - Forwarded to `getApiKeyFromEnv`.
 * @returns {Promise<{url: string, sexual: number, violence: number, error: boolean, errorMessage?: string}>}
 */
async function requestOpenAITextModeration(text, model, url, credentials) {
  "use step";
  const apiKey = await getApiKeyFromEnv("openai", credentials);
  try {
    const res = await fetch("https://api.openai.com/v1/moderations", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "Authorization": `Bearer ${apiKey}`
      },
      body: JSON.stringify({
        model,
        input: text
      })
    });
    // Error responses (proxy/gateway pages, empty bodies) are not guaranteed to
    // be JSON. Tolerate a parse failure so the HTTP status below is what gets
    // reported, rather than an opaque "Unexpected token" SyntaxError.
    const json = await res.json().catch(() => void 0);
    if (!res.ok) {
      throw new Error(
        `OpenAI moderation error: ${res.status} ${res.statusText} - ${JSON.stringify(json)}`
      );
    }
    // A successful reply without category scores must not be treated as
    // "clean content": surface it as a failure instead of defaulting to 0.
    const categoryScores = json?.results?.[0]?.category_scores;
    if (categoryScores == null || typeof categoryScores !== "object") {
      throw new TypeError(
        `Unexpected OpenAI moderation response structure: ${JSON.stringify(json)}`
      );
    }
    return {
      url,
      sexual: categoryScores.sexual || 0,
      violence: categoryScores.violence || 0,
      error: false
    };
  } catch (error) {
    console.error("OpenAI text moderation failed:", error);
    return {
      url,
      sexual: 0,
      violence: 0,
      error: true,
      errorMessage: error instanceof Error ? error.message : String(error)
    };
  }
}
3438
/**
 * Splits text into trimmed chunks of at most `maxUnits` UTF-16 code units.
 *
 * - Blank input yields `[]`.
 * - Text that already fits is returned as a single, untrimmed chunk.
 * - Chunks that are empty after trimming are dropped.
 * - A chunk boundary is moved back by one unit when it would otherwise fall
 *   between the two halves of a surrogate pair, so no chunk starts or ends
 *   with half of an astral character (unless `maxUnits` is 1, where keeping
 *   the pair together is impossible without exceeding the limit).
 *
 * @param {string} text - Text to split.
 * @param {number} maxUnits - Maximum chunk length in UTF-16 code units (>= 1).
 *   Fractional values are floored.
 * @returns {string[]} The chunks, in order.
 * @throws {RangeError} If `text` is non-blank and `maxUnits` is not >= 1
 *   (previously this looped forever or silently returned nothing).
 */
function chunkTextByUtf16CodeUnits(text, maxUnits) {
  if (!text.trim()) {
    return [];
  }
  // Written as a negated comparison so NaN and undefined are rejected too.
  if (!(maxUnits >= 1)) {
    throw new RangeError(`maxUnits must be a number >= 1, received ${maxUnits}`);
  }
  if (text.length <= maxUnits) {
    return [text];
  }
  const step = Math.floor(maxUnits);
  const chunks = [];
  let start = 0;
  while (start < text.length) {
    let end = Math.min(start + step, text.length);
    if (end < text.length && end - start > 1) {
      const last = text.charCodeAt(end - 1);
      const next = text.charCodeAt(end);
      const splitsPair = last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff;
      if (splitsPair) {
        // Leave the high surrogate for the next chunk so the pair stays intact.
        end -= 1;
      }
    }
    const chunk = text.slice(start, end).trim();
    if (chunk) {
      chunks.push(chunk);
    }
    start = end;
  }
  return chunks;
}
3574
-
3575
- // src/lib/s3-sigv4.ts
3576
- var AWS4_ALGORITHM = "AWS4-HMAC-SHA256";
3577
- var AWS4_REQUEST_TERMINATOR = "aws4_request";
3578
- var AWS4_SERVICE = "s3";
3579
- var S3_ALLOWED_ENDPOINT_PATTERNS = parseEndpointAllowlist(
3580
- env_default.S3_ALLOWED_ENDPOINT_HOSTS
3581
- );
3582
- function getCrypto() {
3583
- const webCrypto = globalThis.crypto;
3584
- if (!webCrypto?.subtle) {
3585
- throw new Error("Web Crypto API is required for S3 signing.");
3454
/**
 * Moderates a transcript with OpenAI by splitting it into chunks of at most
 * 10,000 UTF-16 code units and scoring each chunk independently.
 *
 * Each result is labelled `transcript:<chunk index>`. When the transcript
 * produces no chunks, a single error entry labelled `transcript:0` is returned.
 *
 * @param {string} transcriptText - Transcript to moderate.
 * @param {string} model - Moderation model id.
 * @param {number} [maxConcurrent=5] - Concurrency passed to `processConcurrently`.
 * @param {object} [credentials] - Forwarded to the per-chunk request.
 * @returns {Promise<object[]>} One score entry per chunk.
 */
async function requestOpenAITranscriptModeration(transcriptText, model, maxConcurrent = 5, credentials) {
  "use step";
  const pieces = chunkTextByUtf16CodeUnits(transcriptText, 1e4);
  if (pieces.length === 0) {
    const emptyResult = {
      url: "transcript:0",
      sexual: 0,
      violence: 0,
      error: true,
      errorMessage: "No transcript chunks to moderate"
    };
    return [emptyResult];
  }
  const jobs = pieces.map((chunk, position) => {
    return { chunk, url: `transcript:${position}` };
  });
  const moderateJob = async (job) => requestOpenAITextModeration(job.chunk, model, job.url, credentials);
  return processConcurrently(jobs, moderateJob, maxConcurrent);
}
3626
- function encodePath(path) {
3627
- return path.split("/").map((segment) => encodeRFC3986(segment)).join("/");
3472
/**
 * Returns the highest score among the requested Hive classes.
 *
 * Classes absent from the response count as 0 and are reported once through
 * `console.warn`. The result is never below 0.
 *
 * @param {Array<{class: string, score: number}>} classes - Class/score pairs
 *   from a Hive response.
 * @param {string[]} categoryNames - Class names to consider.
 * @returns {number} The maximum score across `categoryNames`, or 0.
 */
function getHiveCategoryScores(classes, categoryNames) {
  // A Map (rather than a plain object) keeps inherited keys such as
  // "toString" or "constructor" from being mistaken for returned classes,
  // which previously suppressed the warning and produced NaN.
  const scoreByClass = new Map(classes.map((c) => [c.class, c.score]));
  const missingCategories = categoryNames.filter((category) => !scoreByClass.has(category));
  if (missingCategories.length > 0) {
    console.warn(
      `Hive response missing expected categories: ${missingCategories.join(", ")}`
    );
  }
  const scores = categoryNames.map((category) => scoreByClass.get(category) || 0);
  return Math.max(...scores, 0);
}
3629
- function normalizeEndpoint(endpoint) {
3630
- let url;
3485
/**
 * Moderates one image with Hive and returns its sexual/violence scores.
 *
 * The image is submitted either by URL (`source.kind === "url"`) or as an
 * uploaded file built from `source.buffer` / `source.contentType`. The request
 * is aborted if no response arrives within 15 seconds. Any failure inside the
 * request/parse path is returned as `{ error: true, errorMessage }` with zero
 * scores; resolving the API key happens before that and can still reject.
 *
 * @param {{url: string, time: number, source: object, credentials?: object}} entry
 * @returns {Promise<{url: string, time: number, sexual: number, violence: number, error: boolean, errorMessage?: string}>}
 */
async function moderateImageWithHive(entry) {
  "use step";
  const apiKey = await getApiKeyFromEnv("hive", entry.credentials);
  try {
    const payload = new FormData();
    const imageSource = entry.source;
    if (imageSource.kind === "url") {
      payload.append("url", imageSource.value);
    } else {
      // Derive a file extension from the MIME subtype, e.g. "image/png" -> "png".
      const [, subtype] = imageSource.contentType.split("/");
      const fileBlob = new Blob([imageSource.buffer], {
        type: imageSource.contentType
      });
      payload.append("media", fileBlob, `thumbnail.${subtype || "jpg"}`);
    }

    const abortController = new AbortController();
    const timeoutHandle = setTimeout(() => abortController.abort(), 15e3);
    let response;
    try {
      response = await fetch(HIVE_ENDPOINT, {
        method: "POST",
        headers: {
          Accept: "application/json",
          Authorization: `Token ${apiKey}`
        },
        body: payload,
        signal: abortController.signal
      });
    } catch (fetchError) {
      if (fetchError?.name === "AbortError") {
        throw new Error("Hive request timed out after 15s");
      }
      throw fetchError;
    } finally {
      clearTimeout(timeoutHandle);
    }

    // A body that is not JSON is tolerated here so the checks below can report it.
    const body = await response.json().catch(() => void 0);
    if (!response.ok) {
      throw new Error(
        `Hive moderation error: ${response.status} ${response.statusText} - ${JSON.stringify(body)}`
      );
    }
    const hasFailureCode = body?.return_code != null && body.return_code !== 0;
    if (hasFailureCode) {
      throw new Error(
        `Hive API error (return_code ${body.return_code}): ${body.message || "Unknown error"}`
      );
    }
    const classes = body?.status?.[0]?.response?.output?.[0]?.classes;
    if (!Array.isArray(classes)) {
      throw new TypeError(
        `Unexpected Hive response structure: ${JSON.stringify(body)}`
      );
    }

    const sexual = getHiveCategoryScores(classes, HIVE_SEXUAL_CATEGORIES);
    const violence = getHiveCategoryScores(classes, HIVE_VIOLENCE_CATEGORIES);
    return { url: entry.url, time: entry.time, sexual, violence, error: false };
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    return {
      url: entry.url,
      time: entry.time,
      sexual: 0,
      violence: 0,
      error: true,
      errorMessage
    };
  }
}
3642
- function parseEndpointAllowlist(allowlist) {
3643
- if (!allowlist) {
3644
- return [];
3645
- }
3646
- return allowlist.split(",").map((value) => value.trim().toLowerCase()).filter(Boolean);
3557
/**
 * Moderates a list of thumbnails with Hive.
 *
 * In `"base64"` mode the images are downloaded first and uploaded to Hive as
 * files; otherwise Hive is given the image URLs directly. Each thumbnail's
 * `time` is carried through to its result.
 *
 * @param {Array<{url: string, time: number}>} images - Thumbnails to moderate.
 * @param {number} [maxConcurrent=5] - Concurrency for downloads and requests.
 * @param {string} [submissionMode="url"] - `"url"` or `"base64"`.
 * @param {object} [downloadOptions] - Forwarded to `downloadImagesAsBase64`.
 * @param {object} [credentials] - Forwarded to each Hive request.
 * @returns {Promise<object[]>} One score entry per image.
 */
async function requestHiveModeration(images, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
  "use step";
  let targets;
  if (submissionMode === "base64") {
    const urls = images.map((image) => image.url);
    // Downloaded entries are matched back to their timestamp via the URL.
    const timeByUrl = new Map(images.map((image) => [image.url, image.time]));
    const downloaded = await downloadImagesAsBase64(urls, downloadOptions, maxConcurrent);
    targets = downloaded.map((file) => {
      const source = {
        kind: "file",
        buffer: file.buffer,
        contentType: file.contentType
      };
      return { url: file.url, time: timeByUrl.get(file.url), source, credentials };
    });
  } else {
    targets = images.map((image) => {
      const source = { kind: "url", value: image.url };
      return { url: image.url, time: image.time, source, credentials };
    });
  }
  return await processConcurrently(targets, moderateImageWithHive, maxConcurrent);
}
3648
- function hostnameMatchesPattern(hostname, pattern) {
3649
- if (pattern.startsWith("*.")) {
3650
- const suffix = pattern.slice(1);
3651
- return hostname.endsWith(suffix) && hostname.length > suffix.length;
3652
- }
3653
- return hostname === pattern;
3578
/**
 * Builds one thumbnail URL per timestamp for a playback ID.
 *
 * Timestamps are given in milliseconds and converted to seconds rounded to two
 * decimals. URLs are signed through `signUrl` when `options.shouldSign` is
 * set; otherwise `time` and `width` are appended as query parameters.
 *
 * @param {string} playbackId
 * @param {number[]} timestampsMs - Sample positions in milliseconds.
 * @param {{width: number, shouldSign: boolean, credentials?: object}} options
 * @returns {Promise<Array<{url: string, time: number}>>} URLs with their time in seconds.
 */
async function getThumbnailUrlsFromTimestamps(playbackId, timestampsMs, options) {
  "use step";
  const { width, shouldSign, credentials } = options;
  const thumbnailBase = getMuxThumbnailBaseUrl(playbackId);

  const toThumbnail = async (timestampMs) => {
    const time = Number((timestampMs / 1e3).toFixed(2));
    if (shouldSign) {
      const signedUrl = await signUrl(thumbnailBase, playbackId, "thumbnail", { time, width }, credentials);
      return { url: signedUrl, time };
    }
    return { url: `${thumbnailBase}?time=${time}&width=${width}`, time };
  };

  return Promise.all(timestampsMs.map((timestampMs) => toThumbnail(timestampMs)));
}
3655
- function enforceEndpointPolicy(url) {
3656
- const hostname = url.hostname.toLowerCase();
3657
- if (url.protocol !== "https:") {
3589
/**
 * Runs content moderation for a Mux asset and reports the highest scores.
 *
 * Assets with video are sampled as thumbnails, either at a fixed interval or,
 * when `maxSamples` is set, at planned timestamps. The thumbnails are scored by
 * the chosen provider. Audio-only assets are moderated through their
 * transcript, which only the OpenAI provider supports.
 *
 * @param {string} assetId - Mux asset id.
 * @param {object} [options]
 * @param {string} [options.provider] - `"openai"` or `"hive"`; defaults to `DEFAULT_PROVIDER2`.
 * @param {string} [options.model] - Moderation model; defaults to
 *   `"omni-moderation-latest"` for OpenAI.
 * @param {string} [options.languageCode] - Transcript language for audio-only assets.
 * @param {{sexual?: number, violence?: number}} [options.thresholds] - Merged over `DEFAULT_THRESHOLDS`.
 * @param {number} [options.thumbnailInterval=10] - Sampling interval.
 * @param {number} [options.thumbnailWidth=640] - Thumbnail width.
 * @param {number} [options.maxSamples] - When set, switches to planned-timestamp sampling.
 * @param {number} [options.maxConcurrent=5] - Concurrency for provider requests.
 * @param {string} [options.imageSubmissionMode="url"] - `"url"` or `"base64"`.
 * @param {object} [options.imageDownloadOptions] - Used in `"base64"` mode.
 * @param {object} [options.credentials] - Credentials for Mux and the provider.
 * @returns {Promise<object>} `assetId`, `mode`, `isAudioOnly`, per-sample
 *   `thumbnailScores`, `usage`, `maxScores`, `exceedsThreshold`, `thresholds`.
 * @throws {Error} For signed playback without signing credentials, an
 *   unsupported provider, Hive on an audio-only asset, or any failed sample.
 */
async function getModerationScores(assetId, options = {}) {
  "use workflow";
  const {
    provider = DEFAULT_PROVIDER2,
    model = provider === "openai" ? "omni-moderation-latest" : void 0,
    languageCode,
    thresholds = DEFAULT_THRESHOLDS,
    thumbnailInterval = 10,
    thumbnailWidth = 640,
    maxSamples,
    maxConcurrent = 5,
    imageSubmissionMode = "url",
    imageDownloadOptions,
    credentials
  } = options;
  const { asset, playbackId, policy } = await getPlaybackIdForAsset(assetId, credentials);
  const videoTrackDurationSeconds = getVideoTrackDurationSecondsFromAsset(asset);
  const videoTrackFps = getVideoTrackMaxFrameRateFromAsset(asset);
  const assetDurationSeconds = getAssetDurationSecondsFromAsset(asset);
  // Use the shorter of the known durations so samples stay inside both.
  const candidateDurations = [videoTrackDurationSeconds, assetDurationSeconds].filter(
    (d) => d != null
  );
  const duration = candidateDurations.length > 0 ? Math.min(...candidateDurations) : 0;
  const isAudioOnly = isAudioOnlyAsset(asset);
  const signingContext = await resolveMuxSigningContext(credentials);
  const shouldSign = policy === "signed";
  if (shouldSign && !signingContext) {
    throw new Error(
      "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
    );
  }
  let thumbnailScores;
  let mode = "thumbnails";
  let thumbnailCount;
  if (isAudioOnly) {
    mode = "transcript";
    const readyTextTracks = getReadyTextTracks(asset);
    let transcriptResult = await fetchTranscriptForAsset(asset, playbackId, {
      languageCode,
      cleanTranscript: true,
      shouldSign,
      credentials,
      required: true
    });
    // No track matched the requested language but exactly one ready track
    // exists: retry without a language filter.
    if (!transcriptResult.track && readyTextTracks.length === 1) {
      transcriptResult = await fetchTranscriptForAsset(asset, playbackId, {
        cleanTranscript: true,
        shouldSign,
        credentials,
        required: true
      });
    }
    if (provider === "openai") {
      thumbnailScores = await requestOpenAITranscriptModeration(
        transcriptResult.transcriptText,
        model || "omni-moderation-latest",
        maxConcurrent,
        credentials
      );
    } else if (provider === "hive") {
      throw new Error("Hive does not support transcript moderation in this workflow. Use provider: 'openai' for audio-only assets.");
    } else {
      throw new Error(`Unsupported moderation provider: ${provider}`);
    }
  } else {
    let thumbnailUrls;
    if (maxSamples === void 0) {
      // Generate thumbnail URLs (signed if needed) using existing interval-based logic.
      thumbnailUrls = await getThumbnailUrls(playbackId, duration, {
        interval: thumbnailInterval,
        width: thumbnailWidth,
        shouldSign,
        credentials
      });
    } else {
      // In maxSamples mode, sample valid timestamps over the trimmed usable span.
      // Use proportional trims (≈ duration/6, capped at 5s) to stay well inside the
      // renderable range — Mux can't always serve thumbnails at the very edges.
      const edgeTrimSeconds = duration > 2 ? Math.min(5, Math.max(1, duration / 6)) : 0;
      const timestamps = planSamplingTimestamps({
        duration_sec: duration,
        max_candidates: maxSamples,
        trim_start_sec: edgeTrimSeconds,
        trim_end_sec: edgeTrimSeconds,
        fps: videoTrackFps,
        base_cadence_hz: thumbnailInterval > 0 ? 1 / thumbnailInterval : void 0
      });
      thumbnailUrls = await getThumbnailUrlsFromTimestamps(playbackId, timestamps, {
        width: thumbnailWidth,
        shouldSign,
        credentials
      });
    }
    thumbnailCount = thumbnailUrls.length;
    if (provider === "openai") {
      thumbnailScores = await requestOpenAIModeration(
        thumbnailUrls,
        model || "omni-moderation-latest",
        maxConcurrent,
        imageSubmissionMode,
        imageDownloadOptions,
        credentials
      );
    } else if (provider === "hive") {
      thumbnailScores = await requestHiveModeration(
        thumbnailUrls,
        maxConcurrent,
        imageSubmissionMode,
        imageDownloadOptions,
        credentials
      );
    } else {
      throw new Error(`Unsupported moderation provider: ${provider}`);
    }
  }
  const failed = thumbnailScores.filter((s) => s.error);
  if (failed.length > 0) {
    const details = failed.map((s) => `${s.url}: ${s.errorMessage || "Unknown error"}`).join("; ");
    throw new Error(
      `Moderation failed for ${failed.length}/${thumbnailScores.length} thumbnail(s): ${details}`
    );
  }
  // Fold from 0 instead of spreading into Math.max: with no samples (e.g. a
  // zero-length asset) Math.max() would return -Infinity, which also
  // serializes to null in JSON.
  const maxSexual = thumbnailScores.reduce((max, s) => Math.max(max, s.sexual), 0);
  const maxViolence = thumbnailScores.reduce((max, s) => Math.max(max, s.violence), 0);
  const finalThresholds = { ...DEFAULT_THRESHOLDS, ...thresholds };
  return {
    assetId,
    mode,
    isAudioOnly,
    thumbnailScores,
    usage: {
      metadata: {
        assetDurationSeconds: duration,
        ...thumbnailCount === void 0 ? {} : { thumbnailCount }
      }
    },
    maxScores: {
      sexual: maxSexual,
      violence: maxViolence
    },
    exceedsThreshold: maxSexual > finalThresholds.sexual || maxViolence > finalThresholds.violence,
    thresholds: finalThresholds
  };
}
3668
- function buildCanonicalUri(endpoint, bucket, key) {
3669
- const endpointPath = endpoint.pathname === "/" ? "" : encodePath(endpoint.pathname.replace(/\/+$/, ""));
3670
- const encodedBucket = encodeRFC3986(bucket);
3671
- const encodedKey = encodePath(key);
3672
- return `${endpointPath}/${encodedBucket}/${encodedKey}`;
3673
- }
3674
- function buildCanonicalQuery(params) {
3675
- return Object.entries(params).sort(([a], [b]) => a.localeCompare(b)).map(([key, value]) => `${encodeRFC3986(key)}=${encodeRFC3986(value)}`).join("&");
3676
- }
3677
- async function signString(secretAccessKey, shortDate, region, value) {
3678
- const signingKey = await deriveSigningKey(secretAccessKey, shortDate, region);
3679
- const signatureBytes = await hmacSha256Raw(signingKey, value);
3680
- return bytesToHex(signatureBytes);
3734
+
3735
+ // src/workflows/summarization.ts
3736
+ import { generateText as generateText5, Output as Output5 } from "ai";
3737
+ import dedent5 from "dedent";
3738
+ import { z as z6 } from "zod";
3739
// Fallbacks used by the prompt builders when the caller does not pass
// tagCount / titleLength / descriptionLength. Title and description lengths
// are expressed in words.
var DEFAULT_SUMMARY_KEYWORD_LIMIT = 10;
var DEFAULT_TITLE_LENGTH = 10;
var DEFAULT_DESCRIPTION_LENGTH = 50;

// Word-count cut-offs for description guidance: below SMALL a brief summary is
// requested, above LARGE a detailed one, and the standard wording otherwise.
var DESCRIPTION_LENGTH_THRESHOLD_SMALL = 25;
var DESCRIPTION_LENGTH_THRESHOLD_LARGE = 100;

// Shape of the structured model output. `.strict()` rejects unknown keys.
var summarySchema = z6
  .object({
    keywords: z6.array(z6.string()),
    title: z6.string(),
    description: z6.string()
  })
  .strict();

// Structured-output descriptor built from the schema above.
var SUMMARY_OUTPUT = Output5.object({
  name: "summary_metadata",
  description: "Structured summary with title, description, and keywords.",
  schema: summarySchema
});

// Supported tone names; each has a matching entry in TONE_INSTRUCTIONS.
var VALID_TONES = ["neutral", "playful", "professional"];
var TONE_INSTRUCTIONS = {
  neutral: "Provide a clear, straightforward analysis.",
  playful: "Channel your inner diva! Answer with maximum sass, wit, and playful attitude. Don't hold back - be cheeky, clever, and delightfully snarky. Make it pop!",
  professional: "Provide a professional, executive-level analysis suitable for business reporting."
};
3760
/**
 * Returns the description-requirements text for the summarization prompt.
 *
 * The wording depends on the requested length (brief below
 * DESCRIPTION_LENGTH_THRESHOLD_SMALL words, detailed above
 * DESCRIPTION_LENGTH_THRESHOLD_LARGE, standard otherwise) and on the content
 * type: `"video"` gets visual-focused wording, anything else gets the audio
 * wording.
 *
 * @param {number} wordCount - Target description length in words.
 * @param {string} contentType - `"video"` or an audio content type.
 * @returns {string} Guidance text with the word count interpolated.
 */
function buildDescriptionGuidance(wordCount, contentType) {
  const isVideo = contentType === "video";

  if (wordCount < DESCRIPTION_LENGTH_THRESHOLD_SMALL) {
    return isVideo
      ? dedent5`A brief summary of the video in approximately ${wordCount} words.
          Focus on the single most important subject or action.
          Write in present tense.`
      : dedent5`A brief summary of the audio content in approximately ${wordCount} words.
          Focus on the single most important topic or theme.
          Write in present tense.`;
  }

  if (wordCount > DESCRIPTION_LENGTH_THRESHOLD_LARGE) {
    return isVideo
      ? dedent5`A detailed summary that describes what happens across the video.
          Aim for approximately ${wordCount} words, and you may use multiple sentences.
          Be thorough: cover subjects, actions, setting, progression, and any notable details visible across frames.
          Write in present tense. Be specific about observable details rather than making assumptions.
          If the transcript provides dialogue or narration, incorporate key points but prioritize visual content.`
      : dedent5`A detailed summary that describes the audio content.
          Aim for approximately ${wordCount} words, and you may use multiple sentences.
          Be thorough: cover topics, speakers, themes, progression, and any notable insights.
          Write in present tense. Be specific about what is discussed or presented rather than making assumptions.
          Focus on the spoken content and any key insights, dialogue, or narrative elements.`;
  }

  return isVideo
    ? dedent5`A summary that describes what happens across the video.
        Aim for approximately ${wordCount} words, and you may use multiple sentences.
        Cover the main subjects, actions, setting, and any notable progression visible across frames.
        Write in present tense. Be specific about observable details rather than making assumptions.
        If the transcript provides dialogue or narration, incorporate key points but prioritize visual content.`
    : dedent5`A summary that describes the audio content.
        Aim for approximately ${wordCount} words, and you may use multiple sentences.
        Cover the main topics, speakers, themes, and any notable progression in the discussion or narration.
        Write in present tense. Be specific about what is discussed or presented rather than making assumptions.
        Focus on the spoken content and any key insights, dialogue, or narrative elements.`;
}
3682
- function buildCredentialScope(shortDate, region) {
3683
- return `${shortDate}/${region}/${AWS4_SERVICE}/${AWS4_REQUEST_TERMINATOR}`;
3798
/**
 * Creates the prompt builder used to summarize assets that have video.
 *
 * Sections are emitted in the order task, title, description, keywords,
 * quality guidelines. Unspecified lengths fall back to DEFAULT_TITLE_LENGTH,
 * DEFAULT_DESCRIPTION_LENGTH and DEFAULT_SUMMARY_KEYWORD_LIMIT.
 *
 * @param {object} [lengths]
 * @param {number} [lengths.titleLength] - Target title length in words.
 * @param {number} [lengths.descriptionLength] - Target description length in words.
 * @param {number} [lengths.tagCount] - Maximum number of keywords to request.
 * @returns {object} The value returned by `createPromptBuilder`.
 */
function createSummarizationBuilder({ titleLength, descriptionLength, tagCount } = {}) {
  const titleWords = titleLength ?? DEFAULT_TITLE_LENGTH;
  const titleBrevity = `Aim for approximately ${titleWords} words.`;
  const keywordLimit = tagCount ?? DEFAULT_SUMMARY_KEYWORD_LIMIT;
  const descriptionWords = descriptionLength ?? DEFAULT_DESCRIPTION_LENGTH;

  const task = {
    tag: "task",
    content: "Analyze the storyboard frames and generate metadata that captures the essence of the video content."
  };
  const title = {
    tag: "title_requirements",
    content: dedent5`
      A short, compelling headline that immediately communicates the subject or action.
      ${titleBrevity} Think of how a news headline or video card title would read.
      Start with the primary subject, action, or topic - never begin with "A video of" or similar phrasing.
      Use active, specific language.`
  };
  const description = {
    tag: "description_requirements",
    content: buildDescriptionGuidance(descriptionWords, "video")
  };
  const keywords = {
    tag: "keywords_requirements",
    content: dedent5`
      Specific, searchable terms (up to ${keywordLimit}) that capture:
      - Primary subjects (people, animals, objects)
      - Actions and activities being performed
      - Setting and environment
      - Notable objects or tools
      - Style or genre (if applicable)
      Prefer concrete nouns and action verbs over abstract concepts.
      Use lowercase. Avoid redundant or overly generic terms like "video" or "content".`
  };
  const qualityGuidelines = {
    tag: "quality_guidelines",
    content: dedent5`
      - Examine all frames to understand the full context and progression
      - Be precise: "golden retriever" is better than "dog" when identifiable
      - Capture the narrative: what begins, develops, and concludes
      - Balance brevity with informativeness`
  };

  return createPromptBuilder({
    template: { task, title, description, keywords, qualityGuidelines },
    sectionOrder: ["task", "title", "description", "keywords", "qualityGuidelines"]
  });
}
3685
- async function putObjectToS3({
3686
- accessKeyId,
3687
- secretAccessKey,
3688
- endpoint,
3689
- region,
3690
- bucket,
3691
- key,
3692
- body,
3693
- contentType
3694
- }) {
3695
- const resolvedEndpoint = normalizeEndpoint(endpoint);
3696
- const canonicalUri = buildCanonicalUri(resolvedEndpoint, bucket, key);
3697
- const host = resolvedEndpoint.host;
3698
- const normalizedContentType = contentType?.trim();
3699
- const { amzDate, shortDate } = formatAmzDate();
3700
- const payloadHash = await sha256Hex(body);
3701
- const signingHeaders = [
3702
- ["host", host],
3703
- ["x-amz-content-sha256", payloadHash],
3704
- ["x-amz-date", amzDate],
3705
- ...normalizedContentType ? [["content-type", normalizedContentType]] : []
3706
- ].sort(([a], [b]) => a.localeCompare(b));
3707
- const canonicalHeaders = signingHeaders.map(([name, value]) => `${name}:${value}`).join("\n");
3708
- const signedHeaders = signingHeaders.map(([name]) => name).join(";");
3709
- const canonicalRequest = [
3710
- "PUT",
3711
- canonicalUri,
3712
- "",
3713
- `${canonicalHeaders}
3714
- `,
3715
- signedHeaders,
3716
- payloadHash
3717
- ].join("\n");
3718
- const credentialScope = buildCredentialScope(shortDate, region);
3719
- const stringToSign = [
3720
- AWS4_ALGORITHM,
3721
- amzDate,
3722
- credentialScope,
3723
- await sha256Hex(canonicalRequest)
3724
- ].join("\n");
3725
- const signature = await signString(secretAccessKey, shortDate, region, stringToSign);
3726
- const authorization = `${AWS4_ALGORITHM} Credential=${accessKeyId}/${credentialScope}, SignedHeaders=${signedHeaders}, Signature=${signature}`;
3727
- const requestUrl = `${resolvedEndpoint.origin}${canonicalUri}`;
3728
- const response = await fetch(requestUrl, {
3729
- method: "PUT",
3730
- headers: {
3731
- "Authorization": authorization,
3732
- "x-amz-content-sha256": payloadHash,
3733
- "x-amz-date": amzDate,
3734
- ...normalizedContentType ? { "content-type": normalizedContentType } : {}
3843
/**
 * Creates the prompt builder used to summarize audio-only assets from their
 * transcript.
 *
 * Sections are emitted in the order task, title, description, keywords,
 * quality guidelines. Unspecified lengths fall back to DEFAULT_TITLE_LENGTH,
 * DEFAULT_DESCRIPTION_LENGTH and DEFAULT_SUMMARY_KEYWORD_LIMIT.
 *
 * @param {object} [lengths]
 * @param {number} [lengths.titleLength] - Target title length in words.
 * @param {number} [lengths.descriptionLength] - Target description length in words.
 * @param {number} [lengths.tagCount] - Maximum number of keywords to request.
 * @returns {object} The value returned by `createPromptBuilder`.
 */
function createAudioOnlyBuilder({ titleLength, descriptionLength, tagCount } = {}) {
  const titleWords = titleLength ?? DEFAULT_TITLE_LENGTH;
  const titleBrevity = `Aim for approximately ${titleWords} words.`;
  const keywordLimit = tagCount ?? DEFAULT_SUMMARY_KEYWORD_LIMIT;
  const descriptionWords = descriptionLength ?? DEFAULT_DESCRIPTION_LENGTH;

  const task = {
    tag: "task",
    content: "Analyze the transcript and generate metadata that captures the essence of the audio content."
  };
  const title = {
    tag: "title_requirements",
    content: dedent5`
      A short, compelling headline that immediately communicates the subject or topic.
      ${titleBrevity} Think of how a podcast title or audio description would read.
      Start with the primary subject, action, or topic - never begin with "An audio of" or similar phrasing.
      Use active, specific language.`
  };
  const description = {
    tag: "description_requirements",
    content: buildDescriptionGuidance(descriptionWords, "audio")
  };
  const keywords = {
    tag: "keywords_requirements",
    content: dedent5`
      Specific, searchable terms (up to ${keywordLimit}) that capture:
      - Primary topics and themes
      - Speakers or presenters (if named)
      - Key concepts and terminology
      - Content type (interview, lecture, music, etc.)
      - Genre or style (if applicable)
      Prefer concrete nouns and relevant terms over abstract concepts.
      Use lowercase. Avoid redundant or overly generic terms like "audio" or "content".`
  };
  const qualityGuidelines = {
    tag: "quality_guidelines",
    content: dedent5`
      - Analyze the full transcript to understand context and themes
      - Be precise: use specific terminology when mentioned
      - Capture the narrative: what is introduced, discussed, and concluded
      - Balance brevity with informativeness`
  };

  return createPromptBuilder({
    template: { task, title, description, keywords, qualityGuidelines },
    sectionOrder: ["task", "title", "description", "keywords", "qualityGuidelines"]
  });
}
3744
- async function createPresignedGetUrl({
3745
- accessKeyId,
3746
- secretAccessKey,
3747
- endpoint,
3748
- region,
3749
- bucket,
3750
- key,
3751
- expiresInSeconds = 3600
3888
+ var SYSTEM_PROMPT4 = dedent5`
3889
+ <role>
3890
+ You are a video content analyst specializing in storyboard interpretation and multimodal analysis.
3891
+ </role>
3892
+
3893
+ <context>
3894
+ You receive storyboard images containing multiple sequential frames extracted from a video.
3895
+ These frames are arranged in a grid and represent the visual progression of the content over time.
3896
+ Read frames left-to-right, top-to-bottom to understand the temporal sequence.
3897
+ </context>
3898
+
3899
+ <transcript_guidance>
3900
+ When a transcript is provided alongside the storyboard:
3901
+ - Use it to understand spoken content, dialogue, narration, and audio context
3902
+ - Correlate transcript content with visual frames to build a complete picture
3903
+ - Extract key terminology, names, and specific language used by speakers
3904
+ - Let the transcript inform keyword selection, especially for topics not visually obvious
3905
+ - Prioritize visual content for the description, but enrich it with transcript insights
3906
+ - If transcript and visuals conflict, trust the visual evidence
3907
+ </transcript_guidance>
3908
+
3909
+ <capabilities>
3910
+ - Extract meaning from visual sequences
3911
+ - Identify subjects, actions, settings, and narrative arcs
3912
+ - Generate accurate, searchable metadata
3913
+ - Synthesize visual and transcript information when provided
3914
+ </capabilities>
3915
+
3916
+ <constraints>
3917
+ - Only describe what is clearly observable in the frames or explicitly stated in the transcript
3918
+ - Do not fabricate details or make unsupported assumptions
3919
+ - Return structured data matching the requested schema
3920
+ - Output only the JSON object; no markdown or extra text
3921
+ - When a <language> section is provided, all output text MUST be written in that language
3922
+ </constraints>
3923
+
3924
+ <tone_guidance>
3925
+ Pay special attention to the <tone> section and lean heavily into those instructions.
3926
+ Adapt your entire analysis and writing style to match the specified tone - this should influence
3927
+ your word choice, personality, formality level, and overall presentation of the content.
3928
+ The tone instructions are not suggestions but core requirements for how you should express yourself.
3929
+ </tone_guidance>
3930
+
3931
+ <language_guidelines>
3932
+ AVOID these meta-descriptive phrases that reference the medium rather than the content:
3933
+ - "The image shows..." / "The storyboard shows..."
3934
+ - "In this video..." / "This video features..."
3935
+ - "The frames depict..." / "The footage shows..."
3936
+ - "We can see..." / "You can see..."
3937
+ - "The clip shows..." / "The scene shows..."
3938
+
3939
+ INSTEAD, describe the content directly:
3940
+ - BAD: "The video shows a chef preparing a meal"
3941
+ - GOOD: "A chef prepares a meal in a professional kitchen"
3942
+
3943
+ Write as if describing reality, not describing a recording of reality.
3944
+ </language_guidelines>`;
3945
+ var AUDIO_ONLY_SYSTEM_PROMPT = dedent5`
3946
+ <role>
3947
+ You are an audio content analyst specializing in transcript analysis and metadata generation.
3948
+ </role>
3949
+
3950
+ <context>
3951
+ You receive transcript text from audio-only content (podcasts, audiobooks, music, etc.).
3952
+ Your task is to analyze the spoken/audio content and generate accurate, searchable metadata.
3953
+ </context>
3954
+
3955
+ <transcript_guidance>
3956
+ - Carefully analyze the entire transcript to understand themes, topics, and key points
3957
+ - Extract key terminology, names, concepts, and specific language used
3958
+ - Identify the content type (interview, lecture, music, narration, etc.)
3959
+ - Note the tone, style, and any distinctive characteristics of the audio
3960
+ - Consider the intended audience and context based on language and content
3961
+ </transcript_guidance>
3962
+
3963
+ <capabilities>
3964
+ - Extract meaning and themes from spoken/audio content
3965
+ - Identify subjects, topics, speakers, and narrative structure
3966
+ - Generate accurate, searchable metadata from audio-based content
3967
+ - Understand context and intent from transcript alone
3968
+ </capabilities>
3969
+
3970
+ <constraints>
3971
+ - Only describe what is explicitly stated or strongly implied in the transcript
3972
+ - Do not fabricate details or make unsupported assumptions
3973
+ - Return structured data matching the requested schema
3974
+ - Focus entirely on audio/spoken content - there are no visual elements
3975
+ - Output only the JSON object; no markdown or extra text
3976
+ - When a <language> section is provided, all output text MUST be written in that language
3977
+ </constraints>
3978
+
3979
+ <tone_guidance>
3980
+ Pay special attention to the <tone> section and lean heavily into those instructions.
3981
+ Adapt your entire analysis and writing style to match the specified tone - this should influence
3982
+ your word choice, personality, formality level, and overall presentation of the content.
3983
+ The tone instructions are not suggestions but core requirements for how you should express yourself.
3984
+ </tone_guidance>
3985
+
3986
+ <language_guidelines>
3987
+ AVOID these meta-descriptive phrases that reference the medium rather than the content:
3988
+ - "The audio shows..." / "The transcript shows..."
3989
+ - "In this recording..." / "This audio features..."
3990
+ - "The speaker says..." / "We can hear..."
3991
+ - "The clip contains..." / "The recording shows..."
3992
+
3993
+ INSTEAD, describe the content directly:
3994
+ - BAD: "The audio features a discussion about climate change"
3995
+ - GOOD: "A panel discusses climate change impacts and solutions"
3996
+
3997
+ Write as if describing reality, not describing a recording of reality.
3998
+ </language_guidelines>`;
3999
+ function buildUserPrompt4({
4000
+ tone,
4001
+ transcriptText,
4002
+ isCleanTranscript = true,
4003
+ promptOverrides,
4004
+ isAudioOnly = false,
4005
+ titleLength,
4006
+ descriptionLength,
4007
+ tagCount,
4008
+ languageName
3752
4009
  }) {
3753
- const resolvedEndpoint = normalizeEndpoint(endpoint);
3754
- const canonicalUri = buildCanonicalUri(resolvedEndpoint, bucket, key);
3755
- const host = resolvedEndpoint.host;
3756
- const { amzDate, shortDate } = formatAmzDate();
3757
- const credentialScope = buildCredentialScope(shortDate, region);
3758
- const signedHeaders = "host";
3759
- const queryParams = {
3760
- "X-Amz-Algorithm": AWS4_ALGORITHM,
3761
- "X-Amz-Credential": `${accessKeyId}/${credentialScope}`,
3762
- "X-Amz-Date": amzDate,
3763
- "X-Amz-Expires": `${expiresInSeconds}`,
3764
- "X-Amz-SignedHeaders": signedHeaders
4010
+ const contextSections = [createToneSection(TONE_INSTRUCTIONS[tone])];
4011
+ if (languageName) {
4012
+ contextSections.push(createLanguageSection(languageName));
4013
+ } else {
4014
+ contextSections.push({
4015
+ tag: "language",
4016
+ content: "Respond in English. Never switch languages to satisfy length constraints."
4017
+ });
4018
+ }
4019
+ if (transcriptText) {
4020
+ const format = isCleanTranscript ? "plain text" : "WebVTT";
4021
+ contextSections.push(createTranscriptSection(transcriptText, format));
4022
+ }
4023
+ const constraints = { titleLength, descriptionLength, tagCount };
4024
+ const promptBuilder = isAudioOnly ? createAudioOnlyBuilder(constraints) : createSummarizationBuilder(constraints);
4025
+ return promptBuilder.buildWithContext(promptOverrides, contextSections);
4026
+ }
4027
+ async function analyzeStoryboard2(imageDataUrl, provider, modelId, userPrompt, systemPrompt, credentials) {
4028
+ "use step";
4029
+ const model = await createLanguageModelFromConfig(provider, modelId, credentials);
4030
+ const response = await generateText5({
4031
+ model,
4032
+ output: SUMMARY_OUTPUT,
4033
+ messages: [
4034
+ {
4035
+ role: "system",
4036
+ content: systemPrompt
4037
+ },
4038
+ {
4039
+ role: "user",
4040
+ content: [
4041
+ { type: "text", text: userPrompt },
4042
+ { type: "image", image: imageDataUrl }
4043
+ ]
4044
+ }
4045
+ ]
4046
+ });
4047
+ if (!response.output) {
4048
+ throw new Error("Summarization output missing");
4049
+ }
4050
+ const parsed = summarySchema.parse(response.output);
4051
+ return {
4052
+ result: parsed,
4053
+ usage: {
4054
+ inputTokens: response.usage.inputTokens,
4055
+ outputTokens: response.usage.outputTokens,
4056
+ totalTokens: response.usage.totalTokens,
4057
+ reasoningTokens: response.usage.reasoningTokens,
4058
+ cachedInputTokens: response.usage.cachedInputTokens
4059
+ }
4060
+ };
4061
+ }
4062
+ async function analyzeAudioOnly(provider, modelId, userPrompt, systemPrompt, credentials) {
4063
+ "use step";
4064
+ const model = await createLanguageModelFromConfig(provider, modelId, credentials);
4065
+ const response = await generateText5({
4066
+ model,
4067
+ output: SUMMARY_OUTPUT,
4068
+ messages: [
4069
+ {
4070
+ role: "system",
4071
+ content: systemPrompt
4072
+ },
4073
+ {
4074
+ role: "user",
4075
+ content: userPrompt
4076
+ }
4077
+ ]
4078
+ });
4079
+ if (!response.output) {
4080
+ throw new Error("Summarization output missing");
4081
+ }
4082
+ const parsed = summarySchema.parse(response.output);
4083
+ return {
4084
+ result: parsed,
4085
+ usage: {
4086
+ inputTokens: response.usage.inputTokens,
4087
+ outputTokens: response.usage.outputTokens,
4088
+ totalTokens: response.usage.totalTokens,
4089
+ reasoningTokens: response.usage.reasoningTokens,
4090
+ cachedInputTokens: response.usage.cachedInputTokens
4091
+ }
3765
4092
  };
3766
- const canonicalQuery = buildCanonicalQuery(queryParams);
3767
- const canonicalRequest = [
3768
- "GET",
3769
- canonicalUri,
3770
- canonicalQuery,
3771
- `host:${host}
3772
- `,
3773
- signedHeaders,
3774
- "UNSIGNED-PAYLOAD"
3775
- ].join("\n");
3776
- const stringToSign = [
3777
- AWS4_ALGORITHM,
3778
- amzDate,
3779
- credentialScope,
3780
- await sha256Hex(canonicalRequest)
3781
- ].join("\n");
3782
- const signature = await signString(secretAccessKey, shortDate, region, stringToSign);
3783
- const queryWithSignature = `${canonicalQuery}&X-Amz-Signature=${signature}`;
3784
- return `${resolvedEndpoint.origin}${canonicalUri}?${queryWithSignature}`;
3785
4093
  }
3786
-
3787
- // src/lib/storage-adapter.ts
3788
- function requireCredentials(accessKeyId, secretAccessKey) {
3789
- if (!accessKeyId || !secretAccessKey) {
3790
- throw new Error(
3791
- "S3 credentials are required for default storage operations. Provide S3_ACCESS_KEY_ID and S3_SECRET_ACCESS_KEY or pass options.storageAdapter."
3792
- );
4094
+ function normalizeKeywords(keywords, limit = DEFAULT_SUMMARY_KEYWORD_LIMIT) {
4095
+ if (!Array.isArray(keywords) || keywords.length === 0) {
4096
+ return [];
3793
4097
  }
3794
- return { accessKeyId, secretAccessKey };
4098
+ const uniqueLowercase = /* @__PURE__ */ new Set();
4099
+ const normalized = [];
4100
+ for (const keyword of keywords) {
4101
+ const trimmed = keyword?.trim();
4102
+ if (!trimmed) {
4103
+ continue;
4104
+ }
4105
+ const lower = trimmed.toLowerCase();
4106
+ if (uniqueLowercase.has(lower)) {
4107
+ continue;
4108
+ }
4109
+ uniqueLowercase.add(lower);
4110
+ normalized.push(trimmed);
4111
+ if (normalized.length === limit) {
4112
+ break;
4113
+ }
4114
+ }
4115
+ return normalized;
3795
4116
  }
3796
- async function putObjectWithStorageAdapter(input, adapter) {
3797
- if (adapter) {
3798
- await adapter.putObject(input);
3799
- return;
4117
+ async function getSummaryAndTags(assetId, options) {
4118
+ "use workflow";
4119
+ const {
4120
+ provider = "openai",
4121
+ model,
4122
+ tone = "neutral",
4123
+ includeTranscript = true,
4124
+ cleanTranscript = true,
4125
+ imageSubmissionMode = "url",
4126
+ imageDownloadOptions,
4127
+ promptOverrides,
4128
+ credentials,
4129
+ titleLength,
4130
+ descriptionLength,
4131
+ tagCount,
4132
+ outputLanguageCode
4133
+ } = options ?? {};
4134
+ if (!VALID_TONES.includes(tone)) {
4135
+ throw new Error(
4136
+ `Invalid tone "${tone}". Valid tones are: ${VALID_TONES.join(", ")}`
4137
+ );
3800
4138
  }
3801
- const credentials = requireCredentials(input.accessKeyId, input.secretAccessKey);
3802
- await putObjectToS3({
3803
- accessKeyId: credentials.accessKeyId,
3804
- secretAccessKey: credentials.secretAccessKey,
3805
- endpoint: input.endpoint,
3806
- region: input.region,
3807
- bucket: input.bucket,
3808
- key: input.key,
3809
- body: input.body,
3810
- contentType: input.contentType
4139
+ const modelConfig = resolveLanguageModelConfig({
4140
+ ...options,
4141
+ model,
4142
+ provider
3811
4143
  });
3812
- }
3813
- async function createPresignedGetUrlWithStorageAdapter(input, adapter) {
3814
- if (adapter) {
3815
- return adapter.createPresignedGetUrl(input);
4144
+ const workflowCredentials = credentials;
4145
+ const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, workflowCredentials);
4146
+ const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
4147
+ const isAudioOnly = isAudioOnlyAsset(assetData);
4148
+ if (isAudioOnly && !includeTranscript) {
4149
+ throw new Error(
4150
+ "Audio-only assets require a transcript. Set includeTranscript: true and ensure the asset has a ready text track (captions/subtitles)."
4151
+ );
3816
4152
  }
3817
- const credentials = requireCredentials(input.accessKeyId, input.secretAccessKey);
3818
- return createPresignedGetUrl({
3819
- accessKeyId: credentials.accessKeyId,
3820
- secretAccessKey: credentials.secretAccessKey,
3821
- endpoint: input.endpoint,
3822
- region: input.region,
3823
- bucket: input.bucket,
3824
- key: input.key,
3825
- expiresInSeconds: input.expiresInSeconds
4153
+ const signingContext = await resolveMuxSigningContext(workflowCredentials);
4154
+ if (policy === "signed" && !signingContext) {
4155
+ throw new Error(
4156
+ "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
4157
+ );
4158
+ }
4159
+ const transcriptResult = includeTranscript ? await fetchTranscriptForAsset(assetData, playbackId, {
4160
+ cleanTranscript,
4161
+ shouldSign: policy === "signed",
4162
+ credentials: workflowCredentials,
4163
+ required: isAudioOnly
4164
+ }) : void 0;
4165
+ const transcriptText = transcriptResult?.transcriptText ?? "";
4166
+ const resolvedLanguageCode = outputLanguageCode && outputLanguageCode !== "auto" ? outputLanguageCode : transcriptResult?.track?.language_code ?? getReadyTextTracks(assetData)[0]?.language_code;
4167
+ const languageName = resolvedLanguageCode ? getLanguageName(resolvedLanguageCode) : void 0;
4168
+ const userPrompt = buildUserPrompt4({
4169
+ tone,
4170
+ transcriptText,
4171
+ isCleanTranscript: cleanTranscript,
4172
+ promptOverrides,
4173
+ isAudioOnly,
4174
+ titleLength,
4175
+ descriptionLength,
4176
+ tagCount,
4177
+ languageName
3826
4178
  });
4179
+ let analysisResponse;
4180
+ let imageUrl;
4181
+ const systemPrompt = isAudioOnly ? AUDIO_ONLY_SYSTEM_PROMPT : SYSTEM_PROMPT4;
4182
+ try {
4183
+ if (isAudioOnly) {
4184
+ analysisResponse = await analyzeAudioOnly(
4185
+ modelConfig.provider,
4186
+ modelConfig.modelId,
4187
+ userPrompt,
4188
+ systemPrompt,
4189
+ workflowCredentials
4190
+ );
4191
+ } else {
4192
+ const storyboardUrl = await getStoryboardUrl(playbackId, 640, policy === "signed", workflowCredentials);
4193
+ imageUrl = storyboardUrl;
4194
+ if (imageSubmissionMode === "base64") {
4195
+ const downloadResult = await downloadImageAsBase64(storyboardUrl, imageDownloadOptions);
4196
+ analysisResponse = await analyzeStoryboard2(
4197
+ downloadResult.base64Data,
4198
+ modelConfig.provider,
4199
+ modelConfig.modelId,
4200
+ userPrompt,
4201
+ systemPrompt,
4202
+ workflowCredentials
4203
+ );
4204
+ } else {
4205
+ analysisResponse = await withRetry(() => analyzeStoryboard2(
4206
+ storyboardUrl,
4207
+ modelConfig.provider,
4208
+ modelConfig.modelId,
4209
+ userPrompt,
4210
+ systemPrompt,
4211
+ workflowCredentials
4212
+ ));
4213
+ }
4214
+ }
4215
+ } catch (error) {
4216
+ const contentType = isAudioOnly ? "audio" : "video";
4217
+ throw new Error(
4218
+ `Failed to analyze ${contentType} content with ${provider}: ${error instanceof Error ? error.message : "Unknown error"}`
4219
+ );
4220
+ }
4221
+ if (!analysisResponse.result) {
4222
+ throw new Error(`Failed to analyze video content for asset ${assetId}`);
4223
+ }
4224
+ if (!analysisResponse.result.title) {
4225
+ throw new Error(`Failed to generate title for asset ${assetId}`);
4226
+ }
4227
+ if (!analysisResponse.result.description) {
4228
+ throw new Error(`Failed to generate description for asset ${assetId}`);
4229
+ }
4230
+ return {
4231
+ assetId,
4232
+ title: analysisResponse.result.title,
4233
+ description: analysisResponse.result.description,
4234
+ tags: normalizeKeywords(analysisResponse.result.keywords, tagCount ?? DEFAULT_SUMMARY_KEYWORD_LIMIT),
4235
+ storyboardUrl: imageUrl,
4236
+ // undefined for audio-only assets
4237
+ usage: {
4238
+ ...analysisResponse.usage,
4239
+ metadata: {
4240
+ assetDurationSeconds
4241
+ }
4242
+ },
4243
+ transcriptText: transcriptText || void 0
4244
+ };
3827
4245
  }
3828
4246
 
3829
4247
  // src/workflows/translate-audio.ts
@@ -4002,7 +4420,8 @@ async function uploadDubbedAudioToS3({
4002
4420
  s3Endpoint,
4003
4421
  s3Region,
4004
4422
  s3Bucket,
4005
- storageAdapter
4423
+ storageAdapter,
4424
+ s3SignedUrlExpirySeconds
4006
4425
  }) {
4007
4426
  "use step";
4008
4427
  const s3AccessKeyId = env_default.S3_ACCESS_KEY_ID;
@@ -4025,10 +4444,11 @@ async function uploadDubbedAudioToS3({
4025
4444
  region: s3Region,
4026
4445
  bucket: s3Bucket,
4027
4446
  key: audioKey,
4028
- expiresInSeconds: 3600
4447
+ expiresInSeconds: s3SignedUrlExpirySeconds ?? 86400
4029
4448
  }, storageAdapter);
4449
+ const expiryHours = Math.round((s3SignedUrlExpirySeconds ?? 86400) / 3600);
4030
4450
  console.warn(`\u2705 Audio uploaded successfully to: ${audioKey}`);
4031
- console.warn(`\u{1F517} Generated presigned URL (expires in 1 hour)`);
4451
+ console.warn(`\u{1F517} Generated presigned URL (expires in ${expiryHours} hour${expiryHours === 1 ? "" : "s"})`);
4032
4452
  return presignedUrl;
4033
4453
  }
4034
4454
  async function createAudioTrackOnMux(assetId, languageCode, presignedUrl, credentials) {
@@ -4192,7 +4612,8 @@ async function translateAudio(assetId, toLanguageCode, options = {}) {
4192
4612
  s3Endpoint,
4193
4613
  s3Region,
4194
4614
  s3Bucket,
4195
- storageAdapter: effectiveStorageAdapter
4615
+ storageAdapter: effectiveStorageAdapter,
4616
+ s3SignedUrlExpirySeconds: options.s3SignedUrlExpirySeconds
4196
4617
  });
4197
4618
  } catch (error) {
4198
4619
  throw new Error(`Failed to upload audio to S3: ${error instanceof Error ? error.message : "Unknown error"}`);
@@ -4230,24 +4651,24 @@ async function translateAudio(assetId, toLanguageCode, options = {}) {
4230
4651
  // src/workflows/translate-captions.ts
4231
4652
  import {
4232
4653
  APICallError,
4233
- generateText as generateText5,
4654
+ generateText as generateText6,
4234
4655
  NoObjectGeneratedError,
4235
- Output as Output5,
4656
+ Output as Output6,
4236
4657
  RetryError,
4237
4658
  TypeValidationError
4238
4659
  } from "ai";
4239
- import dedent5 from "dedent";
4240
- import { z as z6 } from "zod";
4241
- var translationSchema = z6.object({
4242
- translation: z6.string()
4660
+ import dedent6 from "dedent";
4661
+ import { z as z7 } from "zod";
4662
+ var translationSchema = z7.object({
4663
+ translation: z7.string()
4243
4664
  });
4244
- var SYSTEM_PROMPT4 = dedent5`
4665
+ var SYSTEM_PROMPT5 = dedent6`
4245
4666
  You are a subtitle translation expert. Translate VTT subtitle files to the target language specified by the user.
4246
4667
  You may receive either a full VTT file or a chunk from a larger VTT.
4247
4668
  Preserve all timestamps, cue ordering, and VTT formatting exactly as they appear.
4248
4669
  Return JSON with a single key "translation" containing the translated VTT content.
4249
4670
  `;
4250
- var CUE_TRANSLATION_SYSTEM_PROMPT = dedent5`
4671
+ var CUE_TRANSLATION_SYSTEM_PROMPT = dedent6`
4251
4672
  You are a subtitle translation expert.
4252
4673
  You will receive a sequence of subtitle cues extracted from a VTT file.
4253
4674
  Translate the cues to the requested target language while preserving their original order.
@@ -4409,14 +4830,6 @@ function buildTranslationChunkRequests(vttContent, assetDurationSeconds, chunkin
4409
4830
  )
4410
4831
  };
4411
4832
  }
4412
- async function fetchVttFromMux(vttUrl) {
4413
- "use step";
4414
- const vttResponse = await fetch(vttUrl);
4415
- if (!vttResponse.ok) {
4416
- throw new Error(`Failed to fetch VTT file: ${vttResponse.statusText}`);
4417
- }
4418
- return vttResponse.text();
4419
- }
4420
4833
  async function translateVttWithAI({
4421
4834
  vttContent,
4422
4835
  fromLanguageCode,
@@ -4427,13 +4840,13 @@ async function translateVttWithAI({
4427
4840
  }) {
4428
4841
  "use step";
4429
4842
  const model = await createLanguageModelFromConfig(provider, modelId, credentials);
4430
- const response = await generateText5({
4843
+ const response = await generateText6({
4431
4844
  model,
4432
- output: Output5.object({ schema: translationSchema }),
4845
+ output: Output6.object({ schema: translationSchema }),
4433
4846
  messages: [
4434
4847
  {
4435
4848
  role: "system",
4436
- content: SYSTEM_PROMPT4
4849
+ content: SYSTEM_PROMPT5
4437
4850
  },
4438
4851
  {
4439
4852
  role: "user",
@@ -4464,8 +4877,8 @@ async function translateCueChunkWithAI({
4464
4877
  }) {
4465
4878
  "use step";
4466
4879
  const model = await createLanguageModelFromConfig(provider, modelId, credentials);
4467
- const schema = z6.object({
4468
- translations: z6.array(z6.string().min(1)).length(cues.length)
4880
+ const schema = z7.object({
4881
+ translations: z7.array(z7.string().min(1)).length(cues.length)
4469
4882
  });
4470
4883
  const cuePayload = cues.map((cue, index) => ({
4471
4884
  index,
@@ -4473,9 +4886,9 @@ async function translateCueChunkWithAI({
4473
4886
  endTime: cue.endTime,
4474
4887
  text: cue.text
4475
4888
  }));
4476
- const response = await generateText5({
4889
+ const response = await generateText6({
4477
4890
  model,
4478
- output: Output5.object({ schema }),
4891
+ output: Output6.object({ schema }),
4479
4892
  messages: [
4480
4893
  {
4481
4894
  role: "system",
@@ -4632,7 +5045,8 @@ async function uploadVttToS3({
4632
5045
  s3Endpoint,
4633
5046
  s3Region,
4634
5047
  s3Bucket,
4635
- storageAdapter
5048
+ storageAdapter,
5049
+ s3SignedUrlExpirySeconds
4636
5050
  }) {
4637
5051
  "use step";
4638
5052
  const s3AccessKeyId = env_default.S3_ACCESS_KEY_ID;
@@ -4655,25 +5069,9 @@ async function uploadVttToS3({
4655
5069
  region: s3Region,
4656
5070
  bucket: s3Bucket,
4657
5071
  key: vttKey,
4658
- expiresInSeconds: 3600
5072
+ expiresInSeconds: s3SignedUrlExpirySeconds ?? 86400
4659
5073
  }, storageAdapter);
4660
5074
  }
4661
- async function createTextTrackOnMux(assetId, languageCode, trackName, presignedUrl, credentials) {
4662
- "use step";
4663
- const muxClient = await resolveMuxClient(credentials);
4664
- const mux = await muxClient.createClient();
4665
- const trackResponse = await mux.video.assets.createTrack(assetId, {
4666
- type: "text",
4667
- text_type: "subtitles",
4668
- language_code: languageCode,
4669
- name: trackName,
4670
- url: presignedUrl
4671
- });
4672
- if (!trackResponse.id) {
4673
- throw new Error("Failed to create text track: no track ID returned from Mux");
4674
- }
4675
- return trackResponse.id;
4676
- }
4677
5075
  async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, options) {
4678
5076
  "use workflow";
4679
5077
  const {
@@ -4791,7 +5189,8 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
4791
5189
  s3Endpoint,
4792
5190
  s3Region,
4793
5191
  s3Bucket,
4794
- storageAdapter: effectiveStorageAdapter
5192
+ storageAdapter: effectiveStorageAdapter,
5193
+ s3SignedUrlExpirySeconds: options.s3SignedUrlExpirySeconds
4795
5194
  });
4796
5195
  } catch (error) {
4797
5196
  throw new Error(`Failed to upload VTT to S3: ${error instanceof Error ? error.message : "Unknown error"}`);
@@ -4824,23 +5223,33 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
4824
5223
  };
4825
5224
  }
4826
5225
  export {
5226
+ DEFAULT_DESCRIPTION_LENGTH,
5227
+ DEFAULT_SUMMARY_KEYWORD_LIMIT,
5228
+ DEFAULT_TITLE_LENGTH,
4827
5229
  HIVE_SEXUAL_CATEGORIES,
4828
5230
  HIVE_VIOLENCE_CATEGORIES,
4829
- SUMMARY_KEYWORD_LIMIT,
4830
5231
  aggregateTokenUsage,
5232
+ applyOverrideLists,
5233
+ applyReplacements,
4831
5234
  askQuestions,
5235
+ buildReplacementRegex,
4832
5236
  burnedInCaptionsSchema,
5237
+ censorVttContent,
4833
5238
  chapterSchema,
4834
5239
  chaptersSchema,
5240
+ createReplacer,
5241
+ editCaptions,
4835
5242
  generateChapters,
4836
5243
  generateEmbeddings,
4837
5244
  generateVideoEmbeddings,
4838
5245
  getModerationScores,
4839
5246
  getSummaryAndTags,
4840
5247
  hasBurnedInCaptions,
5248
+ profanityDetectionSchema,
4841
5249
  questionAnswerSchema,
4842
5250
  shouldSplitChunkTranslationError,
4843
5251
  summarySchema,
5252
+ transformCueText,
4844
5253
  translateAudio,
4845
5254
  translateCaptions,
4846
5255
  translationSchema