@mux/ai 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -1
- package/dist/{index-C8-E3VR9.d.ts → index-DLhfJsOd.d.ts} +73 -2
- package/dist/{index-CA7bG50u.d.ts → index-DyzifniY.d.ts} +167 -21
- package/dist/index.d.ts +3 -3
- package/dist/index.js +733 -153
- package/dist/index.js.map +1 -1
- package/dist/primitives/index.d.ts +1 -1
- package/dist/primitives/index.js +140 -1
- package/dist/primitives/index.js.map +1 -1
- package/dist/workflows/index.d.ts +1 -1
- package/dist/workflows/index.js +1980 -1539
- package/dist/workflows/index.js.map +1 -1
- package/package.json +2 -1
package/dist/workflows/index.js
CHANGED
|
@@ -1304,12 +1304,14 @@ async function fetchTranscriptForAsset(asset, playbackId, options = {}) {
|
|
|
1304
1304
|
// src/workflows/ask-questions.ts
|
|
1305
1305
|
var questionAnswerSchema = z2.object({
|
|
1306
1306
|
question: z2.string(),
|
|
1307
|
-
answer: z2.string(),
|
|
1307
|
+
answer: z2.string().optional(),
|
|
1308
1308
|
confidence: z2.number(),
|
|
1309
|
-
reasoning: z2.string()
|
|
1309
|
+
reasoning: z2.string(),
|
|
1310
|
+
skipped: z2.boolean()
|
|
1310
1311
|
});
|
|
1312
|
+
var SKIP_SENTINEL = "__SKIPPED__";
|
|
1311
1313
|
function createAskQuestionsSchema(allowedAnswers) {
|
|
1312
|
-
const answerSchema = z2.enum(allowedAnswers);
|
|
1314
|
+
const answerSchema = z2.enum([...allowedAnswers, SKIP_SENTINEL]);
|
|
1313
1315
|
return z2.object({
|
|
1314
1316
|
answers: z2.array(
|
|
1315
1317
|
questionAnswerSchema.extend({
|
|
@@ -1365,8 +1367,32 @@ var SYSTEM_PROMPT = dedent`
|
|
|
1365
1367
|
- Be precise: cite specific frames, objects, actions, or transcript quotes
|
|
1366
1368
|
</answer_guidelines>
|
|
1367
1369
|
|
|
1370
|
+
<relevance_filtering>
|
|
1371
|
+
Before answering each question, assess whether it can be meaningfully
|
|
1372
|
+
answered based on the video storyboard and/or transcript. A question is
|
|
1373
|
+
relevant if it asks about something observable or inferable from the
|
|
1374
|
+
video content (visuals, audio, dialogue, setting, subjects, actions, etc.).
|
|
1375
|
+
|
|
1376
|
+
Mark a question as skipped (skipped: true) if it:
|
|
1377
|
+
- Is completely unrelated to video content (e.g., math, trivia, personal questions)
|
|
1378
|
+
- Asks about information that cannot be determined from storyboard frames or transcript
|
|
1379
|
+
- Is a general knowledge question with no connection to what is shown or said in the video
|
|
1380
|
+
- Attempts to use the system for non-video-analysis purposes
|
|
1381
|
+
|
|
1382
|
+
For skipped questions:
|
|
1383
|
+
- Set skipped to true
|
|
1384
|
+
- Set answer to "${SKIP_SENTINEL}"
|
|
1385
|
+
- Set confidence to 0
|
|
1386
|
+
- Use the reasoning field to explain why the question is not answerable
|
|
1387
|
+
from the video content
|
|
1388
|
+
|
|
1389
|
+
For borderline questions that are loosely related to the video content,
|
|
1390
|
+
still answer them but use a lower confidence score to reflect uncertainty.
|
|
1391
|
+
</relevance_filtering>
|
|
1392
|
+
|
|
1368
1393
|
<constraints>
|
|
1369
|
-
- You MUST answer every question with one of the allowed response options
|
|
1394
|
+
- You MUST answer every relevant question with one of the allowed response options
|
|
1395
|
+
- Skip irrelevant questions as described in relevance_filtering
|
|
1370
1396
|
- Only describe observable evidence from frames or transcript
|
|
1371
1397
|
- Do not fabricate details or make unsupported assumptions
|
|
1372
1398
|
- Return structured data matching the requested schema exactly
|
|
@@ -1442,14 +1468,7 @@ async function analyzeQuestionsWithStoryboard(imageDataUrl, provider, modelId, u
|
|
|
1442
1468
|
]
|
|
1443
1469
|
});
|
|
1444
1470
|
return {
|
|
1445
|
-
result:
|
|
1446
|
-
answers: response.output.answers.map((answer) => ({
|
|
1447
|
-
...answer,
|
|
1448
|
-
// Strip numbering prefix (e.g., "1. " or "2. ") from questions
|
|
1449
|
-
question: answer.question.replace(/^\d+\.\s*/, ""),
|
|
1450
|
-
confidence: Math.min(1, Math.max(0, answer.confidence))
|
|
1451
|
-
}))
|
|
1452
|
-
},
|
|
1471
|
+
result: response.output,
|
|
1453
1472
|
usage: {
|
|
1454
1473
|
inputTokens: response.usage.inputTokens,
|
|
1455
1474
|
outputTokens: response.usage.outputTokens,
|
|
@@ -1555,9 +1574,20 @@ async function askQuestions(assetId, questions, options) {
|
|
|
1555
1574
|
`Expected ${questions.length} answers but received ${analysisResponse.result.answers.length}`
|
|
1556
1575
|
);
|
|
1557
1576
|
}
|
|
1577
|
+
const answers = analysisResponse.result.answers.map((raw) => {
|
|
1578
|
+
const isSkipped = raw.skipped || raw.answer === SKIP_SENTINEL;
|
|
1579
|
+
return {
|
|
1580
|
+
// Strip numbering prefix (e.g., "1. " or "2. ") from questions
|
|
1581
|
+
question: raw.question.replace(/^\d+\.\s*/, ""),
|
|
1582
|
+
confidence: isSkipped ? 0 : Math.min(1, Math.max(0, raw.confidence)),
|
|
1583
|
+
reasoning: raw.reasoning,
|
|
1584
|
+
skipped: isSkipped,
|
|
1585
|
+
...isSkipped ? {} : { answer: raw.answer }
|
|
1586
|
+
};
|
|
1587
|
+
});
|
|
1558
1588
|
return {
|
|
1559
1589
|
assetId,
|
|
1560
|
-
answers
|
|
1590
|
+
answers,
|
|
1561
1591
|
storyboardUrl: imageUrl,
|
|
1562
1592
|
usage: {
|
|
1563
1593
|
...analysisResponse.usage,
|
|
@@ -2176,1654 +2206,2074 @@ async function generateChapters(assetId, languageCode, options = {}) {
|
|
|
2176
2206
|
};
|
|
2177
2207
|
}
|
|
2178
2208
|
|
|
2179
|
-
// src/workflows/
|
|
2180
|
-
import {
|
|
2209
|
+
// src/workflows/edit-captions.ts
|
|
2210
|
+
import { generateText as generateText4, Output as Output4 } from "ai";
|
|
2211
|
+
import dedent4 from "dedent";
|
|
2212
|
+
import { z as z5 } from "zod";
|
|
2181
2213
|
|
|
2182
|
-
// src/
|
|
2183
|
-
|
|
2184
|
-
|
|
2185
|
-
|
|
2186
|
-
|
|
2187
|
-
|
|
2188
|
-
|
|
2189
|
-
|
|
2190
|
-
var NEXT_SENTENCE_START_REGEX = /^[A-Z0-9"'([{]/;
|
|
2191
|
-
function estimateTokenCount(text) {
|
|
2192
|
-
const words = text.trim().split(/\s+/).length;
|
|
2193
|
-
return Math.ceil(words / 0.75);
|
|
2214
|
+
// src/lib/mux-tracks.ts
|
|
2215
|
+
async function fetchVttFromMux(vttUrl) {
|
|
2216
|
+
"use step";
|
|
2217
|
+
const vttResponse = await fetch(vttUrl);
|
|
2218
|
+
if (!vttResponse.ok) {
|
|
2219
|
+
throw new Error(`Failed to fetch VTT file: ${vttResponse.statusText}`);
|
|
2220
|
+
}
|
|
2221
|
+
return vttResponse.text();
|
|
2194
2222
|
}
|
|
2195
|
-
function
|
|
2196
|
-
|
|
2197
|
-
|
|
2223
|
+
async function createTextTrackOnMux(assetId, languageCode, trackName, presignedUrl, credentials) {
|
|
2224
|
+
"use step";
|
|
2225
|
+
const muxClient = await resolveMuxClient(credentials);
|
|
2226
|
+
const mux = await muxClient.createClient();
|
|
2227
|
+
const trackResponse = await mux.video.assets.createTrack(assetId, {
|
|
2228
|
+
type: "text",
|
|
2229
|
+
text_type: "subtitles",
|
|
2230
|
+
language_code: languageCode,
|
|
2231
|
+
name: trackName,
|
|
2232
|
+
url: presignedUrl
|
|
2233
|
+
});
|
|
2234
|
+
if (!trackResponse.id) {
|
|
2235
|
+
throw new Error("Failed to create text track: no track ID returned from Mux");
|
|
2198
2236
|
}
|
|
2199
|
-
|
|
2200
|
-
|
|
2201
|
-
|
|
2202
|
-
|
|
2203
|
-
|
|
2204
|
-
|
|
2205
|
-
|
|
2206
|
-
|
|
2207
|
-
|
|
2208
|
-
|
|
2209
|
-
|
|
2210
|
-
|
|
2211
|
-
|
|
2212
|
-
|
|
2213
|
-
id: `chunk-${chunkIndex}`,
|
|
2214
|
-
text: chunkText2,
|
|
2215
|
-
tokenCount
|
|
2216
|
-
});
|
|
2217
|
-
currentPosition += wordsPerChunk - overlapWords;
|
|
2218
|
-
chunkIndex++;
|
|
2219
|
-
if (currentPosition <= (chunkIndex - 1) * (wordsPerChunk - overlapWords)) {
|
|
2220
|
-
break;
|
|
2221
|
-
}
|
|
2237
|
+
return trackResponse.id;
|
|
2238
|
+
}
|
|
2239
|
+
|
|
2240
|
+
// src/lib/s3-sigv4.ts
|
|
2241
|
+
var AWS4_ALGORITHM = "AWS4-HMAC-SHA256";
|
|
2242
|
+
var AWS4_REQUEST_TERMINATOR = "aws4_request";
|
|
2243
|
+
var AWS4_SERVICE = "s3";
|
|
2244
|
+
var S3_ALLOWED_ENDPOINT_PATTERNS = parseEndpointAllowlist(
|
|
2245
|
+
env_default.S3_ALLOWED_ENDPOINT_HOSTS
|
|
2246
|
+
);
|
|
2247
|
+
function getCrypto() {
|
|
2248
|
+
const webCrypto = globalThis.crypto;
|
|
2249
|
+
if (!webCrypto?.subtle) {
|
|
2250
|
+
throw new Error("Web Crypto API is required for S3 signing.");
|
|
2222
2251
|
}
|
|
2223
|
-
return
|
|
2252
|
+
return webCrypto;
|
|
2224
2253
|
}
|
|
2225
|
-
|
|
2226
|
-
|
|
2227
|
-
return
|
|
2228
|
-
id: `chunk-${index}`,
|
|
2229
|
-
text,
|
|
2230
|
-
tokenCount: estimateTokenCount(text),
|
|
2231
|
-
startTime: cues[0].startTime,
|
|
2232
|
-
endTime: cues[cues.length - 1].endTime
|
|
2233
|
-
};
|
|
2254
|
+
var textEncoder = new TextEncoder();
|
|
2255
|
+
function toBytes(value) {
|
|
2256
|
+
return typeof value === "string" ? textEncoder.encode(value) : value;
|
|
2234
2257
|
}
|
|
2235
|
-
function
|
|
2236
|
-
|
|
2237
|
-
return [];
|
|
2238
|
-
const chunks = [];
|
|
2239
|
-
let currentCues = [];
|
|
2240
|
-
let currentTokens = 0;
|
|
2241
|
-
let chunkIndex = 0;
|
|
2242
|
-
for (let i = 0; i < cues.length; i++) {
|
|
2243
|
-
const cue = cues[i];
|
|
2244
|
-
const cueTokens = estimateTokenCount(cue.text);
|
|
2245
|
-
if (currentTokens + cueTokens > maxTokens && currentCues.length > 0) {
|
|
2246
|
-
chunks.push(createChunkFromCues(currentCues, chunkIndex));
|
|
2247
|
-
chunkIndex++;
|
|
2248
|
-
const overlapStart = Math.max(0, currentCues.length - overlapCues);
|
|
2249
|
-
currentCues = currentCues.slice(overlapStart);
|
|
2250
|
-
currentTokens = currentCues.reduce(
|
|
2251
|
-
(sum, c) => sum + estimateTokenCount(c.text),
|
|
2252
|
-
0
|
|
2253
|
-
);
|
|
2254
|
-
}
|
|
2255
|
-
currentCues.push(cue);
|
|
2256
|
-
currentTokens += cueTokens;
|
|
2257
|
-
}
|
|
2258
|
-
if (currentCues.length > 0) {
|
|
2259
|
-
chunks.push(createChunkFromCues(currentCues, chunkIndex));
|
|
2260
|
-
}
|
|
2261
|
-
return chunks;
|
|
2258
|
+
function bytesToHex(bytes) {
|
|
2259
|
+
return Array.from(bytes).map((byte) => byte.toString(16).padStart(2, "0")).join("");
|
|
2262
2260
|
}
|
|
2263
|
-
function
|
|
2264
|
-
const
|
|
2265
|
-
|
|
2266
|
-
|
|
2267
|
-
|
|
2268
|
-
|
|
2269
|
-
|
|
2270
|
-
|
|
2271
|
-
|
|
2272
|
-
|
|
2273
|
-
|
|
2274
|
-
|
|
2275
|
-
|
|
2276
|
-
|
|
2277
|
-
|
|
2261
|
+
async function sha256Hex(value) {
|
|
2262
|
+
const digest = await getCrypto().subtle.digest("SHA-256", toBytes(value));
|
|
2263
|
+
return bytesToHex(new Uint8Array(digest));
|
|
2264
|
+
}
|
|
2265
|
+
async function hmacSha256Raw(key, value) {
|
|
2266
|
+
const cryptoKey = await getCrypto().subtle.importKey(
|
|
2267
|
+
"raw",
|
|
2268
|
+
key,
|
|
2269
|
+
{ name: "HMAC", hash: "SHA-256" },
|
|
2270
|
+
false,
|
|
2271
|
+
["sign"]
|
|
2272
|
+
);
|
|
2273
|
+
const signature = await getCrypto().subtle.sign("HMAC", cryptoKey, textEncoder.encode(value));
|
|
2274
|
+
return new Uint8Array(signature);
|
|
2275
|
+
}
|
|
2276
|
+
async function deriveSigningKey(secretAccessKey, shortDate, region) {
|
|
2277
|
+
const kDate = await hmacSha256Raw(textEncoder.encode(`AWS4${secretAccessKey}`), shortDate);
|
|
2278
|
+
const kRegion = await hmacSha256Raw(kDate, region);
|
|
2279
|
+
const kService = await hmacSha256Raw(kRegion, AWS4_SERVICE);
|
|
2280
|
+
return hmacSha256Raw(kService, AWS4_REQUEST_TERMINATOR);
|
|
2281
|
+
}
|
|
2282
|
+
function formatAmzDate(date = /* @__PURE__ */ new Date()) {
|
|
2283
|
+
const iso = date.toISOString();
|
|
2284
|
+
const shortDate = iso.slice(0, 10).replace(/-/g, "");
|
|
2285
|
+
const amzDate = `${iso.slice(0, 19).replace(/[-:]/g, "")}Z`;
|
|
2286
|
+
return { amzDate, shortDate };
|
|
2287
|
+
}
|
|
2288
|
+
function encodeRFC3986(value) {
|
|
2289
|
+
return encodeURIComponent(value).replace(/[!'()*]/g, (char) => `%${char.charCodeAt(0).toString(16).toUpperCase()}`);
|
|
2290
|
+
}
|
|
2291
|
+
function encodePath(path) {
|
|
2292
|
+
return path.split("/").map((segment) => encodeRFC3986(segment)).join("/");
|
|
2293
|
+
}
|
|
2294
|
+
function normalizeEndpoint(endpoint) {
|
|
2295
|
+
let url;
|
|
2296
|
+
try {
|
|
2297
|
+
url = new URL(endpoint);
|
|
2298
|
+
} catch {
|
|
2299
|
+
throw new Error(`Invalid S3 endpoint: ${endpoint}`);
|
|
2278
2300
|
}
|
|
2279
|
-
if (
|
|
2280
|
-
|
|
2301
|
+
if (url.search || url.hash) {
|
|
2302
|
+
throw new Error("S3 endpoint must not include query params or hash fragments.");
|
|
2281
2303
|
}
|
|
2282
|
-
|
|
2304
|
+
enforceEndpointPolicy(url);
|
|
2305
|
+
return url;
|
|
2283
2306
|
}
|
|
2284
|
-
function
|
|
2285
|
-
if (
|
|
2307
|
+
function parseEndpointAllowlist(allowlist) {
|
|
2308
|
+
if (!allowlist) {
|
|
2286
2309
|
return [];
|
|
2287
2310
|
}
|
|
2288
|
-
|
|
2289
|
-
|
|
2290
|
-
|
|
2291
|
-
|
|
2292
|
-
|
|
2293
|
-
|
|
2294
|
-
let chunkIndex = 0;
|
|
2295
|
-
let cueStartIndex = 0;
|
|
2296
|
-
let currentTokenCount = 0;
|
|
2297
|
-
for (let cueIndex = 0; cueIndex < cues.length; cueIndex++) {
|
|
2298
|
-
const cue = cues[cueIndex];
|
|
2299
|
-
const cueTokenCount = estimateTokenCount(cue.text);
|
|
2300
|
-
const currentCueCount = cueIndex - cueStartIndex;
|
|
2301
|
-
const wouldExceedCueCount = currentCueCount >= maxCuesPerChunk;
|
|
2302
|
-
const wouldExceedTokenCount = currentCueCount > 0 && currentTokenCount + cueTokenCount > maxTextTokensPerChunk;
|
|
2303
|
-
if (wouldExceedCueCount || wouldExceedTokenCount) {
|
|
2304
|
-
chunks.push({
|
|
2305
|
-
id: `chunk-${chunkIndex}`,
|
|
2306
|
-
cueStartIndex,
|
|
2307
|
-
cueEndIndex: cueIndex - 1,
|
|
2308
|
-
cueCount: cueIndex - cueStartIndex,
|
|
2309
|
-
startTime: cues[cueStartIndex].startTime,
|
|
2310
|
-
endTime: cues[cueIndex - 1].endTime
|
|
2311
|
-
});
|
|
2312
|
-
cueStartIndex = cueIndex;
|
|
2313
|
-
currentTokenCount = 0;
|
|
2314
|
-
chunkIndex++;
|
|
2315
|
-
}
|
|
2316
|
-
currentTokenCount += cueTokenCount;
|
|
2311
|
+
return allowlist.split(",").map((value) => value.trim().toLowerCase()).filter(Boolean);
|
|
2312
|
+
}
|
|
2313
|
+
function hostnameMatchesPattern(hostname, pattern) {
|
|
2314
|
+
if (pattern.startsWith("*.")) {
|
|
2315
|
+
const suffix = pattern.slice(1);
|
|
2316
|
+
return hostname.endsWith(suffix) && hostname.length > suffix.length;
|
|
2317
2317
|
}
|
|
2318
|
-
|
|
2319
|
-
id: `chunk-${chunkIndex}`,
|
|
2320
|
-
cueStartIndex,
|
|
2321
|
-
cueEndIndex: cues.length - 1,
|
|
2322
|
-
cueCount: cues.length - cueStartIndex,
|
|
2323
|
-
startTime: cues[cueStartIndex].startTime,
|
|
2324
|
-
endTime: cues[cues.length - 1].endTime
|
|
2325
|
-
});
|
|
2326
|
-
return chunks;
|
|
2318
|
+
return hostname === pattern;
|
|
2327
2319
|
}
|
|
2328
|
-
function
|
|
2329
|
-
|
|
2330
|
-
|
|
2320
|
+
function enforceEndpointPolicy(url) {
|
|
2321
|
+
const hostname = url.hostname.toLowerCase();
|
|
2322
|
+
if (url.protocol !== "https:") {
|
|
2323
|
+
throw new Error(
|
|
2324
|
+
`Insecure S3 endpoint protocol "${url.protocol}" is not allowed. Use HTTPS.`
|
|
2325
|
+
);
|
|
2331
2326
|
}
|
|
2332
|
-
|
|
2333
|
-
|
|
2334
|
-
|
|
2335
|
-
|
|
2336
|
-
Math.max(
|
|
2337
|
-
1,
|
|
2338
|
-
options.minChunkDurationSeconds ?? Math.floor(targetChunkDurationSeconds * DEFAULT_MIN_CHUNK_DURATION_RATIO)
|
|
2339
|
-
)
|
|
2340
|
-
);
|
|
2341
|
-
const boundaryLookaheadCues = Math.max(1, options.boundaryLookaheadCues ?? DEFAULT_BOUNDARY_LOOKAHEAD_CUES);
|
|
2342
|
-
const boundaryPauseSeconds = options.boundaryPauseSeconds ?? DEFAULT_BOUNDARY_PAUSE_SECONDS;
|
|
2343
|
-
const preferredBoundaryStartSeconds = Math.max(
|
|
2344
|
-
minChunkDurationSeconds,
|
|
2345
|
-
targetChunkDurationSeconds - Math.min(PREFERRED_BOUNDARY_WINDOW_SECONDS, targetChunkDurationSeconds / 6)
|
|
2346
|
-
);
|
|
2347
|
-
const chunks = [];
|
|
2348
|
-
let chunkIndex = 0;
|
|
2349
|
-
let cueStartIndex = 0;
|
|
2350
|
-
while (cueStartIndex < cues.length) {
|
|
2351
|
-
const chunkStartTime = cues[cueStartIndex].startTime;
|
|
2352
|
-
let cueEndIndex = cueStartIndex;
|
|
2353
|
-
let bestBoundaryIndex = -1;
|
|
2354
|
-
let bestBoundaryScore = -1;
|
|
2355
|
-
let bestPreferredBoundaryIndex = -1;
|
|
2356
|
-
let bestPreferredBoundaryScore = -1;
|
|
2357
|
-
while (cueEndIndex < cues.length) {
|
|
2358
|
-
const cue = cues[cueEndIndex];
|
|
2359
|
-
const currentDuration = cue.endTime - chunkStartTime;
|
|
2360
|
-
if (currentDuration >= minChunkDurationSeconds) {
|
|
2361
|
-
const boundaryScore = scoreCueBoundary(cues, cueEndIndex, boundaryPauseSeconds);
|
|
2362
|
-
if (boundaryScore >= bestBoundaryScore) {
|
|
2363
|
-
bestBoundaryIndex = cueEndIndex;
|
|
2364
|
-
bestBoundaryScore = boundaryScore;
|
|
2365
|
-
}
|
|
2366
|
-
if (currentDuration >= preferredBoundaryStartSeconds && boundaryScore >= bestPreferredBoundaryScore) {
|
|
2367
|
-
bestPreferredBoundaryIndex = cueEndIndex;
|
|
2368
|
-
bestPreferredBoundaryScore = boundaryScore;
|
|
2369
|
-
}
|
|
2370
|
-
}
|
|
2371
|
-
const nextCue = cues[cueEndIndex + 1];
|
|
2372
|
-
if (!nextCue) {
|
|
2373
|
-
break;
|
|
2374
|
-
}
|
|
2375
|
-
const nextDuration = nextCue.endTime - chunkStartTime;
|
|
2376
|
-
const lookaheadExceeded = cueEndIndex - cueStartIndex >= boundaryLookaheadCues;
|
|
2377
|
-
const preferredBoundaryIndex = bestPreferredBoundaryIndex >= cueStartIndex ? bestPreferredBoundaryIndex : bestBoundaryIndex;
|
|
2378
|
-
const preferredBoundaryScore = bestPreferredBoundaryIndex >= cueStartIndex ? bestPreferredBoundaryScore : bestBoundaryScore;
|
|
2379
|
-
if (currentDuration >= targetChunkDurationSeconds) {
|
|
2380
|
-
if (preferredBoundaryIndex >= cueStartIndex && preferredBoundaryScore >= STRONG_BOUNDARY_SCORE) {
|
|
2381
|
-
cueEndIndex = preferredBoundaryIndex;
|
|
2382
|
-
break;
|
|
2383
|
-
}
|
|
2384
|
-
if (nextDuration > maxChunkDurationSeconds || lookaheadExceeded) {
|
|
2385
|
-
cueEndIndex = preferredBoundaryIndex >= cueStartIndex ? preferredBoundaryIndex : cueEndIndex;
|
|
2386
|
-
break;
|
|
2387
|
-
}
|
|
2388
|
-
}
|
|
2389
|
-
if (nextDuration > maxChunkDurationSeconds) {
|
|
2390
|
-
cueEndIndex = preferredBoundaryIndex >= cueStartIndex ? preferredBoundaryIndex : cueEndIndex;
|
|
2391
|
-
break;
|
|
2392
|
-
}
|
|
2393
|
-
cueEndIndex++;
|
|
2394
|
-
}
|
|
2395
|
-
chunks.push({
|
|
2396
|
-
id: `chunk-${chunkIndex}`,
|
|
2397
|
-
cueStartIndex,
|
|
2398
|
-
cueEndIndex,
|
|
2399
|
-
cueCount: cueEndIndex - cueStartIndex + 1,
|
|
2400
|
-
startTime: cues[cueStartIndex].startTime,
|
|
2401
|
-
endTime: cues[cueEndIndex].endTime
|
|
2402
|
-
});
|
|
2403
|
-
cueStartIndex = cueEndIndex + 1;
|
|
2404
|
-
chunkIndex++;
|
|
2327
|
+
if (S3_ALLOWED_ENDPOINT_PATTERNS.length > 0 && !S3_ALLOWED_ENDPOINT_PATTERNS.some((pattern) => hostnameMatchesPattern(hostname, pattern))) {
|
|
2328
|
+
throw new Error(
|
|
2329
|
+
`S3 endpoint host "${hostname}" is not in S3_ALLOWED_ENDPOINT_HOSTS.`
|
|
2330
|
+
);
|
|
2405
2331
|
}
|
|
2406
|
-
return chunks;
|
|
2407
2332
|
}
|
|
2408
|
-
function
|
|
2409
|
-
|
|
2410
|
-
|
|
2411
|
-
|
|
2412
|
-
|
|
2413
|
-
default: {
|
|
2414
|
-
const exhaustiveCheck = strategy;
|
|
2415
|
-
throw new Error(`Unsupported chunking strategy: ${exhaustiveCheck}`);
|
|
2416
|
-
}
|
|
2417
|
-
}
|
|
2333
|
+
function buildCanonicalUri(endpoint, bucket, key) {
|
|
2334
|
+
const endpointPath = endpoint.pathname === "/" ? "" : encodePath(endpoint.pathname.replace(/\/+$/, ""));
|
|
2335
|
+
const encodedBucket = encodeRFC3986(bucket);
|
|
2336
|
+
const encodedKey = encodePath(key);
|
|
2337
|
+
return `${endpointPath}/${encodedBucket}/${encodedKey}`;
|
|
2418
2338
|
}
|
|
2419
|
-
|
|
2420
|
-
|
|
2421
|
-
|
|
2422
|
-
|
|
2423
|
-
|
|
2424
|
-
|
|
2425
|
-
|
|
2426
|
-
|
|
2427
|
-
|
|
2428
|
-
|
|
2429
|
-
|
|
2430
|
-
|
|
2431
|
-
|
|
2432
|
-
|
|
2433
|
-
|
|
2339
|
+
function buildCanonicalQuery(params) {
|
|
2340
|
+
return Object.entries(params).sort(([a], [b]) => a.localeCompare(b)).map(([key, value]) => `${encodeRFC3986(key)}=${encodeRFC3986(value)}`).join("&");
|
|
2341
|
+
}
|
|
2342
|
+
async function signString(secretAccessKey, shortDate, region, value) {
|
|
2343
|
+
const signingKey = await deriveSigningKey(secretAccessKey, shortDate, region);
|
|
2344
|
+
const signatureBytes = await hmacSha256Raw(signingKey, value);
|
|
2345
|
+
return bytesToHex(signatureBytes);
|
|
2346
|
+
}
|
|
2347
|
+
function buildCredentialScope(shortDate, region) {
|
|
2348
|
+
return `${shortDate}/${region}/${AWS4_SERVICE}/${AWS4_REQUEST_TERMINATOR}`;
|
|
2349
|
+
}
|
|
2350
|
+
async function putObjectToS3({
|
|
2351
|
+
accessKeyId,
|
|
2352
|
+
secretAccessKey,
|
|
2353
|
+
endpoint,
|
|
2354
|
+
region,
|
|
2355
|
+
bucket,
|
|
2356
|
+
key,
|
|
2357
|
+
body,
|
|
2358
|
+
contentType
|
|
2359
|
+
}) {
|
|
2360
|
+
const resolvedEndpoint = normalizeEndpoint(endpoint);
|
|
2361
|
+
const canonicalUri = buildCanonicalUri(resolvedEndpoint, bucket, key);
|
|
2362
|
+
const host = resolvedEndpoint.host;
|
|
2363
|
+
const normalizedContentType = contentType?.trim();
|
|
2364
|
+
const { amzDate, shortDate } = formatAmzDate();
|
|
2365
|
+
const payloadHash = await sha256Hex(body);
|
|
2366
|
+
const signingHeaders = [
|
|
2367
|
+
["host", host],
|
|
2368
|
+
["x-amz-content-sha256", payloadHash],
|
|
2369
|
+
["x-amz-date", amzDate],
|
|
2370
|
+
...normalizedContentType ? [["content-type", normalizedContentType]] : []
|
|
2371
|
+
].sort(([a], [b]) => a.localeCompare(b));
|
|
2372
|
+
const canonicalHeaders = signingHeaders.map(([name, value]) => `${name}:${value}`).join("\n");
|
|
2373
|
+
const signedHeaders = signingHeaders.map(([name]) => name).join(";");
|
|
2374
|
+
const canonicalRequest = [
|
|
2375
|
+
"PUT",
|
|
2376
|
+
canonicalUri,
|
|
2377
|
+
"",
|
|
2378
|
+
`${canonicalHeaders}
|
|
2379
|
+
`,
|
|
2380
|
+
signedHeaders,
|
|
2381
|
+
payloadHash
|
|
2382
|
+
].join("\n");
|
|
2383
|
+
const credentialScope = buildCredentialScope(shortDate, region);
|
|
2384
|
+
const stringToSign = [
|
|
2385
|
+
AWS4_ALGORITHM,
|
|
2386
|
+
amzDate,
|
|
2387
|
+
credentialScope,
|
|
2388
|
+
await sha256Hex(canonicalRequest)
|
|
2389
|
+
].join("\n");
|
|
2390
|
+
const signature = await signString(secretAccessKey, shortDate, region, stringToSign);
|
|
2391
|
+
const authorization = `${AWS4_ALGORITHM} Credential=${accessKeyId}/${credentialScope}, SignedHeaders=${signedHeaders}, Signature=${signature}`;
|
|
2392
|
+
const requestUrl = `${resolvedEndpoint.origin}${canonicalUri}`;
|
|
2393
|
+
const response = await fetch(requestUrl, {
|
|
2394
|
+
method: "PUT",
|
|
2395
|
+
headers: {
|
|
2396
|
+
"Authorization": authorization,
|
|
2397
|
+
"x-amz-content-sha256": payloadHash,
|
|
2398
|
+
"x-amz-date": amzDate,
|
|
2399
|
+
...normalizedContentType ? { "content-type": normalizedContentType } : {}
|
|
2400
|
+
},
|
|
2401
|
+
body
|
|
2402
|
+
});
|
|
2403
|
+
if (!response.ok) {
|
|
2404
|
+
const errorBody = await response.text().catch(() => "");
|
|
2405
|
+
const detail = errorBody ? ` ${errorBody}` : "";
|
|
2406
|
+
throw new Error(`S3 PUT failed (${response.status} ${response.statusText}).${detail}`);
|
|
2434
2407
|
}
|
|
2435
|
-
return averaged;
|
|
2436
2408
|
}
|
|
2437
|
-
async function
|
|
2438
|
-
|
|
2439
|
-
|
|
2440
|
-
|
|
2441
|
-
|
|
2409
|
+
async function createPresignedGetUrl({
|
|
2410
|
+
accessKeyId,
|
|
2411
|
+
secretAccessKey,
|
|
2412
|
+
endpoint,
|
|
2413
|
+
region,
|
|
2414
|
+
bucket,
|
|
2415
|
+
key,
|
|
2416
|
+
expiresInSeconds = 3600
|
|
2442
2417
|
}) {
|
|
2443
|
-
|
|
2444
|
-
const
|
|
2445
|
-
const
|
|
2446
|
-
|
|
2447
|
-
|
|
2448
|
-
|
|
2449
|
-
|
|
2450
|
-
|
|
2451
|
-
|
|
2452
|
-
|
|
2453
|
-
|
|
2454
|
-
|
|
2455
|
-
startTime: chunk.startTime,
|
|
2456
|
-
endTime: chunk.endTime,
|
|
2457
|
-
tokenCount: chunk.tokenCount
|
|
2458
|
-
}
|
|
2418
|
+
const resolvedEndpoint = normalizeEndpoint(endpoint);
|
|
2419
|
+
const canonicalUri = buildCanonicalUri(resolvedEndpoint, bucket, key);
|
|
2420
|
+
const host = resolvedEndpoint.host;
|
|
2421
|
+
const { amzDate, shortDate } = formatAmzDate();
|
|
2422
|
+
const credentialScope = buildCredentialScope(shortDate, region);
|
|
2423
|
+
const signedHeaders = "host";
|
|
2424
|
+
const queryParams = {
|
|
2425
|
+
"X-Amz-Algorithm": AWS4_ALGORITHM,
|
|
2426
|
+
"X-Amz-Credential": `${accessKeyId}/${credentialScope}`,
|
|
2427
|
+
"X-Amz-Date": amzDate,
|
|
2428
|
+
"X-Amz-Expires": `${expiresInSeconds}`,
|
|
2429
|
+
"X-Amz-SignedHeaders": signedHeaders
|
|
2459
2430
|
};
|
|
2431
|
+
const canonicalQuery = buildCanonicalQuery(queryParams);
|
|
2432
|
+
const canonicalRequest = [
|
|
2433
|
+
"GET",
|
|
2434
|
+
canonicalUri,
|
|
2435
|
+
canonicalQuery,
|
|
2436
|
+
`host:${host}
|
|
2437
|
+
`,
|
|
2438
|
+
signedHeaders,
|
|
2439
|
+
"UNSIGNED-PAYLOAD"
|
|
2440
|
+
].join("\n");
|
|
2441
|
+
const stringToSign = [
|
|
2442
|
+
AWS4_ALGORITHM,
|
|
2443
|
+
amzDate,
|
|
2444
|
+
credentialScope,
|
|
2445
|
+
await sha256Hex(canonicalRequest)
|
|
2446
|
+
].join("\n");
|
|
2447
|
+
const signature = await signString(secretAccessKey, shortDate, region, stringToSign);
|
|
2448
|
+
const queryWithSignature = `${canonicalQuery}&X-Amz-Signature=${signature}`;
|
|
2449
|
+
return `${resolvedEndpoint.origin}${canonicalUri}?${queryWithSignature}`;
|
|
2460
2450
|
}
|
|
2461
|
-
|
|
2462
|
-
|
|
2463
|
-
|
|
2464
|
-
|
|
2465
|
-
languageCode,
|
|
2466
|
-
chunkingStrategy = { type: "token", maxTokens: 500, overlap: 100 },
|
|
2467
|
-
batchSize = 5,
|
|
2468
|
-
credentials
|
|
2469
|
-
} = options;
|
|
2470
|
-
const embeddingModel = resolveEmbeddingModelConfig({ ...options, provider, model });
|
|
2471
|
-
const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, credentials);
|
|
2472
|
-
const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
|
|
2473
|
-
const isAudioOnly = isAudioOnlyAsset(assetData);
|
|
2474
|
-
const signingContext = await resolveMuxSigningContext(credentials);
|
|
2475
|
-
if (policy === "signed" && !signingContext) {
|
|
2451
|
+
|
|
2452
|
+
// src/lib/storage-adapter.ts
|
|
2453
|
+
function requireCredentials(accessKeyId, secretAccessKey) {
|
|
2454
|
+
if (!accessKeyId || !secretAccessKey) {
|
|
2476
2455
|
throw new Error(
|
|
2477
|
-
"
|
|
2456
|
+
"S3 credentials are required for default storage operations. Provide S3_ACCESS_KEY_ID and S3_SECRET_ACCESS_KEY or pass options.storageAdapter."
|
|
2478
2457
|
);
|
|
2479
2458
|
}
|
|
2480
|
-
|
|
2481
|
-
|
|
2482
|
-
|
|
2483
|
-
|
|
2484
|
-
|
|
2485
|
-
|
|
2486
|
-
|
|
2459
|
+
return { accessKeyId, secretAccessKey };
|
|
2460
|
+
}
|
|
2461
|
+
async function putObjectWithStorageAdapter(input, adapter) {
|
|
2462
|
+
if (adapter) {
|
|
2463
|
+
await adapter.putObject(input);
|
|
2464
|
+
return;
|
|
2465
|
+
}
|
|
2466
|
+
const credentials = requireCredentials(input.accessKeyId, input.secretAccessKey);
|
|
2467
|
+
await putObjectToS3({
|
|
2468
|
+
accessKeyId: credentials.accessKeyId,
|
|
2469
|
+
secretAccessKey: credentials.secretAccessKey,
|
|
2470
|
+
endpoint: input.endpoint,
|
|
2471
|
+
region: input.region,
|
|
2472
|
+
bucket: input.bucket,
|
|
2473
|
+
key: input.key,
|
|
2474
|
+
body: input.body,
|
|
2475
|
+
contentType: input.contentType
|
|
2487
2476
|
});
|
|
2488
|
-
|
|
2489
|
-
|
|
2490
|
-
|
|
2491
|
-
|
|
2492
|
-
|
|
2477
|
+
}
|
|
2478
|
+
async function createPresignedGetUrlWithStorageAdapter(input, adapter) {
|
|
2479
|
+
if (adapter) {
|
|
2480
|
+
return adapter.createPresignedGetUrl(input);
|
|
2481
|
+
}
|
|
2482
|
+
const credentials = requireCredentials(input.accessKeyId, input.secretAccessKey);
|
|
2483
|
+
return createPresignedGetUrl({
|
|
2484
|
+
accessKeyId: credentials.accessKeyId,
|
|
2485
|
+
secretAccessKey: credentials.secretAccessKey,
|
|
2486
|
+
endpoint: input.endpoint,
|
|
2487
|
+
region: input.region,
|
|
2488
|
+
bucket: input.bucket,
|
|
2489
|
+
key: input.key,
|
|
2490
|
+
expiresInSeconds: input.expiresInSeconds
|
|
2491
|
+
});
|
|
2492
|
+
}
|
|
2493
|
+
|
|
2494
|
+
// src/workflows/edit-captions.ts
|
|
2495
|
+
var profanityDetectionSchema = z5.object({
|
|
2496
|
+
profanity: z5.array(z5.string()).describe(
|
|
2497
|
+
"Unique profane words or short phrases exactly as they appear in the transcript text. Include each distinct form only once (e.g., if 'fuck' and 'fucking' both appear, list both)."
|
|
2498
|
+
)
|
|
2499
|
+
});
|
|
2500
|
+
var SYSTEM_PROMPT3 = dedent4`
|
|
2501
|
+
You are a content moderation assistant. Your task is to identify profane, vulgar, or obscene
|
|
2502
|
+
words and phrases in subtitle text. Return ONLY the exact profane words or phrases as they appear
|
|
2503
|
+
in the text. Do not modify, censor, or paraphrase them. Do not include words that are merely
|
|
2504
|
+
informal or slang but not profane. Focus on words that would be bleeped on broadcast television.`;
|
|
2505
|
+
function transformCueText(rawVtt, transform) {
|
|
2506
|
+
const lines = rawVtt.split("\n");
|
|
2507
|
+
let inCueText = false;
|
|
2508
|
+
let currentCueStartTime = 0;
|
|
2509
|
+
const transformed = lines.map((line) => {
|
|
2510
|
+
if (line.includes("-->")) {
|
|
2511
|
+
const startTimestamp = line.split("-->")[0].trim();
|
|
2512
|
+
currentCueStartTime = vttTimestampToSeconds(startTimestamp);
|
|
2513
|
+
inCueText = true;
|
|
2514
|
+
return line;
|
|
2515
|
+
}
|
|
2516
|
+
if (line.trim() === "") {
|
|
2517
|
+
inCueText = false;
|
|
2518
|
+
return line;
|
|
2519
|
+
}
|
|
2520
|
+
if (inCueText) {
|
|
2521
|
+
return transform(line, currentCueStartTime);
|
|
2522
|
+
}
|
|
2523
|
+
return line;
|
|
2524
|
+
});
|
|
2525
|
+
return transformed.join("\n");
|
|
2526
|
+
}
|
|
2527
|
+
function buildReplacementRegex(words) {
|
|
2528
|
+
const filtered = words.filter((w) => w.length > 0);
|
|
2529
|
+
if (filtered.length === 0)
|
|
2530
|
+
return null;
|
|
2531
|
+
filtered.sort((a, b) => b.length - a.length);
|
|
2532
|
+
const escaped = filtered.map((w) => w.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
|
|
2533
|
+
const pattern = escaped.join("|");
|
|
2534
|
+
return new RegExp(`\\b(?:${pattern})\\b`, "gi");
|
|
2535
|
+
}
|
|
2536
|
+
function createReplacer(mode) {
|
|
2537
|
+
switch (mode) {
|
|
2538
|
+
case "blank":
|
|
2539
|
+
return (match) => `[${"_".repeat(match.length)}]`;
|
|
2540
|
+
case "remove":
|
|
2541
|
+
return () => "";
|
|
2542
|
+
case "mask":
|
|
2543
|
+
return (match) => "?".repeat(match.length);
|
|
2544
|
+
}
|
|
2545
|
+
}
|
|
2546
|
+
function censorVttContent(rawVtt, profanity, mode) {
|
|
2547
|
+
if (profanity.length === 0) {
|
|
2548
|
+
return { censoredVtt: rawVtt, replacements: [] };
|
|
2549
|
+
}
|
|
2550
|
+
const regex = buildReplacementRegex(profanity);
|
|
2551
|
+
if (!regex) {
|
|
2552
|
+
return { censoredVtt: rawVtt, replacements: [] };
|
|
2553
|
+
}
|
|
2554
|
+
const replacer = createReplacer(mode);
|
|
2555
|
+
const replacements = [];
|
|
2556
|
+
const censoredVtt = transformCueText(rawVtt, (line, cueStartTime) => {
|
|
2557
|
+
return line.replace(regex, (match) => {
|
|
2558
|
+
const after = replacer(match);
|
|
2559
|
+
replacements.push({ cueStartTime, before: match, after });
|
|
2560
|
+
return after;
|
|
2493
2561
|
});
|
|
2562
|
+
});
|
|
2563
|
+
return { censoredVtt, replacements };
|
|
2564
|
+
}
|
|
2565
|
+
/**
 * Merge caller override lists into the detected profanity list.
 *
 * Words from `alwaysCensor` are appended unless already present
 * (case-insensitive); words from `neverCensor` are removed
 * (case-insensitive). Original casing and order are preserved.
 *
 * @param {string[]} detected - Words found by detection.
 * @param {string[]} alwaysCensor - Words to force-include.
 * @param {string[]} neverCensor - Words to always exclude.
 * @returns {string[]} Final list of words to censor.
 */
function applyOverrideLists(detected, alwaysCensor, neverCensor) {
  const merged = [...detected];
  const knownLower = new Set(merged.map((word) => word.toLowerCase()));
  for (const candidate of alwaysCensor) {
    const key = candidate.toLowerCase();
    if (knownLower.has(key)) {
      continue;
    }
    knownLower.add(key);
    merged.push(candidate);
  }
  const excluded = new Set(neverCensor.map((word) => word.toLowerCase()));
  return merged.filter((word) => !excluded.has(word.toLowerCase()));
}
|
|
2578
|
+
/**
 * Apply caller-supplied static find/replace pairs to VTT cue text.
 *
 * Each `find` value is matched as a whole word (regex-escaped, `\b`-anchored,
 * case-sensitive). Pairs with an empty `find` string are ignored.
 *
 * @param {string} rawVtt - Full WebVTT document.
 * @param {Array<{find: string, replace: string}>} replacements - Static pairs.
 * @returns {{editedVtt: string, replacements: Array<{cueStartTime: number, before: string, after: string}>}}
 */
function applyReplacements(rawVtt, replacements) {
  const filtered = replacements.filter((r) => r.find.length > 0);
  if (filtered.length === 0) {
    return { editedVtt: rawVtt, replacements: [] };
  }
  // Compile each find-pattern once up front instead of rebuilding the RegExp
  // for every cue line inside the transform callback (loop-invariant work).
  const compiled = filtered.map(({ find, replace }) => {
    const escaped = find.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    return { regex: new RegExp(`\\b${escaped}\\b`, "g"), replace };
  });
  const records = [];
  const editedVtt = transformCueText(rawVtt, (line, cueStartTime) => {
    let result = line;
    for (const { regex, replace } of compiled) {
      result = result.replace(regex, (match) => {
        // Record every substitution with the cue it occurred in.
        records.push({ cueStartTime, before: match, after: replace });
        return replace;
      });
    }
    return result;
  });
  return { editedVtt, replacements: records };
}
|
|
2598
|
+
/**
 * Durable step: ask a language model to list profane words/phrases found in a
 * plain-text transcript.
 *
 * @param {object} params
 * @param {string} params.plainText - Transcript text extracted from the VTT.
 * @param {string} params.provider - LLM provider id (resolved upstream).
 * @param {string} params.modelId - Model identifier for that provider.
 * @param {object} [params.credentials] - Optional credential overrides.
 * @returns {Promise<{profanity: string[], usage: object}>} Detected terms plus
 *   token-usage accounting copied from the model response.
 */
async function identifyProfanityWithAI({
  plainText,
  provider,
  modelId,
  credentials
}) {
  "use step";
  const model = await createLanguageModelFromConfig(provider, modelId, credentials);
  // Structured output: the response is validated against profanityDetectionSchema,
  // so response.output.profanity is the schema-shaped list, not free text.
  const response = await generateText4({
    model,
    output: Output4.object({ schema: profanityDetectionSchema }),
    messages: [
      {
        role: "system",
        content: SYSTEM_PROMPT3
      },
      {
        role: "user",
        // The transcript is fenced in <transcript> tags so the model can
        // distinguish instructions from content to analyze.
        content: `Identify all profane words and phrases in the following subtitle transcript. Return each unique profane word or phrase exactly as it appears in the text.

<transcript>
${plainText}
</transcript>`
      }
    ]
  });
  return {
    profanity: response.output.profanity,
    // Copy usage fields individually rather than returning the provider object.
    usage: {
      inputTokens: response.usage.inputTokens,
      outputTokens: response.usage.outputTokens,
      totalTokens: response.usage.totalTokens,
      reasoningTokens: response.usage.reasoningTokens,
      cachedInputTokens: response.usage.cachedInputTokens
    }
  };
}
|
|
2635
|
+
/**
 * Durable step: store the edited VTT in S3-compatible storage and return a
 * presigned GET URL that Mux can later ingest from.
 *
 * @param {object} params
 * @param {string} params.editedVtt - Edited WebVTT document body.
 * @param {string} params.assetId - Mux asset id (used in the object key).
 * @param {string} params.trackId - Source track id (used in the object key).
 * @param {string} params.s3Endpoint - S3-compatible endpoint URL.
 * @param {string} params.s3Region - Bucket region.
 * @param {string} params.s3Bucket - Target bucket name.
 * @param {object} [params.storageAdapter] - Optional custom storage adapter;
 *   when provided, static keys may be unnecessary (validated by the caller).
 * @param {number} [params.s3SignedUrlExpirySeconds] - Presigned URL TTL.
 * @returns {Promise<string>} Presigned GET URL for the uploaded VTT object.
 */
async function uploadEditedVttToS3({
  editedVtt,
  assetId,
  trackId,
  s3Endpoint,
  s3Region,
  s3Bucket,
  storageAdapter,
  s3SignedUrlExpirySeconds
}) {
  "use step";
  const s3AccessKeyId = env_default.S3_ACCESS_KEY_ID;
  const s3SecretAccessKey = env_default.S3_SECRET_ACCESS_KEY;
  // Date.now() in the key makes repeated runs write distinct objects.
  const vttKey = `edited/${assetId}/${trackId}-edited-${Date.now()}.vtt`;
  await putObjectWithStorageAdapter({
    accessKeyId: s3AccessKeyId,
    secretAccessKey: s3SecretAccessKey,
    endpoint: s3Endpoint,
    region: s3Region,
    bucket: s3Bucket,
    key: vttKey,
    body: editedVtt,
    contentType: "text/vtt"
  }, storageAdapter);
  return createPresignedGetUrlWithStorageAdapter({
    accessKeyId: s3AccessKeyId,
    secretAccessKey: s3SecretAccessKey,
    endpoint: s3Endpoint,
    region: s3Region,
    bucket: s3Bucket,
    key: vttKey,
    // Default presigned-URL lifetime: 24 hours.
    expiresInSeconds: s3SignedUrlExpirySeconds ?? 86400
  }, storageAdapter);
}
|
|
2669
|
+
/**
 * Durable step: delete a text track from a Mux asset.
 *
 * @param {string} assetId - Mux asset id.
 * @param {string} trackId - Track to delete.
 * @param {object} [credentials] - Optional Mux credential overrides.
 * @returns {Promise<void>}
 */
async function deleteTrackOnMux(assetId, trackId, credentials) {
  "use step";
  const muxClient = await resolveMuxClient(credentials);
  const mux = await muxClient.createClient();
  await mux.video.assets.deleteTrack(assetId, trackId);
}
|
|
2675
|
+
/**
 * Workflow: fetch a text track's VTT from Mux, apply AI profanity censoring
 * and/or static find/replace edits, then optionally upload the edited VTT
 * back to Mux as a new track (deleting the original by default).
 *
 * @param {string} assetId - Mux asset id.
 * @param {string} trackId - Id of the ready text track to edit.
 * @param {object} options - Editing and storage options (see destructuring).
 * @returns {Promise<object>} Edited VTT, replacement accounting, and (when
 *   uploading) the new track id and presigned URL.
 * @throws {Error} On invalid options, missing storage config, missing signing
 *   credentials, unknown track, or fetch/AI/upload failures.
 */
async function editCaptions(assetId, trackId, options) {
  "use workflow";
  const {
    provider,
    model,
    autoCensorProfanity: autoCensorOption,
    replacements: replacementsOption,
    deleteOriginalTrack,
    uploadToMux: uploadToMuxOption,
    s3Endpoint: providedS3Endpoint,
    s3Region: providedS3Region,
    s3Bucket: providedS3Bucket,
    trackNameSuffix,
    storageAdapter,
    credentials
  } = options;
  // At least one editing operation must be requested.
  const hasAutoCensor = !!autoCensorOption;
  const hasReplacements = !!replacementsOption && replacementsOption.length > 0;
  if (!hasAutoCensor && !hasReplacements) {
    throw new Error("At least one of autoCensorProfanity or replacements must be provided.");
  }
  if (autoCensorOption && !provider) {
    throw new Error("provider is required when using autoCensorProfanity.");
  }
  // Both behaviors default to true; only an explicit `false` disables them.
  const deleteOriginal = deleteOriginalTrack !== false;
  const uploadToMux = uploadToMuxOption !== false;
  // Storage settings fall back to environment variables.
  const s3Endpoint = providedS3Endpoint ?? env_default.S3_ENDPOINT;
  const s3Region = providedS3Region ?? env_default.S3_REGION ?? "auto";
  const s3Bucket = providedS3Bucket ?? env_default.S3_BUCKET;
  const s3AccessKeyId = env_default.S3_ACCESS_KEY_ID;
  const s3SecretAccessKey = env_default.S3_SECRET_ACCESS_KEY;
  // Static keys are only required when no custom storage adapter is supplied.
  if (uploadToMux && (!s3Endpoint || !s3Bucket || !storageAdapter && (!s3AccessKeyId || !s3SecretAccessKey))) {
    throw new Error(
      "Storage configuration is required for uploading to Mux. Provide s3Endpoint and s3Bucket. If no storageAdapter is supplied, also provide s3AccessKeyId and s3SecretAccessKey in options or set S3_ENDPOINT, S3_BUCKET, S3_ACCESS_KEY_ID, and S3_SECRET_ACCESS_KEY environment variables."
    );
  }
  // Resolve the asset, a playback ID, and its playback policy (public/signed).
  const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, credentials);
  const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
  const signingContext = await resolveMuxSigningContext(credentials);
  if (policy === "signed" && !signingContext) {
    throw new Error(
      "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
    );
  }
  // The requested track must exist and be in the "ready" state.
  const readyTextTracks = getReadyTextTracks(assetData);
  const sourceTrack = readyTextTracks.find((t) => t.id === trackId);
  if (!sourceTrack) {
    const availableTrackIds = readyTextTracks.map((t) => t.id).filter(Boolean).join(", ");
    throw new Error(
      `Track '${trackId}' not found or not ready on asset '${assetId}'. Available track IDs: ${availableTrackIds || "none"}`
    );
  }
  const vttUrl = await buildTranscriptUrl(playbackId, trackId, policy === "signed", credentials);
  let vttContent;
  try {
    vttContent = await fetchVttFromMux(vttUrl);
  } catch (error) {
    throw new Error(`Failed to fetch VTT content: ${error instanceof Error ? error.message : "Unknown error"}`);
  }
  let editedVtt = vttContent;
  let totalReplacementCount = 0;
  let autoCensorResult;
  let usage;
  // Pass 1 (optional): AI-driven profanity detection + censoring.
  if (autoCensorOption) {
    const { mode = "blank", alwaysCensor = [], neverCensor = [] } = autoCensorOption;
    const plainText = extractTextFromVTT(vttContent);
    if (!plainText.trim()) {
      throw new Error("Track transcript is empty; nothing to censor.");
    }
    const modelConfig = resolveLanguageModelConfig({
      ...options,
      provider,
      model
    });
    let detectedProfanity;
    try {
      const result = await identifyProfanityWithAI({
        plainText,
        provider: modelConfig.provider,
        modelId: modelConfig.modelId,
        credentials
      });
      detectedProfanity = result.profanity;
      usage = result.usage;
    } catch (error) {
      throw new Error(`Failed to detect profanity with ${modelConfig.provider}: ${error instanceof Error ? error.message : "Unknown error"}`);
    }
    // Merge caller override lists into the AI-detected words before censoring.
    const finalProfanity = applyOverrideLists(detectedProfanity, alwaysCensor, neverCensor);
    const { censoredVtt, replacements: censorReplacements } = censorVttContent(editedVtt, finalProfanity, mode);
    editedVtt = censoredVtt;
    totalReplacementCount += censorReplacements.length;
    autoCensorResult = { replacements: censorReplacements };
  }
  // Pass 2 (optional): caller-supplied static find/replace pairs, applied
  // after censoring so they operate on the censored text.
  let replacementsResult;
  if (replacementsOption && replacementsOption.length > 0) {
    const { editedVtt: afterReplacements, replacements: staticReplacements } = applyReplacements(editedVtt, replacementsOption);
    editedVtt = afterReplacements;
    totalReplacementCount += staticReplacements.length;
    replacementsResult = { replacements: staticReplacements };
  }
  // usage exists only when the AI pass ran; attach the asset duration for billing/metrics.
  const usageWithMetadata = usage ? {
    ...usage,
    metadata: {
      assetDurationSeconds
    }
  } : void 0;
  // Dry-run mode: return the edited VTT without touching Mux or storage.
  if (!uploadToMux) {
    return {
      assetId,
      trackId,
      originalVtt: vttContent,
      editedVtt,
      totalReplacementCount,
      autoCensorProfanity: autoCensorResult,
      replacements: replacementsResult,
      usage: usageWithMetadata
    };
  }
  // Stage the edited VTT in S3 so Mux can ingest it via a presigned URL.
  let presignedUrl;
  try {
    presignedUrl = await uploadEditedVttToS3({
      editedVtt,
      assetId,
      trackId,
      s3Endpoint,
      s3Region,
      s3Bucket,
      storageAdapter,
      s3SignedUrlExpirySeconds: options.s3SignedUrlExpirySeconds
    });
  } catch (error) {
    throw new Error(`Failed to upload VTT to S3: ${error instanceof Error ? error.message : "Unknown error"}`);
  }
  // Attach the edited VTT to the asset as a new text track. Failure here is
  // non-fatal: the edited VTT and presigned URL are still returned.
  let uploadedTrackId;
  try {
    const languageCode = sourceTrack.language_code || "en";
    const suffix = trackNameSuffix ?? "edited";
    const trackName = `${sourceTrack.name || "Subtitles"} (${suffix})`;
    uploadedTrackId = await createTextTrackOnMux(
      assetId,
      languageCode,
      trackName,
      presignedUrl,
      credentials
    );
  } catch (error) {
    console.warn(`Failed to add track to Mux asset: ${error instanceof Error ? error.message : "Unknown error"}`);
  }
  // Only delete the original after the replacement track was created; a
  // failed delete is logged but does not fail the workflow.
  if (deleteOriginal && uploadedTrackId) {
    try {
      await deleteTrackOnMux(assetId, trackId, credentials);
    } catch (error) {
      console.warn(`Failed to delete original track: ${error instanceof Error ? error.message : "Unknown error"}`);
    }
  }
  return {
    assetId,
    trackId,
    originalVtt: vttContent,
    editedVtt,
    totalReplacementCount,
    autoCensorProfanity: autoCensorResult,
    replacements: replacementsResult,
    uploadedTrackId,
    presignedUrl,
    usage: usageWithMetadata
  };
}
|
|
2564
|
-
async function generateEmbeddings(assetId, options = {}) {
|
|
2565
|
-
"use workflow";
|
|
2566
|
-
return generateEmbeddingsInternal(assetId, options);
|
|
2567
|
-
}
|
|
2568
|
-
async function generateVideoEmbeddings(assetId, options = {}) {
|
|
2569
|
-
"use workflow";
|
|
2570
|
-
console.warn("generateVideoEmbeddings is deprecated. Use generateEmbeddings instead.");
|
|
2571
|
-
return generateEmbeddingsInternal(assetId, options);
|
|
2572
|
-
}
|
|
2573
2843
|
|
|
2574
|
-
// src/
|
|
2575
|
-
|
|
2576
|
-
|
|
2577
|
-
|
|
2578
|
-
|
|
2844
|
+
// src/workflows/embeddings.ts
|
|
2845
|
+
import { embed } from "ai";
|
|
2846
|
+
|
|
2847
|
+
// src/primitives/text-chunking.ts
|
|
2848
|
+
// Tuning constants for the transcript text-chunking heuristics below.
var DEFAULT_MIN_CHUNK_DURATION_RATIO = 2 / 3; // min chunk duration as a fraction of the target duration
var DEFAULT_BOUNDARY_LOOKAHEAD_CUES = 12; // cues to scan past the target looking for a better break point
var DEFAULT_BOUNDARY_PAUSE_SECONDS = 1.25; // inter-cue silence treated as a natural pause
var STRONG_BOUNDARY_SCORE = 4; // boundary score at/above which a break is taken immediately
var PREFERRED_BOUNDARY_WINDOW_SECONDS = 5 * 60; // cap on the pre-target window where boundaries are preferred
var SENTENCE_BOUNDARY_REGEX = /[.!?]["')\]]*$/; // cue text ends a sentence (allowing trailing quotes/brackets)
var CLAUSE_BOUNDARY_REGEX = /[,;:]["')\]]*$/; // cue text ends a clause
var NEXT_SENTENCE_START_REGEX = /^[A-Z0-9"'([{]/; // next cue looks like the start of a new sentence
|
|
2856
|
+
/**
 * Rough token estimate for a text snippet, using the heuristic of
 * roughly 0.75 words per token.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count (always >= 1 for any input).
 */
function estimateTokenCount(text) {
  const wordCount = text.trim().split(/\s+/).length;
  return Math.ceil(wordCount / 0.75);
}
|
|
2580
|
-
function
|
|
2581
|
-
|
|
2582
|
-
const DEFAULT_MAX_CANDIDATES = 30;
|
|
2583
|
-
const {
|
|
2584
|
-
duration_sec,
|
|
2585
|
-
min_candidates = DEFAULT_MIN_CANDIDATES,
|
|
2586
|
-
max_candidates = DEFAULT_MAX_CANDIDATES,
|
|
2587
|
-
trim_start_sec = 1,
|
|
2588
|
-
trim_end_sec = 1,
|
|
2589
|
-
fps = DEFAULT_FPS,
|
|
2590
|
-
base_cadence_hz,
|
|
2591
|
-
anchor_percents = [0.2, 0.5, 0.8],
|
|
2592
|
-
anchor_window_sec = 1.5
|
|
2593
|
-
} = options;
|
|
2594
|
-
const usableSec = Math.max(0, duration_sec - (trim_start_sec + trim_end_sec));
|
|
2595
|
-
if (usableSec <= 0)
|
|
2860
|
+
function chunkByTokens(text, maxTokens, overlapTokens = 0) {
|
|
2861
|
+
if (!text.trim()) {
|
|
2596
2862
|
return [];
|
|
2597
|
-
const cadenceHz = base_cadence_hz ?? (duration_sec < 15 ? 3 : duration_sec < 60 ? 2 : duration_sec < 180 ? 1.5 : 1);
|
|
2598
|
-
let target = Math.round(usableSec * cadenceHz);
|
|
2599
|
-
target = Math.max(min_candidates, Math.min(max_candidates, target));
|
|
2600
|
-
const stepSec = usableSec / target;
|
|
2601
|
-
const t0 = trim_start_sec;
|
|
2602
|
-
const base = [];
|
|
2603
|
-
for (let i = 0; i < target; i++) {
|
|
2604
|
-
const tsSec = t0 + (i + 0.5) * stepSec;
|
|
2605
|
-
base.push(tsSec * 1e3);
|
|
2606
2863
|
}
|
|
2607
|
-
const
|
|
2608
|
-
const
|
|
2609
|
-
|
|
2610
|
-
|
|
2611
|
-
|
|
2612
|
-
|
|
2613
|
-
|
|
2614
|
-
|
|
2615
|
-
|
|
2616
|
-
|
|
2617
|
-
|
|
2618
|
-
|
|
2619
|
-
|
|
2620
|
-
|
|
2621
|
-
|
|
2622
|
-
|
|
2623
|
-
|
|
2624
|
-
|
|
2625
|
-
|
|
2626
|
-
|
|
2864
|
+
const chunks = [];
|
|
2865
|
+
const words = text.trim().split(/\s+/);
|
|
2866
|
+
const wordsPerChunk = Math.floor(maxTokens * 0.75);
|
|
2867
|
+
const overlapWords = Math.floor(overlapTokens * 0.75);
|
|
2868
|
+
let chunkIndex = 0;
|
|
2869
|
+
let currentPosition = 0;
|
|
2870
|
+
while (currentPosition < words.length) {
|
|
2871
|
+
const chunkWords = words.slice(
|
|
2872
|
+
currentPosition,
|
|
2873
|
+
currentPosition + wordsPerChunk
|
|
2874
|
+
);
|
|
2875
|
+
const chunkText2 = chunkWords.join(" ");
|
|
2876
|
+
const tokenCount = estimateTokenCount(chunkText2);
|
|
2877
|
+
chunks.push({
|
|
2878
|
+
id: `chunk-${chunkIndex}`,
|
|
2879
|
+
text: chunkText2,
|
|
2880
|
+
tokenCount
|
|
2881
|
+
});
|
|
2882
|
+
currentPosition += wordsPerChunk - overlapWords;
|
|
2883
|
+
chunkIndex++;
|
|
2884
|
+
if (currentPosition <= (chunkIndex - 1) * (wordsPerChunk - overlapWords)) {
|
|
2885
|
+
break;
|
|
2627
2886
|
}
|
|
2628
2887
|
}
|
|
2629
|
-
|
|
2630
|
-
const uniqSorted = Array.from(new Set(all)).sort((a, b) => a - b);
|
|
2631
|
-
return uniqSorted.slice(0, max_candidates);
|
|
2888
|
+
return chunks;
|
|
2632
2889
|
}
|
|
2633
|
-
|
|
2634
|
-
|
|
2635
|
-
|
|
2636
|
-
|
|
2637
|
-
|
|
2638
|
-
|
|
2639
|
-
|
|
2640
|
-
|
|
2641
|
-
|
|
2642
|
-
|
|
2643
|
-
|
|
2644
|
-
|
|
2645
|
-
|
|
2646
|
-
|
|
2890
|
+
/**
 * Build a chunk object from a non-empty run of consecutive VTT cues.
 *
 * @param {Array<{text: string, startTime: number, endTime: number}>} cues - Cues in the chunk (must be non-empty).
 * @param {number} index - Sequential chunk index used for the id.
 * @returns {{id: string, text: string, tokenCount: number, startTime: number, endTime: number}}
 */
function createChunkFromCues(cues, index) {
  const combined = cues.map((cue) => cue.text).join(" ");
  return {
    id: `chunk-${index}`,
    text: combined,
    tokenCount: estimateTokenCount(combined),
    // Chunk spans from the first cue's start to the last cue's end.
    startTime: cues[0].startTime,
    endTime: cues[cues.length - 1].endTime
  };
}
|
|
2900
|
+
/**
 * Group VTT cues into chunks bounded by an approximate token budget, carrying
 * a fixed number of trailing cues into the next chunk as overlap.
 *
 * @param {Array<{text: string, startTime: number, endTime: number}>} cues - Parsed cues.
 * @param {number} maxTokens - Approximate token budget per chunk.
 * @param {number} [overlapCues=2] - Trailing cues repeated at the start of the next chunk.
 * @returns {Array<{id: string, text: string, tokenCount: number, startTime: number, endTime: number}>}
 */
function chunkVTTCues(cues, maxTokens, overlapCues = 2) {
  if (cues.length === 0) {
    return [];
  }
  const result = [];
  let pending = [];
  let pendingTokens = 0;
  let nextIndex = 0;
  for (const cue of cues) {
    const cueTokens = estimateTokenCount(cue.text);
    const wouldOverflow = pendingTokens + cueTokens > maxTokens;
    if (wouldOverflow && pending.length > 0) {
      result.push(createChunkFromCues(pending, nextIndex));
      nextIndex += 1;
      // Carry the tail cues forward so consecutive chunks overlap.
      pending = pending.slice(Math.max(0, pending.length - overlapCues));
      pendingTokens = pending.reduce(
        (total, carried) => total + estimateTokenCount(carried.text),
        0
      );
    }
    pending.push(cue);
    pendingTokens += cueTokens;
  }
  // Flush whatever remains as the final chunk.
  if (pending.length > 0) {
    result.push(createChunkFromCues(pending, nextIndex));
  }
  return result;
}
|
|
2668
|
-
|
|
2669
|
-
|
|
2670
|
-
|
|
2671
|
-
|
|
2672
|
-
|
|
2673
|
-
};
|
|
2674
|
-
var DEFAULT_PROVIDER2 = "openai";
|
|
2675
|
-
var HIVE_ENDPOINT = "https://api.thehive.ai/api/v2/task/sync";
|
|
2676
|
-
var HIVE_SEXUAL_CATEGORIES = [
|
|
2677
|
-
"general_nsfw",
|
|
2678
|
-
"yes_sexual_activity",
|
|
2679
|
-
"yes_sex_toy",
|
|
2680
|
-
"yes_female_nudity",
|
|
2681
|
-
"yes_male_nudity"
|
|
2682
|
-
];
|
|
2683
|
-
var HIVE_VIOLENCE_CATEGORIES = [
|
|
2684
|
-
"gun_in_hand",
|
|
2685
|
-
"gun_not_in_hand",
|
|
2686
|
-
"knife_in_hand",
|
|
2687
|
-
"very_bloody",
|
|
2688
|
-
"other_blood",
|
|
2689
|
-
"hanging",
|
|
2690
|
-
"noose",
|
|
2691
|
-
"human_corpse",
|
|
2692
|
-
"yes_emaciated_body",
|
|
2693
|
-
"yes_self_harm",
|
|
2694
|
-
"garm_death_injury_or_military_conflict"
|
|
2695
|
-
];
|
|
2696
|
-
async function processConcurrently(items, processor, maxConcurrent = 5) {
|
|
2697
|
-
"use step";
|
|
2698
|
-
const results = [];
|
|
2699
|
-
for (let i = 0; i < items.length; i += maxConcurrent) {
|
|
2700
|
-
const batch = items.slice(i, i + maxConcurrent);
|
|
2701
|
-
const batchPromises = batch.map(processor);
|
|
2702
|
-
const batchResults = await Promise.all(batchPromises);
|
|
2703
|
-
results.push(...batchResults);
|
|
2928
|
+
/**
 * Score how good a break point after `cues[index]` would be.
 *
 * Scoring: +4 if the cue ends a sentence (else +2 if it ends a clause),
 * +2 if a pause of at least `boundaryPauseSeconds` follows, and +1 if the
 * next cue looks like the start of a new sentence. The final cue scores
 * positive infinity (always the best possible break).
 *
 * @param {Array<{text: string, startTime: number, endTime: number}>} cues - All cues.
 * @param {number} index - Candidate break position (break occurs after this cue).
 * @param {number} boundaryPauseSeconds - Minimum gap counted as a pause.
 * @returns {number} Boundary score.
 */
function scoreCueBoundary(cues, index, boundaryPauseSeconds) {
  const current = cues[index];
  const following = cues[index + 1];
  // The final cue is always a perfect place to break.
  if (!following) {
    return Number.POSITIVE_INFINITY;
  }
  const endingText = current.text.trim();
  let score = 0;
  // Sentence end takes precedence over a mere clause end.
  if (SENTENCE_BOUNDARY_REGEX.test(endingText)) {
    score += 4;
  } else if (CLAUSE_BOUNDARY_REGEX.test(endingText)) {
    score += 2;
  }
  const gapSeconds = following.startTime - current.endTime;
  if (gapSeconds >= boundaryPauseSeconds) {
    score += 2;
  }
  if (NEXT_SENTENCE_START_REGEX.test(following.text.trim())) {
    score += 1;
  }
  return score;
}
|
|
2803
|
-
function
|
|
2804
|
-
if (
|
|
2949
|
+
/**
 * Partition cues into index-range chunks bounded by a per-chunk cue count
 * and an optional approximate token budget. Returns ranges (indices/times)
 * rather than joined text.
 *
 * @param {Array<{text: string, startTime: number, endTime: number}>} cues - Parsed cues.
 * @param {{maxCuesPerChunk: number, maxTextTokensPerChunk?: number}} options - Budget options.
 * @returns {Array<{id: string, cueStartIndex: number, cueEndIndex: number, cueCount: number, startTime: number, endTime: number}>}
 */
function chunkVTTCuesByBudget(cues, options) {
  if (cues.length === 0) {
    return [];
  }
  const cueLimit = Math.max(1, options.maxCuesPerChunk);
  // A missing/zero token budget means "unbounded".
  const tokenLimit = options.maxTextTokensPerChunk
    ? Math.max(1, options.maxTextTokensPerChunk)
    : Number.POSITIVE_INFINITY;
  const chunks = [];
  const pushChunk = (start, end) => {
    chunks.push({
      id: `chunk-${chunks.length}`,
      cueStartIndex: start,
      cueEndIndex: end,
      cueCount: end - start + 1,
      startTime: cues[start].startTime,
      endTime: cues[end].endTime
    });
  };
  let start = 0;
  let tokensSoFar = 0;
  for (let i = 0; i < cues.length; i++) {
    const cueTokens = estimateTokenCount(cues[i].text);
    const cueCountSoFar = i - start;
    const overCueLimit = cueCountSoFar >= cueLimit;
    // Never emit an empty chunk just because one cue exceeds the token budget.
    const overTokenLimit = cueCountSoFar > 0 && tokensSoFar + cueTokens > tokenLimit;
    if (overCueLimit || overTokenLimit) {
      pushChunk(start, i - 1);
      start = i;
      tokensSoFar = 0;
    }
    tokensSoFar += cueTokens;
  }
  // Flush the final (always non-empty) chunk.
  pushChunk(start, cues.length - 1);
  return chunks;
}
|
|
2819
|
-
|
|
2820
|
-
|
|
2821
|
-
|
|
2822
|
-
if (!chunks.length) {
|
|
2823
|
-
return [
|
|
2824
|
-
{ url: "transcript:0", sexual: 0, violence: 0, error: true, errorMessage: "No transcript chunks to moderate" }
|
|
2825
|
-
];
|
|
2993
|
+
/**
 * Partition cues into chunks of roughly `targetChunkDurationSeconds`,
 * preferring to break at linguistically natural cue boundaries (as scored by
 * scoreCueBoundary) while respecting min/max duration limits.
 *
 * @param {Array<{text: string, startTime: number, endTime: number}>} cues - Parsed cues.
 * @param {object} options - targetChunkDurationSeconds, maxChunkDurationSeconds,
 *   and optional minChunkDurationSeconds / boundaryLookaheadCues / boundaryPauseSeconds.
 * @returns {Array<{id: string, cueStartIndex: number, cueEndIndex: number, cueCount: number, startTime: number, endTime: number}>}
 */
function chunkVTTCuesByDuration(cues, options) {
  if (cues.length === 0) {
    return [];
  }
  // Normalize durations so that 1 <= min <= target <= max always holds.
  const targetChunkDurationSeconds = Math.max(1, options.targetChunkDurationSeconds);
  const maxChunkDurationSeconds = Math.max(targetChunkDurationSeconds, options.maxChunkDurationSeconds);
  const minChunkDurationSeconds = Math.min(
    targetChunkDurationSeconds,
    Math.max(
      1,
      options.minChunkDurationSeconds ?? Math.floor(targetChunkDurationSeconds * DEFAULT_MIN_CHUNK_DURATION_RATIO)
    )
  );
  const boundaryLookaheadCues = Math.max(1, options.boundaryLookaheadCues ?? DEFAULT_BOUNDARY_LOOKAHEAD_CUES);
  const boundaryPauseSeconds = options.boundaryPauseSeconds ?? DEFAULT_BOUNDARY_PAUSE_SECONDS;
  // Boundaries inside [preferredBoundaryStartSeconds, target] are favored over
  // earlier ones, so chunks stay close to the target length.
  const preferredBoundaryStartSeconds = Math.max(
    minChunkDurationSeconds,
    targetChunkDurationSeconds - Math.min(PREFERRED_BOUNDARY_WINDOW_SECONDS, targetChunkDurationSeconds / 6)
  );
  const chunks = [];
  let chunkIndex = 0;
  let cueStartIndex = 0;
  while (cueStartIndex < cues.length) {
    const chunkStartTime = cues[cueStartIndex].startTime;
    let cueEndIndex = cueStartIndex;
    // Best break seen anywhere past the minimum duration, and best break seen
    // inside the preferred (near-target) window. Ties go to the later cue
    // because of the >= comparisons.
    let bestBoundaryIndex = -1;
    let bestBoundaryScore = -1;
    let bestPreferredBoundaryIndex = -1;
    let bestPreferredBoundaryScore = -1;
    while (cueEndIndex < cues.length) {
      const cue = cues[cueEndIndex];
      const currentDuration = cue.endTime - chunkStartTime;
      if (currentDuration >= minChunkDurationSeconds) {
        const boundaryScore = scoreCueBoundary(cues, cueEndIndex, boundaryPauseSeconds);
        if (boundaryScore >= bestBoundaryScore) {
          bestBoundaryIndex = cueEndIndex;
          bestBoundaryScore = boundaryScore;
        }
        if (currentDuration >= preferredBoundaryStartSeconds && boundaryScore >= bestPreferredBoundaryScore) {
          bestPreferredBoundaryIndex = cueEndIndex;
          bestPreferredBoundaryScore = boundaryScore;
        }
      }
      const nextCue = cues[cueEndIndex + 1];
      if (!nextCue) {
        // Last cue: the chunk ends here.
        break;
      }
      const nextDuration = nextCue.endTime - chunkStartTime;
      const lookaheadExceeded = cueEndIndex - cueStartIndex >= boundaryLookaheadCues;
      // Fall back to the overall best boundary when none was found in the
      // preferred window.
      const preferredBoundaryIndex = bestPreferredBoundaryIndex >= cueStartIndex ? bestPreferredBoundaryIndex : bestBoundaryIndex;
      const preferredBoundaryScore = bestPreferredBoundaryIndex >= cueStartIndex ? bestPreferredBoundaryScore : bestBoundaryScore;
      if (currentDuration >= targetChunkDurationSeconds) {
        // Past the target: take a strong boundary immediately...
        if (preferredBoundaryIndex >= cueStartIndex && preferredBoundaryScore >= STRONG_BOUNDARY_SCORE) {
          cueEndIndex = preferredBoundaryIndex;
          break;
        }
        // ...otherwise stop once extending further would bust the max duration
        // or the lookahead budget, settling for the best boundary found so far.
        if (nextDuration > maxChunkDurationSeconds || lookaheadExceeded) {
          cueEndIndex = preferredBoundaryIndex >= cueStartIndex ? preferredBoundaryIndex : cueEndIndex;
          break;
        }
      }
      // Even before the target, never let the next cue push past the max.
      if (nextDuration > maxChunkDurationSeconds) {
        cueEndIndex = preferredBoundaryIndex >= cueStartIndex ? preferredBoundaryIndex : cueEndIndex;
        break;
      }
      cueEndIndex++;
    }
    chunks.push({
      id: `chunk-${chunkIndex}`,
      cueStartIndex,
      cueEndIndex,
      cueCount: cueEndIndex - cueStartIndex + 1,
      startTime: cues[cueStartIndex].startTime,
      endTime: cues[cueEndIndex].endTime
    });
    cueStartIndex = cueEndIndex + 1;
    chunkIndex++;
  }
  return chunks;
}
|
|
2850
|
-
|
|
2851
|
-
|
|
2852
|
-
|
|
2853
|
-
|
|
2854
|
-
const formData = new FormData();
|
|
2855
|
-
if (entry.source.kind === "url") {
|
|
2856
|
-
formData.append("url", entry.source.value);
|
|
2857
|
-
} else {
|
|
2858
|
-
const extension = entry.source.contentType.split("/")[1] || "jpg";
|
|
2859
|
-
const blob = new Blob([entry.source.buffer], {
|
|
2860
|
-
type: entry.source.contentType
|
|
2861
|
-
});
|
|
2862
|
-
formData.append("media", blob, `thumbnail.${extension}`);
|
|
3073
|
+
function chunkText(text, strategy) {
|
|
3074
|
+
switch (strategy.type) {
|
|
3075
|
+
case "token": {
|
|
3076
|
+
return chunkByTokens(text, strategy.maxTokens, strategy.overlap ?? 0);
|
|
2863
3077
|
}
|
|
2864
|
-
|
|
2865
|
-
|
|
2866
|
-
|
|
2867
|
-
try {
|
|
2868
|
-
res = await fetch(HIVE_ENDPOINT, {
|
|
2869
|
-
method: "POST",
|
|
2870
|
-
headers: {
|
|
2871
|
-
Accept: "application/json",
|
|
2872
|
-
Authorization: `Token ${apiKey}`
|
|
2873
|
-
},
|
|
2874
|
-
body: formData,
|
|
2875
|
-
signal: controller.signal
|
|
2876
|
-
});
|
|
2877
|
-
} catch (err) {
|
|
2878
|
-
if (err?.name === "AbortError") {
|
|
2879
|
-
throw new Error("Hive request timed out after 15s");
|
|
2880
|
-
}
|
|
2881
|
-
throw err;
|
|
2882
|
-
} finally {
|
|
2883
|
-
clearTimeout(timeout);
|
|
3078
|
+
default: {
|
|
3079
|
+
const exhaustiveCheck = strategy;
|
|
3080
|
+
throw new Error(`Unsupported chunking strategy: ${exhaustiveCheck}`);
|
|
2884
3081
|
}
|
|
2885
|
-
|
|
2886
|
-
|
|
2887
|
-
|
|
2888
|
-
|
|
2889
|
-
|
|
3082
|
+
}
|
|
3083
|
+
}
|
|
3084
|
+
|
|
3085
|
+
// src/workflows/embeddings.ts
|
|
3086
|
+
function averageEmbeddings(embeddings) {
|
|
3087
|
+
if (embeddings.length === 0) {
|
|
3088
|
+
return [];
|
|
3089
|
+
}
|
|
3090
|
+
const dimensions = embeddings[0].length;
|
|
3091
|
+
const averaged = Array.from({ length: dimensions }, () => 0);
|
|
3092
|
+
for (const embedding of embeddings) {
|
|
3093
|
+
for (let i = 0; i < dimensions; i++) {
|
|
3094
|
+
averaged[i] += embedding[i];
|
|
2890
3095
|
}
|
|
2891
|
-
if (json?.return_code != null && json.return_code !== 0) {
|
|
2892
|
-
throw new Error(
|
|
2893
|
-
`Hive API error (return_code ${json.return_code}): ${json.message || "Unknown error"}`
|
|
2894
|
-
);
|
|
2895
|
-
}
|
|
2896
|
-
const classes = json?.status?.[0]?.response?.output?.[0]?.classes;
|
|
2897
|
-
if (!Array.isArray(classes)) {
|
|
2898
|
-
throw new TypeError(
|
|
2899
|
-
`Unexpected Hive response structure: ${JSON.stringify(json)}`
|
|
2900
|
-
);
|
|
2901
|
-
}
|
|
2902
|
-
const sexual = getHiveCategoryScores(classes, HIVE_SEXUAL_CATEGORIES);
|
|
2903
|
-
const violence = getHiveCategoryScores(classes, HIVE_VIOLENCE_CATEGORIES);
|
|
2904
|
-
return {
|
|
2905
|
-
url: entry.url,
|
|
2906
|
-
time: entry.time,
|
|
2907
|
-
sexual,
|
|
2908
|
-
violence,
|
|
2909
|
-
error: false
|
|
2910
|
-
};
|
|
2911
|
-
} catch (error) {
|
|
2912
|
-
return {
|
|
2913
|
-
url: entry.url,
|
|
2914
|
-
time: entry.time,
|
|
2915
|
-
sexual: 0,
|
|
2916
|
-
violence: 0,
|
|
2917
|
-
error: true,
|
|
2918
|
-
errorMessage: error instanceof Error ? error.message : String(error)
|
|
2919
|
-
};
|
|
2920
3096
|
}
|
|
3097
|
+
for (let i = 0; i < dimensions; i++) {
|
|
3098
|
+
averaged[i] /= embeddings.length;
|
|
3099
|
+
}
|
|
3100
|
+
return averaged;
|
|
2921
3101
|
}
|
|
2922
|
-
async function
|
|
2923
|
-
|
|
2924
|
-
|
|
2925
|
-
|
|
2926
|
-
|
|
2927
|
-
|
|
2928
|
-
time: timeByUrl.get(img.url),
|
|
2929
|
-
source: {
|
|
2930
|
-
kind: "file",
|
|
2931
|
-
buffer: img.buffer,
|
|
2932
|
-
contentType: img.contentType
|
|
2933
|
-
},
|
|
2934
|
-
credentials
|
|
2935
|
-
})) : images.map((img) => ({
|
|
2936
|
-
url: img.url,
|
|
2937
|
-
time: img.time,
|
|
2938
|
-
source: { kind: "url", value: img.url },
|
|
2939
|
-
credentials
|
|
2940
|
-
}));
|
|
2941
|
-
return await processConcurrently(targets, moderateImageWithHive, maxConcurrent);
|
|
2942
|
-
}
|
|
2943
|
-
async function getThumbnailUrlsFromTimestamps(playbackId, timestampsMs, options) {
|
|
3102
|
+
async function generateSingleChunkEmbedding({
|
|
3103
|
+
chunk,
|
|
3104
|
+
provider,
|
|
3105
|
+
modelId,
|
|
3106
|
+
credentials
|
|
3107
|
+
}) {
|
|
2944
3108
|
"use step";
|
|
2945
|
-
const
|
|
2946
|
-
const
|
|
2947
|
-
|
|
2948
|
-
|
|
2949
|
-
|
|
2950
|
-
|
|
2951
|
-
|
|
2952
|
-
return
|
|
3109
|
+
const model = await createEmbeddingModelFromConfig(provider, modelId, credentials);
|
|
3110
|
+
const response = await withRetry(
|
|
3111
|
+
() => embed({
|
|
3112
|
+
model,
|
|
3113
|
+
value: chunk.text
|
|
3114
|
+
})
|
|
3115
|
+
);
|
|
3116
|
+
return {
|
|
3117
|
+
chunkId: chunk.id,
|
|
3118
|
+
embedding: response.embedding,
|
|
3119
|
+
metadata: {
|
|
3120
|
+
startTime: chunk.startTime,
|
|
3121
|
+
endTime: chunk.endTime,
|
|
3122
|
+
tokenCount: chunk.tokenCount
|
|
3123
|
+
}
|
|
3124
|
+
};
|
|
2953
3125
|
}
|
|
2954
|
-
async function
|
|
2955
|
-
"use workflow";
|
|
3126
|
+
async function generateEmbeddingsInternal(assetId, options = {}) {
|
|
2956
3127
|
const {
|
|
2957
|
-
provider =
|
|
2958
|
-
model
|
|
3128
|
+
provider = "openai",
|
|
3129
|
+
model,
|
|
2959
3130
|
languageCode,
|
|
2960
|
-
|
|
2961
|
-
|
|
2962
|
-
|
|
2963
|
-
maxSamples,
|
|
2964
|
-
maxConcurrent = 5,
|
|
2965
|
-
imageSubmissionMode = "url",
|
|
2966
|
-
imageDownloadOptions,
|
|
2967
|
-
credentials: providedCredentials
|
|
3131
|
+
chunkingStrategy = { type: "token", maxTokens: 500, overlap: 100 },
|
|
3132
|
+
batchSize = 5,
|
|
3133
|
+
credentials
|
|
2968
3134
|
} = options;
|
|
2969
|
-
const
|
|
2970
|
-
const { asset, playbackId, policy } = await getPlaybackIdForAsset(assetId, credentials);
|
|
2971
|
-
const
|
|
2972
|
-
const
|
|
2973
|
-
const assetDurationSeconds = getAssetDurationSecondsFromAsset(asset);
|
|
2974
|
-
const candidateDurations = [videoTrackDurationSeconds, assetDurationSeconds].filter(
|
|
2975
|
-
(d) => d != null
|
|
2976
|
-
);
|
|
2977
|
-
const duration = candidateDurations.length > 0 ? Math.min(...candidateDurations) : 0;
|
|
2978
|
-
const isAudioOnly = isAudioOnlyAsset(asset);
|
|
3135
|
+
const embeddingModel = resolveEmbeddingModelConfig({ ...options, provider, model });
|
|
3136
|
+
const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, credentials);
|
|
3137
|
+
const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
|
|
3138
|
+
const isAudioOnly = isAudioOnlyAsset(assetData);
|
|
2979
3139
|
const signingContext = await resolveMuxSigningContext(credentials);
|
|
2980
3140
|
if (policy === "signed" && !signingContext) {
|
|
2981
3141
|
throw new Error(
|
|
2982
3142
|
"Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
|
|
2983
3143
|
);
|
|
2984
3144
|
}
|
|
2985
|
-
|
|
2986
|
-
|
|
2987
|
-
let
|
|
2988
|
-
|
|
2989
|
-
|
|
2990
|
-
|
|
2991
|
-
|
|
2992
|
-
|
|
2993
|
-
|
|
3145
|
+
const readyTextTracks = getReadyTextTracks(assetData);
|
|
3146
|
+
const useVttChunking = chunkingStrategy.type === "vtt";
|
|
3147
|
+
let transcriptResult = await fetchTranscriptForAsset(assetData, playbackId, {
|
|
3148
|
+
languageCode,
|
|
3149
|
+
cleanTranscript: !useVttChunking,
|
|
3150
|
+
shouldSign: policy === "signed",
|
|
3151
|
+
credentials
|
|
3152
|
+
});
|
|
3153
|
+
if (isAudioOnly && !transcriptResult.track && readyTextTracks.length === 1) {
|
|
3154
|
+
transcriptResult = await fetchTranscriptForAsset(assetData, playbackId, {
|
|
3155
|
+
cleanTranscript: !useVttChunking,
|
|
2994
3156
|
shouldSign: policy === "signed",
|
|
2995
|
-
credentials
|
|
2996
|
-
required: true
|
|
3157
|
+
credentials
|
|
2997
3158
|
});
|
|
2998
|
-
|
|
2999
|
-
|
|
3000
|
-
|
|
3001
|
-
|
|
3002
|
-
|
|
3003
|
-
|
|
3004
|
-
});
|
|
3005
|
-
}
|
|
3006
|
-
if (provider === "openai") {
|
|
3007
|
-
thumbnailScores = await requestOpenAITranscriptModeration(
|
|
3008
|
-
transcriptResult.transcriptText,
|
|
3009
|
-
model || "omni-moderation-latest",
|
|
3010
|
-
maxConcurrent,
|
|
3011
|
-
credentials
|
|
3159
|
+
}
|
|
3160
|
+
if (!transcriptResult.track || !transcriptResult.transcriptText) {
|
|
3161
|
+
const availableLanguages = readyTextTracks.map((t) => t.language_code).filter(Boolean).join(", ");
|
|
3162
|
+
if (isAudioOnly) {
|
|
3163
|
+
throw new Error(
|
|
3164
|
+
`No transcript track found${languageCode ? ` for language '${languageCode}'` : ""}. Audio-only assets require a transcript. Available languages: ${availableLanguages || "none"}`
|
|
3012
3165
|
);
|
|
3013
|
-
} else if (provider === "hive") {
|
|
3014
|
-
throw new Error("Hive does not support transcript moderation in this workflow. Use provider: 'openai' for audio-only assets.");
|
|
3015
|
-
} else {
|
|
3016
|
-
throw new Error(`Unsupported moderation provider: ${provider}`);
|
|
3017
3166
|
}
|
|
3018
|
-
|
|
3019
|
-
|
|
3020
|
-
// Generate thumbnail URLs (signed if needed) using existing interval-based logic.
|
|
3021
|
-
await getThumbnailUrls(playbackId, duration, {
|
|
3022
|
-
interval: thumbnailInterval,
|
|
3023
|
-
width: thumbnailWidth,
|
|
3024
|
-
shouldSign: policy === "signed",
|
|
3025
|
-
credentials
|
|
3026
|
-
})
|
|
3027
|
-
) : (
|
|
3028
|
-
// In maxSamples mode, sample valid timestamps over the trimmed usable span.
|
|
3029
|
-
// Use proportional trims (≈ duration/6, capped at 5s) to stay well inside the
|
|
3030
|
-
// renderable range — Mux can't always serve thumbnails at the very edges.
|
|
3031
|
-
await getThumbnailUrlsFromTimestamps(
|
|
3032
|
-
playbackId,
|
|
3033
|
-
planSamplingTimestamps({
|
|
3034
|
-
duration_sec: duration,
|
|
3035
|
-
max_candidates: maxSamples,
|
|
3036
|
-
trim_start_sec: duration > 2 ? Math.min(5, Math.max(1, duration / 6)) : 0,
|
|
3037
|
-
trim_end_sec: duration > 2 ? Math.min(5, Math.max(1, duration / 6)) : 0,
|
|
3038
|
-
fps: videoTrackFps,
|
|
3039
|
-
base_cadence_hz: thumbnailInterval > 0 ? 1 / thumbnailInterval : void 0
|
|
3040
|
-
}),
|
|
3041
|
-
{
|
|
3042
|
-
width: thumbnailWidth,
|
|
3043
|
-
shouldSign: policy === "signed",
|
|
3044
|
-
credentials
|
|
3045
|
-
}
|
|
3046
|
-
)
|
|
3167
|
+
throw new Error(
|
|
3168
|
+
`No caption track found${languageCode ? ` for language '${languageCode}'` : ""}. Available languages: ${availableLanguages || "none"}`
|
|
3047
3169
|
);
|
|
3048
|
-
|
|
3049
|
-
|
|
3050
|
-
|
|
3051
|
-
|
|
3052
|
-
|
|
3053
|
-
|
|
3054
|
-
|
|
3055
|
-
|
|
3056
|
-
|
|
3057
|
-
|
|
3058
|
-
|
|
3059
|
-
|
|
3060
|
-
|
|
3061
|
-
|
|
3062
|
-
|
|
3063
|
-
|
|
3064
|
-
|
|
3170
|
+
}
|
|
3171
|
+
const transcriptText = transcriptResult.transcriptText;
|
|
3172
|
+
if (!transcriptText.trim()) {
|
|
3173
|
+
throw new Error("Transcript is empty");
|
|
3174
|
+
}
|
|
3175
|
+
const chunks = useVttChunking ? chunkVTTCues(
|
|
3176
|
+
parseVTTCues(transcriptText),
|
|
3177
|
+
chunkingStrategy.maxTokens,
|
|
3178
|
+
chunkingStrategy.overlapCues
|
|
3179
|
+
) : chunkText(transcriptText, chunkingStrategy);
|
|
3180
|
+
if (chunks.length === 0) {
|
|
3181
|
+
throw new Error("No chunks generated from transcript");
|
|
3182
|
+
}
|
|
3183
|
+
const chunkEmbeddings = [];
|
|
3184
|
+
try {
|
|
3185
|
+
for (let i = 0; i < chunks.length; i += batchSize) {
|
|
3186
|
+
const batch = chunks.slice(i, i + batchSize);
|
|
3187
|
+
const batchResults = await Promise.all(
|
|
3188
|
+
batch.map(
|
|
3189
|
+
(chunk) => generateSingleChunkEmbedding({
|
|
3190
|
+
chunk,
|
|
3191
|
+
provider: embeddingModel.provider,
|
|
3192
|
+
modelId: embeddingModel.modelId,
|
|
3193
|
+
credentials
|
|
3194
|
+
})
|
|
3195
|
+
)
|
|
3065
3196
|
);
|
|
3066
|
-
|
|
3067
|
-
throw new Error(`Unsupported moderation provider: ${provider}`);
|
|
3197
|
+
chunkEmbeddings.push(...batchResults);
|
|
3068
3198
|
}
|
|
3069
|
-
}
|
|
3070
|
-
const failed = thumbnailScores.filter((s) => s.error);
|
|
3071
|
-
if (failed.length > 0) {
|
|
3072
|
-
const details = failed.map((s) => `${s.url}: ${s.errorMessage || "Unknown error"}`).join("; ");
|
|
3199
|
+
} catch (error) {
|
|
3073
3200
|
throw new Error(
|
|
3074
|
-
`
|
|
3201
|
+
`Failed to generate embeddings with ${provider}: ${error instanceof Error ? error.message : "Unknown error"}`
|
|
3075
3202
|
);
|
|
3076
3203
|
}
|
|
3077
|
-
|
|
3078
|
-
|
|
3079
|
-
|
|
3204
|
+
if (chunkEmbeddings.length === 0) {
|
|
3205
|
+
throw new Error("No embeddings generated");
|
|
3206
|
+
}
|
|
3207
|
+
const averagedEmbedding = averageEmbeddings(chunkEmbeddings.map((ce) => ce.embedding));
|
|
3208
|
+
const totalTokens = chunks.reduce((sum, chunk) => sum + chunk.tokenCount, 0);
|
|
3080
3209
|
return {
|
|
3081
3210
|
assetId,
|
|
3082
|
-
|
|
3083
|
-
|
|
3084
|
-
|
|
3211
|
+
chunks: chunkEmbeddings,
|
|
3212
|
+
averagedEmbedding,
|
|
3213
|
+
provider,
|
|
3214
|
+
model: embeddingModel.modelId,
|
|
3215
|
+
metadata: {
|
|
3216
|
+
totalChunks: chunks.length,
|
|
3217
|
+
totalTokens,
|
|
3218
|
+
chunkingStrategy: JSON.stringify(chunkingStrategy),
|
|
3219
|
+
embeddingDimensions: chunkEmbeddings[0].embedding.length,
|
|
3220
|
+
generatedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
3221
|
+
},
|
|
3085
3222
|
usage: {
|
|
3086
3223
|
metadata: {
|
|
3087
|
-
assetDurationSeconds
|
|
3088
|
-
...thumbnailCount === void 0 ? {} : { thumbnailCount }
|
|
3224
|
+
assetDurationSeconds
|
|
3089
3225
|
}
|
|
3090
|
-
}
|
|
3091
|
-
maxScores: {
|
|
3092
|
-
sexual: maxSexual,
|
|
3093
|
-
violence: maxViolence
|
|
3094
|
-
},
|
|
3095
|
-
exceedsThreshold: maxSexual > finalThresholds.sexual || maxViolence > finalThresholds.violence,
|
|
3096
|
-
thresholds: finalThresholds
|
|
3226
|
+
}
|
|
3097
3227
|
};
|
|
3098
3228
|
}
|
|
3229
|
+
async function generateEmbeddings(assetId, options = {}) {
|
|
3230
|
+
"use workflow";
|
|
3231
|
+
return generateEmbeddingsInternal(assetId, options);
|
|
3232
|
+
}
|
|
3233
|
+
async function generateVideoEmbeddings(assetId, options = {}) {
|
|
3234
|
+
"use workflow";
|
|
3235
|
+
console.warn("generateVideoEmbeddings is deprecated. Use generateEmbeddings instead.");
|
|
3236
|
+
return generateEmbeddingsInternal(assetId, options);
|
|
3237
|
+
}
|
|
3099
3238
|
|
|
3100
|
-
// src/
|
|
3101
|
-
|
|
3102
|
-
|
|
3103
|
-
|
|
3104
|
-
|
|
3105
|
-
|
|
3106
|
-
|
|
3107
|
-
|
|
3108
|
-
|
|
3109
|
-
|
|
3110
|
-
|
|
3111
|
-
|
|
3112
|
-
|
|
3113
|
-
|
|
3114
|
-
|
|
3115
|
-
|
|
3116
|
-
|
|
3117
|
-
|
|
3118
|
-
|
|
3119
|
-
|
|
3120
|
-
|
|
3121
|
-
|
|
3122
|
-
|
|
3123
|
-
const
|
|
3124
|
-
|
|
3125
|
-
|
|
3126
|
-
|
|
3127
|
-
|
|
3128
|
-
|
|
3129
|
-
|
|
3130
|
-
|
|
3131
|
-
|
|
3132
|
-
|
|
3133
|
-
|
|
3134
|
-
|
|
3135
|
-
|
|
3136
|
-
|
|
3137
|
-
|
|
3138
|
-
|
|
3139
|
-
|
|
3140
|
-
|
|
3141
|
-
|
|
3142
|
-
|
|
3143
|
-
|
|
3144
|
-
|
|
3145
|
-
|
|
3146
|
-
|
|
3147
|
-
|
|
3148
|
-
|
|
3149
|
-
|
|
3150
|
-
|
|
3151
|
-
|
|
3152
|
-
- Actions and activities being performed
|
|
3153
|
-
- Setting and environment
|
|
3154
|
-
- Notable objects or tools
|
|
3155
|
-
- Style or genre (if applicable)
|
|
3156
|
-
Prefer concrete nouns and action verbs over abstract concepts.
|
|
3157
|
-
Use lowercase. Avoid redundant or overly generic terms like "video" or "content".`
|
|
3158
|
-
},
|
|
3159
|
-
qualityGuidelines: {
|
|
3160
|
-
tag: "quality_guidelines",
|
|
3161
|
-
content: dedent4`
|
|
3162
|
-
- Examine all frames to understand the full context and progression
|
|
3163
|
-
- Be precise: "golden retriever" is better than "dog" when identifiable
|
|
3164
|
-
- Capture the narrative: what begins, develops, and concludes
|
|
3165
|
-
- Balance brevity with informativeness`
|
|
3239
|
+
// src/lib/sampling-plan.ts
|
|
3240
|
+
var DEFAULT_FPS = 30;
|
|
3241
|
+
function roundToNearestFrameMs(tsMs, fps = DEFAULT_FPS) {
|
|
3242
|
+
const frameMs = 1e3 / fps;
|
|
3243
|
+
return Math.round(Math.round(tsMs / frameMs) * frameMs * 100) / 100;
|
|
3244
|
+
}
|
|
3245
|
+
function planSamplingTimestamps(options) {
|
|
3246
|
+
const DEFAULT_MIN_CANDIDATES = 10;
|
|
3247
|
+
const DEFAULT_MAX_CANDIDATES = 30;
|
|
3248
|
+
const {
|
|
3249
|
+
duration_sec,
|
|
3250
|
+
min_candidates = DEFAULT_MIN_CANDIDATES,
|
|
3251
|
+
max_candidates = DEFAULT_MAX_CANDIDATES,
|
|
3252
|
+
trim_start_sec = 1,
|
|
3253
|
+
trim_end_sec = 1,
|
|
3254
|
+
fps = DEFAULT_FPS,
|
|
3255
|
+
base_cadence_hz,
|
|
3256
|
+
anchor_percents = [0.2, 0.5, 0.8],
|
|
3257
|
+
anchor_window_sec = 1.5
|
|
3258
|
+
} = options;
|
|
3259
|
+
const usableSec = Math.max(0, duration_sec - (trim_start_sec + trim_end_sec));
|
|
3260
|
+
if (usableSec <= 0)
|
|
3261
|
+
return [];
|
|
3262
|
+
const cadenceHz = base_cadence_hz ?? (duration_sec < 15 ? 3 : duration_sec < 60 ? 2 : duration_sec < 180 ? 1.5 : 1);
|
|
3263
|
+
let target = Math.round(usableSec * cadenceHz);
|
|
3264
|
+
target = Math.max(min_candidates, Math.min(max_candidates, target));
|
|
3265
|
+
const stepSec = usableSec / target;
|
|
3266
|
+
const t0 = trim_start_sec;
|
|
3267
|
+
const base = [];
|
|
3268
|
+
for (let i = 0; i < target; i++) {
|
|
3269
|
+
const tsSec = t0 + (i + 0.5) * stepSec;
|
|
3270
|
+
base.push(tsSec * 1e3);
|
|
3271
|
+
}
|
|
3272
|
+
const slack = Math.max(0, max_candidates - base.length);
|
|
3273
|
+
const extra = [];
|
|
3274
|
+
if (slack > 0 && anchor_percents.length > 0) {
|
|
3275
|
+
const perAnchor = Math.max(1, Math.min(5, Math.floor(slack / anchor_percents.length)));
|
|
3276
|
+
for (const p of anchor_percents) {
|
|
3277
|
+
const centerSec = Math.min(
|
|
3278
|
+
t0 + usableSec - 1e-3,
|
|
3279
|
+
// nudge just inside the end bound
|
|
3280
|
+
Math.max(t0 + 1e-3, duration_sec * p)
|
|
3281
|
+
// nudge just inside the start bound
|
|
3282
|
+
);
|
|
3283
|
+
const startSec = Math.max(t0, centerSec - anchor_window_sec / 2);
|
|
3284
|
+
const endSec = Math.min(t0 + usableSec, centerSec + anchor_window_sec / 2);
|
|
3285
|
+
if (endSec <= startSec)
|
|
3286
|
+
continue;
|
|
3287
|
+
const wStep = (endSec - startSec) / perAnchor;
|
|
3288
|
+
for (let i = 0; i < perAnchor; i++) {
|
|
3289
|
+
const tsSec = startSec + (i + 0.5) * wStep;
|
|
3290
|
+
extra.push(tsSec * 1e3);
|
|
3166
3291
|
}
|
|
3167
|
-
}
|
|
3168
|
-
|
|
3169
|
-
|
|
3292
|
+
}
|
|
3293
|
+
}
|
|
3294
|
+
const all = base.concat(extra).map((ms) => roundToNearestFrameMs(ms, fps)).filter((ms) => ms >= trim_start_sec * 1e3 && ms <= (duration_sec - trim_end_sec) * 1e3);
|
|
3295
|
+
const uniqSorted = Array.from(new Set(all)).sort((a, b) => a - b);
|
|
3296
|
+
return uniqSorted.slice(0, max_candidates);
|
|
3170
3297
|
}
|
|
3171
|
-
|
|
3172
|
-
|
|
3173
|
-
|
|
3174
|
-
|
|
3175
|
-
|
|
3176
|
-
|
|
3177
|
-
|
|
3178
|
-
|
|
3179
|
-
|
|
3180
|
-
|
|
3181
|
-
|
|
3182
|
-
|
|
3183
|
-
|
|
3184
|
-
|
|
3185
|
-
|
|
3186
|
-
|
|
3187
|
-
|
|
3188
|
-
|
|
3189
|
-
|
|
3190
|
-
|
|
3191
|
-
|
|
3192
|
-
|
|
3193
|
-
|
|
3194
|
-
Write in present tense. Be specific about what is discussed or presented rather than making assumptions.
|
|
3195
|
-
Focus on the spoken content and any key insights, dialogue, or narrative elements.`
|
|
3196
|
-
},
|
|
3197
|
-
keywords: {
|
|
3198
|
-
tag: "keywords_requirements",
|
|
3199
|
-
content: dedent4`
|
|
3200
|
-
Specific, searchable terms (up to ${keywordLimit}) that capture:
|
|
3201
|
-
- Primary topics and themes
|
|
3202
|
-
- Speakers or presenters (if named)
|
|
3203
|
-
- Key concepts and terminology
|
|
3204
|
-
- Content type (interview, lecture, music, etc.)
|
|
3205
|
-
- Genre or style (if applicable)
|
|
3206
|
-
Prefer concrete nouns and relevant terms over abstract concepts.
|
|
3207
|
-
Use lowercase. Avoid redundant or overly generic terms like "audio" or "content".`
|
|
3208
|
-
},
|
|
3209
|
-
qualityGuidelines: {
|
|
3210
|
-
tag: "quality_guidelines",
|
|
3211
|
-
content: dedent4`
|
|
3212
|
-
- Analyze the full transcript to understand context and themes
|
|
3213
|
-
- Be precise: use specific terminology when mentioned
|
|
3214
|
-
- Capture the narrative: what is introduced, discussed, and concluded
|
|
3215
|
-
- Balance brevity with informativeness`
|
|
3298
|
+
|
|
3299
|
+
// src/primitives/thumbnails.ts
|
|
3300
|
+
async function getThumbnailUrls(playbackId, duration, options = {}) {
|
|
3301
|
+
"use step";
|
|
3302
|
+
const { interval = 10, width = 640, shouldSign = false, maxSamples, credentials } = options;
|
|
3303
|
+
let timestamps = [];
|
|
3304
|
+
if (duration <= 50) {
|
|
3305
|
+
const spacing = duration / 6;
|
|
3306
|
+
for (let i = 1; i <= 5; i++) {
|
|
3307
|
+
timestamps.push(Math.round(i * spacing));
|
|
3308
|
+
}
|
|
3309
|
+
} else {
|
|
3310
|
+
for (let time = 0; time < duration; time += interval) {
|
|
3311
|
+
timestamps.push(time);
|
|
3312
|
+
}
|
|
3313
|
+
}
|
|
3314
|
+
if (maxSamples !== void 0 && timestamps.length > maxSamples) {
|
|
3315
|
+
const newTimestamps = [];
|
|
3316
|
+
newTimestamps.push(0);
|
|
3317
|
+
if (maxSamples >= 2) {
|
|
3318
|
+
const spacing = duration / (maxSamples - 1);
|
|
3319
|
+
for (let i = 1; i < maxSamples - 1; i++) {
|
|
3320
|
+
newTimestamps.push(spacing * i);
|
|
3216
3321
|
}
|
|
3217
|
-
|
|
3218
|
-
|
|
3322
|
+
newTimestamps.push(duration);
|
|
3323
|
+
}
|
|
3324
|
+
timestamps = newTimestamps;
|
|
3325
|
+
}
|
|
3326
|
+
const baseUrl = getMuxThumbnailBaseUrl(playbackId);
|
|
3327
|
+
const urlPromises = timestamps.map(async (time) => {
|
|
3328
|
+
const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
|
|
3329
|
+
return { url, time };
|
|
3219
3330
|
});
|
|
3331
|
+
return Promise.all(urlPromises);
|
|
3220
3332
|
}
|
|
3221
|
-
var SYSTEM_PROMPT3 = dedent4`
|
|
3222
|
-
<role>
|
|
3223
|
-
You are a video content analyst specializing in storyboard interpretation and multimodal analysis.
|
|
3224
|
-
</role>
|
|
3225
|
-
|
|
3226
|
-
<context>
|
|
3227
|
-
You receive storyboard images containing multiple sequential frames extracted from a video.
|
|
3228
|
-
These frames are arranged in a grid and represent the visual progression of the content over time.
|
|
3229
|
-
Read frames left-to-right, top-to-bottom to understand the temporal sequence.
|
|
3230
|
-
</context>
|
|
3231
|
-
|
|
3232
|
-
<transcript_guidance>
|
|
3233
|
-
When a transcript is provided alongside the storyboard:
|
|
3234
|
-
- Use it to understand spoken content, dialogue, narration, and audio context
|
|
3235
|
-
- Correlate transcript content with visual frames to build a complete picture
|
|
3236
|
-
- Extract key terminology, names, and specific language used by speakers
|
|
3237
|
-
- Let the transcript inform keyword selection, especially for topics not visually obvious
|
|
3238
|
-
- Prioritize visual content for the description, but enrich it with transcript insights
|
|
3239
|
-
- If transcript and visuals conflict, trust the visual evidence
|
|
3240
|
-
</transcript_guidance>
|
|
3241
|
-
|
|
3242
|
-
<capabilities>
|
|
3243
|
-
- Extract meaning from visual sequences
|
|
3244
|
-
- Identify subjects, actions, settings, and narrative arcs
|
|
3245
|
-
- Generate accurate, searchable metadata
|
|
3246
|
-
- Synthesize visual and transcript information when provided
|
|
3247
|
-
</capabilities>
|
|
3248
|
-
|
|
3249
|
-
<constraints>
|
|
3250
|
-
- Only describe what is clearly observable in the frames or explicitly stated in the transcript
|
|
3251
|
-
- Do not fabricate details or make unsupported assumptions
|
|
3252
|
-
- Return structured data matching the requested schema
|
|
3253
|
-
- Output only the JSON object; no markdown or extra text
|
|
3254
|
-
- When a <language> section is provided, all output text MUST be written in that language
|
|
3255
|
-
</constraints>
|
|
3256
|
-
|
|
3257
|
-
<tone_guidance>
|
|
3258
|
-
Pay special attention to the <tone> section and lean heavily into those instructions.
|
|
3259
|
-
Adapt your entire analysis and writing style to match the specified tone - this should influence
|
|
3260
|
-
your word choice, personality, formality level, and overall presentation of the content.
|
|
3261
|
-
The tone instructions are not suggestions but core requirements for how you should express yourself.
|
|
3262
|
-
</tone_guidance>
|
|
3263
|
-
|
|
3264
|
-
<language_guidelines>
|
|
3265
|
-
AVOID these meta-descriptive phrases that reference the medium rather than the content:
|
|
3266
|
-
- "The image shows..." / "The storyboard shows..."
|
|
3267
|
-
- "In this video..." / "This video features..."
|
|
3268
|
-
- "The frames depict..." / "The footage shows..."
|
|
3269
|
-
- "We can see..." / "You can see..."
|
|
3270
|
-
- "The clip shows..." / "The scene shows..."
|
|
3271
|
-
|
|
3272
|
-
INSTEAD, describe the content directly:
|
|
3273
|
-
- BAD: "The video shows a chef preparing a meal"
|
|
3274
|
-
- GOOD: "A chef prepares a meal in a professional kitchen"
|
|
3275
3333
|
|
|
3276
|
-
|
|
3277
|
-
|
|
3278
|
-
|
|
3279
|
-
|
|
3280
|
-
|
|
3281
|
-
|
|
3282
|
-
|
|
3283
|
-
|
|
3284
|
-
|
|
3285
|
-
|
|
3286
|
-
|
|
3287
|
-
|
|
3288
|
-
|
|
3289
|
-
|
|
3290
|
-
|
|
3291
|
-
|
|
3292
|
-
|
|
3293
|
-
|
|
3294
|
-
|
|
3295
|
-
|
|
3296
|
-
|
|
3297
|
-
|
|
3298
|
-
|
|
3299
|
-
|
|
3300
|
-
|
|
3301
|
-
|
|
3302
|
-
|
|
3303
|
-
|
|
3304
|
-
|
|
3305
|
-
|
|
3306
|
-
|
|
3307
|
-
|
|
3308
|
-
|
|
3309
|
-
|
|
3310
|
-
|
|
3311
|
-
|
|
3312
|
-
<tone_guidance>
|
|
3313
|
-
Pay special attention to the <tone> section and lean heavily into those instructions.
|
|
3314
|
-
Adapt your entire analysis and writing style to match the specified tone - this should influence
|
|
3315
|
-
your word choice, personality, formality level, and overall presentation of the content.
|
|
3316
|
-
The tone instructions are not suggestions but core requirements for how you should express yourself.
|
|
3317
|
-
</tone_guidance>
|
|
3318
|
-
|
|
3319
|
-
<language_guidelines>
|
|
3320
|
-
AVOID these meta-descriptive phrases that reference the medium rather than the content:
|
|
3321
|
-
- "The audio shows..." / "The transcript shows..."
|
|
3322
|
-
- "In this recording..." / "This audio features..."
|
|
3323
|
-
- "The speaker says..." / "We can hear..."
|
|
3324
|
-
- "The clip contains..." / "The recording shows..."
|
|
3325
|
-
|
|
3326
|
-
INSTEAD, describe the content directly:
|
|
3327
|
-
- BAD: "The audio features a discussion about climate change"
|
|
3328
|
-
- GOOD: "A panel discusses climate change impacts and solutions"
|
|
3329
|
-
|
|
3330
|
-
Write as if describing reality, not describing a recording of reality.
|
|
3331
|
-
</language_guidelines>`;
|
|
3332
|
-
function buildUserPrompt4({
|
|
3333
|
-
tone,
|
|
3334
|
-
transcriptText,
|
|
3335
|
-
isCleanTranscript = true,
|
|
3336
|
-
promptOverrides,
|
|
3337
|
-
isAudioOnly = false,
|
|
3338
|
-
titleLength,
|
|
3339
|
-
descriptionLength,
|
|
3340
|
-
tagCount,
|
|
3341
|
-
languageName
|
|
3342
|
-
}) {
|
|
3343
|
-
const contextSections = [createToneSection(TONE_INSTRUCTIONS[tone])];
|
|
3344
|
-
if (languageName) {
|
|
3345
|
-
contextSections.push(createLanguageSection(languageName));
|
|
3346
|
-
}
|
|
3347
|
-
if (transcriptText) {
|
|
3348
|
-
const format = isCleanTranscript ? "plain text" : "WebVTT";
|
|
3349
|
-
contextSections.push(createTranscriptSection(transcriptText, format));
|
|
3334
|
+
// src/workflows/moderation.ts
|
|
3335
|
+
var DEFAULT_THRESHOLDS = {
|
|
3336
|
+
sexual: 0.8,
|
|
3337
|
+
violence: 0.8
|
|
3338
|
+
};
|
|
3339
|
+
var DEFAULT_PROVIDER2 = "openai";
|
|
3340
|
+
var HIVE_ENDPOINT = "https://api.thehive.ai/api/v2/task/sync";
|
|
3341
|
+
var HIVE_SEXUAL_CATEGORIES = [
|
|
3342
|
+
"general_nsfw",
|
|
3343
|
+
"yes_sexual_activity",
|
|
3344
|
+
"yes_sex_toy",
|
|
3345
|
+
"yes_female_nudity",
|
|
3346
|
+
"yes_male_nudity"
|
|
3347
|
+
];
|
|
3348
|
+
var HIVE_VIOLENCE_CATEGORIES = [
|
|
3349
|
+
"gun_in_hand",
|
|
3350
|
+
"gun_not_in_hand",
|
|
3351
|
+
"knife_in_hand",
|
|
3352
|
+
"very_bloody",
|
|
3353
|
+
"other_blood",
|
|
3354
|
+
"hanging",
|
|
3355
|
+
"noose",
|
|
3356
|
+
"human_corpse",
|
|
3357
|
+
"yes_emaciated_body",
|
|
3358
|
+
"yes_self_harm",
|
|
3359
|
+
"garm_death_injury_or_military_conflict"
|
|
3360
|
+
];
|
|
3361
|
+
async function processConcurrently(items, processor, maxConcurrent = 5) {
|
|
3362
|
+
"use step";
|
|
3363
|
+
const results = [];
|
|
3364
|
+
for (let i = 0; i < items.length; i += maxConcurrent) {
|
|
3365
|
+
const batch = items.slice(i, i + maxConcurrent);
|
|
3366
|
+
const batchPromises = batch.map(processor);
|
|
3367
|
+
const batchResults = await Promise.all(batchPromises);
|
|
3368
|
+
results.push(...batchResults);
|
|
3350
3369
|
}
|
|
3351
|
-
|
|
3352
|
-
const promptBuilder = isAudioOnly ? createAudioOnlyBuilder(constraints) : createSummarizationBuilder(constraints);
|
|
3353
|
-
return promptBuilder.buildWithContext(promptOverrides, contextSections);
|
|
3370
|
+
return results;
|
|
3354
3371
|
}
|
|
3355
|
-
async function
|
|
3372
|
+
async function moderateImageWithOpenAI(entry) {
|
|
3356
3373
|
"use step";
|
|
3357
|
-
const
|
|
3358
|
-
|
|
3359
|
-
|
|
3360
|
-
|
|
3361
|
-
|
|
3362
|
-
|
|
3363
|
-
|
|
3364
|
-
content: systemPrompt
|
|
3374
|
+
const apiKey = await getApiKeyFromEnv("openai", entry.credentials);
|
|
3375
|
+
try {
|
|
3376
|
+
const res = await fetch("https://api.openai.com/v1/moderations", {
|
|
3377
|
+
method: "POST",
|
|
3378
|
+
headers: {
|
|
3379
|
+
"Content-Type": "application/json",
|
|
3380
|
+
"Authorization": `Bearer ${apiKey}`
|
|
3365
3381
|
},
|
|
3366
|
-
{
|
|
3367
|
-
|
|
3368
|
-
|
|
3369
|
-
{
|
|
3370
|
-
|
|
3382
|
+
body: JSON.stringify({
|
|
3383
|
+
model: entry.model,
|
|
3384
|
+
input: [
|
|
3385
|
+
{
|
|
3386
|
+
type: "image_url",
|
|
3387
|
+
image_url: {
|
|
3388
|
+
url: entry.image
|
|
3389
|
+
}
|
|
3390
|
+
}
|
|
3371
3391
|
]
|
|
3372
|
-
}
|
|
3373
|
-
|
|
3374
|
-
|
|
3375
|
-
|
|
3376
|
-
|
|
3377
|
-
|
|
3378
|
-
|
|
3379
|
-
return {
|
|
3380
|
-
result: parsed,
|
|
3381
|
-
usage: {
|
|
3382
|
-
inputTokens: response.usage.inputTokens,
|
|
3383
|
-
outputTokens: response.usage.outputTokens,
|
|
3384
|
-
totalTokens: response.usage.totalTokens,
|
|
3385
|
-
reasoningTokens: response.usage.reasoningTokens,
|
|
3386
|
-
cachedInputTokens: response.usage.cachedInputTokens
|
|
3392
|
+
})
|
|
3393
|
+
});
|
|
3394
|
+
const json = await res.json();
|
|
3395
|
+
if (!res.ok) {
|
|
3396
|
+
throw new Error(
|
|
3397
|
+
`OpenAI moderation error: ${res.status} ${res.statusText} - ${JSON.stringify(json)}`
|
|
3398
|
+
);
|
|
3387
3399
|
}
|
|
3388
|
-
|
|
3400
|
+
const categoryScores = json.results?.[0]?.category_scores || {};
|
|
3401
|
+
return {
|
|
3402
|
+
url: entry.url,
|
|
3403
|
+
time: entry.time,
|
|
3404
|
+
sexual: categoryScores.sexual || 0,
|
|
3405
|
+
violence: categoryScores.violence || 0,
|
|
3406
|
+
error: false
|
|
3407
|
+
};
|
|
3408
|
+
} catch (error) {
|
|
3409
|
+
console.error("OpenAI moderation failed:", error);
|
|
3410
|
+
return {
|
|
3411
|
+
url: entry.url,
|
|
3412
|
+
time: entry.time,
|
|
3413
|
+
sexual: 0,
|
|
3414
|
+
violence: 0,
|
|
3415
|
+
error: true,
|
|
3416
|
+
errorMessage: error instanceof Error ? error.message : String(error)
|
|
3417
|
+
};
|
|
3418
|
+
}
|
|
3389
3419
|
}
|
|
3390
|
-
async function
|
|
3420
|
+
async function requestOpenAIModeration(images, model, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
|
|
3391
3421
|
"use step";
|
|
3392
|
-
const
|
|
3393
|
-
const
|
|
3394
|
-
|
|
3395
|
-
|
|
3396
|
-
|
|
3397
|
-
|
|
3398
|
-
|
|
3399
|
-
|
|
3422
|
+
const imageUrls = images.map((img) => img.url);
|
|
3423
|
+
const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
|
|
3424
|
+
const targetUrls = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map(
|
|
3425
|
+
(img) => ({ url: img.url, time: timeByUrl.get(img.url), image: img.base64Data, model, credentials })
|
|
3426
|
+
) : images.map((img) => ({ url: img.url, time: img.time, image: img.url, model, credentials }));
|
|
3427
|
+
return processConcurrently(targetUrls, moderateImageWithOpenAI, maxConcurrent);
|
|
3428
|
+
}
|
|
3429
|
+
async function requestOpenAITextModeration(text, model, url, credentials) {
|
|
3430
|
+
"use step";
|
|
3431
|
+
const apiKey = await getApiKeyFromEnv("openai", credentials);
|
|
3432
|
+
try {
|
|
3433
|
+
const res = await fetch("https://api.openai.com/v1/moderations", {
|
|
3434
|
+
method: "POST",
|
|
3435
|
+
headers: {
|
|
3436
|
+
"Content-Type": "application/json",
|
|
3437
|
+
"Authorization": `Bearer ${apiKey}`
|
|
3400
3438
|
},
|
|
3401
|
-
{
|
|
3402
|
-
|
|
3403
|
-
|
|
3404
|
-
}
|
|
3405
|
-
|
|
3406
|
-
|
|
3407
|
-
|
|
3408
|
-
|
|
3409
|
-
|
|
3410
|
-
|
|
3411
|
-
return {
|
|
3412
|
-
result: parsed,
|
|
3413
|
-
usage: {
|
|
3414
|
-
inputTokens: response.usage.inputTokens,
|
|
3415
|
-
outputTokens: response.usage.outputTokens,
|
|
3416
|
-
totalTokens: response.usage.totalTokens,
|
|
3417
|
-
reasoningTokens: response.usage.reasoningTokens,
|
|
3418
|
-
cachedInputTokens: response.usage.cachedInputTokens
|
|
3439
|
+
body: JSON.stringify({
|
|
3440
|
+
model,
|
|
3441
|
+
input: text
|
|
3442
|
+
})
|
|
3443
|
+
});
|
|
3444
|
+
const json = await res.json();
|
|
3445
|
+
if (!res.ok) {
|
|
3446
|
+
throw new Error(
|
|
3447
|
+
`OpenAI moderation error: ${res.status} ${res.statusText} - ${JSON.stringify(json)}`
|
|
3448
|
+
);
|
|
3419
3449
|
}
|
|
3420
|
-
|
|
3450
|
+
const categoryScores = json.results?.[0]?.category_scores || {};
|
|
3451
|
+
return {
|
|
3452
|
+
url,
|
|
3453
|
+
sexual: categoryScores.sexual || 0,
|
|
3454
|
+
violence: categoryScores.violence || 0,
|
|
3455
|
+
error: false
|
|
3456
|
+
};
|
|
3457
|
+
} catch (error) {
|
|
3458
|
+
console.error("OpenAI text moderation failed:", error);
|
|
3459
|
+
return {
|
|
3460
|
+
url,
|
|
3461
|
+
sexual: 0,
|
|
3462
|
+
violence: 0,
|
|
3463
|
+
error: true,
|
|
3464
|
+
errorMessage: error instanceof Error ? error.message : String(error)
|
|
3465
|
+
};
|
|
3466
|
+
}
|
|
3421
3467
|
}
|
|
3422
|
-
function
|
|
3423
|
-
if (!
|
|
3468
|
+
function chunkTextByUtf16CodeUnits(text, maxUnits) {
|
|
3469
|
+
if (!text.trim()) {
|
|
3424
3470
|
return [];
|
|
3425
3471
|
}
|
|
3426
|
-
|
|
3427
|
-
|
|
3428
|
-
|
|
3429
|
-
|
|
3430
|
-
|
|
3431
|
-
|
|
3472
|
+
if (text.length <= maxUnits) {
|
|
3473
|
+
return [text];
|
|
3474
|
+
}
|
|
3475
|
+
const chunks = [];
|
|
3476
|
+
for (let i = 0; i < text.length; i += maxUnits) {
|
|
3477
|
+
const chunk = text.slice(i, i + maxUnits).trim();
|
|
3478
|
+
if (chunk) {
|
|
3479
|
+
chunks.push(chunk);
|
|
3432
3480
|
}
|
|
3433
|
-
|
|
3434
|
-
|
|
3435
|
-
|
|
3481
|
+
}
|
|
3482
|
+
return chunks;
|
|
3483
|
+
}
|
|
3484
|
+
async function requestOpenAITranscriptModeration(transcriptText, model, maxConcurrent = 5, credentials) {
|
|
3485
|
+
"use step";
|
|
3486
|
+
const chunks = chunkTextByUtf16CodeUnits(transcriptText, 1e4);
|
|
3487
|
+
if (!chunks.length) {
|
|
3488
|
+
return [
|
|
3489
|
+
{ url: "transcript:0", sexual: 0, violence: 0, error: true, errorMessage: "No transcript chunks to moderate" }
|
|
3490
|
+
];
|
|
3491
|
+
}
|
|
3492
|
+
const targets = chunks.map((chunk, idx) => ({
|
|
3493
|
+
chunk,
|
|
3494
|
+
url: `transcript:${idx}`
|
|
3495
|
+
}));
|
|
3496
|
+
return processConcurrently(
|
|
3497
|
+
targets,
|
|
3498
|
+
async (entry) => requestOpenAITextModeration(entry.chunk, model, entry.url, credentials),
|
|
3499
|
+
maxConcurrent
|
|
3500
|
+
);
|
|
3501
|
+
}
|
|
3502
|
+
function getHiveCategoryScores(classes, categoryNames) {
|
|
3503
|
+
const scoreMap = Object.fromEntries(
|
|
3504
|
+
classes.map((c) => [c.class, c.score])
|
|
3505
|
+
);
|
|
3506
|
+
const missingCategories = categoryNames.filter((category) => !(category in scoreMap));
|
|
3507
|
+
if (missingCategories.length > 0) {
|
|
3508
|
+
console.warn(
|
|
3509
|
+
`Hive response missing expected categories: ${missingCategories.join(", ")}`
|
|
3510
|
+
);
|
|
3511
|
+
}
|
|
3512
|
+
const scores = categoryNames.map((category) => scoreMap[category] || 0);
|
|
3513
|
+
return Math.max(...scores, 0);
|
|
3514
|
+
}
|
|
3515
|
+
async function moderateImageWithHive(entry) {
|
|
3516
|
+
"use step";
|
|
3517
|
+
const apiKey = await getApiKeyFromEnv("hive", entry.credentials);
|
|
3518
|
+
try {
|
|
3519
|
+
const formData = new FormData();
|
|
3520
|
+
if (entry.source.kind === "url") {
|
|
3521
|
+
formData.append("url", entry.source.value);
|
|
3522
|
+
} else {
|
|
3523
|
+
const extension = entry.source.contentType.split("/")[1] || "jpg";
|
|
3524
|
+
const blob = new Blob([entry.source.buffer], {
|
|
3525
|
+
type: entry.source.contentType
|
|
3526
|
+
});
|
|
3527
|
+
formData.append("media", blob, `thumbnail.${extension}`);
|
|
3436
3528
|
}
|
|
3437
|
-
|
|
3438
|
-
|
|
3439
|
-
|
|
3440
|
-
|
|
3529
|
+
const controller = new AbortController();
|
|
3530
|
+
const timeout = setTimeout(() => controller.abort(), 15e3);
|
|
3531
|
+
let res;
|
|
3532
|
+
try {
|
|
3533
|
+
res = await fetch(HIVE_ENDPOINT, {
|
|
3534
|
+
method: "POST",
|
|
3535
|
+
headers: {
|
|
3536
|
+
Accept: "application/json",
|
|
3537
|
+
Authorization: `Token ${apiKey}`
|
|
3538
|
+
},
|
|
3539
|
+
body: formData,
|
|
3540
|
+
signal: controller.signal
|
|
3541
|
+
});
|
|
3542
|
+
} catch (err) {
|
|
3543
|
+
if (err?.name === "AbortError") {
|
|
3544
|
+
throw new Error("Hive request timed out after 15s");
|
|
3545
|
+
}
|
|
3546
|
+
throw err;
|
|
3547
|
+
} finally {
|
|
3548
|
+
clearTimeout(timeout);
|
|
3549
|
+
}
|
|
3550
|
+
const json = await res.json().catch(() => void 0);
|
|
3551
|
+
if (!res.ok) {
|
|
3552
|
+
throw new Error(
|
|
3553
|
+
`Hive moderation error: ${res.status} ${res.statusText} - ${JSON.stringify(json)}`
|
|
3554
|
+
);
|
|
3555
|
+
}
|
|
3556
|
+
if (json?.return_code != null && json.return_code !== 0) {
|
|
3557
|
+
throw new Error(
|
|
3558
|
+
`Hive API error (return_code ${json.return_code}): ${json.message || "Unknown error"}`
|
|
3559
|
+
);
|
|
3560
|
+
}
|
|
3561
|
+
const classes = json?.status?.[0]?.response?.output?.[0]?.classes;
|
|
3562
|
+
if (!Array.isArray(classes)) {
|
|
3563
|
+
throw new TypeError(
|
|
3564
|
+
`Unexpected Hive response structure: ${JSON.stringify(json)}`
|
|
3565
|
+
);
|
|
3441
3566
|
}
|
|
3567
|
+
const sexual = getHiveCategoryScores(classes, HIVE_SEXUAL_CATEGORIES);
|
|
3568
|
+
const violence = getHiveCategoryScores(classes, HIVE_VIOLENCE_CATEGORIES);
|
|
3569
|
+
return {
|
|
3570
|
+
url: entry.url,
|
|
3571
|
+
time: entry.time,
|
|
3572
|
+
sexual,
|
|
3573
|
+
violence,
|
|
3574
|
+
error: false
|
|
3575
|
+
};
|
|
3576
|
+
} catch (error) {
|
|
3577
|
+
return {
|
|
3578
|
+
url: entry.url,
|
|
3579
|
+
time: entry.time,
|
|
3580
|
+
sexual: 0,
|
|
3581
|
+
violence: 0,
|
|
3582
|
+
error: true,
|
|
3583
|
+
errorMessage: error instanceof Error ? error.message : String(error)
|
|
3584
|
+
};
|
|
3442
3585
|
}
|
|
3443
|
-
return normalized;
|
|
3444
3586
|
}
|
|
3445
|
-
async function
|
|
3587
|
+
async function requestHiveModeration(images, maxConcurrent = 5, submissionMode = "url", downloadOptions, credentials) {
|
|
3588
|
+
"use step";
|
|
3589
|
+
const imageUrls = images.map((img) => img.url);
|
|
3590
|
+
const timeByUrl = new Map(images.map((img) => [img.url, img.time]));
|
|
3591
|
+
const targets = submissionMode === "base64" ? (await downloadImagesAsBase64(imageUrls, downloadOptions, maxConcurrent)).map((img) => ({
|
|
3592
|
+
url: img.url,
|
|
3593
|
+
time: timeByUrl.get(img.url),
|
|
3594
|
+
source: {
|
|
3595
|
+
kind: "file",
|
|
3596
|
+
buffer: img.buffer,
|
|
3597
|
+
contentType: img.contentType
|
|
3598
|
+
},
|
|
3599
|
+
credentials
|
|
3600
|
+
})) : images.map((img) => ({
|
|
3601
|
+
url: img.url,
|
|
3602
|
+
time: img.time,
|
|
3603
|
+
source: { kind: "url", value: img.url },
|
|
3604
|
+
credentials
|
|
3605
|
+
}));
|
|
3606
|
+
return await processConcurrently(targets, moderateImageWithHive, maxConcurrent);
|
|
3607
|
+
}
|
|
3608
|
+
async function getThumbnailUrlsFromTimestamps(playbackId, timestampsMs, options) {
|
|
3609
|
+
"use step";
|
|
3610
|
+
const { width, shouldSign, credentials } = options;
|
|
3611
|
+
const baseUrl = getMuxThumbnailBaseUrl(playbackId);
|
|
3612
|
+
const urlPromises = timestampsMs.map(async (tsMs) => {
|
|
3613
|
+
const time = Number((tsMs / 1e3).toFixed(2));
|
|
3614
|
+
const url = shouldSign ? await signUrl(baseUrl, playbackId, "thumbnail", { time, width }, credentials) : `${baseUrl}?time=${time}&width=${width}`;
|
|
3615
|
+
return { url, time };
|
|
3616
|
+
});
|
|
3617
|
+
return Promise.all(urlPromises);
|
|
3618
|
+
}
|
|
3619
|
+
async function getModerationScores(assetId, options = {}) {
|
|
3446
3620
|
"use workflow";
|
|
3447
3621
|
const {
|
|
3448
|
-
provider =
|
|
3449
|
-
model,
|
|
3450
|
-
|
|
3451
|
-
|
|
3452
|
-
|
|
3622
|
+
provider = DEFAULT_PROVIDER2,
|
|
3623
|
+
model = provider === "openai" ? "omni-moderation-latest" : void 0,
|
|
3624
|
+
languageCode,
|
|
3625
|
+
thresholds = DEFAULT_THRESHOLDS,
|
|
3626
|
+
thumbnailInterval = 10,
|
|
3627
|
+
thumbnailWidth = 640,
|
|
3628
|
+
maxSamples,
|
|
3629
|
+
maxConcurrent = 5,
|
|
3453
3630
|
imageSubmissionMode = "url",
|
|
3454
3631
|
imageDownloadOptions,
|
|
3455
|
-
|
|
3456
|
-
|
|
3457
|
-
|
|
3458
|
-
|
|
3459
|
-
|
|
3460
|
-
|
|
3461
|
-
|
|
3462
|
-
|
|
3463
|
-
|
|
3464
|
-
|
|
3465
|
-
|
|
3466
|
-
|
|
3467
|
-
const
|
|
3468
|
-
...options,
|
|
3469
|
-
model,
|
|
3470
|
-
provider
|
|
3471
|
-
});
|
|
3472
|
-
const workflowCredentials = credentials;
|
|
3473
|
-
const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, workflowCredentials);
|
|
3474
|
-
const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
|
|
3475
|
-
const isAudioOnly = isAudioOnlyAsset(assetData);
|
|
3476
|
-
if (isAudioOnly && !includeTranscript) {
|
|
3477
|
-
throw new Error(
|
|
3478
|
-
"Audio-only assets require a transcript. Set includeTranscript: true and ensure the asset has a ready text track (captions/subtitles)."
|
|
3479
|
-
);
|
|
3480
|
-
}
|
|
3481
|
-
const signingContext = await resolveMuxSigningContext(workflowCredentials);
|
|
3632
|
+
credentials: providedCredentials
|
|
3633
|
+
} = options;
|
|
3634
|
+
const credentials = providedCredentials;
|
|
3635
|
+
const { asset, playbackId, policy } = await getPlaybackIdForAsset(assetId, credentials);
|
|
3636
|
+
const videoTrackDurationSeconds = getVideoTrackDurationSecondsFromAsset(asset);
|
|
3637
|
+
const videoTrackFps = getVideoTrackMaxFrameRateFromAsset(asset);
|
|
3638
|
+
const assetDurationSeconds = getAssetDurationSecondsFromAsset(asset);
|
|
3639
|
+
const candidateDurations = [videoTrackDurationSeconds, assetDurationSeconds].filter(
|
|
3640
|
+
(d) => d != null
|
|
3641
|
+
);
|
|
3642
|
+
const duration = candidateDurations.length > 0 ? Math.min(...candidateDurations) : 0;
|
|
3643
|
+
const isAudioOnly = isAudioOnlyAsset(asset);
|
|
3644
|
+
const signingContext = await resolveMuxSigningContext(credentials);
|
|
3482
3645
|
if (policy === "signed" && !signingContext) {
|
|
3483
3646
|
throw new Error(
|
|
3484
3647
|
"Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
|
|
3485
3648
|
);
|
|
3486
3649
|
}
|
|
3487
|
-
|
|
3488
|
-
|
|
3489
|
-
|
|
3490
|
-
|
|
3491
|
-
|
|
3492
|
-
|
|
3493
|
-
|
|
3494
|
-
|
|
3495
|
-
|
|
3496
|
-
|
|
3497
|
-
|
|
3498
|
-
|
|
3499
|
-
|
|
3500
|
-
|
|
3501
|
-
|
|
3502
|
-
|
|
3503
|
-
|
|
3504
|
-
|
|
3505
|
-
|
|
3506
|
-
|
|
3507
|
-
let analysisResponse;
|
|
3508
|
-
let imageUrl;
|
|
3509
|
-
const systemPrompt = isAudioOnly ? AUDIO_ONLY_SYSTEM_PROMPT : SYSTEM_PROMPT3;
|
|
3510
|
-
try {
|
|
3511
|
-
if (isAudioOnly) {
|
|
3512
|
-
analysisResponse = await analyzeAudioOnly(
|
|
3513
|
-
modelConfig.provider,
|
|
3514
|
-
modelConfig.modelId,
|
|
3515
|
-
userPrompt,
|
|
3516
|
-
systemPrompt,
|
|
3517
|
-
workflowCredentials
|
|
3518
|
-
);
|
|
3519
|
-
} else {
|
|
3520
|
-
const storyboardUrl = await getStoryboardUrl(playbackId, 640, policy === "signed", workflowCredentials);
|
|
3521
|
-
imageUrl = storyboardUrl;
|
|
3522
|
-
if (imageSubmissionMode === "base64") {
|
|
3523
|
-
const downloadResult = await downloadImageAsBase64(storyboardUrl, imageDownloadOptions);
|
|
3524
|
-
analysisResponse = await analyzeStoryboard2(
|
|
3525
|
-
downloadResult.base64Data,
|
|
3526
|
-
modelConfig.provider,
|
|
3527
|
-
modelConfig.modelId,
|
|
3528
|
-
userPrompt,
|
|
3529
|
-
systemPrompt,
|
|
3530
|
-
workflowCredentials
|
|
3531
|
-
);
|
|
3532
|
-
} else {
|
|
3533
|
-
analysisResponse = await withRetry(() => analyzeStoryboard2(
|
|
3534
|
-
storyboardUrl,
|
|
3535
|
-
modelConfig.provider,
|
|
3536
|
-
modelConfig.modelId,
|
|
3537
|
-
userPrompt,
|
|
3538
|
-
systemPrompt,
|
|
3539
|
-
workflowCredentials
|
|
3540
|
-
));
|
|
3541
|
-
}
|
|
3650
|
+
let thumbnailScores;
|
|
3651
|
+
let mode = "thumbnails";
|
|
3652
|
+
let thumbnailCount;
|
|
3653
|
+
if (isAudioOnly) {
|
|
3654
|
+
mode = "transcript";
|
|
3655
|
+
const readyTextTracks = getReadyTextTracks(asset);
|
|
3656
|
+
let transcriptResult = await fetchTranscriptForAsset(asset, playbackId, {
|
|
3657
|
+
languageCode,
|
|
3658
|
+
cleanTranscript: true,
|
|
3659
|
+
shouldSign: policy === "signed",
|
|
3660
|
+
credentials,
|
|
3661
|
+
required: true
|
|
3662
|
+
});
|
|
3663
|
+
if (!transcriptResult.track && readyTextTracks.length === 1) {
|
|
3664
|
+
transcriptResult = await fetchTranscriptForAsset(asset, playbackId, {
|
|
3665
|
+
cleanTranscript: true,
|
|
3666
|
+
shouldSign: policy === "signed",
|
|
3667
|
+
credentials,
|
|
3668
|
+
required: true
|
|
3669
|
+
});
|
|
3542
3670
|
}
|
|
3543
|
-
|
|
3544
|
-
|
|
3545
|
-
|
|
3546
|
-
|
|
3671
|
+
if (provider === "openai") {
|
|
3672
|
+
thumbnailScores = await requestOpenAITranscriptModeration(
|
|
3673
|
+
transcriptResult.transcriptText,
|
|
3674
|
+
model || "omni-moderation-latest",
|
|
3675
|
+
maxConcurrent,
|
|
3676
|
+
credentials
|
|
3677
|
+
);
|
|
3678
|
+
} else if (provider === "hive") {
|
|
3679
|
+
throw new Error("Hive does not support transcript moderation in this workflow. Use provider: 'openai' for audio-only assets.");
|
|
3680
|
+
} else {
|
|
3681
|
+
throw new Error(`Unsupported moderation provider: ${provider}`);
|
|
3682
|
+
}
|
|
3683
|
+
} else {
|
|
3684
|
+
const thumbnailUrls = maxSamples === void 0 ? (
|
|
3685
|
+
// Generate thumbnail URLs (signed if needed) using existing interval-based logic.
|
|
3686
|
+
await getThumbnailUrls(playbackId, duration, {
|
|
3687
|
+
interval: thumbnailInterval,
|
|
3688
|
+
width: thumbnailWidth,
|
|
3689
|
+
shouldSign: policy === "signed",
|
|
3690
|
+
credentials
|
|
3691
|
+
})
|
|
3692
|
+
) : (
|
|
3693
|
+
// In maxSamples mode, sample valid timestamps over the trimmed usable span.
|
|
3694
|
+
// Use proportional trims (≈ duration/6, capped at 5s) to stay well inside the
|
|
3695
|
+
// renderable range — Mux can't always serve thumbnails at the very edges.
|
|
3696
|
+
await getThumbnailUrlsFromTimestamps(
|
|
3697
|
+
playbackId,
|
|
3698
|
+
planSamplingTimestamps({
|
|
3699
|
+
duration_sec: duration,
|
|
3700
|
+
max_candidates: maxSamples,
|
|
3701
|
+
trim_start_sec: duration > 2 ? Math.min(5, Math.max(1, duration / 6)) : 0,
|
|
3702
|
+
trim_end_sec: duration > 2 ? Math.min(5, Math.max(1, duration / 6)) : 0,
|
|
3703
|
+
fps: videoTrackFps,
|
|
3704
|
+
base_cadence_hz: thumbnailInterval > 0 ? 1 / thumbnailInterval : void 0
|
|
3705
|
+
}),
|
|
3706
|
+
{
|
|
3707
|
+
width: thumbnailWidth,
|
|
3708
|
+
shouldSign: policy === "signed",
|
|
3709
|
+
credentials
|
|
3710
|
+
}
|
|
3711
|
+
)
|
|
3547
3712
|
);
|
|
3713
|
+
thumbnailCount = thumbnailUrls.length;
|
|
3714
|
+
if (provider === "openai") {
|
|
3715
|
+
thumbnailScores = await requestOpenAIModeration(
|
|
3716
|
+
thumbnailUrls,
|
|
3717
|
+
model || "omni-moderation-latest",
|
|
3718
|
+
maxConcurrent,
|
|
3719
|
+
imageSubmissionMode,
|
|
3720
|
+
imageDownloadOptions,
|
|
3721
|
+
credentials
|
|
3722
|
+
);
|
|
3723
|
+
} else if (provider === "hive") {
|
|
3724
|
+
thumbnailScores = await requestHiveModeration(
|
|
3725
|
+
thumbnailUrls,
|
|
3726
|
+
maxConcurrent,
|
|
3727
|
+
imageSubmissionMode,
|
|
3728
|
+
imageDownloadOptions,
|
|
3729
|
+
credentials
|
|
3730
|
+
);
|
|
3731
|
+
} else {
|
|
3732
|
+
throw new Error(`Unsupported moderation provider: ${provider}`);
|
|
3733
|
+
}
|
|
3548
3734
|
}
|
|
3549
|
-
|
|
3550
|
-
|
|
3551
|
-
|
|
3552
|
-
|
|
3553
|
-
|
|
3554
|
-
|
|
3555
|
-
if (!analysisResponse.result.description) {
|
|
3556
|
-
throw new Error(`Failed to generate description for asset ${assetId}`);
|
|
3735
|
+
const failed = thumbnailScores.filter((s) => s.error);
|
|
3736
|
+
if (failed.length > 0) {
|
|
3737
|
+
const details = failed.map((s) => `${s.url}: ${s.errorMessage || "Unknown error"}`).join("; ");
|
|
3738
|
+
throw new Error(
|
|
3739
|
+
`Moderation failed for ${failed.length}/${thumbnailScores.length} thumbnail(s): ${details}`
|
|
3740
|
+
);
|
|
3557
3741
|
}
|
|
3742
|
+
const maxSexual = Math.max(...thumbnailScores.map((s) => s.sexual));
|
|
3743
|
+
const maxViolence = Math.max(...thumbnailScores.map((s) => s.violence));
|
|
3744
|
+
const finalThresholds = { ...DEFAULT_THRESHOLDS, ...thresholds };
|
|
3558
3745
|
return {
|
|
3559
3746
|
assetId,
|
|
3560
|
-
|
|
3561
|
-
|
|
3562
|
-
|
|
3563
|
-
storyboardUrl: imageUrl,
|
|
3564
|
-
// undefined for audio-only assets
|
|
3747
|
+
mode,
|
|
3748
|
+
isAudioOnly,
|
|
3749
|
+
thumbnailScores,
|
|
3565
3750
|
usage: {
|
|
3566
|
-
...analysisResponse.usage,
|
|
3567
3751
|
metadata: {
|
|
3568
|
-
assetDurationSeconds
|
|
3752
|
+
assetDurationSeconds: duration,
|
|
3753
|
+
...thumbnailCount === void 0 ? {} : { thumbnailCount }
|
|
3569
3754
|
}
|
|
3570
3755
|
},
|
|
3571
|
-
|
|
3756
|
+
maxScores: {
|
|
3757
|
+
sexual: maxSexual,
|
|
3758
|
+
violence: maxViolence
|
|
3759
|
+
},
|
|
3760
|
+
exceedsThreshold: maxSexual > finalThresholds.sexual || maxViolence > finalThresholds.violence,
|
|
3761
|
+
thresholds: finalThresholds
|
|
3572
3762
|
};
|
|
3573
3763
|
}
|
|
3574
3764
|
|
|
3575
|
-
// src/
|
|
3576
|
-
|
|
3577
|
-
|
|
3578
|
-
|
|
3579
|
-
var
|
|
3580
|
-
|
|
3581
|
-
|
|
3582
|
-
|
|
3583
|
-
|
|
3584
|
-
|
|
3585
|
-
|
|
3586
|
-
|
|
3587
|
-
|
|
3588
|
-
|
|
3589
|
-
|
|
3590
|
-
|
|
3591
|
-
|
|
3592
|
-
|
|
3593
|
-
|
|
3594
|
-
|
|
3595
|
-
|
|
3596
|
-
|
|
3597
|
-
|
|
3598
|
-
|
|
3765
|
+
// src/workflows/summarization.ts
|
|
3766
|
+
import { generateText as generateText5, Output as Output5 } from "ai";
|
|
3767
|
+
import dedent5 from "dedent";
|
|
3768
|
+
import { z as z6 } from "zod";
|
|
3769
|
+
var DEFAULT_SUMMARY_KEYWORD_LIMIT = 10;
|
|
3770
|
+
var DEFAULT_TITLE_LENGTH = 10;
|
|
3771
|
+
var DEFAULT_DESCRIPTION_LENGTH = 50;
|
|
3772
|
+
var summarySchema = z6.object({
|
|
3773
|
+
keywords: z6.array(z6.string()),
|
|
3774
|
+
title: z6.string(),
|
|
3775
|
+
description: z6.string()
|
|
3776
|
+
}).strict();
|
|
3777
|
+
var SUMMARY_OUTPUT = Output5.object({
|
|
3778
|
+
name: "summary_metadata",
|
|
3779
|
+
description: "Structured summary with title, description, and keywords.",
|
|
3780
|
+
schema: summarySchema
|
|
3781
|
+
});
|
|
3782
|
+
var VALID_TONES = ["neutral", "playful", "professional"];
|
|
3783
|
+
var TONE_INSTRUCTIONS = {
|
|
3784
|
+
neutral: "Provide a clear, straightforward analysis.",
|
|
3785
|
+
playful: "Channel your inner diva! Answer with maximum sass, wit, and playful attitude. Don't hold back - be cheeky, clever, and delightfully snarky. Make it pop!",
|
|
3786
|
+
professional: "Provide a professional, executive-level analysis suitable for business reporting."
|
|
3787
|
+
};
|
|
3788
|
+
var DESCRIPTION_LENGTH_THRESHOLD_SMALL = 25;
|
|
3789
|
+
var DESCRIPTION_LENGTH_THRESHOLD_LARGE = 100;
|
|
3790
|
+
function buildDescriptionGuidance(wordCount, contentType) {
|
|
3791
|
+
if (wordCount < DESCRIPTION_LENGTH_THRESHOLD_SMALL) {
|
|
3792
|
+
if (contentType === "video") {
|
|
3793
|
+
return dedent5`A brief summary of the video in no more than ${wordCount} words. Shorter is fine.
|
|
3794
|
+
Focus on the single most important subject or action.
|
|
3795
|
+
Write in present tense.`;
|
|
3796
|
+
}
|
|
3797
|
+
return dedent5`A brief summary of the audio content in no more than ${wordCount} words. Shorter is fine.
|
|
3798
|
+
Focus on the single most important topic or theme.
|
|
3799
|
+
Write in present tense.`;
|
|
3800
|
+
}
|
|
3801
|
+
if (wordCount > DESCRIPTION_LENGTH_THRESHOLD_LARGE) {
|
|
3802
|
+
if (contentType === "video") {
|
|
3803
|
+
return dedent5`A detailed summary that describes what happens across the video.
|
|
3804
|
+
Never exceed ${wordCount} words, but shorter is perfectly fine. You may use multiple sentences.
|
|
3805
|
+
Be thorough: cover subjects, actions, setting, progression, and any notable details visible across frames.
|
|
3806
|
+
Write in present tense. Be specific about observable details rather than making assumptions.
|
|
3807
|
+
If the transcript provides dialogue or narration, incorporate key points but prioritize visual content.`;
|
|
3808
|
+
}
|
|
3809
|
+
return dedent5`A detailed summary that describes the audio content.
|
|
3810
|
+
Never exceed ${wordCount} words, but shorter is perfectly fine. You may use multiple sentences.
|
|
3811
|
+
Be thorough: cover topics, speakers, themes, progression, and any notable insights.
|
|
3812
|
+
Write in present tense. Be specific about what is discussed or presented rather than making assumptions.
|
|
3813
|
+
Focus on the spoken content and any key insights, dialogue, or narrative elements.`;
|
|
3814
|
+
}
|
|
3815
|
+
if (contentType === "video") {
|
|
3816
|
+
return dedent5`A summary that describes what happens across the video.
|
|
3817
|
+
Never exceed ${wordCount} words, but shorter is perfectly fine. You may use multiple sentences.
|
|
3818
|
+
Cover the main subjects, actions, setting, and any notable progression visible across frames.
|
|
3819
|
+
Write in present tense. Be specific about observable details rather than making assumptions.
|
|
3820
|
+
If the transcript provides dialogue or narration, incorporate key points but prioritize visual content.`;
|
|
3821
|
+
}
|
|
3822
|
+
return dedent5`A summary that describes the audio content.
|
|
3823
|
+
Never exceed ${wordCount} words, but shorter is perfectly fine. You may use multiple sentences.
|
|
3824
|
+
Cover the main topics, speakers, themes, and any notable progression in the discussion or narration.
|
|
3825
|
+
Write in present tense. Be specific about what is discussed or presented rather than making assumptions.
|
|
3826
|
+
Focus on the spoken content and any key insights, dialogue, or narrative elements.`;
|
|
3599
3827
|
}
|
|
3600
|
-
|
|
3601
|
-
const
|
|
3602
|
-
|
|
3603
|
-
|
|
3604
|
-
|
|
3605
|
-
|
|
3606
|
-
|
|
3607
|
-
|
|
3608
|
-
|
|
3609
|
-
|
|
3828
|
+
function createSummarizationBuilder({ titleLength, descriptionLength, tagCount } = {}) {
|
|
3829
|
+
const titleLimit = titleLength ?? DEFAULT_TITLE_LENGTH;
|
|
3830
|
+
const keywordLimit = tagCount ?? DEFAULT_SUMMARY_KEYWORD_LIMIT;
|
|
3831
|
+
return createPromptBuilder({
|
|
3832
|
+
template: {
|
|
3833
|
+
task: {
|
|
3834
|
+
tag: "task",
|
|
3835
|
+
content: "Analyze the storyboard frames and generate metadata that captures the essence of the video content."
|
|
3836
|
+
},
|
|
3837
|
+
title: {
|
|
3838
|
+
tag: "title_requirements",
|
|
3839
|
+
content: dedent5`
|
|
3840
|
+
A concise, label-style title — not a sentence or description.
|
|
3841
|
+
Never exceed ${titleLimit} words, but shorter is better.
|
|
3842
|
+
Think of how a video card title, playlist entry, or file name would read — e.g. "Predator: Badlands Trailer" or "Chef Prepares Holiday Feast".
|
|
3843
|
+
Start with the primary subject or topic. Never begin with "A video of" or similar phrasing.
|
|
3844
|
+
Use specific nouns over lengthy descriptions. Avoid clauses, conjunctions, or narrative structure.`
|
|
3845
|
+
},
|
|
3846
|
+
description: {
|
|
3847
|
+
tag: "description_requirements",
|
|
3848
|
+
content: buildDescriptionGuidance(descriptionLength ?? DEFAULT_DESCRIPTION_LENGTH, "video")
|
|
3849
|
+
},
|
|
3850
|
+
keywords: {
|
|
3851
|
+
tag: "keywords_requirements",
|
|
3852
|
+
content: dedent5`
|
|
3853
|
+
Specific, searchable terms (up to ${keywordLimit}) that capture:
|
|
3854
|
+
- Primary subjects (people, animals, objects)
|
|
3855
|
+
- Actions and activities being performed
|
|
3856
|
+
- Setting and environment
|
|
3857
|
+
- Notable objects or tools
|
|
3858
|
+
- Style or genre (if applicable)
|
|
3859
|
+
Prefer concrete nouns and action verbs over abstract concepts.
|
|
3860
|
+
Use lowercase. Avoid redundant or overly generic terms like "video" or "content".`
|
|
3861
|
+
},
|
|
3862
|
+
qualityGuidelines: {
|
|
3863
|
+
tag: "quality_guidelines",
|
|
3864
|
+
content: dedent5`
|
|
3865
|
+
- Examine all frames to understand the full context and progression
|
|
3866
|
+
- Be precise: "golden retriever" is better than "dog" when identifiable
|
|
3867
|
+
- Capture the narrative: what begins, develops, and concludes
|
|
3868
|
+
- Balance brevity with informativeness`
|
|
3869
|
+
}
|
|
3870
|
+
},
|
|
3871
|
+
sectionOrder: ["task", "title", "description", "keywords", "qualityGuidelines"]
|
|
3872
|
+
});
|
|
3610
3873
|
}
|
|
3611
|
-
|
|
3612
|
-
const
|
|
3613
|
-
const
|
|
3614
|
-
|
|
3615
|
-
|
|
3874
|
+
function createAudioOnlyBuilder({ titleLength, descriptionLength, tagCount } = {}) {
|
|
3875
|
+
const titleLimit = titleLength ?? DEFAULT_TITLE_LENGTH;
|
|
3876
|
+
const keywordLimit = tagCount ?? DEFAULT_SUMMARY_KEYWORD_LIMIT;
|
|
3877
|
+
return createPromptBuilder({
|
|
3878
|
+
template: {
|
|
3879
|
+
task: {
|
|
3880
|
+
tag: "task",
|
|
3881
|
+
content: "Analyze the transcript and generate metadata that captures the essence of the audio content."
|
|
3882
|
+
},
|
|
3883
|
+
title: {
|
|
3884
|
+
tag: "title_requirements",
|
|
3885
|
+
content: dedent5`
|
|
3886
|
+
A concise, label-style title — not a sentence or description.
|
|
3887
|
+
Never exceed ${titleLimit} words, but shorter is better.
|
|
3888
|
+
Think of how a podcast episode title or playlist entry would read — e.g. "Weekly News Roundup" or "Interview with Dr. Smith".
|
|
3889
|
+
Start with the primary subject or topic. Never begin with "An audio of" or similar phrasing.
|
|
3890
|
+
Use specific nouns over lengthy descriptions. Avoid clauses, conjunctions, or narrative structure.`
|
|
3891
|
+
},
|
|
3892
|
+
description: {
|
|
3893
|
+
tag: "description_requirements",
|
|
3894
|
+
content: buildDescriptionGuidance(descriptionLength ?? DEFAULT_DESCRIPTION_LENGTH, "audio")
|
|
3895
|
+
},
|
|
3896
|
+
keywords: {
|
|
3897
|
+
tag: "keywords_requirements",
|
|
3898
|
+
content: dedent5`
|
|
3899
|
+
Specific, searchable terms (up to ${keywordLimit}) that capture:
|
|
3900
|
+
- Primary topics and themes
|
|
3901
|
+
- Speakers or presenters (if named)
|
|
3902
|
+
- Key concepts and terminology
|
|
3903
|
+
- Content type (interview, lecture, music, etc.)
|
|
3904
|
+
- Genre or style (if applicable)
|
|
3905
|
+
Prefer concrete nouns and relevant terms over abstract concepts.
|
|
3906
|
+
Use lowercase. Avoid redundant or overly generic terms like "audio" or "content".`
|
|
3907
|
+
},
|
|
3908
|
+
qualityGuidelines: {
|
|
3909
|
+
tag: "quality_guidelines",
|
|
3910
|
+
content: dedent5`
|
|
3911
|
+
- Analyze the full transcript to understand context and themes
|
|
3912
|
+
- Be precise: use specific terminology when mentioned
|
|
3913
|
+
- Capture the narrative: what is introduced, discussed, and concluded
|
|
3914
|
+
- Balance brevity with informativeness`
|
|
3915
|
+
}
|
|
3916
|
+
},
|
|
3917
|
+
sectionOrder: ["task", "title", "description", "keywords", "qualityGuidelines"]
|
|
3918
|
+
});
|
|
3616
3919
|
}
|
|
3617
|
-
|
|
3618
|
-
|
|
3619
|
-
|
|
3620
|
-
|
|
3621
|
-
|
|
3622
|
-
|
|
3623
|
-
|
|
3624
|
-
|
|
3625
|
-
|
|
3626
|
-
|
|
3627
|
-
|
|
3628
|
-
|
|
3629
|
-
|
|
3630
|
-
|
|
3631
|
-
|
|
3632
|
-
|
|
3633
|
-
|
|
3634
|
-
|
|
3635
|
-
|
|
3636
|
-
|
|
3637
|
-
|
|
3638
|
-
|
|
3639
|
-
|
|
3640
|
-
|
|
3641
|
-
|
|
3642
|
-
|
|
3643
|
-
|
|
3644
|
-
|
|
3645
|
-
|
|
3646
|
-
|
|
3647
|
-
|
|
3648
|
-
|
|
3649
|
-
|
|
3650
|
-
|
|
3651
|
-
|
|
3652
|
-
|
|
3653
|
-
|
|
3654
|
-
|
|
3655
|
-
|
|
3656
|
-
|
|
3657
|
-
|
|
3658
|
-
|
|
3659
|
-
|
|
3660
|
-
|
|
3920
|
+
var SYSTEM_PROMPT4 = dedent5`
|
|
3921
|
+
<role>
|
|
3922
|
+
You are a video content analyst specializing in storyboard interpretation and multimodal analysis.
|
|
3923
|
+
</role>
|
|
3924
|
+
|
|
3925
|
+
<context>
|
|
3926
|
+
You receive storyboard images containing multiple sequential frames extracted from a video.
|
|
3927
|
+
These frames are arranged in a grid and represent the visual progression of the content over time.
|
|
3928
|
+
Read frames left-to-right, top-to-bottom to understand the temporal sequence.
|
|
3929
|
+
</context>
|
|
3930
|
+
|
|
3931
|
+
<transcript_guidance>
|
|
3932
|
+
When a transcript is provided alongside the storyboard:
|
|
3933
|
+
- Use it to understand spoken content, dialogue, narration, and audio context
|
|
3934
|
+
- Correlate transcript content with visual frames to build a complete picture
|
|
3935
|
+
- Extract key terminology, names, and specific language used by speakers
|
|
3936
|
+
- Let the transcript inform keyword selection, especially for topics not visually obvious
|
|
3937
|
+
- Prioritize visual content for the description, but enrich it with transcript insights
|
|
3938
|
+
- If transcript and visuals conflict, trust the visual evidence
|
|
3939
|
+
</transcript_guidance>
|
|
3940
|
+
|
|
3941
|
+
<capabilities>
|
|
3942
|
+
- Extract meaning from visual sequences
|
|
3943
|
+
- Identify subjects, actions, settings, and narrative arcs
|
|
3944
|
+
- Generate accurate, searchable metadata
|
|
3945
|
+
- Synthesize visual and transcript information when provided
|
|
3946
|
+
</capabilities>
|
|
3947
|
+
|
|
3948
|
+
<constraints>
|
|
3949
|
+
- Only describe what is clearly observable in the frames or explicitly stated in the transcript
|
|
3950
|
+
- Do not fabricate details or make unsupported assumptions
|
|
3951
|
+
- Return structured data matching the requested schema
|
|
3952
|
+
- Output only the JSON object; no markdown or extra text
|
|
3953
|
+
- When a <language> section is provided, all output text MUST be written in that language
|
|
3954
|
+
</constraints>
|
|
3955
|
+
|
|
3956
|
+
<tone_guidance>
|
|
3957
|
+
Pay special attention to the <tone> section and lean heavily into those instructions.
|
|
3958
|
+
Adapt your entire analysis and writing style to match the specified tone - this should influence
|
|
3959
|
+
your word choice, personality, formality level, and overall presentation of the content.
|
|
3960
|
+
The tone instructions are not suggestions but core requirements for how you should express yourself.
|
|
3961
|
+
</tone_guidance>
|
|
3962
|
+
|
|
3963
|
+
<language_guidelines>
|
|
3964
|
+
AVOID these meta-descriptive phrases that reference the medium rather than the content:
|
|
3965
|
+
- "The image shows..." / "The storyboard shows..."
|
|
3966
|
+
- "In this video..." / "This video features..."
|
|
3967
|
+
- "The frames depict..." / "The footage shows..."
|
|
3968
|
+
- "We can see..." / "You can see..."
|
|
3969
|
+
- "The clip shows..." / "The scene shows..."
|
|
3970
|
+
|
|
3971
|
+
INSTEAD, describe the content directly:
|
|
3972
|
+
- BAD: "The video shows a chef preparing a meal"
|
|
3973
|
+
- GOOD: "A chef prepares a meal in a professional kitchen"
|
|
3974
|
+
|
|
3975
|
+
Write as if describing reality, not describing a recording of reality.
|
|
3976
|
+
</language_guidelines>`;
|
|
3977
|
+
var AUDIO_ONLY_SYSTEM_PROMPT = dedent5`
|
|
3978
|
+
<role>
|
|
3979
|
+
You are an audio content analyst specializing in transcript analysis and metadata generation.
|
|
3980
|
+
</role>
|
|
3981
|
+
|
|
3982
|
+
<context>
|
|
3983
|
+
You receive transcript text from audio-only content (podcasts, audiobooks, music, etc.).
|
|
3984
|
+
Your task is to analyze the spoken/audio content and generate accurate, searchable metadata.
|
|
3985
|
+
</context>
|
|
3986
|
+
|
|
3987
|
+
<transcript_guidance>
|
|
3988
|
+
- Carefully analyze the entire transcript to understand themes, topics, and key points
|
|
3989
|
+
- Extract key terminology, names, concepts, and specific language used
|
|
3990
|
+
- Identify the content type (interview, lecture, music, narration, etc.)
|
|
3991
|
+
- Note the tone, style, and any distinctive characteristics of the audio
|
|
3992
|
+
- Consider the intended audience and context based on language and content
|
|
3993
|
+
</transcript_guidance>
|
|
3994
|
+
|
|
3995
|
+
<capabilities>
|
|
3996
|
+
- Extract meaning and themes from spoken/audio content
|
|
3997
|
+
- Identify subjects, topics, speakers, and narrative structure
|
|
3998
|
+
- Generate accurate, searchable metadata from audio-based content
|
|
3999
|
+
- Understand context and intent from transcript alone
|
|
4000
|
+
</capabilities>
|
|
4001
|
+
|
|
4002
|
+
<constraints>
|
|
4003
|
+
- Only describe what is explicitly stated or strongly implied in the transcript
|
|
4004
|
+
- Do not fabricate details or make unsupported assumptions
|
|
4005
|
+
- Return structured data matching the requested schema
|
|
4006
|
+
- Focus entirely on audio/spoken content - there are no visual elements
|
|
4007
|
+
- Output only the JSON object; no markdown or extra text
|
|
4008
|
+
- When a <language> section is provided, all output text MUST be written in that language
|
|
4009
|
+
</constraints>
|
|
4010
|
+
|
|
4011
|
+
<tone_guidance>
|
|
4012
|
+
Pay special attention to the <tone> section and lean heavily into those instructions.
|
|
4013
|
+
Adapt your entire analysis and writing style to match the specified tone - this should influence
|
|
4014
|
+
your word choice, personality, formality level, and overall presentation of the content.
|
|
4015
|
+
The tone instructions are not suggestions but core requirements for how you should express yourself.
|
|
4016
|
+
</tone_guidance>
|
|
4017
|
+
|
|
4018
|
+
<language_guidelines>
|
|
4019
|
+
AVOID these meta-descriptive phrases that reference the medium rather than the content:
|
|
4020
|
+
- "The audio shows..." / "The transcript shows..."
|
|
4021
|
+
- "In this recording..." / "This audio features..."
|
|
4022
|
+
- "The speaker says..." / "We can hear..."
|
|
4023
|
+
- "The clip contains..." / "The recording shows..."
|
|
4024
|
+
|
|
4025
|
+
INSTEAD, describe the content directly:
|
|
4026
|
+
- BAD: "The audio features a discussion about climate change"
|
|
4027
|
+
- GOOD: "A panel discusses climate change impacts and solutions"
|
|
4028
|
+
|
|
4029
|
+
Write as if describing reality, not describing a recording of reality.
|
|
4030
|
+
</language_guidelines>`;
|
|
4031
|
+
/**
 * Assembles the user prompt for summarization from tone, language, transcript
 * and length-constraint inputs, delegating final assembly to the audio-only or
 * storyboard (video) prompt builder.
 *
 * @param {object} params - Prompt inputs (tone, transcriptText, overrides, …).
 * @returns {string} The fully built user prompt.
 */
function buildUserPrompt4({
  tone,
  transcriptText,
  isCleanTranscript = true,
  promptOverrides,
  isAudioOnly = false,
  titleLength,
  descriptionLength,
  tagCount,
  languageName
}) {
  // Tone guidance always leads the context sections.
  const sections = [createToneSection(TONE_INSTRUCTIONS[tone])];
  // Language section: explicit target language when known, otherwise a fixed
  // English fallback directive.
  const languageSection = languageName
    ? createLanguageSection(languageName)
    : {
        tag: "language",
        content: "Respond in English. Never switch languages to satisfy length constraints."
      };
  sections.push(languageSection);
  // Transcript is optional; its declared format depends on whether it was cleaned.
  if (transcriptText) {
    sections.push(
      createTranscriptSection(transcriptText, isCleanTranscript ? "plain text" : "WebVTT")
    );
  }
  const constraints = { titleLength, descriptionLength, tagCount };
  const builder = isAudioOnly
    ? createAudioOnlyBuilder(constraints)
    : createSummarizationBuilder(constraints);
  return builder.buildWithContext(promptOverrides, sections);
}
|
|
3668
|
-
function
|
|
3669
|
-
|
|
3670
|
-
const
|
|
3671
|
-
const
|
|
3672
|
-
|
|
3673
|
-
|
|
3674
|
-
|
|
3675
|
-
|
|
3676
|
-
|
|
3677
|
-
|
|
3678
|
-
|
|
3679
|
-
|
|
3680
|
-
|
|
3681
|
-
|
|
3682
|
-
|
|
3683
|
-
|
|
3684
|
-
|
|
3685
|
-
|
|
3686
|
-
|
|
3687
|
-
secretAccessKey,
|
|
3688
|
-
endpoint,
|
|
3689
|
-
region,
|
|
3690
|
-
bucket,
|
|
3691
|
-
key,
|
|
3692
|
-
body,
|
|
3693
|
-
contentType
|
|
3694
|
-
}) {
|
|
3695
|
-
const resolvedEndpoint = normalizeEndpoint(endpoint);
|
|
3696
|
-
const canonicalUri = buildCanonicalUri(resolvedEndpoint, bucket, key);
|
|
3697
|
-
const host = resolvedEndpoint.host;
|
|
3698
|
-
const normalizedContentType = contentType?.trim();
|
|
3699
|
-
const { amzDate, shortDate } = formatAmzDate();
|
|
3700
|
-
const payloadHash = await sha256Hex(body);
|
|
3701
|
-
const signingHeaders = [
|
|
3702
|
-
["host", host],
|
|
3703
|
-
["x-amz-content-sha256", payloadHash],
|
|
3704
|
-
["x-amz-date", amzDate],
|
|
3705
|
-
...normalizedContentType ? [["content-type", normalizedContentType]] : []
|
|
3706
|
-
].sort(([a], [b]) => a.localeCompare(b));
|
|
3707
|
-
const canonicalHeaders = signingHeaders.map(([name, value]) => `${name}:${value}`).join("\n");
|
|
3708
|
-
const signedHeaders = signingHeaders.map(([name]) => name).join(";");
|
|
3709
|
-
const canonicalRequest = [
|
|
3710
|
-
"PUT",
|
|
3711
|
-
canonicalUri,
|
|
3712
|
-
"",
|
|
3713
|
-
`${canonicalHeaders}
|
|
3714
|
-
`,
|
|
3715
|
-
signedHeaders,
|
|
3716
|
-
payloadHash
|
|
3717
|
-
].join("\n");
|
|
3718
|
-
const credentialScope = buildCredentialScope(shortDate, region);
|
|
3719
|
-
const stringToSign = [
|
|
3720
|
-
AWS4_ALGORITHM,
|
|
3721
|
-
amzDate,
|
|
3722
|
-
credentialScope,
|
|
3723
|
-
await sha256Hex(canonicalRequest)
|
|
3724
|
-
].join("\n");
|
|
3725
|
-
const signature = await signString(secretAccessKey, shortDate, region, stringToSign);
|
|
3726
|
-
const authorization = `${AWS4_ALGORITHM} Credential=${accessKeyId}/${credentialScope}, SignedHeaders=${signedHeaders}, Signature=${signature}`;
|
|
3727
|
-
const requestUrl = `${resolvedEndpoint.origin}${canonicalUri}`;
|
|
3728
|
-
const response = await fetch(requestUrl, {
|
|
3729
|
-
method: "PUT",
|
|
3730
|
-
headers: {
|
|
3731
|
-
"Authorization": authorization,
|
|
3732
|
-
"x-amz-content-sha256": payloadHash,
|
|
3733
|
-
"x-amz-date": amzDate,
|
|
3734
|
-
...normalizedContentType ? { "content-type": normalizedContentType } : {}
|
|
3735
|
-
},
|
|
3736
|
-
body
|
|
4059
|
+
/**
 * Workflow step: sends a storyboard image (URL or data URL) plus the user
 * prompt to the configured language model and returns the parsed summary
 * output along with token-usage accounting.
 *
 * @param {string} imageDataUrl - Storyboard image URL or base64 data URL.
 * @param {string} provider - Model provider identifier.
 * @param {string} modelId - Model identifier.
 * @param {string} userPrompt - Assembled user prompt.
 * @param {string} systemPrompt - System prompt (vision variant).
 * @param {object|undefined} credentials - Optional provider credentials.
 * @returns {Promise<{result: object, usage: object}>}
 * @throws {Error} When the model returns no structured output.
 */
async function analyzeStoryboard2(imageDataUrl, provider, modelId, userPrompt, systemPrompt, credentials) {
  "use step";
  const model = await createLanguageModelFromConfig(provider, modelId, credentials);
  // Multimodal user turn: prompt text followed by the storyboard image.
  const userContent = [
    { type: "text", text: userPrompt },
    { type: "image", image: imageDataUrl }
  ];
  const response = await generateText5({
    model,
    output: SUMMARY_OUTPUT,
    messages: [
      { role: "system", content: systemPrompt },
      { role: "user", content: userContent }
    ]
  });
  if (!response.output) {
    throw new Error("Summarization output missing");
  }
  // Validate the structured output before handing it back to the workflow.
  const { usage } = response;
  return {
    result: summarySchema.parse(response.output),
    usage: {
      inputTokens: usage.inputTokens,
      outputTokens: usage.outputTokens,
      totalTokens: usage.totalTokens,
      reasoningTokens: usage.reasoningTokens,
      cachedInputTokens: usage.cachedInputTokens
    }
  };
}
|
|
4094
|
+
/**
 * Workflow step: text-only analysis path for audio-only assets. Sends the
 * transcript-based user prompt to the configured language model and returns
 * the parsed summary output with token-usage accounting.
 *
 * @param {string} provider - Model provider identifier.
 * @param {string} modelId - Model identifier.
 * @param {string} userPrompt - Assembled user prompt (includes transcript).
 * @param {string} systemPrompt - Audio-only system prompt.
 * @param {object|undefined} credentials - Optional provider credentials.
 * @returns {Promise<{result: object, usage: object}>}
 * @throws {Error} When the model returns no structured output.
 */
async function analyzeAudioOnly(provider, modelId, userPrompt, systemPrompt, credentials) {
  "use step";
  const model = await createLanguageModelFromConfig(provider, modelId, credentials);
  const response = await generateText5({
    model,
    output: SUMMARY_OUTPUT,
    messages: [
      { role: "system", content: systemPrompt },
      { role: "user", content: userPrompt }
    ]
  });
  if (!response.output) {
    throw new Error("Summarization output missing");
  }
  // Validate the structured output before handing it back to the workflow.
  const { usage } = response;
  return {
    result: summarySchema.parse(response.output),
    usage: {
      inputTokens: usage.inputTokens,
      outputTokens: usage.outputTokens,
      totalTokens: usage.totalTokens,
      reasoningTokens: usage.reasoningTokens,
      cachedInputTokens: usage.cachedInputTokens
    }
  };
}
|
|
3786
|
-
|
|
3787
|
-
|
|
3788
|
-
|
|
3789
|
-
if (!accessKeyId || !secretAccessKey) {
|
|
3790
|
-
throw new Error(
|
|
3791
|
-
"S3 credentials are required for default storage operations. Provide S3_ACCESS_KEY_ID and S3_SECRET_ACCESS_KEY or pass options.storageAdapter."
|
|
3792
|
-
);
|
|
4126
|
+
/**
 * Normalizes a model-produced keyword list: trims entries, drops empties and
 * nullish values, de-duplicates case-insensitively (first spelling wins), and
 * caps the result at `limit` entries.
 *
 * Fix: the original compared `normalized.length === limit` AFTER pushing, so a
 * zero/negative (or fractional) limit never matched and the cap was silently
 * ignored. We now return early for non-positive limits and use `>=`.
 *
 * @param {Array<string|null|undefined>} keywords - Raw keyword candidates.
 * @param {number} [limit=DEFAULT_SUMMARY_KEYWORD_LIMIT] - Maximum entries kept.
 * @returns {string[]} Trimmed, unique keywords in original order.
 */
function normalizeKeywords(keywords, limit = DEFAULT_SUMMARY_KEYWORD_LIMIT) {
  if (!Array.isArray(keywords) || keywords.length === 0 || limit <= 0) {
    return [];
  }
  // Track lowercase forms so duplicates differing only in case are dropped.
  const seen = /* @__PURE__ */ new Set();
  const normalized = [];
  for (const keyword of keywords) {
    const trimmed = keyword?.trim();
    if (!trimmed) {
      continue;
    }
    const lower = trimmed.toLowerCase();
    if (seen.has(lower)) {
      continue;
    }
    seen.add(lower);
    normalized.push(trimmed);
    // `>=` (not `===`) so the cap also holds for fractional limits.
    if (normalized.length >= limit) {
      break;
    }
  }
  return normalized;
}
|
}
|
|
3796
|
-
async function
|
|
3797
|
-
|
|
3798
|
-
|
|
3799
|
-
|
|
4149
|
+
/**
 * Workflow: generates a title, description, and tags for a Mux asset.
 *
 * Pipeline: validate options -> resolve model config -> fetch playback ID and
 * asset metadata -> (optionally) fetch transcript -> build prompts -> run the
 * audio-only or storyboard analysis step -> validate and shape the result.
 *
 * @param {string} assetId - Mux asset ID to analyze.
 * @param {object} [options] - Provider/model/tone/transcript/length options.
 * @returns {Promise<object>} { assetId, title, description, tags, storyboardUrl, usage, transcriptText }
 * @throws {Error} On invalid tone, missing transcript for audio-only assets,
 *   missing signing credentials for signed playback, analysis failure, or
 *   missing title/description in the model output.
 */
async function getSummaryAndTags(assetId, options) {
  "use workflow";
  const {
    provider = "openai",
    model,
    tone = "neutral",
    includeTranscript = true,
    cleanTranscript = true,
    imageSubmissionMode = "url",
    imageDownloadOptions,
    promptOverrides,
    credentials,
    titleLength,
    descriptionLength,
    tagCount,
    outputLanguageCode
  } = options ?? {};
  // Fail fast on an unknown tone before doing any network work.
  if (!VALID_TONES.includes(tone)) {
    throw new Error(
      `Invalid tone "${tone}". Valid tones are: ${VALID_TONES.join(", ")}`
    );
  }
  const modelConfig = resolveLanguageModelConfig({
    ...options,
    model,
    provider
  });
  const workflowCredentials = credentials;
  const { asset: assetData, playbackId, policy } = await getPlaybackIdForAsset(assetId, workflowCredentials);
  const assetDurationSeconds = getAssetDurationSecondsFromAsset(assetData);
  const isAudioOnly = isAudioOnlyAsset(assetData);
  // Audio-only assets have no storyboard, so the transcript is the only input.
  if (isAudioOnly && !includeTranscript) {
    throw new Error(
      "Audio-only assets require a transcript. Set includeTranscript: true and ensure the asset has a ready text track (captions/subtitles)."
    );
  }
  const signingContext = await resolveMuxSigningContext(workflowCredentials);
  if (policy === "signed" && !signingContext) {
    throw new Error(
      "Signed playback ID requires signing credentials. Set MUX_SIGNING_KEY and MUX_PRIVATE_KEY environment variables."
    );
  }
  // Transcript fetch is required (not best-effort) for audio-only assets.
  const transcriptResult = includeTranscript ? await fetchTranscriptForAsset(assetData, playbackId, {
    cleanTranscript,
    shouldSign: policy === "signed",
    credentials: workflowCredentials,
    required: isAudioOnly
  }) : void 0;
  const transcriptText = transcriptResult?.transcriptText ?? "";
  // Output language priority: explicit option (unless "auto") -> transcript
  // track language -> first ready text track's language.
  const resolvedLanguageCode = outputLanguageCode && outputLanguageCode !== "auto" ? outputLanguageCode : transcriptResult?.track?.language_code ?? getReadyTextTracks(assetData)[0]?.language_code;
  const languageName = resolvedLanguageCode ? getLanguageName(resolvedLanguageCode) : void 0;
  const userPrompt = buildUserPrompt4({
    tone,
    transcriptText,
    isCleanTranscript: cleanTranscript,
    promptOverrides,
    isAudioOnly,
    titleLength,
    descriptionLength,
    tagCount,
    languageName
  });
  let analysisResponse;
  let imageUrl;
  const systemPrompt = isAudioOnly ? AUDIO_ONLY_SYSTEM_PROMPT : SYSTEM_PROMPT4;
  try {
    if (isAudioOnly) {
      analysisResponse = await analyzeAudioOnly(
        modelConfig.provider,
        modelConfig.modelId,
        userPrompt,
        systemPrompt,
        workflowCredentials
      );
    } else {
      // 640px storyboard keeps image payloads small for vision models.
      const storyboardUrl = await getStoryboardUrl(playbackId, 640, policy === "signed", workflowCredentials);
      imageUrl = storyboardUrl;
      if (imageSubmissionMode === "base64") {
        // Download and inline the image for providers that cannot fetch URLs.
        const downloadResult = await downloadImageAsBase64(storyboardUrl, imageDownloadOptions);
        analysisResponse = await analyzeStoryboard2(
          downloadResult.base64Data,
          modelConfig.provider,
          modelConfig.modelId,
          userPrompt,
          systemPrompt,
          workflowCredentials
        );
      } else {
        // URL mode is retried; the provider's fetch of the URL can be flaky.
        analysisResponse = await withRetry(() => analyzeStoryboard2(
          storyboardUrl,
          modelConfig.provider,
          modelConfig.modelId,
          userPrompt,
          systemPrompt,
          workflowCredentials
        ));
      }
    }
  } catch (error) {
    const contentType = isAudioOnly ? "audio" : "video";
    throw new Error(
      `Failed to analyze ${contentType} content with ${provider}: ${error instanceof Error ? error.message : "Unknown error"}`
    );
  }
  // Defensive checks: the schema should guarantee these, but fail loudly if not.
  if (!analysisResponse.result) {
    throw new Error(`Failed to analyze video content for asset ${assetId}`);
  }
  if (!analysisResponse.result.title) {
    throw new Error(`Failed to generate title for asset ${assetId}`);
  }
  if (!analysisResponse.result.description) {
    throw new Error(`Failed to generate description for asset ${assetId}`);
  }
  return {
    assetId,
    title: analysisResponse.result.title,
    description: analysisResponse.result.description,
    tags: normalizeKeywords(analysisResponse.result.keywords, tagCount ?? DEFAULT_SUMMARY_KEYWORD_LIMIT),
    storyboardUrl: imageUrl,
    // undefined for audio-only assets
    usage: {
      ...analysisResponse.usage,
      metadata: {
        assetDurationSeconds
      }
    },
    transcriptText: transcriptText || void 0
  };
}
}
|
|
3828
4278
|
|
|
3829
4279
|
// src/workflows/translate-audio.ts
|
|
@@ -4002,7 +4452,8 @@ async function uploadDubbedAudioToS3({
|
|
|
4002
4452
|
s3Endpoint,
|
|
4003
4453
|
s3Region,
|
|
4004
4454
|
s3Bucket,
|
|
4005
|
-
storageAdapter
|
|
4455
|
+
storageAdapter,
|
|
4456
|
+
s3SignedUrlExpirySeconds
|
|
4006
4457
|
}) {
|
|
4007
4458
|
"use step";
|
|
4008
4459
|
const s3AccessKeyId = env_default.S3_ACCESS_KEY_ID;
|
|
@@ -4025,10 +4476,11 @@ async function uploadDubbedAudioToS3({
|
|
|
4025
4476
|
region: s3Region,
|
|
4026
4477
|
bucket: s3Bucket,
|
|
4027
4478
|
key: audioKey,
|
|
4028
|
-
expiresInSeconds:
|
|
4479
|
+
expiresInSeconds: s3SignedUrlExpirySeconds ?? 86400
|
|
4029
4480
|
}, storageAdapter);
|
|
4481
|
+
const expiryHours = Math.round((s3SignedUrlExpirySeconds ?? 86400) / 3600);
|
|
4030
4482
|
console.warn(`\u2705 Audio uploaded successfully to: ${audioKey}`);
|
|
4031
|
-
console.warn(`\u{1F517} Generated presigned URL (expires in 1
|
|
4483
|
+
console.warn(`\u{1F517} Generated presigned URL (expires in ${expiryHours} hour${expiryHours === 1 ? "" : "s"})`);
|
|
4032
4484
|
return presignedUrl;
|
|
4033
4485
|
}
|
|
4034
4486
|
async function createAudioTrackOnMux(assetId, languageCode, presignedUrl, credentials) {
|
|
@@ -4192,7 +4644,8 @@ async function translateAudio(assetId, toLanguageCode, options = {}) {
|
|
|
4192
4644
|
s3Endpoint,
|
|
4193
4645
|
s3Region,
|
|
4194
4646
|
s3Bucket,
|
|
4195
|
-
storageAdapter: effectiveStorageAdapter
|
|
4647
|
+
storageAdapter: effectiveStorageAdapter,
|
|
4648
|
+
s3SignedUrlExpirySeconds: options.s3SignedUrlExpirySeconds
|
|
4196
4649
|
});
|
|
4197
4650
|
} catch (error) {
|
|
4198
4651
|
throw new Error(`Failed to upload audio to S3: ${error instanceof Error ? error.message : "Unknown error"}`);
|
|
@@ -4230,24 +4683,24 @@ async function translateAudio(assetId, toLanguageCode, options = {}) {
|
|
|
4230
4683
|
// src/workflows/translate-captions.ts
|
|
4231
4684
|
import {
|
|
4232
4685
|
APICallError,
|
|
4233
|
-
generateText as
|
|
4686
|
+
generateText as generateText6,
|
|
4234
4687
|
NoObjectGeneratedError,
|
|
4235
|
-
Output as
|
|
4688
|
+
Output as Output6,
|
|
4236
4689
|
RetryError,
|
|
4237
4690
|
TypeValidationError
|
|
4238
4691
|
} from "ai";
|
|
4239
|
-
import
|
|
4240
|
-
import { z as
|
|
4241
|
-
var translationSchema =
|
|
4242
|
-
translation:
|
|
4692
|
+
import dedent6 from "dedent";
|
|
4693
|
+
import { z as z7 } from "zod";
|
|
4694
|
+
var translationSchema = z7.object({
|
|
4695
|
+
translation: z7.string()
|
|
4243
4696
|
});
|
|
4244
|
-
var
|
|
4697
|
+
var SYSTEM_PROMPT5 = dedent6`
|
|
4245
4698
|
You are a subtitle translation expert. Translate VTT subtitle files to the target language specified by the user.
|
|
4246
4699
|
You may receive either a full VTT file or a chunk from a larger VTT.
|
|
4247
4700
|
Preserve all timestamps, cue ordering, and VTT formatting exactly as they appear.
|
|
4248
4701
|
Return JSON with a single key "translation" containing the translated VTT content.
|
|
4249
4702
|
`;
|
|
4250
|
-
var CUE_TRANSLATION_SYSTEM_PROMPT =
|
|
4703
|
+
var CUE_TRANSLATION_SYSTEM_PROMPT = dedent6`
|
|
4251
4704
|
You are a subtitle translation expert.
|
|
4252
4705
|
You will receive a sequence of subtitle cues extracted from a VTT file.
|
|
4253
4706
|
Translate the cues to the requested target language while preserving their original order.
|
|
@@ -4409,14 +4862,6 @@ function buildTranslationChunkRequests(vttContent, assetDurationSeconds, chunkin
|
|
|
4409
4862
|
)
|
|
4410
4863
|
};
|
|
4411
4864
|
}
|
|
4412
|
-
async function fetchVttFromMux(vttUrl) {
|
|
4413
|
-
"use step";
|
|
4414
|
-
const vttResponse = await fetch(vttUrl);
|
|
4415
|
-
if (!vttResponse.ok) {
|
|
4416
|
-
throw new Error(`Failed to fetch VTT file: ${vttResponse.statusText}`);
|
|
4417
|
-
}
|
|
4418
|
-
return vttResponse.text();
|
|
4419
|
-
}
|
|
4420
4865
|
async function translateVttWithAI({
|
|
4421
4866
|
vttContent,
|
|
4422
4867
|
fromLanguageCode,
|
|
@@ -4427,13 +4872,13 @@ async function translateVttWithAI({
|
|
|
4427
4872
|
}) {
|
|
4428
4873
|
"use step";
|
|
4429
4874
|
const model = await createLanguageModelFromConfig(provider, modelId, credentials);
|
|
4430
|
-
const response = await
|
|
4875
|
+
const response = await generateText6({
|
|
4431
4876
|
model,
|
|
4432
|
-
output:
|
|
4877
|
+
output: Output6.object({ schema: translationSchema }),
|
|
4433
4878
|
messages: [
|
|
4434
4879
|
{
|
|
4435
4880
|
role: "system",
|
|
4436
|
-
content:
|
|
4881
|
+
content: SYSTEM_PROMPT5
|
|
4437
4882
|
},
|
|
4438
4883
|
{
|
|
4439
4884
|
role: "user",
|
|
@@ -4464,8 +4909,8 @@ async function translateCueChunkWithAI({
|
|
|
4464
4909
|
}) {
|
|
4465
4910
|
"use step";
|
|
4466
4911
|
const model = await createLanguageModelFromConfig(provider, modelId, credentials);
|
|
4467
|
-
const schema =
|
|
4468
|
-
translations:
|
|
4912
|
+
const schema = z7.object({
|
|
4913
|
+
translations: z7.array(z7.string().min(1)).length(cues.length)
|
|
4469
4914
|
});
|
|
4470
4915
|
const cuePayload = cues.map((cue, index) => ({
|
|
4471
4916
|
index,
|
|
@@ -4473,9 +4918,9 @@ async function translateCueChunkWithAI({
|
|
|
4473
4918
|
endTime: cue.endTime,
|
|
4474
4919
|
text: cue.text
|
|
4475
4920
|
}));
|
|
4476
|
-
const response = await
|
|
4921
|
+
const response = await generateText6({
|
|
4477
4922
|
model,
|
|
4478
|
-
output:
|
|
4923
|
+
output: Output6.object({ schema }),
|
|
4479
4924
|
messages: [
|
|
4480
4925
|
{
|
|
4481
4926
|
role: "system",
|
|
@@ -4632,7 +5077,8 @@ async function uploadVttToS3({
|
|
|
4632
5077
|
s3Endpoint,
|
|
4633
5078
|
s3Region,
|
|
4634
5079
|
s3Bucket,
|
|
4635
|
-
storageAdapter
|
|
5080
|
+
storageAdapter,
|
|
5081
|
+
s3SignedUrlExpirySeconds
|
|
4636
5082
|
}) {
|
|
4637
5083
|
"use step";
|
|
4638
5084
|
const s3AccessKeyId = env_default.S3_ACCESS_KEY_ID;
|
|
@@ -4655,25 +5101,9 @@ async function uploadVttToS3({
|
|
|
4655
5101
|
region: s3Region,
|
|
4656
5102
|
bucket: s3Bucket,
|
|
4657
5103
|
key: vttKey,
|
|
4658
|
-
expiresInSeconds:
|
|
5104
|
+
expiresInSeconds: s3SignedUrlExpirySeconds ?? 86400
|
|
4659
5105
|
}, storageAdapter);
|
|
4660
5106
|
}
|
|
4661
|
-
async function createTextTrackOnMux(assetId, languageCode, trackName, presignedUrl, credentials) {
|
|
4662
|
-
"use step";
|
|
4663
|
-
const muxClient = await resolveMuxClient(credentials);
|
|
4664
|
-
const mux = await muxClient.createClient();
|
|
4665
|
-
const trackResponse = await mux.video.assets.createTrack(assetId, {
|
|
4666
|
-
type: "text",
|
|
4667
|
-
text_type: "subtitles",
|
|
4668
|
-
language_code: languageCode,
|
|
4669
|
-
name: trackName,
|
|
4670
|
-
url: presignedUrl
|
|
4671
|
-
});
|
|
4672
|
-
if (!trackResponse.id) {
|
|
4673
|
-
throw new Error("Failed to create text track: no track ID returned from Mux");
|
|
4674
|
-
}
|
|
4675
|
-
return trackResponse.id;
|
|
4676
|
-
}
|
|
4677
5107
|
async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, options) {
|
|
4678
5108
|
"use workflow";
|
|
4679
5109
|
const {
|
|
@@ -4791,7 +5221,8 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
|
|
|
4791
5221
|
s3Endpoint,
|
|
4792
5222
|
s3Region,
|
|
4793
5223
|
s3Bucket,
|
|
4794
|
-
storageAdapter: effectiveStorageAdapter
|
|
5224
|
+
storageAdapter: effectiveStorageAdapter,
|
|
5225
|
+
s3SignedUrlExpirySeconds: options.s3SignedUrlExpirySeconds
|
|
4795
5226
|
});
|
|
4796
5227
|
} catch (error) {
|
|
4797
5228
|
throw new Error(`Failed to upload VTT to S3: ${error instanceof Error ? error.message : "Unknown error"}`);
|
|
@@ -4824,23 +5255,33 @@ async function translateCaptions(assetId, fromLanguageCode, toLanguageCode, opti
|
|
|
4824
5255
|
};
|
|
4825
5256
|
}
|
|
4826
5257
|
export {
|
|
5258
|
+
DEFAULT_DESCRIPTION_LENGTH,
|
|
5259
|
+
DEFAULT_SUMMARY_KEYWORD_LIMIT,
|
|
5260
|
+
DEFAULT_TITLE_LENGTH,
|
|
4827
5261
|
HIVE_SEXUAL_CATEGORIES,
|
|
4828
5262
|
HIVE_VIOLENCE_CATEGORIES,
|
|
4829
|
-
SUMMARY_KEYWORD_LIMIT,
|
|
4830
5263
|
aggregateTokenUsage,
|
|
5264
|
+
applyOverrideLists,
|
|
5265
|
+
applyReplacements,
|
|
4831
5266
|
askQuestions,
|
|
5267
|
+
buildReplacementRegex,
|
|
4832
5268
|
burnedInCaptionsSchema,
|
|
5269
|
+
censorVttContent,
|
|
4833
5270
|
chapterSchema,
|
|
4834
5271
|
chaptersSchema,
|
|
5272
|
+
createReplacer,
|
|
5273
|
+
editCaptions,
|
|
4835
5274
|
generateChapters,
|
|
4836
5275
|
generateEmbeddings,
|
|
4837
5276
|
generateVideoEmbeddings,
|
|
4838
5277
|
getModerationScores,
|
|
4839
5278
|
getSummaryAndTags,
|
|
4840
5279
|
hasBurnedInCaptions,
|
|
5280
|
+
profanityDetectionSchema,
|
|
4841
5281
|
questionAnswerSchema,
|
|
4842
5282
|
shouldSplitChunkTranslationError,
|
|
4843
5283
|
summarySchema,
|
|
5284
|
+
transformCueText,
|
|
4844
5285
|
translateAudio,
|
|
4845
5286
|
translateCaptions,
|
|
4846
5287
|
translationSchema
|