@mux/ai 0.7.6 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- export { A as AskQuestionsOptions, a as AskQuestionsResult, b as AskQuestionsType, c as AudioTranslationOptions, d as AudioTranslationResult, B as BurnedInCaptionsAnalysis, e as BurnedInCaptionsOptions, f as BurnedInCaptionsPromptOverrides, g as BurnedInCaptionsPromptSections, h as BurnedInCaptionsResult, C as Chapter, j as ChapterSystemPromptSections, k as ChaptersOptions, l as ChaptersPromptOverrides, m as ChaptersPromptSections, n as ChaptersResult, o as ChaptersType, E as EmbeddingsOptions, p as EmbeddingsResult, H as HIVE_SEXUAL_CATEGORIES, q as HIVE_VIOLENCE_CATEGORIES, r as HiveModerationOutput, s as HiveModerationSource, M as ModerationOptions, t as ModerationProvider, u as ModerationResult, Q as Question, v as QuestionAnswer, w as QuestionAnswerType, S as SUMMARY_KEYWORD_LIMIT, x as SummarizationOptions, y as SummarizationPromptOverrides, z as SummarizationPromptSections, D as SummaryAndTagsResult, F as SummaryType, T as ThumbnailModerationScore, G as TranslationOptions, I as TranslationPayload, J as TranslationResult, K as askQuestions, L as burnedInCaptionsSchema, N as chapterSchema, O as chaptersSchema, P as generateChapters, R as generateEmbeddings, U as generateVideoEmbeddings, V as getModerationScores, W as getSummaryAndTags, X as hasBurnedInCaptions, Y as questionAnswerSchema, Z as summarySchema, _ as translateAudio, $ as translateCaptions, a0 as translationSchema } from '../index-B0U9upb4.js';
1
+ export { A as AskQuestionsOptions, a as AskQuestionsResult, b as AskQuestionsType, c as AudioTranslationOptions, d as AudioTranslationResult, B as BurnedInCaptionsAnalysis, e as BurnedInCaptionsOptions, f as BurnedInCaptionsPromptOverrides, g as BurnedInCaptionsPromptSections, h as BurnedInCaptionsResult, C as Chapter, j as ChapterSystemPromptSections, k as ChaptersOptions, l as ChaptersPromptOverrides, m as ChaptersPromptSections, n as ChaptersResult, o as ChaptersType, E as EmbeddingsOptions, p as EmbeddingsResult, H as HIVE_SEXUAL_CATEGORIES, q as HIVE_VIOLENCE_CATEGORIES, r as HiveModerationOutput, s as HiveModerationSource, M as ModerationOptions, t as ModerationProvider, u as ModerationResult, Q as Question, v as QuestionAnswer, w as QuestionAnswerType, S as SUMMARY_KEYWORD_LIMIT, x as SummarizationOptions, y as SummarizationPromptOverrides, z as SummarizationPromptSections, D as SummaryAndTagsResult, F as SummaryType, T as ThumbnailModerationScore, G as TranslationOptions, I as TranslationPayload, J as TranslationResult, K as askQuestions, L as burnedInCaptionsSchema, N as chapterSchema, O as chaptersSchema, P as generateChapters, R as generateEmbeddings, U as generateVideoEmbeddings, V as getModerationScores, W as getSummaryAndTags, X as hasBurnedInCaptions, Y as questionAnswerSchema, Z as summarySchema, _ as translateAudio, $ as translateCaptions, a0 as translationSchema } from '../index-DP02N3iR.js';
2
2
  import 'zod';
3
3
  import '@ai-sdk/anthropic';
4
4
  import '@ai-sdk/google';
@@ -1170,16 +1170,6 @@ var SYSTEM_PROMPT = dedent`
1170
1170
  - GOOD: "A person runs through a park"
1171
1171
  - Be specific and evidence-based
1172
1172
  </language_guidelines>`;
1173
- function buildSystemPrompt(allowedAnswers) {
1174
- const answerList = allowedAnswers.map((answer) => `"${answer}"`).join(", ");
1175
- return `${SYSTEM_PROMPT}
1176
-
1177
- ${dedent`
1178
- <response_options>
1179
- Allowed answers: ${answerList}
1180
- </response_options>
1181
- `}`;
1182
- }
1183
1173
  var askQuestionsPromptBuilder = createPromptBuilder({
1184
1174
  template: {
1185
1175
  questions: {
@@ -1189,21 +1179,30 @@ var askQuestionsPromptBuilder = createPromptBuilder({
1189
1179
  },
1190
1180
  sectionOrder: ["questions"]
1191
1181
  });
1192
- function buildUserPrompt(questions, transcriptText, isCleanTranscript = true) {
1182
+ function buildUserPrompt(questions, allowedAnswers, transcriptText, isCleanTranscript = true) {
1193
1183
  const questionsList = questions.map((q, idx) => `${idx + 1}. ${q.question}`).join("\n");
1194
1184
  const questionsContent = dedent`
1195
1185
  Please answer the following yes/no questions about this video:
1196
1186
 
1197
1187
  ${questionsList}`;
1188
+ const answerList = allowedAnswers.map((answer) => `"${answer}"`).join(", ");
1189
+ const responseOptions = dedent`
1190
+ <response_options>
1191
+ Allowed answers: ${answerList}
1192
+ </response_options>`;
1193
+ const questionsSection = askQuestionsPromptBuilder.build({ questions: questionsContent });
1198
1194
  if (!transcriptText) {
1199
- return askQuestionsPromptBuilder.build({ questions: questionsContent });
1195
+ return `${questionsSection}
1196
+
1197
+ ${responseOptions}`;
1200
1198
  }
1201
1199
  const format = isCleanTranscript ? "plain text" : "WebVTT";
1202
- const transcriptSection = createTranscriptSection(transcriptText, format);
1203
- return askQuestionsPromptBuilder.buildWithContext(
1204
- { questions: questionsContent },
1205
- [transcriptSection]
1206
- );
1200
+ const transcriptSection = renderSection(createTranscriptSection(transcriptText, format));
1201
+ return `${transcriptSection}
1202
+
1203
+ ${questionsSection}
1204
+
1205
+ ${responseOptions}`;
1207
1206
  }
1208
1207
  async function fetchImageAsBase64(imageUrl, imageDownloadOptions) {
1209
1208
  "use step";
@@ -1299,8 +1298,8 @@ async function askQuestions(assetId, questions, options) {
1299
1298
  cleanTranscript,
1300
1299
  shouldSign: policy === "signed"
1301
1300
  })).transcriptText : "";
1302
- const userPrompt = buildUserPrompt(questions, transcriptText, cleanTranscript);
1303
- const systemPrompt = buildSystemPrompt(normalizedAnswerOptions);
1301
+ const userPrompt = buildUserPrompt(questions, allowedAnswers, transcriptText, cleanTranscript);
1302
+ const systemPrompt = SYSTEM_PROMPT;
1304
1303
  const imageUrl = await getStoryboardUrl(
1305
1304
  playbackId,
1306
1305
  storyboardWidth,
@@ -2139,7 +2138,7 @@ async function getThumbnailUrls(playbackId, duration, options = {}) {
2139
2138
 
2140
2139
  // src/workflows/moderation.ts
2141
2140
  var DEFAULT_THRESHOLDS = {
2142
- sexual: 0.7,
2141
+ sexual: 0.8,
2143
2142
  violence: 0.8
2144
2143
  };
2145
2144
  var DEFAULT_PROVIDER2 = "openai";
@@ -2581,96 +2580,106 @@ var TONE_INSTRUCTIONS = {
2581
2580
  playful: "Channel your inner diva! Answer with maximum sass, wit, and playful attitude. Don't hold back - be cheeky, clever, and delightfully snarky. Make it pop!",
2582
2581
  professional: "Provide a professional, executive-level analysis suitable for business reporting."
2583
2582
  };
2584
- var summarizationPromptBuilder = createPromptBuilder({
2585
- template: {
2586
- task: {
2587
- tag: "task",
2588
- content: "Analyze the storyboard frames and generate metadata that captures the essence of the video content."
2589
- },
2590
- title: {
2591
- tag: "title_requirements",
2592
- content: dedent4`
2593
- A short, compelling headline that immediately communicates the subject or action.
2594
- Aim for brevity - typically under 10 words. Think of how a news headline or video card title would read.
2595
- Start with the primary subject, action, or topic - never begin with "A video of" or similar phrasing.
2596
- Use active, specific language.`
2597
- },
2598
- description: {
2599
- tag: "description_requirements",
2600
- content: dedent4`
2601
- A concise summary (2-4 sentences) that describes what happens across the video.
2602
- Cover the main subjects, actions, setting, and any notable progression visible across frames.
2603
- Write in present tense. Be specific about observable details rather than making assumptions.
2604
- If the transcript provides dialogue or narration, incorporate key points but prioritize visual content.`
2605
- },
2606
- keywords: {
2607
- tag: "keywords_requirements",
2608
- content: dedent4`
2609
- Specific, searchable terms (up to ${SUMMARY_KEYWORD_LIMIT}) that capture:
2610
- - Primary subjects (people, animals, objects)
2611
- - Actions and activities being performed
2612
- - Setting and environment
2613
- - Notable objects or tools
2614
- - Style or genre (if applicable)
2615
- Prefer concrete nouns and action verbs over abstract concepts.
2616
- Use lowercase. Avoid redundant or overly generic terms like "video" or "content".`
2617
- },
2618
- qualityGuidelines: {
2619
- tag: "quality_guidelines",
2620
- content: dedent4`
2621
- - Examine all frames to understand the full context and progression
2622
- - Be precise: "golden retriever" is better than "dog" when identifiable
2623
- - Capture the narrative: what begins, develops, and concludes
2624
- - Balance brevity with informativeness`
2625
- }
2626
- },
2627
- sectionOrder: ["task", "title", "description", "keywords", "qualityGuidelines"]
2628
- });
2629
- var audioOnlyPromptBuilder = createPromptBuilder({
2630
- template: {
2631
- task: {
2632
- tag: "task",
2633
- content: "Analyze the transcript and generate metadata that captures the essence of the audio content."
2634
- },
2635
- title: {
2636
- tag: "title_requirements",
2637
- content: dedent4`
2638
- A short, compelling headline that immediately communicates the subject or topic.
2639
- Aim for brevity - typically under 10 words. Think of how a podcast title or audio description would read.
2640
- Start with the primary subject, action, or topic - never begin with "An audio of" or similar phrasing.
2641
- Use active, specific language.`
2642
- },
2643
- description: {
2644
- tag: "description_requirements",
2645
- content: dedent4`
2646
- A concise summary (2-4 sentences) that describes the audio content.
2647
- Cover the main topics, speakers, themes, and any notable progression in the discussion or narration.
2648
- Write in present tense. Be specific about what is discussed or presented rather than making assumptions.
2649
- Focus on the spoken content and any key insights, dialogue, or narrative elements.`
2583
+ function createSummarizationBuilder({ titleLength, descriptionLength, tagCount } = {}) {
2584
+ const titleBrevity = titleLength != null ? `Aim for approximately ${titleLength} characters.` : "Aim for brevity - typically under 10 words.";
2585
+ const descConstraint = descriptionLength != null ? `approximately ${descriptionLength} characters` : "2-4 sentences";
2586
+ const keywordLimit = tagCount ?? SUMMARY_KEYWORD_LIMIT;
2587
+ return createPromptBuilder({
2588
+ template: {
2589
+ task: {
2590
+ tag: "task",
2591
+ content: "Analyze the storyboard frames and generate metadata that captures the essence of the video content."
2592
+ },
2593
+ title: {
2594
+ tag: "title_requirements",
2595
+ content: dedent4`
2596
+ A short, compelling headline that immediately communicates the subject or action.
2597
+ ${titleBrevity} Think of how a news headline or video card title would read.
2598
+ Start with the primary subject, action, or topic - never begin with "A video of" or similar phrasing.
2599
+ Use active, specific language.`
2600
+ },
2601
+ description: {
2602
+ tag: "description_requirements",
2603
+ content: dedent4`
2604
+ A concise summary (${descConstraint}) that describes what happens across the video.
2605
+ Cover the main subjects, actions, setting, and any notable progression visible across frames.
2606
+ Write in present tense. Be specific about observable details rather than making assumptions.
2607
+ If the transcript provides dialogue or narration, incorporate key points but prioritize visual content.`
2608
+ },
2609
+ keywords: {
2610
+ tag: "keywords_requirements",
2611
+ content: dedent4`
2612
+ Specific, searchable terms (up to ${keywordLimit}) that capture:
2613
+ - Primary subjects (people, animals, objects)
2614
+ - Actions and activities being performed
2615
+ - Setting and environment
2616
+ - Notable objects or tools
2617
+ - Style or genre (if applicable)
2618
+ Prefer concrete nouns and action verbs over abstract concepts.
2619
+ Use lowercase. Avoid redundant or overly generic terms like "video" or "content".`
2620
+ },
2621
+ qualityGuidelines: {
2622
+ tag: "quality_guidelines",
2623
+ content: dedent4`
2624
+ - Examine all frames to understand the full context and progression
2625
+ - Be precise: "golden retriever" is better than "dog" when identifiable
2626
+ - Capture the narrative: what begins, develops, and concludes
2627
+ - Balance brevity with informativeness`
2628
+ }
2650
2629
  },
2651
- keywords: {
2652
- tag: "keywords_requirements",
2653
- content: dedent4`
2654
- Specific, searchable terms (up to ${SUMMARY_KEYWORD_LIMIT}) that capture:
2655
- - Primary topics and themes
2656
- - Speakers or presenters (if named)
2657
- - Key concepts and terminology
2658
- - Content type (interview, lecture, music, etc.)
2659
- - Genre or style (if applicable)
2660
- Prefer concrete nouns and relevant terms over abstract concepts.
2661
- Use lowercase. Avoid redundant or overly generic terms like "audio" or "content".`
2630
+ sectionOrder: ["task", "title", "description", "keywords", "qualityGuidelines"]
2631
+ });
2632
+ }
2633
+ function createAudioOnlyBuilder({ titleLength, descriptionLength, tagCount } = {}) {
2634
+ const titleBrevity = titleLength != null ? `Aim for approximately ${titleLength} characters.` : "Aim for brevity - typically under 10 words.";
2635
+ const descConstraint = descriptionLength != null ? `approximately ${descriptionLength} characters` : "2-4 sentences";
2636
+ const keywordLimit = tagCount ?? SUMMARY_KEYWORD_LIMIT;
2637
+ return createPromptBuilder({
2638
+ template: {
2639
+ task: {
2640
+ tag: "task",
2641
+ content: "Analyze the transcript and generate metadata that captures the essence of the audio content."
2642
+ },
2643
+ title: {
2644
+ tag: "title_requirements",
2645
+ content: dedent4`
2646
+ A short, compelling headline that immediately communicates the subject or topic.
2647
+ ${titleBrevity} Think of how a podcast title or audio description would read.
2648
+ Start with the primary subject, action, or topic - never begin with "An audio of" or similar phrasing.
2649
+ Use active, specific language.`
2650
+ },
2651
+ description: {
2652
+ tag: "description_requirements",
2653
+ content: dedent4`
2654
+ A concise summary (${descConstraint}) that describes the audio content.
2655
+ Cover the main topics, speakers, themes, and any notable progression in the discussion or narration.
2656
+ Write in present tense. Be specific about what is discussed or presented rather than making assumptions.
2657
+ Focus on the spoken content and any key insights, dialogue, or narrative elements.`
2658
+ },
2659
+ keywords: {
2660
+ tag: "keywords_requirements",
2661
+ content: dedent4`
2662
+ Specific, searchable terms (up to ${keywordLimit}) that capture:
2663
+ - Primary topics and themes
2664
+ - Speakers or presenters (if named)
2665
+ - Key concepts and terminology
2666
+ - Content type (interview, lecture, music, etc.)
2667
+ - Genre or style (if applicable)
2668
+ Prefer concrete nouns and relevant terms over abstract concepts.
2669
+ Use lowercase. Avoid redundant or overly generic terms like "audio" or "content".`
2670
+ },
2671
+ qualityGuidelines: {
2672
+ tag: "quality_guidelines",
2673
+ content: dedent4`
2674
+ - Analyze the full transcript to understand context and themes
2675
+ - Be precise: use specific terminology when mentioned
2676
+ - Capture the narrative: what is introduced, discussed, and concluded
2677
+ - Balance brevity with informativeness`
2678
+ }
2662
2679
  },
2663
- qualityGuidelines: {
2664
- tag: "quality_guidelines",
2665
- content: dedent4`
2666
- - Analyze the full transcript to understand context and themes
2667
- - Be precise: use specific terminology when mentioned
2668
- - Capture the narrative: what is introduced, discussed, and concluded
2669
- - Balance brevity with informativeness`
2670
- }
2671
- },
2672
- sectionOrder: ["task", "title", "description", "keywords", "qualityGuidelines"]
2673
- });
2680
+ sectionOrder: ["task", "title", "description", "keywords", "qualityGuidelines"]
2681
+ });
2682
+ }
2674
2683
  var SYSTEM_PROMPT3 = dedent4`
2675
2684
  <role>
2676
2685
  You are a video content analyst specializing in storyboard interpretation and multimodal analysis.
@@ -2785,14 +2794,18 @@ function buildUserPrompt4({
2785
2794
  transcriptText,
2786
2795
  isCleanTranscript = true,
2787
2796
  promptOverrides,
2788
- isAudioOnly = false
2797
+ isAudioOnly = false,
2798
+ titleLength,
2799
+ descriptionLength,
2800
+ tagCount
2789
2801
  }) {
2790
2802
  const contextSections = [createToneSection(TONE_INSTRUCTIONS[tone])];
2791
2803
  if (transcriptText) {
2792
2804
  const format = isCleanTranscript ? "plain text" : "WebVTT";
2793
2805
  contextSections.push(createTranscriptSection(transcriptText, format));
2794
2806
  }
2795
- const promptBuilder = isAudioOnly ? audioOnlyPromptBuilder : summarizationPromptBuilder;
2807
+ const constraints = { titleLength, descriptionLength, tagCount };
2808
+ const promptBuilder = isAudioOnly ? createAudioOnlyBuilder(constraints) : createSummarizationBuilder(constraints);
2796
2809
  return promptBuilder.buildWithContext(promptOverrides, contextSections);
2797
2810
  }
2798
2811
  async function analyzeStoryboard2(imageDataUrl, provider, modelId, userPrompt, systemPrompt, credentials) {
@@ -2862,7 +2875,7 @@ async function analyzeAudioOnly(provider, modelId, userPrompt, systemPrompt, cre
2862
2875
  }
2863
2876
  };
2864
2877
  }
2865
- function normalizeKeywords(keywords) {
2878
+ function normalizeKeywords(keywords, limit = SUMMARY_KEYWORD_LIMIT) {
2866
2879
  if (!Array.isArray(keywords) || keywords.length === 0) {
2867
2880
  return [];
2868
2881
  }
@@ -2879,7 +2892,7 @@ function normalizeKeywords(keywords) {
2879
2892
  }
2880
2893
  uniqueLowercase.add(lower);
2881
2894
  normalized.push(trimmed);
2882
- if (normalized.length === SUMMARY_KEYWORD_LIMIT) {
2895
+ if (normalized.length === limit) {
2883
2896
  break;
2884
2897
  }
2885
2898
  }
@@ -2896,7 +2909,10 @@ async function getSummaryAndTags(assetId, options) {
2896
2909
  imageSubmissionMode = "url",
2897
2910
  imageDownloadOptions,
2898
2911
  promptOverrides,
2899
- credentials
2912
+ credentials,
2913
+ titleLength,
2914
+ descriptionLength,
2915
+ tagCount
2900
2916
  } = options ?? {};
2901
2917
  if (!VALID_TONES.includes(tone)) {
2902
2918
  throw new Error(
@@ -2934,7 +2950,10 @@ async function getSummaryAndTags(assetId, options) {
2934
2950
  transcriptText,
2935
2951
  isCleanTranscript: cleanTranscript,
2936
2952
  promptOverrides,
2937
- isAudioOnly
2953
+ isAudioOnly,
2954
+ titleLength,
2955
+ descriptionLength,
2956
+ tagCount
2938
2957
  });
2939
2958
  let analysisResponse;
2940
2959
  let imageUrl;
@@ -2991,7 +3010,7 @@ async function getSummaryAndTags(assetId, options) {
2991
3010
  assetId,
2992
3011
  title: analysisResponse.result.title,
2993
3012
  description: analysisResponse.result.description,
2994
- tags: normalizeKeywords(analysisResponse.result.keywords),
3013
+ tags: normalizeKeywords(analysisResponse.result.keywords, tagCount ?? SUMMARY_KEYWORD_LIMIT),
2995
3014
  storyboardUrl: imageUrl,
2996
3015
  // undefined for audio-only assets
2997
3016
  usage: {
@@ -3823,6 +3842,7 @@ import { z as z6 } from "zod";
3823
3842
  var translationSchema = z6.object({
3824
3843
  translation: z6.string()
3825
3844
  });
3845
+ var SYSTEM_PROMPT4 = 'You are a subtitle translation expert. Translate VTT subtitle files to the target language specified by the user. Preserve all timestamps and VTT formatting exactly as they appear. Return JSON with a single key "translation" containing the translated VTT content.';
3826
3846
  async function fetchVttFromMux(vttUrl) {
3827
3847
  "use step";
3828
3848
  const vttResponse = await fetch(vttUrl);
@@ -3845,9 +3865,13 @@ async function translateVttWithAI({
3845
3865
  model,
3846
3866
  output: Output5.object({ schema: translationSchema }),
3847
3867
  messages: [
3868
+ {
3869
+ role: "system",
3870
+ content: SYSTEM_PROMPT4
3871
+ },
3848
3872
  {
3849
3873
  role: "user",
3850
- content: `Translate the following VTT subtitle file from ${fromLanguageCode} to ${toLanguageCode}. Preserve all timestamps and VTT formatting exactly as they appear. Return JSON with a single key "translation" containing the translated VTT.
3874
+ content: `Translate from ${fromLanguageCode} to ${toLanguageCode}:
3851
3875
 
3852
3876
  ${vttContent}`
3853
3877
  }