mulmocast 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/lib/actions/audio.js +13 -18
  2. package/lib/actions/image_agents.d.ts +30 -6
  3. package/lib/actions/image_agents.js +5 -2
  4. package/lib/actions/image_references.js +2 -1
  5. package/lib/actions/images.d.ts +9 -1
  6. package/lib/actions/images.js +38 -13
  7. package/lib/actions/movie.js +3 -2
  8. package/lib/agents/add_bgm_agent.js +1 -1
  9. package/lib/agents/combine_audio_files_agent.js +10 -7
  10. package/lib/agents/image_google_agent.js +2 -2
  11. package/lib/agents/image_openai_agent.js +2 -2
  12. package/lib/agents/movie_replicate_agent.js +1 -1
  13. package/lib/agents/tts_elevenlabs_agent.d.ts +2 -1
  14. package/lib/agents/tts_elevenlabs_agent.js +4 -3
  15. package/lib/agents/tts_google_agent.d.ts +2 -9
  16. package/lib/agents/tts_nijivoice_agent.d.ts +2 -1
  17. package/lib/agents/tts_nijivoice_agent.js +3 -3
  18. package/lib/agents/tts_openai_agent.d.ts +2 -13
  19. package/lib/agents/tts_openai_agent.js +4 -3
  20. package/lib/index.browser.d.ts +1 -0
  21. package/lib/index.browser.js +1 -0
  22. package/lib/index.d.ts +1 -0
  23. package/lib/index.js +2 -0
  24. package/lib/methods/mulmo_presentation_style.d.ts +2 -1
  25. package/lib/methods/mulmo_presentation_style.js +21 -17
  26. package/lib/types/agent.d.ts +29 -2
  27. package/lib/types/agent.js +0 -1
  28. package/lib/types/schema.d.ts +596 -485
  29. package/lib/types/schema.js +15 -11
  30. package/lib/utils/const.d.ts +0 -1
  31. package/lib/utils/const.js +0 -1
  32. package/lib/utils/context.d.ts +36 -30
  33. package/lib/utils/ffmpeg_utils.d.ts +4 -1
  34. package/lib/utils/ffmpeg_utils.js +2 -1
  35. package/lib/utils/preprocess.d.ts +28 -24
  36. package/lib/utils/provider2agent.d.ts +76 -0
  37. package/lib/utils/provider2agent.js +87 -0
  38. package/lib/utils/utils.d.ts +6 -11
  39. package/lib/utils/utils.js +5 -26
  40. package/package.json +2 -2
@@ -1,7 +1,7 @@
1
1
  import "dotenv/config";
2
- import { userAssert, llmConfig } from "../utils/utils.js";
2
+ import { userAssert } from "../utils/utils.js";
3
3
  import { text2ImageProviderSchema, text2HtmlImageProviderSchema, text2SpeechProviderSchema, mulmoCanvasDimensionSchema } from "../types/schema.js";
4
- import { defaultOpenAIImageModel } from "../utils/const.js";
4
+ import { defaultProviders, provider2ImageAgent, provider2MovieAgent, provider2LLMAgent } from "../utils/provider2agent.js";
5
5
  const defaultTextSlideStyles = [
6
6
  '*,*::before,*::after{box-sizing:border-box}body,h1,h2,h3,h4,p,figure,blockquote,dl,dd{margin:0}ul[role="list"],ol[role="list"]{list-style:none}html:focus-within{scroll-behavior:smooth}body{min-height:100vh;text-rendering:optimizeSpeed;line-height:1.5}a:not([class]){text-decoration-skip-ink:auto}img,picture{max-width:100%;display:block}input,button,textarea,select{font:inherit}@media(prefers-reduced-motion:reduce){html:focus-within{scroll-behavior:auto}*,*::before,*::after{animation-duration:.01ms !important;animation-iteration-count:1 !important;transition-duration:.01ms !important;scroll-behavior:auto !important}}',
7
7
  "body { margin: 60px; margin-top: 40px; color:#333; font-size: 30px; font-family: Arial, sans-serif; box-sizing: border-box; height: 100vh }",
@@ -49,10 +49,14 @@ export const MulmoPresentationStyleMethods = {
49
49
  userAssert(!!speaker, `speaker is not set: speaker "${beat.speaker}"`);
50
50
  return speaker;
51
51
  },
52
- getProvider(presentationStyle, beat) {
52
+ getTTSProvider(presentationStyle, beat) {
53
53
  const speaker = MulmoPresentationStyleMethods.getSpeaker(presentationStyle, beat);
54
54
  return speaker.provider ?? presentationStyle.speechParams.provider;
55
55
  },
56
+ getTTSModel(presentationStyle, beat) {
57
+ const speaker = MulmoPresentationStyleMethods.getSpeaker(presentationStyle, beat);
58
+ return speaker.model ?? presentationStyle.speechParams.model;
59
+ },
56
60
  getVoiceId(presentationStyle, beat) {
57
61
  const speaker = MulmoPresentationStyleMethods.getSpeaker(presentationStyle, beat);
58
62
  return speaker.voiceId;
@@ -65,46 +69,46 @@ export const MulmoPresentationStyleMethods = {
65
69
  // provider and model appropriately.
66
70
  const imageParams = { ...presentationStyle.imageParams, ...beat?.imageParams };
67
71
  const provider = MulmoPresentationStyleMethods.getText2ImageProvider(imageParams?.provider);
72
+ const agentInfo = provider2ImageAgent[provider];
73
+ // The default text2image model is gpt-image-1 from OpenAI, and to use it you must have an OpenAI account and have verified your identity. If this is not possible, please specify dall-e-3 as the model.
68
74
  const defaultImageParams = {
69
75
  provider,
70
- model: provider === "openai" ? (process.env.DEFAULT_OPENAI_IMAGE_MODEL ?? defaultOpenAIImageModel) : undefined,
76
+ model: agentInfo.defaultModel,
71
77
  };
72
78
  return {
73
- agent: provider === "google" ? "imageGoogleAgent" : "imageOpenaiAgent",
79
+ agent: agentInfo.agentName,
74
80
  imageParams: { ...defaultImageParams, ...imageParams },
75
81
  };
76
82
  },
77
83
  // Determine movie agent based on provider
78
84
  getMovieAgent(presentationStyle) {
79
- const movieProvider = presentationStyle.movieParams?.provider ?? "google";
80
- switch (movieProvider) {
81
- case "replicate":
82
- return "movieReplicateAgent";
83
- case "google":
84
- default:
85
- return "movieGoogleAgent";
86
- }
85
+ const movieProvider = (presentationStyle.movieParams?.provider ?? defaultProviders.text2movie);
86
+ return provider2MovieAgent[movieProvider].agentName;
87
87
  },
88
88
  getConcurrency(presentationStyle) {
89
+ /*
89
90
  if (presentationStyle.movieParams?.provider === "replicate") {
90
- return 4;
91
+ return 4;
91
92
  }
93
+ */
92
94
  const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(presentationStyle);
93
95
  if (imageAgentInfo.imageParams.provider === "openai") {
94
96
  // NOTE: Here are the rate limits of OpenAI's text2image API (1token = 32x32 patch).
95
97
  // dall-e-3: 7,500 RPM、15 images per minute (4 images for max resolution)
96
98
  // gpt-image-1:3,000,000 TPM、150 images per minute
97
- return imageAgentInfo.imageParams.model === defaultOpenAIImageModel ? 4 : 16;
99
+ if (imageAgentInfo.imageParams.model === provider2ImageAgent.openai.defaultModel) {
100
+ return 16;
101
+ }
98
102
  }
99
103
  return 4;
100
104
  },
101
105
  getHtmlImageAgentInfo(presentationStyle) {
102
106
  const provider = text2HtmlImageProviderSchema.parse(presentationStyle.htmlImageParams?.provider);
103
- const defaultConfig = llmConfig[provider];
107
+ const defaultConfig = provider2LLMAgent[provider];
104
108
  const model = presentationStyle.htmlImageParams?.model ? presentationStyle.htmlImageParams?.model : defaultConfig.defaultModel;
105
109
  return {
106
110
  provider,
107
- agent: defaultConfig.agent,
111
+ agent: defaultConfig.agentName,
108
112
  model,
109
113
  max_tokens: defaultConfig.max_tokens,
110
114
  };
@@ -13,6 +13,15 @@ export type AgentBufferResult = {
13
13
  export type AgentPromptInputs = {
14
14
  prompt: string;
15
15
  };
16
+ export type AgentTextInputs = {
17
+ text: string;
18
+ };
19
+ export type AgentErrorResult = {
20
+ error: unknown;
21
+ };
22
+ export type AgentConfig = {
23
+ apiKey?: string;
24
+ };
16
25
  export type ImageAgentInputs = AgentPromptInputs;
17
26
  export type OpenAIImageAgentInputs = AgentPromptInputs & {
18
27
  referenceImages: string[] | null | undefined;
@@ -50,6 +59,24 @@ export type ReplicateMovieAgentParams = {
50
59
  duration?: number;
51
60
  };
52
61
  export type GoogleMovieAgentConfig = GoogleImageAgentConfig;
53
- export type ReplicateMovieAgentConfig = {
54
- apiKey?: string;
62
+ export type ReplicateMovieAgentConfig = AgentConfig;
63
+ export type TTSAgentParams = {
64
+ suppressError: boolean;
65
+ voice: string;
66
+ };
67
+ export type OpenAITTSAgentParams = TTSAgentParams & {
68
+ instructions: string;
69
+ model: string;
70
+ };
71
+ export type NijivoiceTTSAgentParams = TTSAgentParams & {
72
+ speed: number;
73
+ speed_global: number;
74
+ };
75
+ export type GoogleTTSAgentParams = TTSAgentParams & {
76
+ speed: number;
77
+ };
78
+ export type ElevenlabsTTSAgentParams = TTSAgentParams & {
79
+ model: string;
80
+ stability: number;
81
+ similarityBoost: number;
55
82
  };
@@ -1,3 +1,2 @@
1
1
  // for image agent
2
2
  export {};
3
- // end of image agent