mulmocast 2.1.19 → 2.1.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,5 +1,7 @@
 # MulmoCast: A Multi-Modal Presentation Tool for the AI-Native Era
 
+[![npm version](https://badge.fury.io/js/mulmocast.svg)](https://www.npmjs.com/package/mulmocast)
+
 ## Quick Start Guide
 
 If you want to try our beta version, follow the instruction in the release note below.
@@ -16,9 +16,9 @@
   },
   "speechParams": {
     "speakers": {
-      "Announcer": { "provider": "nijivoice", "displayName": { "ja": "アナウンサー" }, "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c" },
-      "Student": { "provider": "nijivoice", "displayName": { "ja": "太郎" }, "voiceId": "a7619e48-bf6a-4f9f-843f-40485651257f" },
-      "Teacher": { "provider": "nijivoice", "displayName": { "ja": "先生" }, "voiceId": "bc06c63f-fef6-43b6-92f7-67f919bd5dae" }
+      "Announcer": { "provider": "gemini", "displayName": { "ja": "アナウンサー" }, "voiceId": "Aoede" },
+      "Student": { "provider": "gemini", "displayName": { "ja": "太郎" }, "voiceId": "Puck" },
+      "Teacher": { "provider": "gemini", "displayName": { "ja": "先生" }, "voiceId": "Charon" }
     }
   }
 },
@@ -2,9 +2,9 @@ import dotenv from "dotenv";
 import { GraphAI, TaskManager, GraphAILogger } from "graphai";
 import * as agents from "@graphai/vanilla";
 import { fileWriteAgent } from "@graphai/vanilla_node_agents";
-import { ttsNijivoiceAgent, ttsOpenaiAgent, ttsGoogleAgent, ttsGeminiAgent, ttsElevenlabsAgent, ttsKotodamaAgent, addBGMAgent, combineAudioFilesAgent, mediaMockAgent, } from "../agents/index.js";
+import { ttsOpenaiAgent, ttsGoogleAgent, ttsGeminiAgent, ttsElevenlabsAgent, ttsKotodamaAgent, addBGMAgent, combineAudioFilesAgent, mediaMockAgent, } from "../agents/index.js";
 import { text2SpeechProviderSchema } from "../types/index.js";
-import { fileCacheAgentFilter, nijovoiceTextAgentFilter } from "../utils/filters.js";
+import { fileCacheAgentFilter } from "../utils/filters.js";
 import { getAudioArtifactFilePath, getAudioFilePath, getOutputStudioFilePath, resolveDirPath, defaultBGMPath, mkdir, writingMessage } from "../utils/file.js";
 import { localizedText, settings2GraphAIConfig } from "../utils/utils.js";
 import { text2hash } from "../utils/utils_node.js";
@@ -214,14 +214,9 @@ const agentFilters = [
     agent: fileCacheAgentFilter,
     nodeIds: ["tts"],
   },
-  {
-    name: "nijovoiceTextAgentFilter",
-    agent: nijovoiceTextAgentFilter,
-    nodeIds: ["tts"],
-  },
 ];
 const getConcurrency = (context) => {
-  // Check if any speaker uses nijivoice or elevenlabs (providers that require concurrency = 1)
+  // Check if any speaker uses elevenlabs or kotodama (providers that require concurrency = 1)
   const hasLimitedConcurrencyProvider = Object.values(context.presentationStyle.speechParams.speakers).some((speaker) => {
     const provider = text2SpeechProviderSchema.parse(speaker.provider);
     return provider2TTSAgent[provider].hasLimitedConcurrency;
@@ -232,7 +227,6 @@ const audioAgents = {
   ...vanillaAgents,
   fileWriteAgent,
   ttsOpenaiAgent,
-  ttsNijivoiceAgent,
   ttsGoogleAgent,
   ttsGeminiAgent,
   ttsKotodamaAgent,
@@ -7,6 +7,105 @@ import { renderHTMLToImage, interpolate } from "../utils/markdown.js";
 import { MulmoStudioContextMethods, MulmoPresentationStyleMethods } from "../methods/index.js";
 import { fileWriteAgent } from "@graphai/vanilla_node_agents";
 const vanillaAgents = agents.default ?? agents;
+const defaultDelimiters = ["。", "?", "!", ".", "?", "!"];
+// Split text by delimiters while keeping delimiters attached to the preceding text
+const splitTextByDelimiters = (text, delimiters) => {
+  if (!text || delimiters.length === 0) {
+    return [text];
+  }
+  const { segments, current } = [...text].reduce((acc, char) => {
+    const newCurrent = acc.current + char;
+    if (delimiters.includes(char)) {
+      const trimmed = newCurrent.trim();
+      return {
+        segments: trimmed ? [...acc.segments, trimmed] : acc.segments,
+        current: "",
+      };
+    }
+    return { ...acc, current: newCurrent };
+  }, { segments: [], current: "" });
+  const finalSegments = current.trim() ? [...segments, current.trim()] : segments;
+  return finalSegments.length > 0 ? finalSegments : [text];
+};
+// Get split texts based on settings
+const getSplitTexts = (text, texts, textSplit) => {
+  // Manual split takes precedence
+  if (texts && texts.length > 0) {
+    return texts;
+  }
+  // No splitting or undefined
+  if (!textSplit || textSplit.type === "none") {
+    return [text];
+  }
+  // Split by delimiters
+  if (textSplit.type === "delimiters") {
+    const delimiters = textSplit.delimiters ?? defaultDelimiters;
+    return splitTextByDelimiters(text, delimiters);
+  }
+  return [text];
+};
+// Calculate timing ratios based on text length
+const calculateTimingRatios = (splitTexts) => {
+  const totalLength = splitTexts.reduce((sum, t) => sum + t.length, 0);
+  if (totalLength === 0) {
+    return splitTexts.map(() => 1 / splitTexts.length);
+  }
+  return splitTexts.map((t) => t.length / totalLength);
+};
+// Convert ratios to cumulative ratios: [0.3, 0.5, 0.2] -> [0, 0.3, 0.8, 1.0]
+const calculateCumulativeRatios = (ratios) => {
+  return ratios.reduce((acc, ratio) => [...acc, acc[acc.length - 1] + ratio], [0]);
+};
+// Generate caption files for a single beat
+const generateBeatCaptions = async (beat, context, index) => {
+  const captionParams = mulmoCaptionParamsSchema.parse({ ...context.studio.script.captionParams, ...beat.captionParams });
+  const canvasSize = MulmoPresentationStyleMethods.getCanvasSize(context.presentationStyle);
+  const template = getHTMLFile("caption");
+  if (captionParams.lang && !context.multiLingual?.[index]?.multiLingualTexts?.[captionParams.lang]) {
+    GraphAILogger.warn(`No multiLingual caption found for beat ${index}, lang: ${captionParams.lang}`);
+  }
+  const text = localizedText(beat, context.multiLingual?.[index], captionParams.lang, context.studio.script.lang);
+  // Get beat timing info
+  const studioBeat = context.studio.beats[index];
+  const beatStartAt = studioBeat.startAt ?? 0;
+  const beatDuration = studioBeat.duration ?? 0;
+  const introPadding = MulmoStudioContextMethods.getIntroPadding(context);
+  // Determine split texts based on captionSplit setting
+  const captionSplit = captionParams.captionSplit ?? "none";
+  const splitTexts = captionSplit === "estimate" ? getSplitTexts(text, beat.texts, captionParams.textSplit) : [text];
+  // Calculate timing
+  const cumulativeRatios = calculateCumulativeRatios(calculateTimingRatios(splitTexts));
+  // Generate caption images with absolute timing
+  const captionFiles = await Promise.all(splitTexts.map(async (segmentText, subIndex) => {
+    const imagePath = getCaptionImagePath(context, index, subIndex);
+    const htmlData = interpolate(template, {
+      caption: processLineBreaks(segmentText),
+      width: `${canvasSize.width}`,
+      height: `${canvasSize.height}`,
+      styles: captionParams.styles.join(";\n"),
+    });
+    await renderHTMLToImage(htmlData, imagePath, canvasSize.width, canvasSize.height, false, true);
+    return {
+      file: imagePath,
+      startAt: beatStartAt + introPadding + beatDuration * cumulativeRatios[subIndex],
+      endAt: beatStartAt + introPadding + beatDuration * cumulativeRatios[subIndex + 1],
+    };
+  }));
+  return captionFiles;
+};
+// GraphAI agent for caption generation
+const captionGenerationAgent = async (namedInputs) => {
+  const { beat, context, index } = namedInputs;
+  try {
+    MulmoStudioContextMethods.setBeatSessionState(context, "caption", index, beat.id, true);
+    const captionFiles = await generateBeatCaptions(beat, context, index);
+    context.studio.beats[index].captionFiles = captionFiles;
+    return captionFiles;
+  }
+  finally {
+    MulmoStudioContextMethods.setBeatSessionState(context, "caption", index, beat.id, false);
+  }
+};
 export const caption_graph_data = {
   version: 0.5,
   nodes: {
@@ -23,37 +122,8 @@ export const caption_graph_data = {
     graph: {
       nodes: {
         generateCaption: {
-          agent: async (namedInputs) => {
-            const { beat, context, index } = namedInputs;
-            try {
-              MulmoStudioContextMethods.setBeatSessionState(context, "caption", index, beat.id, true);
-              const captionParams = mulmoCaptionParamsSchema.parse({ ...context.studio.script.captionParams, ...beat.captionParams });
-              const canvasSize = MulmoPresentationStyleMethods.getCanvasSize(context.presentationStyle);
-              const imagePath = getCaptionImagePath(context, index);
-              const template = getHTMLFile("caption");
-              if (captionParams.lang && !context.multiLingual?.[index]?.multiLingualTexts?.[captionParams.lang]) {
-                GraphAILogger.warn(`No multiLingual caption found for beat ${index}, lang: ${captionParams.lang}`);
-              }
-              const text = localizedText(beat, context.multiLingual?.[index], captionParams.lang, context.studio.script.lang);
-              const htmlData = interpolate(template, {
-                caption: processLineBreaks(text),
-                width: `${canvasSize.width}`,
-                height: `${canvasSize.height}`,
-                styles: captionParams.styles.join(";\n"),
-              });
-              await renderHTMLToImage(htmlData, imagePath, canvasSize.width, canvasSize.height, false, true);
-              context.studio.beats[index].captionFile = imagePath;
-              return imagePath;
-            }
-            finally {
-              MulmoStudioContextMethods.setBeatSessionState(context, "caption", index, beat.id, false);
-            }
-          },
-          inputs: {
-            beat: ":beat",
-            context: ":context",
-            index: ":__mapIndex",
-          },
+          agent: captionGenerationAgent,
+          inputs: { beat: ":beat", context: ":context", index: ":__mapIndex" },
           isResult: true,
         },
       },
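
The split-and-time logic above is easiest to follow with concrete numbers. A minimal runnable sketch (the helper is re-derived from the hunk above for illustration; the beat timing values are hypothetical):

```js
// Re-derived from the hunk above; not part of the package.
const splitTextByDelimiters = (text, delimiters) => {
  const segments = [];
  let current = "";
  for (const char of [...text]) {
    current += char;
    if (delimiters.includes(char)) {
      if (current.trim()) segments.push(current.trim());
      current = "";
    }
  }
  if (current.trim()) segments.push(current.trim());
  return segments.length > 0 ? segments : [text];
};

const texts = splitTextByDelimiters("こんにちは。今日は晴れです!", ["。", "!"]);
// -> ["こんにちは。", "今日は晴れです!"] (6 and 8 characters)

const total = texts.reduce((sum, t) => sum + t.length, 0); // 14
const ratios = texts.map((t) => t.length / total); // [6/14, 8/14]
const cumulative = ratios.reduce((acc, r) => [...acc, acc[acc.length - 1] + r], [0]);
// -> [0, 0.428..., 1]

// Hypothetical beat: startAt = 10s, duration = 7s, introPadding = 1s.
const [startAt, duration, introPadding] = [10, 7, 1];
const windows = texts.map((_, i) => ({
  startAt: startAt + introPadding + duration * cumulative[i],
  endAt: startAt + introPadding + duration * cumulative[i + 1],
}));
console.log(windows); // [{ startAt: 11, endAt: 14 }, { startAt: 14, endAt: 18 }]
```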
@@ -27,6 +27,10 @@ export declare const getTransitionVideoId: (transition: MulmoTransition, videoId
   beatIndex: number;
 };
 export declare const getConcatVideoFilter: (concatVideoId: string, videoIdsForBeats: VideoId[]) => string;
+export declare const getTransitionFrameDurations: (context: MulmoStudioContext, index: number) => {
+  firstDuration: number;
+  lastDuration: number;
+};
 export declare const validateBeatSource: (studioBeat: MulmoStudioContext["studio"]["beats"][number], index: number) => string;
 export declare const addSplitAndExtractFrames: (ffmpegContext: FfmpegContext, videoId: string, firstDuration: number, lastDuration: number, isMovie: boolean, needFirst: boolean, needLast: boolean, canvasInfo: {
   width: number;
@@ -86,19 +86,22 @@ const getOutputOption = (audioId, videoId) => {
   ];
 };
 const addCaptions = (ffmpegContext, concatVideoId, context, caption) => {
-  const beatsWithCaptions = context.studio.beats.filter(({ captionFile }) => captionFile);
+  const beatsWithCaptions = context.studio.beats.filter(({ captionFiles }) => captionFiles && captionFiles.length > 0);
   if (caption && beatsWithCaptions.length > 0) {
-    const introPadding = MulmoStudioContextMethods.getIntroPadding(context);
-    return beatsWithCaptions.reduce((prevVideoId, beat, index) => {
-      const { startAt, duration, captionFile } = beat;
-      if (startAt !== undefined && duration !== undefined && captionFile !== undefined) {
-        const captionInputIndex = FfmpegContextAddInput(ffmpegContext, captionFile);
-        const compositeVideoId = `oc${index}`;
-        ffmpegContext.filterComplex.push(`[${prevVideoId}][${captionInputIndex}:v]overlay=format=auto:enable='between(t,${startAt + introPadding},${startAt + duration + introPadding})'[${compositeVideoId}]`);
-        return compositeVideoId;
+    const { videoId } = beatsWithCaptions.reduce((acc, beat) => {
+      const { captionFiles } = beat;
+      if (!captionFiles) {
+        return acc;
       }
-      return prevVideoId;
-    }, concatVideoId);
+      return captionFiles.reduce((innerAcc, captionData) => {
+        const { file, startAt, endAt } = captionData;
+        const captionInputIndex = FfmpegContextAddInput(ffmpegContext, file);
+        const compositeVideoId = `oc${innerAcc.captionIndex}`;
+        ffmpegContext.filterComplex.push(`[${innerAcc.videoId}][${captionInputIndex}:v]overlay=format=auto:enable='between(t,${startAt},${endAt})'[${compositeVideoId}]`);
+        return { videoId: compositeVideoId, captionIndex: innerAcc.captionIndex + 1 };
+      }, acc);
+    }, { videoId: concatVideoId, captionIndex: 0 });
+    return videoId;
   }
   return concatVideoId;
 };
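
Because startAt/endAt are now stored per caption segment as absolute times (intro padding is folded in at generation time), each segment becomes one chained overlay. A sketch of the entries the reducer would push for the two windows computed earlier (stream labels follow the oc<N> pattern in the code; the input indices and the incoming label are hypothetical):

```js
// Hypothetical filterComplex entries for one beat with two caption segments:
ffmpegContext.filterComplex.push("[concat][2:v]overlay=format=auto:enable='between(t,11,14)'[oc0]");
ffmpegContext.filterComplex.push("[oc0][3:v]overlay=format=auto:enable='between(t,14,18)'[oc1]");
// addCaptions then returns "oc1", the label of the last composite stream.
```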
@@ -280,24 +283,21 @@ const getClampedTransitionDuration = (transitionDuration, prevBeatDuration, curr
   const maxDuration = Math.min(prevBeatDuration, currentBeatDuration) * 0.9; // Use 90% to leave some margin
   return Math.min(transitionDuration, maxDuration);
 };
-const getTransitionFrameDurations = (context, index) => {
+export const getTransitionFrameDurations = (context, index) => {
   const minFrame = 1 / 30; // 30fpsを想定。最小1フレーム
   const beats = context.studio.beats;
   const scriptBeats = context.studio.script.beats;
+  const getTransitionDuration = (transition, prevBeatIndex, currentBeatIndex) => {
+    if (!transition || prevBeatIndex < 0 || currentBeatIndex >= beats.length)
+      return 0;
+    const prevBeatDuration = beats[prevBeatIndex].duration ?? 1;
+    const currentBeatDuration = beats[currentBeatIndex].duration ?? 1;
+    return getClampedTransitionDuration(transition.duration, prevBeatDuration, currentBeatDuration);
+  };
   const currentTransition = MulmoPresentationStyleMethods.getMovieTransition(context, scriptBeats[index]);
-  let firstDuration = 0;
-  if (currentTransition && index > 0) {
-    const prevBeatDuration = beats[index - 1].duration ?? 1;
-    const currentBeatDuration = beats[index].duration ?? 1;
-    firstDuration = getClampedTransitionDuration(currentTransition.duration, prevBeatDuration, currentBeatDuration);
-  }
+  const firstDuration = index > 0 ? getTransitionDuration(currentTransition, index - 1, index) : 0;
   const nextTransition = index < scriptBeats.length - 1 ? MulmoPresentationStyleMethods.getMovieTransition(context, scriptBeats[index + 1]) : null;
-  let lastDuration = 0;
-  if (nextTransition) {
-    const prevBeatDuration = beats[index].duration ?? 1;
-    const currentBeatDuration = beats[index + 1].duration ?? 1;
-    lastDuration = getClampedTransitionDuration(nextTransition.duration, prevBeatDuration, currentBeatDuration);
-  }
+  const lastDuration = getTransitionDuration(nextTransition, index, index + 1);
   return {
     firstDuration: Math.max(firstDuration, minFrame),
     lastDuration: Math.max(lastDuration, minFrame),
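
The refactor is behavior-preserving: the new getTransitionDuration helper folds the two duplicated clamping branches into one. A worked case with hypothetical durations:

```js
// Clamp logic from the hunk above, extracted for a quick check.
const getClampedTransitionDuration = (transitionDuration, prev, current) =>
  Math.min(transitionDuration, Math.min(prev, current) * 0.9);

// Beats of 2s and 4s with a requested 2.5s transition:
console.log(getClampedTransitionDuration(2.5, 2, 4)); // 1.8 (90% of the shorter beat)
// With no transition the helper returns 0, and the 1/30s minFrame floor applies.
```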
@@ -8,7 +8,6 @@ import movieGenAIAgent from "./movie_genai_agent.js";
 import movieReplicateAgent from "./movie_replicate_agent.js";
 import mediaMockAgent from "./media_mock_agent.js";
 import ttsElevenlabsAgent from "./tts_elevenlabs_agent.js";
-import ttsNijivoiceAgent from "./tts_nijivoice_agent.js";
 import ttsOpenaiAgent from "./tts_openai_agent.js";
 import ttsGoogleAgent from "./tts_google_agent.js";
 import ttsGeminiAgent from "./tts_gemini_agent.js";
@@ -21,4 +20,4 @@ import { browserlessAgent } from "@graphai/browserless_agent";
 import { textInputAgent } from "@graphai/input_agents";
 import { openAIAgent } from "@graphai/openai_agent";
 import { fileWriteAgent } from "@graphai/vanilla_node_agents";
-export { openAIAgent, fileWriteAgent, browserlessAgent, textInputAgent, addBGMAgent, combineAudioFilesAgent, imageGenAIAgent, imageOpenaiAgent, imageReplicateAgent, tavilySearchAgent, movieGenAIAgent, movieReplicateAgent, mediaMockAgent, ttsElevenlabsAgent, ttsNijivoiceAgent, ttsOpenaiAgent, ttsGoogleAgent, ttsGeminiAgent, ttsKotodamaAgent, validateSchemaAgent, soundEffectReplicateAgent, lipSyncReplicateAgent, puppeteerCrawlerAgent, };
+export { openAIAgent, fileWriteAgent, browserlessAgent, textInputAgent, addBGMAgent, combineAudioFilesAgent, imageGenAIAgent, imageOpenaiAgent, imageReplicateAgent, tavilySearchAgent, movieGenAIAgent, movieReplicateAgent, mediaMockAgent, ttsElevenlabsAgent, ttsOpenaiAgent, ttsGoogleAgent, ttsGeminiAgent, ttsKotodamaAgent, validateSchemaAgent, soundEffectReplicateAgent, lipSyncReplicateAgent, puppeteerCrawlerAgent, };
@@ -8,7 +8,6 @@ import movieGenAIAgent from "./movie_genai_agent.js";
 import movieReplicateAgent from "./movie_replicate_agent.js";
 import mediaMockAgent from "./media_mock_agent.js";
 import ttsElevenlabsAgent from "./tts_elevenlabs_agent.js";
-import ttsNijivoiceAgent from "./tts_nijivoice_agent.js";
 import ttsOpenaiAgent from "./tts_openai_agent.js";
 import ttsGoogleAgent from "./tts_google_agent.js";
 import ttsGeminiAgent from "./tts_gemini_agent.js";
@@ -22,4 +21,4 @@ import { textInputAgent } from "@graphai/input_agents";
 import { openAIAgent } from "@graphai/openai_agent";
 // import * as vanilla from "@graphai/vanilla";
 import { fileWriteAgent } from "@graphai/vanilla_node_agents";
-export { openAIAgent, fileWriteAgent, browserlessAgent, textInputAgent, addBGMAgent, combineAudioFilesAgent, imageGenAIAgent, imageOpenaiAgent, imageReplicateAgent, tavilySearchAgent, movieGenAIAgent, movieReplicateAgent, mediaMockAgent, ttsElevenlabsAgent, ttsNijivoiceAgent, ttsOpenaiAgent, ttsGoogleAgent, ttsGeminiAgent, ttsKotodamaAgent, validateSchemaAgent, soundEffectReplicateAgent, lipSyncReplicateAgent, puppeteerCrawlerAgent, };
+export { openAIAgent, fileWriteAgent, browserlessAgent, textInputAgent, addBGMAgent, combineAudioFilesAgent, imageGenAIAgent, imageOpenaiAgent, imageReplicateAgent, tavilySearchAgent, movieGenAIAgent, movieReplicateAgent, mediaMockAgent, ttsElevenlabsAgent, ttsOpenaiAgent, ttsGoogleAgent, ttsGeminiAgent, ttsKotodamaAgent, validateSchemaAgent, soundEffectReplicateAgent, lipSyncReplicateAgent, puppeteerCrawlerAgent, };
@@ -890,22 +890,22 @@ export const promptTemplates = [
   displayName: {
     ja: "アナウンサー",
   },
-  provider: "nijivoice",
-  voiceId: "3708ad43-cace-486c-a4ca-8fe41186e20c",
+  provider: "gemini",
+  voiceId: "Aoede",
 },
 Student: {
   displayName: {
     ja: "太郎",
   },
-  provider: "nijivoice",
-  voiceId: "a7619e48-bf6a-4f9f-843f-40485651257f",
+  provider: "gemini",
+  voiceId: "Puck",
 },
 Teacher: {
   displayName: {
     ja: "先生",
   },
-  provider: "nijivoice",
-  voiceId: "bc06c63f-fef6-43b6-92f7-67f919bd5dae",
+  provider: "gemini",
+  voiceId: "Charon",
 },
 },
 },
@@ -73,7 +73,7 @@ export const templateDataSet = {
  "```",
  sensei_and_taro: "全てを高校生にも分かるように、太郎くん(Student)と先生(Teacher)の会話、という形の台本にして。ただし要点はしっかりと押さえて。以下に別のトピックに関するサンプルを貼り付けます。このJSONフォーマットに従って。\n" +
  "```JSON\n" +
- `{"$mulmocast":{"version":"1.1","credit":"closing"},"title":"韓国の戒厳令とその日本への影響","description":"韓国で最近発令された戒厳令とその可能性のある影響について、また日本の憲法に関する考慮事項との類似点を含めた洞察に満ちた議論。","lang":"ja","beats":[{"speaker":"Announcer","text":"今日は、韓国で起きた戒厳令について、太郎くんが先生に聞きます。","imagePrompt":"A classroom setting with a curious Japanese student (Taro) and a kind teacher. Calm atmosphere, early morning light coming through the window."},{"speaker":"Student","text":"先生、今日は韓国で起きた戒厳令のことを教えてもらえますか?","imagePrompt":"The student (Taro) sitting at his desk with a serious expression, raising his hand to ask a question. Teacher is slightly surprised but attentive."},{"speaker":"Teacher","text":"もちろんだよ、太郎くん。韓国で最近、大統領が「戒厳令」っていうのを突然宣言したんだ。","imagePrompt":"TV screen showing a breaking news headline in Korean: 'President Declares Martial Law'. Students watching with concern."},{"speaker":"Student","text":"戒厳令ってなんですか?","imagePrompt":"A close-up of the student's puzzled face, with a speech bubble saying '戒厳令って?'"},{"speaker":"Teacher","text":"簡単に言うと、国がすごく危ない状態にあるとき、軍隊を使って人々の自由を制限するためのものなんだ。","imagePrompt":"Illustration of soldiers standing in the street, people being stopped and questioned, with a red 'X' on a protest sign. Moody and serious tone."},{"speaker":"Student","text":"それって怖いですね。なんでそんなことをしたんですか?","imagePrompt":"Student looking anxious, thinking deeply. Background shows a shadowy image of a politician giving orders to the military."},{"speaker":"Teacher","text":"大統領は「国会がうまく機能していないから」と言っていたけど…","imagePrompt":"A tense scene of military personnel entering a national assembly building in Korea, lawmakers looking shocked and resisting."},{"speaker":"Student","text":"ええっ!?国会議員を捕まえようとするなんて、すごく危ないことじゃないですか。","imagePrompt":"The student reacts with shock, comic-style expression with wide eyes and open mouth. Background fades into a dramatic courtroom or parliament chaos."},{"speaker":"Teacher","text":"その通りだよ。もし軍隊が国会を占拠していたら…","imagePrompt":"Dark visual of a locked parliament building with soldiers blocking the entrance, ominous sky in the background."},{"speaker":"Student","text":"韓国ではどうなったんですか?","imagePrompt":"Student leans forward, curious and worried. Background shows a hopeful scene of people holding protest signs with candles at night."},{"speaker":"Teacher","text":"幸い、野党の議員や市民たちが急いで集まって抗議して…","imagePrompt":"Peaceful protest scene in Seoul, citizens holding candles and banners, united. Hopeful tone."},{"speaker":"Student","text":"それは大変なことですね…。日本ではそんなこと起きないんですか?","imagePrompt":"Student looking toward the Japanese flag outside the school window, pensive mood."},{"speaker":"Teacher","text":"実はね、今、日本でも似たような話があるんだよ。","imagePrompt":"Teacher pointing to a newspaper headline: '緊急事態条項の議論進む'. Classroom chalkboard shows a map of Korea and Japan."},{"speaker":"Student","text":"緊急事態宣言って、韓国の戒厳令と同じようなものなんですか?","imagePrompt":"Split screen image: left side shows a soldier in Korea, right side shows a suited Japanese politician giving a press conference."},{"speaker":"Teacher","text":"似ている部分があるね。たとえば、総理大臣が…","imagePrompt":"Diagram-style visual showing the flow of emergency powers from PM to local governments. Simple, clean infographic style."},{"speaker":"Student","text":"それって便利そうですけど、なんだか心配です。","imagePrompt":"Student's concerned expression, behind him a blurry image of a street with emergency sirens glowing in red."},{"speaker":"Teacher","text":"そうだね。もちろん、緊急時には素早い対応が必要だけど…","imagePrompt":"Illustration of a balance scale: one side is 'freedom', the other 'security'. 
The scale is slightly tilting."},{"speaker":"Student","text":"韓国みたいに、軍隊が政治に口を出してくることもあり得るんですか?","imagePrompt":"Student imagining a military tank next to the Japanese parliament, shown as a thought bubble."},{"speaker":"Teacher","text":"完全にあり得ないとは言えないからこそ、注意が必要なんだ。","imagePrompt":"Japanese citizens reading newspapers and watching news with concerned faces, civic awareness growing."},{"speaker":"Student","text":"ありがとうございます。とても良い勉強になりました。","imagePrompt":"The student bows slightly to the teacher with a grateful expression. The classroom is peaceful again."},{"speaker":"Announcer","text":"ご視聴、ありがとうございました。次回の放送もお楽しみに。","imagePrompt":"Ending screen with soft background music, showing the show's logo and a thank-you message in Japanese."}],"canvasSize":{"width":1536,"height":1024},"imageParams":{"style":"<style>Ghibli style. Student (Taro) is a young teenager with a dark short hair with glasses. Teacher is a middle-aged man with grey hair and moustache.</style>"},"speechParams":{"speakers":{"Announcer":{"provider":"nijivoice","displayName":{"ja":"アナウンサー"},"voiceId":"3708ad43-cace-486c-a4ca-8fe41186e20c"},"Student":{"provider":"nijivoice","displayName":{"ja":"太郎"},"voiceId":"a7619e48-bf6a-4f9f-843f-40485651257f"},"Teacher":{"provider":"nijivoice","displayName":{"ja":"先生"},"voiceId":"bc06c63f-fef6-43b6-92f7-67f919bd5dae"}}}}\n` +
+ `{"$mulmocast":{"version":"1.1","credit":"closing"},"title":"韓国の戒厳令とその日本への影響","description":"韓国で最近発令された戒厳令とその可能性のある影響について、また日本の憲法に関する考慮事項との類似点を含めた洞察に満ちた議論。","lang":"ja","beats":[{"speaker":"Announcer","text":"今日は、韓国で起きた戒厳令について、太郎くんが先生に聞きます。","imagePrompt":"A classroom setting with a curious Japanese student (Taro) and a kind teacher. Calm atmosphere, early morning light coming through the window."},{"speaker":"Student","text":"先生、今日は韓国で起きた戒厳令のことを教えてもらえますか?","imagePrompt":"The student (Taro) sitting at his desk with a serious expression, raising his hand to ask a question. Teacher is slightly surprised but attentive."},{"speaker":"Teacher","text":"もちろんだよ、太郎くん。韓国で最近、大統領が「戒厳令」っていうのを突然宣言したんだ。","imagePrompt":"TV screen showing a breaking news headline in Korean: 'President Declares Martial Law'. Students watching with concern."},{"speaker":"Student","text":"戒厳令ってなんですか?","imagePrompt":"A close-up of the student's puzzled face, with a speech bubble saying '戒厳令って?'"},{"speaker":"Teacher","text":"簡単に言うと、国がすごく危ない状態にあるとき、軍隊を使って人々の自由を制限するためのものなんだ。","imagePrompt":"Illustration of soldiers standing in the street, people being stopped and questioned, with a red 'X' on a protest sign. Moody and serious tone."},{"speaker":"Student","text":"それって怖いですね。なんでそんなことをしたんですか?","imagePrompt":"Student looking anxious, thinking deeply. Background shows a shadowy image of a politician giving orders to the military."},{"speaker":"Teacher","text":"大統領は「国会がうまく機能していないから」と言っていたけど…","imagePrompt":"A tense scene of military personnel entering a national assembly building in Korea, lawmakers looking shocked and resisting."},{"speaker":"Student","text":"ええっ!?国会議員を捕まえようとするなんて、すごく危ないことじゃないですか。","imagePrompt":"The student reacts with shock, comic-style expression with wide eyes and open mouth. Background fades into a dramatic courtroom or parliament chaos."},{"speaker":"Teacher","text":"その通りだよ。もし軍隊が国会を占拠していたら…","imagePrompt":"Dark visual of a locked parliament building with soldiers blocking the entrance, ominous sky in the background."},{"speaker":"Student","text":"韓国ではどうなったんですか?","imagePrompt":"Student leans forward, curious and worried. Background shows a hopeful scene of people holding protest signs with candles at night."},{"speaker":"Teacher","text":"幸い、野党の議員や市民たちが急いで集まって抗議して…","imagePrompt":"Peaceful protest scene in Seoul, citizens holding candles and banners, united. Hopeful tone."},{"speaker":"Student","text":"それは大変なことですね…。日本ではそんなこと起きないんですか?","imagePrompt":"Student looking toward the Japanese flag outside the school window, pensive mood."},{"speaker":"Teacher","text":"実はね、今、日本でも似たような話があるんだよ。","imagePrompt":"Teacher pointing to a newspaper headline: '緊急事態条項の議論進む'. Classroom chalkboard shows a map of Korea and Japan."},{"speaker":"Student","text":"緊急事態宣言って、韓国の戒厳令と同じようなものなんですか?","imagePrompt":"Split screen image: left side shows a soldier in Korea, right side shows a suited Japanese politician giving a press conference."},{"speaker":"Teacher","text":"似ている部分があるね。たとえば、総理大臣が…","imagePrompt":"Diagram-style visual showing the flow of emergency powers from PM to local governments. Simple, clean infographic style."},{"speaker":"Student","text":"それって便利そうですけど、なんだか心配です。","imagePrompt":"Student's concerned expression, behind him a blurry image of a street with emergency sirens glowing in red."},{"speaker":"Teacher","text":"そうだね。もちろん、緊急時には素早い対応が必要だけど…","imagePrompt":"Illustration of a balance scale: one side is 'freedom', the other 'security'. 
The scale is slightly tilting."},{"speaker":"Student","text":"韓国みたいに、軍隊が政治に口を出してくることもあり得るんですか?","imagePrompt":"Student imagining a military tank next to the Japanese parliament, shown as a thought bubble."},{"speaker":"Teacher","text":"完全にあり得ないとは言えないからこそ、注意が必要なんだ。","imagePrompt":"Japanese citizens reading newspapers and watching news with concerned faces, civic awareness growing."},{"speaker":"Student","text":"ありがとうございます。とても良い勉強になりました。","imagePrompt":"The student bows slightly to the teacher with a grateful expression. The classroom is peaceful again."},{"speaker":"Announcer","text":"ご視聴、ありがとうございました。次回の放送もお楽しみに。","imagePrompt":"Ending screen with soft background music, showing the show's logo and a thank-you message in Japanese."}],"canvasSize":{"width":1536,"height":1024},"imageParams":{"style":"<style>Ghibli style. Student (Taro) is a young teenager with a dark short hair with glasses. Teacher is a middle-aged man with grey hair and moustache.</style>"},"speechParams":{"speakers":{"Announcer":{"provider":"gemini","displayName":{"ja":"アナウンサー"},"voiceId":"Aoede"},"Student":{"provider":"gemini","displayName":{"ja":"太郎"},"voiceId":"Puck"},"Teacher":{"provider":"gemini","displayName":{"ja":"先生"},"voiceId":"Charon"}}}}\n` +
  "```",
  shorts: "This script is for YouTube shorts. The first beat should be a hook, which describes the topic. Another AI will generate images for each beat based on the image prompt of that beat. Movie prompts must be written in English.\n" +
  "```JSON\n" +
@@ -113,10 +113,6 @@ export type OpenAITTSAgentParams = TTSAgentParams & {
   model: string;
   speed: number;
 };
-export type NijivoiceTTSAgentParams = TTSAgentParams & {
-  speed: number;
-  speed_global: number;
-};
 export type KotodamaTTSAgentParams = TTSAgentParams & {
   decoration: string;
 };
@@ -1,9 +1,4 @@
 export declare const provider2TTSAgent: {
-  nijivoice: {
-    agentName: string;
-    hasLimitedConcurrency: boolean;
-    keyName: string;
-  };
   openai: {
     agentName: string;
     hasLimitedConcurrency: boolean;
@@ -1,10 +1,5 @@
 // node & browser
 export const provider2TTSAgent = {
-  nijivoice: {
-    agentName: "ttsNijivoiceAgent",
-    hasLimitedConcurrency: true,
-    keyName: "NIJIVOICE_API_KEY",
-  },
   openai: {
     agentName: "ttsOpenaiAgent",
     hasLimitedConcurrency: false,
@@ -190,9 +190,29 @@ export declare const mulmoTextSlideMediaSchema: z.ZodObject<{
     bullets: z.ZodOptional<z.ZodArray<z.ZodString>>;
   }, z.core.$strip>;
 }, z.core.$strict>;
+export declare const captionSplitSchema: z.ZodDefault<z.ZodEnum<{
+  none: "none";
+  estimate: "estimate";
+}>>;
+export declare const textSplitSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
+  type: z.ZodLiteral<"none">;
+}, z.core.$strip>, z.ZodObject<{
+  type: z.ZodLiteral<"delimiters">;
+  delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+}, z.core.$strip>], "type">;
 export declare const mulmoCaptionParamsSchema: z.ZodObject<{
   lang: z.ZodOptional<z.ZodString>;
   styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+  captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+    none: "none";
+    estimate: "estimate";
+  }>>>;
+  textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+    type: z.ZodLiteral<"none">;
+  }, z.core.$strip>, z.ZodObject<{
+    type: z.ZodLiteral<"delimiters">;
+    delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+  }, z.core.$strip>], "type">>;
 }, z.core.$strict>;
 export declare const mulmoChartMediaSchema: z.ZodObject<{
   type: z.ZodLiteral<"chart">;
@@ -747,6 +767,7 @@ export declare const mulmoMovieParamsSchema: z.ZodObject<{
 export declare const mulmoBeatSchema: z.ZodObject<{
   speaker: z.ZodOptional<z.ZodString>;
   text: z.ZodDefault<z.ZodOptional<z.ZodString>>;
+  texts: z.ZodOptional<z.ZodArray<z.ZodString>>;
   id: z.ZodOptional<z.ZodString>;
   description: z.ZodOptional<z.ZodString>;
   image: z.ZodOptional<z.ZodUnion<readonly [z.ZodObject<{
@@ -1130,6 +1151,16 @@ export declare const mulmoBeatSchema: z.ZodObject<{
   captionParams: z.ZodOptional<z.ZodObject<{
     lang: z.ZodOptional<z.ZodString>;
     styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+    captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+      none: "none";
+      estimate: "estimate";
+    }>>>;
+    textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+      type: z.ZodLiteral<"none">;
+    }, z.core.$strip>, z.ZodObject<{
+      type: z.ZodLiteral<"delimiters">;
+      delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    }, z.core.$strip>], "type">>;
   }, z.core.$strict>>;
   imageNames: z.ZodOptional<z.ZodArray<z.ZodString>>;
   imagePrompt: z.ZodOptional<z.ZodString>;
@@ -1485,6 +1516,16 @@ export declare const mulmoPresentationStyleSchema: z.ZodObject<{
   captionParams: z.ZodOptional<z.ZodObject<{
     lang: z.ZodOptional<z.ZodString>;
     styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+    captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+      none: "none";
+      estimate: "estimate";
+    }>>>;
+    textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+      type: z.ZodLiteral<"none">;
+    }, z.core.$strip>, z.ZodObject<{
+      type: z.ZodLiteral<"delimiters">;
+      delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    }, z.core.$strip>], "type">>;
   }, z.core.$strict>>;
   audioParams: z.ZodDefault<z.ZodObject<{
     padding: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
@@ -1836,6 +1877,16 @@ export declare const mulmoScriptSchema: z.ZodObject<{
   captionParams: z.ZodOptional<z.ZodObject<{
     lang: z.ZodOptional<z.ZodString>;
     styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+    captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+      none: "none";
+      estimate: "estimate";
+    }>>>;
+    textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+      type: z.ZodLiteral<"none">;
+    }, z.core.$strip>, z.ZodObject<{
+      type: z.ZodLiteral<"delimiters">;
+      delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    }, z.core.$strip>], "type">>;
   }, z.core.$strict>>;
   audioParams: z.ZodDefault<z.ZodObject<{
     padding: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
@@ -1874,6 +1925,7 @@ export declare const mulmoScriptSchema: z.ZodObject<{
   beats: z.ZodArray<z.ZodObject<{
     speaker: z.ZodOptional<z.ZodString>;
     text: z.ZodDefault<z.ZodOptional<z.ZodString>>;
+    texts: z.ZodOptional<z.ZodArray<z.ZodString>>;
     id: z.ZodOptional<z.ZodString>;
     description: z.ZodOptional<z.ZodString>;
     image: z.ZodOptional<z.ZodUnion<readonly [z.ZodObject<{
@@ -2257,6 +2309,16 @@ export declare const mulmoScriptSchema: z.ZodObject<{
   captionParams: z.ZodOptional<z.ZodObject<{
     lang: z.ZodOptional<z.ZodString>;
     styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+    captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+      none: "none";
+      estimate: "estimate";
+    }>>>;
+    textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+      type: z.ZodLiteral<"none">;
+    }, z.core.$strip>, z.ZodObject<{
+      type: z.ZodLiteral<"delimiters">;
+      delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    }, z.core.$strip>], "type">>;
   }, z.core.$strict>>;
   imageNames: z.ZodOptional<z.ZodArray<z.ZodString>>;
   imagePrompt: z.ZodOptional<z.ZodString>;
@@ -2289,6 +2351,11 @@ export declare const mulmoStudioBeatSchema: z.ZodObject<{
   soundEffectFile: z.ZodOptional<z.ZodString>;
   lipSyncFile: z.ZodOptional<z.ZodString>;
   captionFile: z.ZodOptional<z.ZodString>;
+  captionFiles: z.ZodOptional<z.ZodArray<z.ZodObject<{
+    file: z.ZodString;
+    startAt: z.ZodNumber;
+    endAt: z.ZodNumber;
+  }, z.core.$strip>>>;
   htmlImageFile: z.ZodOptional<z.ZodString>;
   markdown: z.ZodOptional<z.ZodString>;
   html: z.ZodOptional<z.ZodString>;
@@ -2682,6 +2749,16 @@ export declare const mulmoStudioSchema: z.ZodObject<{
   captionParams: z.ZodOptional<z.ZodObject<{
     lang: z.ZodOptional<z.ZodString>;
     styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+    captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+      none: "none";
+      estimate: "estimate";
+    }>>>;
+    textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+      type: z.ZodLiteral<"none">;
+    }, z.core.$strip>, z.ZodObject<{
+      type: z.ZodLiteral<"delimiters">;
+      delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    }, z.core.$strip>], "type">>;
   }, z.core.$strict>>;
   audioParams: z.ZodDefault<z.ZodObject<{
     padding: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
@@ -2720,6 +2797,7 @@ export declare const mulmoStudioSchema: z.ZodObject<{
   beats: z.ZodArray<z.ZodObject<{
     speaker: z.ZodOptional<z.ZodString>;
     text: z.ZodDefault<z.ZodOptional<z.ZodString>>;
+    texts: z.ZodOptional<z.ZodArray<z.ZodString>>;
     id: z.ZodOptional<z.ZodString>;
     description: z.ZodOptional<z.ZodString>;
     image: z.ZodOptional<z.ZodUnion<readonly [z.ZodObject<{
@@ -3103,6 +3181,16 @@ export declare const mulmoStudioSchema: z.ZodObject<{
   captionParams: z.ZodOptional<z.ZodObject<{
     lang: z.ZodOptional<z.ZodString>;
     styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+    captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+      none: "none";
+      estimate: "estimate";
+    }>>>;
+    textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+      type: z.ZodLiteral<"none">;
+    }, z.core.$strip>, z.ZodObject<{
+      type: z.ZodLiteral<"delimiters">;
+      delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    }, z.core.$strip>], "type">>;
   }, z.core.$strict>>;
   imageNames: z.ZodOptional<z.ZodArray<z.ZodString>>;
   imagePrompt: z.ZodOptional<z.ZodString>;
@@ -3136,6 +3224,11 @@ export declare const mulmoStudioSchema: z.ZodObject<{
   soundEffectFile: z.ZodOptional<z.ZodString>;
   lipSyncFile: z.ZodOptional<z.ZodString>;
   captionFile: z.ZodOptional<z.ZodString>;
+  captionFiles: z.ZodOptional<z.ZodArray<z.ZodObject<{
+    file: z.ZodString;
+    startAt: z.ZodNumber;
+    endAt: z.ZodNumber;
+  }, z.core.$strip>>>;
   htmlImageFile: z.ZodOptional<z.ZodString>;
   markdown: z.ZodOptional<z.ZodString>;
   html: z.ZodOptional<z.ZodString>;
@@ -3464,6 +3557,16 @@ export declare const mulmoPromptTemplateSchema: z.ZodObject<{
   captionParams: z.ZodOptional<z.ZodObject<{
     lang: z.ZodOptional<z.ZodString>;
     styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+    captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+      none: "none";
+      estimate: "estimate";
+    }>>>;
+    textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+      type: z.ZodLiteral<"none">;
+    }, z.core.$strip>, z.ZodObject<{
+      type: z.ZodLiteral<"delimiters">;
+      delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    }, z.core.$strip>], "type">>;
   }, z.core.$strict>>;
   audioParams: z.ZodDefault<z.ZodObject<{
     padding: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
@@ -3809,6 +3912,16 @@ export declare const mulmoPromptTemplateFileSchema: z.ZodObject<{
   captionParams: z.ZodOptional<z.ZodObject<{
     lang: z.ZodOptional<z.ZodString>;
     styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+    captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+      none: "none";
+      estimate: "estimate";
+    }>>>;
+    textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+      type: z.ZodLiteral<"none">;
+    }, z.core.$strip>, z.ZodObject<{
+      type: z.ZodLiteral<"delimiters">;
+      delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    }, z.core.$strip>], "type">>;
   }, z.core.$strict>>;
   audioParams: z.ZodDefault<z.ZodObject<{
     padding: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
@@ -119,10 +119,20 @@ export const mulmoTextSlideMediaSchema = z
   }),
 })
 .strict();
+export const captionSplitSchema = z.enum(["none", "estimate"]).default("none");
+export const textSplitSchema = z.discriminatedUnion("type", [
+  z.object({ type: z.literal("none") }),
+  z.object({
+    type: z.literal("delimiters"),
+    delimiters: z.array(z.string()).optional(), // default: ["。", "?", "!", ".", "?", "!"]
+  }),
+]);
 export const mulmoCaptionParamsSchema = z
   .object({
     lang: langSchema.optional(),
     styles: z.array(z.string()).optional().default([]), // css styles
+    captionSplit: captionSplitSchema.optional(), // how to determine caption timing
+    textSplit: textSplitSchema.optional(), // how to split text into segments (default: none)
   })
   .strict();
 export const mulmoChartMediaSchema = z
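
A minimal usage sketch of the new options (import path assumed; the caption test script further below exercises the same fields in JSON):

```js
import { mulmoCaptionParamsSchema } from "mulmocast";

const params = mulmoCaptionParamsSchema.parse({
  lang: "ja",
  styles: ["color: yellow"],
  captionSplit: "estimate", // time caption segments by relative text length
  textSplit: { type: "delimiters", delimiters: ["。", "!", "?"] },
});
// Invalid shapes throw a ZodError, e.g. textSplit: { type: "regex" }
// is rejected by the discriminated union.
```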
@@ -317,6 +327,7 @@ export const mulmoBeatSchema = z
 .object({
   speaker: speakerIdSchema.optional(),
   text: z.string().optional().default("").describe("Text to be spoken. If empty, the audio is not generated."),
+  texts: z.array(z.string()).optional().describe("Manually split texts for captions. Takes precedence over text for caption display."),
   id: z.string().optional().describe("Unique identifier for the beat."),
   description: z.string().optional(),
   image: mulmoImageAssetSchema.optional(),
@@ -442,7 +453,14 @@ export const mulmoStudioBeatSchema = z
   movieFile: z.string().optional(), // path to the movie file
   soundEffectFile: z.string().optional(), // path to the sound effect file
   lipSyncFile: z.string().optional(), // path to the lip sync file
-  captionFile: z.string().optional(), // path to the caption image
+  captionFile: z.string().optional(), // path to the caption image (deprecated, use captionFiles)
+  captionFiles: z
+    .array(z.object({
+      file: z.string(),
+      startAt: z.number(), // absolute start time in seconds
+      endAt: z.number(), // absolute end time in seconds
+    }))
+    .optional(), // split caption images with timing
   htmlImageFile: z.string().optional(), // path to the html image
   markdown: z.string().optional(), // markdown string (alternative to image)
   html: z.string().optional(), // html string (alternative to image)
@@ -247,6 +247,7 @@ export declare const createStudioData: (_mulmoScript: MulmoScript, fileName: str
   beats: {
     text: string;
     speaker?: string | undefined;
+    texts?: string[] | undefined;
    id?: string | undefined;
     description?: string | undefined;
     image?: {
@@ -555,6 +556,13 @@ export declare const createStudioData: (_mulmoScript: MulmoScript, fileName: str
   captionParams?: {
     styles: string[];
     lang?: string | undefined;
+    captionSplit?: "none" | "estimate" | undefined;
+    textSplit?: {
+      type: "none";
+    } | {
+      type: "delimiters";
+      delimiters?: string[] | undefined;
+    } | undefined;
   } | undefined;
   imageNames?: string[] | undefined;
   imagePrompt?: string | undefined;
@@ -583,6 +591,13 @@ export declare const createStudioData: (_mulmoScript: MulmoScript, fileName: str
   captionParams?: {
     styles: string[];
     lang?: string | undefined;
+    captionSplit?: "none" | "estimate" | undefined;
+    textSplit?: {
+      type: "none";
+    } | {
+      type: "delimiters";
+      delimiters?: string[] | undefined;
+    } | undefined;
   } | undefined;
   title?: string | undefined;
   description?: string | undefined;
@@ -611,6 +626,11 @@ export declare const createStudioData: (_mulmoScript: MulmoScript, fileName: str
   soundEffectFile?: string | undefined;
   lipSyncFile?: string | undefined;
   captionFile?: string | undefined;
+  captionFiles?: {
+    file: string;
+    startAt: number;
+    endAt: number;
+  }[] | undefined;
   htmlImageFile?: string | undefined;
   markdown?: string | undefined;
   html?: string | undefined;
@@ -867,6 +887,7 @@ export declare const initializeContextFromFiles: (files: FileObject, raiseError:
   beats: {
     text: string;
     speaker?: string | undefined;
+    texts?: string[] | undefined;
     id?: string | undefined;
     description?: string | undefined;
     image?: {
@@ -1175,6 +1196,13 @@ export declare const initializeContextFromFiles: (files: FileObject, raiseError:
   captionParams?: {
     styles: string[];
     lang?: string | undefined;
+    captionSplit?: "none" | "estimate" | undefined;
+    textSplit?: {
+      type: "none";
+    } | {
+      type: "delimiters";
+      delimiters?: string[] | undefined;
+    } | undefined;
   } | undefined;
   imageNames?: string[] | undefined;
   imagePrompt?: string | undefined;
@@ -1203,6 +1231,13 @@ export declare const initializeContextFromFiles: (files: FileObject, raiseError:
   captionParams?: {
     styles: string[];
     lang?: string | undefined;
+    captionSplit?: "none" | "estimate" | undefined;
+    textSplit?: {
+      type: "none";
+    } | {
+      type: "delimiters";
+      delimiters?: string[] | undefined;
+    } | undefined;
   } | undefined;
   title?: string | undefined;
   description?: string | undefined;
@@ -1231,6 +1266,11 @@ export declare const initializeContextFromFiles: (files: FileObject, raiseError:
   soundEffectFile?: string | undefined;
   lipSyncFile?: string | undefined;
   captionFile?: string | undefined;
+  captionFiles?: {
+    file: string;
+    startAt: number;
+    endAt: number;
+  }[] | undefined;
   htmlImageFile?: string | undefined;
   markdown?: string | undefined;
   html?: string | undefined;
@@ -1504,6 +1544,13 @@ export declare const initializeContextFromFiles: (files: FileObject, raiseError:
   captionParams?: {
     styles: string[];
     lang?: string | undefined;
+    captionSplit?: "none" | "estimate" | undefined;
+    textSplit?: {
+      type: "none";
+    } | {
+      type: "delimiters";
+      delimiters?: string[] | undefined;
+    } | undefined;
   } | undefined;
 };
 sessionState: {
@@ -34,7 +34,7 @@ export declare const getBeatMoviePaths: (context: MulmoStudioContext, index: num
   lipSyncFile: string;
 };
 export declare const getReferenceImagePath: (context: MulmoStudioContext, key: string, extension: string) => string;
-export declare const getCaptionImagePath: (context: MulmoStudioContext, index: number) => string;
+export declare const getCaptionImagePath: (context: MulmoStudioContext, index: number, subIndex?: number) => string;
 export declare const getOutputPdfFilePath: (outDirPath: string, fileName: string, pdfMode: PDFMode, lang?: string) => string;
 export declare const getPromptTemplateFilePath: (promptTemplateName: string) => string;
 export declare const mkdir: (dirPath: string) => void;
package/lib/utils/file.js CHANGED
@@ -109,8 +109,11 @@ export const getReferenceImagePath = (context, key, extension) => {
   const imageProjectDirPath = MulmoStudioContextMethods.getImageProjectDirPath(context);
   return `${imageProjectDirPath}/${key}.${extension}`;
 };
-export const getCaptionImagePath = (context, index) => {
+export const getCaptionImagePath = (context, index, subIndex) => {
   const imageProjectDirPath = MulmoStudioContextMethods.getImageProjectDirPath(context);
+  if (subIndex !== undefined) {
+    return `${imageProjectDirPath}/${index}_caption_${subIndex}.png`;
+  }
   return `${imageProjectDirPath}/${index}_caption.png`;
 };
 // pdf
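
So split captions get numbered siblings next to the legacy single-caption name; for example (directory value hypothetical):

```js
// getCaptionImagePath(context, 3)    -> "<imageProjectDir>/3_caption.png"
// getCaptionImagePath(context, 3, 0) -> "<imageProjectDir>/3_caption_0.png"
// getCaptionImagePath(context, 3, 1) -> "<imageProjectDir>/3_caption_1.png"
```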
@@ -1,5 +1,4 @@
 import type { AgentFilterFunction } from "graphai";
-export declare const nijovoiceTextAgentFilter: AgentFilterFunction;
 export declare const fileCacheAgentFilter: AgentFilterFunction;
 export declare const browserlessCacheGenerator: (cacheDir: string) => AgentFilterFunction;
 export declare const getBackupFilePath: (originalPath: string) => string;
@@ -6,15 +6,7 @@ import { GraphAILogger } from "graphai";
 import { writingMessage, isFile } from "./file.js";
 import { text2hash } from "./utils_node.js";
 import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
-import { replacementsJa, replacePairsJa } from "../utils/string.js";
 dotenv.config({ quiet: true });
-export const nijovoiceTextAgentFilter = async (context, next) => {
-  const { text, provider, lang } = context.namedInputs;
-  if (provider === "nijivoice" && lang === "ja") {
-    context.namedInputs.text = replacePairsJa(replacementsJa)(text);
-  }
-  return next(context);
-};
 export const fileCacheAgentFilter = async (context, next) => {
   const { force, file, index, mulmoContext, sessionType, id, withBackup } = context.namedInputs.cache;
   /*
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "mulmocast",
-  "version": "2.1.19",
+  "version": "2.1.21",
   "description": "",
   "type": "module",
   "main": "lib/index.node.js",
@@ -65,7 +65,10 @@
   "nijivoice": "npx tsx batch/niji_sample.ts && yarn run movie scripts/samples/niji_voice.json",
   "generate_action_docs": "npx tsx ./automation/generate_actions_docs/generate_action_docs.ts"
 },
-"repository": "git+ssh://git@github.com/receptron/mulmocast-cli.git",
+"repository": {
+  "type": "git",
+  "url": "git+ssh://git@github.com/receptron/mulmocast-cli.git"
+},
 "author": "snakajima",
 "license": "AGPL-3.0-only",
 "bugs": {
@@ -19,28 +19,28 @@
 "speechParams": {
   "speakers": {
     "Announcer": {
-      "provider": "nijivoice",
+      "provider": "gemini",
       "displayName": {
         "ja": "アナウンサー"
       },
-      "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c",
+      "voiceId": "Aoede",
       "speechOptions": {
         "speed": 1.666
       }
     },
     "Student": {
-      "provider": "nijivoice",
+      "provider": "gemini",
       "displayName": {
         "ja": "生徒"
       },
-      "voiceId": "a7619e48-bf6a-4f9f-843f-40485651257f"
+      "voiceId": "Puck"
     },
     "Teacher": {
-      "provider": "nijivoice",
+      "provider": "gemini",
       "displayName": {
         "ja": "先生"
       },
-      "voiceId": "bc06c63f-fef6-43b6-92f7-67f919bd5dae"
+      "voiceId": "Charon"
     }
   }
 },
@@ -17,25 +17,27 @@
   "style": "<style>monochrome"
 },
 "speechParams": {
-  "provider": "nijivoice",
   "speakers": {
     "Announcer": {
       "displayName": {
         "ja": "千草朋香"
       },
-      "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c"
+      "provider": "gemini",
+      "voiceId": "Aoede"
     },
     "Student": {
       "displayName": {
         "ja": "太郎"
       },
-      "voiceId": "a7619e48-bf6a-4f9f-843f-40485651257f"
+      "provider": "gemini",
+      "voiceId": "Puck"
     },
     "Teacher": {
       "displayName": {
         "ja": "山田先生"
       },
-      "voiceId": "bc06c63f-fef6-43b6-92f7-67f919bd5dae"
+      "provider": "gemini",
+      "voiceId": "Charon"
     }
   }
 },
@@ -22,9 +22,9 @@
   "provider": "elevenlabs",
   "voiceId": "3JDquces8E8bkmvbh6Bc"
 },
-"Nijivoice": {
-  "provider": "nijivoice",
-  "voiceId": "231e0170-0ece-4155-be44-231423062f41"
+"Kotodama": {
+  "provider": "kotodama",
+  "voiceId": "Poporo"
 }
 }
 },
@@ -41,7 +41,7 @@
 },
 {
   "speaker": "Gemini",
-  "text": "こんにちは、テストです。ジェミニ",
+  "text": "こんにちは、テストです。ジェミニです。",
   "image": {
     "type": "textSlide",
     "slide": {
@@ -70,12 +70,12 @@
   }
 },
 {
-  "speaker": "Nijivoice",
-  "text": "こんにちは、テストです。ニジヴォイス",
+  "speaker": "Kotodama",
+  "text": "こんにちは、テストです。コトダマ",
   "image": {
     "type": "textSlide",
     "slide": {
-      "title": "Nijivoice TTS"
+      "title": "Kotodama TTS"
     }
   }
 }
@@ -5,7 +5,12 @@
 "lang": "en",
 "captionParams": {
   "lang": "en",
-  "styles": ["color: yellow"]
+  "styles": ["color: yellow"],
+  "captionSplit": "estimate",
+  "textSplit": {
+    "type": "delimiters",
+    "delimiters": ["。", "!", "?"]
+  }
 },
 "beats": [
   {
@@ -10,8 +10,8 @@
 "voiceId": "shimmer",
 "lang": {
   "ja": {
-    "provider": "nijivoice",
-    "voiceId": "9d9ed276-49ee-443a-bc19-26e6136d05f0"
+    "provider": "gemini",
+    "voiceId": "Leda"
   }
 }
 }
@@ -70,7 +70,7 @@
 "image": {
   "type": "textSlide",
   "slide": {
-    "title": "Text replacement test for nijivoice"
+    "title": "Text replacement test for Gemini"
   }
 }
 },
@@ -29,8 +29,8 @@
   }
 },
 "Host": {
-  "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c",
-  "provider": "nijivoice",
+  "voiceId": "Kore",
+  "provider": "gemini",
   "displayName": {
     "en": "Japanese Host"
   }
@@ -79,12 +79,12 @@
 },
 {
   "speaker": "Host",
-  "text": "そして私は、日本語音声合成のためのNijivoiceを使用するホストです。",
+  "text": "そして私は、Gemini TTS です。Google's TTS とは別の方法を利用しています。",
   "image": {
     "type": "textSlide",
     "slide": {
       "title": "Mixed Provider Demo",
-      "subtitle": "Nijivoice Speaker (Japanese)"
+      "subtitle": "Gemini Speaker (Japanese)"
     }
   }
 }