mulmocast 2.1.19 → 2.1.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,5 +1,7 @@
 # MulmoCast: A Multi-Modal Presentation Tool for the AI-Native Era
 
+[![npm version](https://badge.fury.io/js/mulmocast.svg)](https://www.npmjs.com/package/mulmocast)
+
 ## Quick Start Guide
 
 If you want to try our beta version, follow the instruction in the release note below.
@@ -16,9 +16,9 @@
   },
   "speechParams": {
     "speakers": {
-      "Announcer": { "provider": "nijivoice", "displayName": { "ja": "アナウンサー" }, "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c" },
-      "Student": { "provider": "nijivoice", "displayName": { "ja": "太郎" }, "voiceId": "a7619e48-bf6a-4f9f-843f-40485651257f" },
-      "Teacher": { "provider": "nijivoice", "displayName": { "ja": "先生" }, "voiceId": "bc06c63f-fef6-43b6-92f7-67f919bd5dae" }
+      "Announcer": { "provider": "gemini", "displayName": { "ja": "アナウンサー" }, "voiceId": "Aoede" },
+      "Student": { "provider": "gemini", "displayName": { "ja": "太郎" }, "voiceId": "Puck" },
+      "Teacher": { "provider": "gemini", "displayName": { "ja": "先生" }, "voiceId": "Charon" }
     }
   }
 },
@@ -2,9 +2,9 @@ import dotenv from "dotenv";
 import { GraphAI, TaskManager, GraphAILogger } from "graphai";
 import * as agents from "@graphai/vanilla";
 import { fileWriteAgent } from "@graphai/vanilla_node_agents";
-import { ttsNijivoiceAgent, ttsOpenaiAgent, ttsGoogleAgent, ttsGeminiAgent, ttsElevenlabsAgent, ttsKotodamaAgent, addBGMAgent, combineAudioFilesAgent, mediaMockAgent, } from "../agents/index.js";
+import { ttsOpenaiAgent, ttsGoogleAgent, ttsGeminiAgent, ttsElevenlabsAgent, ttsKotodamaAgent, addBGMAgent, combineAudioFilesAgent, mediaMockAgent, } from "../agents/index.js";
 import { text2SpeechProviderSchema } from "../types/index.js";
-import { fileCacheAgentFilter, nijovoiceTextAgentFilter } from "../utils/filters.js";
+import { fileCacheAgentFilter } from "../utils/filters.js";
 import { getAudioArtifactFilePath, getAudioFilePath, getOutputStudioFilePath, resolveDirPath, defaultBGMPath, mkdir, writingMessage } from "../utils/file.js";
 import { localizedText, settings2GraphAIConfig } from "../utils/utils.js";
 import { text2hash } from "../utils/utils_node.js";
@@ -214,14 +214,9 @@ const agentFilters = [
     agent: fileCacheAgentFilter,
     nodeIds: ["tts"],
   },
-  {
-    name: "nijovoiceTextAgentFilter",
-    agent: nijovoiceTextAgentFilter,
-    nodeIds: ["tts"],
-  },
 ];
 const getConcurrency = (context) => {
-  // Check if any speaker uses nijivoice or elevenlabs (providers that require concurrency = 1)
+  // Check if any speaker uses elevenlabs or kotodama (providers that require concurrency = 1)
   const hasLimitedConcurrencyProvider = Object.values(context.presentationStyle.speechParams.speakers).some((speaker) => {
     const provider = text2SpeechProviderSchema.parse(speaker.provider);
     return provider2TTSAgent[provider].hasLimitedConcurrency;
@@ -232,7 +227,6 @@ const audioAgents = {
   ...vanillaAgents,
   fileWriteAgent,
   ttsOpenaiAgent,
-  ttsNijivoiceAgent,
   ttsGoogleAgent,
   ttsGeminiAgent,
   ttsKotodamaAgent,
@@ -7,6 +7,105 @@ import { renderHTMLToImage, interpolate } from "../utils/markdown.js";
 import { MulmoStudioContextMethods, MulmoPresentationStyleMethods } from "../methods/index.js";
 import { fileWriteAgent } from "@graphai/vanilla_node_agents";
 const vanillaAgents = agents.default ?? agents;
+const defaultDelimiters = ["。", "?", "!", ".", "?", "!"];
+// Split text by delimiters while keeping delimiters attached to the preceding text
+const splitTextByDelimiters = (text, delimiters) => {
+  if (!text || delimiters.length === 0) {
+    return [text];
+  }
+  const { segments, current } = [...text].reduce((acc, char) => {
+    const newCurrent = acc.current + char;
+    if (delimiters.includes(char)) {
+      const trimmed = newCurrent.trim();
+      return {
+        segments: trimmed ? [...acc.segments, trimmed] : acc.segments,
+        current: "",
+      };
+    }
+    return { ...acc, current: newCurrent };
+  }, { segments: [], current: "" });
+  const finalSegments = current.trim() ? [...segments, current.trim()] : segments;
+  return finalSegments.length > 0 ? finalSegments : [text];
+};
+// Get split texts based on settings
+const getSplitTexts = (text, texts, textSplit) => {
+  // Manual split takes precedence
+  if (texts && texts.length > 0) {
+    return texts;
+  }
+  // No splitting or undefined
+  if (!textSplit || textSplit.type === "none") {
+    return [text];
+  }
+  // Split by delimiters
+  if (textSplit.type === "delimiters") {
+    const delimiters = textSplit.delimiters ?? defaultDelimiters;
+    return splitTextByDelimiters(text, delimiters);
+  }
+  return [text];
+};
+// Calculate timing ratios based on text length
+const calculateTimingRatios = (splitTexts) => {
+  const totalLength = splitTexts.reduce((sum, t) => sum + t.length, 0);
+  if (totalLength === 0) {
+    return splitTexts.map(() => 1 / splitTexts.length);
+  }
+  return splitTexts.map((t) => t.length / totalLength);
+};
+// Convert ratios to cumulative ratios: [0.3, 0.5, 0.2] -> [0, 0.3, 0.8, 1.0]
+const calculateCumulativeRatios = (ratios) => {
+  return ratios.reduce((acc, ratio) => [...acc, acc[acc.length - 1] + ratio], [0]);
+};
+// Generate caption files for a single beat
+const generateBeatCaptions = async (beat, context, index) => {
+  const captionParams = mulmoCaptionParamsSchema.parse({ ...context.studio.script.captionParams, ...beat.captionParams });
+  const canvasSize = MulmoPresentationStyleMethods.getCanvasSize(context.presentationStyle);
+  const template = getHTMLFile("caption");
+  if (captionParams.lang && !context.multiLingual?.[index]?.multiLingualTexts?.[captionParams.lang]) {
+    GraphAILogger.warn(`No multiLingual caption found for beat ${index}, lang: ${captionParams.lang}`);
+  }
+  const text = localizedText(beat, context.multiLingual?.[index], captionParams.lang, context.studio.script.lang);
+  // Get beat timing info
+  const studioBeat = context.studio.beats[index];
+  const beatStartAt = studioBeat.startAt ?? 0;
+  const beatDuration = studioBeat.duration ?? 0;
+  const introPadding = MulmoStudioContextMethods.getIntroPadding(context);
+  // Determine split texts based on captionSplit setting
+  const captionSplit = captionParams.captionSplit ?? "none";
+  const splitTexts = captionSplit === "estimate" ? getSplitTexts(text, beat.texts, captionParams.textSplit) : [text];
+  // Calculate timing
+  const cumulativeRatios = calculateCumulativeRatios(calculateTimingRatios(splitTexts));
+  // Generate caption images with absolute timing
+  const captionFiles = await Promise.all(splitTexts.map(async (segmentText, subIndex) => {
+    const imagePath = getCaptionImagePath(context, index, subIndex);
+    const htmlData = interpolate(template, {
+      caption: processLineBreaks(segmentText),
+      width: `${canvasSize.width}`,
+      height: `${canvasSize.height}`,
+      styles: captionParams.styles.join(";\n"),
+    });
+    await renderHTMLToImage(htmlData, imagePath, canvasSize.width, canvasSize.height, false, true);
+    return {
+      file: imagePath,
+      startAt: beatStartAt + introPadding + beatDuration * cumulativeRatios[subIndex],
+      endAt: beatStartAt + introPadding + beatDuration * cumulativeRatios[subIndex + 1],
+    };
+  }));
+  return captionFiles;
+};
+// GraphAI agent for caption generation
+const captionGenerationAgent = async (namedInputs) => {
+  const { beat, context, index } = namedInputs;
+  try {
+    MulmoStudioContextMethods.setBeatSessionState(context, "caption", index, beat.id, true);
+    const captionFiles = await generateBeatCaptions(beat, context, index);
+    context.studio.beats[index].captionFiles = captionFiles;
+    return captionFiles;
+  }
+  finally {
+    MulmoStudioContextMethods.setBeatSessionState(context, "caption", index, beat.id, false);
+  }
+};
 export const caption_graph_data = {
   version: 0.5,
   nodes: {
@@ -23,37 +122,8 @@ export const caption_graph_data = {
     graph: {
       nodes: {
         generateCaption: {
-          agent: async (namedInputs) => {
-            const { beat, context, index } = namedInputs;
-            try {
-              MulmoStudioContextMethods.setBeatSessionState(context, "caption", index, beat.id, true);
-              const captionParams = mulmoCaptionParamsSchema.parse({ ...context.studio.script.captionParams, ...beat.captionParams });
-              const canvasSize = MulmoPresentationStyleMethods.getCanvasSize(context.presentationStyle);
-              const imagePath = getCaptionImagePath(context, index);
-              const template = getHTMLFile("caption");
-              if (captionParams.lang && !context.multiLingual?.[index]?.multiLingualTexts?.[captionParams.lang]) {
-                GraphAILogger.warn(`No multiLingual caption found for beat ${index}, lang: ${captionParams.lang}`);
-              }
-              const text = localizedText(beat, context.multiLingual?.[index], captionParams.lang, context.studio.script.lang);
-              const htmlData = interpolate(template, {
-                caption: processLineBreaks(text),
-                width: `${canvasSize.width}`,
-                height: `${canvasSize.height}`,
-                styles: captionParams.styles.join(";\n"),
-              });
-              await renderHTMLToImage(htmlData, imagePath, canvasSize.width, canvasSize.height, false, true);
-              context.studio.beats[index].captionFile = imagePath;
-              return imagePath;
-            }
-            finally {
-              MulmoStudioContextMethods.setBeatSessionState(context, "caption", index, beat.id, false);
-            }
-          },
-          inputs: {
-            beat: ":beat",
-            context: ":context",
-            index: ":__mapIndex",
-          },
+          agent: captionGenerationAgent,
+          inputs: { beat: ":beat", context: ":context", index: ":__mapIndex" },
           isResult: true,
         },
       },
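
The split-and-time logic above is easiest to follow with concrete numbers. A minimal runnable sketch (the helper is re-derived from the hunk above for illustration; the beat timing values are hypothetical):

```js
// Re-derived from the hunk above; not part of the package.
const splitTextByDelimiters = (text, delimiters) => {
  const segments = [];
  let current = "";
  for (const char of [...text]) {
    current += char;
    if (delimiters.includes(char)) {
      if (current.trim()) segments.push(current.trim());
      current = "";
    }
  }
  if (current.trim()) segments.push(current.trim());
  return segments.length > 0 ? segments : [text];
};

const texts = splitTextByDelimiters("こんにちは。今日は晴れです!", ["。", "!"]);
// -> ["こんにちは。", "今日は晴れです!"] (6 and 8 characters)

const total = texts.reduce((sum, t) => sum + t.length, 0); // 14
const ratios = texts.map((t) => t.length / total); // [6/14, 8/14]
const cumulative = ratios.reduce((acc, r) => [...acc, acc[acc.length - 1] + r], [0]);
// -> [0, 0.428..., 1]

// Hypothetical beat: startAt = 10s, duration = 7s, introPadding = 1s.
const [startAt, duration, introPadding] = [10, 7, 1];
const windows = texts.map((_, i) => ({
  startAt: startAt + introPadding + duration * cumulative[i],
  endAt: startAt + introPadding + duration * cumulative[i + 1],
}));
console.log(windows); // [{ startAt: 11, endAt: 14 }, { startAt: 14, endAt: 18 }]
```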
@@ -27,6 +27,10 @@ export declare const getTransitionVideoId: (transition: MulmoTransition, videoId
   beatIndex: number;
 };
 export declare const getConcatVideoFilter: (concatVideoId: string, videoIdsForBeats: VideoId[]) => string;
+export declare const getTransitionFrameDurations: (context: MulmoStudioContext, index: number) => {
+  firstDuration: number;
+  lastDuration: number;
+};
 export declare const validateBeatSource: (studioBeat: MulmoStudioContext["studio"]["beats"][number], index: number) => string;
 export declare const addSplitAndExtractFrames: (ffmpegContext: FfmpegContext, videoId: string, firstDuration: number, lastDuration: number, isMovie: boolean, needFirst: boolean, needLast: boolean, canvasInfo: {
   width: number;
@@ -86,19 +86,22 @@ const getOutputOption = (audioId, videoId) => {
   ];
 };
 const addCaptions = (ffmpegContext, concatVideoId, context, caption) => {
-  const beatsWithCaptions = context.studio.beats.filter(({ captionFile }) => captionFile);
+  const beatsWithCaptions = context.studio.beats.filter(({ captionFiles }) => captionFiles && captionFiles.length > 0);
   if (caption && beatsWithCaptions.length > 0) {
-    const introPadding = MulmoStudioContextMethods.getIntroPadding(context);
-    return beatsWithCaptions.reduce((prevVideoId, beat, index) => {
-      const { startAt, duration, captionFile } = beat;
-      if (startAt !== undefined && duration !== undefined && captionFile !== undefined) {
-        const captionInputIndex = FfmpegContextAddInput(ffmpegContext, captionFile);
-        const compositeVideoId = `oc${index}`;
-        ffmpegContext.filterComplex.push(`[${prevVideoId}][${captionInputIndex}:v]overlay=format=auto:enable='between(t,${startAt + introPadding},${startAt + duration + introPadding})'[${compositeVideoId}]`);
-        return compositeVideoId;
+    const { videoId } = beatsWithCaptions.reduce((acc, beat) => {
+      const { captionFiles } = beat;
+      if (!captionFiles) {
+        return acc;
       }
-      return prevVideoId;
-    }, concatVideoId);
+      return captionFiles.reduce((innerAcc, captionData) => {
+        const { file, startAt, endAt } = captionData;
+        const captionInputIndex = FfmpegContextAddInput(ffmpegContext, file);
+        const compositeVideoId = `oc${innerAcc.captionIndex}`;
+        ffmpegContext.filterComplex.push(`[${innerAcc.videoId}][${captionInputIndex}:v]overlay=format=auto:enable='between(t,${startAt},${endAt})'[${compositeVideoId}]`);
+        return { videoId: compositeVideoId, captionIndex: innerAcc.captionIndex + 1 };
+      }, acc);
+    }, { videoId: concatVideoId, captionIndex: 0 });
+    return videoId;
   }
   return concatVideoId;
 };
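
Because startAt/endAt are now stored per caption segment as absolute times (intro padding is folded in at generation time), each segment becomes one chained overlay. A sketch of the entries the reducer would push for the two windows computed earlier (stream labels follow the oc<N> pattern in the code; the input indices and the incoming label are hypothetical):

```js
// Hypothetical filterComplex entries for one beat with two caption segments:
ffmpegContext.filterComplex.push("[concat][2:v]overlay=format=auto:enable='between(t,11,14)'[oc0]");
ffmpegContext.filterComplex.push("[oc0][3:v]overlay=format=auto:enable='between(t,14,18)'[oc1]");
// addCaptions then returns "oc1", the label of the last composite stream.
```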
@@ -280,24 +283,21 @@ const getClampedTransitionDuration = (transitionDuration, prevBeatDuration, curr
   const maxDuration = Math.min(prevBeatDuration, currentBeatDuration) * 0.9; // Use 90% to leave some margin
   return Math.min(transitionDuration, maxDuration);
 };
-const getTransitionFrameDurations = (context, index) => {
+export const getTransitionFrameDurations = (context, index) => {
   const minFrame = 1 / 30; // 30fpsを想定。最小1フレーム
   const beats = context.studio.beats;
   const scriptBeats = context.studio.script.beats;
+  const getTransitionDuration = (transition, prevBeatIndex, currentBeatIndex) => {
+    if (!transition || prevBeatIndex < 0 || currentBeatIndex >= beats.length)
+      return 0;
+    const prevBeatDuration = beats[prevBeatIndex].duration ?? 1;
+    const currentBeatDuration = beats[currentBeatIndex].duration ?? 1;
+    return getClampedTransitionDuration(transition.duration, prevBeatDuration, currentBeatDuration);
+  };
   const currentTransition = MulmoPresentationStyleMethods.getMovieTransition(context, scriptBeats[index]);
-  let firstDuration = 0;
-  if (currentTransition && index > 0) {
-    const prevBeatDuration = beats[index - 1].duration ?? 1;
-    const currentBeatDuration = beats[index].duration ?? 1;
-    firstDuration = getClampedTransitionDuration(currentTransition.duration, prevBeatDuration, currentBeatDuration);
-  }
+  const firstDuration = index > 0 ? getTransitionDuration(currentTransition, index - 1, index) : 0;
   const nextTransition = index < scriptBeats.length - 1 ? MulmoPresentationStyleMethods.getMovieTransition(context, scriptBeats[index + 1]) : null;
-  let lastDuration = 0;
-  if (nextTransition) {
-    const prevBeatDuration = beats[index].duration ?? 1;
-    const currentBeatDuration = beats[index + 1].duration ?? 1;
-    lastDuration = getClampedTransitionDuration(nextTransition.duration, prevBeatDuration, currentBeatDuration);
-  }
+  const lastDuration = getTransitionDuration(nextTransition, index, index + 1);
   return {
     firstDuration: Math.max(firstDuration, minFrame),
     lastDuration: Math.max(lastDuration, minFrame),
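
The refactor is behavior-preserving: the new getTransitionDuration helper folds the two duplicated clamping branches into one. A worked case with hypothetical durations:

```js
// Clamp logic from the hunk above, extracted for a quick check.
const getClampedTransitionDuration = (transitionDuration, prev, current) =>
  Math.min(transitionDuration, Math.min(prev, current) * 0.9);

// Beats of 2s and 4s with a requested 2.5s transition:
console.log(getClampedTransitionDuration(2.5, 2, 4)); // 1.8 (90% of the shorter beat)
// With no transition the helper returns 0, and the 1/30s minFrame floor applies.
```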
@@ -8,7 +8,6 @@ import movieGenAIAgent from "./movie_genai_agent.js";
 import movieReplicateAgent from "./movie_replicate_agent.js";
 import mediaMockAgent from "./media_mock_agent.js";
 import ttsElevenlabsAgent from "./tts_elevenlabs_agent.js";
-import ttsNijivoiceAgent from "./tts_nijivoice_agent.js";
 import ttsOpenaiAgent from "./tts_openai_agent.js";
 import ttsGoogleAgent from "./tts_google_agent.js";
 import ttsGeminiAgent from "./tts_gemini_agent.js";
@@ -21,4 +20,4 @@ import { browserlessAgent } from "@graphai/browserless_agent";
 import { textInputAgent } from "@graphai/input_agents";
 import { openAIAgent } from "@graphai/openai_agent";
 import { fileWriteAgent } from "@graphai/vanilla_node_agents";
-export { openAIAgent, fileWriteAgent, browserlessAgent, textInputAgent, addBGMAgent, combineAudioFilesAgent, imageGenAIAgent, imageOpenaiAgent, imageReplicateAgent, tavilySearchAgent, movieGenAIAgent, movieReplicateAgent, mediaMockAgent, ttsElevenlabsAgent, ttsNijivoiceAgent, ttsOpenaiAgent, ttsGoogleAgent, ttsGeminiAgent, ttsKotodamaAgent, validateSchemaAgent, soundEffectReplicateAgent, lipSyncReplicateAgent, puppeteerCrawlerAgent, };
+export { openAIAgent, fileWriteAgent, browserlessAgent, textInputAgent, addBGMAgent, combineAudioFilesAgent, imageGenAIAgent, imageOpenaiAgent, imageReplicateAgent, tavilySearchAgent, movieGenAIAgent, movieReplicateAgent, mediaMockAgent, ttsElevenlabsAgent, ttsOpenaiAgent, ttsGoogleAgent, ttsGeminiAgent, ttsKotodamaAgent, validateSchemaAgent, soundEffectReplicateAgent, lipSyncReplicateAgent, puppeteerCrawlerAgent, };
@@ -8,7 +8,6 @@ import movieGenAIAgent from "./movie_genai_agent.js";
 import movieReplicateAgent from "./movie_replicate_agent.js";
 import mediaMockAgent from "./media_mock_agent.js";
 import ttsElevenlabsAgent from "./tts_elevenlabs_agent.js";
-import ttsNijivoiceAgent from "./tts_nijivoice_agent.js";
 import ttsOpenaiAgent from "./tts_openai_agent.js";
 import ttsGoogleAgent from "./tts_google_agent.js";
 import ttsGeminiAgent from "./tts_gemini_agent.js";
@@ -22,4 +21,4 @@ import { textInputAgent } from "@graphai/input_agents";
 import { openAIAgent } from "@graphai/openai_agent";
 // import * as vanilla from "@graphai/vanilla";
 import { fileWriteAgent } from "@graphai/vanilla_node_agents";
-export { openAIAgent, fileWriteAgent, browserlessAgent, textInputAgent, addBGMAgent, combineAudioFilesAgent, imageGenAIAgent, imageOpenaiAgent, imageReplicateAgent, tavilySearchAgent, movieGenAIAgent, movieReplicateAgent, mediaMockAgent, ttsElevenlabsAgent, ttsNijivoiceAgent, ttsOpenaiAgent, ttsGoogleAgent, ttsGeminiAgent, ttsKotodamaAgent, validateSchemaAgent, soundEffectReplicateAgent, lipSyncReplicateAgent, puppeteerCrawlerAgent, };
+export { openAIAgent, fileWriteAgent, browserlessAgent, textInputAgent, addBGMAgent, combineAudioFilesAgent, imageGenAIAgent, imageOpenaiAgent, imageReplicateAgent, tavilySearchAgent, movieGenAIAgent, movieReplicateAgent, mediaMockAgent, ttsElevenlabsAgent, ttsOpenaiAgent, ttsGoogleAgent, ttsGeminiAgent, ttsKotodamaAgent, validateSchemaAgent, soundEffectReplicateAgent, lipSyncReplicateAgent, puppeteerCrawlerAgent, };
@@ -890,22 +890,22 @@ export const promptTemplates = [
   displayName: {
     ja: "アナウンサー",
   },
-  provider: "nijivoice",
-  voiceId: "3708ad43-cace-486c-a4ca-8fe41186e20c",
+  provider: "gemini",
+  voiceId: "Aoede",
 },
 Student: {
   displayName: {
     ja: "太郎",
   },
-  provider: "nijivoice",
-  voiceId: "a7619e48-bf6a-4f9f-843f-40485651257f",
+  provider: "gemini",
+  voiceId: "Puck",
 },
 Teacher: {
   displayName: {
     ja: "先生",
   },
-  provider: "nijivoice",
-  voiceId: "bc06c63f-fef6-43b6-92f7-67f919bd5dae",
+  provider: "gemini",
+  voiceId: "Charon",
 },
 },
 },
@@ -73,7 +73,7 @@ export const templateDataSet = {
  "```",
  sensei_and_taro: "全てを高校生にも分かるように、太郎くん(Student)と先生(Teacher)の会話、という形の台本にして。ただし要点はしっかりと押さえて。以下に別のトピックに関するサンプルを貼り付けます。このJSONフォーマットに従って。\n" +
  "```JSON\n" +
- `{"$mulmocast":{"version":"1.1","credit":"closing"},"title":"韓国の戒厳令とその日本への影響","description":"韓国で最近発令された戒厳令とその可能性のある影響について、また日本の憲法に関する考慮事項との類似点を含めた洞察に満ちた議論。","lang":"ja","beats":[{"speaker":"Announcer","text":"今日は、韓国で起きた戒厳令について、太郎くんが先生に聞きます。","imagePrompt":"A classroom setting with a curious Japanese student (Taro) and a kind teacher. Calm atmosphere, early morning light coming through the window."},{"speaker":"Student","text":"先生、今日は韓国で起きた戒厳令のことを教えてもらえますか?","imagePrompt":"The student (Taro) sitting at his desk with a serious expression, raising his hand to ask a question. Teacher is slightly surprised but attentive."},{"speaker":"Teacher","text":"もちろんだよ、太郎くん。韓国で最近、大統領が「戒厳令」っていうのを突然宣言したんだ。","imagePrompt":"TV screen showing a breaking news headline in Korean: 'President Declares Martial Law'. Students watching with concern."},{"speaker":"Student","text":"戒厳令ってなんですか?","imagePrompt":"A close-up of the student's puzzled face, with a speech bubble saying '戒厳令って?'"},{"speaker":"Teacher","text":"簡単に言うと、国がすごく危ない状態にあるとき、軍隊を使って人々の自由を制限するためのものなんだ。","imagePrompt":"Illustration of soldiers standing in the street, people being stopped and questioned, with a red 'X' on a protest sign. Moody and serious tone."},{"speaker":"Student","text":"それって怖いですね。なんでそんなことをしたんですか?","imagePrompt":"Student looking anxious, thinking deeply. Background shows a shadowy image of a politician giving orders to the military."},{"speaker":"Teacher","text":"大統領は「国会がうまく機能していないから」と言っていたけど…","imagePrompt":"A tense scene of military personnel entering a national assembly building in Korea, lawmakers looking shocked and resisting."},{"speaker":"Student","text":"ええっ!?国会議員を捕まえようとするなんて、すごく危ないことじゃないですか。","imagePrompt":"The student reacts with shock, comic-style expression with wide eyes and open mouth. Background fades into a dramatic courtroom or parliament chaos."},{"speaker":"Teacher","text":"その通りだよ。もし軍隊が国会を占拠していたら…","imagePrompt":"Dark visual of a locked parliament building with soldiers blocking the entrance, ominous sky in the background."},{"speaker":"Student","text":"韓国ではどうなったんですか?","imagePrompt":"Student leans forward, curious and worried. Background shows a hopeful scene of people holding protest signs with candles at night."},{"speaker":"Teacher","text":"幸い、野党の議員や市民たちが急いで集まって抗議して…","imagePrompt":"Peaceful protest scene in Seoul, citizens holding candles and banners, united. Hopeful tone."},{"speaker":"Student","text":"それは大変なことですね…。日本ではそんなこと起きないんですか?","imagePrompt":"Student looking toward the Japanese flag outside the school window, pensive mood."},{"speaker":"Teacher","text":"実はね、今、日本でも似たような話があるんだよ。","imagePrompt":"Teacher pointing to a newspaper headline: '緊急事態条項の議論進む'. Classroom chalkboard shows a map of Korea and Japan."},{"speaker":"Student","text":"緊急事態宣言って、韓国の戒厳令と同じようなものなんですか?","imagePrompt":"Split screen image: left side shows a soldier in Korea, right side shows a suited Japanese politician giving a press conference."},{"speaker":"Teacher","text":"似ている部分があるね。たとえば、総理大臣が…","imagePrompt":"Diagram-style visual showing the flow of emergency powers from PM to local governments. Simple, clean infographic style."},{"speaker":"Student","text":"それって便利そうですけど、なんだか心配です。","imagePrompt":"Student's concerned expression, behind him a blurry image of a street with emergency sirens glowing in red."},{"speaker":"Teacher","text":"そうだね。もちろん、緊急時には素早い対応が必要だけど…","imagePrompt":"Illustration of a balance scale: one side is 'freedom', the other 'security'. 
The scale is slightly tilting."},{"speaker":"Student","text":"韓国みたいに、軍隊が政治に口を出してくることもあり得るんですか?","imagePrompt":"Student imagining a military tank next to the Japanese parliament, shown as a thought bubble."},{"speaker":"Teacher","text":"完全にあり得ないとは言えないからこそ、注意が必要なんだ。","imagePrompt":"Japanese citizens reading newspapers and watching news with concerned faces, civic awareness growing."},{"speaker":"Student","text":"ありがとうございます。とても良い勉強になりました。","imagePrompt":"The student bows slightly to the teacher with a grateful expression. The classroom is peaceful again."},{"speaker":"Announcer","text":"ご視聴、ありがとうございました。次回の放送もお楽しみに。","imagePrompt":"Ending screen with soft background music, showing the show's logo and a thank-you message in Japanese."}],"canvasSize":{"width":1536,"height":1024},"imageParams":{"style":"<style>Ghibli style. Student (Taro) is a young teenager with a dark short hair with glasses. Teacher is a middle-aged man with grey hair and moustache.</style>"},"speechParams":{"speakers":{"Announcer":{"provider":"nijivoice","displayName":{"ja":"アナウンサー"},"voiceId":"3708ad43-cace-486c-a4ca-8fe41186e20c"},"Student":{"provider":"nijivoice","displayName":{"ja":"太郎"},"voiceId":"a7619e48-bf6a-4f9f-843f-40485651257f"},"Teacher":{"provider":"nijivoice","displayName":{"ja":"先生"},"voiceId":"bc06c63f-fef6-43b6-92f7-67f919bd5dae"}}}}\n` +
+ `{"$mulmocast":{"version":"1.1","credit":"closing"},"title":"韓国の戒厳令とその日本への影響","description":"韓国で最近発令された戒厳令とその可能性のある影響について、また日本の憲法に関する考慮事項との類似点を含めた洞察に満ちた議論。","lang":"ja","beats":[{"speaker":"Announcer","text":"今日は、韓国で起きた戒厳令について、太郎くんが先生に聞きます。","imagePrompt":"A classroom setting with a curious Japanese student (Taro) and a kind teacher. Calm atmosphere, early morning light coming through the window."},{"speaker":"Student","text":"先生、今日は韓国で起きた戒厳令のことを教えてもらえますか?","imagePrompt":"The student (Taro) sitting at his desk with a serious expression, raising his hand to ask a question. Teacher is slightly surprised but attentive."},{"speaker":"Teacher","text":"もちろんだよ、太郎くん。韓国で最近、大統領が「戒厳令」っていうのを突然宣言したんだ。","imagePrompt":"TV screen showing a breaking news headline in Korean: 'President Declares Martial Law'. Students watching with concern."},{"speaker":"Student","text":"戒厳令ってなんですか?","imagePrompt":"A close-up of the student's puzzled face, with a speech bubble saying '戒厳令って?'"},{"speaker":"Teacher","text":"簡単に言うと、国がすごく危ない状態にあるとき、軍隊を使って人々の自由を制限するためのものなんだ。","imagePrompt":"Illustration of soldiers standing in the street, people being stopped and questioned, with a red 'X' on a protest sign. Moody and serious tone."},{"speaker":"Student","text":"それって怖いですね。なんでそんなことをしたんですか?","imagePrompt":"Student looking anxious, thinking deeply. Background shows a shadowy image of a politician giving orders to the military."},{"speaker":"Teacher","text":"大統領は「国会がうまく機能していないから」と言っていたけど…","imagePrompt":"A tense scene of military personnel entering a national assembly building in Korea, lawmakers looking shocked and resisting."},{"speaker":"Student","text":"ええっ!?国会議員を捕まえようとするなんて、すごく危ないことじゃないですか。","imagePrompt":"The student reacts with shock, comic-style expression with wide eyes and open mouth. Background fades into a dramatic courtroom or parliament chaos."},{"speaker":"Teacher","text":"その通りだよ。もし軍隊が国会を占拠していたら…","imagePrompt":"Dark visual of a locked parliament building with soldiers blocking the entrance, ominous sky in the background."},{"speaker":"Student","text":"韓国ではどうなったんですか?","imagePrompt":"Student leans forward, curious and worried. Background shows a hopeful scene of people holding protest signs with candles at night."},{"speaker":"Teacher","text":"幸い、野党の議員や市民たちが急いで集まって抗議して…","imagePrompt":"Peaceful protest scene in Seoul, citizens holding candles and banners, united. Hopeful tone."},{"speaker":"Student","text":"それは大変なことですね…。日本ではそんなこと起きないんですか?","imagePrompt":"Student looking toward the Japanese flag outside the school window, pensive mood."},{"speaker":"Teacher","text":"実はね、今、日本でも似たような話があるんだよ。","imagePrompt":"Teacher pointing to a newspaper headline: '緊急事態条項の議論進む'. Classroom chalkboard shows a map of Korea and Japan."},{"speaker":"Student","text":"緊急事態宣言って、韓国の戒厳令と同じようなものなんですか?","imagePrompt":"Split screen image: left side shows a soldier in Korea, right side shows a suited Japanese politician giving a press conference."},{"speaker":"Teacher","text":"似ている部分があるね。たとえば、総理大臣が…","imagePrompt":"Diagram-style visual showing the flow of emergency powers from PM to local governments. Simple, clean infographic style."},{"speaker":"Student","text":"それって便利そうですけど、なんだか心配です。","imagePrompt":"Student's concerned expression, behind him a blurry image of a street with emergency sirens glowing in red."},{"speaker":"Teacher","text":"そうだね。もちろん、緊急時には素早い対応が必要だけど…","imagePrompt":"Illustration of a balance scale: one side is 'freedom', the other 'security'. 
The scale is slightly tilting."},{"speaker":"Student","text":"韓国みたいに、軍隊が政治に口を出してくることもあり得るんですか?","imagePrompt":"Student imagining a military tank next to the Japanese parliament, shown as a thought bubble."},{"speaker":"Teacher","text":"完全にあり得ないとは言えないからこそ、注意が必要なんだ。","imagePrompt":"Japanese citizens reading newspapers and watching news with concerned faces, civic awareness growing."},{"speaker":"Student","text":"ありがとうございます。とても良い勉強になりました。","imagePrompt":"The student bows slightly to the teacher with a grateful expression. The classroom is peaceful again."},{"speaker":"Announcer","text":"ご視聴、ありがとうございました。次回の放送もお楽しみに。","imagePrompt":"Ending screen with soft background music, showing the show's logo and a thank-you message in Japanese."}],"canvasSize":{"width":1536,"height":1024},"imageParams":{"style":"<style>Ghibli style. Student (Taro) is a young teenager with a dark short hair with glasses. Teacher is a middle-aged man with grey hair and moustache.</style>"},"speechParams":{"speakers":{"Announcer":{"provider":"gemini","displayName":{"ja":"アナウンサー"},"voiceId":"Aoede"},"Student":{"provider":"gemini","displayName":{"ja":"太郎"},"voiceId":"Puck"},"Teacher":{"provider":"gemini","displayName":{"ja":"先生"},"voiceId":"Charon"}}}}\n` +
  "```",
  shorts: "This script is for YouTube shorts. The first beat should be a hook, which describes the topic. Another AI will generate images for each beat based on the image prompt of that beat. Movie prompts must be written in English.\n" +
  "```JSON\n" +
@@ -113,10 +113,6 @@ export type OpenAITTSAgentParams = TTSAgentParams & {
   model: string;
   speed: number;
 };
-export type NijivoiceTTSAgentParams = TTSAgentParams & {
-  speed: number;
-  speed_global: number;
-};
 export type KotodamaTTSAgentParams = TTSAgentParams & {
   decoration: string;
 };
@@ -1,9 +1,4 @@
 export declare const provider2TTSAgent: {
-  nijivoice: {
-    agentName: string;
-    hasLimitedConcurrency: boolean;
-    keyName: string;
-  };
   openai: {
     agentName: string;
     hasLimitedConcurrency: boolean;
@@ -1,10 +1,5 @@
 // node & browser
 export const provider2TTSAgent = {
-  nijivoice: {
-    agentName: "ttsNijivoiceAgent",
-    hasLimitedConcurrency: true,
-    keyName: "NIJIVOICE_API_KEY",
-  },
   openai: {
     agentName: "ttsOpenaiAgent",
     hasLimitedConcurrency: false,
@@ -190,9 +190,29 @@ export declare const mulmoTextSlideMediaSchema: z.ZodObject<{
     bullets: z.ZodOptional<z.ZodArray<z.ZodString>>;
   }, z.core.$strip>;
 }, z.core.$strict>;
+export declare const captionSplitSchema: z.ZodDefault<z.ZodEnum<{
+  none: "none";
+  estimate: "estimate";
+}>>;
+export declare const textSplitSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
+  type: z.ZodLiteral<"none">;
+}, z.core.$strip>, z.ZodObject<{
+  type: z.ZodLiteral<"delimiters">;
+  delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+}, z.core.$strip>], "type">;
 export declare const mulmoCaptionParamsSchema: z.ZodObject<{
   lang: z.ZodOptional<z.ZodString>;
   styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+  captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+    none: "none";
+    estimate: "estimate";
+  }>>>;
+  textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+    type: z.ZodLiteral<"none">;
+  }, z.core.$strip>, z.ZodObject<{
+    type: z.ZodLiteral<"delimiters">;
+    delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+  }, z.core.$strip>], "type">>;
 }, z.core.$strict>;
 export declare const mulmoChartMediaSchema: z.ZodObject<{
   type: z.ZodLiteral<"chart">;
@@ -747,6 +767,7 @@ export declare const mulmoMovieParamsSchema: z.ZodObject<{
 export declare const mulmoBeatSchema: z.ZodObject<{
   speaker: z.ZodOptional<z.ZodString>;
   text: z.ZodDefault<z.ZodOptional<z.ZodString>>;
+  texts: z.ZodOptional<z.ZodArray<z.ZodString>>;
   id: z.ZodOptional<z.ZodString>;
   description: z.ZodOptional<z.ZodString>;
   image: z.ZodOptional<z.ZodUnion<readonly [z.ZodObject<{
@@ -1130,6 +1151,16 @@ export declare const mulmoBeatSchema: z.ZodObject<{
   captionParams: z.ZodOptional<z.ZodObject<{
     lang: z.ZodOptional<z.ZodString>;
     styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+    captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+      none: "none";
+      estimate: "estimate";
+    }>>>;
+    textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+      type: z.ZodLiteral<"none">;
+    }, z.core.$strip>, z.ZodObject<{
+      type: z.ZodLiteral<"delimiters">;
+      delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    }, z.core.$strip>], "type">>;
   }, z.core.$strict>>;
   imageNames: z.ZodOptional<z.ZodArray<z.ZodString>>;
   imagePrompt: z.ZodOptional<z.ZodString>;
@@ -1485,6 +1516,16 @@ export declare const mulmoPresentationStyleSchema: z.ZodObject<{
   captionParams: z.ZodOptional<z.ZodObject<{
     lang: z.ZodOptional<z.ZodString>;
     styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+    captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+      none: "none";
+      estimate: "estimate";
+    }>>>;
+    textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+      type: z.ZodLiteral<"none">;
+    }, z.core.$strip>, z.ZodObject<{
+      type: z.ZodLiteral<"delimiters">;
+      delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    }, z.core.$strip>], "type">>;
   }, z.core.$strict>>;
   audioParams: z.ZodDefault<z.ZodObject<{
     padding: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
@@ -1836,6 +1877,16 @@ export declare const mulmoScriptSchema: z.ZodObject<{
   captionParams: z.ZodOptional<z.ZodObject<{
     lang: z.ZodOptional<z.ZodString>;
     styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+    captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+      none: "none";
+      estimate: "estimate";
+    }>>>;
+    textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+      type: z.ZodLiteral<"none">;
+    }, z.core.$strip>, z.ZodObject<{
+      type: z.ZodLiteral<"delimiters">;
+      delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    }, z.core.$strip>], "type">>;
   }, z.core.$strict>>;
   audioParams: z.ZodDefault<z.ZodObject<{
     padding: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
@@ -1874,6 +1925,7 @@ export declare const mulmoScriptSchema: z.ZodObject<{
   beats: z.ZodArray<z.ZodObject<{
     speaker: z.ZodOptional<z.ZodString>;
     text: z.ZodDefault<z.ZodOptional<z.ZodString>>;
+    texts: z.ZodOptional<z.ZodArray<z.ZodString>>;
     id: z.ZodOptional<z.ZodString>;
     description: z.ZodOptional<z.ZodString>;
     image: z.ZodOptional<z.ZodUnion<readonly [z.ZodObject<{
@@ -2257,6 +2309,16 @@ export declare const mulmoScriptSchema: z.ZodObject<{
   captionParams: z.ZodOptional<z.ZodObject<{
     lang: z.ZodOptional<z.ZodString>;
     styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+    captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+      none: "none";
+      estimate: "estimate";
+    }>>>;
+    textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+      type: z.ZodLiteral<"none">;
+    }, z.core.$strip>, z.ZodObject<{
+      type: z.ZodLiteral<"delimiters">;
+      delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    }, z.core.$strip>], "type">>;
   }, z.core.$strict>>;
   imageNames: z.ZodOptional<z.ZodArray<z.ZodString>>;
   imagePrompt: z.ZodOptional<z.ZodString>;
@@ -2289,6 +2351,11 @@ export declare const mulmoStudioBeatSchema: z.ZodObject<{
   soundEffectFile: z.ZodOptional<z.ZodString>;
   lipSyncFile: z.ZodOptional<z.ZodString>;
   captionFile: z.ZodOptional<z.ZodString>;
+  captionFiles: z.ZodOptional<z.ZodArray<z.ZodObject<{
+    file: z.ZodString;
+    startAt: z.ZodNumber;
+    endAt: z.ZodNumber;
+  }, z.core.$strip>>>;
   htmlImageFile: z.ZodOptional<z.ZodString>;
   markdown: z.ZodOptional<z.ZodString>;
   html: z.ZodOptional<z.ZodString>;
@@ -2682,6 +2749,16 @@ export declare const mulmoStudioSchema: z.ZodObject<{
   captionParams: z.ZodOptional<z.ZodObject<{
     lang: z.ZodOptional<z.ZodString>;
     styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+    captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+      none: "none";
+      estimate: "estimate";
+    }>>>;
+    textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+      type: z.ZodLiteral<"none">;
+    }, z.core.$strip>, z.ZodObject<{
+      type: z.ZodLiteral<"delimiters">;
+      delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    }, z.core.$strip>], "type">>;
   }, z.core.$strict>>;
   audioParams: z.ZodDefault<z.ZodObject<{
     padding: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
@@ -2720,6 +2797,7 @@ export declare const mulmoStudioSchema: z.ZodObject<{
   beats: z.ZodArray<z.ZodObject<{
     speaker: z.ZodOptional<z.ZodString>;
     text: z.ZodDefault<z.ZodOptional<z.ZodString>>;
+    texts: z.ZodOptional<z.ZodArray<z.ZodString>>;
     id: z.ZodOptional<z.ZodString>;
     description: z.ZodOptional<z.ZodString>;
     image: z.ZodOptional<z.ZodUnion<readonly [z.ZodObject<{
@@ -3103,6 +3181,16 @@ export declare const mulmoStudioSchema: z.ZodObject<{
   captionParams: z.ZodOptional<z.ZodObject<{
     lang: z.ZodOptional<z.ZodString>;
     styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+    captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+      none: "none";
+      estimate: "estimate";
+    }>>>;
+    textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+      type: z.ZodLiteral<"none">;
+    }, z.core.$strip>, z.ZodObject<{
+      type: z.ZodLiteral<"delimiters">;
+      delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    }, z.core.$strip>], "type">>;
   }, z.core.$strict>>;
   imageNames: z.ZodOptional<z.ZodArray<z.ZodString>>;
   imagePrompt: z.ZodOptional<z.ZodString>;
@@ -3136,6 +3224,11 @@ export declare const mulmoStudioSchema: z.ZodObject<{
   soundEffectFile: z.ZodOptional<z.ZodString>;
   lipSyncFile: z.ZodOptional<z.ZodString>;
   captionFile: z.ZodOptional<z.ZodString>;
+  captionFiles: z.ZodOptional<z.ZodArray<z.ZodObject<{
+    file: z.ZodString;
+    startAt: z.ZodNumber;
+    endAt: z.ZodNumber;
+  }, z.core.$strip>>>;
   htmlImageFile: z.ZodOptional<z.ZodString>;
   markdown: z.ZodOptional<z.ZodString>;
   html: z.ZodOptional<z.ZodString>;
@@ -3464,6 +3557,16 @@ export declare const mulmoPromptTemplateSchema: z.ZodObject<{
   captionParams: z.ZodOptional<z.ZodObject<{
     lang: z.ZodOptional<z.ZodString>;
     styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+    captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+      none: "none";
+      estimate: "estimate";
+    }>>>;
+    textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+      type: z.ZodLiteral<"none">;
+    }, z.core.$strip>, z.ZodObject<{
+      type: z.ZodLiteral<"delimiters">;
+      delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    }, z.core.$strip>], "type">>;
   }, z.core.$strict>>;
   audioParams: z.ZodDefault<z.ZodObject<{
     padding: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
@@ -3809,6 +3912,16 @@ export declare const mulmoPromptTemplateFileSchema: z.ZodObject<{
   captionParams: z.ZodOptional<z.ZodObject<{
     lang: z.ZodOptional<z.ZodString>;
     styles: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString>>>;
+    captionSplit: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
+      none: "none";
+      estimate: "estimate";
+    }>>>;
+    textSplit: z.ZodOptional<z.ZodDiscriminatedUnion<[z.ZodObject<{
+      type: z.ZodLiteral<"none">;
+    }, z.core.$strip>, z.ZodObject<{
+      type: z.ZodLiteral<"delimiters">;
+      delimiters: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    }, z.core.$strip>], "type">>;
   }, z.core.$strict>>;
   audioParams: z.ZodDefault<z.ZodObject<{
     padding: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
@@ -119,10 +119,20 @@ export const mulmoTextSlideMediaSchema = z
   }),
 })
 .strict();
+export const captionSplitSchema = z.enum(["none", "estimate"]).default("none");
+export const textSplitSchema = z.discriminatedUnion("type", [
+  z.object({ type: z.literal("none") }),
+  z.object({
+    type: z.literal("delimiters"),
+    delimiters: z.array(z.string()).optional(), // default: ["。", "?", "!", ".", "?", "!"]
+  }),
+]);
 export const mulmoCaptionParamsSchema = z
   .object({
     lang: langSchema.optional(),
     styles: z.array(z.string()).optional().default([]), // css styles
+    captionSplit: captionSplitSchema.optional(), // how to determine caption timing
+    textSplit: textSplitSchema.optional(), // how to split text into segments (default: none)
   })
   .strict();
 export const mulmoChartMediaSchema = z
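
A minimal usage sketch of the new options (import path assumed; the caption test script further below exercises the same fields in JSON):

```js
import { mulmoCaptionParamsSchema } from "mulmocast";

const params = mulmoCaptionParamsSchema.parse({
  lang: "ja",
  styles: ["color: yellow"],
  captionSplit: "estimate", // time caption segments by relative text length
  textSplit: { type: "delimiters", delimiters: ["。", "!", "?"] },
});
// Invalid shapes throw a ZodError, e.g. textSplit: { type: "regex" }
// is rejected by the discriminated union.
```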
@@ -317,6 +327,7 @@ export const mulmoBeatSchema = z
 .object({
   speaker: speakerIdSchema.optional(),
   text: z.string().optional().default("").describe("Text to be spoken. If empty, the audio is not generated."),
+  texts: z.array(z.string()).optional().describe("Manually split texts for captions. Takes precedence over text for caption display."),
   id: z.string().optional().describe("Unique identifier for the beat."),
   description: z.string().optional(),
   image: mulmoImageAssetSchema.optional(),
@@ -442,7 +453,14 @@ export const mulmoStudioBeatSchema = z
   movieFile: z.string().optional(), // path to the movie file
   soundEffectFile: z.string().optional(), // path to the sound effect file
   lipSyncFile: z.string().optional(), // path to the lip sync file
-  captionFile: z.string().optional(), // path to the caption image
+  captionFile: z.string().optional(), // path to the caption image (deprecated, use captionFiles)
+  captionFiles: z
+    .array(z.object({
+      file: z.string(),
+      startAt: z.number(), // absolute start time in seconds
+      endAt: z.number(), // absolute end time in seconds
+    }))
+    .optional(), // split caption images with timing
   htmlImageFile: z.string().optional(), // path to the html image
   markdown: z.string().optional(), // markdown string (alternative to image)
   html: z.string().optional(), // html string (alternative to image)
@@ -247,6 +247,7 @@ export declare const createStudioData: (_mulmoScript: MulmoScript, fileName: str
   beats: {
     text: string;
     speaker?: string | undefined;
+    texts?: string[] | undefined;
    id?: string | undefined;
     description?: string | undefined;
     image?: {
@@ -555,6 +556,13 @@ export declare const createStudioData: (_mulmoScript: MulmoScript, fileName: str
   captionParams?: {
     styles: string[];
     lang?: string | undefined;
+    captionSplit?: "none" | "estimate" | undefined;
+    textSplit?: {
+      type: "none";
+    } | {
+      type: "delimiters";
+      delimiters?: string[] | undefined;
+    } | undefined;
   } | undefined;
   imageNames?: string[] | undefined;
   imagePrompt?: string | undefined;
@@ -583,6 +591,13 @@ export declare const createStudioData: (_mulmoScript: MulmoScript, fileName: str
   captionParams?: {
     styles: string[];
     lang?: string | undefined;
+    captionSplit?: "none" | "estimate" | undefined;
+    textSplit?: {
+      type: "none";
+    } | {
+      type: "delimiters";
+      delimiters?: string[] | undefined;
+    } | undefined;
   } | undefined;
   title?: string | undefined;
   description?: string | undefined;
@@ -611,6 +626,11 @@ export declare const createStudioData: (_mulmoScript: MulmoScript, fileName: str
   soundEffectFile?: string | undefined;
   lipSyncFile?: string | undefined;
   captionFile?: string | undefined;
+  captionFiles?: {
+    file: string;
+    startAt: number;
+    endAt: number;
+  }[] | undefined;
   htmlImageFile?: string | undefined;
   markdown?: string | undefined;
   html?: string | undefined;
@@ -867,6 +887,7 @@ export declare const initializeContextFromFiles: (files: FileObject, raiseError:
   beats: {
     text: string;
     speaker?: string | undefined;
+    texts?: string[] | undefined;
     id?: string | undefined;
     description?: string | undefined;
     image?: {
@@ -1175,6 +1196,13 @@ export declare const initializeContextFromFiles: (files: FileObject, raiseError:
   captionParams?: {
     styles: string[];
     lang?: string | undefined;
+    captionSplit?: "none" | "estimate" | undefined;
+    textSplit?: {
+      type: "none";
+    } | {
+      type: "delimiters";
+      delimiters?: string[] | undefined;
+    } | undefined;
   } | undefined;
   imageNames?: string[] | undefined;
   imagePrompt?: string | undefined;
@@ -1203,6 +1231,13 @@ export declare const initializeContextFromFiles: (files: FileObject, raiseError:
   captionParams?: {
     styles: string[];
     lang?: string | undefined;
+    captionSplit?: "none" | "estimate" | undefined;
+    textSplit?: {
+      type: "none";
+    } | {
+      type: "delimiters";
+      delimiters?: string[] | undefined;
+    } | undefined;
   } | undefined;
   title?: string | undefined;
   description?: string | undefined;
@@ -1231,6 +1266,11 @@ export declare const initializeContextFromFiles: (files: FileObject, raiseError:
   soundEffectFile?: string | undefined;
   lipSyncFile?: string | undefined;
   captionFile?: string | undefined;
+  captionFiles?: {
+    file: string;
+    startAt: number;
+    endAt: number;
+  }[] | undefined;
   htmlImageFile?: string | undefined;
   markdown?: string | undefined;
   html?: string | undefined;
@@ -1504,6 +1544,13 @@ export declare const initializeContextFromFiles: (files: FileObject, raiseError:
   captionParams?: {
     styles: string[];
     lang?: string | undefined;
+    captionSplit?: "none" | "estimate" | undefined;
+    textSplit?: {
+      type: "none";
+    } | {
+      type: "delimiters";
+      delimiters?: string[] | undefined;
+    } | undefined;
   } | undefined;
 };
 sessionState: {
@@ -34,7 +34,7 @@ export declare const getBeatMoviePaths: (context: MulmoStudioContext, index: num
   lipSyncFile: string;
 };
 export declare const getReferenceImagePath: (context: MulmoStudioContext, key: string, extension: string) => string;
-export declare const getCaptionImagePath: (context: MulmoStudioContext, index: number) => string;
+export declare const getCaptionImagePath: (context: MulmoStudioContext, index: number, subIndex?: number) => string;
 export declare const getOutputPdfFilePath: (outDirPath: string, fileName: string, pdfMode: PDFMode, lang?: string) => string;
 export declare const getPromptTemplateFilePath: (promptTemplateName: string) => string;
 export declare const mkdir: (dirPath: string) => void;
package/lib/utils/file.js CHANGED
@@ -109,8 +109,11 @@ export const getReferenceImagePath = (context, key, extension) => {
   const imageProjectDirPath = MulmoStudioContextMethods.getImageProjectDirPath(context);
   return `${imageProjectDirPath}/${key}.${extension}`;
 };
-export const getCaptionImagePath = (context, index) => {
+export const getCaptionImagePath = (context, index, subIndex) => {
   const imageProjectDirPath = MulmoStudioContextMethods.getImageProjectDirPath(context);
+  if (subIndex !== undefined) {
+    return `${imageProjectDirPath}/${index}_caption_${subIndex}.png`;
+  }
   return `${imageProjectDirPath}/${index}_caption.png`;
 };
 // pdf
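
So split captions get numbered siblings next to the legacy single-caption name; for example (directory value hypothetical):

```js
// getCaptionImagePath(context, 3)    -> "<imageProjectDir>/3_caption.png"
// getCaptionImagePath(context, 3, 0) -> "<imageProjectDir>/3_caption_0.png"
// getCaptionImagePath(context, 3, 1) -> "<imageProjectDir>/3_caption_1.png"
```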
@@ -1,5 +1,4 @@
 import type { AgentFilterFunction } from "graphai";
-export declare const nijovoiceTextAgentFilter: AgentFilterFunction;
 export declare const fileCacheAgentFilter: AgentFilterFunction;
 export declare const browserlessCacheGenerator: (cacheDir: string) => AgentFilterFunction;
 export declare const getBackupFilePath: (originalPath: string) => string;
@@ -6,15 +6,7 @@ import { GraphAILogger } from "graphai";
 import { writingMessage, isFile } from "./file.js";
 import { text2hash } from "./utils_node.js";
 import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
-import { replacementsJa, replacePairsJa } from "../utils/string.js";
 dotenv.config({ quiet: true });
-export const nijovoiceTextAgentFilter = async (context, next) => {
-  const { text, provider, lang } = context.namedInputs;
-  if (provider === "nijivoice" && lang === "ja") {
-    context.namedInputs.text = replacePairsJa(replacementsJa)(text);
-  }
-  return next(context);
-};
 export const fileCacheAgentFilter = async (context, next) => {
   const { force, file, index, mulmoContext, sessionType, id, withBackup } = context.namedInputs.cache;
   /*
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "mulmocast",
-  "version": "2.1.19",
+  "version": "2.1.21",
   "description": "",
   "type": "module",
   "main": "lib/index.node.js",
@@ -65,7 +65,10 @@
   "nijivoice": "npx tsx batch/niji_sample.ts && yarn run movie scripts/samples/niji_voice.json",
   "generate_action_docs": "npx tsx ./automation/generate_actions_docs/generate_action_docs.ts"
 },
-"repository": "git+ssh://git@github.com/receptron/mulmocast-cli.git",
+"repository": {
+  "type": "git",
+  "url": "git+ssh://git@github.com/receptron/mulmocast-cli.git"
+},
 "author": "snakajima",
 "license": "AGPL-3.0-only",
 "bugs": {
@@ -19,28 +19,28 @@
 "speechParams": {
   "speakers": {
     "Announcer": {
-      "provider": "nijivoice",
+      "provider": "gemini",
       "displayName": {
         "ja": "アナウンサー"
       },
-      "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c",
+      "voiceId": "Aoede",
       "speechOptions": {
         "speed": 1.666
       }
     },
     "Student": {
-      "provider": "nijivoice",
+      "provider": "gemini",
       "displayName": {
         "ja": "生徒"
       },
-      "voiceId": "a7619e48-bf6a-4f9f-843f-40485651257f"
+      "voiceId": "Puck"
     },
     "Teacher": {
-      "provider": "nijivoice",
+      "provider": "gemini",
       "displayName": {
         "ja": "先生"
       },
-      "voiceId": "bc06c63f-fef6-43b6-92f7-67f919bd5dae"
+      "voiceId": "Charon"
     }
   }
 },
@@ -17,25 +17,27 @@
   "style": "<style>monochrome"
 },
 "speechParams": {
-  "provider": "nijivoice",
   "speakers": {
     "Announcer": {
       "displayName": {
         "ja": "千草朋香"
       },
-      "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c"
+      "provider": "gemini",
+      "voiceId": "Aoede"
     },
     "Student": {
       "displayName": {
         "ja": "太郎"
       },
-      "voiceId": "a7619e48-bf6a-4f9f-843f-40485651257f"
+      "provider": "gemini",
+      "voiceId": "Puck"
     },
     "Teacher": {
       "displayName": {
         "ja": "山田先生"
       },
-      "voiceId": "bc06c63f-fef6-43b6-92f7-67f919bd5dae"
+      "provider": "gemini",
+      "voiceId": "Charon"
     }
   }
 },
@@ -22,9 +22,9 @@
   "provider": "elevenlabs",
   "voiceId": "3JDquces8E8bkmvbh6Bc"
 },
-"Nijivoice": {
-  "provider": "nijivoice",
-  "voiceId": "231e0170-0ece-4155-be44-231423062f41"
+"Kotodama": {
+  "provider": "kotodama",
+  "voiceId": "Poporo"
 }
 }
 },
@@ -41,7 +41,7 @@
 },
 {
   "speaker": "Gemini",
-  "text": "こんにちは、テストです。ジェミニ",
+  "text": "こんにちは、テストです。ジェミニです。",
   "image": {
     "type": "textSlide",
     "slide": {
@@ -70,12 +70,12 @@
   }
 },
 {
-  "speaker": "Nijivoice",
-  "text": "こんにちは、テストです。ニジヴォイス",
+  "speaker": "Kotodama",
+  "text": "こんにちは、テストです。コトダマ",
   "image": {
     "type": "textSlide",
     "slide": {
-      "title": "Nijivoice TTS"
+      "title": "Kotodama TTS"
     }
   }
 }
@@ -5,7 +5,12 @@
 "lang": "en",
 "captionParams": {
   "lang": "en",
-  "styles": ["color: yellow"]
+  "styles": ["color: yellow"],
+  "captionSplit": "estimate",
+  "textSplit": {
+    "type": "delimiters",
+    "delimiters": ["。", "!", "?"]
+  }
 },
 "beats": [
   {
@@ -10,8 +10,8 @@
 "voiceId": "shimmer",
 "lang": {
   "ja": {
-    "provider": "nijivoice",
-    "voiceId": "9d9ed276-49ee-443a-bc19-26e6136d05f0"
+    "provider": "gemini",
+    "voiceId": "Leda"
   }
 }
 }
@@ -70,7 +70,7 @@
 "image": {
   "type": "textSlide",
   "slide": {
-    "title": "Text replacement test for nijivoice"
+    "title": "Text replacement test for Gemini"
   }
 }
 },
@@ -29,8 +29,8 @@
   }
 },
 "Host": {
-  "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c",
-  "provider": "nijivoice",
+  "voiceId": "Kore",
+  "provider": "gemini",
   "displayName": {
     "en": "Japanese Host"
   }
@@ -79,12 +79,12 @@
 },
 {
   "speaker": "Host",
-  "text": "そして私は、日本語音声合成のためのNijivoiceを使用するホストです。",
+  "text": "そして私は、Gemini TTS です。Google's TTS とは別の方法を利用しています。",
   "image": {
     "type": "textSlide",
     "slide": {
       "title": "Mixed Provider Demo",
-      "subtitle": "Nijivoice Speaker (Japanese)"
+      "subtitle": "Gemini Speaker (Japanese)"
     }
   }
 }