cerevox 3.0.0-beta.21 → 3.0.0-beta.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -50,7 +50,6 @@ const constants_1 = require("../../utils/constants");
50
50
  const videokit_1 = require("../../utils/videokit");
51
51
  const promises_1 = require("node:fs/promises");
52
52
  const node_path_1 = __importStar(require("node:path"));
53
- const doubao_voices_full_1 = require("./helper/doubao_voices_full");
54
53
  const node_fs_1 = require("node:fs");
55
54
  const coze_1 = require("../../utils/coze");
56
55
  const mp3_duration_1 = __importDefault(require("mp3-duration"));
@@ -2463,44 +2462,44 @@ server.registerTool('generate-scene-tts', {
2463
2462
  .optional()
2464
2463
  .default(1.0)
2465
2464
  .describe('The volume of the tts.'),
2466
- emotion: zod_1.z
2467
- .enum([
2468
- 'storytelling',
2469
- 'neutral',
2470
- 'excited',
2471
- 'coldness',
2472
- 'angry',
2473
- 'sad',
2474
- 'happy',
2475
- 'surprised',
2476
- 'fear',
2477
- 'depressed',
2478
- 'lovey-dovey',
2479
- 'shy',
2480
- 'comfort',
2481
- 'tension',
2482
- 'tender',
2483
- 'magnetic',
2484
- 'vocal - fry',
2485
- 'ASMR',
2486
- ])
2487
- .optional(),
2488
2465
  voiceID: zod_1.z
2489
2466
  .string()
2490
2467
  .describe(`适合作为视频配音的音色ID,除非用户指定,否则你必须已通过 search_voice 工具检查确定该音色确实是存在的。`),
2468
+ context_texts: zod_1.z
2469
+ .array(zod_1.z.string())
2470
+ .default([])
2471
+ .describe(`语音合成的辅助信息,用于模型对话式合成,能更好的体现语音情感
2472
+
2473
+ 可以探索,比如常见示例有以下几种:
2474
+
2475
+ 1. 语速调整
2476
+ - context_texts: ["你可以说慢一点吗?"]
2477
+ 2. 情绪/语气调整
2478
+ - context_texts=["你可以用特别特别痛心的语气说话吗?"]
2479
+ - context_texts=["嗯,你的语气再欢乐一点"]
2480
+ 3. 音量调整
2481
+ - context_texts=["你嗓门再小点。"]
2482
+ 4. 音感调整
2483
+ - context_texts=["你能用骄傲的语气来说话吗?"]
2484
+ `),
2491
2485
  explicit_language: zod_1.z.enum(['zh', 'en', 'ja']).optional().default('zh'),
2492
2486
  },
2493
- }, async ({ text, sceneIndex, storyBoardFile, skipConsistencyCheck, voiceID, saveToFileName, speed, pitch, volume, emotion, explicit_language, }) => {
2487
+ }, async ({ text, sceneIndex, storyBoardFile, skipConsistencyCheck, voiceID, saveToFileName, speed, pitch, volume, context_texts, explicit_language, }) => {
2494
2488
  try {
2495
2489
  // 验证session状态
2496
2490
  const currentSession = await validateSession('generate-scene-tts');
2497
2491
  const validatedFileName = validateFileName(saveToFileName);
2498
2492
  const finalSpeed = speed ?? 1;
2499
2493
  volume = volume ?? 1;
2494
+ const ai = currentSession.ai;
2500
2495
  let scene = null;
2501
2496
  // 校验 text 与 storyboard.json 中场景设定的一致性
2502
2497
  if (sceneIndex && !skipConsistencyCheck) {
2503
2498
  try {
2499
+ const voice = (await ai.listVoices()).find(v => v.id === voiceID);
2500
+ if (!voice) {
2501
+ return createErrorResponse(`Voice ${voiceID} not found in voice-list. Use search-voices tool to find available voices. 若用户坚持要使用该音色,需跳过一致性检查。`, 'generate-scene-tts');
2502
+ }
2504
2503
  const storyBoardPath = (0, node_path_1.resolve)(process.env.ZEROCUT_PROJECT_CWD || process.cwd(), projectLocalDir, storyBoardFile);
2505
2504
  if ((0, node_fs_1.existsSync)(storyBoardPath)) {
2506
2505
  const storyBoardContent = await (0, promises_1.readFile)(storyBoardPath, 'utf8');
@@ -2551,7 +2550,6 @@ server.registerTool('generate-scene-tts', {
2551
2550
  }
2552
2551
  }
2553
2552
  console.log(`Generating TTS with voice: ${voiceID}, speed: ${finalSpeed}, text: ${text.substring(0, 100)}...`);
2554
- const ai = currentSession.ai;
2555
2553
  if (voiceID.startsWith('BV0')) {
2556
2554
  throw new Error(`BV0* 系列音色已弃用,你必须已通过 search_voice 工具检查确定该音色确实是存在的。`);
2557
2555
  }
@@ -2563,30 +2561,80 @@ server.registerTool('generate-scene-tts', {
2563
2561
  ? 'volcano'
2564
2562
  : 'minimax';
2565
2563
  let res;
2564
+ let emotion = 'auto';
2566
2565
  if (type === 'volcano') {
2567
- const voice = doubao_voices_full_1.doubaoVoicesFull.find(v => v.voiceID === voiceID);
2568
- if (!voice) {
2569
- return createErrorResponse(`Voice ${voiceID} not found in Doubao voices. Use search-voices tool to find available voices.`, 'generate-scene-tts');
2570
- }
2571
- const emotions = voice.emotions || [];
2572
- if (emotion && !emotions.includes(emotion)) {
2573
- emotion = 'neutral';
2574
- }
2575
- // 修复可能的 emotion 错误情况
2576
- emotion = emotion || 'neutral';
2577
2566
  volume = Math.max(Math.min(volume, 2.0), 0.5);
2578
2567
  res = await ai.textToSpeechVolc({
2579
2568
  text: text.trim(),
2580
2569
  speaker: voiceID,
2581
2570
  speed: Math.floor(100 * (finalSpeed - 1)),
2582
2571
  volume: Math.floor(100 * (volume - 1)),
2583
- emotion,
2572
+ context_texts,
2584
2573
  explicit_language,
2585
2574
  voice_to_caption: explicit_language === 'zh' || explicit_language === 'en',
2586
2575
  });
2587
2576
  }
2588
2577
  else {
2589
- emotion = emotion || 'neutral';
2578
+ emotion = 'neutral';
2579
+ if (context_texts.length > 0) {
2580
+ const prompt = `根据用户输入语音内容和上下文判断语音的情感,选择以下情感**之一**:
2581
+
2582
+ "happy", "sad", "angry", "fearful", "disgusted", "surprised", "calm", "fluent", "whisper"
2583
+
2584
+ ## 要求
2585
+ 输出 JSON 格式,包含一个 emotion 字段,值为以上情感之一。
2586
+ `;
2587
+ const schema = {
2588
+ name: 'emotion_schema',
2589
+ schema: {
2590
+ type: 'object',
2591
+ properties: {
2592
+ emotion: {
2593
+ type: 'string',
2594
+ enum: [
2595
+ 'neutral',
2596
+ 'happy',
2597
+ 'sad',
2598
+ 'angry',
2599
+ 'fearful',
2600
+ 'disgusted',
2601
+ 'surprised',
2602
+ 'calm',
2603
+ 'fluent',
2604
+ 'whisper',
2605
+ ],
2606
+ description: '用户输入语音的情感',
2607
+ },
2608
+ },
2609
+ required: ['emotion'],
2610
+ },
2611
+ };
2612
+ const payload = {
2613
+ model: 'Doubao-Seed-1.6-flash',
2614
+ messages: [
2615
+ {
2616
+ role: 'system',
2617
+ content: prompt,
2618
+ },
2619
+ {
2620
+ role: 'user',
2621
+ content: `## 语音内容:
2622
+ ${text.trim()}
2623
+
2624
+ ## 语音上下文
2625
+ ${context_texts.join('\n')}
2626
+ `,
2627
+ },
2628
+ ],
2629
+ response_format: {
2630
+ type: 'json_schema',
2631
+ json_schema: schema,
2632
+ },
2633
+ };
2634
+ const completion = await ai.getCompletions(payload);
2635
+ const emotionObj = JSON.parse(completion.choices[0]?.message?.content ?? '{}');
2636
+ emotion = emotionObj.emotion ?? 'neutral';
2637
+ }
2590
2638
  res = await ai.textToSpeech({
2591
2639
  text: text.trim(),
2592
2640
  voiceName: voiceID,
@@ -2627,6 +2675,8 @@ server.registerTool('generate-scene-tts', {
2627
2675
  uri,
2628
2676
  durationMs: Math.floor((duration || 0) * 1000),
2629
2677
  text,
2678
+ emotion,
2679
+ context_texts,
2630
2680
  voiceName: voiceID,
2631
2681
  speed: finalSpeed,
2632
2682
  timestamp: new Date().toISOString(),
@@ -2891,91 +2941,49 @@ server.registerTool('get-schema', {
2891
2941
  });
2892
2942
  server.registerTool('search-voices', {
2893
2943
  title: 'Search Voices',
2894
- description: 'Search voices from doubao_voices_full based on scenes, emotions, languages, and gender.',
2944
+ description: 'Search voices from doubao_voices_full based on languages, and gender. 搜索并选择符合要求的语音,在合适的情况下,优先采用 volcano_tts_2 类型的语音',
2895
2945
  inputSchema: {
2896
- scenes: zod_1.z
2897
- .array(zod_1.z.enum([
2898
- 'asmr',
2899
- 'audiobook',
2900
- 'customer_service',
2901
- 'dialect_fun',
2902
- 'dialogue',
2903
- 'kids_content',
2904
- 'news_explainer',
2905
- 'podcast_voiceover',
2906
- 'product_ad',
2907
- 'promo_trailer',
2908
- 'roleplay_drama',
2909
- 'story_narration',
2910
- 'storytelling',
2911
- 'tutorial',
2912
- ]))
2913
- .optional()
2914
- .describe('Filter by scenes (e.g., ["product_ad", "tutorial"]). If not provided, no scene filtering is applied.'),
2915
- emotions: zod_1.z
2946
+ languages: zod_1.z
2916
2947
  .array(zod_1.z.enum([
2917
- 'ASMR',
2918
- 'affectionate',
2919
- 'angry',
2920
- 'authoritative',
2921
- 'chat',
2922
- 'coldness',
2923
- 'depressed',
2924
- 'excited',
2925
- 'fear',
2926
- 'happy',
2927
- 'hate',
2928
- 'neutral',
2929
- 'sad',
2930
- 'surprised',
2931
- 'warm',
2948
+ 'zh',
2949
+ 'en',
2950
+ 'ja',
2951
+ 'ko',
2952
+ 'es',
2953
+ 'pt',
2954
+ 'nl',
2955
+ 'vi',
2956
+ 'ru',
2957
+ 'id',
2958
+ 'de',
2959
+ 'fr',
2960
+ 'it',
2961
+ 'ar',
2962
+ 'tr',
2963
+ 'uk',
2932
2964
  ]))
2933
2965
  .optional()
2934
- .describe('Filter by emotions (e.g., ["happy", "neutral"]). If not provided, no emotion filtering is applied.'),
2935
- languages: zod_1.z
2936
- .array(zod_1.z.string())
2937
- .optional()
2938
- .describe('Filter by languages (e.g., ["zh", "en-US"]). If not provided, no language filtering is applied.'),
2966
+ .describe('Filter by languages (e.g., ["zh", "en"]). If not provided, no language filtering is applied.'),
2939
2967
  gender: zod_1.z
2940
2968
  .enum(['male', 'female'])
2941
2969
  .optional()
2942
2970
  .describe('Filter by gender (male or female). If not provided, no gender filtering is applied.'),
2943
2971
  },
2944
- }, async ({ scenes, emotions, languages, gender }) => {
2972
+ }, async ({ languages, gender }) => {
2945
2973
  try {
2946
- let filteredVoices = [...doubao_voices_full_1.doubaoVoicesFull];
2947
- // Filter by scenes
2948
- if (scenes && scenes.length > 0) {
2949
- filteredVoices = filteredVoices.filter(voice => voice.scenes &&
2950
- voice.scenes.some(scene => scenes.includes(scene)));
2951
- }
2952
- // Filter by emotions
2953
- if (emotions && emotions.length > 0) {
2954
- filteredVoices = filteredVoices.filter(voice => {
2955
- // If emotions includes 'neutral', also include voices without emotions field
2956
- if (emotions.includes('neutral') && !voice.emotions) {
2957
- return true;
2958
- }
2959
- return (voice.emotions &&
2960
- voice.emotions.some(emotion => emotions.includes(emotion)));
2961
- });
2962
- }
2974
+ // 验证session状态
2975
+ const currentSession = await validateSession('search-voices');
2976
+ const ai = currentSession.ai;
2977
+ let filteredVoices = await ai.listVoices();
2963
2978
  // Filter by languages
2964
2979
  if (languages && languages.length > 0) {
2965
2980
  filteredVoices = filteredVoices.filter(voice => voice.languages &&
2966
- voice.languages.some(lang => languages.includes(lang)));
2981
+ voice.languages.some((lang) => languages.includes(lang)));
2967
2982
  }
2968
2983
  // Filter by gender
2969
2984
  if (gender) {
2970
2985
  filteredVoices = filteredVoices.filter(voice => {
2971
- const voiceId = voice.voiceID.toLowerCase();
2972
- if (gender === 'male') {
2973
- return voiceId.includes('_male_');
2974
- }
2975
- else if (gender === 'female') {
2976
- return voiceId.includes('_female_');
2977
- }
2978
- return true;
2986
+ return voice.gender === gender;
2979
2987
  });
2980
2988
  }
2981
2989
  return {
@@ -2988,8 +2996,6 @@ server.registerTool('search-voices', {
2988
2996
  totalCount: filteredVoices.length,
2989
2997
  voices: filteredVoices,
2990
2998
  filters: {
2991
- scenes: scenes || null,
2992
- emotions: emotions || null,
2993
2999
  languages: languages || null,
2994
3000
  gender: gender || null,
2995
3001
  },