cerevox 3.0.0-beta.21 → 3.0.0-beta.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/ai.d.ts +2 -1
- package/dist/core/ai.d.ts.map +1 -1
- package/dist/core/ai.js +201 -2
- package/dist/core/ai.js.map +1 -1
- package/dist/mcp/servers/helper/doubao_voices_full.js +1 -1
- package/dist/mcp/servers/zerocut.d.ts.map +1 -1
- package/dist/mcp/servers/zerocut.js +113 -107
- package/dist/mcp/servers/zerocut.js.map +1 -1
- package/package.json +1 -1
|
@@ -50,7 +50,6 @@ const constants_1 = require("../../utils/constants");
|
|
|
50
50
|
const videokit_1 = require("../../utils/videokit");
|
|
51
51
|
const promises_1 = require("node:fs/promises");
|
|
52
52
|
const node_path_1 = __importStar(require("node:path"));
|
|
53
|
-
const doubao_voices_full_1 = require("./helper/doubao_voices_full");
|
|
54
53
|
const node_fs_1 = require("node:fs");
|
|
55
54
|
const coze_1 = require("../../utils/coze");
|
|
56
55
|
const mp3_duration_1 = __importDefault(require("mp3-duration"));
|
|
@@ -2463,44 +2462,44 @@ server.registerTool('generate-scene-tts', {
|
|
|
2463
2462
|
.optional()
|
|
2464
2463
|
.default(1.0)
|
|
2465
2464
|
.describe('The volume of the tts.'),
|
|
2466
|
-
emotion: zod_1.z
|
|
2467
|
-
.enum([
|
|
2468
|
-
'storytelling',
|
|
2469
|
-
'neutral',
|
|
2470
|
-
'excited',
|
|
2471
|
-
'coldness',
|
|
2472
|
-
'angry',
|
|
2473
|
-
'sad',
|
|
2474
|
-
'happy',
|
|
2475
|
-
'surprised',
|
|
2476
|
-
'fear',
|
|
2477
|
-
'depressed',
|
|
2478
|
-
'lovey-dovey',
|
|
2479
|
-
'shy',
|
|
2480
|
-
'comfort',
|
|
2481
|
-
'tension',
|
|
2482
|
-
'tender',
|
|
2483
|
-
'magnetic',
|
|
2484
|
-
'vocal - fry',
|
|
2485
|
-
'ASMR',
|
|
2486
|
-
])
|
|
2487
|
-
.optional(),
|
|
2488
2465
|
voiceID: zod_1.z
|
|
2489
2466
|
.string()
|
|
2490
2467
|
.describe(`适合作为视频配音的音色ID,除非用户指定,否则你必须已通过 search_voice 工具检查确定该音色确实是存在的。`),
|
|
2468
|
+
context_texts: zod_1.z
|
|
2469
|
+
.array(zod_1.z.string())
|
|
2470
|
+
.default([])
|
|
2471
|
+
.describe(`语音合成的辅助信息,用于模型对话式合成,能更好的体现语音情感
|
|
2472
|
+
|
|
2473
|
+
可以探索,比如常见示例有以下几种:
|
|
2474
|
+
|
|
2475
|
+
1. 语速调整
|
|
2476
|
+
- context_texts: ["你可以说慢一点吗?"]
|
|
2477
|
+
2. 情绪/语气调整
|
|
2478
|
+
- context_texts=["你可以用特别特别痛心的语气说话吗?"]
|
|
2479
|
+
- context_texts=["嗯,你的语气再欢乐一点"]
|
|
2480
|
+
3. 音量调整
|
|
2481
|
+
- context_texts=["你嗓门再小点。"]
|
|
2482
|
+
4. 音感调整
|
|
2483
|
+
- context_texts=["你能用骄傲的语气来说话吗?"]
|
|
2484
|
+
`),
|
|
2491
2485
|
explicit_language: zod_1.z.enum(['zh', 'en', 'ja']).optional().default('zh'),
|
|
2492
2486
|
},
|
|
2493
|
-
}, async ({ text, sceneIndex, storyBoardFile, skipConsistencyCheck, voiceID, saveToFileName, speed, pitch, volume,
|
|
2487
|
+
}, async ({ text, sceneIndex, storyBoardFile, skipConsistencyCheck, voiceID, saveToFileName, speed, pitch, volume, context_texts, explicit_language, }) => {
|
|
2494
2488
|
try {
|
|
2495
2489
|
// 验证session状态
|
|
2496
2490
|
const currentSession = await validateSession('generate-scene-tts');
|
|
2497
2491
|
const validatedFileName = validateFileName(saveToFileName);
|
|
2498
2492
|
const finalSpeed = speed ?? 1;
|
|
2499
2493
|
volume = volume ?? 1;
|
|
2494
|
+
const ai = currentSession.ai;
|
|
2500
2495
|
let scene = null;
|
|
2501
2496
|
// 校验 text 与 storyboard.json 中场景设定的一致性
|
|
2502
2497
|
if (sceneIndex && !skipConsistencyCheck) {
|
|
2503
2498
|
try {
|
|
2499
|
+
const voice = (await ai.listVoices()).find(v => v.id === voiceID);
|
|
2500
|
+
if (!voice) {
|
|
2501
|
+
return createErrorResponse(`Voice ${voiceID} not found in voice-list. Use search-voices tool to find available voices. 若用户坚持要使用该音色,需跳过一致性检查。`, 'generate-scene-tts');
|
|
2502
|
+
}
|
|
2504
2503
|
const storyBoardPath = (0, node_path_1.resolve)(process.env.ZEROCUT_PROJECT_CWD || process.cwd(), projectLocalDir, storyBoardFile);
|
|
2505
2504
|
if ((0, node_fs_1.existsSync)(storyBoardPath)) {
|
|
2506
2505
|
const storyBoardContent = await (0, promises_1.readFile)(storyBoardPath, 'utf8');
|
|
@@ -2551,7 +2550,6 @@ server.registerTool('generate-scene-tts', {
|
|
|
2551
2550
|
}
|
|
2552
2551
|
}
|
|
2553
2552
|
console.log(`Generating TTS with voice: ${voiceID}, speed: ${finalSpeed}, text: ${text.substring(0, 100)}...`);
|
|
2554
|
-
const ai = currentSession.ai;
|
|
2555
2553
|
if (voiceID.startsWith('BV0')) {
|
|
2556
2554
|
throw new Error(`BV0* 系列音色已弃用,你必须已通过 search_voice 工具检查确定该音色确实是存在的。`);
|
|
2557
2555
|
}
|
|
@@ -2563,30 +2561,80 @@ server.registerTool('generate-scene-tts', {
|
|
|
2563
2561
|
? 'volcano'
|
|
2564
2562
|
: 'minimax';
|
|
2565
2563
|
let res;
|
|
2564
|
+
let emotion = 'auto';
|
|
2566
2565
|
if (type === 'volcano') {
|
|
2567
|
-
const voice = doubao_voices_full_1.doubaoVoicesFull.find(v => v.voiceID === voiceID);
|
|
2568
|
-
if (!voice) {
|
|
2569
|
-
return createErrorResponse(`Voice ${voiceID} not found in Doubao voices. Use search-voices tool to find available voices.`, 'generate-scene-tts');
|
|
2570
|
-
}
|
|
2571
|
-
const emotions = voice.emotions || [];
|
|
2572
|
-
if (emotion && !emotions.includes(emotion)) {
|
|
2573
|
-
emotion = 'neutral';
|
|
2574
|
-
}
|
|
2575
|
-
// 修复可能的 emotion 错误情况
|
|
2576
|
-
emotion = emotion || 'neutral';
|
|
2577
2566
|
volume = Math.max(Math.min(volume, 2.0), 0.5);
|
|
2578
2567
|
res = await ai.textToSpeechVolc({
|
|
2579
2568
|
text: text.trim(),
|
|
2580
2569
|
speaker: voiceID,
|
|
2581
2570
|
speed: Math.floor(100 * (finalSpeed - 1)),
|
|
2582
2571
|
volume: Math.floor(100 * (volume - 1)),
|
|
2583
|
-
|
|
2572
|
+
context_texts,
|
|
2584
2573
|
explicit_language,
|
|
2585
2574
|
voice_to_caption: explicit_language === 'zh' || explicit_language === 'en',
|
|
2586
2575
|
});
|
|
2587
2576
|
}
|
|
2588
2577
|
else {
|
|
2589
|
-
emotion =
|
|
2578
|
+
emotion = 'neutral';
|
|
2579
|
+
if (context_texts.length > 0) {
|
|
2580
|
+
const prompt = `根据用户输入语音内容和上下文判断语音的情感,选择以下情感**之一**:
|
|
2581
|
+
|
|
2582
|
+
"happy", "sad", "angry", "fearful", "disgusted", "surprised", "calm", "fluent", "whisper"
|
|
2583
|
+
|
|
2584
|
+
## 要求
|
|
2585
|
+
输出 JSON 格式,包含一个 emotion 字段,值为以上情感之一。
|
|
2586
|
+
`;
|
|
2587
|
+
const schema = {
|
|
2588
|
+
name: 'emotion_schema',
|
|
2589
|
+
schema: {
|
|
2590
|
+
type: 'object',
|
|
2591
|
+
properties: {
|
|
2592
|
+
emotion: {
|
|
2593
|
+
type: 'string',
|
|
2594
|
+
enum: [
|
|
2595
|
+
'neutral',
|
|
2596
|
+
'happy',
|
|
2597
|
+
'sad',
|
|
2598
|
+
'angry',
|
|
2599
|
+
'fearful',
|
|
2600
|
+
'disgusted',
|
|
2601
|
+
'surprised',
|
|
2602
|
+
'calm',
|
|
2603
|
+
'fluent',
|
|
2604
|
+
'whisper',
|
|
2605
|
+
],
|
|
2606
|
+
description: '用户输入语音的情感',
|
|
2607
|
+
},
|
|
2608
|
+
},
|
|
2609
|
+
required: ['emotion'],
|
|
2610
|
+
},
|
|
2611
|
+
};
|
|
2612
|
+
const payload = {
|
|
2613
|
+
model: 'Doubao-Seed-1.6-flash',
|
|
2614
|
+
messages: [
|
|
2615
|
+
{
|
|
2616
|
+
role: 'system',
|
|
2617
|
+
content: prompt,
|
|
2618
|
+
},
|
|
2619
|
+
{
|
|
2620
|
+
role: 'user',
|
|
2621
|
+
content: `## 语音内容:
|
|
2622
|
+
${text.trim()}
|
|
2623
|
+
|
|
2624
|
+
## 语音上下文
|
|
2625
|
+
${context_texts.join('\n')}
|
|
2626
|
+
`,
|
|
2627
|
+
},
|
|
2628
|
+
],
|
|
2629
|
+
response_format: {
|
|
2630
|
+
type: 'json_schema',
|
|
2631
|
+
json_schema: schema,
|
|
2632
|
+
},
|
|
2633
|
+
};
|
|
2634
|
+
const completion = await ai.getCompletions(payload);
|
|
2635
|
+
const emotionObj = JSON.parse(completion.choices[0]?.message?.content ?? '{}');
|
|
2636
|
+
emotion = emotionObj.emotion ?? 'neutral';
|
|
2637
|
+
}
|
|
2590
2638
|
res = await ai.textToSpeech({
|
|
2591
2639
|
text: text.trim(),
|
|
2592
2640
|
voiceName: voiceID,
|
|
@@ -2627,6 +2675,8 @@ server.registerTool('generate-scene-tts', {
|
|
|
2627
2675
|
uri,
|
|
2628
2676
|
durationMs: Math.floor((duration || 0) * 1000),
|
|
2629
2677
|
text,
|
|
2678
|
+
emotion,
|
|
2679
|
+
context_texts,
|
|
2630
2680
|
voiceName: voiceID,
|
|
2631
2681
|
speed: finalSpeed,
|
|
2632
2682
|
timestamp: new Date().toISOString(),
|
|
@@ -2891,91 +2941,49 @@ server.registerTool('get-schema', {
|
|
|
2891
2941
|
});
|
|
2892
2942
|
server.registerTool('search-voices', {
|
|
2893
2943
|
title: 'Search Voices',
|
|
2894
|
-
description: 'Search voices from doubao_voices_full based on
|
|
2944
|
+
description: 'Search voices from doubao_voices_full based on languages, and gender. 搜索并选择符合要求的语音,在合适的情况下,优先采用 volcano_tts_2 类型的语音',
|
|
2895
2945
|
inputSchema: {
|
|
2896
|
-
|
|
2897
|
-
.array(zod_1.z.enum([
|
|
2898
|
-
'asmr',
|
|
2899
|
-
'audiobook',
|
|
2900
|
-
'customer_service',
|
|
2901
|
-
'dialect_fun',
|
|
2902
|
-
'dialogue',
|
|
2903
|
-
'kids_content',
|
|
2904
|
-
'news_explainer',
|
|
2905
|
-
'podcast_voiceover',
|
|
2906
|
-
'product_ad',
|
|
2907
|
-
'promo_trailer',
|
|
2908
|
-
'roleplay_drama',
|
|
2909
|
-
'story_narration',
|
|
2910
|
-
'storytelling',
|
|
2911
|
-
'tutorial',
|
|
2912
|
-
]))
|
|
2913
|
-
.optional()
|
|
2914
|
-
.describe('Filter by scenes (e.g., ["product_ad", "tutorial"]). If not provided, no scene filtering is applied.'),
|
|
2915
|
-
emotions: zod_1.z
|
|
2946
|
+
languages: zod_1.z
|
|
2916
2947
|
.array(zod_1.z.enum([
|
|
2917
|
-
'
|
|
2918
|
-
'
|
|
2919
|
-
'
|
|
2920
|
-
'
|
|
2921
|
-
'
|
|
2922
|
-
'
|
|
2923
|
-
'
|
|
2924
|
-
'
|
|
2925
|
-
'
|
|
2926
|
-
'
|
|
2927
|
-
'
|
|
2928
|
-
'
|
|
2929
|
-
'
|
|
2930
|
-
'
|
|
2931
|
-
'
|
|
2948
|
+
'zh',
|
|
2949
|
+
'en',
|
|
2950
|
+
'ja',
|
|
2951
|
+
'ko',
|
|
2952
|
+
'es',
|
|
2953
|
+
'pt',
|
|
2954
|
+
'nl',
|
|
2955
|
+
'vi',
|
|
2956
|
+
'ru',
|
|
2957
|
+
'id',
|
|
2958
|
+
'de',
|
|
2959
|
+
'fr',
|
|
2960
|
+
'it',
|
|
2961
|
+
'ar',
|
|
2962
|
+
'tr',
|
|
2963
|
+
'uk',
|
|
2932
2964
|
]))
|
|
2933
2965
|
.optional()
|
|
2934
|
-
.describe('Filter by
|
|
2935
|
-
languages: zod_1.z
|
|
2936
|
-
.array(zod_1.z.string())
|
|
2937
|
-
.optional()
|
|
2938
|
-
.describe('Filter by languages (e.g., ["zh", "en-US"]). If not provided, no language filtering is applied.'),
|
|
2966
|
+
.describe('Filter by languages (e.g., ["zh", "en"]). If not provided, no language filtering is applied.'),
|
|
2939
2967
|
gender: zod_1.z
|
|
2940
2968
|
.enum(['male', 'female'])
|
|
2941
2969
|
.optional()
|
|
2942
2970
|
.describe('Filter by gender (male or female). If not provided, no gender filtering is applied.'),
|
|
2943
2971
|
},
|
|
2944
|
-
}, async ({
|
|
2972
|
+
}, async ({ languages, gender }) => {
|
|
2945
2973
|
try {
|
|
2946
|
-
|
|
2947
|
-
|
|
2948
|
-
|
|
2949
|
-
|
|
2950
|
-
voice.scenes.some(scene => scenes.includes(scene)));
|
|
2951
|
-
}
|
|
2952
|
-
// Filter by emotions
|
|
2953
|
-
if (emotions && emotions.length > 0) {
|
|
2954
|
-
filteredVoices = filteredVoices.filter(voice => {
|
|
2955
|
-
// If emotions includes 'neutral', also include voices without emotions field
|
|
2956
|
-
if (emotions.includes('neutral') && !voice.emotions) {
|
|
2957
|
-
return true;
|
|
2958
|
-
}
|
|
2959
|
-
return (voice.emotions &&
|
|
2960
|
-
voice.emotions.some(emotion => emotions.includes(emotion)));
|
|
2961
|
-
});
|
|
2962
|
-
}
|
|
2974
|
+
// 验证session状态
|
|
2975
|
+
const currentSession = await validateSession('search-voices');
|
|
2976
|
+
const ai = currentSession.ai;
|
|
2977
|
+
let filteredVoices = await ai.listVoices();
|
|
2963
2978
|
// Filter by languages
|
|
2964
2979
|
if (languages && languages.length > 0) {
|
|
2965
2980
|
filteredVoices = filteredVoices.filter(voice => voice.languages &&
|
|
2966
|
-
voice.languages.some(lang => languages.includes(lang)));
|
|
2981
|
+
voice.languages.some((lang) => languages.includes(lang)));
|
|
2967
2982
|
}
|
|
2968
2983
|
// Filter by gender
|
|
2969
2984
|
if (gender) {
|
|
2970
2985
|
filteredVoices = filteredVoices.filter(voice => {
|
|
2971
|
-
|
|
2972
|
-
if (gender === 'male') {
|
|
2973
|
-
return voiceId.includes('_male_');
|
|
2974
|
-
}
|
|
2975
|
-
else if (gender === 'female') {
|
|
2976
|
-
return voiceId.includes('_female_');
|
|
2977
|
-
}
|
|
2978
|
-
return true;
|
|
2986
|
+
return voice.gender === gender;
|
|
2979
2987
|
});
|
|
2980
2988
|
}
|
|
2981
2989
|
return {
|
|
@@ -2988,8 +2996,6 @@ server.registerTool('search-voices', {
|
|
|
2988
2996
|
totalCount: filteredVoices.length,
|
|
2989
2997
|
voices: filteredVoices,
|
|
2990
2998
|
filters: {
|
|
2991
|
-
scenes: scenes || null,
|
|
2992
|
-
emotions: emotions || null,
|
|
2993
2999
|
languages: languages || null,
|
|
2994
3000
|
gender: gender || null,
|
|
2995
3001
|
},
|