@sage-rsc/talking-head-react 1.0.40 → 1.0.41
- package/dist/index.cjs +2 -2
- package/dist/index.js +451 -328
- package/package.json +1 -1
- package/src/components/TalkingHeadAvatar.jsx +12 -2
- package/src/config/ttsConfig.js +41 -10
- package/src/lib/talkinghead.mjs +156 -0
package/package.json
CHANGED

(1 line changed: the version field, "1.0.40" → "1.0.41")

package/src/components/TalkingHeadAvatar.jsx
CHANGED
@@ -1,6 +1,6 @@
 import React, { useEffect, useRef, useState, useCallback, forwardRef, useImperativeHandle } from 'react';
 import { TalkingHead } from '../lib/talkinghead.mjs';
-import { getActiveTTSConfig, ELEVENLABS_CONFIG } from '../config/ttsConfig';
+import { getActiveTTSConfig, ELEVENLABS_CONFIG, DEEPGRAM_CONFIG } from '../config/ttsConfig';

 /**
  * TalkingHeadAvatar - A reusable React component for 3D talking avatars
@@ -13,7 +13,7 @@ import { getActiveTTSConfig, ELEVENLABS_CONFIG } from '../config/ttsConfig';
  * @param {string} props.avatarBody - Avatar body type ('M' or 'F')
  * @param {string} props.mood - Initial mood ('happy', 'sad', 'neutral', etc.)
  * @param {string} props.ttsLang - Text-to-speech language code
- * @param {string} props.ttsService - TTS service ('edge', 'elevenlabs', 'google', 'azure', 'browser')
+ * @param {string} props.ttsService - TTS service ('edge', 'elevenlabs', 'deepgram', 'google', 'azure', 'browser')
  * @param {string} props.ttsVoice - TTS voice ID
  * @param {string} props.ttsApiKey - TTS API key (overrides config for ElevenLabs, Google Cloud, Azure)
  * @param {string} props.bodyMovement - Initial body movement type
@@ -79,6 +79,16 @@ const TalkingHeadAvatar = forwardRef(({
       defaultVoice: ttsVoice || ttsConfig.defaultVoice || ELEVENLABS_CONFIG.defaultVoice,
       voices: ttsConfig.voices || ELEVENLABS_CONFIG.voices
     };
+  } else if (effectiveTtsService === 'deepgram') {
+    // Explicitly set up Deepgram configuration
+    const apiKey = ttsApiKey || ttsConfig.apiKey;
+    effectiveTtsConfig = {
+      service: 'deepgram',
+      endpoint: 'https://api.deepgram.com/v1/speak',
+      apiKey: apiKey,
+      defaultVoice: ttsVoice || ttsConfig.defaultVoice || DEEPGRAM_CONFIG.defaultVoice,
+      voices: ttsConfig.voices || DEEPGRAM_CONFIG.voices
+    };
   } else {
     // For other services, use config with prop overrides
     effectiveTtsConfig = {
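With the new branch in place, pointing the component at Deepgram is a matter of passing the props documented in the JSDoc above. A minimal sketch; the prop names and voice ID come from this diff, while the named export from the package root is an assumption not shown here:

import React from 'react';
// Assumption: TalkingHeadAvatar is exposed as a named export of the root entry.
import { TalkingHeadAvatar } from '@sage-rsc/talking-head-react';

export function DeepgramAvatarDemo() {
  return (
    <TalkingHeadAvatar
      ttsService="deepgram"              // routes through the new 'deepgram' branch above
      ttsApiKey="YOUR_DEEPGRAM_API_KEY"  // placeholder; overrides the config's apiKey
      ttsVoice="aura-2-thalia-en"        // default Aura-2 voice (see ttsConfig.js below)
    />
  );
}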
package/src/config/ttsConfig.js
CHANGED

@@ -8,15 +8,16 @@
  *
  * 💰 PREMIUM OPTIONS (if you want even better quality):
  * 1. ElevenLabs - Most human-like voices (10K chars/month free)
- * 2.
- * 3.
+ * 2. Deepgram Aura-2 - Enterprise-grade, low-latency TTS (40% lower cost than ElevenLabs)
+ * 3. Google Cloud TTS - 1M characters/month free
+ * 4. Azure Cognitive Services - 500K characters/month free
  *
  * 🔧 TO SWITCH SERVICES:
  * 1. Set the service you want to use: enabled: true
  * 2. Set others to: enabled: false
  * 3. For paid services, add your API key
  *
- * Priority order: ElevenLabs > Edge TTS (FREE) > Google Cloud > Azure > Browser
+ * Priority order: ElevenLabs > Deepgram > Edge TTS (FREE) > Google Cloud > Azure > Browser
  */

 // ElevenLabs Configuration (Recommended - Most human-like voices)
@@ -37,6 +38,25 @@ export const ELEVENLABS_CONFIG = {
   }
 };

+// Deepgram Aura-2 Configuration (Enterprise-grade, low-latency TTS)
+// Sign up at: https://deepgram.com
+// Competitive pricing, ~40% lower cost than ElevenLabs Flash
+export const DEEPGRAM_CONFIG = {
+  enabled: false, // Set to true to use Deepgram
+  apiKey: "YOUR_DEEPGRAM_API_KEY", // Replace with your actual API key
+  endpoint: "https://api.deepgram.com/v1/speak",
+  defaultVoice: "aura-2-thalia-en", // Thalia (Female, English)
+  voices: {
+    thalia: "aura-2-thalia-en",   // Female, English - Natural and clear
+    asteria: "aura-2-asteria-en", // Female, English - Warm and friendly
+    orion: "aura-2-orion-en",     // Male, English - Professional
+    stella: "aura-2-stella-en",   // Female, English - Energetic
+    athena: "aura-2-athena-en",   // Female, English - Authoritative
+    hera: "aura-2-hera-en",       // Female, English - Calm
+    zeus: "aura-2-zeus-en"        // Male, English - Powerful
+  }
+};
+
 // Azure Cognitive Services Configuration
 // Free tier: 500,000 characters/month
 // Sign up at: https://azure.microsoft.com
@@ -149,7 +169,7 @@ export const BROWSER_CONFIG = {

 /**
  * Get the active TTS configuration
- * Priority: ElevenLabs > Edge TTS (FREE) > Google Cloud > Azure Cognitive > Browser
+ * Priority: ElevenLabs > Deepgram > Edge TTS (FREE) > Google Cloud > Azure Cognitive > Browser
  */
 export function getActiveTTSConfig() {
   // 1. ElevenLabs (Premium - Most human-like)
@@ -163,7 +183,18 @@ export function getActiveTTSConfig() {
     };
   }

-  // 2.
+  // 2. Deepgram Aura-2 (Enterprise-grade, low-latency)
+  if (DEEPGRAM_CONFIG.enabled && DEEPGRAM_CONFIG.apiKey && DEEPGRAM_CONFIG.apiKey !== "YOUR_DEEPGRAM_API_KEY") {
+    return {
+      service: "deepgram",
+      endpoint: DEEPGRAM_CONFIG.endpoint,
+      apiKey: DEEPGRAM_CONFIG.apiKey,
+      defaultVoice: DEEPGRAM_CONFIG.defaultVoice,
+      voices: DEEPGRAM_CONFIG.voices
+    };
+  }
+
+  // 3. Microsoft Edge TTS (FREE - Recommended)
   if (EDGE_CONFIG.enabled) {
     return {
       service: "edge",
@@ -174,7 +205,7 @@ export function getActiveTTSConfig() {
     };
   }

-  //
+  // 4. Google Cloud TTS (FREE tier: 1M characters/month)
   if (GOOGLE_CLOUD_CONFIG.enabled && GOOGLE_CLOUD_CONFIG.apiKey !== "YOUR_GOOGLE_CLOUD_API_KEY") {
     return {
       service: "google",
@@ -185,7 +216,7 @@ export function getActiveTTSConfig() {
     };
   }

-  //
+  // 5. Azure Cognitive Services (FREE tier: 500K characters/month)
   if (AZURE_COGNITIVE_CONFIG.enabled && AZURE_COGNITIVE_CONFIG.apiKey !== "YOUR_AZURE_COGNITIVE_API_KEY") {
     return {
       service: "azure",
@@ -196,7 +227,7 @@ export function getActiveTTSConfig() {
     };
   }

-  //
+  // 6. Legacy Azure Config (for backward compatibility)
   if (AZURE_CONFIG.enabled && AZURE_CONFIG.apiKey !== "YOUR_AZURE_API_KEY") {
     return {
       service: "azure",
@@ -207,7 +238,7 @@ export function getActiveTTSConfig() {
     };
   }

-  //
+  // 7. Legacy Google Config (for backward compatibility)
   if (GOOGLE_CONFIG.enabled && GOOGLE_CONFIG.apiKey !== "YOUR_GOOGLE_API_KEY") {
     return {
       service: "google",
@@ -218,7 +249,7 @@ export function getActiveTTSConfig() {
     };
   }

-  //
+  // 8. Fallback to browser TTS
   return {
     service: "browser",
     endpoint: "",
package/src/lib/talkinghead.mjs
CHANGED

@@ -3990,6 +3990,159 @@ class TalkingHead {
     this.playAudio();
   }

+  /**
+   * Synthesize speech using Deepgram Aura-2 TTS
+   * @param {Object} line Speech line object
+   */
+  async synthesizeWithDeepgramTTS(line) {
+    // Get the text from the line
+    const text = line.text.map(x => x.word).join(' ');
+
+    // Deepgram API request
+    const voiceModel = line.voice || this.avatar.ttsVoice || this.opt.ttsVoice || "aura-2-thalia-en"; // Default to Thalia
+
+    // Build URL with model as query parameter
+    const url = `${this.opt.ttsEndpoint}?model=${voiceModel}`;
+
+    const response = await fetch(url, {
+      method: 'POST',
+      headers: {
+        'Authorization': `Token ${this.opt.ttsApikey}`,
+        'Content-Type': 'text/plain',
+        'Accept': 'audio/mpeg'
+      },
+      body: text
+    });
+
+    if (!response.ok) {
+      throw new Error(`Deepgram TTS error: ${response.status} ${response.statusText}`);
+    }
+
+    // Get audio data
+    const audioArrayBuffer = await response.arrayBuffer();
+    const audioBuffer = await this.audioCtx.decodeAudioData(audioArrayBuffer);
+
+    // Use text-based lip-sync with proper error handling
+    console.log('Using text-based lip-sync for Deepgram...');
+    const lipsyncLang = this.avatar.lipsyncLang || this.opt.lipsyncLang || 'en';
+
+    let audioAnalysis;
+    try {
+      console.log('Lip-sync modules available:', {
+        hasLipsync: !!this.lipsync,
+        lipsyncKeys: this.lipsync ? Object.keys(this.lipsync) : [],
+        lipsyncLang: lipsyncLang
+      });
+
+      const processedText = this.lipsyncPreProcessText(text, lipsyncLang);
+      const lipsyncData = this.lipsyncWordsToVisemes(processedText, lipsyncLang);
+
+      console.log('Lip-sync data:', {
+        processedText,
+        lipsyncData,
+        hasVisemes: lipsyncData && lipsyncData.visemes && lipsyncData.visemes.length > 0
+      });
+
+      if (lipsyncData && lipsyncData.visemes && lipsyncData.visemes.length > 0) {
+        // Create audio analysis structure for compatibility
+        audioAnalysis = {
+          visemes: lipsyncData.visemes.map((viseme, i) => ({
+            viseme: viseme,
+            startTime: (i * audioBuffer.duration) / lipsyncData.visemes.length,
+            endTime: ((i + 1) * audioBuffer.duration) / lipsyncData.visemes.length,
+            duration: audioBuffer.duration / lipsyncData.visemes.length,
+            intensity: 0.7
+          })),
+          words: [],
+          duration: audioBuffer.duration,
+          features: { onsets: [], boundaries: [] }
+        };
+      } else {
+        throw new Error('No visemes generated from text');
+      }
+    } catch (error) {
+      console.error('Text-based lip-sync failed, using fallback:', error);
+      // Fallback: create simple visemes from text
+      const words = text.toLowerCase().split(/\s+/);
+      const simpleVisemes = [];
+
+      for (const word of words) {
+        // Simple phonetic mapping
+        for (const char of word) {
+          let viseme = 'aa'; // default
+          if ('aeiou'.includes(char)) viseme = 'aa';
+          else if ('bp'.includes(char)) viseme = 'PP';
+          else if ('fv'.includes(char)) viseme = 'FF';
+          else if ('st'.includes(char)) viseme = 'SS';
+          else if ('dln'.includes(char)) viseme = 'DD';
+          else if ('kg'.includes(char)) viseme = 'kk';
+          else if ('rw'.includes(char)) viseme = 'RR';
+
+          simpleVisemes.push(viseme);
+        }
+      }
+
+      audioAnalysis = {
+        visemes: simpleVisemes.map((viseme, i) => ({
+          viseme: viseme,
+          startTime: (i * audioBuffer.duration) / simpleVisemes.length,
+          endTime: ((i + 1) * audioBuffer.duration) / simpleVisemes.length,
+          duration: audioBuffer.duration / simpleVisemes.length,
+          intensity: 0.6
+        })),
+        words: [],
+        duration: audioBuffer.duration,
+        features: { onsets: [], boundaries: [] }
+      };
+    }
+
+    console.log('Deepgram TTS Audio Analysis:', {
+      text,
+      audioDuration: audioBuffer.duration,
+      visemeCount: audioAnalysis.visemes ? audioAnalysis.visemes.length : 0,
+      wordCount: audioAnalysis.words ? audioAnalysis.words.length : 0,
+      features: {
+        onsets: audioAnalysis.features && audioAnalysis.features.onsets ? audioAnalysis.features.onsets.length : 0,
+        boundaries: audioAnalysis.features && audioAnalysis.features.phonemeBoundaries ? audioAnalysis.features.phonemeBoundaries.length : 0
+      },
+      visemes: audioAnalysis.visemes ? audioAnalysis.visemes.slice(0, 3) : [] // Show first 3 visemes for debugging
+    });
+
+    // Generate precise lip-sync animation from audio analysis
+    const lipsyncAnim = [];
+    if (audioAnalysis.visemes && audioAnalysis.visemes.length > 0) {
+      console.log('Deepgram: Generating lip-sync animation from', audioAnalysis.visemes.length, 'visemes');
+      for (let i = 0; i < audioAnalysis.visemes.length; i++) {
+        const visemeData = audioAnalysis.visemes[i];
+        const time = visemeData.startTime * 1000; // Convert to milliseconds
+        const duration = visemeData.duration * 1000;
+        const intensity = visemeData.intensity;
+
+        lipsyncAnim.push({
+          template: { name: 'viseme' },
+          ts: [time - Math.min(60, 2 * duration / 3), time + Math.min(25, duration / 2), time + duration + Math.min(60, duration / 2)],
+          vs: {
+            ['viseme_' + visemeData.viseme]: [null, intensity, 0]
+          }
+        });
+      }
+      console.log('Deepgram: Generated', lipsyncAnim.length, 'lip-sync animation frames');
+    } else {
+      console.warn('Deepgram: No visemes available for lip-sync animation');
+    }
+
+    // Combine original animation with lip-sync animation
+    const combinedAnim = [...line.anim, ...lipsyncAnim];
+    console.log('Deepgram: Combined animation frames:', combinedAnim.length, '(original:', line.anim.length, '+ lipsync:', lipsyncAnim.length, ')');
+
+    // Add to playlist
+    this.audioPlaylist.push({ anim: combinedAnim, audio: audioBuffer });
+    this.onSubtitles = line.onSubtitles || null;
+    this.resetLips();
+    if (line.mood) this.setMood(line.mood);
+    this.playAudio();
+  }
+
   /**
    * Synthesize speech using Azure TTS
    * @param {Object} line Speech line object
@@ -4235,6 +4388,9 @@ class TalkingHead {
     } else if (this.opt.ttsService === "elevenlabs") {
       // Use ElevenLabs TTS
       await this.synthesizeWithElevenLabsTTS(line);
+    } else if (this.opt.ttsService === "deepgram") {
+      // Use Deepgram Aura-2 TTS
+      await this.synthesizeWithDeepgramTTS(line);
     } else if (this.opt.ttsService === "azure") {
       // Use Azure TTS
       await this.synthesizeWithAzureTTS(line);
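Note that the method spreads visemes evenly across the decoded clip: N visemes over a d-second buffer each get d/N seconds, so 2 s of audio with 20 visemes yields 100 ms per mouth shape. The network call itself is a plain fetch and can be reproduced standalone. A sketch mirroring the request made in synthesizeWithDeepgramTTS; the endpoint, auth scheme, headers, and ?model= parameter are taken from the diff above, while the function name, API key, and text are placeholders:

// Standalone sketch of the request synthesizeWithDeepgramTTS makes.
async function deepgramSpeak(apiKey, text, model = 'aura-2-thalia-en') {
  const response = await fetch(`https://api.deepgram.com/v1/speak?model=${model}`, {
    method: 'POST',
    headers: {
      'Authorization': `Token ${apiKey}`, // Deepgram token auth, as in the diff
      'Content-Type': 'text/plain',
      'Accept': 'audio/mpeg'
    },
    body: text
  });
  if (!response.ok) {
    throw new Error(`Deepgram TTS error: ${response.status} ${response.statusText}`);
  }
  // Decode the MP3 bytes into an AudioBuffer, as the library does before
  // pushing the clip onto its audio playlist.
  const audioCtx = new AudioContext();
  return audioCtx.decodeAudioData(await response.arrayBuffer());
}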