@sage-rsc/talking-head-react 1.0.40 → 1.0.42

This diff shows the content of publicly available package versions as released to their public registries, and is provided for informational purposes only.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@sage-rsc/talking-head-react",
-   "version": "1.0.40",
+   "version": "1.0.42",
    "description": "A reusable React component for 3D talking avatars with lip-sync and text-to-speech",
    "main": "./dist/index.cjs",
    "module": "./dist/index.js",
@@ -279,7 +279,8 @@ const CurriculumLearning = forwardRef(({
      });
    }

-   if (avatarRef.current && firstQuestion) {
+   // Ensure avatar is ready before speaking
+   if (avatarRef.current && avatarRef.current.isReady && firstQuestion) {
      avatarRef.current.setMood("curious");

      // Play custom animation if available
@@ -303,9 +304,16 @@ const CurriculumLearning = forwardRef(({
      } else {
        avatarRef.current.speakText(`Now let me ask you some questions. Here's the first one: ${firstQuestion.question}`, { lipsyncLang: config.lipsyncLang });
      }
-   } else if (avatarRef.current) {
+   } else if (avatarRef.current && avatarRef.current.isReady) {
      const config = defaultAvatarConfigRef.current || { lipsyncLang: 'en' };
      avatarRef.current.speakText("Now let me ask you some questions to test your understanding.", { lipsyncLang: config.lipsyncLang });
+   } else {
+     // Avatar not ready yet, retry after a short delay
+     setTimeout(() => {
+       if (startQuestionsRef.current) {
+         startQuestionsRef.current();
+       }
+     }, 100);
    }
  }, [animations.questionStart, getCurrentLesson, getCurrentQuestion]);

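Taken together, the two hunks above form a guard-and-retry pattern: speak only once the avatar reports isReady, and otherwise re-enter the same callback through its ref after 100 ms. A minimal sketch of the pattern in isolation (avatarRef, startQuestionsRef, and isReady come from the diff; the surrounding wiring is illustrative):

    const startQuestions = useCallback(() => {
      const avatar = avatarRef.current;
      if (avatar && avatar.isReady) {
        avatar.setMood("curious");
        avatar.speakText("Here's the first question...", { lipsyncLang: 'en' });
      } else {
        // Not mounted or still loading: retry through the ref so the
        // latest version of this callback is the one re-invoked.
        setTimeout(() => startQuestionsRef.current?.(), 100);
      }
    }, []);
    startQuestionsRef.current = startQuestions;

Note the retry is unbounded: if the avatar never becomes ready, the timeout re-arms every 100 ms.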
@@ -671,7 +679,7 @@ const CurriculumLearning = forwardRef(({
        if (startTeachingRef.current) {
          startTeachingRef.current();
        }
-     }, 50);
+     }, 10);
    }
  }, [autoStart, getCurrentLesson]);

@@ -1,6 +1,6 @@
  import React, { useEffect, useRef, useState, useCallback, forwardRef, useImperativeHandle } from 'react';
  import { TalkingHead } from '../lib/talkinghead.mjs';
- import { getActiveTTSConfig, ELEVENLABS_CONFIG } from '../config/ttsConfig';
+ import { getActiveTTSConfig, ELEVENLABS_CONFIG, DEEPGRAM_CONFIG } from '../config/ttsConfig';

  /**
   * TalkingHeadAvatar - A reusable React component for 3D talking avatars
@@ -13,7 +13,7 @@ import { getActiveTTSConfig, ELEVENLABS_CONFIG } from '../config/ttsConfig';
   * @param {string} props.avatarBody - Avatar body type ('M' or 'F')
   * @param {string} props.mood - Initial mood ('happy', 'sad', 'neutral', etc.)
   * @param {string} props.ttsLang - Text-to-speech language code
-  * @param {string} props.ttsService - TTS service ('edge', 'elevenlabs', 'google', 'azure', 'browser')
+  * @param {string} props.ttsService - TTS service ('edge', 'elevenlabs', 'deepgram', 'google', 'azure', 'browser')
   * @param {string} props.ttsVoice - TTS voice ID
   * @param {string} props.ttsApiKey - TTS API key (overrides config for ElevenLabs, Google Cloud, Azure)
   * @param {string} props.bodyMovement - Initial body movement type
@@ -79,6 +79,16 @@ const TalkingHeadAvatar = forwardRef(({
79
79
  defaultVoice: ttsVoice || ttsConfig.defaultVoice || ELEVENLABS_CONFIG.defaultVoice,
80
80
  voices: ttsConfig.voices || ELEVENLABS_CONFIG.voices
81
81
  };
82
+ } else if (effectiveTtsService === 'deepgram') {
83
+ // Explicitly set up Deepgram configuration
84
+ const apiKey = ttsApiKey || ttsConfig.apiKey;
85
+ effectiveTtsConfig = {
86
+ service: 'deepgram',
87
+ endpoint: 'https://api.deepgram.com/v1/speak',
88
+ apiKey: apiKey,
89
+ defaultVoice: ttsVoice || ttsConfig.defaultVoice || DEEPGRAM_CONFIG.defaultVoice,
90
+ voices: ttsConfig.voices || DEEPGRAM_CONFIG.voices
91
+ };
82
92
  } else {
83
93
  // For other services, use config with prop overrides
84
94
  effectiveTtsConfig = {
@@ -279,7 +289,7 @@ const TalkingHeadAvatar = forwardRef(({
  // This is because onAudioEnd is only for streaming mode
  let checkInterval = null;
  let checkCount = 0;
- const maxChecks = 600; // 60 seconds max (100ms intervals)
+ const maxChecks = 600; // 30 seconds max (50ms intervals for faster detection)

  const checkSpeechFinished = () => {
    checkCount++;
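One consequence worth noting: with the interval halved to 50 ms in the next hunk, 600 iterations cap the wait at 600 × 50 ms = 30 seconds, not the 60 seconds the original comment promised at 100 ms. If a fixed time budget is the actual requirement, deriving the cap keeps the two numbers from drifting apart; a sketch with illustrative names:

    const POLL_INTERVAL_MS = 50;                                  // must match the setInterval below
    const MAX_WAIT_MS = 60_000;                                   // desired overall budget
    const maxChecks = Math.ceil(MAX_WAIT_MS / POLL_INTERVAL_MS);  // 1200 checks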
@@ -307,21 +317,21 @@ const TalkingHeadAvatar = forwardRef(({
        checkInterval = null;
      }

-     // Small delay to ensure everything is settled
+     // Small delay to ensure everything is settled - reduced for faster transitions
      setTimeout(() => {
        try {
          options.onSpeechEnd();
        } catch (e) {
          console.error('Error in onSpeechEnd callback:', e);
        }
-     }, 100);
+     }, 10);
    }
  };

- // Start checking after a short delay (to allow speech to start)
+ // Start checking after a minimal delay (to allow speech to start)
  setTimeout(() => {
-   checkInterval = setInterval(checkSpeechFinished, 100);
- }, 500);
+   checkInterval = setInterval(checkSpeechFinished, 50);
+ }, 100);
  }

  if (talkingHeadRef.current.lipsync && Object.keys(talkingHeadRef.current.lipsync).length > 0) {
@@ -338,7 +348,7 @@ const TalkingHeadAvatar = forwardRef(({
        }
        talkingHeadRef.current.speakText(textToSpeak, speakOptions);
      }
-   }, 500);
+   }, 100);
    }
  } catch (err) {
    console.error('Error speaking text:', err);
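Across this component the fixed delays shrink roughly tenfold: the pre-speech delay goes from 500 ms to 100 ms, the end-of-speech settle from 100 ms to 10 ms, and detection polling from 100 ms to 50 ms per check. A back-of-envelope latency budget for one utterance (constants are illustrative, taken from the hunks above):

    const startDelay  = 100;  // was 500: wait before calling speakText
    const pollPeriod  =  50;  // was 100: end-of-speech detection granularity
    const settleDelay =  10;  // was 100: pause before onSpeechEnd fires
    // worst-case added latency ≈ startDelay + pollPeriod + settleDelay ≈ 160 ms (was ~700 ms)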
@@ -8,15 +8,16 @@
   *
   * 💰 PREMIUM OPTIONS (if you want even better quality):
   * 1. ElevenLabs - Most human-like voices (10K chars/month free)
-  * 2. Google Cloud TTS - 1M characters/month free
-  * 3. Azure Cognitive Services - 500K characters/month free
+  * 2. Deepgram Aura-2 - Enterprise-grade, low-latency TTS (40% lower cost than ElevenLabs)
+  * 3. Google Cloud TTS - 1M characters/month free
+  * 4. Azure Cognitive Services - 500K characters/month free
   *
   * 🔧 TO SWITCH SERVICES:
   * 1. Set the service you want to use: enabled: true
   * 2. Set others to: enabled: false
   * 3. For paid services, add your API key
   *
-  * Priority order: ElevenLabs > Edge TTS (FREE) > Google Cloud > Azure > Browser
+  * Priority order: ElevenLabs > Deepgram > Edge TTS (FREE) > Google Cloud > Azure > Browser
   */

  // ElevenLabs Configuration (Recommended - Most human-like voices)
@@ -37,6 +38,25 @@ export const ELEVENLABS_CONFIG = {
    }
  };

+ // Deepgram Aura-2 Configuration (Enterprise-grade, low-latency TTS)
+ // Sign up at: https://deepgram.com
+ // Competitive pricing, ~40% lower cost than ElevenLabs Flash
+ export const DEEPGRAM_CONFIG = {
+   enabled: false, // Set to true to use Deepgram
+   apiKey: "YOUR_DEEPGRAM_API_KEY", // Replace with your actual API key
+   endpoint: "https://api.deepgram.com/v1/speak",
+   defaultVoice: "aura-2-thalia-en", // Thalia (Female, English)
+   voices: {
+     thalia: "aura-2-thalia-en", // Female, English - Natural and clear
+     asteria: "aura-2-asteria-en", // Female, English - Warm and friendly
+     orion: "aura-2-orion-en", // Male, English - Professional
+     stella: "aura-2-stella-en", // Female, English - Energetic
+     athena: "aura-2-athena-en", // Female, English - Authoritative
+     hera: "aura-2-hera-en", // Female, English - Calm
+     zeus: "aura-2-zeus-en" // Male, English - Powerful
+   }
+ };
+
  // Azure Cognitive Services Configuration
  // Free tier: 500,000 characters/month
  // Sign up at: https://azure.microsoft.com
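For reference, the request this DEEPGRAM_CONFIG block describes is a plain POST of UTF-8 text with the model name passed as a query parameter, exactly as the synthesizeWithDeepgramTTS method added further down in this diff issues it. A standalone sketch (the key is the placeholder from the config; error handling trimmed):

    async function deepgramSpeak(text, model = DEEPGRAM_CONFIG.defaultVoice) {
      const res = await fetch(`${DEEPGRAM_CONFIG.endpoint}?model=${model}`, {
        method: 'POST',
        headers: {
          'Authorization': `Token ${DEEPGRAM_CONFIG.apiKey}`, // Deepgram uses the "Token <key>" scheme
          'Content-Type': 'text/plain',
          'Accept': 'audio/mpeg'
        },
        body: text
      });
      if (!res.ok) throw new Error(`Deepgram TTS error: ${res.status}`);
      return res.arrayBuffer(); // MP3 bytes, ready for AudioContext.decodeAudioData()
    }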
@@ -149,7 +169,7 @@ export const BROWSER_CONFIG = {

  /**
   * Get the active TTS configuration
-  * Priority: ElevenLabs > Edge TTS (FREE) > Google Cloud > Azure Cognitive > Browser
+  * Priority: ElevenLabs > Deepgram > Edge TTS (FREE) > Google Cloud > Azure Cognitive > Browser
   */
  export function getActiveTTSConfig() {
    // 1. ElevenLabs (Premium - Most human-like)
@@ -163,7 +183,18 @@ export function getActiveTTSConfig() {
      };
    }

-   // 2. Microsoft Edge TTS (FREE - Recommended)
+   // 2. Deepgram Aura-2 (Enterprise-grade, low-latency)
+   if (DEEPGRAM_CONFIG.enabled && DEEPGRAM_CONFIG.apiKey && DEEPGRAM_CONFIG.apiKey !== "YOUR_DEEPGRAM_API_KEY") {
+     return {
+       service: "deepgram",
+       endpoint: DEEPGRAM_CONFIG.endpoint,
+       apiKey: DEEPGRAM_CONFIG.apiKey,
+       defaultVoice: DEEPGRAM_CONFIG.defaultVoice,
+       voices: DEEPGRAM_CONFIG.voices
+     };
+   }
+
+   // 3. Microsoft Edge TTS (FREE - Recommended)
    if (EDGE_CONFIG.enabled) {
      return {
        service: "edge",
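Once DEEPGRAM_CONFIG.enabled is true and a real key replaces the placeholder, the cascade short-circuits at the new step 2; a sketch of the resolved value:

    const tts = getActiveTTSConfig();
    // → {
    //     service: "deepgram",
    //     endpoint: "https://api.deepgram.com/v1/speak",
    //     apiKey: "<your key>",
    //     defaultVoice: "aura-2-thalia-en",
    //     voices: { thalia: "aura-2-thalia-en", /* ... */ }
    //   }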
@@ -174,7 +205,7 @@ export function getActiveTTSConfig() {
      };
    }

-   // 3. Google Cloud TTS (FREE tier: 1M characters/month)
+   // 4. Google Cloud TTS (FREE tier: 1M characters/month)
    if (GOOGLE_CLOUD_CONFIG.enabled && GOOGLE_CLOUD_CONFIG.apiKey !== "YOUR_GOOGLE_CLOUD_API_KEY") {
      return {
        service: "google",
@@ -185,7 +216,7 @@ export function getActiveTTSConfig() {
      };
    }

-   // 4. Azure Cognitive Services (FREE tier: 500K characters/month)
+   // 5. Azure Cognitive Services (FREE tier: 500K characters/month)
    if (AZURE_COGNITIVE_CONFIG.enabled && AZURE_COGNITIVE_CONFIG.apiKey !== "YOUR_AZURE_COGNITIVE_API_KEY") {
      return {
        service: "azure",
@@ -196,7 +227,7 @@ export function getActiveTTSConfig() {
      };
    }

-   // 5. Legacy Azure Config (for backward compatibility)
+   // 6. Legacy Azure Config (for backward compatibility)
    if (AZURE_CONFIG.enabled && AZURE_CONFIG.apiKey !== "YOUR_AZURE_API_KEY") {
      return {
        service: "azure",
@@ -207,7 +238,7 @@ export function getActiveTTSConfig() {
      };
    }

-   // 6. Legacy Google Config (for backward compatibility)
+   // 7. Legacy Google Config (for backward compatibility)
    if (GOOGLE_CONFIG.enabled && GOOGLE_CONFIG.apiKey !== "YOUR_GOOGLE_API_KEY") {
      return {
        service: "google",
@@ -218,7 +249,7 @@ export function getActiveTTSConfig() {
      };
    }

-   // 7. Fallback to browser TTS
+   // 8. Fallback to browser TTS
    return {
      service: "browser",
      endpoint: "",
@@ -3990,6 +3990,159 @@ class TalkingHead {
    this.playAudio();
  }

+ /**
+  * Synthesize speech using Deepgram Aura-2 TTS
+  * @param {Object} line Speech line object
+  */
+ async synthesizeWithDeepgramTTS(line) {
+   // Get the text from the line
+   const text = line.text.map(x => x.word).join(' ');
+
+   // Deepgram API request
+   const voiceModel = line.voice || this.avatar.ttsVoice || this.opt.ttsVoice || "aura-2-thalia-en"; // Default to Thalia
+
+   // Build URL with model as query parameter
+   const url = `${this.opt.ttsEndpoint}?model=${voiceModel}`;
+
+   const response = await fetch(url, {
+     method: 'POST',
+     headers: {
+       'Authorization': `Token ${this.opt.ttsApikey}`,
+       'Content-Type': 'text/plain',
+       'Accept': 'audio/mpeg'
+     },
+     body: text
+   });
+
+   if (!response.ok) {
+     throw new Error(`Deepgram TTS error: ${response.status} ${response.statusText}`);
+   }
+
+   // Get audio data
+   const audioArrayBuffer = await response.arrayBuffer();
+   const audioBuffer = await this.audioCtx.decodeAudioData(audioArrayBuffer);
+
+   // Use text-based lip-sync with proper error handling
+   console.log('Using text-based lip-sync for Deepgram...');
+   const lipsyncLang = this.avatar.lipsyncLang || this.opt.lipsyncLang || 'en';
+
+   let audioAnalysis;
+   try {
+     console.log('Lip-sync modules available:', {
+       hasLipsync: !!this.lipsync,
+       lipsyncKeys: this.lipsync ? Object.keys(this.lipsync) : [],
+       lipsyncLang: lipsyncLang
+     });
+
+     const processedText = this.lipsyncPreProcessText(text, lipsyncLang);
+     const lipsyncData = this.lipsyncWordsToVisemes(processedText, lipsyncLang);
+
+     console.log('Lip-sync data:', {
+       processedText,
+       lipsyncData,
+       hasVisemes: lipsyncData && lipsyncData.visemes && lipsyncData.visemes.length > 0
+     });
+
+     if (lipsyncData && lipsyncData.visemes && lipsyncData.visemes.length > 0) {
+       // Create audio analysis structure for compatibility
+       audioAnalysis = {
+         visemes: lipsyncData.visemes.map((viseme, i) => ({
+           viseme: viseme,
+           startTime: (i * audioBuffer.duration) / lipsyncData.visemes.length,
+           endTime: ((i + 1) * audioBuffer.duration) / lipsyncData.visemes.length,
+           duration: audioBuffer.duration / lipsyncData.visemes.length,
+           intensity: 0.7
+         })),
+         words: [],
+         duration: audioBuffer.duration,
+         features: { onsets: [], boundaries: [] }
+       };
+     } else {
+       throw new Error('No visemes generated from text');
+     }
+   } catch (error) {
+     console.error('Text-based lip-sync failed, using fallback:', error);
+     // Fallback: create simple visemes from text
+     const words = text.toLowerCase().split(/\s+/);
+     const simpleVisemes = [];
+
+     for (const word of words) {
+       // Simple phonetic mapping
+       for (const char of word) {
+         let viseme = 'aa'; // default
+         if ('aeiou'.includes(char)) viseme = 'aa';
+         else if ('bp'.includes(char)) viseme = 'PP';
+         else if ('fv'.includes(char)) viseme = 'FF';
+         else if ('st'.includes(char)) viseme = 'SS';
+         else if ('dln'.includes(char)) viseme = 'DD';
+         else if ('kg'.includes(char)) viseme = 'kk';
+         else if ('rw'.includes(char)) viseme = 'RR';
+
+         simpleVisemes.push(viseme);
+       }
+     }
+
+     audioAnalysis = {
+       visemes: simpleVisemes.map((viseme, i) => ({
+         viseme: viseme,
+         startTime: (i * audioBuffer.duration) / simpleVisemes.length,
+         endTime: ((i + 1) * audioBuffer.duration) / simpleVisemes.length,
+         duration: audioBuffer.duration / simpleVisemes.length,
+         intensity: 0.6
+       })),
+       words: [],
+       duration: audioBuffer.duration,
+       features: { onsets: [], boundaries: [] }
+     };
+   }
+
+   console.log('Deepgram TTS Audio Analysis:', {
+     text,
+     audioDuration: audioBuffer.duration,
+     visemeCount: audioAnalysis.visemes ? audioAnalysis.visemes.length : 0,
+     wordCount: audioAnalysis.words ? audioAnalysis.words.length : 0,
+     features: {
+       onsets: audioAnalysis.features && audioAnalysis.features.onsets ? audioAnalysis.features.onsets.length : 0,
+       boundaries: audioAnalysis.features && audioAnalysis.features.boundaries ? audioAnalysis.features.boundaries.length : 0
+     },
+     visemes: audioAnalysis.visemes ? audioAnalysis.visemes.slice(0, 3) : [] // Show first 3 visemes for debugging
+   });
+
+   // Generate precise lip-sync animation from audio analysis
+   const lipsyncAnim = [];
+   if (audioAnalysis.visemes && audioAnalysis.visemes.length > 0) {
+     console.log('Deepgram: Generating lip-sync animation from', audioAnalysis.visemes.length, 'visemes');
+     for (let i = 0; i < audioAnalysis.visemes.length; i++) {
+       const visemeData = audioAnalysis.visemes[i];
+       const time = visemeData.startTime * 1000; // Convert to milliseconds
+       const duration = visemeData.duration * 1000;
+       const intensity = visemeData.intensity;
+
+       lipsyncAnim.push({
+         template: { name: 'viseme' },
+         ts: [time - Math.min(60, 2 * duration / 3), time + Math.min(25, duration / 2), time + duration + Math.min(60, duration / 2)],
+         vs: {
+           ['viseme_' + visemeData.viseme]: [null, intensity, 0]
+         }
+       });
+     }
+     console.log('Deepgram: Generated', lipsyncAnim.length, 'lip-sync animation frames');
+   } else {
+     console.warn('Deepgram: No visemes available for lip-sync animation');
+   }
+
+   // Combine original animation with lip-sync animation
+   const combinedAnim = [...line.anim, ...lipsyncAnim];
+   console.log('Deepgram: Combined animation frames:', combinedAnim.length, '(original:', line.anim.length, '+ lipsync:', lipsyncAnim.length, ')');
+
+   // Add to playlist
+   this.audioPlaylist.push({ anim: combinedAnim, audio: audioBuffer });
+   this.onSubtitles = line.onSubtitles || null;
+   this.resetLips();
+   if (line.mood) this.setMood(line.mood);
+   this.playAudio();
+ }
+
  /**
   * Synthesize speech using Azure TTS
   * @param {Object} line Speech line object
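The lip-sync in this new method is timing-estimated rather than phoneme-aligned: the endpoint as called here returns only audio bytes, so whatever visemes the text yields are spread evenly across the decoded clip, viseme i of N covering [i·D/N, (i+1)·D/N] for a clip of D seconds. Isolated as a helper (the name is illustrative):

    // Evenly allocate visemes across a clip of `duration` seconds,
    // mirroring the audioAnalysis structure built in synthesizeWithDeepgramTTS.
    function spreadVisemes(visemes, duration, intensity = 0.7) {
      const slot = duration / visemes.length;
      return visemes.map((viseme, i) => ({
        viseme,
        startTime: i * slot,
        endTime: (i + 1) * slot,
        duration: slot,
        intensity
      }));
    }

Even spacing keeps the mouth moving for the whole clip but drifts on long words and pauses; that is the trade-off both the word-to-viseme path and the per-character fallback accept.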
@@ -4235,6 +4388,9 @@ class TalkingHead {
    } else if (this.opt.ttsService === "elevenlabs") {
      // Use ElevenLabs TTS
      await this.synthesizeWithElevenLabsTTS(line);
+   } else if (this.opt.ttsService === "deepgram") {
+     // Use Deepgram Aura-2 TTS
+     await this.synthesizeWithDeepgramTTS(line);
    } else if (this.opt.ttsService === "azure") {
      // Use Azure TTS
      await this.synthesizeWithAzureTTS(line);
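End to end, the new service can also be selected per component instance through the props documented earlier (ttsService, ttsVoice, ttsApiKey) instead of editing ttsConfig. A hedged usage sketch, assuming the component is exported under its own name, with other required props (such as the avatar model) omitted:

    import { TalkingHeadAvatar } from '@sage-rsc/talking-head-react';

    export function DeepgramDemo() {
      return (
        <TalkingHeadAvatar
          ttsService="deepgram"
          ttsVoice="aura-2-orion-en"        // any model listed in DEEPGRAM_CONFIG.voices
          ttsApiKey="YOUR_DEEPGRAM_API_KEY" // placeholder
          mood="happy"
        />
      );
    }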