@sage-rsc/talking-head-react 1.0.40 → 1.0.42

This diff shows the content of publicly available package versions as released to their public registries, and is provided for informational purposes only.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@sage-rsc/talking-head-react",
-   "version": "1.0.40",
+   "version": "1.0.42",
    "description": "A reusable React component for 3D talking avatars with lip-sync and text-to-speech",
    "main": "./dist/index.cjs",
    "module": "./dist/index.js",
@@ -279,7 +279,8 @@ const CurriculumLearning = forwardRef(({
      });
    }

-   if (avatarRef.current && firstQuestion) {
+   // Ensure avatar is ready before speaking
+   if (avatarRef.current && avatarRef.current.isReady && firstQuestion) {
      avatarRef.current.setMood("curious");

      // Play custom animation if available
@@ -303,9 +304,16 @@ const CurriculumLearning = forwardRef(({
      } else {
        avatarRef.current.speakText(`Now let me ask you some questions. Here's the first one: ${firstQuestion.question}`, { lipsyncLang: config.lipsyncLang });
      }
-   } else if (avatarRef.current) {
+   } else if (avatarRef.current && avatarRef.current.isReady) {
      const config = defaultAvatarConfigRef.current || { lipsyncLang: 'en' };
      avatarRef.current.speakText("Now let me ask you some questions to test your understanding.", { lipsyncLang: config.lipsyncLang });
+   } else {
+     // Avatar not ready yet, retry after a short delay
+     setTimeout(() => {
+       if (startQuestionsRef.current) {
+         startQuestionsRef.current();
+       }
+     }, 100);
    }
  }, [animations.questionStart, getCurrentLesson, getCurrentQuestion]);

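Taken together, the two hunks above form a guard-and-retry pattern: speak only once the avatar reports isReady, and otherwise re-enter the same callback through its ref after 100 ms. A minimal sketch of the pattern in isolation (avatarRef, startQuestionsRef, and isReady come from the diff; the surrounding wiring is illustrative):

    const startQuestions = useCallback(() => {
      const avatar = avatarRef.current;
      if (avatar && avatar.isReady) {
        avatar.setMood("curious");
        avatar.speakText("Here's the first question...", { lipsyncLang: 'en' });
      } else {
        // Not mounted or still loading: retry through the ref so the
        // latest version of this callback is the one re-invoked.
        setTimeout(() => startQuestionsRef.current?.(), 100);
      }
    }, []);
    startQuestionsRef.current = startQuestions;

Note the retry is unbounded: if the avatar never becomes ready, the timeout re-arms every 100 ms.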
@@ -671,7 +679,7 @@ const CurriculumLearning = forwardRef(({
        if (startTeachingRef.current) {
          startTeachingRef.current();
        }
-     }, 50);
+     }, 10);
    }
  }, [autoStart, getCurrentLesson]);

@@ -1,6 +1,6 @@
  import React, { useEffect, useRef, useState, useCallback, forwardRef, useImperativeHandle } from 'react';
  import { TalkingHead } from '../lib/talkinghead.mjs';
- import { getActiveTTSConfig, ELEVENLABS_CONFIG } from '../config/ttsConfig';
+ import { getActiveTTSConfig, ELEVENLABS_CONFIG, DEEPGRAM_CONFIG } from '../config/ttsConfig';

  /**
   * TalkingHeadAvatar - A reusable React component for 3D talking avatars
@@ -13,7 +13,7 @@ import { getActiveTTSConfig, ELEVENLABS_CONFIG } from '../config/ttsConfig';
   * @param {string} props.avatarBody - Avatar body type ('M' or 'F')
   * @param {string} props.mood - Initial mood ('happy', 'sad', 'neutral', etc.)
   * @param {string} props.ttsLang - Text-to-speech language code
-  * @param {string} props.ttsService - TTS service ('edge', 'elevenlabs', 'google', 'azure', 'browser')
+  * @param {string} props.ttsService - TTS service ('edge', 'elevenlabs', 'deepgram', 'google', 'azure', 'browser')
   * @param {string} props.ttsVoice - TTS voice ID
   * @param {string} props.ttsApiKey - TTS API key (overrides config for ElevenLabs, Google Cloud, Azure)
   * @param {string} props.bodyMovement - Initial body movement type
@@ -79,6 +79,16 @@ const TalkingHeadAvatar = forwardRef(({
79
79
  defaultVoice: ttsVoice || ttsConfig.defaultVoice || ELEVENLABS_CONFIG.defaultVoice,
80
80
  voices: ttsConfig.voices || ELEVENLABS_CONFIG.voices
81
81
  };
82
+ } else if (effectiveTtsService === 'deepgram') {
83
+ // Explicitly set up Deepgram configuration
84
+ const apiKey = ttsApiKey || ttsConfig.apiKey;
85
+ effectiveTtsConfig = {
86
+ service: 'deepgram',
87
+ endpoint: 'https://api.deepgram.com/v1/speak',
88
+ apiKey: apiKey,
89
+ defaultVoice: ttsVoice || ttsConfig.defaultVoice || DEEPGRAM_CONFIG.defaultVoice,
90
+ voices: ttsConfig.voices || DEEPGRAM_CONFIG.voices
91
+ };
82
92
  } else {
83
93
  // For other services, use config with prop overrides
84
94
  effectiveTtsConfig = {
@@ -279,7 +289,7 @@ const TalkingHeadAvatar = forwardRef(({
  // This is because onAudioEnd is only for streaming mode
  let checkInterval = null;
  let checkCount = 0;
- const maxChecks = 600; // 60 seconds max (100ms intervals)
+ const maxChecks = 600; // 30 seconds max (50ms intervals for faster detection)

  const checkSpeechFinished = () => {
    checkCount++;
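One consequence worth noting: with the interval halved to 50 ms in the next hunk, 600 iterations cap the wait at 600 × 50 ms = 30 seconds, not the 60 seconds the original comment promised at 100 ms. If a fixed time budget is the actual requirement, deriving the cap keeps the two numbers from drifting apart; a sketch with illustrative names:

    const POLL_INTERVAL_MS = 50;                                  // must match the setInterval below
    const MAX_WAIT_MS = 60_000;                                   // desired overall budget
    const maxChecks = Math.ceil(MAX_WAIT_MS / POLL_INTERVAL_MS);  // 1200 checks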
@@ -307,21 +317,21 @@ const TalkingHeadAvatar = forwardRef(({
        checkInterval = null;
      }

-     // Small delay to ensure everything is settled
+     // Small delay to ensure everything is settled - reduced for faster transitions
      setTimeout(() => {
        try {
          options.onSpeechEnd();
        } catch (e) {
          console.error('Error in onSpeechEnd callback:', e);
        }
-     }, 100);
+     }, 10);
    }
  };

- // Start checking after a short delay (to allow speech to start)
+ // Start checking after a minimal delay (to allow speech to start)
  setTimeout(() => {
-   checkInterval = setInterval(checkSpeechFinished, 100);
- }, 500);
+   checkInterval = setInterval(checkSpeechFinished, 50);
+ }, 100);
  }

  if (talkingHeadRef.current.lipsync && Object.keys(talkingHeadRef.current.lipsync).length > 0) {
@@ -338,7 +348,7 @@ const TalkingHeadAvatar = forwardRef(({
        }
        talkingHeadRef.current.speakText(textToSpeak, speakOptions);
      }
-   }, 500);
+   }, 100);
    }
  } catch (err) {
    console.error('Error speaking text:', err);
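Across this component the fixed delays shrink roughly tenfold: the pre-speech delay goes from 500 ms to 100 ms, the end-of-speech settle from 100 ms to 10 ms, and detection polling from 100 ms to 50 ms per check. A back-of-envelope latency budget for one utterance (constants are illustrative, taken from the hunks above):

    const startDelay  = 100;  // was 500: wait before calling speakText
    const pollPeriod  =  50;  // was 100: end-of-speech detection granularity
    const settleDelay =  10;  // was 100: pause before onSpeechEnd fires
    // worst-case added latency ≈ startDelay + pollPeriod + settleDelay ≈ 160 ms (was ~700 ms)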
@@ -8,15 +8,16 @@
   *
   * 💰 PREMIUM OPTIONS (if you want even better quality):
   * 1. ElevenLabs - Most human-like voices (10K chars/month free)
-  * 2. Google Cloud TTS - 1M characters/month free
-  * 3. Azure Cognitive Services - 500K characters/month free
+  * 2. Deepgram Aura-2 - Enterprise-grade, low-latency TTS (40% lower cost than ElevenLabs)
+  * 3. Google Cloud TTS - 1M characters/month free
+  * 4. Azure Cognitive Services - 500K characters/month free
   *
   * 🔧 TO SWITCH SERVICES:
   * 1. Set the service you want to use: enabled: true
   * 2. Set others to: enabled: false
   * 3. For paid services, add your API key
   *
-  * Priority order: ElevenLabs > Edge TTS (FREE) > Google Cloud > Azure > Browser
+  * Priority order: ElevenLabs > Deepgram > Edge TTS (FREE) > Google Cloud > Azure > Browser
   */

  // ElevenLabs Configuration (Recommended - Most human-like voices)
@@ -37,6 +38,25 @@ export const ELEVENLABS_CONFIG = {
    }
  };

+ // Deepgram Aura-2 Configuration (Enterprise-grade, low-latency TTS)
+ // Sign up at: https://deepgram.com
+ // Competitive pricing, ~40% lower cost than ElevenLabs Flash
+ export const DEEPGRAM_CONFIG = {
+   enabled: false, // Set to true to use Deepgram
+   apiKey: "YOUR_DEEPGRAM_API_KEY", // Replace with your actual API key
+   endpoint: "https://api.deepgram.com/v1/speak",
+   defaultVoice: "aura-2-thalia-en", // Thalia (Female, English)
+   voices: {
+     thalia: "aura-2-thalia-en", // Female, English - Natural and clear
+     asteria: "aura-2-asteria-en", // Female, English - Warm and friendly
+     orion: "aura-2-orion-en", // Male, English - Professional
+     stella: "aura-2-stella-en", // Female, English - Energetic
+     athena: "aura-2-athena-en", // Female, English - Authoritative
+     hera: "aura-2-hera-en", // Female, English - Calm
+     zeus: "aura-2-zeus-en" // Male, English - Powerful
+   }
+ };
+
  // Azure Cognitive Services Configuration
  // Free tier: 500,000 characters/month
  // Sign up at: https://azure.microsoft.com
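For reference, the request this DEEPGRAM_CONFIG block describes is a plain POST of UTF-8 text with the model name passed as a query parameter, exactly as the synthesizeWithDeepgramTTS method added further down in this diff issues it. A standalone sketch (the key is the placeholder from the config; error handling trimmed):

    async function deepgramSpeak(text, model = DEEPGRAM_CONFIG.defaultVoice) {
      const res = await fetch(`${DEEPGRAM_CONFIG.endpoint}?model=${model}`, {
        method: 'POST',
        headers: {
          'Authorization': `Token ${DEEPGRAM_CONFIG.apiKey}`, // Deepgram uses the "Token <key>" scheme
          'Content-Type': 'text/plain',
          'Accept': 'audio/mpeg'
        },
        body: text
      });
      if (!res.ok) throw new Error(`Deepgram TTS error: ${res.status}`);
      return res.arrayBuffer(); // MP3 bytes, ready for AudioContext.decodeAudioData()
    }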
@@ -149,7 +169,7 @@ export const BROWSER_CONFIG = {

  /**
   * Get the active TTS configuration
-  * Priority: ElevenLabs > Edge TTS (FREE) > Google Cloud > Azure Cognitive > Browser
+  * Priority: ElevenLabs > Deepgram > Edge TTS (FREE) > Google Cloud > Azure Cognitive > Browser
   */
  export function getActiveTTSConfig() {
    // 1. ElevenLabs (Premium - Most human-like)
@@ -163,7 +183,18 @@ export function getActiveTTSConfig() {
      };
    }

-   // 2. Microsoft Edge TTS (FREE - Recommended)
+   // 2. Deepgram Aura-2 (Enterprise-grade, low-latency)
+   if (DEEPGRAM_CONFIG.enabled && DEEPGRAM_CONFIG.apiKey && DEEPGRAM_CONFIG.apiKey !== "YOUR_DEEPGRAM_API_KEY") {
+     return {
+       service: "deepgram",
+       endpoint: DEEPGRAM_CONFIG.endpoint,
+       apiKey: DEEPGRAM_CONFIG.apiKey,
+       defaultVoice: DEEPGRAM_CONFIG.defaultVoice,
+       voices: DEEPGRAM_CONFIG.voices
+     };
+   }
+
+   // 3. Microsoft Edge TTS (FREE - Recommended)
    if (EDGE_CONFIG.enabled) {
      return {
        service: "edge",
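Once DEEPGRAM_CONFIG.enabled is true and a real key replaces the placeholder, the cascade short-circuits at the new step 2; a sketch of the resolved value:

    const tts = getActiveTTSConfig();
    // → {
    //     service: "deepgram",
    //     endpoint: "https://api.deepgram.com/v1/speak",
    //     apiKey: "<your key>",
    //     defaultVoice: "aura-2-thalia-en",
    //     voices: { thalia: "aura-2-thalia-en", /* ... */ }
    //   }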
@@ -174,7 +205,7 @@ export function getActiveTTSConfig() {
      };
    }

-   // 3. Google Cloud TTS (FREE tier: 1M characters/month)
+   // 4. Google Cloud TTS (FREE tier: 1M characters/month)
    if (GOOGLE_CLOUD_CONFIG.enabled && GOOGLE_CLOUD_CONFIG.apiKey !== "YOUR_GOOGLE_CLOUD_API_KEY") {
      return {
        service: "google",
@@ -185,7 +216,7 @@ export function getActiveTTSConfig() {
      };
    }

-   // 4. Azure Cognitive Services (FREE tier: 500K characters/month)
+   // 5. Azure Cognitive Services (FREE tier: 500K characters/month)
    if (AZURE_COGNITIVE_CONFIG.enabled && AZURE_COGNITIVE_CONFIG.apiKey !== "YOUR_AZURE_COGNITIVE_API_KEY") {
      return {
        service: "azure",
@@ -196,7 +227,7 @@ export function getActiveTTSConfig() {
      };
    }

-   // 5. Legacy Azure Config (for backward compatibility)
+   // 6. Legacy Azure Config (for backward compatibility)
    if (AZURE_CONFIG.enabled && AZURE_CONFIG.apiKey !== "YOUR_AZURE_API_KEY") {
      return {
        service: "azure",
@@ -207,7 +238,7 @@ export function getActiveTTSConfig() {
      };
    }

-   // 6. Legacy Google Config (for backward compatibility)
+   // 7. Legacy Google Config (for backward compatibility)
    if (GOOGLE_CONFIG.enabled && GOOGLE_CONFIG.apiKey !== "YOUR_GOOGLE_API_KEY") {
      return {
        service: "google",
@@ -218,7 +249,7 @@ export function getActiveTTSConfig() {
      };
    }

-   // 7. Fallback to browser TTS
+   // 8. Fallback to browser TTS
    return {
      service: "browser",
      endpoint: "",
@@ -3990,6 +3990,159 @@ class TalkingHead {
    this.playAudio();
  }

+ /**
+  * Synthesize speech using Deepgram Aura-2 TTS
+  * @param {Object} line Speech line object
+  */
+ async synthesizeWithDeepgramTTS(line) {
+   // Get the text from the line
+   const text = line.text.map(x => x.word).join(' ');
+
+   // Deepgram API request
+   const voiceModel = line.voice || this.avatar.ttsVoice || this.opt.ttsVoice || "aura-2-thalia-en"; // Default to Thalia
+
+   // Build URL with model as query parameter
+   const url = `${this.opt.ttsEndpoint}?model=${voiceModel}`;
+
+   const response = await fetch(url, {
+     method: 'POST',
+     headers: {
+       'Authorization': `Token ${this.opt.ttsApikey}`,
+       'Content-Type': 'text/plain',
+       'Accept': 'audio/mpeg'
+     },
+     body: text
+   });
+
+   if (!response.ok) {
+     throw new Error(`Deepgram TTS error: ${response.status} ${response.statusText}`);
+   }
+
+   // Get audio data
+   const audioArrayBuffer = await response.arrayBuffer();
+   const audioBuffer = await this.audioCtx.decodeAudioData(audioArrayBuffer);
+
+   // Use text-based lip-sync with proper error handling
+   console.log('Using text-based lip-sync for Deepgram...');
+   const lipsyncLang = this.avatar.lipsyncLang || this.opt.lipsyncLang || 'en';
+
+   let audioAnalysis;
+   try {
+     console.log('Lip-sync modules available:', {
+       hasLipsync: !!this.lipsync,
+       lipsyncKeys: this.lipsync ? Object.keys(this.lipsync) : [],
+       lipsyncLang: lipsyncLang
+     });
+
+     const processedText = this.lipsyncPreProcessText(text, lipsyncLang);
+     const lipsyncData = this.lipsyncWordsToVisemes(processedText, lipsyncLang);
+
+     console.log('Lip-sync data:', {
+       processedText,
+       lipsyncData,
+       hasVisemes: lipsyncData && lipsyncData.visemes && lipsyncData.visemes.length > 0
+     });
+
+     if (lipsyncData && lipsyncData.visemes && lipsyncData.visemes.length > 0) {
+       // Create audio analysis structure for compatibility
+       audioAnalysis = {
+         visemes: lipsyncData.visemes.map((viseme, i) => ({
+           viseme: viseme,
+           startTime: (i * audioBuffer.duration) / lipsyncData.visemes.length,
+           endTime: ((i + 1) * audioBuffer.duration) / lipsyncData.visemes.length,
+           duration: audioBuffer.duration / lipsyncData.visemes.length,
+           intensity: 0.7
+         })),
+         words: [],
+         duration: audioBuffer.duration,
+         features: { onsets: [], boundaries: [] }
+       };
+     } else {
+       throw new Error('No visemes generated from text');
+     }
+   } catch (error) {
+     console.error('Text-based lip-sync failed, using fallback:', error);
+     // Fallback: create simple visemes from text
+     const words = text.toLowerCase().split(/\s+/);
+     const simpleVisemes = [];
+
+     for (const word of words) {
+       // Simple phonetic mapping
+       for (const char of word) {
+         let viseme = 'aa'; // default
+         if ('aeiou'.includes(char)) viseme = 'aa';
+         else if ('bp'.includes(char)) viseme = 'PP';
+         else if ('fv'.includes(char)) viseme = 'FF';
+         else if ('st'.includes(char)) viseme = 'SS';
+         else if ('dln'.includes(char)) viseme = 'DD';
+         else if ('kg'.includes(char)) viseme = 'kk';
+         else if ('rw'.includes(char)) viseme = 'RR';
+
+         simpleVisemes.push(viseme);
+       }
+     }
+
+     audioAnalysis = {
+       visemes: simpleVisemes.map((viseme, i) => ({
+         viseme: viseme,
+         startTime: (i * audioBuffer.duration) / simpleVisemes.length,
+         endTime: ((i + 1) * audioBuffer.duration) / simpleVisemes.length,
+         duration: audioBuffer.duration / simpleVisemes.length,
+         intensity: 0.6
+       })),
+       words: [],
+       duration: audioBuffer.duration,
+       features: { onsets: [], boundaries: [] }
+     };
+   }
+
+   console.log('Deepgram TTS Audio Analysis:', {
+     text,
+     audioDuration: audioBuffer.duration,
+     visemeCount: audioAnalysis.visemes ? audioAnalysis.visemes.length : 0,
+     wordCount: audioAnalysis.words ? audioAnalysis.words.length : 0,
+     features: {
+       onsets: audioAnalysis.features && audioAnalysis.features.onsets ? audioAnalysis.features.onsets.length : 0,
+       boundaries: audioAnalysis.features && audioAnalysis.features.boundaries ? audioAnalysis.features.boundaries.length : 0
+     },
+     visemes: audioAnalysis.visemes ? audioAnalysis.visemes.slice(0, 3) : [] // Show first 3 visemes for debugging
+   });
+
+   // Generate precise lip-sync animation from audio analysis
+   const lipsyncAnim = [];
+   if (audioAnalysis.visemes && audioAnalysis.visemes.length > 0) {
+     console.log('Deepgram: Generating lip-sync animation from', audioAnalysis.visemes.length, 'visemes');
+     for (let i = 0; i < audioAnalysis.visemes.length; i++) {
+       const visemeData = audioAnalysis.visemes[i];
+       const time = visemeData.startTime * 1000; // Convert to milliseconds
+       const duration = visemeData.duration * 1000;
+       const intensity = visemeData.intensity;
+
+       lipsyncAnim.push({
+         template: { name: 'viseme' },
+         ts: [time - Math.min(60, 2 * duration / 3), time + Math.min(25, duration / 2), time + duration + Math.min(60, duration / 2)],
+         vs: {
+           ['viseme_' + visemeData.viseme]: [null, intensity, 0]
+         }
+       });
+     }
+     console.log('Deepgram: Generated', lipsyncAnim.length, 'lip-sync animation frames');
+   } else {
+     console.warn('Deepgram: No visemes available for lip-sync animation');
+   }
+
+   // Combine original animation with lip-sync animation
+   const combinedAnim = [...line.anim, ...lipsyncAnim];
+   console.log('Deepgram: Combined animation frames:', combinedAnim.length, '(original:', line.anim.length, '+ lipsync:', lipsyncAnim.length, ')');
+
+   // Add to playlist
+   this.audioPlaylist.push({ anim: combinedAnim, audio: audioBuffer });
+   this.onSubtitles = line.onSubtitles || null;
+   this.resetLips();
+   if (line.mood) this.setMood(line.mood);
+   this.playAudio();
+ }
+
  /**
   * Synthesize speech using Azure TTS
   * @param {Object} line Speech line object
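The lip-sync in this new method is timing-estimated rather than phoneme-aligned: the endpoint as called here returns only audio bytes, so whatever visemes the text yields are spread evenly across the decoded clip, viseme i of N covering [i·D/N, (i+1)·D/N] for a clip of D seconds. Isolated as a helper (the name is illustrative):

    // Evenly allocate visemes across a clip of `duration` seconds,
    // mirroring the audioAnalysis structure built in synthesizeWithDeepgramTTS.
    function spreadVisemes(visemes, duration, intensity = 0.7) {
      const slot = duration / visemes.length;
      return visemes.map((viseme, i) => ({
        viseme,
        startTime: i * slot,
        endTime: (i + 1) * slot,
        duration: slot,
        intensity
      }));
    }

Even spacing keeps the mouth moving for the whole clip but drifts on long words and pauses; that is the trade-off both the word-to-viseme path and the per-character fallback accept.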
@@ -4235,6 +4388,9 @@ class TalkingHead {
    } else if (this.opt.ttsService === "elevenlabs") {
      // Use ElevenLabs TTS
      await this.synthesizeWithElevenLabsTTS(line);
+   } else if (this.opt.ttsService === "deepgram") {
+     // Use Deepgram Aura-2 TTS
+     await this.synthesizeWithDeepgramTTS(line);
    } else if (this.opt.ttsService === "azure") {
      // Use Azure TTS
      await this.synthesizeWithAzureTTS(line);
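End to end, the new service can also be selected per component instance through the props documented earlier (ttsService, ttsVoice, ttsApiKey) instead of editing ttsConfig. A hedged usage sketch, assuming the component is exported under its own name, with other required props (such as the avatar model) omitted:

    import { TalkingHeadAvatar } from '@sage-rsc/talking-head-react';

    export function DeepgramDemo() {
      return (
        <TalkingHeadAvatar
          ttsService="deepgram"
          ttsVoice="aura-2-orion-en"        // any model listed in DEEPGRAM_CONFIG.voices
          ttsApiKey="YOUR_DEEPGRAM_API_KEY" // placeholder
          mood="happy"
        />
      );
    }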