@sage-rsc/talking-head-react 1.7.7 → 1.7.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sage-rsc/talking-head-react",
3
- "version": "1.7.7",
3
+ "version": "1.7.9",
4
4
  "description": "A reusable React component for 3D talking avatars with lip-sync and text-to-speech",
5
5
  "main": "./dist/index.cjs",
6
6
  "module": "./dist/index.js",
@@ -81,6 +81,8 @@ const SimpleTalkingAvatar = forwardRef(({
81
81
  const currentAnimationGroupRef = useRef(null);
82
82
  const playedAnimationsRef = useRef([]); // Track animations played during current speech
83
83
  const animationQueueRef = useRef([]); // Queue of animations to play in order
84
+ const currentSentenceIndexRef = useRef(0); // Track which sentence is currently playing
85
+ const pausedAudioDataRef = useRef(null); // Store trimmed audio buffer when paused
84
86
 
85
87
  // Keep ref in sync with state
86
88
  useEffect(() => {
@@ -612,6 +614,7 @@ const SimpleTalkingAvatar = forwardRef(({
612
614
  // Split text into sentences for tracking
613
615
  const sentences = textToSpeak.split(/[.!?]+/).filter(s => s.trim().length > 0);
614
616
  originalSentencesRef.current = sentences;
617
+ currentSentenceIndexRef.current = 0; // Reset sentence tracking
615
618
 
616
619
  const speakOptions = {
617
620
  lipsyncLang: options.lipsyncLang || 'en',
@@ -691,8 +694,8 @@ const SimpleTalkingAvatar = forwardRef(({
691
694
  try {
692
695
  // Check if currently speaking
693
696
  const isSpeaking = talkingHeadRef.current.isSpeaking || false;
694
- const audioPlaylist = talkingHeadRef.current.audioPlaylist || [];
695
- const speechQueue = talkingHeadRef.current.speechQueue || [];
697
+ const audioPlaylist = [...(talkingHeadRef.current.audioPlaylist || [])]; // Copy before pausing
698
+ const speechQueue = [...(talkingHeadRef.current.speechQueue || [])]; // Copy before clearing
696
699
 
697
700
  if (isSpeaking || audioPlaylist.length > 0 || speechQueue.length > 0) {
698
701
  // Clear speech end interval
@@ -701,51 +704,100 @@ const SimpleTalkingAvatar = forwardRef(({
701
704
  speechEndIntervalRef.current = null;
702
705
  }
703
706
 
704
- // Extract remaining text from speech queue (not yet sent to TTS)
705
- let remainingText = '';
706
- if (speechQueue.length > 0) {
707
- remainingText = speechQueue.map(item => {
708
- if (item.text && Array.isArray(item.text)) {
709
- return item.text.map(wordObj => wordObj.word).join(' ');
710
- }
711
- return item.text || '';
712
- }).join(' ');
713
- }
714
-
715
- // Extract text from audio playlist (currently playing or queued audio)
716
- // This includes the currently playing sentence if it was interrupted
717
- let audioPlaylistText = '';
718
- if (audioPlaylist.length > 0) {
719
- audioPlaylistText = audioPlaylist
720
- .map(item => {
721
- // Try to get text from the audio item
707
+ // IMPORTANT: Extract text BEFORE calling pauseSpeaking(), which clears audioPlaylist
708
+ // Track which sentences need to be re-spoken
709
+ const sentences = originalSentencesRef.current;
710
+ let remainingSentences = [];
711
+
712
+ // Check if audio is currently playing (item already shifted from playlist)
713
+ const isAudioCurrentlyPlaying = talkingHeadRef.current.isAudioPlaying || false;
714
+
715
+ if (sentences.length > 0) {
716
+ // Calculate which sentence is currently playing
717
+ // Total sentences = sentences.length
718
+ // Queued in audioPlaylist = audioPlaylist.length (already processed by TTS, waiting to play)
719
+ // Queued in speechQueue = speechQueue.length (not yet sent to TTS)
720
+ // Currently playing = 1 (if isAudioPlaying is true)
721
+
722
+ const queuedCount = audioPlaylist.length + speechQueue.length;
723
+ const currentlyPlayingCount = isAudioCurrentlyPlaying ? 1 : 0;
724
+ const processedCount = sentences.length - queuedCount - currentlyPlayingCount;
725
+
726
+ // If audio is currently playing, we're mid-sentence - restart from current sentence
727
+ // Otherwise, continue from next sentence
728
+ const startIndex = isAudioCurrentlyPlaying ? processedCount : processedCount + currentlyPlayingCount;
729
+
730
+ if (startIndex < sentences.length) {
731
+ remainingSentences = sentences.slice(startIndex);
732
+ }
733
+ } else {
734
+ // Fallback: Extract text from queues if we don't have original sentences
735
+ // Extract text from audio playlist (queued audio, not yet playing)
736
+ if (audioPlaylist.length > 0) {
737
+ audioPlaylist.forEach(item => {
722
738
  if (item.text) {
723
739
  if (Array.isArray(item.text)) {
724
- return item.text.map(wordObj => wordObj.word).join(' ');
740
+ const sentenceText = item.text.map(wordObj => wordObj.word).join(' ');
741
+ if (sentenceText.trim()) {
742
+ remainingSentences.push(sentenceText);
743
+ }
744
+ } else if (item.text.trim()) {
745
+ remainingSentences.push(item.text);
725
746
  }
726
- return item.text;
727
747
  }
728
- return '';
729
- })
730
- .filter(text => text.trim().length > 0)
731
- .join(' ');
748
+ });
749
+ }
750
+
751
+ // Extract remaining text from speech queue (not yet sent to TTS)
752
+ if (speechQueue.length > 0) {
753
+ speechQueue.forEach(item => {
754
+ if (item.text) {
755
+ if (Array.isArray(item.text)) {
756
+ const sentenceText = item.text.map(wordObj => wordObj.word).join(' ');
757
+ if (sentenceText.trim()) {
758
+ remainingSentences.push(sentenceText);
759
+ }
760
+ } else if (item.text.trim()) {
761
+ remainingSentences.push(item.text);
762
+ }
763
+ }
764
+ });
765
+ }
732
766
  }
733
-
734
- // Combine: if audio is playing, include that text first, then remaining queue text
735
- const combinedRemainingText = audioPlaylistText
736
- ? (audioPlaylistText + (remainingText ? ' ' + remainingText : ''))
737
- : remainingText;
767
+
768
+ // Combine remaining sentences
769
+ const combinedRemainingText = remainingSentences.join(' ');
738
770
 
739
771
  // Store progress for resume
740
772
  speechProgressRef.current = {
741
773
  remainingText: combinedRemainingText || null,
742
774
  originalText: pausedSpeechRef.current?.text || null,
743
- options: pausedSpeechRef.current?.options || null
775
+ options: pausedSpeechRef.current?.options || null,
776
+ // Track whether TTS-processed audio was still queued in audioPlaylist at pause time (the item being played has already been shifted off the playlist)
777
+ isMidSentence: audioPlaylist.length > 0
744
778
  };
745
779
 
746
- // Clear speech queue and pause
747
- talkingHeadRef.current.speechQueue.length = 0;
748
- talkingHeadRef.current.pauseSpeaking();
780
+ // IMPORTANT: Save speech queue BEFORE pausing if audio is playing
781
+ // We need to preserve it so remaining text continues after trimmed audio
782
+ const wasAudioPlaying = talkingHeadRef.current.isAudioPlaying || false;
783
+ const savedSpeechQueue = wasAudioPlaying ? [...(talkingHeadRef.current.speechQueue || [])] : null;
784
+
785
+ // Pause and get trimmed buffer if available
786
+ // pauseSpeaking() clears audioPlaylist but doesn't clear speechQueue
787
+ const pausedAudioData = talkingHeadRef.current.pauseSpeaking();
788
+
789
+ // If we have trimmed buffer, restore speech queue for continuation
790
+ // Otherwise, clear it (normal pause behavior)
791
+ if (pausedAudioData && pausedAudioData.audio && savedSpeechQueue) {
792
+ // Restore speech queue so remaining text continues after trimmed audio
793
+ talkingHeadRef.current.speechQueue.length = 0;
794
+ talkingHeadRef.current.speechQueue.push(...savedSpeechQueue);
795
+ } else {
796
+ // No trimmed buffer, clear speech queue normally
797
+ talkingHeadRef.current.speechQueue.length = 0;
798
+ }
799
+
800
+ pausedAudioDataRef.current = pausedAudioData; // Store trimmed buffer for exact resume
749
801
  setIsPaused(true);
750
802
  isPausedRef.current = true;
751
803
  }
@@ -765,7 +817,54 @@ const SimpleTalkingAvatar = forwardRef(({
765
817
  setIsPaused(false);
766
818
  isPausedRef.current = false;
767
819
 
768
- // Determine what text to speak
820
+ // If we have trimmed audio data from pause, resume from exact position
821
+ if (pausedAudioDataRef.current && pausedAudioDataRef.current.audio) {
822
+ // Ensure speaking state is set for animations to continue
823
+ isSpeakingRef.current = true;
824
+
825
+ // Restore animation group if it was set
826
+ const originalOptions = speechProgressRef.current?.options || pausedSpeechRef.current?.options || {};
827
+ const animationGroup = originalOptions.animationGroup || autoAnimationGroup;
828
+ if (animationGroup) {
829
+ currentAnimationGroupRef.current = animationGroup;
830
+ }
831
+
832
+ // Ensure remaining text is in speech queue so it continues after trimmed buffer
833
+ const remainingText = speechProgressRef.current?.remainingText;
834
+ if (remainingText && talkingHeadRef.current.speechQueue) {
835
+ // Re-add remaining text to speech queue so it continues after trimmed audio
836
+ // Split into sentences and add to queue
837
+ const sentences = remainingText.split(/[.!?]+/).filter(s => s.trim().length > 0);
838
+ sentences.forEach(sentence => {
839
+ talkingHeadRef.current.speechQueue.push({
840
+ text: sentence.trim(),
841
+ options: originalOptions
842
+ });
843
+ });
844
+ }
845
+
846
+ // Mark as speaking so audio continues and speech queue processes
847
+ talkingHeadRef.current.isSpeaking = true;
848
+
849
+ // Resume with trimmed buffer (exact position)
850
+ // After trimmed buffer finishes, playAudio will call startSpeaking() which processes speechQueue
851
+ await talkingHeadRef.current.playAudio(false, pausedAudioDataRef.current);
852
+
853
+ // Continue animations if animation group is set
854
+ // Start immediately and they will continue as long as isSpeakingRef is true
855
+ if (animationGroup && !originalOptions.skipAnimation) {
856
+ // Reset animation queue for smooth continuation
857
+ animationQueueRef.current = [];
858
+ playedAnimationsRef.current = [];
859
+ // Start playing animations immediately
860
+ playRandomAnimation(animationGroup);
861
+ }
862
+
863
+ pausedAudioDataRef.current = null; // Clear after use
864
+ return;
865
+ }
866
+
867
+ // Otherwise, resume from remaining text (fallback)
769
868
  const remainingText = speechProgressRef.current?.remainingText;
770
869
  const originalText = speechProgressRef.current?.originalText || pausedSpeechRef.current?.text;
771
870
  const originalOptions = speechProgressRef.current?.options || pausedSpeechRef.current?.options || {};
@@ -775,12 +874,16 @@ const SimpleTalkingAvatar = forwardRef(({
775
874
  if (textToSpeak) {
776
875
  speakText(textToSpeak, originalOptions);
777
876
  }
877
+
878
+ // Clear paused audio data
879
+ pausedAudioDataRef.current = null;
778
880
  } catch (err) {
779
881
  console.warn('Error resuming speech:', err);
780
882
  setIsPaused(false);
781
883
  isPausedRef.current = false;
884
+ pausedAudioDataRef.current = null;
782
885
  }
783
- }, [isPaused, speakText, resumeAudioContext]);
886
+ }, [isPaused, speakText, resumeAudioContext, autoAnimationGroup, playRandomAnimation]);
784
887
 
785
888
  // Stop speaking
786
889
  const stopSpeaking = useCallback(() => {
@@ -835,6 +835,11 @@ class TalkingHead {
835
835
  this.speechQueue = [];
836
836
  this.isSpeaking = false;
837
837
  this.isListening = false;
838
+
839
+ // Pause/resume tracking for buffer trimming
840
+ this.audioStartTime = null; // When current audio started playing
841
+ this.currentAudioItem = null; // Current audio item being played
842
+ this.pausedAudioData = null; // Stored trimmed buffer when paused
838
843
 
839
844
  // Setup Google text-to-speech
840
845
  if ( this.opt.ttsEndpoint ) {
@@ -3718,10 +3723,80 @@ class TalkingHead {
3718
3723
  /**
3719
3724
  * Play audio playlist using Web Audio API.
3720
3725
  * @param {boolean} [force=false] If true, forces to proceed
3726
+ * @param {Object} [pausedAudioData=null] Trimmed audio data from pause to resume from exact position
3721
3727
  */
3722
- async playAudio(force=false) {
3728
+ async playAudio(force=false, pausedAudioData=null) {
3723
3729
  if ( !this.armature || (this.isAudioPlaying && !force) ) return;
3724
3730
  this.isAudioPlaying = true;
3731
+
3732
+ // If we have paused audio data, play that first (resume from exact position)
3733
+ if (pausedAudioData && pausedAudioData.audio) {
3734
+ const item = {
3735
+ audio: pausedAudioData.audio,
3736
+ anim: pausedAudioData.anim,
3737
+ text: pausedAudioData.text,
3738
+ delay: pausedAudioData.delay || 0,
3739
+ isRaw: false
3740
+ };
3741
+
3742
+ // If Web Audio API is suspended, try to resume it
3743
+ if ( this.audioCtx.state === "suspended" || this.audioCtx.state === "interrupted" ) {
3744
+ const resume = this.audioCtx.resume();
3745
+ const timeout = new Promise((_r, rej) => setTimeout(() => rej("p2"), 1000));
3746
+ try {
3747
+ await Promise.race([resume, timeout]);
3748
+ } catch(e) {
3749
+ console.log("Can't play audio. Web Audio API suspended.");
3750
+ this.playAudio(true);
3751
+ return;
3752
+ }
3753
+ }
3754
+
3755
+ // Store current audio item and start time
3756
+ this.currentAudioItem = {
3757
+ audio: item.audio,
3758
+ anim: item.anim ? JSON.parse(JSON.stringify(item.anim)) : null,
3759
+ text: item.text,
3760
+ delay: item.delay
3761
+ };
3762
+
3763
+ // Create audio source
3764
+ this.audioSpeechSource = this.audioCtx.createBufferSource();
3765
+ this.audioSpeechSource.buffer = item.audio;
3766
+ this.audioSpeechSource.playbackRate.value = 1 / this.animSlowdownRate;
3767
+ this.audioSpeechSource.connect(this.audioAnalyzerNode);
3768
+
3769
+ const startDelay = item.delay / 1000;
3770
+ this.audioStartTime = this.audioCtx.currentTime + startDelay;
3771
+
3772
+ this.audioSpeechSource.addEventListener('ended', () => {
3773
+ this.audioSpeechSource.disconnect();
3774
+ this.audioStartTime = null;
3775
+ this.currentAudioItem = null;
3776
+ // Ensure isSpeaking is true so startSpeaking() processes the queue after trimmed audio
3777
+ this.isSpeaking = true;
3778
+ this.playAudio(true);
3779
+ }, { once: true });
3780
+
3781
+ // Push trimmed animation data to queue
3782
+ if ( item.anim && item.anim.length > 0 ) {
3783
+ item.anim.forEach( animGroup => {
3784
+ if (animGroup && animGroup.ts && animGroup.ts.length > 0) {
3785
+ const animData = {
3786
+ template: animGroup.template,
3787
+ ts: animGroup.ts.map(ts => this.animClock + ts),
3788
+ vs: animGroup.vs
3789
+ };
3790
+ this.animQueue.push(animData);
3791
+ }
3792
+ });
3793
+ }
3794
+
3795
+ // Start playback; startDelay comes from pausedAudioData.delay, which pauseSpeaking() sets to 0
3796
+ this.audioSpeechSource.start(startDelay);
3797
+ return;
3798
+ }
3799
+
3725
3800
  if ( this.audioPlaylist.length ) {
3726
3801
  const item = this.audioPlaylist.shift();
3727
3802
 
@@ -3748,23 +3823,45 @@ class TalkingHead {
3748
3823
  audio = item.audio;
3749
3824
  }
3750
3825
 
3826
+ // Store current audio item and start time for pause/resume tracking
3827
+ this.currentAudioItem = {
3828
+ audio: audio,
3829
+ anim: item.anim ? JSON.parse(JSON.stringify(item.anim)) : null, // Deep copy
3830
+ text: item.text,
3831
+ delay: 0
3832
+ };
3833
+
3834
+ // Calculate delay for pre-animations
3835
+ let delay = 0;
3836
+ if ( item.anim ) {
3837
+ // Find the lowest negative time point, if any
3838
+ if ( !item.isRaw ) {
3839
+ delay = Math.abs(Math.min(0, ...item.anim.map( x => Math.min(...x.ts) ) ) );
3840
+ }
3841
+ this.currentAudioItem.delay = delay;
3842
+ }
3843
+
3751
3844
  // Create audio source
3752
3845
  this.audioSpeechSource = this.audioCtx.createBufferSource();
3753
3846
  this.audioSpeechSource.buffer = audio;
3754
3847
  this.audioSpeechSource.playbackRate.value = 1 / this.animSlowdownRate;
3755
3848
  this.audioSpeechSource.connect(this.audioAnalyzerNode);
3849
+
3850
+ // Track when audio starts playing (accounting for delay)
3851
+ const startDelay = delay / 1000;
3852
+ this.audioStartTime = this.audioCtx.currentTime + startDelay;
3853
+
3756
3854
  this.audioSpeechSource.addEventListener('ended', () => {
3757
3855
  this.audioSpeechSource.disconnect();
3856
+ this.audioStartTime = null;
3857
+ this.currentAudioItem = null;
3858
+ // Ensure isSpeaking is true so startSpeaking() processes the queue
3859
+ this.isSpeaking = true;
3758
3860
  this.playAudio(true);
3759
3861
  }, { once: true });
3760
3862
 
3761
3863
  // Rescale lipsync and push to queue
3762
- let delay = 0;
3763
3864
  if ( item.anim ) {
3764
- // Find the lowest negative time point, if any
3765
- if ( !item.isRaw ) {
3766
- delay = Math.abs(Math.min(0, ...item.anim.map( x => Math.min(...x.ts) ) ) );
3767
- }
3768
3865
  item.anim.forEach( x => {
3769
3866
  for(let i=0; i<x.ts.length; i++) {
3770
3867
  x.ts[i] = this.animClock + x.ts[i] + delay;
@@ -3773,8 +3870,8 @@ class TalkingHead {
3773
3870
  });
3774
3871
  }
3775
3872
 
3776
- // Play, dealy in seconds so pre-animations can be played
3777
- this.audioSpeechSource.start(delay/1000);
3873
+ // Play, delay in seconds so pre-animations can be played
3874
+ this.audioSpeechSource.start(startDelay);
3778
3875
 
3779
3876
  } else {
3780
3877
  this.isAudioPlaying = false;
@@ -4433,18 +4530,109 @@ class TalkingHead {
4433
4530
 
4434
4531
  /**
4435
4532
  * Pause speaking.
4533
+ * Returns paused audio data with trimmed buffer if audio was playing.
4436
4534
  */
4437
4535
  pauseSpeaking() {
4438
- try { this.audioSpeechSource.stop(); } catch(error) {}
4536
+ let pausedData = null;
4537
+
4538
+ // If audio is currently playing, calculate elapsed time and trim buffer
4539
+ if (this.audioSpeechSource && this.currentAudioItem && this.audioStartTime !== null) {
4540
+ try {
4541
+ const currentTime = this.audioCtx.currentTime;
4542
+ const elapsedTime = Math.max(0, currentTime - this.audioStartTime);
4543
+ const playbackRate = this.audioSpeechSource.playbackRate.value;
4544
+ const elapsedInBuffer = elapsedTime * playbackRate;
4545
+
4546
+ const originalBuffer = this.currentAudioItem.audio;
4547
+ const sampleRate = originalBuffer.sampleRate;
4548
+ const startSample = Math.floor(elapsedInBuffer * sampleRate);
4549
+
4550
+ // Only trim if we haven't played the entire buffer
4551
+ if (startSample < originalBuffer.length) {
4552
+ // Create trimmed buffer
4553
+ const trimmedLength = originalBuffer.length - startSample;
4554
+ const trimmedBuffer = this.audioCtx.createBuffer(
4555
+ originalBuffer.numberOfChannels,
4556
+ trimmedLength,
4557
+ sampleRate
4558
+ );
4559
+
4560
+ // Copy remaining samples
4561
+ for (let channel = 0; channel < originalBuffer.numberOfChannels; channel++) {
4562
+ const originalData = originalBuffer.getChannelData(channel);
4563
+ const trimmedData = trimmedBuffer.getChannelData(channel);
4564
+ for (let i = 0; i < trimmedLength; i++) {
4565
+ trimmedData[i] = originalData[startSample + i];
4566
+ }
4567
+ }
4568
+
4569
+ // Trim animation data (lip-sync) - adjust timestamps
4570
+ let trimmedAnim = null;
4571
+ if (this.currentAudioItem.anim) {
4572
+ // Calculate the absolute time when this audio started (for comparison)
4573
+ const audioStartAnimTime = this.animClock + this.currentAudioItem.delay;
4574
+ const elapsedAnimTime = elapsedTime * 1000; // Convert to ms
4575
+ const currentAnimTime = audioStartAnimTime + elapsedAnimTime;
4576
+
4577
+ trimmedAnim = this.currentAudioItem.anim.map(animGroup => {
4578
+ const trimmed = {
4579
+ template: animGroup.template,
4580
+ ts: [],
4581
+ vs: []
4582
+ };
4583
+
4584
+ // Find animations that haven't started yet
4585
+ // animGroup.ts contains absolute timestamps (already adjusted to animClock)
4586
+ for (let i = 0; i < animGroup.ts.length; i++) {
4587
+ const animTimestamp = animGroup.ts[i];
4588
+
4589
+ // If animation timestamp is in the future (hasn't happened yet)
4590
+ if (animTimestamp > currentAnimTime) {
4591
+ // Adjust to relative time from resume point (start from 0)
4592
+ const relativeTime = animTimestamp - currentAnimTime;
4593
+ trimmed.ts.push(relativeTime);
4594
+ trimmed.vs.push(animGroup.vs[i]);
4595
+ }
4596
+ }
4597
+
4598
+ return trimmed.ts.length > 0 ? trimmed : null;
4599
+ }).filter(x => x !== null);
4600
+ }
4601
+
4602
+ pausedData = {
4603
+ audio: trimmedBuffer,
4604
+ anim: trimmedAnim,
4605
+ text: this.currentAudioItem.text,
4606
+ delay: 0, // No delay needed for trimmed buffer
4607
+ elapsedTime: elapsedTime
4608
+ };
4609
+ }
4610
+
4611
+ this.audioSpeechSource.stop();
4612
+ } catch(error) {
4613
+ console.warn('Error trimming audio buffer on pause:', error);
4614
+ }
4615
+ } else {
4616
+ // No audio playing, just stop if source exists
4617
+ try { this.audioSpeechSource?.stop(); } catch(error) {}
4618
+ }
4619
+
4439
4620
  this.audioPlaylist.length = 0;
4440
4621
  this.stateName = 'idle';
4441
4622
  this.isSpeaking = false;
4442
4623
  this.isAudioPlaying = false;
4624
+ this.audioStartTime = null;
4625
+ this.currentAudioItem = null;
4626
+
4627
+ // Clear viseme animations but keep others
4443
4628
  this.animQueue = this.animQueue.filter( x => x.template.name !== 'viseme' && x.template.name !== 'subtitles' && x.template.name !== 'blendshapes' );
4629
+
4444
4630
  if ( this.armature ) {
4445
4631
  this.resetLips();
4446
4632
  this.render();
4447
4633
  }
4634
+
4635
+ return pausedData;
4448
4636
  }
4449
4637
 
4450
4638
  /**