@sage-rsc/talking-head-react 1.7.7 → 1.7.8

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sage-rsc/talking-head-react",
3
- "version": "1.7.7",
3
+ "version": "1.7.8",
4
4
  "description": "A reusable React component for 3D talking avatars with lip-sync and text-to-speech",
5
5
  "main": "./dist/index.cjs",
6
6
  "module": "./dist/index.js",
@@ -81,6 +81,8 @@ const SimpleTalkingAvatar = forwardRef(({
81
81
  const currentAnimationGroupRef = useRef(null);
82
82
  const playedAnimationsRef = useRef([]); // Track animations played during current speech
83
83
  const animationQueueRef = useRef([]); // Queue of animations to play in order
84
+ const currentSentenceIndexRef = useRef(0); // Track which sentence is currently playing
85
+ const pausedAudioDataRef = useRef(null); // Store trimmed audio buffer when paused
84
86
 
85
87
  // Keep ref in sync with state
86
88
  useEffect(() => {
@@ -612,6 +614,7 @@ const SimpleTalkingAvatar = forwardRef(({
612
614
  // Split text into sentences for tracking
613
615
  const sentences = textToSpeak.split(/[.!?]+/).filter(s => s.trim().length > 0);
614
616
  originalSentencesRef.current = sentences;
617
+ currentSentenceIndexRef.current = 0; // Reset sentence tracking
615
618
 
616
619
  const speakOptions = {
617
620
  lipsyncLang: options.lipsyncLang || 'en',
@@ -691,8 +694,8 @@ const SimpleTalkingAvatar = forwardRef(({
691
694
  try {
692
695
  // Check if currently speaking
693
696
  const isSpeaking = talkingHeadRef.current.isSpeaking || false;
694
- const audioPlaylist = talkingHeadRef.current.audioPlaylist || [];
695
- const speechQueue = talkingHeadRef.current.speechQueue || [];
697
+ const audioPlaylist = [...(talkingHeadRef.current.audioPlaylist || [])]; // Copy before pausing
698
+ const speechQueue = [...(talkingHeadRef.current.speechQueue || [])]; // Copy before clearing
696
699
 
697
700
  if (isSpeaking || audioPlaylist.length > 0 || speechQueue.length > 0) {
698
701
  // Clear speech end interval
@@ -701,51 +704,84 @@ const SimpleTalkingAvatar = forwardRef(({
701
704
  speechEndIntervalRef.current = null;
702
705
  }
703
706
 
704
- // Extract remaining text from speech queue (not yet sent to TTS)
705
- let remainingText = '';
706
- if (speechQueue.length > 0) {
707
- remainingText = speechQueue.map(item => {
708
- if (item.text && Array.isArray(item.text)) {
709
- return item.text.map(wordObj => wordObj.word).join(' ');
710
- }
711
- return item.text || '';
712
- }).join(' ');
713
- }
714
-
715
- // Extract text from audio playlist (currently playing or queued audio)
716
- // This includes the currently playing sentence if it was interrupted
717
- let audioPlaylistText = '';
718
- if (audioPlaylist.length > 0) {
719
- audioPlaylistText = audioPlaylist
720
- .map(item => {
721
- // Try to get text from the audio item
707
+ // IMPORTANT: Extract text BEFORE calling pauseSpeaking(), which clears audioPlaylist
708
+ // Track which sentences need to be re-spoken
709
+ const sentences = originalSentencesRef.current;
710
+ let remainingSentences = [];
711
+
712
+ // Check if audio is currently playing (item already shifted from playlist)
713
+ const isAudioCurrentlyPlaying = talkingHeadRef.current.isAudioPlaying || false;
714
+
715
+ if (sentences.length > 0) {
716
+ // Calculate which sentence is currently playing
717
+ // Total sentences = sentences.length
718
+ // Queued in audioPlaylist = audioPlaylist.length (already processed by TTS, waiting to play)
719
+ // Queued in speechQueue = speechQueue.length (not yet sent to TTS)
720
+ // Currently playing = 1 (if isAudioPlaying is true)
721
+
722
+ const queuedCount = audioPlaylist.length + speechQueue.length;
723
+ const currentlyPlayingCount = isAudioCurrentlyPlaying ? 1 : 0;
724
+ const processedCount = sentences.length - queuedCount - currentlyPlayingCount;
725
+
726
+ // If audio is currently playing, we're mid-sentence - restart from current sentence
727
+ // Otherwise, continue from next sentence
728
+ const startIndex = isAudioCurrentlyPlaying ? processedCount : processedCount + currentlyPlayingCount;
729
+
730
+ if (startIndex < sentences.length) {
731
+ remainingSentences = sentences.slice(startIndex);
732
+ }
733
+ } else {
734
+ // Fallback: Extract text from queues if we don't have original sentences
735
+ // Extract text from audio playlist (queued audio, not yet playing)
736
+ if (audioPlaylist.length > 0) {
737
+ audioPlaylist.forEach(item => {
738
+ if (item.text) {
739
+ if (Array.isArray(item.text)) {
740
+ const sentenceText = item.text.map(wordObj => wordObj.word).join(' ');
741
+ if (sentenceText.trim()) {
742
+ remainingSentences.push(sentenceText);
743
+ }
744
+ } else if (item.text.trim()) {
745
+ remainingSentences.push(item.text);
746
+ }
747
+ }
748
+ });
749
+ }
750
+
751
+ // Extract remaining text from speech queue (not yet sent to TTS)
752
+ if (speechQueue.length > 0) {
753
+ speechQueue.forEach(item => {
722
754
  if (item.text) {
723
755
  if (Array.isArray(item.text)) {
724
- return item.text.map(wordObj => wordObj.word).join(' ');
756
+ const sentenceText = item.text.map(wordObj => wordObj.word).join(' ');
757
+ if (sentenceText.trim()) {
758
+ remainingSentences.push(sentenceText);
759
+ }
760
+ } else if (item.text.trim()) {
761
+ remainingSentences.push(item.text);
725
762
  }
726
- return item.text;
727
763
  }
728
- return '';
729
- })
730
- .filter(text => text.trim().length > 0)
731
- .join(' ');
764
+ });
765
+ }
732
766
  }
733
-
734
- // Combine: if audio is playing, include that text first, then remaining queue text
735
- const combinedRemainingText = audioPlaylistText
736
- ? (audioPlaylistText + (remainingText ? ' ' + remainingText : ''))
737
- : remainingText;
767
+
768
+ // Combine remaining sentences
769
+ const combinedRemainingText = remainingSentences.join(' ');
738
770
 
739
771
  // Store progress for resume
740
772
  speechProgressRef.current = {
741
773
  remainingText: combinedRemainingText || null,
742
774
  originalText: pausedSpeechRef.current?.text || null,
743
- options: pausedSpeechRef.current?.options || null
775
+ options: pausedSpeechRef.current?.options || null,
776
+ // Track if we're pausing mid-sentence (has currently playing audio)
777
+ isMidSentence: audioPlaylist.length > 0
744
778
  };
745
779
 
746
- // Clear speech queue and pause
780
+ // Clear speech queue and pause (this will stop audio and clear audioPlaylist)
781
+ // pauseSpeaking() now returns trimmed audio data if audio was playing
747
782
  talkingHeadRef.current.speechQueue.length = 0;
748
- talkingHeadRef.current.pauseSpeaking();
783
+ const pausedAudioData = talkingHeadRef.current.pauseSpeaking();
784
+ pausedAudioDataRef.current = pausedAudioData; // Store trimmed buffer for exact resume
749
785
  setIsPaused(true);
750
786
  isPausedRef.current = true;
751
787
  }
@@ -765,7 +801,15 @@ const SimpleTalkingAvatar = forwardRef(({
765
801
  setIsPaused(false);
766
802
  isPausedRef.current = false;
767
803
 
768
- // Determine what text to speak
804
+ // If we have trimmed audio data from pause, resume from exact position
805
+ if (pausedAudioDataRef.current && pausedAudioDataRef.current.audio) {
806
+ // Resume with trimmed buffer (exact position)
807
+ await talkingHeadRef.current.playAudio(false, pausedAudioDataRef.current);
808
+ pausedAudioDataRef.current = null; // Clear after use
809
+ return;
810
+ }
811
+
812
+ // Otherwise, resume from remaining text (fallback)
769
813
  const remainingText = speechProgressRef.current?.remainingText;
770
814
  const originalText = speechProgressRef.current?.originalText || pausedSpeechRef.current?.text;
771
815
  const originalOptions = speechProgressRef.current?.options || pausedSpeechRef.current?.options || {};
@@ -775,10 +819,14 @@ const SimpleTalkingAvatar = forwardRef(({
775
819
  if (textToSpeak) {
776
820
  speakText(textToSpeak, originalOptions);
777
821
  }
822
+
823
+ // Clear paused audio data
824
+ pausedAudioDataRef.current = null;
778
825
  } catch (err) {
779
826
  console.warn('Error resuming speech:', err);
780
827
  setIsPaused(false);
781
828
  isPausedRef.current = false;
829
+ pausedAudioDataRef.current = null;
782
830
  }
783
831
  }, [isPaused, speakText, resumeAudioContext]);
784
832
 
@@ -835,6 +835,11 @@ class TalkingHead {
835
835
  this.speechQueue = [];
836
836
  this.isSpeaking = false;
837
837
  this.isListening = false;
838
+
839
+ // Pause/resume tracking for buffer trimming
840
+ this.audioStartTime = null; // When current audio started playing
841
+ this.currentAudioItem = null; // Current audio item being played
842
+ this.pausedAudioData = null; // Stored trimmed buffer when paused
838
843
 
839
844
  // Setup Google text-to-speech
840
845
  if ( this.opt.ttsEndpoint ) {
@@ -3718,10 +3723,78 @@ class TalkingHead {
3718
3723
  /**
3719
3724
  * Play audio playlist using Web Audio API.
3720
3725
  * @param {boolean} [force=false] If true, forces to proceed
3726
+ * @param {Object} [pausedAudioData=null] Trimmed audio data from pause to resume from exact position
3721
3727
  */
3722
- async playAudio(force=false) {
3728
+ async playAudio(force=false, pausedAudioData=null) {
3723
3729
  if ( !this.armature || (this.isAudioPlaying && !force) ) return;
3724
3730
  this.isAudioPlaying = true;
3731
+
3732
+ // If we have paused audio data, play that first (resume from exact position)
3733
+ if (pausedAudioData && pausedAudioData.audio) {
3734
+ const item = {
3735
+ audio: pausedAudioData.audio,
3736
+ anim: pausedAudioData.anim,
3737
+ text: pausedAudioData.text,
3738
+ delay: pausedAudioData.delay || 0,
3739
+ isRaw: false
3740
+ };
3741
+
3742
+ // If Web Audio API is suspended, try to resume it
3743
+ if ( this.audioCtx.state === "suspended" || this.audioCtx.state === "interrupted" ) {
3744
+ const resume = this.audioCtx.resume();
3745
+ const timeout = new Promise((_r, rej) => setTimeout(() => rej("p2"), 1000));
3746
+ try {
3747
+ await Promise.race([resume, timeout]);
3748
+ } catch(e) {
3749
+ console.log("Can't play audio. Web Audio API suspended.");
3750
+ this.playAudio(true);
3751
+ return;
3752
+ }
3753
+ }
3754
+
3755
+ // Store current audio item and start time
3756
+ this.currentAudioItem = {
3757
+ audio: item.audio,
3758
+ anim: item.anim ? JSON.parse(JSON.stringify(item.anim)) : null,
3759
+ text: item.text,
3760
+ delay: item.delay
3761
+ };
3762
+
3763
+ // Create audio source
3764
+ this.audioSpeechSource = this.audioCtx.createBufferSource();
3765
+ this.audioSpeechSource.buffer = item.audio;
3766
+ this.audioSpeechSource.playbackRate.value = 1 / this.animSlowdownRate;
3767
+ this.audioSpeechSource.connect(this.audioAnalyzerNode);
3768
+
3769
+ const startDelay = item.delay / 1000;
3770
+ this.audioStartTime = this.audioCtx.currentTime + startDelay;
3771
+
3772
+ this.audioSpeechSource.addEventListener('ended', () => {
3773
+ this.audioSpeechSource.disconnect();
3774
+ this.audioStartTime = null;
3775
+ this.currentAudioItem = null;
3776
+ this.playAudio(true);
3777
+ }, { once: true });
3778
+
3779
+ // Push trimmed animation data to queue
3780
+ if ( item.anim && item.anim.length > 0 ) {
3781
+ item.anim.forEach( animGroup => {
3782
+ if (animGroup && animGroup.ts && animGroup.ts.length > 0) {
3783
+ const animData = {
3784
+ template: animGroup.template,
3785
+ ts: animGroup.ts.map(ts => this.animClock + ts),
3786
+ vs: animGroup.vs
3787
+ };
3788
+ this.animQueue.push(animData);
3789
+ }
3790
+ });
3791
+ }
3792
+
3793
+ // Play immediately (no delay for resumed audio)
3794
+ this.audioSpeechSource.start(startDelay);
3795
+ return;
3796
+ }
3797
+
3725
3798
  if ( this.audioPlaylist.length ) {
3726
3799
  const item = this.audioPlaylist.shift();
3727
3800
 
@@ -3748,23 +3821,43 @@ class TalkingHead {
3748
3821
  audio = item.audio;
3749
3822
  }
3750
3823
 
3824
+ // Store current audio item and start time for pause/resume tracking
3825
+ this.currentAudioItem = {
3826
+ audio: audio,
3827
+ anim: item.anim ? JSON.parse(JSON.stringify(item.anim)) : null, // Deep copy
3828
+ text: item.text,
3829
+ delay: 0
3830
+ };
3831
+
3832
+ // Calculate delay for pre-animations
3833
+ let delay = 0;
3834
+ if ( item.anim ) {
3835
+ // Find the lowest negative time point, if any
3836
+ if ( !item.isRaw ) {
3837
+ delay = Math.abs(Math.min(0, ...item.anim.map( x => Math.min(...x.ts) ) ) );
3838
+ }
3839
+ this.currentAudioItem.delay = delay;
3840
+ }
3841
+
3751
3842
  // Create audio source
3752
3843
  this.audioSpeechSource = this.audioCtx.createBufferSource();
3753
3844
  this.audioSpeechSource.buffer = audio;
3754
3845
  this.audioSpeechSource.playbackRate.value = 1 / this.animSlowdownRate;
3755
3846
  this.audioSpeechSource.connect(this.audioAnalyzerNode);
3847
+
3848
+ // Track when audio starts playing (accounting for delay)
3849
+ const startDelay = delay / 1000;
3850
+ this.audioStartTime = this.audioCtx.currentTime + startDelay;
3851
+
3756
3852
  this.audioSpeechSource.addEventListener('ended', () => {
3757
3853
  this.audioSpeechSource.disconnect();
3854
+ this.audioStartTime = null;
3855
+ this.currentAudioItem = null;
3758
3856
  this.playAudio(true);
3759
3857
  }, { once: true });
3760
3858
 
3761
3859
  // Rescale lipsync and push to queue
3762
- let delay = 0;
3763
3860
  if ( item.anim ) {
3764
- // Find the lowest negative time point, if any
3765
- if ( !item.isRaw ) {
3766
- delay = Math.abs(Math.min(0, ...item.anim.map( x => Math.min(...x.ts) ) ) );
3767
- }
3768
3861
  item.anim.forEach( x => {
3769
3862
  for(let i=0; i<x.ts.length; i++) {
3770
3863
  x.ts[i] = this.animClock + x.ts[i] + delay;
@@ -3773,8 +3866,8 @@ class TalkingHead {
3773
3866
  });
3774
3867
  }
3775
3868
 
3776
- // Play, dealy in seconds so pre-animations can be played
3777
- this.audioSpeechSource.start(delay/1000);
3869
+ // Play, delay in seconds so pre-animations can be played
3870
+ this.audioSpeechSource.start(startDelay);
3778
3871
 
3779
3872
  } else {
3780
3873
  this.isAudioPlaying = false;
@@ -4433,18 +4526,109 @@ class TalkingHead {
4433
4526
 
4434
4527
  /**
4435
4528
  * Pause speaking.
4529
+ * Returns paused audio data with trimmed buffer if audio was playing.
4436
4530
  */
4437
4531
  pauseSpeaking() {
4438
- try { this.audioSpeechSource.stop(); } catch(error) {}
4532
+ let pausedData = null;
4533
+
4534
+ // If audio is currently playing, calculate elapsed time and trim buffer
4535
+ if (this.audioSpeechSource && this.currentAudioItem && this.audioStartTime !== null) {
4536
+ try {
4537
+ const currentTime = this.audioCtx.currentTime;
4538
+ const elapsedTime = Math.max(0, currentTime - this.audioStartTime);
4539
+ const playbackRate = this.audioSpeechSource.playbackRate.value;
4540
+ const elapsedInBuffer = elapsedTime * playbackRate;
4541
+
4542
+ const originalBuffer = this.currentAudioItem.audio;
4543
+ const sampleRate = originalBuffer.sampleRate;
4544
+ const startSample = Math.floor(elapsedInBuffer * sampleRate);
4545
+
4546
+ // Only trim if we haven't played the entire buffer
4547
+ if (startSample < originalBuffer.length) {
4548
+ // Create trimmed buffer
4549
+ const trimmedLength = originalBuffer.length - startSample;
4550
+ const trimmedBuffer = this.audioCtx.createBuffer(
4551
+ originalBuffer.numberOfChannels,
4552
+ trimmedLength,
4553
+ sampleRate
4554
+ );
4555
+
4556
+ // Copy remaining samples
4557
+ for (let channel = 0; channel < originalBuffer.numberOfChannels; channel++) {
4558
+ const originalData = originalBuffer.getChannelData(channel);
4559
+ const trimmedData = trimmedBuffer.getChannelData(channel);
4560
+ for (let i = 0; i < trimmedLength; i++) {
4561
+ trimmedData[i] = originalData[startSample + i];
4562
+ }
4563
+ }
4564
+
4565
+ // Trim animation data (lip-sync) - adjust timestamps
4566
+ let trimmedAnim = null;
4567
+ if (this.currentAudioItem.anim) {
4568
+ // Calculate the absolute time when this audio started (for comparison)
4569
+ const audioStartAnimTime = this.animClock + this.currentAudioItem.delay;
4570
+ const elapsedAnimTime = elapsedTime * 1000; // Convert to ms
4571
+ const currentAnimTime = audioStartAnimTime + elapsedAnimTime;
4572
+
4573
+ trimmedAnim = this.currentAudioItem.anim.map(animGroup => {
4574
+ const trimmed = {
4575
+ template: animGroup.template,
4576
+ ts: [],
4577
+ vs: []
4578
+ };
4579
+
4580
+ // Find animations that haven't started yet
4581
+ // animGroup.ts contains absolute timestamps (already adjusted to animClock)
4582
+ for (let i = 0; i < animGroup.ts.length; i++) {
4583
+ const animTimestamp = animGroup.ts[i];
4584
+
4585
+ // If animation timestamp is in the future (hasn't happened yet)
4586
+ if (animTimestamp > currentAnimTime) {
4587
+ // Adjust to relative time from resume point (start from 0)
4588
+ const relativeTime = animTimestamp - currentAnimTime;
4589
+ trimmed.ts.push(relativeTime);
4590
+ trimmed.vs.push(animGroup.vs[i]);
4591
+ }
4592
+ }
4593
+
4594
+ return trimmed.ts.length > 0 ? trimmed : null;
4595
+ }).filter(x => x !== null);
4596
+ }
4597
+
4598
+ pausedData = {
4599
+ audio: trimmedBuffer,
4600
+ anim: trimmedAnim,
4601
+ text: this.currentAudioItem.text,
4602
+ delay: 0, // No delay needed for trimmed buffer
4603
+ elapsedTime: elapsedTime
4604
+ };
4605
+ }
4606
+
4607
+ this.audioSpeechSource.stop();
4608
+ } catch(error) {
4609
+ console.warn('Error trimming audio buffer on pause:', error);
4610
+ }
4611
+ } else {
4612
+ // No audio playing, just stop if source exists
4613
+ try { this.audioSpeechSource?.stop(); } catch(error) {}
4614
+ }
4615
+
4439
4616
  this.audioPlaylist.length = 0;
4440
4617
  this.stateName = 'idle';
4441
4618
  this.isSpeaking = false;
4442
4619
  this.isAudioPlaying = false;
4620
+ this.audioStartTime = null;
4621
+ this.currentAudioItem = null;
4622
+
4623
+ // Clear viseme animations but keep others
4443
4624
  this.animQueue = this.animQueue.filter( x => x.template.name !== 'viseme' && x.template.name !== 'subtitles' && x.template.name !== 'blendshapes' );
4625
+
4444
4626
  if ( this.armature ) {
4445
4627
  this.resetLips();
4446
4628
  this.render();
4447
4629
  }
4630
+
4631
+ return pausedData;
4448
4632
  }
4449
4633
 
4450
4634
  /**