@wq-hook/volcano-react 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -32,10 +32,8 @@ var index_exports = {};
32
32
  __export(index_exports, {
33
33
  AudioProgressBar: () => AudioProgressBar_default,
34
34
  AudioWaveVisualizer: () => AudioWaveVisualizer_default,
35
+ StreamPlaybackManager: () => StreamPlaybackManager,
35
36
  StreamingTextSplitter: () => StreamingTextSplitter,
36
- clearSessionAudioCache: () => clearSessionAudioCache,
37
- findSessionCacheByText: () => findSessionCacheByText,
38
- getSessionAudioCache: () => getSessionAudioCache,
39
37
  splitTextByDelimiters: () => splitTextByDelimiters,
40
38
  useMessageTTS: () => useMessageTTS,
41
39
  useStreamTTS: () => useStreamTTS,
@@ -442,10 +440,253 @@ function useVolcanoTTS({
442
440
  }
443
441
 
444
442
  // src/tts/useMessageTTS.ts
443
+ var import_react3 = require("react");
444
+
445
+ // src/tts/StreamPlaybackManager.ts
445
446
  var import_tts2 = require("@wq-hook/volcano-sdk/tts");
447
+
448
+ // src/tts/StreamingTextSplitter.ts
446
449
  var import_volcano_sdk2 = require("@wq-hook/volcano-sdk");
447
- var import_react3 = require("react");
448
450
  var import_emoji_regex2 = __toESM(require("emoji-regex"));
451
+ var StreamingTextSplitter = class {
452
+ constructor(options = {}) {
453
+ /** 当前缓冲区 */
454
+ this.buffer = "";
455
+ /** 分段索引计数器 */
456
+ this.segmentIndex = 0;
457
+ /** 已完成的分段列表 */
458
+ this.segments = [];
459
+ /** 是否已完成 */
460
+ this.isCompleted = false;
461
+ this.maxLength = options.maxLength || 150;
462
+ this.minLength = options.minLength || 10;
463
+ this.onSegmentComplete = options.onSegmentComplete;
464
+ this.onAllComplete = options.onAllComplete;
465
+ }
466
+ /**
467
+ * 接收流式文本块
468
+ * @param chunk - 文本块
469
+ */
470
+ onChunk(chunk) {
471
+ if (!chunk || this.isCompleted) return;
472
+ this.buffer += chunk;
473
+ if (this.detectBoundary(chunk)) {
474
+ const newlineIndex = this.buffer.indexOf("\n");
475
+ if (newlineIndex !== -1) {
476
+ if (newlineIndex === 0) {
477
+ this.buffer = this.buffer.substring(1);
478
+ return;
479
+ }
480
+ const segmentBuffer = this.buffer.substring(0, newlineIndex);
481
+ this.buffer = this.buffer.substring(newlineIndex + 1);
482
+ this.flushSegmentWithBuffer(segmentBuffer);
483
+ while (this.buffer.includes("\n")) {
484
+ const nextNewlineIndex = this.buffer.indexOf("\n");
485
+ if (nextNewlineIndex === 0) {
486
+ this.buffer = this.buffer.substring(1);
487
+ continue;
488
+ }
489
+ const nextSegmentBuffer = this.buffer.substring(0, nextNewlineIndex);
490
+ this.buffer = this.buffer.substring(nextNewlineIndex + 1);
491
+ this.flushSegmentWithBuffer(nextSegmentBuffer);
492
+ }
493
+ }
494
+ }
495
+ }
496
+ /**
497
+ * 检测分段边界
498
+ * @param chunk - 最新接收的文本块
499
+ * @returns 是否应该分段
500
+ */
501
+ detectBoundary(chunk) {
502
+ if (chunk.includes("\n")) {
503
+ if (this.buffer.length >= this.maxLength) {
504
+ this.forceSplitAtSentenceBoundary();
505
+ }
506
+ return true;
507
+ }
508
+ if (this.buffer.length >= this.maxLength) {
509
+ this.forceSplitAtSentenceBoundary();
510
+ return true;
511
+ }
512
+ return false;
513
+ }
514
+ /**
515
+ * 在句子边界强制拆分超长段落
516
+ */
517
+ forceSplitAtSentenceBoundary() {
518
+ const content = this.buffer;
519
+ const sentenceEnders = /[。?!]/g;
520
+ let lastMatch = null;
521
+ let match = null;
522
+ while ((match = sentenceEnders.exec(content)) !== null) {
523
+ lastMatch = match;
524
+ }
525
+ if (lastMatch && lastMatch.index > this.minLength) {
526
+ const splitPoint = lastMatch.index + 1;
527
+ const firstPart = content.substring(0, splitPoint);
528
+ const secondPart = content.substring(splitPoint);
529
+ this.buffer = firstPart;
530
+ this.flushSegment();
531
+ this.buffer = secondPart;
532
+ } else {
533
+ const midPoint = Math.floor(content.length / 2);
534
+ const firstPart = content.substring(0, midPoint);
535
+ const secondPart = content.substring(midPoint);
536
+ this.buffer = firstPart;
537
+ this.flushSegment();
538
+ this.buffer = secondPart;
539
+ }
540
+ }
541
+ /**
542
+ * 使用指定缓冲区内容刷新为分段
543
+ * @param bufferToFlush - 要分段的缓冲区内容
544
+ */
545
+ flushSegmentWithBuffer(bufferToFlush) {
546
+ const content = bufferToFlush;
547
+ if (!content) return;
548
+ const isPureSymbols = /^[^\p{L}\p{N}]*$/u.test(content);
549
+ const isTooShort = content.length < 3;
550
+ if (isPureSymbols && isTooShort) {
551
+ return;
552
+ }
553
+ const formattedContent = import_volcano_sdk2.MarkdownFormatter.format(content).replace((0, import_emoji_regex2.default)(), "");
554
+ if (!formattedContent) return;
555
+ let subSegments = [formattedContent];
556
+ if (formattedContent.length > this.maxLength) {
557
+ subSegments = this.splitLongSegment(formattedContent);
558
+ }
559
+ for (const subSegment of subSegments) {
560
+ if (!subSegment) continue;
561
+ const segment = {
562
+ index: this.segmentIndex++,
563
+ content: subSegment,
564
+ length: subSegment.length,
565
+ sent: false
566
+ };
567
+ this.segments.push(segment);
568
+ this.onSegmentComplete?.(segment);
569
+ }
570
+ }
571
+ /**
572
+ * 刷新当前缓冲区为分段
573
+ */
574
+ flushSegment() {
575
+ const content = this.buffer.trim();
576
+ if (!content) {
577
+ this.buffer = "";
578
+ return;
579
+ }
580
+ const isPureSymbols = /^[^\p{L}\p{N}]*$/u.test(content);
581
+ const isTooShort = content.length < 3;
582
+ if (isPureSymbols && isTooShort) {
583
+ this.buffer = "";
584
+ return;
585
+ }
586
+ const formattedContent = import_volcano_sdk2.MarkdownFormatter.format(content).replace((0, import_emoji_regex2.default)(), "");
587
+ if (!formattedContent) {
588
+ this.buffer = "";
589
+ return;
590
+ }
591
+ let subSegments = [formattedContent];
592
+ if (formattedContent.length > this.maxLength) {
593
+ subSegments = this.splitLongSegment(formattedContent);
594
+ }
595
+ for (const subSegment of subSegments) {
596
+ if (!subSegment) continue;
597
+ const segment = {
598
+ index: this.segmentIndex++,
599
+ content: subSegment,
600
+ length: subSegment.length,
601
+ sent: false
602
+ };
603
+ this.segments.push(segment);
604
+ this.onSegmentComplete?.(segment);
605
+ }
606
+ this.buffer = "";
607
+ }
608
+ /**
609
+ * 拆分超长分段
610
+ * @param segment - 超长的分段
611
+ * @returns 拆分后的分段数组
612
+ */
613
+ splitLongSegment(segment) {
614
+ const result = [];
615
+ let current = "";
616
+ for (const char of segment) {
617
+ current += char;
618
+ const shouldSplit = /[。?!,,]/.test(char);
619
+ if (shouldSplit && current.length <= this.maxLength) {
620
+ result.push(current);
621
+ current = "";
622
+ } else if (current.length >= this.maxLength) {
623
+ result.push(current);
624
+ current = "";
625
+ }
626
+ }
627
+ if (current) {
628
+ result.push(current);
629
+ }
630
+ return result.filter((s) => s.length > 0);
631
+ }
632
+ /**
633
+ * 完成流式输入
634
+ * 处理剩余的缓冲区内容
635
+ */
636
+ complete() {
637
+ if (this.isCompleted) return;
638
+ this.isCompleted = true;
639
+ while (this.buffer.includes("\n")) {
640
+ const newlineIndex = this.buffer.indexOf("\n");
641
+ if (newlineIndex === 0) {
642
+ this.buffer = this.buffer.substring(1);
643
+ continue;
644
+ }
645
+ const segmentBuffer = this.buffer.substring(0, newlineIndex);
646
+ this.buffer = this.buffer.substring(newlineIndex + 1);
647
+ this.flushSegmentWithBuffer(segmentBuffer);
648
+ }
649
+ if (this.buffer.trim()) {
650
+ this.flushSegment();
651
+ }
652
+ this.onAllComplete?.(this.segments);
653
+ }
654
+ /**
655
+ * 重置分段器状态
656
+ */
657
+ reset() {
658
+ this.buffer = "";
659
+ this.segmentIndex = 0;
660
+ this.segments = [];
661
+ this.isCompleted = false;
662
+ }
663
+ /**
664
+ * 获取当前缓冲区内容
665
+ */
666
+ getBuffer() {
667
+ return this.buffer;
668
+ }
669
+ /**
670
+ * 获取已分段的列表
671
+ */
672
+ getSegments() {
673
+ return this.segments;
674
+ }
675
+ /**
676
+ * 获取统计信息
677
+ */
678
+ getStats() {
679
+ return {
680
+ bufferLength: this.buffer.length,
681
+ segmentCount: this.segments.length,
682
+ totalChars: this.segments.reduce((sum, seg) => sum + seg.length, 0)
683
+ };
684
+ }
685
+ };
686
+
687
+ // src/tts/StreamPlaybackManager.ts
688
+ var import_emoji_regex3 = __toESM(require("emoji-regex"));
689
+ var import_volcano_sdk3 = require("@wq-hook/volcano-sdk");
449
690
 
450
691
  // src/tts/TextSplitter.ts
451
692
  function splitTextByDelimiters(text, minLength = 10, maxLength = 150) {
@@ -515,338 +756,178 @@ function splitTextByDelimiters(text, minLength = 10, maxLength = 150) {
515
756
  return segments;
516
757
  }
517
758
 
518
- // src/tts/Metrics.ts
519
- var NoopMetricsCollector = class {
520
- record(_metric) {
521
- }
522
- };
523
-
524
- // src/tts/useMessageTTS.ts
759
+ // src/tts/StreamPlaybackManager.ts
525
760
  var WS_URL = "wss://openspeech.bytedance.com/api/v3/tts/bidirection";
526
- var activeInstances = /* @__PURE__ */ new Map();
527
761
  function buildFullUrl2(url, params) {
528
- const { ...auth } = params;
529
762
  const arr = [];
530
- for (const key in auth) {
531
- if (Object.prototype.hasOwnProperty.call(auth, key)) {
763
+ for (const key in params) {
764
+ if (Object.prototype.hasOwnProperty.call(params, key)) {
532
765
  arr.push(
533
- `${key}=${encodeURIComponent(auth[key])}`
766
+ `${key}=${encodeURIComponent(params[key])}`
534
767
  );
535
768
  }
536
769
  }
537
770
  return `${url}?${arr.join("&")}`;
538
771
  }
539
- function useMessageTTS({
540
- ttsConfig,
541
- audioParams,
542
- autoPlay = true,
543
- metricsCollector = new NoopMetricsCollector(),
544
- onPlayStart,
545
- onPlayPause,
546
- onPlayResume,
547
- onPlayEnd,
548
- onError,
549
- exclusive = true,
550
- fallbackVoice,
551
- visualization
552
- }) {
553
- const [isPlaying, setIsPlaying] = (0, import_react3.useState)(false);
554
- const [isPaused, setIsPaused] = (0, import_react3.useState)(false);
555
- const [isSynthesizing, setIsSynthesizing] = (0, import_react3.useState)(false);
556
- const [error, setErrorState] = (0, import_react3.useState)(null);
557
- const [progress, setProgress] = (0, import_react3.useState)(0);
558
- const [visualizationData, setVisualizationData] = (0, import_react3.useState)(
559
- {
560
- frequencyData: new Uint8Array(0),
561
- timeDomainData: new Uint8Array(0)
562
- }
563
- );
564
- const instanceId = (0, import_react3.useRef)(
565
- `tts-${Date.now()}-${Math.random().toString(36).slice(2)}`
566
- ).current;
567
- const clientRef = (0, import_react3.useRef)(null);
568
- const audioRef = (0, import_react3.useRef)(null);
569
- const audioContextRef = (0, import_react3.useRef)(null);
570
- const analyserRef = (0, import_react3.useRef)(null);
571
- const sourceRef = (0, import_react3.useRef)(null);
572
- const audioUrlRef = (0, import_react3.useRef)(null);
573
- const cacheKeyRef = (0, import_react3.useRef)("");
574
- const audioBuffersRef = (0, import_react3.useRef)([]);
575
- const isFallbackRef = (0, import_react3.useRef)(false);
576
- const fallbackUtteranceRef = (0, import_react3.useRef)(null);
577
- const stopOthers = (0, import_react3.useCallback)(() => {
578
- if (!exclusive) return;
579
- activeInstances.forEach((instance, id) => {
580
- if (id !== instanceId) {
581
- instance.pause();
582
- }
583
- });
584
- }, [exclusive, instanceId]);
585
- const initAudioContext = (0, import_react3.useCallback)(() => {
586
- if (!audioRef.current) return;
587
- if (!audioContextRef.current) {
772
+ var PlaybackSession = class {
773
+ constructor(id, config) {
774
+ this.listeners = /* @__PURE__ */ new Set();
775
+ this.audioContext = null;
776
+ this.analyser = null;
777
+ this.source = null;
778
+ this.audioUrl = null;
779
+ // TTS Resources
780
+ this.client = null;
781
+ this.splitter = null;
782
+ // Internal State
783
+ this.segmentQueue = [];
784
+ this.isSending = false;
785
+ this.isSessionStarting = false;
786
+ this.streamText = "";
787
+ this.sessionAudioBuffers = [];
788
+ this.isStreamFinished = false;
789
+ this.isSessionFinished = false;
790
+ this.resolveAllSegmentsSent = null;
791
+ this.animId = null;
792
+ this.lastVisUpdate = 0;
793
+ this.id = id;
794
+ this.config = config;
795
+ this.state = {
796
+ isPlaying: false,
797
+ isPaused: false,
798
+ isSynthesizing: false,
799
+ progress: 0,
800
+ visualizationData: {
801
+ frequencyData: new Uint8Array(0),
802
+ timeDomainData: new Uint8Array(0)
803
+ },
804
+ error: null,
805
+ isConnected: false,
806
+ isSessionStarted: false,
807
+ isStreamFinished: false
808
+ };
809
+ this.audio = new Audio();
810
+ this.audio.crossOrigin = "anonymous";
811
+ this.setupAudioListeners();
812
+ }
813
+ /**
814
+ * 初始化 AudioContext(用于可视化)
815
+ */
816
+ initAudioContext() {
817
+ if (!this.audioContext) {
588
818
  const AudioContextClass = window.AudioContext || window.webkitAudioContext;
589
- audioContextRef.current = new AudioContextClass();
819
+ this.audioContext = new AudioContextClass();
590
820
  }
591
- if (audioContextRef.current.state === "suspended") {
592
- audioContextRef.current.resume();
821
+ if (this.audioContext.state === "suspended") {
822
+ this.audioContext.resume();
593
823
  }
594
- if (!analyserRef.current) {
595
- analyserRef.current = audioContextRef.current.createAnalyser();
596
- analyserRef.current.fftSize = visualization?.fftSize || 256;
824
+ if (!this.analyser && this.audioContext) {
825
+ this.analyser = this.audioContext.createAnalyser();
826
+ this.analyser.fftSize = this.config.visualization?.fftSize || 256;
597
827
  }
598
- if (!sourceRef.current) {
828
+ if (!this.source && this.audioContext && this.analyser) {
599
829
  try {
600
- sourceRef.current = audioContextRef.current.createMediaElementSource(
601
- audioRef.current
602
- );
603
- sourceRef.current.connect(analyserRef.current);
604
- analyserRef.current.connect(audioContextRef.current.destination);
830
+ this.source = this.audioContext.createMediaElementSource(this.audio);
831
+ this.source.connect(this.analyser);
832
+ this.analyser.connect(this.audioContext.destination);
605
833
  } catch (e) {
606
834
  }
607
835
  }
608
- }, []);
609
- const cleanupAudio = (0, import_react3.useCallback)(() => {
610
- if (audioUrlRef.current) {
611
- URL.revokeObjectURL(audioUrlRef.current);
612
- audioUrlRef.current = null;
613
- }
614
- if (audioRef.current) {
615
- audioRef.current.onerror = null;
616
- audioRef.current.onended = null;
617
- audioRef.current.onpause = null;
618
- audioRef.current.onplay = null;
619
- audioRef.current.ontimeupdate = null;
620
- audioRef.current.pause();
621
- audioRef.current.src = "";
622
- audioRef.current = null;
623
- }
624
- if (sourceRef.current) {
625
- try {
626
- sourceRef.current.disconnect();
627
- } catch (e) {
836
+ }
837
+ setupAudioListeners() {
838
+ this.audio.onplay = () => {
839
+ this.updateState({ isPlaying: true, isPaused: false });
840
+ this.config.onPlayStart?.();
841
+ this.initAudioContext();
842
+ this.startVisualizationLoop();
843
+ };
844
+ this.audio.onpause = () => {
845
+ this.updateState({ isPaused: true, isPlaying: false });
846
+ this.config.onPlayPause?.();
847
+ };
848
+ this.audio.onended = () => {
849
+ this.updateState({
850
+ isPlaying: false,
851
+ isPaused: false,
852
+ isSynthesizing: false,
853
+ progress: 100
854
+ });
855
+ this.config.onPlayEnd?.();
856
+ this.stopVisualizationLoop();
857
+ };
858
+ this.audio.onerror = (e) => {
859
+ const msg = this.audio.error?.message || "Audio playback error";
860
+ console.error("[PlaybackSession] Audio error:", msg);
861
+ this.updateState({ error: msg });
862
+ this.config.onError?.(new Error(msg));
863
+ };
864
+ this.audio.ontimeupdate = () => {
865
+ let duration = this.audio.duration;
866
+ if (!isFinite(duration) && this.audio.buffered.length > 0) {
867
+ duration = this.audio.buffered.end(this.audio.buffered.length - 1);
628
868
  }
629
- sourceRef.current = null;
630
- }
631
- if (fallbackUtteranceRef.current) {
632
- window.speechSynthesis.cancel();
633
- fallbackUtteranceRef.current = null;
634
- }
635
- isFallbackRef.current = false;
636
- }, []);
637
- const stop = (0, import_react3.useCallback)(() => {
638
- if (clientRef.current) {
639
- clientRef.current.close();
640
- clientRef.current = null;
641
- }
642
- cleanupAudio();
643
- setIsPlaying(false);
644
- setIsPaused(false);
645
- setIsSynthesizing(false);
646
- setProgress(0);
647
- activeInstances.delete(instanceId);
648
- }, [cleanupAudio, instanceId]);
649
- const pause = (0, import_react3.useCallback)(() => {
650
- if (isFallbackRef.current) {
651
- window.speechSynthesis.pause();
652
- } else if (audioRef.current) {
653
- audioRef.current.pause();
654
- }
655
- setIsPaused(true);
656
- setIsPlaying(false);
657
- onPlayPause?.();
658
- }, [onPlayPause]);
659
- const resume = (0, import_react3.useCallback)(() => {
660
- stopOthers();
661
- if (isFallbackRef.current) {
662
- window.speechSynthesis.resume();
663
- } else if (audioRef.current) {
664
- audioRef.current.play();
665
- }
666
- setIsPaused(false);
667
- setIsPlaying(true);
668
- onPlayResume?.();
669
- activeInstances.set(instanceId, { pause });
670
- }, [stopOthers, instanceId, pause, onPlayResume]);
671
- const togglePlay = (0, import_react3.useCallback)(() => {
672
- if (isPlaying) {
673
- pause();
674
- } else {
675
- resume();
676
- }
677
- }, [isPlaying, pause, resume]);
678
- const playFallback = (0, import_react3.useCallback)(
679
- (text) => {
680
- console.warn("[useMessageTTS] Switching to fallback TTS");
681
- isFallbackRef.current = true;
682
- if (clientRef.current) {
683
- clientRef.current.close();
684
- clientRef.current = null;
869
+ if (isFinite(duration) && duration > 0) {
870
+ const progress = this.audio.currentTime / duration * 100;
871
+ this.updateState({ progress });
685
872
  }
686
- if (audioRef.current) {
687
- audioRef.current.pause();
688
- audioRef.current = null;
873
+ };
874
+ }
875
+ /**
876
+ * 建立 WebSocket 连接
877
+ */
878
+ async connect() {
879
+ if (this.state.isConnected) return;
880
+ this.updateState({
881
+ error: null,
882
+ progress: 0,
883
+ isSynthesizing: false,
884
+ isConnected: false,
885
+ isSessionStarted: false
886
+ });
887
+ this.streamText = "";
888
+ this.segmentQueue = [];
889
+ this.sessionAudioBuffers = [];
890
+ this.isStreamFinished = false;
891
+ this.isSessionFinished = false;
892
+ this.isSessionStarting = false;
893
+ if (this.client) {
894
+ this.client.close();
895
+ this.client = null;
896
+ }
897
+ this.splitter = new StreamingTextSplitter({
898
+ maxLength: this.config.maxSegmentLength || 150,
899
+ onSegmentComplete: (segment) => {
900
+ this.segmentQueue.push(segment);
901
+ if (this.state.isSessionStarted) {
902
+ this.processQueue();
903
+ }
689
904
  }
690
- const utterance = new SpeechSynthesisUtterance(text);
691
- utterance.rate = audioParams?.speech_rate || 1;
692
- const voices = window.speechSynthesis.getVoices();
693
- const zhVoice = voices.find((v) => v.lang.includes("zh"));
694
- if (zhVoice) utterance.voice = zhVoice;
695
- utterance.onstart = () => {
696
- setIsPlaying(true);
697
- setIsPaused(false);
698
- setIsSynthesizing(false);
699
- onPlayStart?.();
700
- activeInstances.set(instanceId, { pause });
701
- };
702
- utterance.onend = () => {
703
- setIsPlaying(false);
704
- setIsPaused(false);
705
- activeInstances.delete(instanceId);
706
- onPlayEnd?.();
707
- };
708
- utterance.onerror = (e) => {
709
- console.error("[useMessageTTS] Fallback TTS failed", e);
710
- setErrorState("Fallback TTS failed");
711
- onError?.(new Error("Fallback TTS failed"));
712
- setIsPlaying(false);
713
- };
714
- fallbackUtteranceRef.current = utterance;
715
- window.speechSynthesis.speak(utterance);
716
- },
717
- [audioParams, instanceId, onError, onPlayEnd, onPlayStart, pause]
718
- );
719
- const executeTTS = (0, import_react3.useCallback)(
720
- async (text, targetVoice) => {
721
- stop();
722
- stopOthers();
723
- setErrorState(null);
724
- setIsSynthesizing(true);
725
- setProgress(0);
726
- audioBuffersRef.current = [];
727
- isFallbackRef.current = false;
728
- const speed = audioParams?.speech_rate || 0;
729
- const voice = targetVoice;
730
- const cacheKey = TTSCache.generateKey(text, voice, speed);
731
- cacheKeyRef.current = cacheKey;
732
- const startTime = Date.now();
733
- metricsCollector.record({
734
- name: "tts_request",
735
- labels: { voice, speed, text_length: text.length },
736
- value: 1,
737
- timestamp: startTime
738
- });
905
+ });
906
+ this.client = (0, import_tts2.WebsocketMSE)({ autoStartSession: false });
907
+ const { ttsConfig, audioParams } = this.config;
908
+ const voice = audioParams?.speaker || "zh_female_vv_uranus_bigtts";
909
+ const startTime = Date.now();
910
+ this.config.metricsCollector?.record({
911
+ name: "tts_request",
912
+ labels: { voice, text_length: 0 },
913
+ value: 1,
914
+ timestamp: startTime
915
+ });
916
+ return new Promise((resolve, reject) => {
917
+ const timeoutId = setTimeout(() => {
918
+ const err = new Error("WebSocket connection timeout (10s)");
919
+ this.updateState({ error: err.message });
920
+ reject(err);
921
+ }, 1e4);
739
922
  try {
740
- const cachedData = await TTSCache.get(cacheKey);
741
- const audio = new Audio();
742
- audio.crossOrigin = "anonymous";
743
- audioRef.current = audio;
744
- audio.onplay = () => {
745
- setIsPlaying(true);
746
- setIsPaused(false);
747
- onPlayStart?.();
748
- initAudioContext();
749
- activeInstances.set(instanceId, { pause });
750
- metricsCollector.record({
751
- name: "tts_latency",
752
- labels: { stage: "playback", voice, speed },
753
- value: Date.now() - startTime,
754
- timestamp: Date.now()
755
- });
756
- };
757
- audio.onpause = () => {
758
- if (!audio.ended) {
759
- }
760
- };
761
- audio.onended = () => {
762
- setIsPlaying(false);
763
- setIsPaused(false);
764
- onPlayEnd?.();
765
- activeInstances.delete(instanceId);
766
- };
767
- audio.onerror = (e) => {
768
- console.error("Audio playback error:", e, audio.error);
769
- metricsCollector.record({
770
- name: "tts_error",
771
- labels: {
772
- error_code: "playback_error",
773
- voice,
774
- detail: audio.error?.message || String(audio.error?.code)
775
- },
776
- value: 1,
777
- timestamp: Date.now()
778
- });
779
- handleError(text, voice);
780
- };
781
- audio.ontimeupdate = () => {
782
- let duration = audio.duration;
783
- if (!isFinite(duration)) {
784
- if (audio.buffered.length > 0) {
785
- duration = audio.buffered.end(audio.buffered.length - 1);
786
- }
787
- }
788
- if (isFinite(duration) && duration > 0) {
789
- setProgress(audio.currentTime / duration * 100);
790
- }
791
- };
792
- if (cachedData) {
793
- const totalSize = cachedData.reduce(
794
- (acc, buf) => acc + buf.byteLength,
795
- 0
796
- );
797
- metricsCollector.record({
798
- name: "tts_cache_hit",
799
- labels: { voice, speed },
800
- value: 1,
801
- timestamp: Date.now()
802
- });
803
- console.log(
804
- JSON.stringify({
805
- event: "tts_cache_hit",
806
- cache_hit: true,
807
- text_len: text.length,
808
- voice,
809
- speed,
810
- data_size: totalSize
811
- })
812
- );
813
- if (totalSize === 0) {
814
- console.warn(
815
- "[useMessageTTS] Cached data is empty, falling back to stream"
816
- );
817
- } else {
818
- const blob = new Blob(cachedData, { type: "audio/mpeg" });
819
- const url2 = URL.createObjectURL(blob);
820
- audioUrlRef.current = url2;
821
- audio.src = url2;
822
- setIsSynthesizing(false);
823
- if (autoPlay) {
824
- try {
825
- await audio.play();
826
- } catch (err) {
827
- console.warn("AutoPlay blocked", err);
828
- }
829
- }
830
- return;
831
- }
832
- }
833
- console.log("[useMessageTTS] Cache miss, starting stream");
834
- clientRef.current = (0, import_tts2.WebsocketMSE)({ autoStartSession: true });
835
- const formattedText = import_volcano_sdk2.MarkdownFormatter.format(text).replace(
836
- (0, import_emoji_regex2.default)(),
837
- ""
838
- );
839
- const segments = splitTextByDelimiters(formattedText);
840
- const url = clientRef.current.start({
923
+ const url = this.client.start({
841
924
  url: buildFullUrl2(WS_URL, {
842
925
  api_access_key: `Jwt; ${ttsConfig.token}`,
843
926
  api_app_key: ttsConfig.appid,
844
927
  api_resource_id: ttsConfig.resourceId || "seed-tts-2.0"
845
928
  }),
846
929
  config: {
847
- user: {
848
- uid: `req-${Date.now()}`
849
- },
930
+ user: { uid: `req-${Date.now()}` },
850
931
  namespace: ttsConfig.namespace || "BidirectionalTTS",
851
932
  req_params: {
852
933
  speaker: voice,
@@ -861,456 +942,362 @@ function useMessageTTS({
861
942
  enable_language_detector: true,
862
943
  disable_markdown_filter: true,
863
944
  enable_latex_tn: true
864
- // max_length_to_filter_parenthesis: 100,
865
945
  })
866
946
  }
867
947
  },
948
+ onStart: () => {
949
+ this.updateState({ isConnected: true });
950
+ },
951
+ onConnectionReady: () => {
952
+ clearTimeout(timeoutId);
953
+ resolve();
954
+ },
868
955
  onSessionStarted: () => {
869
- segments.forEach((seg) => {
870
- clientRef.current?.sendText(seg.content);
871
- });
872
- clientRef.current?.finishSession();
956
+ this.updateState({ isSessionStarted: true });
957
+ this.isSessionStarting = false;
958
+ if (this.segmentQueue.length > 0) {
959
+ this.processQueue();
960
+ }
873
961
  },
874
962
  onMessage: (data) => {
875
- if (audioBuffersRef.current.length === 0) {
876
- console.log(
877
- JSON.stringify({
878
- event: "tts_first_packet",
879
- latency_ms: Date.now() - startTime,
880
- voice
881
- })
882
- );
963
+ this.updateState({ isSynthesizing: true });
964
+ if (this.sessionAudioBuffers.length === 0) {
965
+ this.config.metricsCollector?.record({
966
+ name: "tts_latency",
967
+ labels: { stage: "first_packet", voice },
968
+ value: Date.now() - startTime,
969
+ timestamp: Date.now()
970
+ });
883
971
  }
884
972
  const buffer = data instanceof ArrayBuffer ? data.slice(0) : new Uint8Array(data).buffer;
885
- audioBuffersRef.current.push(buffer);
973
+ this.sessionAudioBuffers.push(buffer);
886
974
  },
887
975
  onSessionFinished: () => {
888
- setIsSynthesizing(false);
889
- if (audioBuffersRef.current.length > 0) {
890
- TTSCache.set(cacheKey, [...audioBuffersRef.current]);
891
- }
892
- console.log(
893
- JSON.stringify({
894
- event: "tts_synthesis_finished",
895
- cache_hit: false,
896
- text_len: text.length,
897
- duration_ms: Date.now() - startTime,
976
+ this.updateState({
977
+ isSynthesizing: false,
978
+ isSessionStarted: false
979
+ });
980
+ if (this.sessionAudioBuffers.length > 0 && this.streamText) {
981
+ const speed = audioParams?.speech_rate || 0;
982
+ const cacheKey = TTSCache.generateKey(
983
+ this.streamText,
898
984
  voice,
899
985
  speed
900
- })
901
- );
986
+ );
987
+ TTSCache.set(cacheKey, [...this.sessionAudioBuffers]);
988
+ }
989
+ this.config.metricsCollector?.record({
990
+ name: "tts_synthesis_finished",
991
+ labels: { voice, text_length: this.streamText.length },
992
+ value: Date.now() - startTime,
993
+ timestamp: Date.now()
994
+ });
902
995
  },
903
996
  onError: (err) => {
904
- console.error("TTS Synthesis error:", err);
905
- metricsCollector.record({
906
- name: "tts_error",
907
- labels: { error_code: "synthesis_error", voice },
908
- value: 1,
909
- timestamp: Date.now()
997
+ if (!this.state.isConnected) {
998
+ clearTimeout(timeoutId);
999
+ reject(new Error(err.msg || "TTS error"));
1000
+ }
1001
+ console.error("[PlaybackSession] TTS error:", err);
1002
+ this.updateState({
1003
+ error: err.msg || "TTS error",
1004
+ isSynthesizing: false
910
1005
  });
911
- handleError(text, voice);
912
- setIsSynthesizing(false);
1006
+ this.config.onError?.(new Error(err.msg || "TTS error"));
1007
+ },
1008
+ onWSError: (err) => {
1009
+ if (!this.state.isConnected) {
1010
+ clearTimeout(timeoutId);
1011
+ reject(err instanceof Error ? err : new Error("WebSocket error"));
1012
+ }
913
1013
  }
914
1014
  });
915
- audioUrlRef.current = url;
916
- audio.src = url;
917
- if (autoPlay) {
918
- try {
919
- await audio.play();
920
- } catch (e) {
921
- console.warn("Autoplay blocked/pending", e);
922
- }
1015
+ if (this.audioUrl) {
1016
+ URL.revokeObjectURL(this.audioUrl);
1017
+ }
1018
+ this.audioUrl = url;
1019
+ this.audio.src = url;
1020
+ if (this.config.autoPlay !== false) {
1021
+ this.audio.play().catch(
1022
+ (e) => console.warn("[PlaybackSession] Autoplay blocked:", e)
1023
+ );
923
1024
  }
924
1025
  } catch (err) {
925
- console.error("Unexpected error in executeTTS:", err);
926
- metricsCollector.record({
927
- name: "tts_error",
928
- labels: { error_code: "unexpected_error", voice },
929
- value: 1,
930
- timestamp: Date.now()
931
- });
932
- handleError(text, voice);
933
- }
934
- },
935
- [
936
- ttsConfig,
937
- audioParams,
938
- autoPlay,
939
- stop,
940
- stopOthers,
941
- instanceId,
942
- onPlayStart,
943
- onPlayEnd,
944
- initAudioContext,
945
- pause,
946
- fallbackVoice,
947
- metricsCollector
948
- ]
949
- );
950
- const handleError = (0, import_react3.useCallback)(
951
- (text, failedVoice) => {
952
- if (fallbackVoice && failedVoice !== fallbackVoice) {
953
- console.warn(
954
- `[useMessageTTS] Voice ${failedVoice} failed, switching to fallback voice ${fallbackVoice}`
1026
+ clearTimeout(timeoutId);
1027
+ console.error("[PlaybackSession] Connect error:", err);
1028
+ this.updateState({ error: String(err) });
1029
+ this.config.onError?.(
1030
+ err instanceof Error ? err : new Error(String(err))
955
1031
  );
956
- if (clientRef.current) {
957
- clientRef.current.close();
958
- clientRef.current = null;
959
- }
960
- if (audioRef.current) {
961
- audioRef.current.pause();
962
- audioRef.current = null;
1032
+ reject(err);
1033
+ }
1034
+ });
1035
+ }
1036
+ /**
1037
+ * 发送流式文本
1038
+ */
1039
+ handleStreamChunk(chunk) {
1040
+ if (!chunk) return;
1041
+ this.streamText += chunk;
1042
+ if (!this.state.isSessionStarted && !this.isSessionStarting && this.client && this.state.isConnected && !this.isSessionFinished) {
1043
+ this.isSessionStarting = true;
1044
+ this.client.startSession();
1045
+ }
1046
+ this.splitter?.onChunk(chunk);
1047
+ if (this.state.isSessionStarted) {
1048
+ this.processQueue();
1049
+ }
1050
+ }
1051
+ /**
1052
+ * 结束流式输入
1053
+ */
1054
+ async finishStream() {
1055
+ this.isStreamFinished = true;
1056
+ this.updateState({ isStreamFinished: true });
1057
+ this.splitter?.complete();
1058
+ if (this.state.isSessionStarted) {
1059
+ this.processQueue();
1060
+ }
1061
+ if (this.segmentQueue.length > 0 || this.isSending) {
1062
+ await new Promise((resolve) => {
1063
+ this.resolveAllSegmentsSent = resolve;
1064
+ });
1065
+ } else if (this.client && this.state.isSessionStarted && !this.isSessionFinished) {
1066
+ this.isSessionFinished = true;
1067
+ this.client.finishSession();
1068
+ }
1069
+ }
1070
+ /**
1071
+ * 处理非流式播放(直接播放整段文本)
1072
+ */
1073
+ async play(text) {
1074
+ const formattedText = import_volcano_sdk3.MarkdownFormatter.format(text).replace(
1075
+ (0, import_emoji_regex3.default)(),
1076
+ ""
1077
+ );
1078
+ const { audioParams } = this.config;
1079
+ const voice = audioParams?.speaker || "zh_female_vv_uranus_bigtts";
1080
+ const speed = audioParams?.speech_rate || 0;
1081
+ const cacheKey = TTSCache.generateKey(formattedText, voice, speed);
1082
+ const cachedData = await TTSCache.get(cacheKey);
1083
+ if (cachedData && cachedData.length > 0) {
1084
+ const blob = new Blob(cachedData, { type: "audio/mpeg" });
1085
+ const url = URL.createObjectURL(blob);
1086
+ if (this.audioUrl) URL.revokeObjectURL(this.audioUrl);
1087
+ this.audioUrl = url;
1088
+ this.audio.src = url;
1089
+ this.updateState({ isSynthesizing: false });
1090
+ if (this.config.autoPlay !== false) {
1091
+ try {
1092
+ await this.audio.play();
1093
+ } catch (e) {
1094
+ console.warn("Autoplay blocked", e);
963
1095
  }
964
- executeTTS(text, fallbackVoice);
965
- } else {
966
- playFallback(text);
967
1096
  }
968
- },
969
- [fallbackVoice, executeTTS, playFallback]
970
- );
971
- const play = (0, import_react3.useCallback)(
972
- (text) => {
973
- const voice = audioParams?.speaker || "zh_female_vv_uranus_bigtts";
974
- return executeTTS(text, voice);
975
- },
976
- [audioParams, executeTTS]
977
- );
978
- const getFrequencyData = (0, import_react3.useCallback)(() => {
979
- if (!analyserRef.current) return new Uint8Array(0);
980
- const dataArray = new Uint8Array(analyserRef.current.frequencyBinCount);
981
- analyserRef.current.getByteFrequencyData(dataArray);
982
- return dataArray;
983
- }, []);
984
- const getTimeDomainData = (0, import_react3.useCallback)(() => {
985
- if (!analyserRef.current) return new Uint8Array(0);
986
- const dataArray = new Uint8Array(analyserRef.current.frequencyBinCount);
987
- analyserRef.current.getByteTimeDomainData(dataArray);
988
- return dataArray;
989
- }, []);
990
- (0, import_react3.useEffect)(() => {
991
- if (!visualization?.enabled) return;
992
- let animId;
993
- let lastUpdate = 0;
994
- const interval = visualization.refreshInterval || 0;
1097
+ return;
1098
+ }
1099
+ await this.connect();
1100
+ this.streamText = formattedText;
1101
+ const segments = splitTextByDelimiters(formattedText);
1102
+ if (this.state.isConnected) {
1103
+ if (!this.state.isSessionStarted && !this.isSessionStarting) {
1104
+ this.isSessionStarting = true;
1105
+ this.client?.startSession();
1106
+ }
1107
+ }
1108
+ segments.forEach((seg, idx) => {
1109
+ this.segmentQueue.push({
1110
+ index: idx,
1111
+ content: seg.content,
1112
+ length: seg.content.length,
1113
+ sent: false
1114
+ });
1115
+ });
1116
+ if (this.state.isSessionStarted) {
1117
+ this.processQueue();
1118
+ }
1119
+ await this.finishStream();
1120
+ }
1121
+ processQueue() {
1122
+ if (!this.client || !this.state.isSessionStarted || this.isSending || this.isSessionFinished) {
1123
+ return;
1124
+ }
1125
+ if (this.segmentQueue.length === 0) {
1126
+ if (this.isStreamFinished && !this.isSessionFinished) {
1127
+ this.isSessionFinished = true;
1128
+ this.client.finishSession();
1129
+ this.resolveAllSegmentsSent?.();
1130
+ }
1131
+ return;
1132
+ }
1133
+ this.isSending = true;
1134
+ const segment = this.segmentQueue.shift();
1135
+ this.client.sendText(segment.content);
1136
+ segment.sent = true;
1137
+ this.isSending = false;
1138
+ setTimeout(() => this.processQueue(), 0);
1139
+ }
1140
+ pause() {
1141
+ this.audio.pause();
1142
+ this.updateState({ isPaused: true, isPlaying: false });
1143
+ }
1144
+ resume() {
1145
+ this.audio.play();
1146
+ this.updateState({ isPaused: false, isPlaying: true });
1147
+ }
1148
+ stop() {
1149
+ if (this.client) {
1150
+ this.client.close();
1151
+ this.client = null;
1152
+ }
1153
+ this.audio.pause();
1154
+ this.audio.currentTime = 0;
1155
+ if (this.audioUrl) {
1156
+ URL.revokeObjectURL(this.audioUrl);
1157
+ this.audioUrl = null;
1158
+ }
1159
+ this.stopVisualizationLoop();
1160
+ this.audioContext?.close();
1161
+ this.audioContext = null;
1162
+ this.updateState({
1163
+ isPlaying: false,
1164
+ isPaused: false,
1165
+ isSynthesizing: false,
1166
+ progress: 0,
1167
+ isConnected: false,
1168
+ isSessionStarted: false
1169
+ });
1170
+ }
1171
+ seek(percentage) {
1172
+ let duration = this.audio.duration;
1173
+ if (!isFinite(duration) && this.audio.buffered.length > 0) {
1174
+ duration = this.audio.buffered.end(this.audio.buffered.length - 1);
1175
+ }
1176
+ if (isFinite(duration) && duration > 0) {
1177
+ const time = percentage / 100 * duration;
1178
+ if (isFinite(time)) {
1179
+ this.audio.currentTime = time;
1180
+ this.updateState({ progress: percentage });
1181
+ }
1182
+ }
1183
+ }
1184
+ updateState(partial) {
1185
+ this.state = { ...this.state, ...partial };
1186
+ this.notifyListeners();
1187
+ }
1188
+ subscribe(listener) {
1189
+ this.listeners.add(listener);
1190
+ listener(this.state);
1191
+ return () => this.listeners.delete(listener);
1192
+ }
1193
+ notifyListeners() {
1194
+ this.listeners.forEach((l) => l(this.state));
1195
+ }
1196
+ // Visualization
1197
+ getFrequencyData() {
1198
+ if (!this.analyser) return new Uint8Array(0);
1199
+ const data = new Uint8Array(this.analyser.frequencyBinCount);
1200
+ this.analyser.getByteFrequencyData(data);
1201
+ return data;
1202
+ }
1203
+ getTimeDomainData() {
1204
+ if (!this.analyser) return new Uint8Array(0);
1205
+ const data = new Uint8Array(this.analyser.frequencyBinCount);
1206
+ this.analyser.getByteTimeDomainData(data);
1207
+ return data;
1208
+ }
1209
+ startVisualizationLoop() {
1210
+ if (!this.config.visualization?.enabled) return;
995
1211
  const update = (timestamp) => {
996
- if (isPlaying && !isPaused) {
997
- if (timestamp - lastUpdate >= interval) {
998
- setVisualizationData({
999
- frequencyData: getFrequencyData(),
1000
- timeDomainData: getTimeDomainData()
1212
+ if (this.state.isPlaying && !this.state.isPaused) {
1213
+ if (timestamp - this.lastVisUpdate >= (this.config.visualization?.refreshInterval || 0)) {
1214
+ this.updateState({
1215
+ visualizationData: {
1216
+ frequencyData: this.getFrequencyData(),
1217
+ timeDomainData: this.getTimeDomainData()
1218
+ }
1001
1219
  });
1002
- lastUpdate = timestamp;
1220
+ this.lastVisUpdate = timestamp;
1003
1221
  }
1004
- animId = requestAnimationFrame(update);
1222
+ this.animId = requestAnimationFrame(update);
1005
1223
  }
1006
1224
  };
1007
- if (isPlaying && !isPaused) {
1008
- animId = requestAnimationFrame(update);
1225
+ this.animId = requestAnimationFrame(update);
1226
+ }
1227
+ stopVisualizationLoop() {
1228
+ if (this.animId) {
1229
+ cancelAnimationFrame(this.animId);
1230
+ this.animId = null;
1009
1231
  }
1010
- return () => {
1011
- if (animId) cancelAnimationFrame(animId);
1012
- };
1013
- }, [isPlaying, isPaused, visualization, getFrequencyData, getTimeDomainData]);
1014
- (0, import_react3.useEffect)(() => {
1015
- return () => {
1016
- stop();
1017
- if (audioContextRef.current) {
1018
- audioContextRef.current.close();
1019
- }
1020
- };
1021
- }, [stop]);
1022
- const seek = (0, import_react3.useCallback)((percentage) => {
1023
- if (audioRef.current) {
1024
- let duration = audioRef.current.duration;
1025
- if (!isFinite(duration) && audioRef.current.buffered.length > 0) {
1026
- duration = audioRef.current.buffered.end(
1027
- audioRef.current.buffered.length - 1
1028
- );
1029
- }
1030
- if (isFinite(duration) && duration > 0) {
1031
- const time = percentage / 100 * duration;
1032
- if (isFinite(time)) {
1033
- audioRef.current.currentTime = time;
1034
- setProgress(percentage);
1035
- }
1232
+ }
1233
+ };
1234
+ var StreamPlaybackManagerImpl = class {
1235
+ constructor() {
1236
+ this.sessions = /* @__PURE__ */ new Map();
1237
+ this.activeStreamId = null;
1238
+ }
1239
+ /**
1240
+ * 创建新的播放会话
1241
+ */
1242
+ createSession(id, config) {
1243
+ if (this.activeStreamId && this.activeStreamId !== id) {
1244
+ this.pause(this.activeStreamId);
1245
+ }
1246
+ const session = new PlaybackSession(id, config);
1247
+ this.sessions.set(id, session);
1248
+ this.activeStreamId = id;
1249
+ return session;
1250
+ }
1251
+ /**
1252
+ * 获取会话
1253
+ */
1254
+ getSession(id) {
1255
+ return this.sessions.get(id);
1256
+ }
1257
+ /**
1258
+ * 停止会话
1259
+ */
1260
+ stop(id) {
1261
+ const session = this.sessions.get(id);
1262
+ if (session) {
1263
+ session.stop();
1264
+ this.sessions.delete(id);
1265
+ if (this.activeStreamId === id) {
1266
+ this.activeStreamId = null;
1036
1267
  }
1037
1268
  }
1038
- }, []);
1039
- return {
1040
- isPlaying,
1041
- isPaused,
1042
- isSynthesizing,
1043
- error,
1044
- play,
1045
- pause,
1046
- resume,
1047
- stop,
1048
- togglePlay,
1049
- seek,
1050
- progress,
1051
- getFrequencyData,
1052
- getTimeDomainData,
1053
- visualizationData
1054
- };
1055
- }
1056
-
1057
- // src/tts/useStreamTTS.ts
1058
- var import_tts3 = require("@wq-hook/volcano-sdk/tts");
1059
- var import_react4 = require("react");
1060
-
1061
- // src/tts/StreamingTextSplitter.ts
1062
- var import_volcano_sdk3 = require("@wq-hook/volcano-sdk");
1063
- var import_emoji_regex3 = __toESM(require("emoji-regex"));
1064
- var StreamingTextSplitter = class {
1065
- constructor(options = {}) {
1066
- /** 当前缓冲区 */
1067
- this.buffer = "";
1068
- /** 分段索引计数器 */
1069
- this.segmentIndex = 0;
1070
- /** 已完成的分段列表 */
1071
- this.segments = [];
1072
- /** 是否已完成 */
1073
- this.isCompleted = false;
1074
- this.maxLength = options.maxLength || 150;
1075
- this.minLength = options.minLength || 10;
1076
- this.onSegmentComplete = options.onSegmentComplete;
1077
- this.onAllComplete = options.onAllComplete;
1078
- }
1079
- /**
1080
- * 接收流式文本块
1081
- * @param chunk - 文本块
1082
- */
1083
- onChunk(chunk) {
1084
- if (!chunk || this.isCompleted) return;
1085
- this.buffer += chunk;
1086
- if (this.detectBoundary(chunk)) {
1087
- const newlineIndex = this.buffer.indexOf("\n");
1088
- if (newlineIndex !== -1) {
1089
- if (newlineIndex === 0) {
1090
- this.buffer = this.buffer.substring(1);
1091
- return;
1092
- }
1093
- const segmentBuffer = this.buffer.substring(0, newlineIndex);
1094
- this.buffer = this.buffer.substring(newlineIndex + 1);
1095
- this.flushSegmentWithBuffer(segmentBuffer);
1096
- while (this.buffer.includes("\n")) {
1097
- const nextNewlineIndex = this.buffer.indexOf("\n");
1098
- if (nextNewlineIndex === 0) {
1099
- this.buffer = this.buffer.substring(1);
1100
- continue;
1101
- }
1102
- const nextSegmentBuffer = this.buffer.substring(0, nextNewlineIndex);
1103
- this.buffer = this.buffer.substring(nextNewlineIndex + 1);
1104
- this.flushSegmentWithBuffer(nextSegmentBuffer);
1105
- }
1106
- }
1107
- }
1108
- }
1109
- /**
1110
- * 检测分段边界
1111
- * @param chunk - 最新接收的文本块
1112
- * @returns 是否应该分段
1113
- */
1114
- detectBoundary(chunk) {
1115
- if (chunk.includes("\n")) {
1116
- if (this.buffer.length >= this.maxLength) {
1117
- this.forceSplitAtSentenceBoundary();
1118
- }
1119
- return true;
1120
- }
1121
- if (this.buffer.length >= this.maxLength) {
1122
- this.forceSplitAtSentenceBoundary();
1123
- return true;
1124
- }
1125
- return false;
1126
1269
  }
1127
1270
  /**
1128
- * 在句子边界强制拆分超长段落
1271
+ * 暂停会话
1129
1272
  */
1130
- forceSplitAtSentenceBoundary() {
1131
- const content = this.buffer;
1132
- const sentenceEnders = /[。?!]/g;
1133
- let lastMatch = null;
1134
- let match = null;
1135
- while ((match = sentenceEnders.exec(content)) !== null) {
1136
- lastMatch = match;
1137
- }
1138
- if (lastMatch && lastMatch.index > this.minLength) {
1139
- const splitPoint = lastMatch.index + 1;
1140
- const firstPart = content.substring(0, splitPoint);
1141
- const secondPart = content.substring(splitPoint);
1142
- this.buffer = firstPart;
1143
- this.flushSegment();
1144
- this.buffer = secondPart;
1145
- } else {
1146
- const midPoint = Math.floor(content.length / 2);
1147
- const firstPart = content.substring(0, midPoint);
1148
- const secondPart = content.substring(midPoint);
1149
- this.buffer = firstPart;
1150
- this.flushSegment();
1151
- this.buffer = secondPart;
1152
- }
1273
+ pause(id) {
1274
+ this.sessions.get(id)?.pause();
1153
1275
  }
1154
1276
  /**
1155
- * 使用指定缓冲区内容刷新为分段
1156
- * @param bufferToFlush - 要分段的缓冲区内容
1277
+ * 恢复会话
1157
1278
  */
1158
- flushSegmentWithBuffer(bufferToFlush) {
1159
- const content = bufferToFlush;
1160
- if (!content) return;
1161
- const isPureSymbols = /^[^\p{L}\p{N}]*$/u.test(content);
1162
- const isTooShort = content.length < 3;
1163
- if (isPureSymbols && isTooShort) {
1164
- return;
1165
- }
1166
- const formattedContent = import_volcano_sdk3.MarkdownFormatter.format(content).replace((0, import_emoji_regex3.default)(), "");
1167
- if (!formattedContent) return;
1168
- let subSegments = [formattedContent];
1169
- if (formattedContent.length > this.maxLength) {
1170
- subSegments = this.splitLongSegment(formattedContent);
1171
- }
1172
- for (const subSegment of subSegments) {
1173
- if (!subSegment) continue;
1174
- const segment = {
1175
- index: this.segmentIndex++,
1176
- content: subSegment,
1177
- length: subSegment.length,
1178
- sent: false
1179
- };
1180
- this.segments.push(segment);
1181
- this.onSegmentComplete?.(segment);
1279
+ resume(id) {
1280
+ if (this.activeStreamId && this.activeStreamId !== id) {
1281
+ this.pause(this.activeStreamId);
1182
1282
  }
1283
+ this.sessions.get(id)?.resume();
1284
+ this.activeStreamId = id;
1183
1285
  }
1184
1286
  /**
1185
- * 刷新当前缓冲区为分段
1287
+ * 注册(兼容旧 API,但推荐直接用 createSession)
1288
+ * 为了兼容 useMessageTTS 旧逻辑,这里可以保留一些别名,但我们会重构 hook,所以可以改变 API。
1186
1289
  */
1187
- flushSegment() {
1188
- const content = this.buffer.trim();
1189
- if (!content) {
1190
- this.buffer = "";
1191
- return;
1192
- }
1193
- const isPureSymbols = /^[^\p{L}\p{N}]*$/u.test(content);
1194
- const isTooShort = content.length < 3;
1195
- if (isPureSymbols && isTooShort) {
1196
- this.buffer = "";
1197
- return;
1198
- }
1199
- const formattedContent = import_volcano_sdk3.MarkdownFormatter.format(content).replace((0, import_emoji_regex3.default)(), "");
1200
- if (!formattedContent) {
1201
- this.buffer = "";
1202
- return;
1203
- }
1204
- let subSegments = [formattedContent];
1205
- if (formattedContent.length > this.maxLength) {
1206
- subSegments = this.splitLongSegment(formattedContent);
1207
- }
1208
- for (const subSegment of subSegments) {
1209
- if (!subSegment) continue;
1210
- const segment = {
1211
- index: this.segmentIndex++,
1212
- content: subSegment,
1213
- length: subSegment.length,
1214
- sent: false
1215
- };
1216
- this.segments.push(segment);
1217
- this.onSegmentComplete?.(segment);
1218
- }
1219
- this.buffer = "";
1220
- }
1221
- /**
1222
- * 拆分超长分段
1223
- * @param segment - 超长的分段
1224
- * @returns 拆分后的分段数组
1225
- */
1226
- splitLongSegment(segment) {
1227
- const result = [];
1228
- let current = "";
1229
- for (const char of segment) {
1230
- current += char;
1231
- const shouldSplit = /[。?!,,]/.test(char);
1232
- if (shouldSplit && current.length <= this.maxLength) {
1233
- result.push(current);
1234
- current = "";
1235
- } else if (current.length >= this.maxLength) {
1236
- result.push(current);
1237
- current = "";
1238
- }
1239
- }
1240
- if (current) {
1241
- result.push(current);
1242
- }
1243
- return result.filter((s) => s.length > 0);
1244
- }
1245
- /**
1246
- * 完成流式输入
1247
- * 处理剩余的缓冲区内容
1248
- */
1249
- complete() {
1250
- if (this.isCompleted) return;
1251
- this.isCompleted = true;
1252
- while (this.buffer.includes("\n")) {
1253
- const newlineIndex = this.buffer.indexOf("\n");
1254
- if (newlineIndex === 0) {
1255
- this.buffer = this.buffer.substring(1);
1256
- continue;
1257
- }
1258
- const segmentBuffer = this.buffer.substring(0, newlineIndex);
1259
- this.buffer = this.buffer.substring(newlineIndex + 1);
1260
- this.flushSegmentWithBuffer(segmentBuffer);
1261
- }
1262
- if (this.buffer.trim()) {
1263
- this.flushSegment();
1264
- }
1265
- this.onAllComplete?.(this.segments);
1266
- }
1267
- /**
1268
- * 重置分段器状态
1269
- */
1270
- reset() {
1271
- this.buffer = "";
1272
- this.segmentIndex = 0;
1273
- this.segments = [];
1274
- this.isCompleted = false;
1275
- }
1276
- /**
1277
- * 获取当前缓冲区内容
1278
- */
1279
- getBuffer() {
1280
- return this.buffer;
1281
- }
1282
- /**
1283
- * 获取已分段的列表
1284
- */
1285
- getSegments() {
1286
- return this.segments;
1287
- }
1288
- /**
1289
- * 获取统计信息
1290
- */
1291
- getStats() {
1292
- return {
1293
- bufferLength: this.buffer.length,
1294
- segmentCount: this.segments.length,
1295
- totalChars: this.segments.reduce((sum, seg) => sum + seg.length, 0)
1296
- };
1297
- }
1298
1290
  };
1291
+ var StreamPlaybackManager = new StreamPlaybackManagerImpl();
1299
1292
 
1300
- // src/tts/useStreamTTS.ts
1301
- var WS_URL2 = "wss://openspeech.bytedance.com/api/v3/tts/bidirection";
1302
- var activeInstances2 = /* @__PURE__ */ new Map();
1303
- var sessionAudioCache = /* @__PURE__ */ new Map();
1304
- function buildFullUrl3(url, params) {
1305
- const arr = [];
1306
- for (const key in params) {
1307
- if (Object.prototype.hasOwnProperty.call(params, key)) {
1308
- arr.push(`${key}=${encodeURIComponent(params[key])}`);
1309
- }
1293
+ // src/tts/Metrics.ts
1294
+ var NoopMetricsCollector = class {
1295
+ record(_metric) {
1310
1296
  }
1311
- return `${url}?${arr.join("&")}`;
1312
- }
1313
- function useStreamTTS({
1297
+ };
1298
+
1299
+ // src/tts/useMessageTTS.ts
1300
+ function useMessageTTS({
1314
1301
  ttsConfig,
1315
1302
  audioParams,
1316
1303
  autoPlay = true,
@@ -1319,431 +1306,427 @@ function useStreamTTS({
1319
1306
  onPlayPause,
1320
1307
  onPlayResume,
1321
1308
  onPlayEnd,
1309
+ onStop,
1322
1310
  onError,
1311
+ fallbackVoice,
1323
1312
  visualization,
1324
- maxSegmentLength = 150
1313
+ streamId: externalStreamId
1325
1314
  }) {
1326
- const [isConnected, setIsConnected] = (0, import_react4.useState)(false);
1327
- const [isSessionStarted, setIsSessionStarted] = (0, import_react4.useState)(false);
1328
- const [isSynthesizing, setIsSynthesizing] = (0, import_react4.useState)(false);
1329
- const [isPlaying, setIsPlaying] = (0, import_react4.useState)(false);
1330
- const [isPaused, setIsPaused] = (0, import_react4.useState)(false);
1331
- const [error, setErrorState] = (0, import_react4.useState)(null);
1332
- const [streamText, setStreamText] = (0, import_react4.useState)("");
1333
- const [progress, setProgress] = (0, import_react4.useState)(0);
1334
- const [visualizationData, setVisualizationData] = (0, import_react4.useState)({
1335
- frequencyData: new Uint8Array(0),
1336
- timeDomainData: new Uint8Array(0)
1315
+ const isSubscriptionMode = !!externalStreamId;
1316
+ const [internalStreamId, setInternalStreamId] = (0, import_react3.useState)("");
1317
+ const [isSwitchedToIndependent, setIsSwitchedToIndependent] = (0, import_react3.useState)(false);
1318
+ const streamId = isSwitchedToIndependent ? internalStreamId : externalStreamId || internalStreamId;
1319
+ const [state, setState] = (0, import_react3.useState)({
1320
+ isPlaying: false,
1321
+ isPaused: false,
1322
+ isSynthesizing: false,
1323
+ progress: 0,
1324
+ visualizationData: {
1325
+ frequencyData: new Uint8Array(0),
1326
+ timeDomainData: new Uint8Array(0)
1327
+ },
1328
+ error: null,
1329
+ isConnected: false,
1330
+ isSessionStarted: false,
1331
+ isStreamFinished: false
1337
1332
  });
1338
- const instanceId = (0, import_react4.useRef)(`tts-stream-${Date.now()}-${Math.random().toString(36).slice(2)}`).current;
1339
- const clientRef = (0, import_react4.useRef)(null);
1340
- const audioRef = (0, import_react4.useRef)(null);
1341
- const audioContextRef = (0, import_react4.useRef)(null);
1342
- const analyserRef = (0, import_react4.useRef)(null);
1343
- const sourceRef = (0, import_react4.useRef)(null);
1344
- const audioUrlRef = (0, import_react4.useRef)(null);
1345
- const streamTextRef = (0, import_react4.useRef)("");
1346
- const isConnectedRef = (0, import_react4.useRef)(false);
1347
- const isSessionStartedRef = (0, import_react4.useRef)(false);
1348
- const calledSessionStartedRef = (0, import_react4.useRef)(false);
1349
- const splitterRef = (0, import_react4.useRef)(null);
1350
- const segmentQueueRef = (0, import_react4.useRef)([]);
1351
- const isSendingRef = (0, import_react4.useRef)(false);
1352
- const sessionAudioBuffersRef = (0, import_react4.useRef)([]);
1353
- const isStreamFinishedRef = (0, import_react4.useRef)(false);
1354
- const isSessionFinishedRef = (0, import_react4.useRef)(false);
1355
- const resolveAllSegmentsSentRef = (0, import_react4.useRef)(null);
1356
- const currentVoiceRef = (0, import_react4.useRef)("");
1357
- const initAudioContext = (0, import_react4.useCallback)(() => {
1358
- if (!audioRef.current) return;
1359
- if (!audioContextRef.current) {
1360
- const AudioContextClass = window.AudioContext || window.webkitAudioContext;
1361
- audioContextRef.current = new AudioContextClass();
1362
- }
1363
- if (audioContextRef.current.state === "suspended") {
1364
- audioContextRef.current.resume();
1365
- }
1366
- if (!analyserRef.current) {
1367
- analyserRef.current = audioContextRef.current.createAnalyser();
1368
- analyserRef.current.fftSize = visualization?.fftSize || 256;
1369
- }
1370
- if (!sourceRef.current) {
1371
- try {
1372
- sourceRef.current = audioContextRef.current.createMediaElementSource(audioRef.current);
1373
- sourceRef.current.connect(analyserRef.current);
1374
- analyserRef.current.connect(audioContextRef.current.destination);
1375
- } catch (e) {
1376
- }
1333
+ const [error, setErrorState] = (0, import_react3.useState)(null);
1334
+ const isFallbackRef = (0, import_react3.useRef)(false);
1335
+ const fallbackUtteranceRef = (0, import_react3.useRef)(null);
1336
+ const currentTextRef = (0, import_react3.useRef)("");
1337
+ (0, import_react3.useEffect)(() => {
1338
+ if (!streamId) return;
1339
+ const session = StreamPlaybackManager.getSession(streamId);
1340
+ if (session) {
1341
+ const unsubscribe = session.subscribe((newState) => {
1342
+ setState(newState);
1343
+ if (newState.error) setErrorState(newState.error);
1344
+ });
1345
+ return () => {
1346
+ unsubscribe();
1347
+ };
1377
1348
  }
1378
- }, [visualization?.fftSize]);
1379
- const cleanupAudio = (0, import_react4.useCallback)(() => {
1380
- if (audioUrlRef.current) {
1381
- URL.revokeObjectURL(audioUrlRef.current);
1382
- audioUrlRef.current = null;
1383
- }
1384
- if (audioRef.current) {
1385
- audioRef.current.onerror = null;
1386
- audioRef.current.onended = null;
1387
- audioRef.current.onpause = null;
1388
- audioRef.current.onplay = null;
1389
- audioRef.current.ontimeupdate = null;
1390
- audioRef.current.pause();
1391
- audioRef.current.src = "";
1392
- audioRef.current = null;
1393
- }
1394
- if (sourceRef.current) {
1395
- try {
1396
- sourceRef.current.disconnect();
1397
- } catch (e) {
1349
+ }, [streamId]);
1350
+ const stop = (0, import_react3.useCallback)(() => {
1351
+ if (streamId) {
1352
+ StreamPlaybackManager.stop(streamId);
1353
+ if (!isSubscriptionMode || isSwitchedToIndependent) {
1354
+ setInternalStreamId("");
1355
+ setIsSwitchedToIndependent(false);
1398
1356
  }
1399
- sourceRef.current = null;
1400
1357
  }
1401
- }, []);
1402
- const stopOthers = (0, import_react4.useCallback)(() => {
1403
- activeInstances2.forEach((instance, id) => {
1404
- if (id !== instanceId) {
1405
- instance.pause();
1406
- }
1407
- });
1408
- }, [instanceId]);
1409
- const pause = (0, import_react4.useCallback)(() => {
1410
- if (audioRef.current) {
1411
- audioRef.current.pause();
1358
+ if (fallbackUtteranceRef.current) {
1359
+ window.speechSynthesis.cancel();
1360
+ fallbackUtteranceRef.current = null;
1412
1361
  }
1413
- setIsPaused(true);
1414
- setIsPlaying(false);
1415
- onPlayPause?.();
1416
- }, [onPlayPause]);
1417
- const resume = (0, import_react4.useCallback)(() => {
1418
- stopOthers();
1419
- if (audioRef.current) {
1420
- audioRef.current.play();
1421
- }
1422
- setIsPaused(false);
1423
- setIsPlaying(true);
1424
- onPlayResume?.();
1425
- activeInstances2.set(instanceId, { pause });
1426
- }, [stopOthers, instanceId, pause, onPlayResume]);
1427
- const sendNextSegment = (0, import_react4.useCallback)(() => {
1428
- if (!clientRef.current || !isSessionStartedRef.current || isSendingRef.current || isSessionFinishedRef.current) {
1429
- return;
1362
+ isFallbackRef.current = false;
1363
+ setState((prev) => ({
1364
+ ...prev,
1365
+ isPlaying: false,
1366
+ isPaused: false,
1367
+ isSynthesizing: false,
1368
+ progress: 0
1369
+ }));
1370
+ onStop?.();
1371
+ }, [streamId, isSubscriptionMode, isSwitchedToIndependent, onStop]);
1372
+ const pause = (0, import_react3.useCallback)(() => {
1373
+ if (isFallbackRef.current) {
1374
+ window.speechSynthesis.pause();
1375
+ setState((prev) => ({ ...prev, isPaused: true, isPlaying: false }));
1376
+ onPlayPause?.();
1377
+ } else if (streamId) {
1378
+ StreamPlaybackManager.pause(streamId);
1430
1379
  }
1431
- if (segmentQueueRef.current.length === 0) {
1432
- if (isStreamFinishedRef.current && !isSessionFinishedRef.current) {
1433
- console.log("[useStreamTTS] All segments sent, finishing session");
1434
- isSessionFinishedRef.current = true;
1435
- clientRef.current.finishSession();
1436
- resolveAllSegmentsSentRef.current?.();
1380
+ }, [streamId, onPlayPause]);
1381
+ const resume = (0, import_react3.useCallback)(() => {
1382
+ if (isFallbackRef.current) {
1383
+ window.speechSynthesis.resume();
1384
+ setState((prev) => ({ ...prev, isPaused: false, isPlaying: true }));
1385
+ onPlayResume?.();
1386
+ } else if (streamId) {
1387
+ const session = StreamPlaybackManager.getSession(streamId);
1388
+ if (session) {
1389
+ StreamPlaybackManager.resume(streamId);
1390
+ } else {
1391
+ console.log(
1392
+ "[useMessageTTS] Session not found, resetting pause state"
1393
+ );
1394
+ setState((prev) => ({ ...prev, isPaused: false, isPlaying: false }));
1437
1395
  }
1438
- return;
1439
1396
  }
1440
- isSendingRef.current = true;
1441
- const segment = segmentQueueRef.current.shift();
1442
- console.log(`[useStreamTTS] Sending segment ${segment.index}: ${segment.content.substring(0, 30)}...`);
1443
- clientRef.current.sendText(segment.content);
1444
- segment.sent = true;
1445
- isSendingRef.current = false;
1446
- setTimeout(() => sendNextSegment(), 0);
1447
- }, []);
1448
- const stop = (0, import_react4.useCallback)(() => {
1449
- if (clientRef.current) {
1450
- clientRef.current.close();
1451
- clientRef.current = null;
1452
- }
1453
- cleanupAudio();
1454
- setIsConnected(false);
1455
- isConnectedRef.current = false;
1456
- setIsSessionStarted(false);
1457
- isSessionStartedRef.current = false;
1458
- calledSessionStartedRef.current = false;
1459
- setIsPlaying(false);
1460
- setIsPaused(false);
1461
- setIsSynthesizing(false);
1462
- setProgress(0);
1463
- activeInstances2.delete(instanceId);
1464
- streamTextRef.current = "";
1465
- setStreamText("");
1466
- segmentQueueRef.current = [];
1467
- isSendingRef.current = false;
1468
- sessionAudioBuffersRef.current = [];
1469
- isStreamFinishedRef.current = false;
1470
- isSessionFinishedRef.current = false;
1471
- splitterRef.current?.reset();
1472
- }, [cleanupAudio, instanceId]);
1473
- const connect = (0, import_react4.useCallback)(async () => {
1474
- stop();
1475
- setErrorState(null);
1476
- setProgress(0);
1477
- sessionAudioBuffersRef.current = [];
1478
- isStreamFinishedRef.current = false;
1479
- streamTextRef.current = "";
1480
- setStreamText("");
1481
- segmentQueueRef.current = [];
1482
- isSendingRef.current = false;
1483
- isSessionStartedRef.current = false;
1484
- calledSessionStartedRef.current = false;
1485
- setIsSessionStarted(false);
1486
- const voice = audioParams?.speaker || "zh_female_vv_uranus_bigtts";
1487
- currentVoiceRef.current = voice;
1488
- const startTime = Date.now();
1489
- metricsCollector.record({
1490
- name: "tts_request",
1491
- labels: { voice, text_length: 0 },
1492
- value: 1,
1493
- timestamp: startTime
1494
- });
1495
- try {
1496
- const audio = new Audio();
1497
- audio.crossOrigin = "anonymous";
1498
- audioRef.current = audio;
1499
- audio.onplay = () => {
1500
- setIsPlaying(true);
1501
- setIsPaused(false);
1397
+ }, [streamId, onPlayResume]);
1398
+ const togglePlay = (0, import_react3.useCallback)(() => {
1399
+ if (state.isPlaying) {
1400
+ pause();
1401
+ } else {
1402
+ resume();
1403
+ }
1404
+ }, [state.isPlaying, pause, resume]);
1405
+ const playFallback = (0, import_react3.useCallback)(
1406
+ (text) => {
1407
+ console.warn("[useMessageTTS] Switching to fallback TTS");
1408
+ stop();
1409
+ isFallbackRef.current = true;
1410
+ setErrorState(null);
1411
+ const utterance = new SpeechSynthesisUtterance(text);
1412
+ utterance.rate = audioParams?.speech_rate || 1;
1413
+ const voices = window.speechSynthesis.getVoices();
1414
+ const zhVoice = voices.find((v) => v.lang.includes("zh"));
1415
+ if (zhVoice) utterance.voice = zhVoice;
1416
+ utterance.onstart = () => {
1417
+ setState((prev) => ({ ...prev, isPlaying: true, isPaused: false }));
1502
1418
  onPlayStart?.();
1503
- initAudioContext();
1504
- activeInstances2.set(instanceId, { pause });
1505
1419
  };
1506
- audio.onended = () => {
1507
- setIsPlaying(false);
1508
- setIsPaused(false);
1420
+ utterance.onend = () => {
1421
+ setState((prev) => ({
1422
+ ...prev,
1423
+ isPlaying: false,
1424
+ isPaused: false,
1425
+ progress: 100
1426
+ }));
1509
1427
  onPlayEnd?.();
1510
- activeInstances2.delete(instanceId);
1511
1428
  };
1512
- audio.onerror = (e) => {
1513
- console.error("[useStreamTTS] Audio playback error:", e, audio.error);
1514
- setErrorState(audio.error?.message || "Audio playback error");
1515
- onError?.(new Error(audio.error?.message || "Audio playback error"));
1516
- };
1517
- audio.ontimeupdate = () => {
1518
- let duration = audio.duration;
1519
- if (!isFinite(duration) && audio.buffered.length > 0) {
1520
- duration = audio.buffered.end(audio.buffered.length - 1);
1521
- }
1522
- if (isFinite(duration) && duration > 0) {
1523
- setProgress(audio.currentTime / duration * 100);
1524
- }
1429
+ utterance.onerror = (e) => {
1430
+ console.error("[useMessageTTS] Fallback TTS failed", e);
1431
+ setErrorState("Fallback TTS failed");
1432
+ onError?.(new Error("Fallback TTS failed"));
1525
1433
  };
1526
- clientRef.current = (0, import_tts3.WebsocketMSE)({ autoStartSession: false });
1527
- splitterRef.current = new StreamingTextSplitter({
1528
- maxLength: maxSegmentLength,
1529
- onSegmentComplete: (segment) => {
1530
- segmentQueueRef.current.push(segment);
1531
- console.log(`[useStreamTTS] Segment ${segment.index} queued (${segment.length} chars)`);
1532
- if (isSessionStartedRef.current) {
1533
- sendNextSegment();
1534
- }
1535
- },
1536
- onAllComplete: () => {
1537
- console.log(`[useStreamTTS] All segments completed, total: ${segmentQueueRef.current.length} in queue`);
1434
+ fallbackUtteranceRef.current = utterance;
1435
+ window.speechSynthesis.speak(utterance);
1436
+ },
1437
+ [audioParams, onError, onPlayEnd, onPlayStart, stop]
1438
+ );
1439
+ const handleError = (0, import_react3.useCallback)(
1440
+ (text, failedVoice) => {
1441
+ if (fallbackVoice && failedVoice !== fallbackVoice) {
1442
+ console.warn(
1443
+ `[useMessageTTS] Voice ${failedVoice} failed, switching to fallback voice ${fallbackVoice}`
1444
+ );
1445
+ const newId = internalStreamId || `msg-tts-retry-${Date.now()}`;
1446
+ setInternalStreamId(newId);
1447
+ const session = StreamPlaybackManager.createSession(newId, {
1448
+ ttsConfig,
1449
+ audioParams: { ...audioParams, speaker: fallbackVoice },
1450
+ autoPlay,
1451
+ metricsCollector,
1452
+ visualization,
1453
+ onPlayStart,
1454
+ onPlayPause,
1455
+ onPlayResume,
1456
+ onPlayEnd,
1457
+ onError: () => playFallback(text)
1458
+ });
1459
+ session.play(text);
1460
+ } else {
1461
+ playFallback(text);
1462
+ }
1463
+ },
1464
+ [
1465
+ fallbackVoice,
1466
+ playFallback,
1467
+ ttsConfig,
1468
+ audioParams,
1469
+ autoPlay,
1470
+ metricsCollector,
1471
+ visualization,
1472
+ onPlayStart,
1473
+ onPlayPause,
1474
+ onPlayResume,
1475
+ onPlayEnd,
1476
+ internalStreamId
1477
+ ]
1478
+ );
1479
+ const play = (0, import_react3.useCallback)(
1480
+ async (text) => {
1481
+ let shouldSwitchToIndependent = false;
1482
+ if (isSubscriptionMode) {
1483
+ const session2 = StreamPlaybackManager.getSession(externalStreamId || "");
1484
+ if (!session2) {
1485
+ console.log(
1486
+ "[useMessageTTS] Stream session not found, switching to independent play mode"
1487
+ );
1488
+ shouldSwitchToIndependent = true;
1489
+ setIsSwitchedToIndependent(true);
1490
+ } else if (session2.state.isStreamFinished) {
1491
+ console.log(
1492
+ "[useMessageTTS] Stream finished, switching to independent play mode"
1493
+ );
1494
+ shouldSwitchToIndependent = true;
1495
+ setIsSwitchedToIndependent(true);
1496
+ } else if (session2.state.isSynthesizing || session2.state.isPlaying) {
1497
+ console.warn(
1498
+ "[useMessageTTS] play() called in subscription mode while streaming, ignoring"
1499
+ );
1500
+ return;
1501
+ } else {
1502
+ console.log(
1503
+ "[useMessageTTS] Stream not active, switching to independent play mode"
1504
+ );
1505
+ shouldSwitchToIndependent = true;
1506
+ setIsSwitchedToIndependent(true);
1538
1507
  }
1539
- });
1540
- const url = clientRef.current.start({
1541
- url: buildFullUrl3(WS_URL2, {
1542
- api_access_key: `Jwt; ${ttsConfig.token}`,
1543
- api_app_key: ttsConfig.appid,
1544
- api_resource_id: ttsConfig.resourceId || "seed-tts-2.0"
1545
- }),
1546
- config: {
1547
- user: {
1548
- uid: `req-${Date.now()}`
1549
- },
1550
- namespace: ttsConfig.namespace || "BidirectionalTTS",
1551
- req_params: {
1552
- speaker: voice,
1553
- audio_params: {
1554
- sample_rate: audioParams?.sample_rate || 24e3,
1555
- format: audioParams?.format || "mp3",
1556
- speech_rate: audioParams?.speech_rate,
1557
- pitch_rate: audioParams?.pitch_rate,
1558
- loudness_rate: audioParams?.loudness_rate
1559
- },
1560
- additions: JSON.stringify({
1561
- enable_language_detector: true,
1562
- disable_markdown_filter: true,
1563
- enable_latex_tn: true
1564
- })
1565
- }
1566
- },
1567
- // ===== 关键回调 =====
1568
- onStart: () => {
1569
- setIsConnected(true);
1570
- isConnectedRef.current = true;
1571
- console.log("[useStreamTTS] WebSocket connected, waiting for text...");
1572
- },
1573
- onSessionStarted: () => {
1574
- setIsSessionStarted(true);
1575
- isSessionStartedRef.current = true;
1576
- console.log("[useStreamTTS] Session started, can send text now");
1577
- if (segmentQueueRef.current.length > 0) {
1578
- sendNextSegment();
1579
- }
1580
- },
1581
- onMessage: (data) => {
1582
- setIsSynthesizing(true);
1583
- if (sessionAudioBuffersRef.current.length === 0) {
1584
- metricsCollector.record({
1585
- name: "tts_latency",
1586
- labels: { stage: "first_packet", voice },
1587
- value: Date.now() - startTime,
1588
- timestamp: Date.now()
1589
- });
1590
- }
1591
- const buffer = data instanceof ArrayBuffer ? data.slice(0) : new Uint8Array(data).buffer;
1592
- sessionAudioBuffersRef.current.push(buffer);
1593
- },
1594
- onSessionFinished: () => {
1595
- setIsSynthesizing(false);
1596
- setIsSessionStarted(false);
1597
- isSessionStartedRef.current = false;
1598
- calledSessionStartedRef.current = false;
1599
- if (sessionAudioBuffersRef.current.length > 0 && streamTextRef.current) {
1600
- const speed = audioParams?.speech_rate || 0;
1601
- const cacheKey = TTSCache.generateKey(streamTextRef.current, voice, speed);
1602
- TTSCache.set(cacheKey, [...sessionAudioBuffersRef.current]);
1603
- sessionAudioCache.set(instanceId, {
1604
- streamText: streamTextRef.current,
1605
- audioBuffers: [...sessionAudioBuffersRef.current],
1606
- timestamp: Date.now(),
1607
- voice,
1608
- speed
1609
- });
1610
- console.log(`[useStreamTTS] Session finished, cached ${sessionAudioBuffersRef.current.length} audio buffers`);
1611
- }
1612
- metricsCollector.record({
1613
- name: "tts_synthesis_finished",
1614
- labels: { voice, text_length: streamTextRef.current.length },
1615
- value: Date.now() - startTime,
1616
- timestamp: Date.now()
1617
- });
1618
- },
1508
+ }
1509
+ currentTextRef.current = text;
1510
+ stop();
1511
+ setErrorState(null);
1512
+ isFallbackRef.current = false;
1513
+ const id = `msg-tts-${Date.now()}-${Math.random().toString(36).slice(2)}`;
1514
+ const session = StreamPlaybackManager.createSession(id, {
1515
+ ttsConfig,
1516
+ audioParams,
1517
+ autoPlay,
1518
+ metricsCollector,
1519
+ visualization,
1520
+ onPlayStart,
1521
+ onPlayPause,
1522
+ onPlayResume,
1523
+ onPlayEnd,
1619
1524
  onError: (err) => {
1620
- console.error("[useStreamTTS] TTS error:", err);
1621
- setErrorState(err.msg || "TTS error");
1622
- onError?.(new Error(err.msg || "TTS error"));
1623
- setIsSynthesizing(false);
1525
+ handleError(text, audioParams?.speaker || "");
1624
1526
  }
1625
1527
  });
1626
- audioUrlRef.current = url;
1627
- audio.src = url;
1628
- if (autoPlay) {
1629
- try {
1630
- await audio.play();
1631
- } catch (e) {
1632
- console.warn("[useStreamTTS] Autoplay blocked:", e);
1633
- }
1528
+ setInternalStreamId(id);
1529
+ await session.play(text);
1530
+ },
1531
+ [
1532
+ isSubscriptionMode,
1533
+ externalStreamId,
1534
+ stop,
1535
+ ttsConfig,
1536
+ audioParams,
1537
+ autoPlay,
1538
+ metricsCollector,
1539
+ visualization,
1540
+ onPlayStart,
1541
+ onPlayPause,
1542
+ onPlayResume,
1543
+ onPlayEnd,
1544
+ handleError
1545
+ ]
1546
+ );
1547
+ const seek = (0, import_react3.useCallback)(
1548
+ (percentage) => {
1549
+ if (streamId) {
1550
+ StreamPlaybackManager.getSession(streamId)?.seek(percentage);
1634
1551
  }
1635
- } catch (err) {
1636
- console.error("[useStreamTTS] Connect error:", err);
1637
- setErrorState(String(err));
1638
- onError?.(err instanceof Error ? err : new Error(String(err)));
1639
- }
1552
+ },
1553
+ [streamId]
1554
+ );
1555
+ const getFrequencyData = (0, import_react3.useCallback)(
1556
+ () => state.visualizationData.frequencyData,
1557
+ [state.visualizationData]
1558
+ );
1559
+ const getTimeDomainData = (0, import_react3.useCallback)(
1560
+ () => state.visualizationData.timeDomainData,
1561
+ [state.visualizationData]
1562
+ );
1563
+ const isStreamActive = !!(externalStreamId && (state.isPlaying || state.isPaused || state.isSynthesizing));
1564
+ const canResume = (0, import_react3.useCallback)(() => {
1565
+ if (!streamId) return false;
1566
+ const session = StreamPlaybackManager.getSession(streamId);
1567
+ return !!session;
1568
+ }, [streamId]);
1569
+ return {
1570
+ isPlaying: state.isPlaying,
1571
+ isPaused: state.isPaused,
1572
+ isSynthesizing: state.isSynthesizing,
1573
+ progress: state.progress,
1574
+ error,
1575
+ play,
1576
+ pause,
1577
+ resume,
1578
+ stop,
1579
+ togglePlay,
1580
+ seek,
1581
+ getFrequencyData,
1582
+ getTimeDomainData,
1583
+ visualizationData: state.visualizationData,
1584
+ isStreamActive,
1585
+ streamState: state,
1586
+ canResume
1587
+ };
1588
+ }
1589
+
1590
+ // src/tts/useStreamTTS.ts
1591
+ var import_react4 = require("react");
1592
+ function useStreamTTS({
1593
+ ttsConfig,
1594
+ audioParams,
1595
+ autoPlay = true,
1596
+ metricsCollector = new NoopMetricsCollector(),
1597
+ onPlayStart,
1598
+ onPlayPause,
1599
+ onPlayResume,
1600
+ onPlayEnd,
1601
+ onError,
1602
+ visualization,
1603
+ maxSegmentLength = 150
1604
+ }) {
1605
+ const [streamId, setStreamId] = (0, import_react4.useState)("");
1606
+ const streamIdRef = (0, import_react4.useRef)("");
1607
+ const [state, setState] = (0, import_react4.useState)({
1608
+ isPlaying: false,
1609
+ isPaused: false,
1610
+ isSynthesizing: false,
1611
+ progress: 0,
1612
+ visualizationData: {
1613
+ frequencyData: new Uint8Array(0),
1614
+ timeDomainData: new Uint8Array(0)
1615
+ },
1616
+ error: null,
1617
+ isConnected: false,
1618
+ isSessionStarted: false,
1619
+ isStreamFinished: false
1620
+ });
1621
+ const [streamText, setStreamText] = (0, import_react4.useState)("");
1622
+ const streamTextRef = (0, import_react4.useRef)("");
1623
+ const connect = (0, import_react4.useCallback)(async () => {
1624
+ const newStreamId = `tts-stream-${Date.now()}-${Math.random().toString(36).slice(2)}`;
1625
+ setStreamId(newStreamId);
1626
+ streamIdRef.current = newStreamId;
1627
+ streamTextRef.current = "";
1628
+ setStreamText("");
1629
+ const session = StreamPlaybackManager.createSession(newStreamId, {
1630
+ ttsConfig,
1631
+ audioParams,
1632
+ autoPlay,
1633
+ metricsCollector,
1634
+ visualization,
1635
+ maxSegmentLength,
1636
+ onPlayStart,
1637
+ onPlayPause,
1638
+ onPlayResume,
1639
+ onPlayEnd,
1640
+ onError: (err) => {
1641
+ setState((prev) => ({ ...prev, error: err.message }));
1642
+ onError?.(err);
1643
+ }
1644
+ });
1645
+ await session.connect();
1646
+ return newStreamId;
1640
1647
  }, [
1641
1648
  ttsConfig,
1642
1649
  audioParams,
1643
1650
  autoPlay,
1644
- stop,
1645
- instanceId,
1646
- onPlayStart,
1647
- onPlayEnd,
1648
- initAudioContext,
1649
- pause,
1650
1651
  metricsCollector,
1652
+ visualization,
1651
1653
  maxSegmentLength,
1652
- sendNextSegment,
1654
+ onPlayStart,
1655
+ onPlayPause,
1656
+ onPlayResume,
1657
+ onPlayEnd,
1653
1658
  onError
1654
1659
  ]);
1660
+ (0, import_react4.useEffect)(() => {
1661
+ if (!streamId) return;
1662
+ const session = StreamPlaybackManager.getSession(streamId);
1663
+ if (!session) return;
1664
+ const unsubscribe = session.subscribe((newState) => {
1665
+ setState(newState);
1666
+ });
1667
+ return () => {
1668
+ unsubscribe();
1669
+ };
1670
+ }, [streamId]);
1655
1671
  const onMessage = (0, import_react4.useCallback)((chunk) => {
1656
- if (!chunk) return;
1672
+ if (!streamIdRef.current) return;
1657
1673
  streamTextRef.current += chunk;
1658
1674
  setStreamText(streamTextRef.current);
1659
- if (!calledSessionStartedRef.current && !isSessionStartedRef.current && clientRef.current && isConnectedRef.current) {
1660
- console.log("[useStreamTTS] First text received, starting session...");
1661
- calledSessionStartedRef.current = true;
1662
- clientRef.current.startSession();
1663
- }
1664
- splitterRef.current?.onChunk(chunk);
1675
+ const session = StreamPlaybackManager.getSession(streamIdRef.current);
1676
+ session?.handleStreamChunk(chunk);
1665
1677
  }, []);
1666
1678
  const finishStream = (0, import_react4.useCallback)(async () => {
1667
- isStreamFinishedRef.current = true;
1668
- splitterRef.current?.complete();
1669
- console.log(`[useStreamTTS] Stream finished, ${segmentQueueRef.current.length} segments remaining in queue`);
1670
- if (segmentQueueRef.current.length > 0 || isSendingRef.current) {
1671
- await new Promise((resolve) => {
1672
- resolveAllSegmentsSentRef.current = resolve;
1673
- });
1674
- } else if (clientRef.current && isSessionStartedRef.current && !isSessionFinishedRef.current) {
1675
- isSessionFinishedRef.current = true;
1676
- clientRef.current.finishSession();
1677
- }
1679
+ if (!streamIdRef.current) return;
1680
+ const session = StreamPlaybackManager.getSession(streamIdRef.current);
1681
+ await session?.finishStream();
1678
1682
  }, []);
1679
- const seek = (0, import_react4.useCallback)((percentage) => {
1680
- if (audioRef.current) {
1681
- let duration = audioRef.current.duration;
1682
- if (!isFinite(duration) && audioRef.current.buffered.length > 0) {
1683
- duration = audioRef.current.buffered.end(audioRef.current.buffered.length - 1);
1684
- }
1685
- if (isFinite(duration) && duration > 0) {
1686
- const time = percentage / 100 * duration;
1687
- if (isFinite(time)) {
1688
- audioRef.current.currentTime = time;
1689
- setProgress(percentage);
1690
- }
1691
- }
1683
+ const pause = (0, import_react4.useCallback)(() => {
1684
+ if (streamIdRef.current) {
1685
+ StreamPlaybackManager.pause(streamIdRef.current);
1692
1686
  }
1693
1687
  }, []);
1694
- const getFrequencyData = (0, import_react4.useCallback)(() => {
1695
- if (!analyserRef.current) return new Uint8Array(0);
1696
- const dataArray = new Uint8Array(analyserRef.current.frequencyBinCount);
1697
- analyserRef.current.getByteFrequencyData(dataArray);
1698
- return dataArray;
1688
+ const resume = (0, import_react4.useCallback)(() => {
1689
+ if (streamIdRef.current) {
1690
+ StreamPlaybackManager.resume(streamIdRef.current);
1691
+ }
1699
1692
  }, []);
1700
- const getTimeDomainData = (0, import_react4.useCallback)(() => {
1701
- if (!analyserRef.current) return new Uint8Array(0);
1702
- const dataArray = new Uint8Array(analyserRef.current.frequencyBinCount);
1703
- analyserRef.current.getByteTimeDomainData(dataArray);
1704
- return dataArray;
1693
+ const stop = (0, import_react4.useCallback)(() => {
1694
+ if (streamIdRef.current) {
1695
+ StreamPlaybackManager.stop(streamIdRef.current);
1696
+ setStreamId("");
1697
+ streamIdRef.current = "";
1698
+ }
1705
1699
  }, []);
1706
- (0, import_react4.useEffect)(() => {
1707
- if (!visualization?.enabled) return;
1708
- let animId;
1709
- let lastUpdate = 0;
1710
- const interval = visualization.refreshInterval || 0;
1711
- const update = (timestamp) => {
1712
- if (isPlaying && !isPaused) {
1713
- if (timestamp - lastUpdate >= interval) {
1714
- setVisualizationData({
1715
- frequencyData: getFrequencyData(),
1716
- timeDomainData: getTimeDomainData()
1717
- });
1718
- lastUpdate = timestamp;
1719
- }
1720
- animId = requestAnimationFrame(update);
1721
- }
1722
- };
1723
- if (isPlaying && !isPaused) {
1724
- animId = requestAnimationFrame(update);
1700
+ const seek = (0, import_react4.useCallback)((percentage) => {
1701
+ if (streamIdRef.current) {
1702
+ StreamPlaybackManager.getSession(streamIdRef.current)?.seek(percentage);
1725
1703
  }
1726
- return () => {
1727
- if (animId) cancelAnimationFrame(animId);
1728
- };
1729
- }, [isPlaying, isPaused, visualization, getFrequencyData, getTimeDomainData]);
1704
+ }, []);
1730
1705
  (0, import_react4.useEffect)(() => {
1731
1706
  return () => {
1732
- stop();
1733
- if (audioContextRef.current) {
1734
- audioContextRef.current.close();
1707
+ if (streamIdRef.current) {
1708
+ StreamPlaybackManager.stop(streamIdRef.current);
1735
1709
  }
1736
1710
  };
1737
- }, [stop]);
1711
+ }, []);
1712
+ const getFrequencyData = (0, import_react4.useCallback)(
1713
+ () => state.visualizationData.frequencyData,
1714
+ [state.visualizationData]
1715
+ );
1716
+ const getTimeDomainData = (0, import_react4.useCallback)(
1717
+ () => state.visualizationData.timeDomainData,
1718
+ [state.visualizationData]
1719
+ );
1738
1720
  return {
1739
- isConnected,
1740
- isSessionStarted,
1741
- isSynthesizing,
1742
- isPlaying,
1743
- isPaused,
1744
- error,
1721
+ streamId,
1722
+ isConnected: state.isConnected,
1723
+ isSessionStarted: state.isSessionStarted,
1724
+ isSynthesizing: state.isSynthesizing,
1725
+ isPlaying: state.isPlaying,
1726
+ isPaused: state.isPaused,
1727
+ error: state.error,
1745
1728
  streamText,
1746
- progress,
1729
+ progress: state.progress,
1747
1730
  connect,
1748
1731
  onMessage,
1749
1732
  finishStream,
@@ -1753,23 +1736,9 @@ function useStreamTTS({
1753
1736
  seek,
1754
1737
  getFrequencyData,
1755
1738
  getTimeDomainData,
1756
- visualizationData
1739
+ visualizationData: state.visualizationData
1757
1740
  };
1758
1741
  }
1759
- function getSessionAudioCache(instanceId) {
1760
- return sessionAudioCache.get(instanceId);
1761
- }
1762
- function clearSessionAudioCache(instanceId) {
1763
- sessionAudioCache.delete(instanceId);
1764
- }
1765
- function findSessionCacheByText(streamText, voice, speed) {
1766
- for (const entry of sessionAudioCache.values()) {
1767
- if (entry.streamText === streamText && entry.voice === voice && entry.speed === speed) {
1768
- return entry;
1769
- }
1770
- }
1771
- return void 0;
1772
- }
1773
1742
 
1774
1743
  // src/components/AudioWaveVisualizer.tsx
1775
1744
  var import_react5 = require("react");
@@ -2145,10 +2114,8 @@ var AudioProgressBar_default = AudioProgressBar;
2145
2114
  0 && (module.exports = {
2146
2115
  AudioProgressBar,
2147
2116
  AudioWaveVisualizer,
2117
+ StreamPlaybackManager,
2148
2118
  StreamingTextSplitter,
2149
- clearSessionAudioCache,
2150
- findSessionCacheByText,
2151
- getSessionAudioCache,
2152
2119
  splitTextByDelimiters,
2153
2120
  useMessageTTS,
2154
2121
  useStreamTTS,