@wq-hook/volcano-react 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -396,10 +396,253 @@ function useVolcanoTTS({
396
396
  }
397
397
 
398
398
  // src/tts/useMessageTTS.ts
399
+ import { useCallback as useCallback3, useEffect as useEffect2, useRef as useRef3, useState as useState3 } from "react";
400
+
401
+ // src/tts/StreamPlaybackManager.ts
399
402
  import { WebsocketMSE as WebsocketMSE2 } from "@wq-hook/volcano-sdk/tts";
403
+
404
+ // src/tts/StreamingTextSplitter.ts
400
405
  import { MarkdownFormatter as MarkdownFormatter2 } from "@wq-hook/volcano-sdk";
401
- import { useCallback as useCallback3, useEffect as useEffect2, useRef as useRef3, useState as useState3 } from "react";
402
406
  import emojiRegex2 from "emoji-regex";
407
+ var StreamingTextSplitter = class {
408
+ constructor(options = {}) {
409
+ /** 当前缓冲区 */
410
+ this.buffer = "";
411
+ /** 分段索引计数器 */
412
+ this.segmentIndex = 0;
413
+ /** 已完成的分段列表 */
414
+ this.segments = [];
415
+ /** 是否已完成 */
416
+ this.isCompleted = false;
417
+ this.maxLength = options.maxLength || 150;
418
+ this.minLength = options.minLength || 10;
419
+ this.onSegmentComplete = options.onSegmentComplete;
420
+ this.onAllComplete = options.onAllComplete;
421
+ }
422
+ /**
423
+ * 接收流式文本块
424
+ * @param chunk - 文本块
425
+ */
426
+ onChunk(chunk) {
427
+ if (!chunk || this.isCompleted) return;
428
+ this.buffer += chunk;
429
+ if (this.detectBoundary(chunk)) {
430
+ const newlineIndex = this.buffer.indexOf("\n");
431
+ if (newlineIndex !== -1) {
432
+ if (newlineIndex === 0) {
433
+ this.buffer = this.buffer.substring(1);
434
+ return;
435
+ }
436
+ const segmentBuffer = this.buffer.substring(0, newlineIndex);
437
+ this.buffer = this.buffer.substring(newlineIndex + 1);
438
+ this.flushSegmentWithBuffer(segmentBuffer);
439
+ while (this.buffer.includes("\n")) {
440
+ const nextNewlineIndex = this.buffer.indexOf("\n");
441
+ if (nextNewlineIndex === 0) {
442
+ this.buffer = this.buffer.substring(1);
443
+ continue;
444
+ }
445
+ const nextSegmentBuffer = this.buffer.substring(0, nextNewlineIndex);
446
+ this.buffer = this.buffer.substring(nextNewlineIndex + 1);
447
+ this.flushSegmentWithBuffer(nextSegmentBuffer);
448
+ }
449
+ }
450
+ }
451
+ }
452
+ /**
453
+ * 检测分段边界
454
+ * @param chunk - 最新接收的文本块
455
+ * @returns 是否应该分段
456
+ */
457
+ detectBoundary(chunk) {
458
+ if (chunk.includes("\n")) {
459
+ if (this.buffer.length >= this.maxLength) {
460
+ this.forceSplitAtSentenceBoundary();
461
+ }
462
+ return true;
463
+ }
464
+ if (this.buffer.length >= this.maxLength) {
465
+ this.forceSplitAtSentenceBoundary();
466
+ return true;
467
+ }
468
+ return false;
469
+ }
470
+ /**
471
+ * 在句子边界强制拆分超长段落
472
+ */
473
+ forceSplitAtSentenceBoundary() {
474
+ const content = this.buffer;
475
+ const sentenceEnders = /[。?!]/g;
476
+ let lastMatch = null;
477
+ let match = null;
478
+ while ((match = sentenceEnders.exec(content)) !== null) {
479
+ lastMatch = match;
480
+ }
481
+ if (lastMatch && lastMatch.index > this.minLength) {
482
+ const splitPoint = lastMatch.index + 1;
483
+ const firstPart = content.substring(0, splitPoint);
484
+ const secondPart = content.substring(splitPoint);
485
+ this.buffer = firstPart;
486
+ this.flushSegment();
487
+ this.buffer = secondPart;
488
+ } else {
489
+ const midPoint = Math.floor(content.length / 2);
490
+ const firstPart = content.substring(0, midPoint);
491
+ const secondPart = content.substring(midPoint);
492
+ this.buffer = firstPart;
493
+ this.flushSegment();
494
+ this.buffer = secondPart;
495
+ }
496
+ }
497
+ /**
498
+ * 使用指定缓冲区内容刷新为分段
499
+ * @param bufferToFlush - 要分段的缓冲区内容
500
+ */
501
+ flushSegmentWithBuffer(bufferToFlush) {
502
+ const content = bufferToFlush;
503
+ if (!content) return;
504
+ const isPureSymbols = /^[^\p{L}\p{N}]*$/u.test(content);
505
+ const isTooShort = content.length < 3;
506
+ if (isPureSymbols && isTooShort) {
507
+ return;
508
+ }
509
+ const formattedContent = MarkdownFormatter2.format(content).replace(emojiRegex2(), "");
510
+ if (!formattedContent) return;
511
+ let subSegments = [formattedContent];
512
+ if (formattedContent.length > this.maxLength) {
513
+ subSegments = this.splitLongSegment(formattedContent);
514
+ }
515
+ for (const subSegment of subSegments) {
516
+ if (!subSegment) continue;
517
+ const segment = {
518
+ index: this.segmentIndex++,
519
+ content: subSegment,
520
+ length: subSegment.length,
521
+ sent: false
522
+ };
523
+ this.segments.push(segment);
524
+ this.onSegmentComplete?.(segment);
525
+ }
526
+ }
527
+ /**
528
+ * 刷新当前缓冲区为分段
529
+ */
530
+ flushSegment() {
531
+ const content = this.buffer.trim();
532
+ if (!content) {
533
+ this.buffer = "";
534
+ return;
535
+ }
536
+ const isPureSymbols = /^[^\p{L}\p{N}]*$/u.test(content);
537
+ const isTooShort = content.length < 3;
538
+ if (isPureSymbols && isTooShort) {
539
+ this.buffer = "";
540
+ return;
541
+ }
542
+ const formattedContent = MarkdownFormatter2.format(content).replace(emojiRegex2(), "");
543
+ if (!formattedContent) {
544
+ this.buffer = "";
545
+ return;
546
+ }
547
+ let subSegments = [formattedContent];
548
+ if (formattedContent.length > this.maxLength) {
549
+ subSegments = this.splitLongSegment(formattedContent);
550
+ }
551
+ for (const subSegment of subSegments) {
552
+ if (!subSegment) continue;
553
+ const segment = {
554
+ index: this.segmentIndex++,
555
+ content: subSegment,
556
+ length: subSegment.length,
557
+ sent: false
558
+ };
559
+ this.segments.push(segment);
560
+ this.onSegmentComplete?.(segment);
561
+ }
562
+ this.buffer = "";
563
+ }
564
+ /**
565
+ * 拆分超长分段
566
+ * @param segment - 超长的分段
567
+ * @returns 拆分后的分段数组
568
+ */
569
+ splitLongSegment(segment) {
570
+ const result = [];
571
+ let current = "";
572
+ for (const char of segment) {
573
+ current += char;
574
+ const shouldSplit = /[。?!,,]/.test(char);
575
+ if (shouldSplit && current.length <= this.maxLength) {
576
+ result.push(current);
577
+ current = "";
578
+ } else if (current.length >= this.maxLength) {
579
+ result.push(current);
580
+ current = "";
581
+ }
582
+ }
583
+ if (current) {
584
+ result.push(current);
585
+ }
586
+ return result.filter((s) => s.length > 0);
587
+ }
588
+ /**
589
+ * 完成流式输入
590
+ * 处理剩余的缓冲区内容
591
+ */
592
+ complete() {
593
+ if (this.isCompleted) return;
594
+ this.isCompleted = true;
595
+ while (this.buffer.includes("\n")) {
596
+ const newlineIndex = this.buffer.indexOf("\n");
597
+ if (newlineIndex === 0) {
598
+ this.buffer = this.buffer.substring(1);
599
+ continue;
600
+ }
601
+ const segmentBuffer = this.buffer.substring(0, newlineIndex);
602
+ this.buffer = this.buffer.substring(newlineIndex + 1);
603
+ this.flushSegmentWithBuffer(segmentBuffer);
604
+ }
605
+ if (this.buffer.trim()) {
606
+ this.flushSegment();
607
+ }
608
+ this.onAllComplete?.(this.segments);
609
+ }
610
+ /**
611
+ * 重置分段器状态
612
+ */
613
+ reset() {
614
+ this.buffer = "";
615
+ this.segmentIndex = 0;
616
+ this.segments = [];
617
+ this.isCompleted = false;
618
+ }
619
+ /**
620
+ * 获取当前缓冲区内容
621
+ */
622
+ getBuffer() {
623
+ return this.buffer;
624
+ }
625
+ /**
626
+ * 获取已分段的列表
627
+ */
628
+ getSegments() {
629
+ return this.segments;
630
+ }
631
+ /**
632
+ * 获取统计信息
633
+ */
634
+ getStats() {
635
+ return {
636
+ bufferLength: this.buffer.length,
637
+ segmentCount: this.segments.length,
638
+ totalChars: this.segments.reduce((sum, seg) => sum + seg.length, 0)
639
+ };
640
+ }
641
+ };
642
+
643
+ // src/tts/StreamPlaybackManager.ts
644
+ import emojiRegex3 from "emoji-regex";
645
+ import { MarkdownFormatter as MarkdownFormatter3 } from "@wq-hook/volcano-sdk";
403
646
 
404
647
  // src/tts/TextSplitter.ts
405
648
  function splitTextByDelimiters(text, minLength = 10, maxLength = 150) {
@@ -469,338 +712,178 @@ function splitTextByDelimiters(text, minLength = 10, maxLength = 150) {
469
712
  return segments;
470
713
  }
471
714
 
472
- // src/tts/Metrics.ts
473
- var NoopMetricsCollector = class {
474
- record(_metric) {
475
- }
476
- };
477
-
478
- // src/tts/useMessageTTS.ts
715
+ // src/tts/StreamPlaybackManager.ts
479
716
  var WS_URL = "wss://openspeech.bytedance.com/api/v3/tts/bidirection";
480
- var activeInstances = /* @__PURE__ */ new Map();
481
717
  function buildFullUrl2(url, params) {
482
- const { ...auth } = params;
483
718
  const arr = [];
484
- for (const key in auth) {
485
- if (Object.prototype.hasOwnProperty.call(auth, key)) {
719
+ for (const key in params) {
720
+ if (Object.prototype.hasOwnProperty.call(params, key)) {
486
721
  arr.push(
487
- `${key}=${encodeURIComponent(auth[key])}`
722
+ `${key}=${encodeURIComponent(params[key])}`
488
723
  );
489
724
  }
490
725
  }
491
726
  return `${url}?${arr.join("&")}`;
492
727
  }
493
- function useMessageTTS({
494
- ttsConfig,
495
- audioParams,
496
- autoPlay = true,
497
- metricsCollector = new NoopMetricsCollector(),
498
- onPlayStart,
499
- onPlayPause,
500
- onPlayResume,
501
- onPlayEnd,
502
- onError,
503
- exclusive = true,
504
- fallbackVoice,
505
- visualization
506
- }) {
507
- const [isPlaying, setIsPlaying] = useState3(false);
508
- const [isPaused, setIsPaused] = useState3(false);
509
- const [isSynthesizing, setIsSynthesizing] = useState3(false);
510
- const [error, setErrorState] = useState3(null);
511
- const [progress, setProgress] = useState3(0);
512
- const [visualizationData, setVisualizationData] = useState3(
513
- {
514
- frequencyData: new Uint8Array(0),
515
- timeDomainData: new Uint8Array(0)
516
- }
517
- );
518
- const instanceId = useRef3(
519
- `tts-${Date.now()}-${Math.random().toString(36).slice(2)}`
520
- ).current;
521
- const clientRef = useRef3(null);
522
- const audioRef = useRef3(null);
523
- const audioContextRef = useRef3(null);
524
- const analyserRef = useRef3(null);
525
- const sourceRef = useRef3(null);
526
- const audioUrlRef = useRef3(null);
527
- const cacheKeyRef = useRef3("");
528
- const audioBuffersRef = useRef3([]);
529
- const isFallbackRef = useRef3(false);
530
- const fallbackUtteranceRef = useRef3(null);
531
- const stopOthers = useCallback3(() => {
532
- if (!exclusive) return;
533
- activeInstances.forEach((instance, id) => {
534
- if (id !== instanceId) {
535
- instance.pause();
536
- }
537
- });
538
- }, [exclusive, instanceId]);
539
- const initAudioContext = useCallback3(() => {
540
- if (!audioRef.current) return;
541
- if (!audioContextRef.current) {
728
+ var PlaybackSession = class {
729
+ constructor(id, config) {
730
+ this.listeners = /* @__PURE__ */ new Set();
731
+ this.audioContext = null;
732
+ this.analyser = null;
733
+ this.source = null;
734
+ this.audioUrl = null;
735
+ // TTS Resources
736
+ this.client = null;
737
+ this.splitter = null;
738
+ // Internal State
739
+ this.segmentQueue = [];
740
+ this.isSending = false;
741
+ this.isSessionStarting = false;
742
+ this.streamText = "";
743
+ this.sessionAudioBuffers = [];
744
+ this.isStreamFinished = false;
745
+ this.isSessionFinished = false;
746
+ this.resolveAllSegmentsSent = null;
747
+ this.animId = null;
748
+ this.lastVisUpdate = 0;
749
+ this.id = id;
750
+ this.config = config;
751
+ this.state = {
752
+ isPlaying: false,
753
+ isPaused: false,
754
+ isSynthesizing: false,
755
+ progress: 0,
756
+ visualizationData: {
757
+ frequencyData: new Uint8Array(0),
758
+ timeDomainData: new Uint8Array(0)
759
+ },
760
+ error: null,
761
+ isConnected: false,
762
+ isSessionStarted: false,
763
+ isStreamFinished: false
764
+ };
765
+ this.audio = new Audio();
766
+ this.audio.crossOrigin = "anonymous";
767
+ this.setupAudioListeners();
768
+ }
769
+ /**
770
+ * 初始化 AudioContext(用于可视化)
771
+ */
772
+ initAudioContext() {
773
+ if (!this.audioContext) {
542
774
  const AudioContextClass = window.AudioContext || window.webkitAudioContext;
543
- audioContextRef.current = new AudioContextClass();
775
+ this.audioContext = new AudioContextClass();
544
776
  }
545
- if (audioContextRef.current.state === "suspended") {
546
- audioContextRef.current.resume();
777
+ if (this.audioContext.state === "suspended") {
778
+ this.audioContext.resume();
547
779
  }
548
- if (!analyserRef.current) {
549
- analyserRef.current = audioContextRef.current.createAnalyser();
550
- analyserRef.current.fftSize = visualization?.fftSize || 256;
780
+ if (!this.analyser && this.audioContext) {
781
+ this.analyser = this.audioContext.createAnalyser();
782
+ this.analyser.fftSize = this.config.visualization?.fftSize || 256;
551
783
  }
552
- if (!sourceRef.current) {
784
+ if (!this.source && this.audioContext && this.analyser) {
553
785
  try {
554
- sourceRef.current = audioContextRef.current.createMediaElementSource(
555
- audioRef.current
556
- );
557
- sourceRef.current.connect(analyserRef.current);
558
- analyserRef.current.connect(audioContextRef.current.destination);
786
+ this.source = this.audioContext.createMediaElementSource(this.audio);
787
+ this.source.connect(this.analyser);
788
+ this.analyser.connect(this.audioContext.destination);
559
789
  } catch (e) {
560
790
  }
561
791
  }
562
- }, []);
563
- const cleanupAudio = useCallback3(() => {
564
- if (audioUrlRef.current) {
565
- URL.revokeObjectURL(audioUrlRef.current);
566
- audioUrlRef.current = null;
567
- }
568
- if (audioRef.current) {
569
- audioRef.current.onerror = null;
570
- audioRef.current.onended = null;
571
- audioRef.current.onpause = null;
572
- audioRef.current.onplay = null;
573
- audioRef.current.ontimeupdate = null;
574
- audioRef.current.pause();
575
- audioRef.current.src = "";
576
- audioRef.current = null;
577
- }
578
- if (sourceRef.current) {
579
- try {
580
- sourceRef.current.disconnect();
581
- } catch (e) {
792
+ }
793
+ setupAudioListeners() {
794
+ this.audio.onplay = () => {
795
+ this.updateState({ isPlaying: true, isPaused: false });
796
+ this.config.onPlayStart?.();
797
+ this.initAudioContext();
798
+ this.startVisualizationLoop();
799
+ };
800
+ this.audio.onpause = () => {
801
+ this.updateState({ isPaused: true, isPlaying: false });
802
+ this.config.onPlayPause?.();
803
+ };
804
+ this.audio.onended = () => {
805
+ this.updateState({
806
+ isPlaying: false,
807
+ isPaused: false,
808
+ isSynthesizing: false,
809
+ progress: 100
810
+ });
811
+ this.config.onPlayEnd?.();
812
+ this.stopVisualizationLoop();
813
+ };
814
+ this.audio.onerror = (e) => {
815
+ const msg = this.audio.error?.message || "Audio playback error";
816
+ console.error("[PlaybackSession] Audio error:", msg);
817
+ this.updateState({ error: msg });
818
+ this.config.onError?.(new Error(msg));
819
+ };
820
+ this.audio.ontimeupdate = () => {
821
+ let duration = this.audio.duration;
822
+ if (!isFinite(duration) && this.audio.buffered.length > 0) {
823
+ duration = this.audio.buffered.end(this.audio.buffered.length - 1);
582
824
  }
583
- sourceRef.current = null;
584
- }
585
- if (fallbackUtteranceRef.current) {
586
- window.speechSynthesis.cancel();
587
- fallbackUtteranceRef.current = null;
588
- }
589
- isFallbackRef.current = false;
590
- }, []);
591
- const stop = useCallback3(() => {
592
- if (clientRef.current) {
593
- clientRef.current.close();
594
- clientRef.current = null;
595
- }
596
- cleanupAudio();
597
- setIsPlaying(false);
598
- setIsPaused(false);
599
- setIsSynthesizing(false);
600
- setProgress(0);
601
- activeInstances.delete(instanceId);
602
- }, [cleanupAudio, instanceId]);
603
- const pause = useCallback3(() => {
604
- if (isFallbackRef.current) {
605
- window.speechSynthesis.pause();
606
- } else if (audioRef.current) {
607
- audioRef.current.pause();
608
- }
609
- setIsPaused(true);
610
- setIsPlaying(false);
611
- onPlayPause?.();
612
- }, [onPlayPause]);
613
- const resume = useCallback3(() => {
614
- stopOthers();
615
- if (isFallbackRef.current) {
616
- window.speechSynthesis.resume();
617
- } else if (audioRef.current) {
618
- audioRef.current.play();
619
- }
620
- setIsPaused(false);
621
- setIsPlaying(true);
622
- onPlayResume?.();
623
- activeInstances.set(instanceId, { pause });
624
- }, [stopOthers, instanceId, pause, onPlayResume]);
625
- const togglePlay = useCallback3(() => {
626
- if (isPlaying) {
627
- pause();
628
- } else {
629
- resume();
630
- }
631
- }, [isPlaying, pause, resume]);
632
- const playFallback = useCallback3(
633
- (text) => {
634
- console.warn("[useMessageTTS] Switching to fallback TTS");
635
- isFallbackRef.current = true;
636
- if (clientRef.current) {
637
- clientRef.current.close();
638
- clientRef.current = null;
825
+ if (isFinite(duration) && duration > 0) {
826
+ const progress = this.audio.currentTime / duration * 100;
827
+ this.updateState({ progress });
639
828
  }
640
- if (audioRef.current) {
641
- audioRef.current.pause();
642
- audioRef.current = null;
829
+ };
830
+ }
831
+ /**
832
+ * 建立 WebSocket 连接
833
+ */
834
+ async connect() {
835
+ if (this.state.isConnected) return;
836
+ this.updateState({
837
+ error: null,
838
+ progress: 0,
839
+ isSynthesizing: false,
840
+ isConnected: false,
841
+ isSessionStarted: false
842
+ });
843
+ this.streamText = "";
844
+ this.segmentQueue = [];
845
+ this.sessionAudioBuffers = [];
846
+ this.isStreamFinished = false;
847
+ this.isSessionFinished = false;
848
+ this.isSessionStarting = false;
849
+ if (this.client) {
850
+ this.client.close();
851
+ this.client = null;
852
+ }
853
+ this.splitter = new StreamingTextSplitter({
854
+ maxLength: this.config.maxSegmentLength || 150,
855
+ onSegmentComplete: (segment) => {
856
+ this.segmentQueue.push(segment);
857
+ if (this.state.isSessionStarted) {
858
+ this.processQueue();
859
+ }
643
860
  }
644
- const utterance = new SpeechSynthesisUtterance(text);
645
- utterance.rate = audioParams?.speech_rate || 1;
646
- const voices = window.speechSynthesis.getVoices();
647
- const zhVoice = voices.find((v) => v.lang.includes("zh"));
648
- if (zhVoice) utterance.voice = zhVoice;
649
- utterance.onstart = () => {
650
- setIsPlaying(true);
651
- setIsPaused(false);
652
- setIsSynthesizing(false);
653
- onPlayStart?.();
654
- activeInstances.set(instanceId, { pause });
655
- };
656
- utterance.onend = () => {
657
- setIsPlaying(false);
658
- setIsPaused(false);
659
- activeInstances.delete(instanceId);
660
- onPlayEnd?.();
661
- };
662
- utterance.onerror = (e) => {
663
- console.error("[useMessageTTS] Fallback TTS failed", e);
664
- setErrorState("Fallback TTS failed");
665
- onError?.(new Error("Fallback TTS failed"));
666
- setIsPlaying(false);
667
- };
668
- fallbackUtteranceRef.current = utterance;
669
- window.speechSynthesis.speak(utterance);
670
- },
671
- [audioParams, instanceId, onError, onPlayEnd, onPlayStart, pause]
672
- );
673
- const executeTTS = useCallback3(
674
- async (text, targetVoice) => {
675
- stop();
676
- stopOthers();
677
- setErrorState(null);
678
- setIsSynthesizing(true);
679
- setProgress(0);
680
- audioBuffersRef.current = [];
681
- isFallbackRef.current = false;
682
- const speed = audioParams?.speech_rate || 0;
683
- const voice = targetVoice;
684
- const cacheKey = TTSCache.generateKey(text, voice, speed);
685
- cacheKeyRef.current = cacheKey;
686
- const startTime = Date.now();
687
- metricsCollector.record({
688
- name: "tts_request",
689
- labels: { voice, speed, text_length: text.length },
690
- value: 1,
691
- timestamp: startTime
692
- });
861
+ });
862
+ this.client = WebsocketMSE2({ autoStartSession: false });
863
+ const { ttsConfig, audioParams } = this.config;
864
+ const voice = audioParams?.speaker || "zh_female_vv_uranus_bigtts";
865
+ const startTime = Date.now();
866
+ this.config.metricsCollector?.record({
867
+ name: "tts_request",
868
+ labels: { voice, text_length: 0 },
869
+ value: 1,
870
+ timestamp: startTime
871
+ });
872
+ return new Promise((resolve, reject) => {
873
+ const timeoutId = setTimeout(() => {
874
+ const err = new Error("WebSocket connection timeout (10s)");
875
+ this.updateState({ error: err.message });
876
+ reject(err);
877
+ }, 1e4);
693
878
  try {
694
- const cachedData = await TTSCache.get(cacheKey);
695
- const audio = new Audio();
696
- audio.crossOrigin = "anonymous";
697
- audioRef.current = audio;
698
- audio.onplay = () => {
699
- setIsPlaying(true);
700
- setIsPaused(false);
701
- onPlayStart?.();
702
- initAudioContext();
703
- activeInstances.set(instanceId, { pause });
704
- metricsCollector.record({
705
- name: "tts_latency",
706
- labels: { stage: "playback", voice, speed },
707
- value: Date.now() - startTime,
708
- timestamp: Date.now()
709
- });
710
- };
711
- audio.onpause = () => {
712
- if (!audio.ended) {
713
- }
714
- };
715
- audio.onended = () => {
716
- setIsPlaying(false);
717
- setIsPaused(false);
718
- onPlayEnd?.();
719
- activeInstances.delete(instanceId);
720
- };
721
- audio.onerror = (e) => {
722
- console.error("Audio playback error:", e, audio.error);
723
- metricsCollector.record({
724
- name: "tts_error",
725
- labels: {
726
- error_code: "playback_error",
727
- voice,
728
- detail: audio.error?.message || String(audio.error?.code)
729
- },
730
- value: 1,
731
- timestamp: Date.now()
732
- });
733
- handleError(text, voice);
734
- };
735
- audio.ontimeupdate = () => {
736
- let duration = audio.duration;
737
- if (!isFinite(duration)) {
738
- if (audio.buffered.length > 0) {
739
- duration = audio.buffered.end(audio.buffered.length - 1);
740
- }
741
- }
742
- if (isFinite(duration) && duration > 0) {
743
- setProgress(audio.currentTime / duration * 100);
744
- }
745
- };
746
- if (cachedData) {
747
- const totalSize = cachedData.reduce(
748
- (acc, buf) => acc + buf.byteLength,
749
- 0
750
- );
751
- metricsCollector.record({
752
- name: "tts_cache_hit",
753
- labels: { voice, speed },
754
- value: 1,
755
- timestamp: Date.now()
756
- });
757
- console.log(
758
- JSON.stringify({
759
- event: "tts_cache_hit",
760
- cache_hit: true,
761
- text_len: text.length,
762
- voice,
763
- speed,
764
- data_size: totalSize
765
- })
766
- );
767
- if (totalSize === 0) {
768
- console.warn(
769
- "[useMessageTTS] Cached data is empty, falling back to stream"
770
- );
771
- } else {
772
- const blob = new Blob(cachedData, { type: "audio/mpeg" });
773
- const url2 = URL.createObjectURL(blob);
774
- audioUrlRef.current = url2;
775
- audio.src = url2;
776
- setIsSynthesizing(false);
777
- if (autoPlay) {
778
- try {
779
- await audio.play();
780
- } catch (err) {
781
- console.warn("AutoPlay blocked", err);
782
- }
783
- }
784
- return;
785
- }
786
- }
787
- console.log("[useMessageTTS] Cache miss, starting stream");
788
- clientRef.current = WebsocketMSE2({ autoStartSession: true });
789
- const formattedText = MarkdownFormatter2.format(text).replace(
790
- emojiRegex2(),
791
- ""
792
- );
793
- const segments = splitTextByDelimiters(formattedText);
794
- const url = clientRef.current.start({
879
+ const url = this.client.start({
795
880
  url: buildFullUrl2(WS_URL, {
796
881
  api_access_key: `Jwt; ${ttsConfig.token}`,
797
882
  api_app_key: ttsConfig.appid,
798
883
  api_resource_id: ttsConfig.resourceId || "seed-tts-2.0"
799
884
  }),
800
885
  config: {
801
- user: {
802
- uid: `req-${Date.now()}`
803
- },
886
+ user: { uid: `req-${Date.now()}` },
804
887
  namespace: ttsConfig.namespace || "BidirectionalTTS",
805
888
  req_params: {
806
889
  speaker: voice,
@@ -815,456 +898,362 @@ function useMessageTTS({
815
898
  enable_language_detector: true,
816
899
  disable_markdown_filter: true,
817
900
  enable_latex_tn: true
818
- // max_length_to_filter_parenthesis: 100,
819
901
  })
820
902
  }
821
903
  },
904
+ onStart: () => {
905
+ this.updateState({ isConnected: true });
906
+ },
907
+ onConnectionReady: () => {
908
+ clearTimeout(timeoutId);
909
+ resolve();
910
+ },
822
911
  onSessionStarted: () => {
823
- segments.forEach((seg) => {
824
- clientRef.current?.sendText(seg.content);
825
- });
826
- clientRef.current?.finishSession();
912
+ this.updateState({ isSessionStarted: true });
913
+ this.isSessionStarting = false;
914
+ if (this.segmentQueue.length > 0) {
915
+ this.processQueue();
916
+ }
827
917
  },
828
918
  onMessage: (data) => {
829
- if (audioBuffersRef.current.length === 0) {
830
- console.log(
831
- JSON.stringify({
832
- event: "tts_first_packet",
833
- latency_ms: Date.now() - startTime,
834
- voice
835
- })
836
- );
919
+ this.updateState({ isSynthesizing: true });
920
+ if (this.sessionAudioBuffers.length === 0) {
921
+ this.config.metricsCollector?.record({
922
+ name: "tts_latency",
923
+ labels: { stage: "first_packet", voice },
924
+ value: Date.now() - startTime,
925
+ timestamp: Date.now()
926
+ });
837
927
  }
838
928
  const buffer = data instanceof ArrayBuffer ? data.slice(0) : new Uint8Array(data).buffer;
839
- audioBuffersRef.current.push(buffer);
929
+ this.sessionAudioBuffers.push(buffer);
840
930
  },
841
931
  onSessionFinished: () => {
842
- setIsSynthesizing(false);
843
- if (audioBuffersRef.current.length > 0) {
844
- TTSCache.set(cacheKey, [...audioBuffersRef.current]);
845
- }
846
- console.log(
847
- JSON.stringify({
848
- event: "tts_synthesis_finished",
849
- cache_hit: false,
850
- text_len: text.length,
851
- duration_ms: Date.now() - startTime,
932
+ this.updateState({
933
+ isSynthesizing: false,
934
+ isSessionStarted: false
935
+ });
936
+ if (this.sessionAudioBuffers.length > 0 && this.streamText) {
937
+ const speed = audioParams?.speech_rate || 0;
938
+ const cacheKey = TTSCache.generateKey(
939
+ this.streamText,
852
940
  voice,
853
941
  speed
854
- })
855
- );
942
+ );
943
+ TTSCache.set(cacheKey, [...this.sessionAudioBuffers]);
944
+ }
945
+ this.config.metricsCollector?.record({
946
+ name: "tts_synthesis_finished",
947
+ labels: { voice, text_length: this.streamText.length },
948
+ value: Date.now() - startTime,
949
+ timestamp: Date.now()
950
+ });
856
951
  },
857
952
  onError: (err) => {
858
- console.error("TTS Synthesis error:", err);
859
- metricsCollector.record({
860
- name: "tts_error",
861
- labels: { error_code: "synthesis_error", voice },
862
- value: 1,
863
- timestamp: Date.now()
953
+ if (!this.state.isConnected) {
954
+ clearTimeout(timeoutId);
955
+ reject(new Error(err.msg || "TTS error"));
956
+ }
957
+ console.error("[PlaybackSession] TTS error:", err);
958
+ this.updateState({
959
+ error: err.msg || "TTS error",
960
+ isSynthesizing: false
864
961
  });
865
- handleError(text, voice);
866
- setIsSynthesizing(false);
962
+ this.config.onError?.(new Error(err.msg || "TTS error"));
963
+ },
964
+ onWSError: (err) => {
965
+ if (!this.state.isConnected) {
966
+ clearTimeout(timeoutId);
967
+ reject(err instanceof Error ? err : new Error("WebSocket error"));
968
+ }
867
969
  }
868
970
  });
869
- audioUrlRef.current = url;
870
- audio.src = url;
871
- if (autoPlay) {
872
- try {
873
- await audio.play();
874
- } catch (e) {
875
- console.warn("Autoplay blocked/pending", e);
876
- }
971
+ if (this.audioUrl) {
972
+ URL.revokeObjectURL(this.audioUrl);
973
+ }
974
+ this.audioUrl = url;
975
+ this.audio.src = url;
976
+ if (this.config.autoPlay !== false) {
977
+ this.audio.play().catch(
978
+ (e) => console.warn("[PlaybackSession] Autoplay blocked:", e)
979
+ );
877
980
  }
878
981
  } catch (err) {
879
- console.error("Unexpected error in executeTTS:", err);
880
- metricsCollector.record({
881
- name: "tts_error",
882
- labels: { error_code: "unexpected_error", voice },
883
- value: 1,
884
- timestamp: Date.now()
885
- });
886
- handleError(text, voice);
887
- }
888
- },
889
- [
890
- ttsConfig,
891
- audioParams,
892
- autoPlay,
893
- stop,
894
- stopOthers,
895
- instanceId,
896
- onPlayStart,
897
- onPlayEnd,
898
- initAudioContext,
899
- pause,
900
- fallbackVoice,
901
- metricsCollector
902
- ]
903
- );
904
- const handleError = useCallback3(
905
- (text, failedVoice) => {
906
- if (fallbackVoice && failedVoice !== fallbackVoice) {
907
- console.warn(
908
- `[useMessageTTS] Voice ${failedVoice} failed, switching to fallback voice ${fallbackVoice}`
982
+ clearTimeout(timeoutId);
983
+ console.error("[PlaybackSession] Connect error:", err);
984
+ this.updateState({ error: String(err) });
985
+ this.config.onError?.(
986
+ err instanceof Error ? err : new Error(String(err))
909
987
  );
910
- if (clientRef.current) {
911
- clientRef.current.close();
912
- clientRef.current = null;
913
- }
914
- if (audioRef.current) {
915
- audioRef.current.pause();
916
- audioRef.current = null;
988
+ reject(err);
989
+ }
990
+ });
991
+ }
992
+ /**
993
+ * 发送流式文本
994
+ */
995
+ handleStreamChunk(chunk) {
996
+ if (!chunk) return;
997
+ this.streamText += chunk;
998
+ if (!this.state.isSessionStarted && !this.isSessionStarting && this.client && this.state.isConnected && !this.isSessionFinished) {
999
+ this.isSessionStarting = true;
1000
+ this.client.startSession();
1001
+ }
1002
+ this.splitter?.onChunk(chunk);
1003
+ if (this.state.isSessionStarted) {
1004
+ this.processQueue();
1005
+ }
1006
+ }
1007
+ /**
1008
+ * 结束流式输入
1009
+ */
1010
+ async finishStream() {
1011
+ this.isStreamFinished = true;
1012
+ this.updateState({ isStreamFinished: true });
1013
+ this.splitter?.complete();
1014
+ if (this.state.isSessionStarted) {
1015
+ this.processQueue();
1016
+ }
1017
+ if (this.segmentQueue.length > 0 || this.isSending) {
1018
+ await new Promise((resolve) => {
1019
+ this.resolveAllSegmentsSent = resolve;
1020
+ });
1021
+ } else if (this.client && this.state.isSessionStarted && !this.isSessionFinished) {
1022
+ this.isSessionFinished = true;
1023
+ this.client.finishSession();
1024
+ }
1025
+ }
1026
+ /**
1027
+ * 处理非流式播放(直接播放整段文本)
1028
+ */
1029
+ async play(text) {
1030
+ const formattedText = MarkdownFormatter3.format(text).replace(
1031
+ emojiRegex3(),
1032
+ ""
1033
+ );
1034
+ const { audioParams } = this.config;
1035
+ const voice = audioParams?.speaker || "zh_female_vv_uranus_bigtts";
1036
+ const speed = audioParams?.speech_rate || 0;
1037
+ const cacheKey = TTSCache.generateKey(formattedText, voice, speed);
1038
+ const cachedData = await TTSCache.get(cacheKey);
1039
+ if (cachedData && cachedData.length > 0) {
1040
+ const blob = new Blob(cachedData, { type: "audio/mpeg" });
1041
+ const url = URL.createObjectURL(blob);
1042
+ if (this.audioUrl) URL.revokeObjectURL(this.audioUrl);
1043
+ this.audioUrl = url;
1044
+ this.audio.src = url;
1045
+ this.updateState({ isSynthesizing: false });
1046
+ if (this.config.autoPlay !== false) {
1047
+ try {
1048
+ await this.audio.play();
1049
+ } catch (e) {
1050
+ console.warn("Autoplay blocked", e);
917
1051
  }
918
- executeTTS(text, fallbackVoice);
919
- } else {
920
- playFallback(text);
921
1052
  }
922
- },
923
- [fallbackVoice, executeTTS, playFallback]
924
- );
925
- const play = useCallback3(
926
- (text) => {
927
- const voice = audioParams?.speaker || "zh_female_vv_uranus_bigtts";
928
- return executeTTS(text, voice);
929
- },
930
- [audioParams, executeTTS]
931
- );
932
- const getFrequencyData = useCallback3(() => {
933
- if (!analyserRef.current) return new Uint8Array(0);
934
- const dataArray = new Uint8Array(analyserRef.current.frequencyBinCount);
935
- analyserRef.current.getByteFrequencyData(dataArray);
936
- return dataArray;
937
- }, []);
938
- const getTimeDomainData = useCallback3(() => {
939
- if (!analyserRef.current) return new Uint8Array(0);
940
- const dataArray = new Uint8Array(analyserRef.current.frequencyBinCount);
941
- analyserRef.current.getByteTimeDomainData(dataArray);
942
- return dataArray;
943
- }, []);
944
- useEffect2(() => {
945
- if (!visualization?.enabled) return;
946
- let animId;
947
- let lastUpdate = 0;
948
- const interval = visualization.refreshInterval || 0;
1053
+ return;
1054
+ }
1055
+ await this.connect();
1056
+ this.streamText = formattedText;
1057
+ const segments = splitTextByDelimiters(formattedText);
1058
+ if (this.state.isConnected) {
1059
+ if (!this.state.isSessionStarted && !this.isSessionStarting) {
1060
+ this.isSessionStarting = true;
1061
+ this.client?.startSession();
1062
+ }
1063
+ }
1064
+ segments.forEach((seg, idx) => {
1065
+ this.segmentQueue.push({
1066
+ index: idx,
1067
+ content: seg.content,
1068
+ length: seg.content.length,
1069
+ sent: false
1070
+ });
1071
+ });
1072
+ if (this.state.isSessionStarted) {
1073
+ this.processQueue();
1074
+ }
1075
+ await this.finishStream();
1076
+ }
1077
+ processQueue() {
1078
+ if (!this.client || !this.state.isSessionStarted || this.isSending || this.isSessionFinished) {
1079
+ return;
1080
+ }
1081
+ if (this.segmentQueue.length === 0) {
1082
+ if (this.isStreamFinished && !this.isSessionFinished) {
1083
+ this.isSessionFinished = true;
1084
+ this.client.finishSession();
1085
+ this.resolveAllSegmentsSent?.();
1086
+ }
1087
+ return;
1088
+ }
1089
+ this.isSending = true;
1090
+ const segment = this.segmentQueue.shift();
1091
+ this.client.sendText(segment.content);
1092
+ segment.sent = true;
1093
+ this.isSending = false;
1094
+ setTimeout(() => this.processQueue(), 0);
1095
+ }
1096
+ pause() {
1097
+ this.audio.pause();
1098
+ this.updateState({ isPaused: true, isPlaying: false });
1099
+ }
1100
+ resume() {
1101
+ this.audio.play();
1102
+ this.updateState({ isPaused: false, isPlaying: true });
1103
+ }
1104
+ stop() {
1105
+ if (this.client) {
1106
+ this.client.close();
1107
+ this.client = null;
1108
+ }
1109
+ this.audio.pause();
1110
+ this.audio.currentTime = 0;
1111
+ if (this.audioUrl) {
1112
+ URL.revokeObjectURL(this.audioUrl);
1113
+ this.audioUrl = null;
1114
+ }
1115
+ this.stopVisualizationLoop();
1116
+ this.audioContext?.close();
1117
+ this.audioContext = null;
1118
+ this.updateState({
1119
+ isPlaying: false,
1120
+ isPaused: false,
1121
+ isSynthesizing: false,
1122
+ progress: 0,
1123
+ isConnected: false,
1124
+ isSessionStarted: false
1125
+ });
1126
+ }
1127
+ seek(percentage) {
1128
+ let duration = this.audio.duration;
1129
+ if (!isFinite(duration) && this.audio.buffered.length > 0) {
1130
+ duration = this.audio.buffered.end(this.audio.buffered.length - 1);
1131
+ }
1132
+ if (isFinite(duration) && duration > 0) {
1133
+ const time = percentage / 100 * duration;
1134
+ if (isFinite(time)) {
1135
+ this.audio.currentTime = time;
1136
+ this.updateState({ progress: percentage });
1137
+ }
1138
+ }
1139
+ }
1140
+ updateState(partial) {
1141
+ this.state = { ...this.state, ...partial };
1142
+ this.notifyListeners();
1143
+ }
1144
+ subscribe(listener) {
1145
+ this.listeners.add(listener);
1146
+ listener(this.state);
1147
+ return () => this.listeners.delete(listener);
1148
+ }
1149
+ notifyListeners() {
1150
+ this.listeners.forEach((l) => l(this.state));
1151
+ }
1152
+ // Visualization
1153
+ getFrequencyData() {
1154
+ if (!this.analyser) return new Uint8Array(0);
1155
+ const data = new Uint8Array(this.analyser.frequencyBinCount);
1156
+ this.analyser.getByteFrequencyData(data);
1157
+ return data;
1158
+ }
1159
+ getTimeDomainData() {
1160
+ if (!this.analyser) return new Uint8Array(0);
1161
+ const data = new Uint8Array(this.analyser.frequencyBinCount);
1162
+ this.analyser.getByteTimeDomainData(data);
1163
+ return data;
1164
+ }
1165
+ startVisualizationLoop() {
1166
+ if (!this.config.visualization?.enabled) return;
949
1167
  const update = (timestamp) => {
950
- if (isPlaying && !isPaused) {
951
- if (timestamp - lastUpdate >= interval) {
952
- setVisualizationData({
953
- frequencyData: getFrequencyData(),
954
- timeDomainData: getTimeDomainData()
1168
+ if (this.state.isPlaying && !this.state.isPaused) {
1169
+ if (timestamp - this.lastVisUpdate >= (this.config.visualization?.refreshInterval || 0)) {
1170
+ this.updateState({
1171
+ visualizationData: {
1172
+ frequencyData: this.getFrequencyData(),
1173
+ timeDomainData: this.getTimeDomainData()
1174
+ }
955
1175
  });
956
- lastUpdate = timestamp;
1176
+ this.lastVisUpdate = timestamp;
957
1177
  }
958
- animId = requestAnimationFrame(update);
1178
+ this.animId = requestAnimationFrame(update);
959
1179
  }
960
1180
  };
961
- if (isPlaying && !isPaused) {
962
- animId = requestAnimationFrame(update);
1181
+ this.animId = requestAnimationFrame(update);
1182
+ }
1183
+ stopVisualizationLoop() {
1184
+ if (this.animId) {
1185
+ cancelAnimationFrame(this.animId);
1186
+ this.animId = null;
963
1187
  }
964
- return () => {
965
- if (animId) cancelAnimationFrame(animId);
966
- };
967
- }, [isPlaying, isPaused, visualization, getFrequencyData, getTimeDomainData]);
968
- useEffect2(() => {
969
- return () => {
970
- stop();
971
- if (audioContextRef.current) {
972
- audioContextRef.current.close();
973
- }
974
- };
975
- }, [stop]);
976
- const seek = useCallback3((percentage) => {
977
- if (audioRef.current) {
978
- let duration = audioRef.current.duration;
979
- if (!isFinite(duration) && audioRef.current.buffered.length > 0) {
980
- duration = audioRef.current.buffered.end(
981
- audioRef.current.buffered.length - 1
982
- );
983
- }
984
- if (isFinite(duration) && duration > 0) {
985
- const time = percentage / 100 * duration;
986
- if (isFinite(time)) {
987
- audioRef.current.currentTime = time;
988
- setProgress(percentage);
989
- }
1188
+ }
1189
+ };
1190
+ var StreamPlaybackManagerImpl = class {
1191
+ constructor() {
1192
+ this.sessions = /* @__PURE__ */ new Map();
1193
+ this.activeStreamId = null;
1194
+ }
1195
+ /**
1196
+ * 创建新的播放会话
1197
+ */
1198
+ createSession(id, config) {
1199
+ if (this.activeStreamId && this.activeStreamId !== id) {
1200
+ this.pause(this.activeStreamId);
1201
+ }
1202
+ const session = new PlaybackSession(id, config);
1203
+ this.sessions.set(id, session);
1204
+ this.activeStreamId = id;
1205
+ return session;
1206
+ }
1207
+ /**
1208
+ * 获取会话
1209
+ */
1210
+ getSession(id) {
1211
+ return this.sessions.get(id);
1212
+ }
1213
+ /**
1214
+ * 停止会话
1215
+ */
1216
+ stop(id) {
1217
+ const session = this.sessions.get(id);
1218
+ if (session) {
1219
+ session.stop();
1220
+ this.sessions.delete(id);
1221
+ if (this.activeStreamId === id) {
1222
+ this.activeStreamId = null;
990
1223
  }
991
1224
  }
992
- }, []);
993
- return {
994
- isPlaying,
995
- isPaused,
996
- isSynthesizing,
997
- error,
998
- play,
999
- pause,
1000
- resume,
1001
- stop,
1002
- togglePlay,
1003
- seek,
1004
- progress,
1005
- getFrequencyData,
1006
- getTimeDomainData,
1007
- visualizationData
1008
- };
1009
- }
1010
-
1011
- // src/tts/useStreamTTS.ts
1012
- import { WebsocketMSE as WebsocketMSE3 } from "@wq-hook/volcano-sdk/tts";
1013
- import { useCallback as useCallback4, useEffect as useEffect3, useRef as useRef4, useState as useState4 } from "react";
1014
-
1015
- // src/tts/StreamingTextSplitter.ts
1016
- import { MarkdownFormatter as MarkdownFormatter3 } from "@wq-hook/volcano-sdk";
1017
- import emojiRegex3 from "emoji-regex";
1018
- var StreamingTextSplitter = class {
1019
- constructor(options = {}) {
1020
- /** 当前缓冲区 */
1021
- this.buffer = "";
1022
- /** 分段索引计数器 */
1023
- this.segmentIndex = 0;
1024
- /** 已完成的分段列表 */
1025
- this.segments = [];
1026
- /** 是否已完成 */
1027
- this.isCompleted = false;
1028
- this.maxLength = options.maxLength || 150;
1029
- this.minLength = options.minLength || 10;
1030
- this.onSegmentComplete = options.onSegmentComplete;
1031
- this.onAllComplete = options.onAllComplete;
1032
- }
1033
- /**
1034
- * 接收流式文本块
1035
- * @param chunk - 文本块
1036
- */
1037
- onChunk(chunk) {
1038
- if (!chunk || this.isCompleted) return;
1039
- this.buffer += chunk;
1040
- if (this.detectBoundary(chunk)) {
1041
- const newlineIndex = this.buffer.indexOf("\n");
1042
- if (newlineIndex !== -1) {
1043
- if (newlineIndex === 0) {
1044
- this.buffer = this.buffer.substring(1);
1045
- return;
1046
- }
1047
- const segmentBuffer = this.buffer.substring(0, newlineIndex);
1048
- this.buffer = this.buffer.substring(newlineIndex + 1);
1049
- this.flushSegmentWithBuffer(segmentBuffer);
1050
- while (this.buffer.includes("\n")) {
1051
- const nextNewlineIndex = this.buffer.indexOf("\n");
1052
- if (nextNewlineIndex === 0) {
1053
- this.buffer = this.buffer.substring(1);
1054
- continue;
1055
- }
1056
- const nextSegmentBuffer = this.buffer.substring(0, nextNewlineIndex);
1057
- this.buffer = this.buffer.substring(nextNewlineIndex + 1);
1058
- this.flushSegmentWithBuffer(nextSegmentBuffer);
1059
- }
1060
- }
1061
- }
1062
- }
1063
- /**
1064
- * 检测分段边界
1065
- * @param chunk - 最新接收的文本块
1066
- * @returns 是否应该分段
1067
- */
1068
- detectBoundary(chunk) {
1069
- if (chunk.includes("\n")) {
1070
- if (this.buffer.length >= this.maxLength) {
1071
- this.forceSplitAtSentenceBoundary();
1072
- }
1073
- return true;
1074
- }
1075
- if (this.buffer.length >= this.maxLength) {
1076
- this.forceSplitAtSentenceBoundary();
1077
- return true;
1078
- }
1079
- return false;
1080
1225
  }
1081
1226
  /**
1082
- * 在句子边界强制拆分超长段落
1227
+ * 暂停会话
1083
1228
  */
1084
- forceSplitAtSentenceBoundary() {
1085
- const content = this.buffer;
1086
- const sentenceEnders = /[。?!]/g;
1087
- let lastMatch = null;
1088
- let match = null;
1089
- while ((match = sentenceEnders.exec(content)) !== null) {
1090
- lastMatch = match;
1091
- }
1092
- if (lastMatch && lastMatch.index > this.minLength) {
1093
- const splitPoint = lastMatch.index + 1;
1094
- const firstPart = content.substring(0, splitPoint);
1095
- const secondPart = content.substring(splitPoint);
1096
- this.buffer = firstPart;
1097
- this.flushSegment();
1098
- this.buffer = secondPart;
1099
- } else {
1100
- const midPoint = Math.floor(content.length / 2);
1101
- const firstPart = content.substring(0, midPoint);
1102
- const secondPart = content.substring(midPoint);
1103
- this.buffer = firstPart;
1104
- this.flushSegment();
1105
- this.buffer = secondPart;
1106
- }
1229
+ pause(id) {
1230
+ this.sessions.get(id)?.pause();
1107
1231
  }
1108
1232
  /**
1109
- * 使用指定缓冲区内容刷新为分段
1110
- * @param bufferToFlush - 要分段的缓冲区内容
1233
+ * 恢复会话
1111
1234
  */
1112
- flushSegmentWithBuffer(bufferToFlush) {
1113
- const content = bufferToFlush;
1114
- if (!content) return;
1115
- const isPureSymbols = /^[^\p{L}\p{N}]*$/u.test(content);
1116
- const isTooShort = content.length < 3;
1117
- if (isPureSymbols && isTooShort) {
1118
- return;
1119
- }
1120
- const formattedContent = MarkdownFormatter3.format(content).replace(emojiRegex3(), "");
1121
- if (!formattedContent) return;
1122
- let subSegments = [formattedContent];
1123
- if (formattedContent.length > this.maxLength) {
1124
- subSegments = this.splitLongSegment(formattedContent);
1125
- }
1126
- for (const subSegment of subSegments) {
1127
- if (!subSegment) continue;
1128
- const segment = {
1129
- index: this.segmentIndex++,
1130
- content: subSegment,
1131
- length: subSegment.length,
1132
- sent: false
1133
- };
1134
- this.segments.push(segment);
1135
- this.onSegmentComplete?.(segment);
1235
+ resume(id) {
1236
+ if (this.activeStreamId && this.activeStreamId !== id) {
1237
+ this.pause(this.activeStreamId);
1136
1238
  }
1239
+ this.sessions.get(id)?.resume();
1240
+ this.activeStreamId = id;
1137
1241
  }
1138
1242
  /**
1139
- * 刷新当前缓冲区为分段
1243
+ * 注册(兼容旧 API,但推荐直接用 createSession)
1244
+ * 为了兼容 useMessageTTS 旧逻辑,这里可以保留一些别名,但我们会重构 hook,所以可以改变 API。
1140
1245
  */
1141
- flushSegment() {
1142
- const content = this.buffer.trim();
1143
- if (!content) {
1144
- this.buffer = "";
1145
- return;
1146
- }
1147
- const isPureSymbols = /^[^\p{L}\p{N}]*$/u.test(content);
1148
- const isTooShort = content.length < 3;
1149
- if (isPureSymbols && isTooShort) {
1150
- this.buffer = "";
1151
- return;
1152
- }
1153
- const formattedContent = MarkdownFormatter3.format(content).replace(emojiRegex3(), "");
1154
- if (!formattedContent) {
1155
- this.buffer = "";
1156
- return;
1157
- }
1158
- let subSegments = [formattedContent];
1159
- if (formattedContent.length > this.maxLength) {
1160
- subSegments = this.splitLongSegment(formattedContent);
1161
- }
1162
- for (const subSegment of subSegments) {
1163
- if (!subSegment) continue;
1164
- const segment = {
1165
- index: this.segmentIndex++,
1166
- content: subSegment,
1167
- length: subSegment.length,
1168
- sent: false
1169
- };
1170
- this.segments.push(segment);
1171
- this.onSegmentComplete?.(segment);
1172
- }
1173
- this.buffer = "";
1174
- }
1175
- /**
1176
- * 拆分超长分段
1177
- * @param segment - 超长的分段
1178
- * @returns 拆分后的分段数组
1179
- */
1180
- splitLongSegment(segment) {
1181
- const result = [];
1182
- let current = "";
1183
- for (const char of segment) {
1184
- current += char;
1185
- const shouldSplit = /[。?!,,]/.test(char);
1186
- if (shouldSplit && current.length <= this.maxLength) {
1187
- result.push(current);
1188
- current = "";
1189
- } else if (current.length >= this.maxLength) {
1190
- result.push(current);
1191
- current = "";
1192
- }
1193
- }
1194
- if (current) {
1195
- result.push(current);
1196
- }
1197
- return result.filter((s) => s.length > 0);
1198
- }
1199
- /**
1200
- * 完成流式输入
1201
- * 处理剩余的缓冲区内容
1202
- */
1203
- complete() {
1204
- if (this.isCompleted) return;
1205
- this.isCompleted = true;
1206
- while (this.buffer.includes("\n")) {
1207
- const newlineIndex = this.buffer.indexOf("\n");
1208
- if (newlineIndex === 0) {
1209
- this.buffer = this.buffer.substring(1);
1210
- continue;
1211
- }
1212
- const segmentBuffer = this.buffer.substring(0, newlineIndex);
1213
- this.buffer = this.buffer.substring(newlineIndex + 1);
1214
- this.flushSegmentWithBuffer(segmentBuffer);
1215
- }
1216
- if (this.buffer.trim()) {
1217
- this.flushSegment();
1218
- }
1219
- this.onAllComplete?.(this.segments);
1220
- }
1221
- /**
1222
- * 重置分段器状态
1223
- */
1224
- reset() {
1225
- this.buffer = "";
1226
- this.segmentIndex = 0;
1227
- this.segments = [];
1228
- this.isCompleted = false;
1229
- }
1230
- /**
1231
- * 获取当前缓冲区内容
1232
- */
1233
- getBuffer() {
1234
- return this.buffer;
1235
- }
1236
- /**
1237
- * 获取已分段的列表
1238
- */
1239
- getSegments() {
1240
- return this.segments;
1241
- }
1242
- /**
1243
- * 获取统计信息
1244
- */
1245
- getStats() {
1246
- return {
1247
- bufferLength: this.buffer.length,
1248
- segmentCount: this.segments.length,
1249
- totalChars: this.segments.reduce((sum, seg) => sum + seg.length, 0)
1250
- };
1251
- }
1252
1246
  };
1247
+ var StreamPlaybackManager = new StreamPlaybackManagerImpl();
1253
1248
 
1254
- // src/tts/useStreamTTS.ts
1255
- var WS_URL2 = "wss://openspeech.bytedance.com/api/v3/tts/bidirection";
1256
- var activeInstances2 = /* @__PURE__ */ new Map();
1257
- var sessionAudioCache = /* @__PURE__ */ new Map();
1258
- function buildFullUrl3(url, params) {
1259
- const arr = [];
1260
- for (const key in params) {
1261
- if (Object.prototype.hasOwnProperty.call(params, key)) {
1262
- arr.push(`${key}=${encodeURIComponent(params[key])}`);
1263
- }
1249
+ // src/tts/Metrics.ts
1250
+ var NoopMetricsCollector = class {
1251
+ record(_metric) {
1264
1252
  }
1265
- return `${url}?${arr.join("&")}`;
1266
- }
1267
- function useStreamTTS({
1253
+ };
1254
+
1255
+ // src/tts/useMessageTTS.ts
1256
+ function useMessageTTS({
1268
1257
  ttsConfig,
1269
1258
  audioParams,
1270
1259
  autoPlay = true,
@@ -1273,428 +1262,427 @@ function useStreamTTS({
1273
1262
  onPlayPause,
1274
1263
  onPlayResume,
1275
1264
  onPlayEnd,
1265
+ onStop,
1276
1266
  onError,
1267
+ fallbackVoice,
1277
1268
  visualization,
1278
- maxSegmentLength = 150
1269
+ streamId: externalStreamId
1279
1270
  }) {
1280
- const [isConnected, setIsConnected] = useState4(false);
1281
- const [isSessionStarted, setIsSessionStarted] = useState4(false);
1282
- const [isSynthesizing, setIsSynthesizing] = useState4(false);
1283
- const [isPlaying, setIsPlaying] = useState4(false);
1284
- const [isPaused, setIsPaused] = useState4(false);
1285
- const [error, setErrorState] = useState4(null);
1286
- const [streamText, setStreamText] = useState4("");
1287
- const [progress, setProgress] = useState4(0);
1288
- const [visualizationData, setVisualizationData] = useState4({
1289
- frequencyData: new Uint8Array(0),
1290
- timeDomainData: new Uint8Array(0)
1271
+ const isSubscriptionMode = !!externalStreamId;
1272
+ const [internalStreamId, setInternalStreamId] = useState3("");
1273
+ const [isSwitchedToIndependent, setIsSwitchedToIndependent] = useState3(false);
1274
+ const streamId = isSwitchedToIndependent ? internalStreamId : externalStreamId || internalStreamId;
1275
+ const [state, setState] = useState3({
1276
+ isPlaying: false,
1277
+ isPaused: false,
1278
+ isSynthesizing: false,
1279
+ progress: 0,
1280
+ visualizationData: {
1281
+ frequencyData: new Uint8Array(0),
1282
+ timeDomainData: new Uint8Array(0)
1283
+ },
1284
+ error: null,
1285
+ isConnected: false,
1286
+ isSessionStarted: false,
1287
+ isStreamFinished: false
1291
1288
  });
1292
- const instanceId = useRef4(`tts-stream-${Date.now()}-${Math.random().toString(36).slice(2)}`).current;
1293
- const clientRef = useRef4(null);
1294
- const audioRef = useRef4(null);
1295
- const audioContextRef = useRef4(null);
1296
- const analyserRef = useRef4(null);
1297
- const sourceRef = useRef4(null);
1298
- const audioUrlRef = useRef4(null);
1299
- const streamTextRef = useRef4("");
1300
- const isSessionStartedRef = useRef4(false);
1301
- const calledSessionStartedRef = useRef4(false);
1302
- const splitterRef = useRef4(null);
1303
- const segmentQueueRef = useRef4([]);
1304
- const isSendingRef = useRef4(false);
1305
- const sessionAudioBuffersRef = useRef4([]);
1306
- const isStreamFinishedRef = useRef4(false);
1307
- const isSessionFinishedRef = useRef4(false);
1308
- const resolveAllSegmentsSentRef = useRef4(null);
1309
- const currentVoiceRef = useRef4("");
1310
- const initAudioContext = useCallback4(() => {
1311
- if (!audioRef.current) return;
1312
- if (!audioContextRef.current) {
1313
- const AudioContextClass = window.AudioContext || window.webkitAudioContext;
1314
- audioContextRef.current = new AudioContextClass();
1315
- }
1316
- if (audioContextRef.current.state === "suspended") {
1317
- audioContextRef.current.resume();
1318
- }
1319
- if (!analyserRef.current) {
1320
- analyserRef.current = audioContextRef.current.createAnalyser();
1321
- analyserRef.current.fftSize = visualization?.fftSize || 256;
1322
- }
1323
- if (!sourceRef.current) {
1324
- try {
1325
- sourceRef.current = audioContextRef.current.createMediaElementSource(audioRef.current);
1326
- sourceRef.current.connect(analyserRef.current);
1327
- analyserRef.current.connect(audioContextRef.current.destination);
1328
- } catch (e) {
1329
- }
1289
+ const [error, setErrorState] = useState3(null);
1290
+ const isFallbackRef = useRef3(false);
1291
+ const fallbackUtteranceRef = useRef3(null);
1292
+ const currentTextRef = useRef3("");
1293
+ useEffect2(() => {
1294
+ if (!streamId) return;
1295
+ const session = StreamPlaybackManager.getSession(streamId);
1296
+ if (session) {
1297
+ const unsubscribe = session.subscribe((newState) => {
1298
+ setState(newState);
1299
+ if (newState.error) setErrorState(newState.error);
1300
+ });
1301
+ return () => {
1302
+ unsubscribe();
1303
+ };
1330
1304
  }
1331
- }, [visualization?.fftSize]);
1332
- const cleanupAudio = useCallback4(() => {
1333
- if (audioUrlRef.current) {
1334
- URL.revokeObjectURL(audioUrlRef.current);
1335
- audioUrlRef.current = null;
1336
- }
1337
- if (audioRef.current) {
1338
- audioRef.current.onerror = null;
1339
- audioRef.current.onended = null;
1340
- audioRef.current.onpause = null;
1341
- audioRef.current.onplay = null;
1342
- audioRef.current.ontimeupdate = null;
1343
- audioRef.current.pause();
1344
- audioRef.current.src = "";
1345
- audioRef.current = null;
1346
- }
1347
- if (sourceRef.current) {
1348
- try {
1349
- sourceRef.current.disconnect();
1350
- } catch (e) {
1305
+ }, [streamId]);
1306
+ const stop = useCallback3(() => {
1307
+ if (streamId) {
1308
+ StreamPlaybackManager.stop(streamId);
1309
+ if (!isSubscriptionMode || isSwitchedToIndependent) {
1310
+ setInternalStreamId("");
1311
+ setIsSwitchedToIndependent(false);
1351
1312
  }
1352
- sourceRef.current = null;
1353
1313
  }
1354
- }, []);
1355
- const stopOthers = useCallback4(() => {
1356
- activeInstances2.forEach((instance, id) => {
1357
- if (id !== instanceId) {
1358
- instance.pause();
1359
- }
1360
- });
1361
- }, [instanceId]);
1362
- const pause = useCallback4(() => {
1363
- if (audioRef.current) {
1364
- audioRef.current.pause();
1314
+ if (fallbackUtteranceRef.current) {
1315
+ window.speechSynthesis.cancel();
1316
+ fallbackUtteranceRef.current = null;
1365
1317
  }
1366
- setIsPaused(true);
1367
- setIsPlaying(false);
1368
- onPlayPause?.();
1369
- }, [onPlayPause]);
1370
- const resume = useCallback4(() => {
1371
- stopOthers();
1372
- if (audioRef.current) {
1373
- audioRef.current.play();
1374
- }
1375
- setIsPaused(false);
1376
- setIsPlaying(true);
1377
- onPlayResume?.();
1378
- activeInstances2.set(instanceId, { pause });
1379
- }, [stopOthers, instanceId, pause, onPlayResume]);
1380
- const sendNextSegment = useCallback4(() => {
1381
- if (!clientRef.current || !isSessionStartedRef.current || isSendingRef.current || isSessionFinishedRef.current) {
1382
- return;
1318
+ isFallbackRef.current = false;
1319
+ setState((prev) => ({
1320
+ ...prev,
1321
+ isPlaying: false,
1322
+ isPaused: false,
1323
+ isSynthesizing: false,
1324
+ progress: 0
1325
+ }));
1326
+ onStop?.();
1327
+ }, [streamId, isSubscriptionMode, isSwitchedToIndependent, onStop]);
1328
+ const pause = useCallback3(() => {
1329
+ if (isFallbackRef.current) {
1330
+ window.speechSynthesis.pause();
1331
+ setState((prev) => ({ ...prev, isPaused: true, isPlaying: false }));
1332
+ onPlayPause?.();
1333
+ } else if (streamId) {
1334
+ StreamPlaybackManager.pause(streamId);
1383
1335
  }
1384
- if (segmentQueueRef.current.length === 0) {
1385
- if (isStreamFinishedRef.current && !isSessionFinishedRef.current) {
1386
- console.log("[useStreamTTS] All segments sent, finishing session");
1387
- isSessionFinishedRef.current = true;
1388
- clientRef.current.finishSession();
1389
- resolveAllSegmentsSentRef.current?.();
1336
+ }, [streamId, onPlayPause]);
1337
+ const resume = useCallback3(() => {
1338
+ if (isFallbackRef.current) {
1339
+ window.speechSynthesis.resume();
1340
+ setState((prev) => ({ ...prev, isPaused: false, isPlaying: true }));
1341
+ onPlayResume?.();
1342
+ } else if (streamId) {
1343
+ const session = StreamPlaybackManager.getSession(streamId);
1344
+ if (session) {
1345
+ StreamPlaybackManager.resume(streamId);
1346
+ } else {
1347
+ console.log(
1348
+ "[useMessageTTS] Session not found, resetting pause state"
1349
+ );
1350
+ setState((prev) => ({ ...prev, isPaused: false, isPlaying: false }));
1390
1351
  }
1391
- return;
1392
1352
  }
1393
- isSendingRef.current = true;
1394
- const segment = segmentQueueRef.current.shift();
1395
- console.log(`[useStreamTTS] Sending segment ${segment.index}: ${segment.content.substring(0, 30)}...`);
1396
- clientRef.current.sendText(segment.content);
1397
- segment.sent = true;
1398
- isSendingRef.current = false;
1399
- setTimeout(() => sendNextSegment(), 0);
1400
- }, []);
1401
- const stop = useCallback4(() => {
1402
- if (clientRef.current) {
1403
- clientRef.current.close();
1404
- clientRef.current = null;
1405
- }
1406
- cleanupAudio();
1407
- setIsConnected(false);
1408
- setIsSessionStarted(false);
1409
- isSessionStartedRef.current = false;
1410
- calledSessionStartedRef.current = false;
1411
- setIsPlaying(false);
1412
- setIsPaused(false);
1413
- setIsSynthesizing(false);
1414
- setProgress(0);
1415
- activeInstances2.delete(instanceId);
1416
- streamTextRef.current = "";
1417
- setStreamText("");
1418
- segmentQueueRef.current = [];
1419
- isSendingRef.current = false;
1420
- sessionAudioBuffersRef.current = [];
1421
- isStreamFinishedRef.current = false;
1422
- isSessionFinishedRef.current = false;
1423
- splitterRef.current?.reset();
1424
- }, [cleanupAudio, instanceId]);
1425
- const connect = useCallback4(async () => {
1426
- stop();
1427
- setErrorState(null);
1428
- setProgress(0);
1429
- sessionAudioBuffersRef.current = [];
1430
- isStreamFinishedRef.current = false;
1431
- streamTextRef.current = "";
1432
- setStreamText("");
1433
- segmentQueueRef.current = [];
1434
- isSendingRef.current = false;
1435
- isSessionStartedRef.current = false;
1436
- calledSessionStartedRef.current = false;
1437
- setIsSessionStarted(false);
1438
- const voice = audioParams?.speaker || "zh_female_vv_uranus_bigtts";
1439
- currentVoiceRef.current = voice;
1440
- const startTime = Date.now();
1441
- metricsCollector.record({
1442
- name: "tts_request",
1443
- labels: { voice, text_length: 0 },
1444
- value: 1,
1445
- timestamp: startTime
1446
- });
1447
- try {
1448
- const audio = new Audio();
1449
- audio.crossOrigin = "anonymous";
1450
- audioRef.current = audio;
1451
- audio.onplay = () => {
1452
- setIsPlaying(true);
1453
- setIsPaused(false);
1353
+ }, [streamId, onPlayResume]);
1354
+ const togglePlay = useCallback3(() => {
1355
+ if (state.isPlaying) {
1356
+ pause();
1357
+ } else {
1358
+ resume();
1359
+ }
1360
+ }, [state.isPlaying, pause, resume]);
1361
+ const playFallback = useCallback3(
1362
+ (text) => {
1363
+ console.warn("[useMessageTTS] Switching to fallback TTS");
1364
+ stop();
1365
+ isFallbackRef.current = true;
1366
+ setErrorState(null);
1367
+ const utterance = new SpeechSynthesisUtterance(text);
1368
+ utterance.rate = audioParams?.speech_rate || 1;
1369
+ const voices = window.speechSynthesis.getVoices();
1370
+ const zhVoice = voices.find((v) => v.lang.includes("zh"));
1371
+ if (zhVoice) utterance.voice = zhVoice;
1372
+ utterance.onstart = () => {
1373
+ setState((prev) => ({ ...prev, isPlaying: true, isPaused: false }));
1454
1374
  onPlayStart?.();
1455
- initAudioContext();
1456
- activeInstances2.set(instanceId, { pause });
1457
1375
  };
1458
- audio.onended = () => {
1459
- setIsPlaying(false);
1460
- setIsPaused(false);
1376
+ utterance.onend = () => {
1377
+ setState((prev) => ({
1378
+ ...prev,
1379
+ isPlaying: false,
1380
+ isPaused: false,
1381
+ progress: 100
1382
+ }));
1461
1383
  onPlayEnd?.();
1462
- activeInstances2.delete(instanceId);
1463
1384
  };
1464
- audio.onerror = (e) => {
1465
- console.error("[useStreamTTS] Audio playback error:", e, audio.error);
1466
- setErrorState(audio.error?.message || "Audio playback error");
1467
- onError?.(new Error(audio.error?.message || "Audio playback error"));
1468
- };
1469
- audio.ontimeupdate = () => {
1470
- let duration = audio.duration;
1471
- if (!isFinite(duration) && audio.buffered.length > 0) {
1472
- duration = audio.buffered.end(audio.buffered.length - 1);
1473
- }
1474
- if (isFinite(duration) && duration > 0) {
1475
- setProgress(audio.currentTime / duration * 100);
1476
- }
1385
+ utterance.onerror = (e) => {
1386
+ console.error("[useMessageTTS] Fallback TTS failed", e);
1387
+ setErrorState("Fallback TTS failed");
1388
+ onError?.(new Error("Fallback TTS failed"));
1477
1389
  };
1478
- clientRef.current = WebsocketMSE3({ autoStartSession: false });
1479
- splitterRef.current = new StreamingTextSplitter({
1480
- maxLength: maxSegmentLength,
1481
- onSegmentComplete: (segment) => {
1482
- segmentQueueRef.current.push(segment);
1483
- console.log(`[useStreamTTS] Segment ${segment.index} queued (${segment.length} chars)`);
1484
- if (isSessionStartedRef.current) {
1485
- sendNextSegment();
1486
- }
1487
- },
1488
- onAllComplete: () => {
1489
- console.log(`[useStreamTTS] All segments completed, total: ${segmentQueueRef.current.length} in queue`);
1390
+ fallbackUtteranceRef.current = utterance;
1391
+ window.speechSynthesis.speak(utterance);
1392
+ },
1393
+ [audioParams, onError, onPlayEnd, onPlayStart, stop]
1394
+ );
1395
+ const handleError = useCallback3(
1396
+ (text, failedVoice) => {
1397
+ if (fallbackVoice && failedVoice !== fallbackVoice) {
1398
+ console.warn(
1399
+ `[useMessageTTS] Voice ${failedVoice} failed, switching to fallback voice ${fallbackVoice}`
1400
+ );
1401
+ const newId = internalStreamId || `msg-tts-retry-${Date.now()}`;
1402
+ setInternalStreamId(newId);
1403
+ const session = StreamPlaybackManager.createSession(newId, {
1404
+ ttsConfig,
1405
+ audioParams: { ...audioParams, speaker: fallbackVoice },
1406
+ autoPlay,
1407
+ metricsCollector,
1408
+ visualization,
1409
+ onPlayStart,
1410
+ onPlayPause,
1411
+ onPlayResume,
1412
+ onPlayEnd,
1413
+ onError: () => playFallback(text)
1414
+ });
1415
+ session.play(text);
1416
+ } else {
1417
+ playFallback(text);
1418
+ }
1419
+ },
1420
+ [
1421
+ fallbackVoice,
1422
+ playFallback,
1423
+ ttsConfig,
1424
+ audioParams,
1425
+ autoPlay,
1426
+ metricsCollector,
1427
+ visualization,
1428
+ onPlayStart,
1429
+ onPlayPause,
1430
+ onPlayResume,
1431
+ onPlayEnd,
1432
+ internalStreamId
1433
+ ]
1434
+ );
1435
+ const play = useCallback3(
1436
+ async (text) => {
1437
+ let shouldSwitchToIndependent = false;
1438
+ if (isSubscriptionMode) {
1439
+ const session2 = StreamPlaybackManager.getSession(externalStreamId || "");
1440
+ if (!session2) {
1441
+ console.log(
1442
+ "[useMessageTTS] Stream session not found, switching to independent play mode"
1443
+ );
1444
+ shouldSwitchToIndependent = true;
1445
+ setIsSwitchedToIndependent(true);
1446
+ } else if (session2.state.isStreamFinished) {
1447
+ console.log(
1448
+ "[useMessageTTS] Stream finished, switching to independent play mode"
1449
+ );
1450
+ shouldSwitchToIndependent = true;
1451
+ setIsSwitchedToIndependent(true);
1452
+ } else if (session2.state.isSynthesizing || session2.state.isPlaying) {
1453
+ console.warn(
1454
+ "[useMessageTTS] play() called in subscription mode while streaming, ignoring"
1455
+ );
1456
+ return;
1457
+ } else {
1458
+ console.log(
1459
+ "[useMessageTTS] Stream not active, switching to independent play mode"
1460
+ );
1461
+ shouldSwitchToIndependent = true;
1462
+ setIsSwitchedToIndependent(true);
1490
1463
  }
1491
- });
1492
- const url = clientRef.current.start({
1493
- url: buildFullUrl3(WS_URL2, {
1494
- api_access_key: `Jwt; ${ttsConfig.token}`,
1495
- api_app_key: ttsConfig.appid,
1496
- api_resource_id: ttsConfig.resourceId || "seed-tts-2.0"
1497
- }),
1498
- config: {
1499
- user: {
1500
- uid: `req-${Date.now()}`
1501
- },
1502
- namespace: ttsConfig.namespace || "BidirectionalTTS",
1503
- req_params: {
1504
- speaker: voice,
1505
- audio_params: {
1506
- sample_rate: audioParams?.sample_rate || 24e3,
1507
- format: audioParams?.format || "mp3",
1508
- speech_rate: audioParams?.speech_rate,
1509
- pitch_rate: audioParams?.pitch_rate,
1510
- loudness_rate: audioParams?.loudness_rate
1511
- },
1512
- additions: JSON.stringify({
1513
- enable_language_detector: true,
1514
- disable_markdown_filter: true,
1515
- enable_latex_tn: true
1516
- })
1517
- }
1518
- },
1519
- // ===== 关键回调 =====
1520
- onStart: () => {
1521
- setIsConnected(true);
1522
- console.log("[useStreamTTS] WebSocket connected, waiting for text...");
1523
- },
1524
- onSessionStarted: () => {
1525
- setIsSessionStarted(true);
1526
- isSessionStartedRef.current = true;
1527
- console.log("[useStreamTTS] Session started, can send text now");
1528
- if (segmentQueueRef.current.length > 0) {
1529
- sendNextSegment();
1530
- }
1531
- },
1532
- onMessage: (data) => {
1533
- setIsSynthesizing(true);
1534
- if (sessionAudioBuffersRef.current.length === 0) {
1535
- metricsCollector.record({
1536
- name: "tts_latency",
1537
- labels: { stage: "first_packet", voice },
1538
- value: Date.now() - startTime,
1539
- timestamp: Date.now()
1540
- });
1541
- }
1542
- const buffer = data instanceof ArrayBuffer ? data.slice(0) : new Uint8Array(data).buffer;
1543
- sessionAudioBuffersRef.current.push(buffer);
1544
- },
1545
- onSessionFinished: () => {
1546
- setIsSynthesizing(false);
1547
- setIsSessionStarted(false);
1548
- isSessionStartedRef.current = false;
1549
- calledSessionStartedRef.current = false;
1550
- if (sessionAudioBuffersRef.current.length > 0 && streamTextRef.current) {
1551
- const speed = audioParams?.speech_rate || 0;
1552
- const cacheKey = TTSCache.generateKey(streamTextRef.current, voice, speed);
1553
- TTSCache.set(cacheKey, [...sessionAudioBuffersRef.current]);
1554
- sessionAudioCache.set(instanceId, {
1555
- streamText: streamTextRef.current,
1556
- audioBuffers: [...sessionAudioBuffersRef.current],
1557
- timestamp: Date.now(),
1558
- voice,
1559
- speed
1560
- });
1561
- console.log(`[useStreamTTS] Session finished, cached ${sessionAudioBuffersRef.current.length} audio buffers`);
1562
- }
1563
- metricsCollector.record({
1564
- name: "tts_synthesis_finished",
1565
- labels: { voice, text_length: streamTextRef.current.length },
1566
- value: Date.now() - startTime,
1567
- timestamp: Date.now()
1568
- });
1569
- },
1464
+ }
1465
+ currentTextRef.current = text;
1466
+ stop();
1467
+ setErrorState(null);
1468
+ isFallbackRef.current = false;
1469
+ const id = `msg-tts-${Date.now()}-${Math.random().toString(36).slice(2)}`;
1470
+ const session = StreamPlaybackManager.createSession(id, {
1471
+ ttsConfig,
1472
+ audioParams,
1473
+ autoPlay,
1474
+ metricsCollector,
1475
+ visualization,
1476
+ onPlayStart,
1477
+ onPlayPause,
1478
+ onPlayResume,
1479
+ onPlayEnd,
1570
1480
  onError: (err) => {
1571
- console.error("[useStreamTTS] TTS error:", err);
1572
- setErrorState(err.msg || "TTS error");
1573
- onError?.(new Error(err.msg || "TTS error"));
1574
- setIsSynthesizing(false);
1481
+ handleError(text, audioParams?.speaker || "");
1575
1482
  }
1576
1483
  });
1577
- audioUrlRef.current = url;
1578
- audio.src = url;
1579
- if (autoPlay) {
1580
- try {
1581
- await audio.play();
1582
- } catch (e) {
1583
- console.warn("[useStreamTTS] Autoplay blocked:", e);
1584
- }
1484
+ setInternalStreamId(id);
1485
+ await session.play(text);
1486
+ },
1487
+ [
1488
+ isSubscriptionMode,
1489
+ externalStreamId,
1490
+ stop,
1491
+ ttsConfig,
1492
+ audioParams,
1493
+ autoPlay,
1494
+ metricsCollector,
1495
+ visualization,
1496
+ onPlayStart,
1497
+ onPlayPause,
1498
+ onPlayResume,
1499
+ onPlayEnd,
1500
+ handleError
1501
+ ]
1502
+ );
1503
+ const seek = useCallback3(
1504
+ (percentage) => {
1505
+ if (streamId) {
1506
+ StreamPlaybackManager.getSession(streamId)?.seek(percentage);
1585
1507
  }
1586
- } catch (err) {
1587
- console.error("[useStreamTTS] Connect error:", err);
1588
- setErrorState(String(err));
1589
- onError?.(err instanceof Error ? err : new Error(String(err)));
1590
- }
1508
+ },
1509
+ [streamId]
1510
+ );
1511
+ const getFrequencyData = useCallback3(
1512
+ () => state.visualizationData.frequencyData,
1513
+ [state.visualizationData]
1514
+ );
1515
+ const getTimeDomainData = useCallback3(
1516
+ () => state.visualizationData.timeDomainData,
1517
+ [state.visualizationData]
1518
+ );
1519
+ const isStreamActive = !!(externalStreamId && (state.isPlaying || state.isPaused || state.isSynthesizing));
1520
+ const canResume = useCallback3(() => {
1521
+ if (!streamId) return false;
1522
+ const session = StreamPlaybackManager.getSession(streamId);
1523
+ return !!session;
1524
+ }, [streamId]);
1525
+ return {
1526
+ isPlaying: state.isPlaying,
1527
+ isPaused: state.isPaused,
1528
+ isSynthesizing: state.isSynthesizing,
1529
+ progress: state.progress,
1530
+ error,
1531
+ play,
1532
+ pause,
1533
+ resume,
1534
+ stop,
1535
+ togglePlay,
1536
+ seek,
1537
+ getFrequencyData,
1538
+ getTimeDomainData,
1539
+ visualizationData: state.visualizationData,
1540
+ isStreamActive,
1541
+ streamState: state,
1542
+ canResume
1543
+ };
1544
+ }
1545
+
1546
+ // src/tts/useStreamTTS.ts
1547
+ import { useCallback as useCallback4, useEffect as useEffect3, useRef as useRef4, useState as useState4 } from "react";
1548
+ function useStreamTTS({
1549
+ ttsConfig,
1550
+ audioParams,
1551
+ autoPlay = true,
1552
+ metricsCollector = new NoopMetricsCollector(),
1553
+ onPlayStart,
1554
+ onPlayPause,
1555
+ onPlayResume,
1556
+ onPlayEnd,
1557
+ onError,
1558
+ visualization,
1559
+ maxSegmentLength = 150
1560
+ }) {
1561
+ const [streamId, setStreamId] = useState4("");
1562
+ const streamIdRef = useRef4("");
1563
+ const [state, setState] = useState4({
1564
+ isPlaying: false,
1565
+ isPaused: false,
1566
+ isSynthesizing: false,
1567
+ progress: 0,
1568
+ visualizationData: {
1569
+ frequencyData: new Uint8Array(0),
1570
+ timeDomainData: new Uint8Array(0)
1571
+ },
1572
+ error: null,
1573
+ isConnected: false,
1574
+ isSessionStarted: false,
1575
+ isStreamFinished: false
1576
+ });
1577
+ const [streamText, setStreamText] = useState4("");
1578
+ const streamTextRef = useRef4("");
1579
+ const connect = useCallback4(async () => {
1580
+ const newStreamId = `tts-stream-${Date.now()}-${Math.random().toString(36).slice(2)}`;
1581
+ setStreamId(newStreamId);
1582
+ streamIdRef.current = newStreamId;
1583
+ streamTextRef.current = "";
1584
+ setStreamText("");
1585
+ const session = StreamPlaybackManager.createSession(newStreamId, {
1586
+ ttsConfig,
1587
+ audioParams,
1588
+ autoPlay,
1589
+ metricsCollector,
1590
+ visualization,
1591
+ maxSegmentLength,
1592
+ onPlayStart,
1593
+ onPlayPause,
1594
+ onPlayResume,
1595
+ onPlayEnd,
1596
+ onError: (err) => {
1597
+ setState((prev) => ({ ...prev, error: err.message }));
1598
+ onError?.(err);
1599
+ }
1600
+ });
1601
+ await session.connect();
1602
+ return newStreamId;
1591
1603
  }, [
1592
1604
  ttsConfig,
1593
1605
  audioParams,
1594
1606
  autoPlay,
1595
- stop,
1596
- instanceId,
1597
- onPlayStart,
1598
- onPlayEnd,
1599
- initAudioContext,
1600
- pause,
1601
1607
  metricsCollector,
1608
+ visualization,
1602
1609
  maxSegmentLength,
1603
- sendNextSegment,
1610
+ onPlayStart,
1611
+ onPlayPause,
1612
+ onPlayResume,
1613
+ onPlayEnd,
1604
1614
  onError
1605
1615
  ]);
1616
+ useEffect3(() => {
1617
+ if (!streamId) return;
1618
+ const session = StreamPlaybackManager.getSession(streamId);
1619
+ if (!session) return;
1620
+ const unsubscribe = session.subscribe((newState) => {
1621
+ setState(newState);
1622
+ });
1623
+ return () => {
1624
+ unsubscribe();
1625
+ };
1626
+ }, [streamId]);
1606
1627
  const onMessage = useCallback4((chunk) => {
1607
- if (!chunk) return;
1628
+ if (!streamIdRef.current) return;
1608
1629
  streamTextRef.current += chunk;
1609
1630
  setStreamText(streamTextRef.current);
1610
- if (!calledSessionStartedRef.current && !isSessionStartedRef.current && clientRef.current && isConnected) {
1611
- console.log("[useStreamTTS] First text received, starting session...");
1612
- calledSessionStartedRef.current = true;
1613
- clientRef.current.startSession();
1614
- }
1615
- splitterRef.current?.onChunk(chunk);
1616
- }, [isConnected]);
1631
+ const session = StreamPlaybackManager.getSession(streamIdRef.current);
1632
+ session?.handleStreamChunk(chunk);
1633
+ }, []);
1617
1634
  const finishStream = useCallback4(async () => {
1618
- isStreamFinishedRef.current = true;
1619
- splitterRef.current?.complete();
1620
- console.log(`[useStreamTTS] Stream finished, ${segmentQueueRef.current.length} segments remaining in queue`);
1621
- if (segmentQueueRef.current.length > 0 || isSendingRef.current) {
1622
- await new Promise((resolve) => {
1623
- resolveAllSegmentsSentRef.current = resolve;
1624
- });
1625
- } else if (clientRef.current && isSessionStartedRef.current && !isSessionFinishedRef.current) {
1626
- isSessionFinishedRef.current = true;
1627
- clientRef.current.finishSession();
1628
- }
1635
+ if (!streamIdRef.current) return;
1636
+ const session = StreamPlaybackManager.getSession(streamIdRef.current);
1637
+ await session?.finishStream();
1629
1638
  }, []);
1630
- const seek = useCallback4((percentage) => {
1631
- if (audioRef.current) {
1632
- let duration = audioRef.current.duration;
1633
- if (!isFinite(duration) && audioRef.current.buffered.length > 0) {
1634
- duration = audioRef.current.buffered.end(audioRef.current.buffered.length - 1);
1635
- }
1636
- if (isFinite(duration) && duration > 0) {
1637
- const time = percentage / 100 * duration;
1638
- if (isFinite(time)) {
1639
- audioRef.current.currentTime = time;
1640
- setProgress(percentage);
1641
- }
1642
- }
1639
+ const pause = useCallback4(() => {
1640
+ if (streamIdRef.current) {
1641
+ StreamPlaybackManager.pause(streamIdRef.current);
1643
1642
  }
1644
1643
  }, []);
1645
- const getFrequencyData = useCallback4(() => {
1646
- if (!analyserRef.current) return new Uint8Array(0);
1647
- const dataArray = new Uint8Array(analyserRef.current.frequencyBinCount);
1648
- analyserRef.current.getByteFrequencyData(dataArray);
1649
- return dataArray;
1644
+ const resume = useCallback4(() => {
1645
+ if (streamIdRef.current) {
1646
+ StreamPlaybackManager.resume(streamIdRef.current);
1647
+ }
1650
1648
  }, []);
1651
- const getTimeDomainData = useCallback4(() => {
1652
- if (!analyserRef.current) return new Uint8Array(0);
1653
- const dataArray = new Uint8Array(analyserRef.current.frequencyBinCount);
1654
- analyserRef.current.getByteTimeDomainData(dataArray);
1655
- return dataArray;
1649
+ const stop = useCallback4(() => {
1650
+ if (streamIdRef.current) {
1651
+ StreamPlaybackManager.stop(streamIdRef.current);
1652
+ setStreamId("");
1653
+ streamIdRef.current = "";
1654
+ }
1656
1655
  }, []);
1657
- useEffect3(() => {
1658
- if (!visualization?.enabled) return;
1659
- let animId;
1660
- let lastUpdate = 0;
1661
- const interval = visualization.refreshInterval || 0;
1662
- const update = (timestamp) => {
1663
- if (isPlaying && !isPaused) {
1664
- if (timestamp - lastUpdate >= interval) {
1665
- setVisualizationData({
1666
- frequencyData: getFrequencyData(),
1667
- timeDomainData: getTimeDomainData()
1668
- });
1669
- lastUpdate = timestamp;
1670
- }
1671
- animId = requestAnimationFrame(update);
1672
- }
1673
- };
1674
- if (isPlaying && !isPaused) {
1675
- animId = requestAnimationFrame(update);
1656
+ const seek = useCallback4((percentage) => {
1657
+ if (streamIdRef.current) {
1658
+ StreamPlaybackManager.getSession(streamIdRef.current)?.seek(percentage);
1676
1659
  }
1677
- return () => {
1678
- if (animId) cancelAnimationFrame(animId);
1679
- };
1680
- }, [isPlaying, isPaused, visualization, getFrequencyData, getTimeDomainData]);
1660
+ }, []);
1681
1661
  useEffect3(() => {
1682
1662
  return () => {
1683
- stop();
1684
- if (audioContextRef.current) {
1685
- audioContextRef.current.close();
1663
+ if (streamIdRef.current) {
1664
+ StreamPlaybackManager.stop(streamIdRef.current);
1686
1665
  }
1687
1666
  };
1688
- }, [stop]);
1667
+ }, []);
1668
+ const getFrequencyData = useCallback4(
1669
+ () => state.visualizationData.frequencyData,
1670
+ [state.visualizationData]
1671
+ );
1672
+ const getTimeDomainData = useCallback4(
1673
+ () => state.visualizationData.timeDomainData,
1674
+ [state.visualizationData]
1675
+ );
1689
1676
  return {
1690
- isConnected,
1691
- isSessionStarted,
1692
- isSynthesizing,
1693
- isPlaying,
1694
- isPaused,
1695
- error,
1677
+ streamId,
1678
+ isConnected: state.isConnected,
1679
+ isSessionStarted: state.isSessionStarted,
1680
+ isSynthesizing: state.isSynthesizing,
1681
+ isPlaying: state.isPlaying,
1682
+ isPaused: state.isPaused,
1683
+ error: state.error,
1696
1684
  streamText,
1697
- progress,
1685
+ progress: state.progress,
1698
1686
  connect,
1699
1687
  onMessage,
1700
1688
  finishStream,
@@ -1704,23 +1692,9 @@ function useStreamTTS({
1704
1692
  seek,
1705
1693
  getFrequencyData,
1706
1694
  getTimeDomainData,
1707
- visualizationData
1695
+ visualizationData: state.visualizationData
1708
1696
  };
1709
1697
  }
1710
- function getSessionAudioCache(instanceId) {
1711
- return sessionAudioCache.get(instanceId);
1712
- }
1713
- function clearSessionAudioCache(instanceId) {
1714
- sessionAudioCache.delete(instanceId);
1715
- }
1716
- function findSessionCacheByText(streamText, voice, speed) {
1717
- for (const entry of sessionAudioCache.values()) {
1718
- if (entry.streamText === streamText && entry.voice === voice && entry.speed === speed) {
1719
- return entry;
1720
- }
1721
- }
1722
- return void 0;
1723
- }
1724
1698
 
1725
1699
  // src/components/AudioWaveVisualizer.tsx
1726
1700
  import { useEffect as useEffect4, useRef as useRef5 } from "react";
@@ -2095,10 +2069,8 @@ var AudioProgressBar_default = AudioProgressBar;
2095
2069
  export {
2096
2070
  AudioProgressBar_default as AudioProgressBar,
2097
2071
  AudioWaveVisualizer_default as AudioWaveVisualizer,
2072
+ StreamPlaybackManager,
2098
2073
  StreamingTextSplitter,
2099
- clearSessionAudioCache,
2100
- findSessionCacheByText,
2101
- getSessionAudioCache,
2102
2074
  splitTextByDelimiters,
2103
2075
  useMessageTTS,
2104
2076
  useStreamTTS,