@wq-hook/volcano-react 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -509,10 +509,12 @@ function useMessageTTS({
509
509
  const [isSynthesizing, setIsSynthesizing] = useState3(false);
510
510
  const [error, setErrorState] = useState3(null);
511
511
  const [progress, setProgress] = useState3(0);
512
- const [visualizationData, setVisualizationData] = useState3({
513
- frequencyData: new Uint8Array(0),
514
- timeDomainData: new Uint8Array(0)
515
- });
512
+ const [visualizationData, setVisualizationData] = useState3(
513
+ {
514
+ frequencyData: new Uint8Array(0),
515
+ timeDomainData: new Uint8Array(0)
516
+ }
517
+ );
516
518
  const instanceId = useRef3(
517
519
  `tts-${Date.now()}-${Math.random().toString(36).slice(2)}`
518
520
  ).current;
@@ -720,7 +722,11 @@ function useMessageTTS({
720
722
  console.error("Audio playback error:", e, audio.error);
721
723
  metricsCollector.record({
722
724
  name: "tts_error",
723
- labels: { error_code: "playback_error", voice, detail: audio.error?.message || String(audio.error?.code) },
725
+ labels: {
726
+ error_code: "playback_error",
727
+ voice,
728
+ detail: audio.error?.message || String(audio.error?.code)
729
+ },
724
730
  value: 1,
725
731
  timestamp: Date.now()
726
732
  });
@@ -738,7 +744,10 @@ function useMessageTTS({
738
744
  }
739
745
  };
740
746
  if (cachedData) {
741
- const totalSize = cachedData.reduce((acc, buf) => acc + buf.byteLength, 0);
747
+ const totalSize = cachedData.reduce(
748
+ (acc, buf) => acc + buf.byteLength,
749
+ 0
750
+ );
742
751
  metricsCollector.record({
743
752
  name: "tts_cache_hit",
744
753
  labels: { voice, speed },
@@ -756,7 +765,9 @@ function useMessageTTS({
756
765
  })
757
766
  );
758
767
  if (totalSize === 0) {
759
- console.warn("[useMessageTTS] Cached data is empty, falling back to stream");
768
+ console.warn(
769
+ "[useMessageTTS] Cached data is empty, falling back to stream"
770
+ );
760
771
  } else {
761
772
  const blob = new Blob(cachedData, { type: "audio/mpeg" });
762
773
  const url2 = URL.createObjectURL(blob);
@@ -775,7 +786,10 @@ function useMessageTTS({
775
786
  }
776
787
  console.log("[useMessageTTS] Cache miss, starting stream");
777
788
  clientRef.current = WebsocketMSE2({ autoStartSession: true });
778
- const formattedText = MarkdownFormatter2.format(text).replace(emojiRegex2(), "");
789
+ const formattedText = MarkdownFormatter2.format(text).replace(
790
+ emojiRegex2(),
791
+ ""
792
+ );
779
793
  const segments = splitTextByDelimiters(formattedText);
780
794
  const url = clientRef.current.start({
781
795
  url: buildFullUrl2(WS_URL, {
@@ -893,6 +907,14 @@ function useMessageTTS({
893
907
  console.warn(
894
908
  `[useMessageTTS] Voice ${failedVoice} failed, switching to fallback voice ${fallbackVoice}`
895
909
  );
910
+ if (clientRef.current) {
911
+ clientRef.current.close();
912
+ clientRef.current = null;
913
+ }
914
+ if (audioRef.current) {
915
+ audioRef.current.pause();
916
+ audioRef.current = null;
917
+ }
896
918
  executeTTS(text, fallbackVoice);
897
919
  } else {
898
920
  playFallback(text);
@@ -902,7 +924,7 @@ function useMessageTTS({
902
924
  );
903
925
  const play = useCallback3(
904
926
  (text) => {
905
- const voice = audioParams?.speaker || "default";
927
+ const voice = audioParams?.speaker || "zh_female_vv_uranus_bigtts";
906
928
  return executeTTS(text, voice);
907
929
  },
908
930
  [audioParams, executeTTS]
@@ -955,7 +977,9 @@ function useMessageTTS({
955
977
  if (audioRef.current) {
956
978
  let duration = audioRef.current.duration;
957
979
  if (!isFinite(duration) && audioRef.current.buffered.length > 0) {
958
- duration = audioRef.current.buffered.end(audioRef.current.buffered.length - 1);
980
+ duration = audioRef.current.buffered.end(
981
+ audioRef.current.buffered.length - 1
982
+ );
959
983
  }
960
984
  if (isFinite(duration) && duration > 0) {
961
985
  const time = percentage / 100 * duration;
@@ -984,8 +1008,725 @@ function useMessageTTS({
984
1008
  };
985
1009
  }
986
1010
 
1011
+ // src/tts/useStreamTTS.ts
1012
+ import { WebsocketMSE as WebsocketMSE3 } from "@wq-hook/volcano-sdk/tts";
1013
+ import { useCallback as useCallback4, useEffect as useEffect3, useRef as useRef4, useState as useState4 } from "react";
1014
+
1015
+ // src/tts/StreamingTextSplitter.ts
1016
+ import { MarkdownFormatter as MarkdownFormatter3 } from "@wq-hook/volcano-sdk";
1017
+ import emojiRegex3 from "emoji-regex";
1018
+ var StreamingTextSplitter = class {
1019
+ constructor(options = {}) {
1020
+ /** 当前缓冲区 */
1021
+ this.buffer = "";
1022
+ /** 分段索引计数器 */
1023
+ this.segmentIndex = 0;
1024
+ /** 已完成的分段列表 */
1025
+ this.segments = [];
1026
+ /** 是否已完成 */
1027
+ this.isCompleted = false;
1028
+ this.maxLength = options.maxLength || 150;
1029
+ this.minLength = options.minLength || 10;
1030
+ this.onSegmentComplete = options.onSegmentComplete;
1031
+ this.onAllComplete = options.onAllComplete;
1032
+ }
1033
+ /**
1034
+ * 接收流式文本块
1035
+ * @param chunk - 文本块
1036
+ */
1037
+ onChunk(chunk) {
1038
+ if (!chunk || this.isCompleted) return;
1039
+ this.buffer += chunk;
1040
+ if (this.detectBoundary(chunk)) {
1041
+ const newlineIndex = this.buffer.indexOf("\n");
1042
+ if (newlineIndex !== -1) {
1043
+ if (newlineIndex === 0) {
1044
+ this.buffer = this.buffer.substring(1);
1045
+ return;
1046
+ }
1047
+ const segmentBuffer = this.buffer.substring(0, newlineIndex);
1048
+ this.buffer = this.buffer.substring(newlineIndex + 1);
1049
+ this.flushSegmentWithBuffer(segmentBuffer);
1050
+ while (this.buffer.includes("\n")) {
1051
+ const nextNewlineIndex = this.buffer.indexOf("\n");
1052
+ if (nextNewlineIndex === 0) {
1053
+ this.buffer = this.buffer.substring(1);
1054
+ continue;
1055
+ }
1056
+ const nextSegmentBuffer = this.buffer.substring(0, nextNewlineIndex);
1057
+ this.buffer = this.buffer.substring(nextNewlineIndex + 1);
1058
+ this.flushSegmentWithBuffer(nextSegmentBuffer);
1059
+ }
1060
+ }
1061
+ }
1062
+ }
1063
+ /**
1064
+ * 检测分段边界
1065
+ * @param chunk - 最新接收的文本块
1066
+ * @returns 是否应该分段
1067
+ */
1068
+ detectBoundary(chunk) {
1069
+ if (chunk.includes("\n")) {
1070
+ if (this.buffer.length >= this.maxLength) {
1071
+ this.forceSplitAtSentenceBoundary();
1072
+ }
1073
+ return true;
1074
+ }
1075
+ if (this.buffer.length >= this.maxLength) {
1076
+ this.forceSplitAtSentenceBoundary();
1077
+ return true;
1078
+ }
1079
+ return false;
1080
+ }
1081
+ /**
1082
+ * 在句子边界强制拆分超长段落
1083
+ */
1084
+ forceSplitAtSentenceBoundary() {
1085
+ const content = this.buffer;
1086
+ const sentenceEnders = /[。?!]/g;
1087
+ let lastMatch = null;
1088
+ let match = null;
1089
+ while ((match = sentenceEnders.exec(content)) !== null) {
1090
+ lastMatch = match;
1091
+ }
1092
+ if (lastMatch && lastMatch.index > this.minLength) {
1093
+ const splitPoint = lastMatch.index + 1;
1094
+ const firstPart = content.substring(0, splitPoint);
1095
+ const secondPart = content.substring(splitPoint);
1096
+ this.buffer = firstPart;
1097
+ this.flushSegment();
1098
+ this.buffer = secondPart;
1099
+ } else {
1100
+ const midPoint = Math.floor(content.length / 2);
1101
+ const firstPart = content.substring(0, midPoint);
1102
+ const secondPart = content.substring(midPoint);
1103
+ this.buffer = firstPart;
1104
+ this.flushSegment();
1105
+ this.buffer = secondPart;
1106
+ }
1107
+ }
1108
+ /**
1109
+ * 使用指定缓冲区内容刷新为分段
1110
+ * @param bufferToFlush - 要分段的缓冲区内容
1111
+ */
1112
+ flushSegmentWithBuffer(bufferToFlush) {
1113
+ const content = bufferToFlush;
1114
+ if (!content) return;
1115
+ const isPureSymbols = /^[^\p{L}\p{N}]*$/u.test(content);
1116
+ const isTooShort = content.length < 3;
1117
+ if (isPureSymbols && isTooShort) {
1118
+ return;
1119
+ }
1120
+ const formattedContent = MarkdownFormatter3.format(content).replace(emojiRegex3(), "");
1121
+ if (!formattedContent) return;
1122
+ let subSegments = [formattedContent];
1123
+ if (formattedContent.length > this.maxLength) {
1124
+ subSegments = this.splitLongSegment(formattedContent);
1125
+ }
1126
+ for (const subSegment of subSegments) {
1127
+ if (!subSegment) continue;
1128
+ const segment = {
1129
+ index: this.segmentIndex++,
1130
+ content: subSegment,
1131
+ length: subSegment.length,
1132
+ sent: false
1133
+ };
1134
+ this.segments.push(segment);
1135
+ this.onSegmentComplete?.(segment);
1136
+ }
1137
+ }
1138
+ /**
1139
+ * 刷新当前缓冲区为分段
1140
+ */
1141
+ flushSegment() {
1142
+ const content = this.buffer.trim();
1143
+ if (!content) {
1144
+ this.buffer = "";
1145
+ return;
1146
+ }
1147
+ const isPureSymbols = /^[^\p{L}\p{N}]*$/u.test(content);
1148
+ const isTooShort = content.length < 3;
1149
+ if (isPureSymbols && isTooShort) {
1150
+ this.buffer = "";
1151
+ return;
1152
+ }
1153
+ const formattedContent = MarkdownFormatter3.format(content).replace(emojiRegex3(), "");
1154
+ if (!formattedContent) {
1155
+ this.buffer = "";
1156
+ return;
1157
+ }
1158
+ let subSegments = [formattedContent];
1159
+ if (formattedContent.length > this.maxLength) {
1160
+ subSegments = this.splitLongSegment(formattedContent);
1161
+ }
1162
+ for (const subSegment of subSegments) {
1163
+ if (!subSegment) continue;
1164
+ const segment = {
1165
+ index: this.segmentIndex++,
1166
+ content: subSegment,
1167
+ length: subSegment.length,
1168
+ sent: false
1169
+ };
1170
+ this.segments.push(segment);
1171
+ this.onSegmentComplete?.(segment);
1172
+ }
1173
+ this.buffer = "";
1174
+ }
1175
+ /**
1176
+ * 拆分超长分段
1177
+ * @param segment - 超长的分段
1178
+ * @returns 拆分后的分段数组
1179
+ */
1180
+ splitLongSegment(segment) {
1181
+ const result = [];
1182
+ let current = "";
1183
+ for (const char of segment) {
1184
+ current += char;
1185
+ const shouldSplit = /[。?!,,]/.test(char);
1186
+ if (shouldSplit && current.length <= this.maxLength) {
1187
+ result.push(current);
1188
+ current = "";
1189
+ } else if (current.length >= this.maxLength) {
1190
+ result.push(current);
1191
+ current = "";
1192
+ }
1193
+ }
1194
+ if (current) {
1195
+ result.push(current);
1196
+ }
1197
+ return result.filter((s) => s.length > 0);
1198
+ }
1199
+ /**
1200
+ * 完成流式输入
1201
+ * 处理剩余的缓冲区内容
1202
+ */
1203
+ complete() {
1204
+ if (this.isCompleted) return;
1205
+ this.isCompleted = true;
1206
+ while (this.buffer.includes("\n")) {
1207
+ const newlineIndex = this.buffer.indexOf("\n");
1208
+ if (newlineIndex === 0) {
1209
+ this.buffer = this.buffer.substring(1);
1210
+ continue;
1211
+ }
1212
+ const segmentBuffer = this.buffer.substring(0, newlineIndex);
1213
+ this.buffer = this.buffer.substring(newlineIndex + 1);
1214
+ this.flushSegmentWithBuffer(segmentBuffer);
1215
+ }
1216
+ if (this.buffer.trim()) {
1217
+ this.flushSegment();
1218
+ }
1219
+ this.onAllComplete?.(this.segments);
1220
+ }
1221
+ /**
1222
+ * 重置分段器状态
1223
+ */
1224
+ reset() {
1225
+ this.buffer = "";
1226
+ this.segmentIndex = 0;
1227
+ this.segments = [];
1228
+ this.isCompleted = false;
1229
+ }
1230
+ /**
1231
+ * 获取当前缓冲区内容
1232
+ */
1233
+ getBuffer() {
1234
+ return this.buffer;
1235
+ }
1236
+ /**
1237
+ * 获取已分段的列表
1238
+ */
1239
+ getSegments() {
1240
+ return this.segments;
1241
+ }
1242
+ /**
1243
+ * 获取统计信息
1244
+ */
1245
+ getStats() {
1246
+ return {
1247
+ bufferLength: this.buffer.length,
1248
+ segmentCount: this.segments.length,
1249
+ totalChars: this.segments.reduce((sum, seg) => sum + seg.length, 0)
1250
+ };
1251
+ }
1252
+ };
1253
+
1254
+ // src/tts/useStreamTTS.ts
1255
// Module-level state shared by every useStreamTTS instance.
var
  // WebSocket endpoint passed to the TTS client when a stream is started.
  WS_URL2 = "wss://openspeech.bytedance.com/api/v3/tts/bidirection",
  // instanceId -> { pause } for instances that registered themselves as playing.
  activeInstances2 = /* @__PURE__ */ new Map(),
  // instanceId -> cached audio of that instance's last finished session.
  sessionAudioCache = /* @__PURE__ */ new Map();
1258
/**
 * Append `params` to `url` as a URL-encoded query string.
 *
 * Only own enumerable properties are used. Entries whose value is `undefined`
 * or `null` are omitted instead of being serialized as literal text. Keys and
 * values are both percent-encoded.
 *
 * @param url - Base URL; it may already contain a query string.
 * @param params - Plain object of query parameters (may be empty or nullish).
 * @returns The URL with the parameters appended, or `url` unchanged when
 *   there is nothing to append.
 */
function buildFullUrl3(url, params) {
  const pairs = [];
  for (const [key, value] of Object.entries(params ?? {})) {
    if (value === undefined || value === null) continue;
    pairs.push(`${encodeURIComponent(key)}=${encodeURIComponent(value)}`);
  }
  if (pairs.length === 0) return url;
  // Extend an existing query string rather than adding a second "?".
  const separator = url.includes("?") ? "&" : "?";
  return `${url}${separator}${pairs.join("&")}`;
}
1267
/**
 * React hook that speaks an incrementally arriving text stream.
 *
 * Usage, as implemented below: call `connect()` to open the TTS client and
 * create the audio element, feed text with `onMessage(chunk)`, and call
 * `finishStream()` once the text source is exhausted. Incoming text is cut
 * into segments by a `StreamingTextSplitter`; segments are queued and sent to
 * the client once its session has started. Received audio buffers are
 * collected and, when the session finishes, stored in `TTSCache` and in the
 * module-level session cache under this instance's id.
 *
 * @param options.ttsConfig - Credentials/config: `token`, `appid`, optional
 *   `resourceId` and `namespace`.
 * @param options.audioParams - Optional `speaker`, `sample_rate`, `format`,
 *   `speech_rate`, `pitch_rate`, `loudness_rate`.
 * @param options.autoPlay - Start playback right after connecting (default true).
 * @param options.metricsCollector - Receives `record({ name, labels, value, timestamp })`.
 * @param options.onPlayStart - Called when the audio element starts playing.
 * @param options.onPlayPause - Called by `pause()`.
 * @param options.onPlayResume - Called by `resume()`.
 * @param options.onPlayEnd - Called when the audio element ends.
 * @param options.onError - Called with an `Error` on playback, TTS or connect failure.
 * @param options.visualization - Optional `{ enabled, fftSize, refreshInterval }`.
 * @param options.maxSegmentLength - `maxLength` given to the splitter (default 150).
 * @returns State flags (`isConnected`, `isSessionStarted`, `isSynthesizing`,
 *   `isPlaying`, `isPaused`), `error`, `streamText`, `progress`,
 *   `visualizationData`, and the controls `connect`, `onMessage`,
 *   `finishStream`, `pause`, `resume`, `stop`, `seek`, `getFrequencyData`,
 *   `getTimeDomainData`.
 */
function useStreamTTS({
  ttsConfig,
  audioParams,
  autoPlay = true,
  metricsCollector = new NoopMetricsCollector(),
  onPlayStart,
  onPlayPause,
  onPlayResume,
  onPlayEnd,
  onError,
  visualization,
  maxSegmentLength = 150
}) {
  const [isConnected, setIsConnected] = useState4(false);
  const [isSessionStarted, setIsSessionStarted] = useState4(false);
  const [isSynthesizing, setIsSynthesizing] = useState4(false);
  const [isPlaying, setIsPlaying] = useState4(false);
  const [isPaused, setIsPaused] = useState4(false);
  const [error, setErrorState] = useState4(null);
  const [streamText, setStreamText] = useState4("");
  const [progress, setProgress] = useState4(0);
  const [visualizationData, setVisualizationData] = useState4({
    frequencyData: new Uint8Array(0),
    timeDomainData: new Uint8Array(0)
  });
  // Stable per-hook-instance key used in the module-level maps.
  const instanceId = useRef4(`tts-stream-${Date.now()}-${Math.random().toString(36).slice(2)}`).current;
  const clientRef = useRef4(null);
  const audioRef = useRef4(null);
  const audioContextRef = useRef4(null);
  const analyserRef = useRef4(null);
  const sourceRef = useRef4(null);
  const audioUrlRef = useRef4(null);
  // Refs mirror state that callbacks must read synchronously.
  const streamTextRef = useRef4("");
  const isConnectedRef = useRef4(false);
  const isSessionStartedRef = useRef4(false);
  // True once startSession() has been requested, so it is requested only once.
  const calledSessionStartedRef = useRef4(false);
  const splitterRef = useRef4(null);
  const segmentQueueRef = useRef4([]);
  const isSendingRef = useRef4(false);
  const sessionAudioBuffersRef = useRef4([]);
  const isStreamFinishedRef = useRef4(false);
  const isSessionFinishedRef = useRef4(false);
  // Resolver of the promise finishStream() is waiting on, if any.
  const resolveAllSegmentsSentRef = useRef4(null);
  const currentVoiceRef = useRef4("");

  /** Lazily create the AudioContext/analyser and wire the audio element into them. */
  const initAudioContext = useCallback4(() => {
    if (!audioRef.current) return;
    if (!audioContextRef.current) {
      const AudioContextClass = window.AudioContext || window.webkitAudioContext;
      // Without Web Audio there is nothing to visualize; playback is unaffected.
      if (!AudioContextClass) return;
      audioContextRef.current = new AudioContextClass();
    }
    if (audioContextRef.current.state === "suspended") {
      audioContextRef.current.resume()?.catch((e) => {
        console.warn("[useStreamTTS] Failed to resume AudioContext:", e);
      });
    }
    if (!analyserRef.current) {
      analyserRef.current = audioContextRef.current.createAnalyser();
      analyserRef.current.fftSize = visualization?.fftSize || 256;
    }
    if (!sourceRef.current) {
      try {
        sourceRef.current = audioContextRef.current.createMediaElementSource(audioRef.current);
        sourceRef.current.connect(analyserRef.current);
        analyserRef.current.connect(audioContextRef.current.destination);
      } catch (e) {
        // Best effort: visualization is optional, so report and carry on.
        console.warn("[useStreamTTS] Failed to attach analyser to audio element:", e);
      }
    }
  }, [visualization?.fftSize]);

  /** Release the object URL, detach and drop the audio element and its source node. */
  const cleanupAudio = useCallback4(() => {
    if (audioUrlRef.current) {
      URL.revokeObjectURL(audioUrlRef.current);
      audioUrlRef.current = null;
    }
    if (audioRef.current) {
      // Detach handlers first so clearing `src` does not fire our error handler.
      audioRef.current.onerror = null;
      audioRef.current.onended = null;
      audioRef.current.onpause = null;
      audioRef.current.onplay = null;
      audioRef.current.ontimeupdate = null;
      audioRef.current.pause();
      audioRef.current.src = "";
      audioRef.current = null;
    }
    if (sourceRef.current) {
      try {
        sourceRef.current.disconnect();
      } catch (e) {
        console.warn("[useStreamTTS] Failed to disconnect audio source:", e);
      }
      sourceRef.current = null;
    }
  }, []);

  /** Pause every other registered instance. */
  const stopOthers = useCallback4(() => {
    activeInstances2.forEach((instance, id) => {
      if (id !== instanceId) {
        instance.pause();
      }
    });
  }, [instanceId]);

  /** Pause playback. */
  const pause = useCallback4(() => {
    if (audioRef.current) {
      audioRef.current.pause();
    }
    setIsPaused(true);
    setIsPlaying(false);
    onPlayPause?.();
  }, [onPlayPause]);

  /** Resume playback after pausing the other instances. */
  const resume = useCallback4(() => {
    stopOthers();
    if (audioRef.current) {
      // play() returns a promise that rejects when playback is not allowed.
      audioRef.current.play()?.catch((e) => {
        console.warn("[useStreamTTS] Resume failed:", e);
      });
    }
    setIsPaused(false);
    setIsPlaying(true);
    onPlayResume?.();
    activeInstances2.set(instanceId, { pause });
  }, [stopOthers, instanceId, pause, onPlayResume]);

  /**
   * Send the next queued segment, then schedule itself for the one after.
   * When the queue is empty and the stream is finished, finish the session
   * and release anyone waiting in finishStream().
   */
  const sendNextSegment = useCallback4(() => {
    if (!clientRef.current || !isSessionStartedRef.current || isSendingRef.current || isSessionFinishedRef.current) {
      return;
    }
    if (segmentQueueRef.current.length === 0) {
      if (isStreamFinishedRef.current && !isSessionFinishedRef.current) {
        console.log("[useStreamTTS] All segments sent, finishing session");
        isSessionFinishedRef.current = true;
        clientRef.current.finishSession();
        resolveAllSegmentsSentRef.current?.();
        resolveAllSegmentsSentRef.current = null;
      }
      return;
    }
    isSendingRef.current = true;
    const segment = segmentQueueRef.current.shift();
    console.log(`[useStreamTTS] Sending segment ${segment.index}: ${segment.content.substring(0, 30)}...`);
    clientRef.current.sendText(segment.content);
    segment.sent = true;
    isSendingRef.current = false;
    setTimeout(() => sendNextSegment(), 0);
  }, []);

  /** Close the client, release audio resources and reset all per-session state. */
  const stop = useCallback4(() => {
    if (clientRef.current) {
      clientRef.current.close();
      clientRef.current = null;
    }
    cleanupAudio();
    setIsConnected(false);
    isConnectedRef.current = false;
    setIsSessionStarted(false);
    isSessionStartedRef.current = false;
    calledSessionStartedRef.current = false;
    setIsPlaying(false);
    setIsPaused(false);
    setIsSynthesizing(false);
    setProgress(0);
    activeInstances2.delete(instanceId);
    streamTextRef.current = "";
    setStreamText("");
    segmentQueueRef.current = [];
    isSendingRef.current = false;
    sessionAudioBuffersRef.current = [];
    isStreamFinishedRef.current = false;
    isSessionFinishedRef.current = false;
    splitterRef.current?.reset();
    // The queue is gone, so a pending finishStream() would otherwise wait forever.
    resolveAllSegmentsSentRef.current?.();
    resolveAllSegmentsSentRef.current = null;
  }, [cleanupAudio, instanceId]);

  /** Open a new TTS connection and prepare playback. Any previous session is stopped first. */
  const connect = useCallback4(async () => {
    // stop() already resets every per-session ref and state value.
    stop();
    setErrorState(null);
    const voice = audioParams?.speaker || "zh_female_vv_uranus_bigtts";
    currentVoiceRef.current = voice;
    const startTime = Date.now();
    metricsCollector.record({
      name: "tts_request",
      labels: { voice, text_length: 0 },
      value: 1,
      timestamp: startTime
    });
    try {
      const audio = new Audio();
      audio.crossOrigin = "anonymous";
      audioRef.current = audio;
      audio.onplay = () => {
        setIsPlaying(true);
        setIsPaused(false);
        onPlayStart?.();
        initAudioContext();
        activeInstances2.set(instanceId, { pause });
      };
      audio.onended = () => {
        setIsPlaying(false);
        setIsPaused(false);
        onPlayEnd?.();
        activeInstances2.delete(instanceId);
      };
      audio.onerror = (e) => {
        console.error("[useStreamTTS] Audio playback error:", e, audio.error);
        setErrorState(audio.error?.message || "Audio playback error");
        onError?.(new Error(audio.error?.message || "Audio playback error"));
      };
      audio.ontimeupdate = () => {
        let duration = audio.duration;
        // While streaming the duration is not finite; use the buffered end instead.
        if (!isFinite(duration) && audio.buffered.length > 0) {
          duration = audio.buffered.end(audio.buffered.length - 1);
        }
        if (isFinite(duration) && duration > 0) {
          setProgress(audio.currentTime / duration * 100);
        }
      };
      clientRef.current = WebsocketMSE3({ autoStartSession: false });
      splitterRef.current = new StreamingTextSplitter({
        maxLength: maxSegmentLength,
        onSegmentComplete: (segment) => {
          segmentQueueRef.current.push(segment);
          console.log(`[useStreamTTS] Segment ${segment.index} queued (${segment.length} chars)`);
          if (isSessionStartedRef.current) {
            sendNextSegment();
          }
        },
        onAllComplete: () => {
          console.log(`[useStreamTTS] All segments completed, total: ${segmentQueueRef.current.length} in queue`);
        }
      });
      const url = clientRef.current.start({
        url: buildFullUrl3(WS_URL2, {
          api_access_key: `Jwt; ${ttsConfig.token}`,
          api_app_key: ttsConfig.appid,
          api_resource_id: ttsConfig.resourceId || "seed-tts-2.0"
        }),
        config: {
          user: {
            uid: `req-${Date.now()}`
          },
          namespace: ttsConfig.namespace || "BidirectionalTTS",
          req_params: {
            speaker: voice,
            audio_params: {
              sample_rate: audioParams?.sample_rate || 24e3,
              format: audioParams?.format || "mp3",
              speech_rate: audioParams?.speech_rate,
              pitch_rate: audioParams?.pitch_rate,
              loudness_rate: audioParams?.loudness_rate
            },
            additions: JSON.stringify({
              enable_language_detector: true,
              disable_markdown_filter: true,
              enable_latex_tn: true
            })
          }
        },
        // ===== Key callbacks =====
        onStart: () => {
          setIsConnected(true);
          isConnectedRef.current = true;
          console.log("[useStreamTTS] WebSocket connected, waiting for text...");
          // Text may have arrived while the socket was still opening; onMessage
          // could not start the session then, so start it now.
          if (streamTextRef.current && !calledSessionStartedRef.current && !isSessionStartedRef.current && clientRef.current) {
            calledSessionStartedRef.current = true;
            clientRef.current.startSession();
          }
        },
        onSessionStarted: () => {
          setIsSessionStarted(true);
          isSessionStartedRef.current = true;
          console.log("[useStreamTTS] Session started, can send text now");
          // Always run the sender: with an empty queue it still finishes the
          // session when the stream has already ended.
          sendNextSegment();
        },
        onMessage: (data) => {
          setIsSynthesizing(true);
          if (sessionAudioBuffersRef.current.length === 0) {
            metricsCollector.record({
              name: "tts_latency",
              labels: { stage: "first_packet", voice },
              value: Date.now() - startTime,
              timestamp: Date.now()
            });
          }
          // Keep a private copy of each packet for the cache.
          const buffer = data instanceof ArrayBuffer ? data.slice(0) : new Uint8Array(data).buffer;
          sessionAudioBuffersRef.current.push(buffer);
        },
        onSessionFinished: () => {
          setIsSynthesizing(false);
          setIsSessionStarted(false);
          isSessionStartedRef.current = false;
          calledSessionStartedRef.current = false;
          if (sessionAudioBuffersRef.current.length > 0 && streamTextRef.current) {
            const speed = audioParams?.speech_rate || 0;
            const cacheKey = TTSCache.generateKey(streamTextRef.current, voice, speed);
            TTSCache.set(cacheKey, [...sessionAudioBuffersRef.current]);
            sessionAudioCache.set(instanceId, {
              streamText: streamTextRef.current,
              audioBuffers: [...sessionAudioBuffersRef.current],
              timestamp: Date.now(),
              voice,
              speed
            });
            console.log(`[useStreamTTS] Session finished, cached ${sessionAudioBuffersRef.current.length} audio buffers`);
          }
          metricsCollector.record({
            name: "tts_synthesis_finished",
            labels: { voice, text_length: streamTextRef.current.length },
            value: Date.now() - startTime,
            timestamp: Date.now()
          });
        },
        onError: (err) => {
          console.error("[useStreamTTS] TTS error:", err);
          setErrorState(err.msg || "TTS error");
          onError?.(new Error(err.msg || "TTS error"));
          setIsSynthesizing(false);
          // Do not leave finishStream() waiting on a session that has failed.
          resolveAllSegmentsSentRef.current?.();
          resolveAllSegmentsSentRef.current = null;
        }
      });
      audioUrlRef.current = url;
      audio.src = url;
      if (autoPlay) {
        try {
          await audio.play();
        } catch (e) {
          console.warn("[useStreamTTS] Autoplay blocked:", e);
        }
      }
    } catch (err) {
      console.error("[useStreamTTS] Connect error:", err);
      // Release the half-initialized client/audio element.
      stop();
      setErrorState(String(err));
      onError?.(err instanceof Error ? err : new Error(String(err)));
    }
  }, [
    ttsConfig,
    audioParams,
    autoPlay,
    stop,
    instanceId,
    onPlayStart,
    onPlayEnd,
    initAudioContext,
    pause,
    metricsCollector,
    maxSegmentLength,
    sendNextSegment,
    onError
  ]);

  /** Feed one chunk of streamed text; starts the session on the first chunk once connected. */
  const onMessage = useCallback4((chunk) => {
    if (!chunk) return;
    streamTextRef.current += chunk;
    setStreamText(streamTextRef.current);
    if (!calledSessionStartedRef.current && !isSessionStartedRef.current && clientRef.current && isConnectedRef.current) {
      console.log("[useStreamTTS] First text received, starting session...");
      calledSessionStartedRef.current = true;
      clientRef.current.startSession();
    }
    splitterRef.current?.onChunk(chunk);
  }, []);

  /**
   * Signal that no more text will arrive. Resolves once every queued segment
   * has been sent and the session was asked to finish (or the hook was
   * stopped / the TTS client reported an error).
   */
  const finishStream = useCallback4(async () => {
    isStreamFinishedRef.current = true;
    splitterRef.current?.complete();
    console.log(`[useStreamTTS] Stream finished, ${segmentQueueRef.current.length} segments remaining in queue`);
    if (segmentQueueRef.current.length > 0 || isSendingRef.current) {
      const allSegmentsSent = new Promise((resolve) => {
        resolveAllSegmentsSentRef.current = resolve;
      });
      // Make sure the sender is running; it is a no-op if the session is not ready.
      sendNextSegment();
      await allSegmentsSent;
    } else if (clientRef.current && isSessionStartedRef.current && !isSessionFinishedRef.current) {
      isSessionFinishedRef.current = true;
      clientRef.current.finishSession();
    }
  }, [sendNextSegment]);

  /**
   * Jump to a position in the audio.
   * @param percentage - Target position, 0-100 (values outside are clamped).
   */
  const seek = useCallback4((percentage) => {
    if (audioRef.current) {
      let duration = audioRef.current.duration;
      if (!isFinite(duration) && audioRef.current.buffered.length > 0) {
        duration = audioRef.current.buffered.end(audioRef.current.buffered.length - 1);
      }
      if (isFinite(duration) && duration > 0) {
        const clamped = Math.min(100, Math.max(0, percentage));
        const time = clamped / 100 * duration;
        if (isFinite(time)) {
          audioRef.current.currentTime = time;
          setProgress(clamped);
        }
      }
    }
  }, []);

  /** @returns Current frequency-domain data, or an empty array without an analyser. */
  const getFrequencyData = useCallback4(() => {
    if (!analyserRef.current) return new Uint8Array(0);
    const dataArray = new Uint8Array(analyserRef.current.frequencyBinCount);
    analyserRef.current.getByteFrequencyData(dataArray);
    return dataArray;
  }, []);

  /** @returns Current time-domain data, or an empty array without an analyser. */
  const getTimeDomainData = useCallback4(() => {
    if (!analyserRef.current) return new Uint8Array(0);
    const dataArray = new Uint8Array(analyserRef.current.frequencyBinCount);
    analyserRef.current.getByteTimeDomainData(dataArray);
    return dataArray;
  }, []);

  // While playing, refresh visualizationData on animation frames, throttled
  // by visualization.refreshInterval.
  useEffect3(() => {
    if (!visualization?.enabled) return;
    let animId;
    let lastUpdate = 0;
    const interval = visualization.refreshInterval || 0;
    const update = (timestamp) => {
      if (isPlaying && !isPaused) {
        if (timestamp - lastUpdate >= interval) {
          setVisualizationData({
            frequencyData: getFrequencyData(),
            timeDomainData: getTimeDomainData()
          });
          lastUpdate = timestamp;
        }
        animId = requestAnimationFrame(update);
      }
    };
    if (isPlaying && !isPaused) {
      animId = requestAnimationFrame(update);
    }
    return () => {
      if (animId) cancelAnimationFrame(animId);
    };
  }, [isPlaying, isPaused, visualization, getFrequencyData, getTimeDomainData]);

  // Tear everything down when the component unmounts.
  useEffect3(() => {
    return () => {
      stop();
      const audioContext = audioContextRef.current;
      if (audioContext) {
        // Drop the refs so a closed context (and its analyser) is never reused.
        audioContextRef.current = null;
        analyserRef.current = null;
        audioContext.close()?.catch((e) => {
          console.warn("[useStreamTTS] Failed to close AudioContext:", e);
        });
      }
    };
  }, [stop]);

  return {
    isConnected,
    isSessionStarted,
    isSynthesizing,
    isPlaying,
    isPaused,
    error,
    streamText,
    progress,
    connect,
    onMessage,
    finishStream,
    pause,
    resume,
    stop,
    seek,
    getFrequencyData,
    getTimeDomainData,
    visualizationData
  };
}
1713
/**
 * Look up the session audio cached for a hook instance.
 * @param instanceId - Key the entry was stored under.
 * @returns The cached entry, or `undefined` when there is none.
 */
function getSessionAudioCache(instanceId) {
  const cachedEntry = sessionAudioCache.get(instanceId);
  return cachedEntry;
}
1716
/**
 * Remove the session audio cached for a hook instance, if any.
 * @param instanceId - Key of the entry to drop.
 */
function clearSessionAudioCache(instanceId) {
  if (!sessionAudioCache.has(instanceId)) return;
  sessionAudioCache.delete(instanceId);
}
1719
/**
 * Find a cached session whose text, voice and speed all match exactly.
 * @param streamText - Full text of the session.
 * @param voice - Voice the audio was synthesized with.
 * @param speed - Speed the audio was synthesized with.
 * @returns The first matching entry in insertion order, or `undefined`.
 */
function findSessionCacheByText(streamText, voice, speed) {
  const isSameRequest = (candidate) =>
    candidate.streamText === streamText &&
    candidate.voice === voice &&
    candidate.speed === speed;
  return Array.from(sessionAudioCache.values()).find(isSameRequest);
}
1727
+
987
1728
  // src/components/AudioWaveVisualizer.tsx
988
- import { useEffect as useEffect3, useRef as useRef4 } from "react";
1729
+ import { useEffect as useEffect4, useRef as useRef5 } from "react";
989
1730
  import { jsx } from "react/jsx-runtime";
990
1731
  var AudioWaveVisualizer = ({
991
1732
  isPlaying,
@@ -1000,8 +1741,8 @@ var AudioWaveVisualizer = ({
1000
1741
  className,
1001
1742
  styleObj
1002
1743
  }) => {
1003
- const canvasRef = useRef4(null);
1004
- const requestRef = useRef4(null);
1744
+ const canvasRef = useRef5(null);
1745
+ const requestRef = useRef5(null);
1005
1746
  const progressBackground = Array.isArray(color) ? `linear-gradient(90deg, ${color[0]}, ${color[1]})` : color;
1006
1747
  const textColor = Array.isArray(color) ? color[0] : color;
1007
1748
  const draw = () => {
@@ -1066,7 +1807,7 @@ var AudioWaveVisualizer = ({
1066
1807
  requestRef.current = requestAnimationFrame(draw);
1067
1808
  }
1068
1809
  };
1069
- useEffect3(() => {
1810
+ useEffect4(() => {
1070
1811
  if (isPlaying && !isPaused) {
1071
1812
  requestRef.current = requestAnimationFrame(draw);
1072
1813
  } else {
@@ -1110,7 +1851,7 @@ var AudioWaveVisualizer = ({
1110
1851
  var AudioWaveVisualizer_default = AudioWaveVisualizer;
1111
1852
 
1112
1853
  // src/components/AudioProgressBar.tsx
1113
- import { useEffect as useEffect4, useRef as useRef5 } from "react";
1854
+ import { useEffect as useEffect5, useRef as useRef6 } from "react";
1114
1855
  import { jsx as jsx2, jsxs } from "react/jsx-runtime";
1115
1856
  var AudioProgressBar = ({
1116
1857
  progress,
@@ -1123,16 +1864,16 @@ var AudioProgressBar = ({
1123
1864
  className,
1124
1865
  style
1125
1866
  }) => {
1126
- const progressBarRef = useRef5(null);
1127
- const containerRef = useRef5(null);
1128
- const progressTextRef = useRef5(null);
1129
- const thumbRef = useRef5(null);
1130
- const displayedProgress = useRef5(0);
1131
- const requestRef = useRef5(null);
1132
- const isDragging = useRef5(false);
1133
- const isHovering = useRef5(false);
1134
- const isTouch = useRef5(false);
1135
- useEffect4(() => {
1867
+ const progressBarRef = useRef6(null);
1868
+ const containerRef = useRef6(null);
1869
+ const progressTextRef = useRef6(null);
1870
+ const thumbRef = useRef6(null);
1871
+ const displayedProgress = useRef6(0);
1872
+ const requestRef = useRef6(null);
1873
+ const isDragging = useRef6(false);
1874
+ const isHovering = useRef6(false);
1875
+ const isTouch = useRef6(false);
1876
+ useEffect5(() => {
1136
1877
  const match = window.matchMedia("(pointer: coarse)");
1137
1878
  isTouch.current = match.matches;
1138
1879
  if (isTouch.current && thumbRef.current) {
@@ -1171,7 +1912,7 @@ var AudioProgressBar = ({
1171
1912
  requestRef.current = requestAnimationFrame(animate);
1172
1913
  }
1173
1914
  };
1174
- useEffect4(() => {
1915
+ useEffect5(() => {
1175
1916
  requestRef.current = requestAnimationFrame(animate);
1176
1917
  return () => {
1177
1918
  if (requestRef.current) {
@@ -1357,8 +2098,13 @@ var AudioProgressBar_default = AudioProgressBar;
1357
2098
  export {
1358
2099
  AudioProgressBar_default as AudioProgressBar,
1359
2100
  AudioWaveVisualizer_default as AudioWaveVisualizer,
2101
+ StreamingTextSplitter,
2102
+ clearSessionAudioCache,
2103
+ findSessionCacheByText,
2104
+ getSessionAudioCache,
1360
2105
  splitTextByDelimiters,
1361
2106
  useMessageTTS,
2107
+ useStreamTTS,
1362
2108
  useVolcanoASR,
1363
2109
  useVolcanoTTS
1364
2110
  };