@wq-hook/volcano-react 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -32,8 +32,13 @@ var index_exports = {};
32
32
  __export(index_exports, {
33
33
  AudioProgressBar: () => AudioProgressBar_default,
34
34
  AudioWaveVisualizer: () => AudioWaveVisualizer_default,
35
+ StreamingTextSplitter: () => StreamingTextSplitter,
36
+ clearSessionAudioCache: () => clearSessionAudioCache,
37
+ findSessionCacheByText: () => findSessionCacheByText,
38
+ getSessionAudioCache: () => getSessionAudioCache,
35
39
  splitTextByDelimiters: () => splitTextByDelimiters,
36
40
  useMessageTTS: () => useMessageTTS,
41
+ useStreamTTS: () => useStreamTTS,
37
42
  useVolcanoASR: () => useVolcanoASR,
38
43
  useVolcanoTTS: () => useVolcanoTTS
39
44
  });
@@ -550,10 +555,12 @@ function useMessageTTS({
550
555
  const [isSynthesizing, setIsSynthesizing] = (0, import_react3.useState)(false);
551
556
  const [error, setErrorState] = (0, import_react3.useState)(null);
552
557
  const [progress, setProgress] = (0, import_react3.useState)(0);
553
- const [visualizationData, setVisualizationData] = (0, import_react3.useState)({
554
- frequencyData: new Uint8Array(0),
555
- timeDomainData: new Uint8Array(0)
556
- });
558
+ const [visualizationData, setVisualizationData] = (0, import_react3.useState)(
559
+ {
560
+ frequencyData: new Uint8Array(0),
561
+ timeDomainData: new Uint8Array(0)
562
+ }
563
+ );
557
564
  const instanceId = (0, import_react3.useRef)(
558
565
  `tts-${Date.now()}-${Math.random().toString(36).slice(2)}`
559
566
  ).current;
@@ -761,7 +768,11 @@ function useMessageTTS({
761
768
  console.error("Audio playback error:", e, audio.error);
762
769
  metricsCollector.record({
763
770
  name: "tts_error",
764
- labels: { error_code: "playback_error", voice, detail: audio.error?.message || String(audio.error?.code) },
771
+ labels: {
772
+ error_code: "playback_error",
773
+ voice,
774
+ detail: audio.error?.message || String(audio.error?.code)
775
+ },
765
776
  value: 1,
766
777
  timestamp: Date.now()
767
778
  });
@@ -779,7 +790,10 @@ function useMessageTTS({
779
790
  }
780
791
  };
781
792
  if (cachedData) {
782
- const totalSize = cachedData.reduce((acc, buf) => acc + buf.byteLength, 0);
793
+ const totalSize = cachedData.reduce(
794
+ (acc, buf) => acc + buf.byteLength,
795
+ 0
796
+ );
783
797
  metricsCollector.record({
784
798
  name: "tts_cache_hit",
785
799
  labels: { voice, speed },
@@ -797,7 +811,9 @@ function useMessageTTS({
797
811
  })
798
812
  );
799
813
  if (totalSize === 0) {
800
- console.warn("[useMessageTTS] Cached data is empty, falling back to stream");
814
+ console.warn(
815
+ "[useMessageTTS] Cached data is empty, falling back to stream"
816
+ );
801
817
  } else {
802
818
  const blob = new Blob(cachedData, { type: "audio/mpeg" });
803
819
  const url2 = URL.createObjectURL(blob);
@@ -816,7 +832,10 @@ function useMessageTTS({
816
832
  }
817
833
  console.log("[useMessageTTS] Cache miss, starting stream");
818
834
  clientRef.current = (0, import_tts2.WebsocketMSE)({ autoStartSession: true });
819
- const formattedText = import_volcano_sdk2.MarkdownFormatter.format(text).replace((0, import_emoji_regex2.default)(), "");
835
+ const formattedText = import_volcano_sdk2.MarkdownFormatter.format(text).replace(
836
+ (0, import_emoji_regex2.default)(),
837
+ ""
838
+ );
820
839
  const segments = splitTextByDelimiters(formattedText);
821
840
  const url = clientRef.current.start({
822
841
  url: buildFullUrl2(WS_URL, {
@@ -934,6 +953,14 @@ function useMessageTTS({
934
953
  console.warn(
935
954
  `[useMessageTTS] Voice ${failedVoice} failed, switching to fallback voice ${fallbackVoice}`
936
955
  );
956
+ if (clientRef.current) {
957
+ clientRef.current.close();
958
+ clientRef.current = null;
959
+ }
960
+ if (audioRef.current) {
961
+ audioRef.current.pause();
962
+ audioRef.current = null;
963
+ }
937
964
  executeTTS(text, fallbackVoice);
938
965
  } else {
939
966
  playFallback(text);
@@ -943,7 +970,7 @@ function useMessageTTS({
943
970
  );
944
971
  const play = (0, import_react3.useCallback)(
945
972
  (text) => {
946
- const voice = audioParams?.speaker || "default";
973
+ const voice = audioParams?.speaker || "zh_female_vv_uranus_bigtts";
947
974
  return executeTTS(text, voice);
948
975
  },
949
976
  [audioParams, executeTTS]
@@ -996,7 +1023,9 @@ function useMessageTTS({
996
1023
  if (audioRef.current) {
997
1024
  let duration = audioRef.current.duration;
998
1025
  if (!isFinite(duration) && audioRef.current.buffered.length > 0) {
999
- duration = audioRef.current.buffered.end(audioRef.current.buffered.length - 1);
1026
+ duration = audioRef.current.buffered.end(
1027
+ audioRef.current.buffered.length - 1
1028
+ );
1000
1029
  }
1001
1030
  if (isFinite(duration) && duration > 0) {
1002
1031
  const time = percentage / 100 * duration;
@@ -1025,8 +1054,722 @@ function useMessageTTS({
1025
1054
  };
1026
1055
  }
1027
1056
 
1028
- // src/components/AudioWaveVisualizer.tsx
1057
+ // src/tts/useStreamTTS.ts
1058
+ var import_tts3 = require("@wq-hook/volcano-sdk/tts");
1029
1059
  var import_react4 = require("react");
1060
+
1061
+ // src/tts/StreamingTextSplitter.ts
1062
+ var import_volcano_sdk3 = require("@wq-hook/volcano-sdk");
1063
+ var import_emoji_regex3 = __toESM(require("emoji-regex"));
1064
/**
 * Incrementally cuts streamed text into segments.
 *
 * Text is appended chunk by chunk. A segment is emitted for every complete
 * line (terminated by "\n"), and the buffer is force-split once it reaches
 * `maxLength` characters. Each emitted segment is passed through
 * `MarkdownFormatter.format` and has emoji removed before it is reported.
 */
var StreamingTextSplitter = class {
  /**
   * @param {object} [options]
   * @param {number} [options.maxLength] - Buffer/segment length that triggers a split (falsy values fall back to 150).
   * @param {number} [options.minLength] - A sentence ender must sit past this index to be used as a split point (falsy values fall back to 10).
   * @param {(segment: object) => void} [options.onSegmentComplete] - Called for every emitted segment.
   * @param {(segments: object[]) => void} [options.onAllComplete] - Called once by `complete()` with all segments.
   */
  constructor(options = {}) {
    /** Text received but not yet emitted as a segment. */
    this.buffer = "";
    /** Index assigned to the next emitted segment. */
    this.segmentIndex = 0;
    /** Every segment emitted so far. */
    this.segments = [];
    /** Set by `complete()`; further chunks are ignored afterwards. */
    this.isCompleted = false;
    this.maxLength = options.maxLength || 150;
    this.minLength = options.minLength || 10;
    this.onSegmentComplete = options.onSegmentComplete;
    this.onAllComplete = options.onAllComplete;
  }

  /**
   * Feeds one chunk of streamed text into the splitter.
   * @param {string} chunk - Text chunk; empty chunks and chunks after completion are ignored.
   */
  onChunk(chunk) {
    if (!chunk || this.isCompleted) return;
    this.buffer += chunk;
    if (this.detectBoundary(chunk)) {
      // Drain every complete line, including the ones that follow a leading
      // newline, so no finished line is left waiting for a later chunk.
      this.drainCompleteLines();
    }
  }

  /**
   * Emits one segment per complete line currently in the buffer and leaves
   * the trailing, unterminated text in the buffer. Empty lines are dropped.
   */
  drainCompleteLines() {
    let newlineIndex = this.buffer.indexOf("\n");
    while (newlineIndex !== -1) {
      const line = this.buffer.substring(0, newlineIndex);
      this.buffer = this.buffer.substring(newlineIndex + 1);
      if (line) {
        this.flushSegmentWithBuffer(line);
      }
      newlineIndex = this.buffer.indexOf("\n");
    }
  }

  /**
   * Decides whether the latest chunk created a segment boundary.
   * Side effect: force-splits the buffer when it has reached `maxLength`.
   * @param {string} chunk - The chunk that was just appended.
   * @returns {boolean} True when the chunk contains a newline or the buffer reached `maxLength`.
   */
  detectBoundary(chunk) {
    const bufferIsFull = this.buffer.length >= this.maxLength;
    if (bufferIsFull) {
      this.forceSplitAtSentenceBoundary();
    }
    return bufferIsFull || chunk.includes("\n");
  }

  /**
   * Splits an over-long buffer: everything up to the last sentence ender is
   * emitted when that ender lies past `minLength`; otherwise the buffer is
   * cut in the middle. The remainder stays in the buffer.
   */
  forceSplitAtSentenceBoundary() {
    const content = this.buffer;
    let lastEnderIndex = -1;
    for (const match of content.matchAll(/[。?!]/g)) {
      lastEnderIndex = match.index;
    }
    let splitPoint;
    if (lastEnderIndex !== -1 && lastEnderIndex > this.minLength) {
      splitPoint = lastEnderIndex + 1;
    } else {
      splitPoint = Math.floor(content.length / 2);
      // Never cut between the two halves of a surrogate pair: that would
      // leave a lone surrogate on each side of the split.
      if (splitPoint > 0 && splitPoint < content.length) {
        const before = content.charCodeAt(splitPoint - 1);
        const after = content.charCodeAt(splitPoint);
        const isHighSurrogate = before >= 0xd800 && before <= 0xdbff;
        const isLowSurrogate = after >= 0xdc00 && after <= 0xdfff;
        if (isHighSurrogate && isLowSurrogate) {
          splitPoint += 1;
        }
      }
    }
    this.buffer = content.substring(0, splitPoint);
    this.flushSegment();
    this.buffer = content.substring(splitPoint);
  }

  /**
   * Formats the given text and emits it as one or more segments.
   * Text that is empty, or that is shorter than 3 characters and contains no
   * letter or digit, is dropped.
   * @param {string} bufferToFlush - Text to turn into segments.
   */
  flushSegmentWithBuffer(bufferToFlush) {
    const content = bufferToFlush;
    if (!content) return;
    const isPureSymbols = /^[^\p{L}\p{N}]*$/u.test(content);
    const isTooShort = content.length < 3;
    if (isPureSymbols && isTooShort) return;
    const formattedContent = import_volcano_sdk3.MarkdownFormatter.format(content).replace((0, import_emoji_regex3.default)(), "");
    if (!formattedContent) return;
    const subSegments = formattedContent.length > this.maxLength ? this.splitLongSegment(formattedContent) : [formattedContent];
    for (const subSegment of subSegments) {
      if (!subSegment) continue;
      const segment = {
        index: this.segmentIndex++,
        content: subSegment,
        length: subSegment.length,
        sent: false
      };
      this.segments.push(segment);
      this.onSegmentComplete?.(segment);
    }
  }

  /**
   * Emits the trimmed buffer as segment(s) and then empties the buffer.
   */
  flushSegment() {
    this.flushSegmentWithBuffer(this.buffer.trim());
    this.buffer = "";
  }

  /**
   * Breaks an over-long piece of text into smaller pieces, cutting after
   * each punctuation mark in the set below or once `maxLength` is reached.
   * @param {string} segment - Text longer than `maxLength`.
   * @returns {string[]} The non-empty pieces, in order.
   */
  splitLongSegment(segment) {
    const result = [];
    let current = "";
    for (const char of segment) {
      current += char;
      const shouldSplit = /[。?!,,]/.test(char);
      if ((shouldSplit && current.length <= this.maxLength) || current.length >= this.maxLength) {
        result.push(current);
        current = "";
      }
    }
    if (current) {
      result.push(current);
    }
    return result.filter((piece) => piece.length > 0);
  }

  /**
   * Marks the stream as finished, emits whatever is still buffered and
   * reports all segments through `onAllComplete`. Subsequent calls are no-ops.
   */
  complete() {
    if (this.isCompleted) return;
    this.isCompleted = true;
    this.drainCompleteLines();
    if (this.buffer.trim()) {
      this.flushSegment();
    }
    this.onAllComplete?.(this.segments);
  }

  /**
   * Restores the splitter to its initial, empty state.
   */
  reset() {
    this.buffer = "";
    this.segmentIndex = 0;
    this.segments = [];
    this.isCompleted = false;
  }

  /**
   * @returns {string} Text that has not been emitted yet.
   */
  getBuffer() {
    return this.buffer;
  }

  /**
   * @returns {object[]} The live list of emitted segments.
   */
  getSegments() {
    return this.segments;
  }

  /**
   * @returns {{bufferLength: number, segmentCount: number, totalChars: number}} Current counters.
   */
  getStats() {
    return {
      bufferLength: this.buffer.length,
      segmentCount: this.segments.length,
      totalChars: this.segments.reduce((sum, seg) => sum + seg.length, 0)
    };
  }
};
1299
+
1300
+ // src/tts/useStreamTTS.ts
1301
// Endpoint used by useStreamTTS when opening the TTS WebSocket.
var WS_URL2 = "wss://openspeech.bytedance.com/api/v3/tts/bidirection";
// Hook instances that are currently playing, keyed by instance id; each value
// exposes a `pause` function so another instance can silence it.
var activeInstances2 = /* @__PURE__ */ new Map();
// Audio captured per finished session, keyed by hook instance id. Entries are
// only removed through clearSessionAudioCache.
var sessionAudioCache = /* @__PURE__ */ new Map();
1304
/**
 * Appends query parameters to a URL.
 *
 * Keys and values are percent-encoded with `encodeURIComponent`. Parameters
 * whose value is `undefined` or `null` are omitted instead of being sent as
 * the literal text "undefined"/"null". When nothing is left to append the URL
 * is returned unchanged, and a URL that already carries a query string is
 * extended with "&" rather than a second "?".
 *
 * @param {string} url - Base URL.
 * @param {Record<string, unknown>} params - Own enumerable properties become query parameters.
 * @returns {string} The URL including the encoded query string.
 */
function buildFullUrl3(url, params) {
  const pairs = Object.entries(params ?? {})
    .filter(([, value]) => value !== undefined && value !== null)
    .map(([key, value]) => `${encodeURIComponent(key)}=${encodeURIComponent(value)}`);
  if (pairs.length === 0) {
    return url;
  }
  const separator = url.includes("?") ? "&" : "?";
  return `${url}${separator}${pairs.join("&")}`;
}
1313
/**
 * React hook that synthesises speech for text that arrives as a stream.
 *
 * Typical flow: call `connect()` to open the WebSocket and create the audio
 * element, feed text through `onMessage(chunk)`, then call `finishStream()`
 * once the text source is exhausted. Incoming text is cut into segments by a
 * StreamingTextSplitter and the segments are sent to the TTS client one by one.
 *
 * @param {object} options
 * @param {object} options.ttsConfig - Credentials and routing: `token`, `appid`, optional `resourceId` and `namespace`.
 * @param {object} [options.audioParams] - Optional `speaker`, `sample_rate`, `format`, `speech_rate`, `pitch_rate`, `loudness_rate`.
 * @param {boolean} [options.autoPlay=true] - Start playback as soon as the audio source is attached.
 * @param {object} [options.metricsCollector] - Receives `record()` calls; a NoopMetricsCollector is used when omitted.
 * @param {Function} [options.onPlayStart]
 * @param {Function} [options.onPlayPause]
 * @param {Function} [options.onPlayResume]
 * @param {Function} [options.onPlayEnd]
 * @param {(error: Error) => void} [options.onError]
 * @param {object} [options.visualization] - Optional `enabled`, `fftSize`, `refreshInterval`.
 * @param {number} [options.maxSegmentLength=150] - Passed to the splitter as `maxLength`.
 * @returns {object} Playback/connection state plus `connect`, `onMessage`, `finishStream`,
 *   `pause`, `resume`, `stop`, `seek`, `getFrequencyData`, `getTimeDomainData`.
 */
function useStreamTTS({
  ttsConfig,
  audioParams,
  autoPlay = true,
  metricsCollector: metricsCollectorOption,
  onPlayStart,
  onPlayPause,
  onPlayResume,
  onPlayEnd,
  onError,
  visualization,
  maxSegmentLength = 150
}) {
  const { useCallback, useEffect, useRef, useState } = import_react4;

  const [isConnected, setIsConnected] = useState(false);
  const [isSessionStarted, setIsSessionStarted] = useState(false);
  const [isSynthesizing, setIsSynthesizing] = useState(false);
  const [isPlaying, setIsPlaying] = useState(false);
  const [isPaused, setIsPaused] = useState(false);
  const [error, setErrorState] = useState(null);
  const [streamText, setStreamText] = useState("");
  const [progress, setProgress] = useState(0);
  const [visualizationData, setVisualizationData] = useState({
    frequencyData: new Uint8Array(0),
    timeDomainData: new Uint8Array(0)
  });

  const instanceId = useRef(`tts-stream-${Date.now()}-${Math.random().toString(36).slice(2)}`).current;
  const clientRef = useRef(null);
  const audioRef = useRef(null);
  const audioContextRef = useRef(null);
  const analyserRef = useRef(null);
  const sourceRef = useRef(null);
  const audioUrlRef = useRef(null);
  const streamTextRef = useRef("");
  // Mirrors `isConnected` so callbacks always see the current value instead
  // of the one captured when they were created.
  const isConnectedRef = useRef(false);
  const isSessionStartedRef = useRef(false);
  const calledSessionStartedRef = useRef(false);
  const splitterRef = useRef(null);
  const segmentQueueRef = useRef([]);
  const isSendingRef = useRef(false);
  const sessionAudioBuffersRef = useRef([]);
  const isStreamFinishedRef = useRef(false);
  const isSessionFinishedRef = useRef(false);
  const resolveAllSegmentsSentRef = useRef(null);
  const currentVoiceRef = useRef("");

  // Build the fallback collector once. A default parameter would create a new
  // instance on every render and change the identity of `connect` each time.
  const fallbackCollectorRef = useRef(null);
  if (metricsCollectorOption == null && fallbackCollectorRef.current === null) {
    fallbackCollectorRef.current = new NoopMetricsCollector();
  }
  const metricsCollector = metricsCollectorOption ?? fallbackCollectorRef.current;

  /** Lazily creates the AudioContext/analyser and wires the audio element into them. */
  const initAudioContext = useCallback(() => {
    if (!audioRef.current) return;
    if (!audioContextRef.current) {
      const AudioContextClass = window.AudioContext || window.webkitAudioContext;
      audioContextRef.current = new AudioContextClass();
    }
    const context = audioContextRef.current;
    if (context.state === "suspended") {
      Promise.resolve(context.resume()).catch((err) => {
        console.warn("[useStreamTTS] Failed to resume AudioContext:", err);
      });
    }
    if (!analyserRef.current) {
      analyserRef.current = context.createAnalyser();
      analyserRef.current.fftSize = visualization?.fftSize || 256;
    }
    if (!sourceRef.current) {
      try {
        sourceRef.current = context.createMediaElementSource(audioRef.current);
        sourceRef.current.connect(analyserRef.current);
        analyserRef.current.connect(context.destination);
      } catch (e) {
        // Visualisation is optional: report the failure and keep playing.
        console.warn("[useStreamTTS] Failed to attach audio analyser:", e);
      }
    }
  }, [visualization?.fftSize]);

  /** Releases the object URL, detaches all handlers and drops the audio element and source node. */
  const cleanupAudio = useCallback(() => {
    if (audioUrlRef.current) {
      URL.revokeObjectURL(audioUrlRef.current);
      audioUrlRef.current = null;
    }
    const audio = audioRef.current;
    if (audio) {
      audio.onerror = null;
      audio.onended = null;
      audio.onpause = null;
      audio.onplay = null;
      audio.ontimeupdate = null;
      audio.pause();
      audio.src = "";
      audioRef.current = null;
    }
    if (sourceRef.current) {
      try {
        sourceRef.current.disconnect();
      } catch (e) {
        // Best effort: the node is discarded right below either way.
      }
      sourceRef.current = null;
    }
  }, []);

  /** Pauses every other registered instance. */
  const stopOthers = useCallback(() => {
    activeInstances2.forEach((instance, id) => {
      if (id !== instanceId) {
        instance.pause();
      }
    });
  }, [instanceId]);

  const pause = useCallback(() => {
    if (audioRef.current) {
      audioRef.current.pause();
    }
    setIsPaused(true);
    setIsPlaying(false);
    onPlayPause?.();
  }, [onPlayPause]);

  const resume = useCallback(() => {
    stopOthers();
    if (audioRef.current) {
      // play() returns a promise that rejects when playback is blocked.
      Promise.resolve(audioRef.current.play()).catch((err) => {
        console.warn("[useStreamTTS] Resume failed:", err);
      });
    }
    setIsPaused(false);
    setIsPlaying(true);
    onPlayResume?.();
    activeInstances2.set(instanceId, { pause });
  }, [stopOthers, instanceId, pause, onPlayResume]);

  /** Settles a pending `finishStream()` promise, if there is one. */
  const settlePendingFinish = useCallback(() => {
    const resolvePending = resolveAllSegmentsSentRef.current;
    resolveAllSegmentsSentRef.current = null;
    resolvePending?.();
  }, []);

  /**
   * Sends the next queued segment and schedules itself for the one after it.
   * When the queue is empty and the stream has been finished, it finishes the
   * TTS session instead.
   */
  const sendNextSegment = useCallback(() => {
    const client = clientRef.current;
    if (!client || !isSessionStartedRef.current || isSendingRef.current || isSessionFinishedRef.current) {
      return;
    }
    if (segmentQueueRef.current.length === 0) {
      if (isStreamFinishedRef.current) {
        console.log("[useStreamTTS] All segments sent, finishing session");
        isSessionFinishedRef.current = true;
        client.finishSession();
        settlePendingFinish();
      }
      return;
    }
    isSendingRef.current = true;
    const segment = segmentQueueRef.current.shift();
    console.log(`[useStreamTTS] Sending segment ${segment.index}: ${segment.content.substring(0, 30)}...`);
    client.sendText(segment.content);
    segment.sent = true;
    isSendingRef.current = false;
    setTimeout(() => sendNextSegment(), 0);
  }, [settlePendingFinish]);

  /**
   * Requests a TTS session once the socket is open and text has been
   * received. Safe to call repeatedly; it only acts the first time.
   */
  const startSessionIfReady = useCallback(() => {
    const client = clientRef.current;
    if (!client || !isConnectedRef.current) return;
    if (calledSessionStartedRef.current || isSessionStartedRef.current) return;
    if (!streamTextRef.current) return;
    console.log("[useStreamTTS] First text received, starting session...");
    calledSessionStartedRef.current = true;
    client.startSession();
  }, []);

  /** Closes the client, releases audio resources and resets all state. */
  const stop = useCallback(() => {
    if (clientRef.current) {
      clientRef.current.close();
      clientRef.current = null;
    }
    cleanupAudio();
    isConnectedRef.current = false;
    setIsConnected(false);
    setIsSessionStarted(false);
    isSessionStartedRef.current = false;
    calledSessionStartedRef.current = false;
    setIsPlaying(false);
    setIsPaused(false);
    setIsSynthesizing(false);
    setProgress(0);
    activeInstances2.delete(instanceId);
    streamTextRef.current = "";
    setStreamText("");
    segmentQueueRef.current = [];
    isSendingRef.current = false;
    sessionAudioBuffersRef.current = [];
    isStreamFinishedRef.current = false;
    isSessionFinishedRef.current = false;
    splitterRef.current?.reset();
    // Nothing will drain the queue any more; do not leave finishStream() hanging.
    settlePendingFinish();
  }, [cleanupAudio, instanceId, settlePendingFinish]);

  /** Opens a fresh connection: audio element, TTS client and text splitter. */
  const connect = useCallback(async () => {
    // stop() already resets every ref and state value a new run depends on.
    stop();
    setErrorState(null);
    const voice = audioParams?.speaker || "zh_female_vv_uranus_bigtts";
    currentVoiceRef.current = voice;
    const startTime = Date.now();
    metricsCollector.record({
      name: "tts_request",
      labels: { voice, text_length: 0 },
      value: 1,
      timestamp: startTime
    });
    try {
      const audio = new Audio();
      audio.crossOrigin = "anonymous";
      audioRef.current = audio;
      audio.onplay = () => {
        setIsPlaying(true);
        setIsPaused(false);
        onPlayStart?.();
        initAudioContext();
        activeInstances2.set(instanceId, { pause });
      };
      audio.onended = () => {
        setIsPlaying(false);
        setIsPaused(false);
        onPlayEnd?.();
        activeInstances2.delete(instanceId);
      };
      audio.onerror = (e) => {
        console.error("[useStreamTTS] Audio playback error:", e, audio.error);
        setErrorState(audio.error?.message || "Audio playback error");
        onError?.(new Error(audio.error?.message || "Audio playback error"));
      };
      audio.ontimeupdate = () => {
        let duration = audio.duration;
        // While streaming the duration is not finite; use the buffered end instead.
        if (!isFinite(duration) && audio.buffered.length > 0) {
          duration = audio.buffered.end(audio.buffered.length - 1);
        }
        if (isFinite(duration) && duration > 0) {
          setProgress(audio.currentTime / duration * 100);
        }
      };
      clientRef.current = (0, import_tts3.WebsocketMSE)({ autoStartSession: false });
      splitterRef.current = new StreamingTextSplitter({
        maxLength: maxSegmentLength,
        onSegmentComplete: (segment) => {
          segmentQueueRef.current.push(segment);
          console.log(`[useStreamTTS] Segment ${segment.index} queued (${segment.length} chars)`);
          if (isSessionStartedRef.current) {
            sendNextSegment();
          }
        },
        onAllComplete: () => {
          console.log(`[useStreamTTS] All segments completed, total: ${segmentQueueRef.current.length} in queue`);
        }
      });
      const url = clientRef.current.start({
        url: buildFullUrl3(WS_URL2, {
          api_access_key: `Jwt; ${ttsConfig.token}`,
          api_app_key: ttsConfig.appid,
          api_resource_id: ttsConfig.resourceId || "seed-tts-2.0"
        }),
        config: {
          user: {
            uid: `req-${Date.now()}`
          },
          namespace: ttsConfig.namespace || "BidirectionalTTS",
          req_params: {
            speaker: voice,
            audio_params: {
              sample_rate: audioParams?.sample_rate || 24e3,
              format: audioParams?.format || "mp3",
              speech_rate: audioParams?.speech_rate,
              pitch_rate: audioParams?.pitch_rate,
              loudness_rate: audioParams?.loudness_rate
            },
            additions: JSON.stringify({
              enable_language_detector: true,
              disable_markdown_filter: true,
              enable_latex_tn: true
            })
          }
        },
        onStart: () => {
          isConnectedRef.current = true;
          setIsConnected(true);
          console.log("[useStreamTTS] WebSocket connected, waiting for text...");
          // Text may already have arrived while the socket was still opening.
          startSessionIfReady();
        },
        onSessionStarted: () => {
          setIsSessionStarted(true);
          isSessionStartedRef.current = true;
          console.log("[useStreamTTS] Session started, can send text now");
          // Always pump: with an empty queue and a finished stream this is
          // what finishes the session.
          sendNextSegment();
        },
        onMessage: (data) => {
          setIsSynthesizing(true);
          if (sessionAudioBuffersRef.current.length === 0) {
            metricsCollector.record({
              name: "tts_latency",
              labels: { stage: "first_packet", voice },
              value: Date.now() - startTime,
              timestamp: Date.now()
            });
          }
          const buffer = data instanceof ArrayBuffer ? data.slice(0) : new Uint8Array(data).buffer;
          sessionAudioBuffersRef.current.push(buffer);
        },
        onSessionFinished: () => {
          setIsSynthesizing(false);
          setIsSessionStarted(false);
          isSessionStartedRef.current = false;
          calledSessionStartedRef.current = false;
          if (sessionAudioBuffersRef.current.length > 0 && streamTextRef.current) {
            const speed = audioParams?.speech_rate || 0;
            const cacheKey = TTSCache.generateKey(streamTextRef.current, voice, speed);
            TTSCache.set(cacheKey, [...sessionAudioBuffersRef.current]);
            sessionAudioCache.set(instanceId, {
              streamText: streamTextRef.current,
              audioBuffers: [...sessionAudioBuffersRef.current],
              timestamp: Date.now(),
              voice,
              speed
            });
            console.log(`[useStreamTTS] Session finished, cached ${sessionAudioBuffersRef.current.length} audio buffers`);
          }
          metricsCollector.record({
            name: "tts_synthesis_finished",
            labels: { voice, text_length: streamTextRef.current.length },
            value: Date.now() - startTime,
            timestamp: Date.now()
          });
        },
        onError: (err) => {
          console.error("[useStreamTTS] TTS error:", err);
          setErrorState(err.msg || "TTS error");
          onError?.(new Error(err.msg || "TTS error"));
          setIsSynthesizing(false);
          // A failed stream will not drain the queue; release any waiter.
          settlePendingFinish();
        }
      });
      audioUrlRef.current = url;
      audio.src = url;
      if (autoPlay) {
        try {
          await audio.play();
        } catch (e) {
          console.warn("[useStreamTTS] Autoplay blocked:", e);
        }
      }
    } catch (err) {
      console.error("[useStreamTTS] Connect error:", err);
      setErrorState(String(err));
      onError?.(err instanceof Error ? err : new Error(String(err)));
    }
  }, [
    ttsConfig,
    audioParams,
    autoPlay,
    stop,
    instanceId,
    onPlayStart,
    onPlayEnd,
    initAudioContext,
    pause,
    metricsCollector,
    maxSegmentLength,
    sendNextSegment,
    startSessionIfReady,
    settlePendingFinish,
    onError
  ]);

  /** Accepts one chunk of streamed text. */
  const onMessage = useCallback((chunk) => {
    if (!chunk) return;
    streamTextRef.current += chunk;
    setStreamText(streamTextRef.current);
    startSessionIfReady();
    splitterRef.current?.onChunk(chunk);
  }, [startSessionIfReady]);

  /**
   * Signals that no more text will arrive. Resolves once every queued segment
   * has been handed to the client, or immediately when nothing can be sent.
   */
  const finishStream = useCallback(async () => {
    isStreamFinishedRef.current = true;
    splitterRef.current?.complete();
    console.log(`[useStreamTTS] Stream finished, ${segmentQueueRef.current.length} segments remaining in queue`);
    const client = clientRef.current;
    if (!client) {
      // Without a client nothing will ever drain the queue; do not wait.
      return;
    }
    if (segmentQueueRef.current.length > 0 || isSendingRef.current) {
      await new Promise((resolve) => {
        resolveAllSegmentsSentRef.current = resolve;
      });
      return;
    }
    if (isSessionStartedRef.current && !isSessionFinishedRef.current) {
      isSessionFinishedRef.current = true;
      client.finishSession();
    }
  }, []);

  /** Jumps to `percentage` (0-100) of the known or buffered duration. */
  const seek = useCallback((percentage) => {
    const audio = audioRef.current;
    if (!audio) return;
    let duration = audio.duration;
    if (!isFinite(duration) && audio.buffered.length > 0) {
      duration = audio.buffered.end(audio.buffered.length - 1);
    }
    if (!isFinite(duration) || duration <= 0) return;
    const time = percentage / 100 * duration;
    if (isFinite(time)) {
      audio.currentTime = time;
      setProgress(percentage);
    }
  }, []);

  const getFrequencyData = useCallback(() => {
    if (!analyserRef.current) return new Uint8Array(0);
    const dataArray = new Uint8Array(analyserRef.current.frequencyBinCount);
    analyserRef.current.getByteFrequencyData(dataArray);
    return dataArray;
  }, []);

  const getTimeDomainData = useCallback(() => {
    if (!analyserRef.current) return new Uint8Array(0);
    const dataArray = new Uint8Array(analyserRef.current.frequencyBinCount);
    analyserRef.current.getByteTimeDomainData(dataArray);
    return dataArray;
  }, []);

  // Refreshes `visualizationData` on animation frames while audio is playing.
  useEffect(() => {
    if (!visualization?.enabled) return;
    let animId;
    let lastUpdate = 0;
    const interval = visualization.refreshInterval || 0;
    const update = (timestamp) => {
      if (isPlaying && !isPaused) {
        if (timestamp - lastUpdate >= interval) {
          setVisualizationData({
            frequencyData: getFrequencyData(),
            timeDomainData: getTimeDomainData()
          });
          lastUpdate = timestamp;
        }
        animId = requestAnimationFrame(update);
      }
    };
    if (isPlaying && !isPaused) {
      animId = requestAnimationFrame(update);
    }
    return () => {
      if (animId) cancelAnimationFrame(animId);
    };
  }, [isPlaying, isPaused, visualization, getFrequencyData, getTimeDomainData]);

  // Tears everything down when the hook is disposed.
  useEffect(() => {
    return () => {
      stop();
      const context = audioContextRef.current;
      if (context) {
        if (context.state !== "closed") {
          Promise.resolve(context.close()).catch((err) => {
            console.warn("[useStreamTTS] Failed to close AudioContext:", err);
          });
        }
        // Drop the closed context so a later run creates a fresh one.
        audioContextRef.current = null;
        analyserRef.current = null;
      }
    };
  }, [stop]);

  return {
    isConnected,
    isSessionStarted,
    isSynthesizing,
    isPlaying,
    isPaused,
    error,
    streamText,
    progress,
    connect,
    onMessage,
    finishStream,
    pause,
    resume,
    stop,
    seek,
    getFrequencyData,
    getTimeDomainData,
    visualizationData
  };
}
1756
/**
 * Looks up the audio cached for one hook instance.
 * @param {string} instanceId - Key under which the session was stored.
 * @returns {object | undefined} The stored entry, or `undefined` when none exists.
 */
function getSessionAudioCache(instanceId) {
  const cachedEntry = sessionAudioCache.get(instanceId);
  return cachedEntry;
}
1759
/**
 * Removes the audio cached for one hook instance; unknown ids are ignored.
 * @param {string} instanceId - Key of the entry to remove.
 * @returns {void}
 */
function clearSessionAudioCache(instanceId) {
  // The boolean reported by Map#delete is intentionally not returned.
  void sessionAudioCache.delete(instanceId);
}
1762
/**
 * Finds the first cached session whose text, voice and speed all match.
 * @param {string} streamText - Full text of the session.
 * @param {string} voice - Voice the session was synthesised with.
 * @param {number} speed - Speech rate the session was synthesised with.
 * @returns {object | undefined} The matching entry, or `undefined` when there is none.
 */
function findSessionCacheByText(streamText, voice, speed) {
  const isMatch = (candidate) =>
    candidate.streamText === streamText &&
    candidate.voice === voice &&
    candidate.speed === speed;
  return [...sessionAudioCache.values()].find(isMatch);
}
1770
+
1771
+ // src/components/AudioWaveVisualizer.tsx
1772
+ var import_react5 = require("react");
1030
1773
  var import_jsx_runtime = require("react/jsx-runtime");
1031
1774
  var AudioWaveVisualizer = ({
1032
1775
  isPlaying,
@@ -1041,8 +1784,8 @@ var AudioWaveVisualizer = ({
1041
1784
  className,
1042
1785
  styleObj
1043
1786
  }) => {
1044
- const canvasRef = (0, import_react4.useRef)(null);
1045
- const requestRef = (0, import_react4.useRef)(null);
1787
+ const canvasRef = (0, import_react5.useRef)(null);
1788
+ const requestRef = (0, import_react5.useRef)(null);
1046
1789
  const progressBackground = Array.isArray(color) ? `linear-gradient(90deg, ${color[0]}, ${color[1]})` : color;
1047
1790
  const textColor = Array.isArray(color) ? color[0] : color;
1048
1791
  const draw = () => {
@@ -1107,7 +1850,7 @@ var AudioWaveVisualizer = ({
1107
1850
  requestRef.current = requestAnimationFrame(draw);
1108
1851
  }
1109
1852
  };
1110
- (0, import_react4.useEffect)(() => {
1853
+ (0, import_react5.useEffect)(() => {
1111
1854
  if (isPlaying && !isPaused) {
1112
1855
  requestRef.current = requestAnimationFrame(draw);
1113
1856
  } else {
@@ -1151,7 +1894,7 @@ var AudioWaveVisualizer = ({
1151
1894
  var AudioWaveVisualizer_default = AudioWaveVisualizer;
1152
1895
 
1153
1896
  // src/components/AudioProgressBar.tsx
1154
- var import_react5 = require("react");
1897
+ var import_react6 = require("react");
1155
1898
  var import_jsx_runtime2 = require("react/jsx-runtime");
1156
1899
  var AudioProgressBar = ({
1157
1900
  progress,
@@ -1164,16 +1907,16 @@ var AudioProgressBar = ({
1164
1907
  className,
1165
1908
  style
1166
1909
  }) => {
1167
- const progressBarRef = (0, import_react5.useRef)(null);
1168
- const containerRef = (0, import_react5.useRef)(null);
1169
- const progressTextRef = (0, import_react5.useRef)(null);
1170
- const thumbRef = (0, import_react5.useRef)(null);
1171
- const displayedProgress = (0, import_react5.useRef)(0);
1172
- const requestRef = (0, import_react5.useRef)(null);
1173
- const isDragging = (0, import_react5.useRef)(false);
1174
- const isHovering = (0, import_react5.useRef)(false);
1175
- const isTouch = (0, import_react5.useRef)(false);
1176
- (0, import_react5.useEffect)(() => {
1910
+ const progressBarRef = (0, import_react6.useRef)(null);
1911
+ const containerRef = (0, import_react6.useRef)(null);
1912
+ const progressTextRef = (0, import_react6.useRef)(null);
1913
+ const thumbRef = (0, import_react6.useRef)(null);
1914
+ const displayedProgress = (0, import_react6.useRef)(0);
1915
+ const requestRef = (0, import_react6.useRef)(null);
1916
+ const isDragging = (0, import_react6.useRef)(false);
1917
+ const isHovering = (0, import_react6.useRef)(false);
1918
+ const isTouch = (0, import_react6.useRef)(false);
1919
+ (0, import_react6.useEffect)(() => {
1177
1920
  const match = window.matchMedia("(pointer: coarse)");
1178
1921
  isTouch.current = match.matches;
1179
1922
  if (isTouch.current && thumbRef.current) {
@@ -1212,7 +1955,7 @@ var AudioProgressBar = ({
1212
1955
  requestRef.current = requestAnimationFrame(animate);
1213
1956
  }
1214
1957
  };
1215
- (0, import_react5.useEffect)(() => {
1958
+ (0, import_react6.useEffect)(() => {
1216
1959
  requestRef.current = requestAnimationFrame(animate);
1217
1960
  return () => {
1218
1961
  if (requestRef.current) {
@@ -1399,8 +2142,13 @@ var AudioProgressBar_default = AudioProgressBar;
1399
2142
  0 && (module.exports = {
1400
2143
  AudioProgressBar,
1401
2144
  AudioWaveVisualizer,
2145
+ StreamingTextSplitter,
2146
+ clearSessionAudioCache,
2147
+ findSessionCacheByText,
2148
+ getSessionAudioCache,
1402
2149
  splitTextByDelimiters,
1403
2150
  useMessageTTS,
2151
+ useStreamTTS,
1404
2152
  useVolcanoASR,
1405
2153
  useVolcanoTTS
1406
2154
  });