univoice 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +78 -16
- package/dist/{base--a8Bo2f0.d.ts → base-Bw2Puefv.d.ts} +22 -2
- package/dist/{base-Bae_riVx.d.ts → base-DhiS1mCx.d.ts} +17 -2
- package/dist/{chunk-RRXTYGBU.js → chunk-2MN6RZSS.js} +375 -4
- package/dist/chunk-2MN6RZSS.js.map +1 -0
- package/dist/{chunk-EHSTFTRI.js → chunk-FUVQN5PE.js} +3 -3
- package/dist/{chunk-EHSTFTRI.js.map → chunk-FUVQN5PE.js.map} +1 -1
- package/dist/{chunk-TY2HDS4F.js → chunk-NYM7PZUP.js} +3 -3
- package/dist/{chunk-TY2HDS4F.js.map → chunk-NYM7PZUP.js.map} +1 -1
- package/dist/chunk-R66GH6Y5.js +105 -0
- package/dist/chunk-R66GH6Y5.js.map +1 -0
- package/dist/{chunk-CM7VAOIV.js → chunk-U5KAFNFM.js} +287 -8
- package/dist/chunk-U5KAFNFM.js.map +1 -0
- package/dist/{save-DGQVjHM1.d.ts → save-lNS0YZU4.d.ts} +1 -1
- package/dist/src/asr/index.d.ts +3 -3
- package/dist/src/asr/index.js +3 -3
- package/dist/src/asr/providers/index.d.ts +2 -2
- package/dist/src/asr/providers/index.js +4 -3
- package/dist/src/asr/providers/index.js.map +1 -1
- package/dist/src/index.d.ts +4 -4
- package/dist/src/index.js +5 -5
- package/dist/src/tts/index.d.ts +3 -3
- package/dist/src/tts/index.js +3 -3
- package/dist/src/tts/providers/index.d.ts +2 -2
- package/dist/src/tts/providers/index.js +2 -2
- package/dist/{tee-BufkUu6s.d.ts → tee-CRWCx7JR.d.ts} +1 -1
- package/dist/{qwen-0GGX_nkP.d.ts → xfyun-ciu6L1-M.d.ts} +23 -2
- package/dist/{qwen-DzZEciEh.d.ts → xfyun-iV-ra5ZL.d.ts} +30 -2
- package/examples/asr/providers/xfyun/README.md +77 -0
- package/examples/asr/providers/xfyun/pcm-stream-in-stream-out.ts +98 -0
- package/examples/asr/providers/xfyun/stream-in-stream-out.ts +104 -0
- package/examples/tts/providers/doubao/seed-tts-1.0/README.md +7 -0
- package/examples/tts/providers/doubao/seed-tts-1.0/stream-in-stream-out-ogg-opus.ts +94 -0
- package/examples/tts/providers/doubao/seed-tts-2.0/README.md +9 -2
- package/examples/tts/providers/doubao/seed-tts-2.0/stream-in-stream-out-ogg-opus.ts +94 -0
- package/examples/tts/providers/xfyun/super-human/direct-instance.ts +86 -0
- package/examples/tts/providers/xfyun/super-human/non-stream-in-non-stream-out.ts +77 -0
- package/examples/tts/providers/xfyun/super-human/stream-in-stream-out.ts +85 -0
- package/examples/utils/common.ts +44 -0
- package/package.json +8 -8
- package/dist/chunk-7QVYU63E.js +0 -6
- package/dist/chunk-7QVYU63E.js.map +0 -1
- package/dist/chunk-CM7VAOIV.js.map +0 -1
- package/dist/chunk-RRXTYGBU.js.map +0 -1
package/README.md
CHANGED
|
@@ -269,7 +269,7 @@ const response = await tts.synthesize({ text: '你好' });
|
|
|
269
269
|
import { createTTS } from 'univoice';
|
|
270
270
|
|
|
271
271
|
const tts = createTTS({
|
|
272
|
-
provider: 'doubao' | 'openai' | 'minimax' | 'qwen' | 'gemini',
|
|
272
|
+
provider: 'doubao' | 'openai' | 'minimax' | 'qwen' | 'qwen-realtime' | 'gemini' | 'glm' | 'xfyun',
|
|
273
273
|
// 通用配置
|
|
274
274
|
apiKey?: string,
|
|
275
275
|
baseUrl?: string,
|
|
@@ -314,7 +314,7 @@ const tts = createTTS({
|
|
|
314
314
|
import { createASR } from 'univoice';
|
|
315
315
|
|
|
316
316
|
const asr = createASR({
|
|
317
|
-
provider: 'doubao' | 'openai' | 'minimax' | 'qwen' | 'gemini',
|
|
317
|
+
provider: 'doubao' | 'openai' | 'minimax' | 'qwen' | 'gemini' | 'glm' | 'xfyun',
|
|
318
318
|
apiKey?: string,
|
|
319
319
|
baseUrl?: string,
|
|
320
320
|
model?: string,
|
|
@@ -355,6 +355,7 @@ const asr = createASR({
|
|
|
355
355
|
| OpenAI | `openai` | 待实现 | 待实现 | 待实现 | 待实现 |
|
|
356
356
|
| MiniMax | `minimax` | - | - | - | - |
|
|
357
357
|
| Gemini | `gemini` | 待实现 | 待实现 | 待实现 | 待实现 |
|
|
358
|
+
| 科大讯飞 | `xfyun` | ✅ | ✅ | ✅ | ✅ |
|
|
358
359
|
|
|
359
360
|
#### TTS 能力矩阵
|
|
360
361
|
|
|
@@ -366,6 +367,7 @@ const asr = createASR({
|
|
|
366
367
|
| OpenAI | `openai` | 待实现 | 待实现 | 待实现 | 待实现 |
|
|
367
368
|
| MiniMax | `minimax` | ✅ | ✅ | ✅ | ✅ |
|
|
368
369
|
| Gemini | `gemini` | 待实现 | 待实现 | 待实现 | 待实现 |
|
|
370
|
+
| 科大讯飞 | `xfyun` | ✅ | ✅ | ✅ | ✅ |
|
|
369
371
|
|
|
370
372
|
#### 能力说明
|
|
371
373
|
|
|
@@ -474,6 +476,29 @@ const asr = createASR({
|
|
|
474
476
|
});
|
|
475
477
|
```
|
|
476
478
|
|
|
479
|
+
#### 科大讯飞
|
|
480
|
+
|
|
481
|
+
```typescript
|
|
482
|
+
const tts = createTTS({
|
|
483
|
+
provider: 'xfyun',
|
|
484
|
+
appId: process.env.XFYUN_APP_ID,
|
|
485
|
+
apiSecret: process.env.XFYUN_API_SECRET,
|
|
486
|
+
apiKey: process.env.XFYUN_API_KEY,
|
|
487
|
+
voice: 'x5_lingxiaoxuan_flow',
|
|
488
|
+
model: 'super-human-tts',
|
|
489
|
+
format: 'pcm',
|
|
490
|
+
sampleRate: 16000,
|
|
491
|
+
});
|
|
492
|
+
|
|
493
|
+
const asr = createASR({
|
|
494
|
+
provider: 'xfyun',
|
|
495
|
+
appId: process.env.XFYUN_APP_ID,
|
|
496
|
+
apiSecret: process.env.XFYUN_API_SECRET,
|
|
497
|
+
apiKey: process.env.XFYUN_API_KEY,
|
|
498
|
+
language: 'zh-CN',
|
|
499
|
+
});
|
|
500
|
+
```
|
|
501
|
+
|
|
477
502
|
---
|
|
478
503
|
|
|
479
504
|
<!-- PERFORMANCE_TABLE_START -->
|
|
@@ -493,7 +518,7 @@ const asr = createASR({
|
|
|
493
518
|
>
|
|
494
519
|
> 如需评估服务商的真实性能,建议直接使用服务商官方 SDK 进行测试。
|
|
495
520
|
|
|
496
|
-
> 生成时间: 2026/
|
|
521
|
+
> 生成时间: 2026/4/8 17:39:11
|
|
497
522
|
|
|
498
523
|
> 环境: Node.js v24.14.0, darwin arm64
|
|
499
524
|
|
|
@@ -588,7 +613,7 @@ const asr = createASR({
|
|
|
588
613
|
| 豆包 | seed-tts-1.0 | zh_male_lengkugege_emo_v2_mars_bigtts | ogg_opus | 16000 | 3 | 653 | 62 | 3254 | 3461 | 231 | 88.9 |
|
|
589
614
|
| 豆包 | seed-tts-1.0 | zh_male_lengkugege_emo_v2_mars_bigtts | ogg_opus | 24000 | 3 | 670 | 102 | 3429 | 7919 | 2209 | 59.3 |
|
|
590
615
|
| 豆包 | seed-tts-1.0 | zh_male_lengkugege_emo_v2_mars_bigtts | ogg_opus | 48000 | 3 | 698 | 65 | 3217 | 3603 | 200 | 85.8 |
|
|
591
|
-
| 豆包 | seed-tts-2.0 | zh_female_vv_uranus_bigtts | pcm | 8000 | 3 | 491 | 65 | 3810 | 3859 |
|
|
616
|
+
| 豆包 | seed-tts-2.0 | zh_female_vv_uranus_bigtts | pcm | 8000 | 3 | 491 | 65 | 3810 | 3859 | 29 | 74.6 |
|
|
592
617
|
| 豆包 | seed-tts-2.0 | zh_female_vv_uranus_bigtts | pcm | 16000 | 3 | 500 | 66 | 3752 | 3832 | 43 | 75.5 |
|
|
593
618
|
| 豆包 | seed-tts-2.0 | zh_female_vv_uranus_bigtts | pcm | 24000 | 3 | 510 | 65 | 3760 | 3783 | 50 | 76.3 |
|
|
594
619
|
| 豆包 | seed-tts-2.0 | zh_female_vv_uranus_bigtts | pcm | 48000 | 3 | 560 | 64 | 3770 | 3835 | 43 | 75.4 |
|
|
@@ -645,6 +670,9 @@ const asr = createASR({
|
|
|
645
670
|
| MiniMax | speech-01-turbo | male-qn-qingse | pcm | 32000 | 3 | 490 | 2 | 2101 | 2148 | 99 | 138.6 |
|
|
646
671
|
| MiniMax | speech-01-turbo | male-qn-qingse | pcm | 44100 | 3 | 448 | **2 🏆** | 1920 | 2100 | 99 | 145.2 |
|
|
647
672
|
| 智谱 GLM | glm-tts | tongtong | pcm | 24000 | 3 | 861 | *542* | 5037 | 5614 | 296 | 54.8 |
|
|
673
|
+
| 科大讯飞 | super-human-tts | x5_lingxiaoxuan_flow | pcm | 8000 | 3 | 543 | 17 | 2704 | 2897 | 119 | 104.1 |
|
|
674
|
+
| 科大讯飞 | super-human-tts | x5_lingxiaoxuan_flow | pcm | 16000 | 3 | 511 | 17 | 2629 | 2721 | 51 | 107.5 |
|
|
675
|
+
| 科大讯飞 | super-human-tts | x5_lingxiaoxuan_flow | pcm | 24000 | 3 | 592 | 16 | 2690 | 2709 | **28 🏆** | 106.3 |
|
|
648
676
|
|
|
649
677
|
## ASR 性能指标
|
|
650
678
|
|
|
@@ -675,13 +703,14 @@ const asr = createASR({
|
|
|
675
703
|
|
|
676
704
|
| 服务商 | 模型 | 语言 | 输入格式 | 采样率 (Hz) | 测试次数 | 首包延迟 (ms) | 平均间隔 (ms) | P50 (ms) | P95 (ms) | 标准差 (ms) | RTF |
|
|
677
705
|
|--------|------|------|----------|-------------|----------|---------------|---------------|----------|----------|-------------|-----|
|
|
678
|
-
| 通义千问 | paraformer-realtime-v2 | zh-CN | pcm | 16000 | 3 |
|
|
679
|
-
| 通义千问 | paraformer-realtime-v1 | zh-CN | pcm | 16000 | 3 | **439 🏆** | **29 🏆** | **498 🏆** | **509 🏆** | **10 🏆** |
|
|
680
|
-
| 豆包 | bigmodel | zh-CN | pcm | 16000 | 3 | 513 | 69 |
|
|
706
|
+
| 通义千问 | paraformer-realtime-v2 | zh-CN | pcm | 16000 | 3 | 978 | 82 | 685 | 2085 | *666* | *1.32* |
|
|
707
|
+
| 通义千问 | paraformer-realtime-v1 | zh-CN | pcm | 16000 | 3 | **439 🏆** | **29 🏆** | **498 🏆** | **509 🏆** | **10 🏆** | 0.57 |
|
|
708
|
+
| 豆包 | bigmodel | zh-CN | pcm | 16000 | 3 | 513 | 69 | 904 | 960 | 107 | 0.99 |
|
|
709
|
+
| 科大讯飞 | iat | zh-CN | pcm | 16000 | 3 | *1551* | *927* | *2835* | *2948* | 587 | **0.12 🏆** |
|
|
681
710
|
|
|
682
711
|
---
|
|
683
712
|
|
|
684
|
-
*数据更新于: 2026-
|
|
713
|
+
*数据更新于: 2026-04-08*
|
|
685
714
|
|
|
686
715
|
<!-- PERFORMANCE_TABLE_END -->
|
|
687
716
|
|
|
@@ -756,26 +785,58 @@ src/
|
|
|
756
785
|
├── tts/ # TTS 模块
|
|
757
786
|
│ ├── base.ts # BaseTTS 抽象类
|
|
758
787
|
│ ├── factory.ts # 工厂函数
|
|
788
|
+
│ ├── protocols/ # 协议实现
|
|
789
|
+
│ │ ├── volcengine.ts
|
|
790
|
+
│ │ ├── dashscope.ts
|
|
791
|
+
│ │ ├── dashscope-realtime.ts
|
|
792
|
+
│ │ ├── minimax.ts
|
|
793
|
+
│ │ └── xfyun.ts
|
|
759
794
|
│ ├── utils/ # 工具函数
|
|
760
|
-
│ │ ├── save.ts
|
|
761
|
-
│ │ ├──
|
|
762
|
-
│ │ ├──
|
|
763
|
-
│ │
|
|
795
|
+
│ │ ├── save.ts
|
|
796
|
+
│ │ ├── save-audio.ts
|
|
797
|
+
│ │ ├── collect.ts
|
|
798
|
+
│ │ ├── play.ts
|
|
799
|
+
│ │ └── tee.ts
|
|
764
800
|
│ └── providers/ # 提供商实现
|
|
765
801
|
│ ├── doubao.ts
|
|
766
802
|
│ ├── openai.ts
|
|
767
803
|
│ ├── minimax.ts
|
|
768
804
|
│ ├── qwen.ts
|
|
769
|
-
│
|
|
805
|
+
│ ├── qwen-realtime.ts
|
|
806
|
+
│ ├── gemini.ts
|
|
807
|
+
│ ├── glm.ts
|
|
808
|
+
│ └── xfyun.ts
|
|
770
809
|
├── asr/ # ASR 模块
|
|
771
810
|
│ ├── base.ts # BaseASR 抽象类
|
|
772
811
|
│ ├── factory.ts # 工厂函数
|
|
812
|
+
│ ├── protocols/ # 协议实现
|
|
813
|
+
│ │ ├── dashscope.ts
|
|
814
|
+
│ │ ├── sauc.ts
|
|
815
|
+
│ │ └── xfyun.ts
|
|
773
816
|
│ ├── utils/ # 工具函数
|
|
817
|
+
│ │ ├── audio.ts
|
|
818
|
+
│ │ ├── collect.ts
|
|
819
|
+
│ │ ├── save.ts
|
|
820
|
+
│ │ ├── ogg-muxer.ts
|
|
821
|
+
│ │ └── opus-decode.ts
|
|
774
822
|
│ └── providers/ # 提供商实现
|
|
823
|
+
│ ├── doubao.ts
|
|
824
|
+
│ ├── openai.ts
|
|
825
|
+
│ ├── minimax.ts
|
|
826
|
+
│ ├── qwen.ts
|
|
827
|
+
│ ├── gemini.ts
|
|
828
|
+
│ ├── glm.ts
|
|
829
|
+
│ └── xfyun.ts
|
|
775
830
|
└── types/ # 类型定义
|
|
776
|
-
├──
|
|
777
|
-
├──
|
|
778
|
-
|
|
831
|
+
├── index.ts
|
|
832
|
+
├── tts.ts
|
|
833
|
+
├── asr.ts
|
|
834
|
+
├── llm-stream.ts
|
|
835
|
+
└── voices/
|
|
836
|
+
├── doubao.ts
|
|
837
|
+
├── minimax.ts
|
|
838
|
+
├── qwen.ts
|
|
839
|
+
└── glm.ts
|
|
779
840
|
```
|
|
780
841
|
|
|
781
842
|
---
|
|
@@ -801,3 +862,4 @@ src/
|
|
|
801
862
|
- [阿里云通义千问](https://tongyi.aliyun.com/)
|
|
802
863
|
- [Google Gemini](https://ai.google.dev/)
|
|
803
864
|
- [智谱 AI](https://open.bigmodel.cn/)
|
|
865
|
+
- [科大讯飞](https://www.xfyun.cn/)
|
|
@@ -39,6 +39,24 @@ interface GlmASROptions extends BaseASROptions {
|
|
|
39
39
|
hotwords?: string[];
|
|
40
40
|
context?: string;
|
|
41
41
|
}
|
|
42
|
+
/**
 * Options for the ASR provider selected with `provider: 'xfyun'`.
 * Every field is optional and is added on top of BaseASROptions.
 *
 * NOTE(review): the short field names (domain, accent, eos, dwa, ltc, dhw, ptt,
 * rlang, vinfo, nunum, nbest, wbest) look like pass-through iFlytek dictation
 * parameters; their exact meaning and value ranges are not visible here —
 * confirm against the provider implementation and the iFlytek API docs.
 */
interface XfyunASROptions extends BaseASROptions {
|
|
43
|
+
/** Xfyun application ID. */
appId?: string;
|
|
44
|
+
/** Xfyun API secret. */
apiSecret?: string;
|
|
45
|
+
/** Sample rate of the input audio — presumably in Hz; TODO confirm. */
sampleRate?: number;
|
|
46
|
+
domain?: string;
|
|
47
|
+
accent?: string;
|
|
48
|
+
eos?: number;
|
|
49
|
+
dwa?: string;
|
|
50
|
+
ltc?: number;
|
|
51
|
+
dhw?: string;
|
|
52
|
+
ptt?: number;
|
|
53
|
+
rlang?: string;
|
|
54
|
+
vinfo?: number;
|
|
55
|
+
nunum?: number;
|
|
56
|
+
nbest?: number;
|
|
57
|
+
wbest?: number;
|
|
58
|
+
/** NOTE(review): presumably the pause between audio frames sent to the service — confirm unit and default. */
sendInterval?: number;
|
|
59
|
+
}
|
|
42
60
|
type ASROptions = ({
|
|
43
61
|
provider: 'doubao';
|
|
44
62
|
} & DoubaoASROptions) | ({
|
|
@@ -52,6 +70,8 @@ type ASROptions = ({
|
|
|
52
70
|
} & BaseASROptions) | ({
|
|
53
71
|
provider: 'gemini';
|
|
54
72
|
} & BaseASROptions) | ({
|
|
73
|
+
provider: 'xfyun';
|
|
74
|
+
} & XfyunASROptions) | ({
|
|
55
75
|
provider: string;
|
|
56
76
|
} & BaseASROptions);
|
|
57
77
|
interface ListenInstanceOptions {
|
|
@@ -85,7 +105,7 @@ interface ASRProvider {
|
|
|
85
105
|
name: string;
|
|
86
106
|
listenStream(audio: AudioStream): AsyncIterable<ASRStreamChunk>;
|
|
87
107
|
}
|
|
88
|
-
type ASRProviderType = 'doubao' | 'minimax' | 'qwen' | 'openai' | 'gemini' | string;
|
|
108
|
+
type ASRProviderType = 'doubao' | 'minimax' | 'qwen' | 'openai' | 'gemini' | 'xfyun' | string;
|
|
89
109
|
type AudioStream = AsyncIterable<Buffer | Uint8Array>;
|
|
90
110
|
type AudioStreamInput = AudioStream | Buffer | Uint8Array | string;
|
|
91
111
|
type ASRConnectionState = 'connected' | 'closed' | 'error';
|
|
@@ -131,4 +151,4 @@ declare abstract class BaseASR {
|
|
|
131
151
|
}): Promise<ASRResponse>;
|
|
132
152
|
}
|
|
133
153
|
|
|
134
|
-
export { type ASRConnectOptions as A, BaseASR as B, type DoubaoASROptions as D, type GlmASROptions as G, type ListenInstanceOptions as L, type QwenASROptions as Q, type ASRConnection as a, type ASRConnectionState as b, type ASROptions as c, type ASRProvider as d, type ASRProviderType as e, type ASRRequest as f, type ASRResponse as g, type ASRSegment as h, type ASRStreamChunk as i, type AudioCodecFormat as j, type AudioContainerFormat as k, type AudioFormat as l, type AudioStream as m, type AudioStreamInput as n, type BaseASROptions as o };
|
|
154
|
+
export { type ASRConnectOptions as A, BaseASR as B, type DoubaoASROptions as D, type GlmASROptions as G, type ListenInstanceOptions as L, type QwenASROptions as Q, type XfyunASROptions as X, type ASRConnection as a, type ASRConnectionState as b, type ASROptions as c, type ASRProvider as d, type ASRProviderType as e, type ASRRequest as f, type ASRResponse as g, type ASRSegment as h, type ASRStreamChunk as i, type AudioCodecFormat as j, type AudioContainerFormat as k, type AudioFormat as l, type AudioStream as m, type AudioStreamInput as n, type BaseASROptions as o };
|
|
@@ -96,6 +96,19 @@ interface QwenRealtimeTTSOptions extends BaseTTSOptions {
|
|
|
96
96
|
interface GlmTTSOptions extends BaseTTSOptions {
|
|
97
97
|
voice?: AcceptAnyString<GlmVoice>;
|
|
98
98
|
}
|
|
99
|
+
/**
 * Options for the TTS provider selected with `provider: 'xfyun'`.
 * Every field is optional and is added on top of BaseTTSOptions.
 */
interface XfyunTTSOptions extends BaseTTSOptions {
|
|
100
|
+
/** Xfyun AppID. */
appId?: string;
|
|
101
|
+
/** Xfyun APISecret (used for the HMAC-SHA256 signature). */
apiSecret?: string;
|
|
102
|
+
/** Audio sample rate. */
sampleRate?: number;
|
|
103
|
+
/** Colloquial-speech level (only supported by x4-series voices). */
oralLevel?: 'high' | 'mid' | 'low';
|
|
104
|
+
/** Whether to make the text colloquial through a large model (only supported by x4-series voices). */
sparkAssist?: number;
|
|
105
|
+
/** Whether to turn off server-side sentence splitting (only supported by x4-series voices). */
stopSplit?: number;
|
|
106
|
+
/** Whether to keep the original written-style wording (only supported by x4-series voices). */
remain?: number;
|
|
107
|
+
/** English pronunciation mode. */
reg?: number;
|
|
108
|
+
/** Number pronunciation mode. */
rdn?: number;
|
|
109
|
+
/** Whether to return pinyin annotations. */
rhy?: number;
|
|
110
|
+
/** Background sound. */
bgs?: number;
|
|
111
|
+
}
|
|
99
112
|
type TTSOptions = ({
|
|
100
113
|
provider: 'doubao';
|
|
101
114
|
} & DoubaoTTSOptions) | ({
|
|
@@ -111,6 +124,8 @@ type TTSOptions = ({
|
|
|
111
124
|
} & BaseTTSOptions) | ({
|
|
112
125
|
provider: 'glm';
|
|
113
126
|
} & GlmTTSOptions) | ({
|
|
127
|
+
provider: 'xfyun';
|
|
128
|
+
} & XfyunTTSOptions) | ({
|
|
114
129
|
provider: string;
|
|
115
130
|
} & BaseTTSOptions);
|
|
116
131
|
interface TTSRequest {
|
|
@@ -141,7 +156,7 @@ interface TTSVoice {
|
|
|
141
156
|
language: string;
|
|
142
157
|
gender?: 'male' | 'female' | 'neutral';
|
|
143
158
|
}
|
|
144
|
-
type TTSProviderType = 'doubao' | 'minimax' | 'qwen' | 'openai' | 'gemini' | string;
|
|
159
|
+
type TTSProviderType = 'doubao' | 'minimax' | 'qwen' | 'openai' | 'gemini' | 'xfyun' | string;
|
|
145
160
|
type TTSConnectionState = 'connected' | 'closed' | 'error';
|
|
146
161
|
interface TTSConnectOptions {
|
|
147
162
|
timeout?: number;
|
|
@@ -188,4 +203,4 @@ declare abstract class BaseTTS implements TTSProvider {
|
|
|
188
203
|
};
|
|
189
204
|
}
|
|
190
205
|
|
|
191
|
-
export { BaseTTS as B, type CosyVoiceV1Voice as C, type DoubaoTTSOptions as D, type GlmTTSOptions as G, type MinimaxTTSOptions as M, type OpenAIChatCompletionChunk as O, type QwenRealtimeOptions as Q, type SpeakInstanceOptions as S, type TTSConnectOptions as T, type BaseTTSOptions as a, type QwenRealtimeTTSOptions as b, type QwenTTSOptions as c, type TTSConnection as d, type TTSConnectionState as e, type TTSOptions as f, type TTSProvider as g, type TTSProviderType as h, type TTSRequest as i, type TTSResponse as j, type TTSStreamChunk as k, type TTSVoice as l, type TextStream as m, type CosyVoiceV2Voice as n, type CosyVoiceV3FlashVoice as o, type CosyVoiceV3PlusVoice as p, type CosyVoiceVoice as q, type DoubaoJupiterVoice as r, type DoubaoV1Voice as s, type DoubaoV2Voice as t, type DoubaoVoice as u, type MinimaxVoice as v, type OpenAIStream as w, type QwenRealtimeVoice as x, type QwenTTSModel as y };
|
|
206
|
+
export { BaseTTS as B, type CosyVoiceV1Voice as C, type DoubaoTTSOptions as D, type GlmTTSOptions as G, type MinimaxTTSOptions as M, type OpenAIChatCompletionChunk as O, type QwenRealtimeOptions as Q, type SpeakInstanceOptions as S, type TTSConnectOptions as T, type XfyunTTSOptions as X, type BaseTTSOptions as a, type QwenRealtimeTTSOptions as b, type QwenTTSOptions as c, type TTSConnection as d, type TTSConnectionState as e, type TTSOptions as f, type TTSProvider as g, type TTSProviderType as h, type TTSRequest as i, type TTSResponse as j, type TTSStreamChunk as k, type TTSVoice as l, type TextStream as m, type CosyVoiceV2Voice as n, type CosyVoiceV3FlashVoice as o, type CosyVoiceV3PlusVoice as p, type CosyVoiceVoice as q, type DoubaoJupiterVoice as r, type DoubaoV1Voice as s, type DoubaoV2Voice as t, type DoubaoVoice as u, type MinimaxVoice as v, type OpenAIStream as w, type QwenRealtimeVoice as x, type QwenTTSModel as y };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { __name } from './chunk-
|
|
1
|
+
import { __name, buildAuthUrl } from './chunk-R66GH6Y5.js';
|
|
2
2
|
import { Buffer } from 'buffer';
|
|
3
3
|
import { randomUUID } from 'crypto';
|
|
4
4
|
import WebSocket from 'ws';
|
|
@@ -3142,6 +3142,376 @@ var QwenRealtimeTTSConnection = class {
|
|
|
3142
3142
|
return this.provider.synthesizeOnConnection(this.ws, textChunks.join(""));
|
|
3143
3143
|
}
|
|
3144
3144
|
};
|
|
3145
|
+
/**
 * Builds the authenticated URL used to open the Xfyun TTS WebSocket.
 *
 * Delegates to the shared `buildAuthUrl` helper. The endpoint host and path
 * default to the values this provider has always used, and can be overridden
 * without touching the signing logic.
 *
 * @param {string} apiKey - Xfyun API key.
 * @param {string} apiSecret - Xfyun API secret.
 * @param {string} [host] - Endpoint host; defaults to the built-in TTS host.
 * @param {string} [path] - Endpoint path; defaults to the built-in TTS path.
 * @returns {*} Whatever `buildAuthUrl` returns; callers pass it to `new WebSocket(...)`.
 */
function buildTTSAuthUrl(
  apiKey,
  apiSecret,
  host = "cbm01.cn-huabei-1.xf-yun.com",
  path = "/v1/private/mcd9m97e6"
) {
  return buildAuthUrl(host, path, apiKey, apiSecret);
}
|
|
3148
|
+
__name(buildTTSAuthUrl, "buildTTSAuthUrl");
|
|
3149
|
+
/**
 * Translates a univoice audio format name into the encoding identifier that is
 * placed in the Xfyun request (`parameter.tts.audio.encoding`).
 *
 * @param {string} format - Requested output format ("mp3", "pcm" or "opus").
 * @returns {string} "lame", "raw" or "opus"; "lame" for any unrecognised value.
 */
function mapAudioEncoding(format) {
  const encodingMap = {
    mp3: "lame",
    pcm: "raw",
    opus: "opus"
  };
  // Own-property check: a plain lookup would let inherited keys such as
  // "constructor" or "toString" return a function instead of the fallback.
  return Object.hasOwn(encodingMap, format) ? encodingMap[format] : "lame";
}
|
|
3157
|
+
__name(mapAudioEncoding, "mapAudioEncoding");
|
|
3158
|
+
/**
 * Serialises one Xfyun TTS request frame to JSON.
 *
 * @param {object} options - Protocol options: `appId`, `vcn`, `speed`, `volume`,
 *   `pitch`, `encoding`, `sampleRate`, plus the optional `bgs`, `reg`, `rdn`,
 *   `rhy` (each defaults to 0) and `oralLevel`, `sparkAssist`, `stopSplit`, `remain`.
 * @param {string} text - Text carried by this frame; `null`/`undefined` is sent as empty text.
 * @param {number} status - Frame status, written to both `header.status` and
 *   `payload.text.status` (callers in this file use 0 = first, 1 = middle, 2 = last).
 * @param {number} seq - Sequence number of the frame.
 * @returns {string} The JSON string to send over the WebSocket.
 */
function createRequestPayload(options, text, status, seq) {
  // Collect the oral settings once; the section is only sent when at least one is set.
  const oral = {};
  if (options.oralLevel) {
    oral.oral_level = options.oralLevel;
  }
  if (options.sparkAssist != null) {
    oral.spark_assist = options.sparkAssist;
  }
  if (options.stopSplit != null) {
    oral.stop_split = options.stopSplit;
  }
  if (options.remain != null) {
    oral.remain = options.remain;
  }

  const payload = {
    header: {
      app_id: options.appId,
      status
    },
    parameter: {
      tts: {
        vcn: options.vcn,
        speed: options.speed,
        volume: options.volume,
        pitch: options.pitch,
        bgs: options.bgs ?? 0,
        reg: options.reg ?? 0,
        rdn: options.rdn ?? 0,
        rhy: options.rhy ?? 0,
        audio: {
          encoding: options.encoding,
          sample_rate: options.sampleRate,
          channels: 1,
          bit_depth: 16,
          frame_size: 0
        }
      }
    },
    payload: {
      text: {
        encoding: "utf8",
        compress: "raw",
        format: "plain",
        status,
        seq,
        // Buffer.from(undefined) throws, so a missing text becomes an empty frame.
        text: Buffer.from(text ?? "", "utf8").toString("base64")
      }
    }
  };

  if (Object.keys(oral).length > 0) {
    payload.parameter.oral = oral;
  }
  return JSON.stringify(payload);
}
|
|
3204
|
+
__name(createRequestPayload, "createRequestPayload");
|
|
3205
|
+
/**
 * Decodes a raw WebSocket message into the parsed JSON response object.
 *
 * Accepts every shape a message can arrive in: a string, a Buffer, an
 * ArrayBuffer, any typed-array/DataView over an ArrayBuffer, or an array of
 * Buffer fragments. Anything else is stringified before parsing.
 *
 * @param {string|Buffer|ArrayBuffer|ArrayBufferView|Buffer[]} data - Raw message.
 * @returns {*} The value produced by `JSON.parse`.
 * @throws {SyntaxError} When the decoded text is not valid JSON.
 */
function parseTTSResponse(data) {
  let text;
  if (Buffer.isBuffer(data)) {
    text = data.toString("utf8");
  } else if (data instanceof ArrayBuffer) {
    text = new TextDecoder().decode(data);
  } else if (ArrayBuffer.isView(data)) {
    // Plain Uint8Array / DataView: String(view) would yield "1,2,3…" (or
    // "[object DataView]"), so decode the exact byte window instead.
    text = Buffer.from(data.buffer, data.byteOffset, data.byteLength).toString("utf8");
  } else if (Array.isArray(data)) {
    text = Buffer.concat(data).toString("utf8");
  } else {
    text = String(data);
  }
  return JSON.parse(text);
}
|
|
3218
|
+
__name(parseTTSResponse, "parseTTSResponse");
|
|
3219
|
+
/**
 * Reads the audio field (`payload.audio.audio`) from a parsed response.
 * Callers in this file decode the returned string as base64.
 *
 * @param {object|null|undefined} response - Parsed response frame.
 * @returns {*} The audio value, or `null` when the response (or any level of
 *   the path) is missing.
 */
function extractAudioFromResponse(response) {
  // Leading `?.` keeps a null/undefined frame from throwing.
  return response?.payload?.audio?.audio ?? null;
}
|
|
3222
|
+
__name(extractAudioFromResponse, "extractAudioFromResponse");
|
|
3223
|
+
/**
 * Tells whether a parsed response frame reports success (`header.code === 0`).
 *
 * @param {object|null|undefined} response - Parsed response frame.
 * @returns {boolean} `true` only when `header.code` is exactly 0; a frame
 *   without a header counts as not successful instead of throwing.
 */
function isTTSSuccessResponse(response) {
  return response?.header?.code === 0;
}
|
|
3226
|
+
__name(isTTSSuccessResponse, "isTTSSuccessResponse");
|
|
3227
|
+
/**
 * Tells whether a parsed response frame is the last one (`header.status === 2`).
 *
 * @param {object|null|undefined} response - Parsed response frame.
 * @returns {boolean} `true` only when `header.status` is exactly 2; a frame
 *   without a header counts as not finished instead of throwing.
 */
function isTTSFinishedResponse(response) {
  return response?.header?.status === 2;
}
|
|
3230
|
+
__name(isTTSFinishedResponse, "isTTSFinishedResponse");
|
|
3231
|
+
|
|
3232
|
+
// src/tts/providers/xfyun.ts
|
|
3233
|
+
/**
 * Xfyun (iFlytek) text-to-speech provider.
 *
 * Both entry points open a WebSocket to the signed Xfyun TTS URL:
 * `synthesize` sends the whole text in a single frame and returns the collected
 * audio, while `speakStream` forwards a text stream frame by frame and yields
 * audio chunks as they arrive.
 */
var XfyunTTS = class extends BaseTTS {
  static {
    __name(this, "XfyunTTS");
  }
  name = "xfyun";
  /** Xfyun AppID. */
  appId;
  /** Xfyun APISecret (used for the HMAC-SHA256 signature). */
  apiSecret;
  /** Audio sample rate. */
  sampleRate;
  /** Colloquial-speech level (only supported by x4-series voices). */
  oralLevel;
  /** Whether to make the text colloquial through a large model (only supported by x4-series voices). */
  sparkAssist;
  /** Whether to turn off server-side sentence splitting (only supported by x4-series voices). */
  stopSplit;
  /** Whether to keep the original written-style wording (only supported by x4-series voices). */
  remain;
  /** English pronunciation mode. */
  reg;
  /** Number pronunciation mode. */
  rdn;
  /** Whether to return pinyin annotations. */
  rhy;
  /** Background sound. */
  bgs;

  /**
   * @param {object} options - Provider options. Besides what BaseTTS consumes,
   *   reads `appId`, `apiSecret`, `sampleRate` (default 24000), the optional
   *   tuning fields documented on the class, `voice` (default
   *   "x5_lingxiaoxuan_flow") and `format` (default "mp3").
   */
  constructor(options) {
    super(options);
    this.appId = options.appId || "";
    this.apiSecret = options.apiSecret || "";
    this.sampleRate = options.sampleRate ?? 24e3;
    this.oralLevel = options.oralLevel;
    this.sparkAssist = options.sparkAssist;
    this.stopSplit = options.stopSplit;
    this.remain = options.remain;
    this.reg = options.reg;
    this.rdn = options.rdn;
    this.rhy = options.rhy;
    this.bgs = options.bgs;
    this.voice = options.voice || "x5_lingxiaoxuan_flow";
    this.format = options.format || "mp3";
  }

  /**
   * Maps a BaseTTS speed/volume/pitch value (0-2 range, default 1.0) onto the
   * Xfyun 0-100 range (1.0 → 50).
   *
   * Out-of-range input is clamped to 0-100, and a non-numeric input falls back
   * to the neutral value 50, so the request never carries NaN (which JSON
   * would serialise as `null`).
   *
   * @param {number} value - BaseTTS-scale value.
   * @returns {number} Integer between 0 and 100.
   */
  mapParam(value) {
    const scaled = Math.round(value * 50);
    if (!Number.isFinite(scaled)) {
      return 50;
    }
    return Math.min(100, Math.max(0, scaled));
  }

  /**
   * Builds the option object consumed by `createRequestPayload`.
   *
   * @returns {object} Protocol options derived from the instance fields.
   */
  buildProtocolOptions() {
    return {
      appId: this.appId,
      vcn: this.voice,
      speed: this.mapParam(this.speed),
      volume: this.mapParam(this.volume),
      pitch: this.mapParam(this.pitch),
      encoding: mapAudioEncoding(this.format),
      sampleRate: this.sampleRate,
      oralLevel: this.oralLevel,
      sparkAssist: this.sparkAssist,
      stopSplit: this.stopSplit,
      remain: this.remain,
      reg: this.reg,
      rdn: this.rdn,
      rhy: this.rhy,
      bgs: this.bgs
    };
  }

  /**
   * Concatenates several Uint8Arrays into one.
   * No longer called by `synthesize` (which uses `Buffer.concat`), but kept so
   * any existing caller continues to work.
   *
   * @param {Uint8Array[]} arrays - Pieces to join, in order.
   * @returns {Uint8Array} A new array holding all bytes.
   */
  concatArrays(arrays) {
    const totalLength = arrays.reduce((sum, arr) => sum + arr.length, 0);
    const result = new Uint8Array(totalLength);
    let offset = 0;
    for (const arr of arrays) {
      result.set(arr, offset);
      offset += arr.length;
    }
    return result;
  }

  /**
   * Verifies that all three credentials are present.
   * Shared by `synthesize` and `speakStream` so both fail with identical messages.
   *
   * @throws {Error} When appId, apiKey or apiSecret is empty (checked in that order).
   */
  #assertCredentials() {
    if (!this.appId) {
      throw new Error("appId is required for Xfyun TTS");
    }
    if (!this.apiKey) {
      throw new Error("apiKey is required for Xfyun TTS");
    }
    if (!this.apiSecret) {
      throw new Error("apiSecret is required for Xfyun TTS");
    }
  }

  /**
   * Synthesises speech in one shot (non-streaming).
   * Opens a WebSocket, sends the whole text in a single frame with status=2,
   * collects every audio chunk and returns them joined.
   *
   * @param {{ text: string }} request - Synthesis request; only `text` is read here.
   * @returns {Promise<{ audio: Buffer, format: string }>} The audio bytes and the configured format.
   * @throws {Error} On missing credentials, a connection error, a non-success
   *   response frame, or when no audio was received.
   */
  async synthesize(request) {
    this.#assertCredentials();
    const protocolOptions = this.buildProtocolOptions();
    const url = buildTTSAuthUrl(this.apiKey, this.apiSecret);
    const ws = new WebSocket(url);
    await new Promise((resolve, reject) => {
      ws.on("open", resolve);
      ws.on("error", reject);
    });
    try {
      const payload = createRequestPayload(protocolOptions, request.text, 2, 0);
      ws.send(payload);
      const audioChunks = [];
      await new Promise((resolve, reject) => {
        ws.on("message", (data) => {
          try {
            const response = parseTTSResponse(data);
            if (!isTTSSuccessResponse(response)) {
              reject(
                new Error(`Xfyun TTS error: ${response.header.code} - ${response.header.message}`)
              );
              return;
            }
            const audioBase64 = extractAudioFromResponse(response);
            if (audioBase64) {
              audioChunks.push(Buffer.from(audioBase64, "base64"));
            }
            if (isTTSFinishedResponse(response)) {
              resolve();
            }
          } catch (err) {
            reject(err instanceof Error ? err : new Error(String(err)));
          }
        });
        ws.on("error", reject);
        // NOTE(review): a close before the final frame resolves with whatever
        // audio was collected so far; only an empty result is reported below.
        ws.on("close", () => resolve());
      });
      // Single copy into one Buffer.
      const audio = Buffer.concat(audioChunks);
      if (audio.length === 0) {
        throw new Error("No audio received from Xfyun TTS service");
      }
      return {
        audio,
        format: this.format
      };
    } finally {
      ws.close();
    }
  }

  /**
   * Streaming speech synthesis (internal implementation method).
   * Bidirectional: text is streamed in and audio is streamed out.
   *
   * @param input Text input, either a string or a text stream.
   * @returns Stream of `{ audioChunk }` objects.
   * @internal
   */
  async *speakStream(input) {
    this.#assertCredentials();
    const textStream = normalizeTextStream(input);
    const protocolOptions = this.buildProtocolOptions();
    const url = buildTTSAuthUrl(this.apiKey, this.apiSecret);

    // Hand-off between the socket callbacks (producer) and this generator (consumer).
    const queue = [];
    const syncState = { resolveWait: null, finished: false };
    const wakeConsumer = () => {
      syncState.resolveWait?.();
      syncState.resolveWait = null;
    };
    const enqueue = /* @__PURE__ */ __name((item) => {
      queue.push(item);
      wakeConsumer();
    }, "enqueue");

    const ws = new WebSocket(url);
    await new Promise((resolve, reject) => {
      ws.on("open", resolve);
      ws.on("error", reject);
    });

    // Sends text and receives audio concurrently; failures become queue items,
    // so this promise itself does not reject.
    const processPromise = (async () => {
      try {
        await Promise.all([
          sendTextStream(ws, protocolOptions, textStream),
          receiveAudioToQueue(ws, enqueue)
        ]);
      } catch (error) {
        enqueue({
          type: "error",
          error: error instanceof Error ? error : new Error(String(error))
        });
      } finally {
        syncState.finished = true;
        wakeConsumer();
        ws.close();
      }
    })();

    try {
      while (!syncState.finished || queue.length > 0) {
        while (queue.length === 0 && !syncState.finished) {
          await new Promise((resolve) => {
            syncState.resolveWait = resolve;
          });
        }
        if (queue.length === 0) break;
        const item = queue.shift();
        if (!item) break;
        switch (item.type) {
          case "audio":
            yield { audioChunk: item.chunk };
            break;
          case "error":
            throw item.error;
          case "end":
            return;
        }
      }
    } finally {
      // Release the connection right away when the consumer stops early or an
      // error is raised, instead of keeping it open until the producer finishes.
      // close() is a no-op once the socket is already closing or closed.
      ws.close();
      // Errors were already delivered through the queue; nothing left to report.
      await processPromise.catch(() => {
      });
    }
  }
};
|
|
3456
|
+
/**
 * Forwards a text stream to the Xfyun TTS socket, one frame per non-empty chunk.
 *
 * The first frame is sent with status 0, every following one with status 1, and
 * a closing empty frame with status 2 once the stream is exhausted. Sequence
 * numbers start at 0 and increase by one per frame sent.
 *
 * If the socket is no longer open, sending stops immediately: leaving the loop
 * also ends the upstream text iterator, so a producer (for example a model
 * response stream) is not drained for nothing.
 *
 * @param {WebSocket} ws - Open socket to send on.
 * @param {object} protocolOptions - Options passed through to `createRequestPayload`.
 * @param {AsyncIterable<string>} textStream - Source of text chunks.
 * @returns {Promise<void>} Resolves after the closing frame was sent or the socket was found closed.
 */
async function sendTextStream(ws, protocolOptions, textStream) {
  let seq = 0;
  for await (const chunk of textStream) {
    if (ws.readyState !== WebSocket.OPEN) return;
    if (!chunk) continue;
    // seq is 0 exactly for the first frame, so no separate "first" flag is needed.
    const status = seq === 0 ? 0 : 1;
    ws.send(createRequestPayload(protocolOptions, chunk, status, seq));
    seq++;
  }
  if (ws.readyState !== WebSocket.OPEN) return;
  ws.send(createRequestPayload(protocolOptions, "", 2, seq));
}
|
|
3474
|
+
__name(sendTextStream, "sendTextStream");
|
|
3475
|
+
/**
 * Listens on the socket and turns incoming response frames into queue items.
 *
 * Items passed to `enqueue`:
 *   - `{ type: "audio", chunk: Buffer }` for every frame carrying audio,
 *   - `{ type: "end" }` after the frame that reports completion,
 *   - `{ type: "error", error: Error }` for a non-success frame or a frame that
 *     cannot be parsed.
 *
 * The returned promise resolves after an end item, an error item or a socket
 * "close", and rejects on a socket "error". Once it has settled, later frames
 * are ignored so nothing is queued after the stream has terminated.
 *
 * @param {WebSocket} ws - Socket to listen on.
 * @param {(item: object) => void} enqueue - Receives the items described above.
 * @returns {Promise<void>}
 */
async function receiveAudioToQueue(ws, enqueue) {
  return new Promise((resolve, reject) => {
    let settled = false;
    const finish = () => {
      settled = true;
      resolve();
    };

    ws.on("message", (data) => {
      if (settled) return;
      try {
        const response = parseTTSResponse(data);
        if (!isTTSSuccessResponse(response)) {
          enqueue({
            type: "error",
            error: new Error(
              `Xfyun TTS error: ${response.header.code} - ${response.header.message}`
            )
          });
          finish();
          return;
        }
        const audioBase64 = extractAudioFromResponse(response);
        if (audioBase64) {
          enqueue({ type: "audio", chunk: Buffer.from(audioBase64, "base64") });
        }
        if (isTTSFinishedResponse(response)) {
          enqueue({ type: "end" });
          finish();
        }
      } catch (err) {
        enqueue({
          type: "error",
          error: err instanceof Error ? err : new Error(String(err))
        });
        finish();
      }
    });

    ws.on("error", (err) => {
      settled = true;
      reject(err);
    });

    ws.on("close", () => {
      finish();
    });
  });
}
|
|
3514
|
+
__name(receiveAudioToQueue, "receiveAudioToQueue");
|
|
3145
3515
|
|
|
3146
3516
|
// src/tts/providers/index.ts
|
|
3147
3517
|
registerTTSProvider("doubao", DoubaoTTS);
|
|
@@ -3151,7 +3521,8 @@ registerTTSProvider("qwen", QwenTTS);
|
|
|
3151
3521
|
registerTTSProvider("qwen-realtime", QwenRealtimeTTS);
|
|
3152
3522
|
registerTTSProvider("openai", TTS1);
|
|
3153
3523
|
registerTTSProvider("gemini", GeminiTTS);
|
|
3524
|
+
registerTTSProvider("xfyun", XfyunTTS);
|
|
3154
3525
|
|
|
3155
|
-
export { BaseTTS, DoubaoTTS, GeminiTTS, GlmTTS, MinimaxTTS, QwenRealtimeTTS, QwenTTS, TTS1, createTTS, getTTSProviders, registerTTSProvider };
|
|
3156
|
-
//# sourceMappingURL=chunk-
|
|
3157
|
-
//# sourceMappingURL=chunk-
|
|
3526
|
+
export { BaseTTS, DoubaoTTS, GeminiTTS, GlmTTS, MinimaxTTS, QwenRealtimeTTS, QwenTTS, TTS1, XfyunTTS, createTTS, getTTSProviders, registerTTSProvider };
|
|
3527
|
+
//# sourceMappingURL=chunk-2MN6RZSS.js.map
|
|
3528
|
+
//# sourceMappingURL=chunk-2MN6RZSS.js.map
|