univoice 0.1.0-beta.3 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +474 -1
- package/dist/{chunk-HONGPTUH.js → chunk-IL5MVEN6.js} +261 -25
- package/dist/chunk-IL5MVEN6.js.map +1 -0
- package/dist/index-DaR9FCnn.d.ts +109 -0
- package/dist/src/index.d.ts +1 -1
- package/dist/src/index.js +3 -1
- package/dist/src/tts/index.d.ts +1 -91
- package/dist/src/tts/index.js +3 -1
- package/package.json +8 -7
- package/dist/chunk-HONGPTUH.js.map +0 -1
package/README.md
CHANGED
|
@@ -5,5 +5,478 @@
|
|
|
5
5
|
<br />
|
|
6
6
|
|
|
7
7
|
<div align="center">
|
|
8
|
-
|
|
8
|
+
|
|
9
|
+
[![npm version](https://img.shields.io/npm/v/univoice.svg)](https://www.npmjs.com/package/univoice)
|
|
10
|
+
[![npm downloads](https://img.shields.io/npm/dm/univoice.svg)](https://www.npmjs.com/package/univoice)
|
|
11
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
12
|
+
[![codecov](https://codecov.io/gh/shenjingnan/univoice/graph/badge.svg)](https://codecov.io/gh/shenjingnan/univoice)
|
|
13
|
+
|
|
14
|
+
**统一的 TTS(文字转语音)和 ASR(语音识别)SDK**
|
|
15
|
+
|
|
16
|
+
[快速开始](#快速开始) · [API 文档](#api-文档) · [支持的提供商](#支持的提供商)
|
|
17
|
+
|
|
9
18
|
</div>
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## 简介
|
|
23
|
+
|
|
24
|
+
**univoice** 是一个统一的语音处理 SDK,提供统一的 API 来调用多种 TTS(文字转语音)和 ASR(语音识别)服务提供商。
|
|
25
|
+
|
|
26
|
+
### 核心特性
|
|
27
|
+
|
|
28
|
+
- 🎯 **统一 API** - 一套 API 调用多种语音服务提供商
|
|
29
|
+
- 🔄 **流式支持** - TTS 支持流式输入和输出,适合 LLM 流式输出场景
|
|
30
|
+
- 🚀 **边发边收** - LLM 流式输出可直接转换为语音,显著降低首字延迟
|
|
31
|
+
- 🔌 **插件化架构** - 轻松扩展支持新的语音服务提供商
|
|
32
|
+
- 📦 **TypeScript 优先** - 完整的类型定义支持
|
|
33
|
+
|
|
34
|
+
### 适用场景
|
|
35
|
+
|
|
36
|
+
- AI 助手语音交互
|
|
37
|
+
- 有声书/播客生成
|
|
38
|
+
- 客服语音系统
|
|
39
|
+
- 实时语音翻译
|
|
40
|
+
- 语音消息应用
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## 安装
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
# 使用 pnpm
|
|
48
|
+
pnpm add univoice
|
|
49
|
+
|
|
50
|
+
# 使用 npm
|
|
51
|
+
npm install univoice
|
|
52
|
+
|
|
53
|
+
# 使用 yarn
|
|
54
|
+
yarn add univoice
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### 环境要求
|
|
58
|
+
|
|
59
|
+
- Node.js >= 20.0.0
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## 快速开始
|
|
64
|
+
|
|
65
|
+
### TTS(文字转语音)
|
|
66
|
+
|
|
67
|
+
#### 非流式合成
|
|
68
|
+
|
|
69
|
+
最简单的使用方式,适合已知完整文本的场景:
|
|
70
|
+
|
|
71
|
+
```typescript
|
|
72
|
+
import { createTTS } from 'univoice';
|
|
73
|
+
|
|
74
|
+
const tts = createTTS({
|
|
75
|
+
provider: 'doubao',
|
|
76
|
+
appId: 'your-app-id',
|
|
77
|
+
accessToken: 'your-access-token',
|
|
78
|
+
voice: 'zh_female_tianmeixiaoyuan_moon_bigtts',
|
|
79
|
+
format: 'mp3',
|
|
80
|
+
});
|
|
81
|
+
|
|
82
|
+
const response = await tts.synthesize({
|
|
83
|
+
text: '欢迎来到杭州!',
|
|
84
|
+
});
|
|
85
|
+
|
|
86
|
+
console.log(`音频格式: ${response.format}`);
|
|
87
|
+
console.log(`音频大小: ${response.audio.length} bytes`);
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
#### 流式合成
|
|
91
|
+
|
|
92
|
+
以流式方式逐块返回音频,支持字符串和流式文本两种输入模式:
|
|
93
|
+
|
|
94
|
+
```typescript
|
|
95
|
+
import { createTTS } from 'univoice';
|
|
96
|
+
|
|
97
|
+
const tts = createTTS({
|
|
98
|
+
provider: 'doubao',
|
|
99
|
+
appId: 'your-app-id',
|
|
100
|
+
accessToken: 'your-access-token',
|
|
101
|
+
voice: 'zh_female_tianmeixiaoyuan_moon_bigtts',
|
|
102
|
+
format: 'pcm',
|
|
103
|
+
sampleRate: 24000,
|
|
104
|
+
});
|
|
105
|
+
|
|
106
|
+
// 方式一:字符串输入
|
|
107
|
+
const text = '欢迎来到龙井村。这里是西湖龙井茶的原产地。';
|
|
108
|
+
for await (const { audioChunk } of tts.speak(text)) {
|
|
109
|
+
console.log('收到音频块:', audioChunk.length);
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
// 方式二:流式文本输入(如 Generator)
|
|
113
|
+
async function* textGenerator() {
|
|
114
|
+
yield '你好,';
|
|
115
|
+
yield '世界!';
|
|
116
|
+
}
|
|
117
|
+
for await (const { audioChunk } of tts.speak(textGenerator())) {
|
|
118
|
+
console.log('收到音频块:', audioChunk.length);
|
|
119
|
+
}
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
#### LLM 流式输出转语音(核心特性)
|
|
123
|
+
|
|
124
|
+
将 LLM 的流式输出直接转换为语音,实现边发边收,显著降低首字延迟:
|
|
125
|
+
|
|
126
|
+
```typescript
|
|
127
|
+
import OpenAI from 'openai';
|
|
128
|
+
import { createTTS } from 'univoice';
|
|
129
|
+
|
|
130
|
+
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
|
|
131
|
+
const tts = createTTS({
|
|
132
|
+
provider: 'doubao',
|
|
133
|
+
appId: 'your-app-id',
|
|
134
|
+
accessToken: 'your-access-token',
|
|
135
|
+
voice: 'zh_female_tianmeixiaoyuan_moon_bigtts',
|
|
136
|
+
format: 'pcm',
|
|
137
|
+
sampleRate: 24000,
|
|
138
|
+
});
|
|
139
|
+
|
|
140
|
+
// 创建 OpenAI 流式请求
|
|
141
|
+
const openaiStream = await openai.chat.completions.stream({
|
|
142
|
+
model: 'gpt-4o-mini',
|
|
143
|
+
messages: [{ role: 'user', content: '请介绍 TypeScript' }],
|
|
144
|
+
stream: true,
|
|
145
|
+
});
|
|
146
|
+
|
|
147
|
+
// 直接将 OpenAI stream 传入 TTS speak
|
|
148
|
+
const chunks: Uint8Array[] = [];
|
|
149
|
+
for await (const { audioChunk } of tts.speak(openaiStream)) {
|
|
150
|
+
chunks.push(audioChunk);
|
|
151
|
+
console.log('收到音频块');
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
// 保存音频
|
|
155
|
+
import { writeFileSync } from 'node:fs';
|
|
156
|
+
const buffer = Buffer.concat(chunks.map(c => Buffer.from(c)));
|
|
157
|
+
writeFileSync('output.pcm', buffer);
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
#### 保存音频
|
|
161
|
+
|
|
162
|
+
使用工具函数快速保存音频:
|
|
163
|
+
|
|
164
|
+
```typescript
|
|
165
|
+
import { createTTS, saveAudio } from 'univoice';
|
|
166
|
+
|
|
167
|
+
const tts = createTTS({ /* config */ });
|
|
168
|
+
|
|
169
|
+
// 直接保存流式输出
|
|
170
|
+
await saveAudio('output.pcm', tts.speak('你好,世界!'));
|
|
171
|
+
|
|
172
|
+
// 保存非流式输出
|
|
173
|
+
import { saveTTSResponse } from 'univoice';
|
|
174
|
+
const response = await tts.synthesize({ text: '你好' });
|
|
175
|
+
const filepath = await saveTTSResponse(response);
|
|
176
|
+
console.log(`已保存到: ${filepath}`);
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### ASR(语音识别)
|
|
180
|
+
|
|
181
|
+
```typescript
|
|
182
|
+
import { createASR } from 'univoice';
|
|
183
|
+
import { readFileSync } from 'node:fs';
|
|
184
|
+
|
|
185
|
+
const asr = createASR({
|
|
186
|
+
provider: 'openai',
|
|
187
|
+
apiKey: 'your-api-key',
|
|
188
|
+
model: 'whisper-1',
|
|
189
|
+
});
|
|
190
|
+
|
|
191
|
+
const audioBuffer = readFileSync('audio.mp3');
|
|
192
|
+
const result = await asr.recognize({
|
|
193
|
+
audio: audioBuffer,
|
|
194
|
+
});
|
|
195
|
+
|
|
196
|
+
console.log(`识别结果: ${result.text}`);
|
|
197
|
+
console.log(`语言: ${result.language}`);
|
|
198
|
+
console.log(`时长: ${result.duration}s`);
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
---
|
|
202
|
+
|
|
203
|
+
## API 文档
|
|
204
|
+
|
|
205
|
+
### TTS API
|
|
206
|
+
|
|
207
|
+
#### 创建实例
|
|
208
|
+
|
|
209
|
+
```typescript
|
|
210
|
+
import { createTTS } from 'univoice';
|
|
211
|
+
|
|
212
|
+
const tts = createTTS({
|
|
213
|
+
provider: 'doubao' | 'openai' | 'minimax' | 'qwen' | 'gemini',
|
|
214
|
+
// 通用配置
|
|
215
|
+
apiKey?: string,
|
|
216
|
+
baseUrl?: string,
|
|
217
|
+
model?: string,
|
|
218
|
+
voice?: string,
|
|
219
|
+
format?: 'mp3' | 'wav' | 'ogg' | 'flac' | 'pcm',
|
|
220
|
+
speed?: number,
|
|
221
|
+
volume?: number,
|
|
222
|
+
pitch?: number,
|
|
223
|
+
language?: string,
|
|
224
|
+
// doubao 专用
|
|
225
|
+
appId?: string,
|
|
226
|
+
accessToken?: string,
|
|
227
|
+
resourceId?: string,
|
|
228
|
+
sampleRate?: number,
|
|
229
|
+
});
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
#### 方法
|
|
233
|
+
|
|
234
|
+
| 方法 | 说明 | 返回类型 |
|
|
235
|
+
|------|------|----------|
|
|
236
|
+
| `tts.synthesize(request)` | 非流式合成 | `Promise<TTSResponse>` |
|
|
237
|
+
| `tts.speak(input)` | 流式合成 | `AsyncIterable<TTSStreamChunk>` |
|
|
238
|
+
| `tts.listVoices?.()` | 列出可用声音(可选方法) | `Promise<TTSVoice[]>` |
|
|
239
|
+
|
|
240
|
+
#### 工具函数
|
|
241
|
+
|
|
242
|
+
| 函数 | 说明 |
|
|
243
|
+
|------|------|
|
|
244
|
+
| `saveTTSResponse(response, options)` | 保存 TTS 响应到文件 |
|
|
245
|
+
| `saveAudio(filename, stream)` | 保存流式音频到文件 |
|
|
246
|
+
| `collectAudio(response, options)` | 收集音频数据 |
|
|
247
|
+
| `playAudio(response, options)` | 播放音频 |
|
|
248
|
+
| `teeAudio(response, options)` | 同时保存和播放 |
|
|
249
|
+
|
|
250
|
+
### ASR API
|
|
251
|
+
|
|
252
|
+
#### 创建实例
|
|
253
|
+
|
|
254
|
+
```typescript
|
|
255
|
+
import { createASR } from 'univoice';
|
|
256
|
+
|
|
257
|
+
const asr = createASR({
|
|
258
|
+
provider: 'doubao' | 'openai' | 'minimax' | 'qwen' | 'gemini',
|
|
259
|
+
apiKey?: string,
|
|
260
|
+
baseUrl?: string,
|
|
261
|
+
model?: string,
|
|
262
|
+
language?: string,
|
|
263
|
+
prompt?: string,
|
|
264
|
+
responseFormat?: 'json' | 'text' | 'srt' | 'vtt' | 'verbose_json',
|
|
265
|
+
});
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
#### 方法
|
|
269
|
+
|
|
270
|
+
| 方法 | 说明 | 返回类型 |
|
|
271
|
+
|------|------|----------|
|
|
272
|
+
| `asr.recognize(request)` | 语音识别 | `Promise<ASRResponse>` |
|
|
273
|
+
|
|
274
|
+
#### 工具函数
|
|
275
|
+
|
|
276
|
+
| 函数 | 说明 |
|
|
277
|
+
|------|------|
|
|
278
|
+
| `saveText(text, options)` | 保存识别文本到文件 |
|
|
279
|
+
| `collectText(response, options)` | 收集识别结果 |
|
|
280
|
+
|
|
281
|
+
---
|
|
282
|
+
|
|
283
|
+
## 支持的提供商
|
|
284
|
+
|
|
285
|
+
### TTS 提供商
|
|
286
|
+
|
|
287
|
+
| 提供商 | 标识符 | 流式支持 |
|
|
288
|
+
|--------|--------|----------|
|
|
289
|
+
| 豆包(火山引擎) | `doubao` | ✅ |
|
|
290
|
+
| OpenAI | `openai` | ❌ |
|
|
291
|
+
| MiniMax | `minimax` | ✅ |
|
|
292
|
+
| 通义千问 | `qwen` | ✅ |
|
|
293
|
+
| Gemini | `gemini` | ❌ |
|
|
294
|
+
|
|
295
|
+
### ASR 提供商
|
|
296
|
+
|
|
297
|
+
| 提供商 | 标识符 |
|
|
298
|
+
|--------|--------|
|
|
299
|
+
| 豆包(火山引擎) | `doubao` |
|
|
300
|
+
| OpenAI | `openai` |
|
|
301
|
+
| MiniMax | `minimax` |
|
|
302
|
+
| 通义千问 | `qwen` |
|
|
303
|
+
| Gemini | `gemini` |
|
|
304
|
+
|
|
305
|
+
### 配置示例
|
|
306
|
+
|
|
307
|
+
#### 豆包(火山引擎)
|
|
308
|
+
|
|
309
|
+
```typescript
|
|
310
|
+
const tts = createTTS({
|
|
311
|
+
provider: 'doubao',
|
|
312
|
+
appId: process.env.DOUBAO_APP_ID,
|
|
313
|
+
accessToken: process.env.DOUBAO_ACCESS_TOKEN,
|
|
314
|
+
voice: 'zh_female_tianmeixiaoyuan_moon_bigtts',
|
|
315
|
+
resourceId: 'seed-tts-2.0',
|
|
316
|
+
format: 'mp3',
|
|
317
|
+
sampleRate: 24000,
|
|
318
|
+
});
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
#### OpenAI
|
|
322
|
+
|
|
323
|
+
```typescript
|
|
324
|
+
const tts = createTTS({
|
|
325
|
+
provider: 'openai',
|
|
326
|
+
apiKey: process.env.OPENAI_API_KEY,
|
|
327
|
+
model: 'tts-1',
|
|
328
|
+
voice: 'alloy',
|
|
329
|
+
speed: 1.0,
|
|
330
|
+
});
|
|
331
|
+
|
|
332
|
+
const asr = createASR({
|
|
333
|
+
provider: 'openai',
|
|
334
|
+
apiKey: process.env.OPENAI_API_KEY,
|
|
335
|
+
model: 'whisper-1',
|
|
336
|
+
language: 'zh',
|
|
337
|
+
});
|
|
338
|
+
```
|
|
339
|
+
|
|
340
|
+
#### MiniMax
|
|
341
|
+
|
|
342
|
+
```typescript
|
|
343
|
+
const tts = createTTS({
|
|
344
|
+
provider: 'minimax',
|
|
345
|
+
apiKey: process.env.MINIMAX_API_KEY,
|
|
346
|
+
groupId: process.env.MINIMAX_GROUP_ID,
|
|
347
|
+
voice: 'female-tianmei',
|
|
348
|
+
format: 'mp3',
|
|
349
|
+
});
|
|
350
|
+
```
|
|
351
|
+
|
|
352
|
+
#### 通义千问
|
|
353
|
+
|
|
354
|
+
```typescript
|
|
355
|
+
const tts = createTTS({
|
|
356
|
+
provider: 'qwen',
|
|
357
|
+
apiKey: process.env.QWEN_API_KEY,
|
|
358
|
+
model: 'cosyvoice-v1',
|
|
359
|
+
voice: 'longxiaochun',
|
|
360
|
+
format: 'mp3',
|
|
361
|
+
});
|
|
362
|
+
```
|
|
363
|
+
|
|
364
|
+
#### Gemini
|
|
365
|
+
|
|
366
|
+
```typescript
|
|
367
|
+
const tts = createTTS({
|
|
368
|
+
provider: 'gemini',
|
|
369
|
+
apiKey: process.env.GEMINI_API_KEY,
|
|
370
|
+
voice: 'Kore',
|
|
371
|
+
language: 'zh-CN',
|
|
372
|
+
});
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
---
|
|
376
|
+
|
|
377
|
+
## 开发指南
|
|
378
|
+
|
|
379
|
+
### 本地开发
|
|
380
|
+
|
|
381
|
+
```bash
|
|
382
|
+
# 克隆仓库
|
|
383
|
+
git clone https://github.com/shenjingnan/univoice.git
|
|
384
|
+
cd univoice
|
|
385
|
+
|
|
386
|
+
# 安装依赖
|
|
387
|
+
pnpm install
|
|
388
|
+
|
|
389
|
+
# 构建项目
|
|
390
|
+
pnpm build
|
|
391
|
+
|
|
392
|
+
# 运行测试
|
|
393
|
+
pnpm test
|
|
394
|
+
|
|
395
|
+
# 代码检查
|
|
396
|
+
pnpm lint
|
|
397
|
+
|
|
398
|
+
# 格式化代码
|
|
399
|
+
pnpm format
|
|
400
|
+
```
|
|
401
|
+
|
|
402
|
+
### 添加新提供商
|
|
403
|
+
|
|
404
|
+
1. 在 `src/tts/providers/` 或 `src/asr/providers/` 创建新文件
|
|
405
|
+
2. 继承 `BaseTTS` 或 `BaseASR` 类
|
|
406
|
+
3. 实现必要的方法
|
|
407
|
+
4. 在文件末尾调用 `registerTTSProvider()` 或 `registerASRProvider()`
|
|
408
|
+
|
|
409
|
+
```typescript
|
|
410
|
+
// src/tts/providers/my-provider.ts
|
|
411
|
+
import { BaseTTS, registerTTSProvider } from '@/tts/index';
|
|
412
|
+
import type { TTSOptions, TTSRequest, TTSResponse } from '@/types/tts';
|
|
413
|
+
|
|
414
|
+
class MyTTS extends BaseTTS {
|
|
415
|
+
constructor(options: TTSOptions) {
|
|
416
|
+
super(options);
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
async synthesize(request: TTSRequest): Promise<TTSResponse> {
|
|
420
|
+
// 实现合成逻辑
|
|
421
|
+
return {
|
|
422
|
+
audio: Buffer.from('...'),
|
|
423
|
+
format: 'mp3',
|
|
424
|
+
};
|
|
425
|
+
}
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
registerTTSProvider('my-provider', (options) => new MyTTS(options));
|
|
429
|
+
```
|
|
430
|
+
|
|
431
|
+
### 项目结构
|
|
432
|
+
|
|
433
|
+
```text
|
|
434
|
+
src/
|
|
435
|
+
├── index.ts # 主入口,导出所有公开 API
|
|
436
|
+
├── tts/ # TTS 模块
|
|
437
|
+
│ ├── base.ts # BaseTTS 抽象类
|
|
438
|
+
│ ├── factory.ts # 工厂函数
|
|
439
|
+
│ ├── utils/ # 工具函数
|
|
440
|
+
│ │ ├── save.ts # 保存音频
|
|
441
|
+
│ │ ├── collect.ts # 收集音频
|
|
442
|
+
│ │ ├── play.ts # 播放音频
|
|
443
|
+
│ │ └── tee.ts # 同时保存和播放
|
|
444
|
+
│ └── providers/ # 提供商实现
|
|
445
|
+
│ ├── doubao.ts
|
|
446
|
+
│ ├── openai.ts
|
|
447
|
+
│ ├── minimax.ts
|
|
448
|
+
│ ├── qwen.ts
|
|
449
|
+
│ └── gemini.ts
|
|
450
|
+
├── asr/ # ASR 模块
|
|
451
|
+
│ ├── base.ts # BaseASR 抽象类
|
|
452
|
+
│ ├── factory.ts # 工厂函数
|
|
453
|
+
│ ├── utils/ # 工具函数
|
|
454
|
+
│ └── providers/ # 提供商实现
|
|
455
|
+
└── types/ # 类型定义
|
|
456
|
+
├── tts.ts # TTS 相关类型
|
|
457
|
+
├── asr.ts # ASR 相关类型
|
|
458
|
+
└── llm-stream.ts # LLM 流式输出类型
|
|
459
|
+
```
|
|
460
|
+
|
|
461
|
+
---
|
|
462
|
+
|
|
463
|
+
## 许可证
|
|
464
|
+
|
|
465
|
+
[MIT](LICENSE)
|
|
466
|
+
|
|
467
|
+
---
|
|
468
|
+
|
|
469
|
+
## 贡献
|
|
470
|
+
|
|
471
|
+
欢迎提交 Issue 和 Pull Request!
|
|
472
|
+
|
|
473
|
+
---
|
|
474
|
+
|
|
475
|
+
## 致谢
|
|
476
|
+
|
|
477
|
+
感谢以下语音服务提供商:
|
|
478
|
+
- [火山引擎](https://www.volcengine.com/)
|
|
479
|
+
- [OpenAI](https://openai.com/)
|
|
480
|
+
- [MiniMax](https://www.minimaxi.com/)
|
|
481
|
+
- [阿里云通义千问](https://tongyi.aliyun.com/)
|
|
482
|
+
- [Google Gemini](https://ai.google.dev/)
|