@amaster.ai/asr-client 1.1.0-beta.50 → 1.1.0-beta.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +167 -47
- package/dist/index.cjs +202 -40
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +172 -22
- package/dist/index.d.ts +172 -22
- package/dist/index.js +202 -40
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -2,15 +2,19 @@
|
|
|
2
2
|
|
|
3
3
|
基于 Web Audio + WebSocket 的 **实时语音识别(ASR)客户端 SDK**,用于将浏览器麦克风音频实时发送到 ASR 服务(如 Qwen ASR Realtime),并接收实时/最终转写结果。
|
|
4
4
|
|
|
5
|
+
遵循 [Qwen-ASR Realtime API](https://help.aliyun.com/zh/model-studio/qwen-realtime-speech-recognition) 协议实现,完整支持会话管理、VAD 模式、手动模式等全部功能。
|
|
6
|
+
|
|
5
7
|
---
|
|
6
8
|
|
|
7
9
|
## 特性
|
|
8
10
|
|
|
9
11
|
- 🎙 浏览器麦克风实时采集(16kHz / 单声道)
|
|
10
12
|
- 🔁 实时音频流式发送(Base64 PCM16)
|
|
11
|
-
- 🧠
|
|
13
|
+
- 🧠 支持 VAD 模式(自动检测语音开始/结束)和 Manual 模式(手动控制)
|
|
14
|
+
- 🌍 支持 25 种语言识别
|
|
12
15
|
- ✍️ 支持中间结果与最终转写结果
|
|
13
|
-
-
|
|
16
|
+
- 📡 完整的会话生命周期管理(session.update、session.finish)
|
|
17
|
+
- 🔌 事件 ID 自动生成,符合协议规范
|
|
14
18
|
- 🌐 同时支持 WebSocket 实时识别和 HTTP 按压识别
|
|
15
19
|
|
|
16
20
|
---
|
|
@@ -46,70 +50,95 @@ yarn add @amaster.ai/asr-client
|
|
|
46
50
|
|
|
47
51
|
适合需要实时看到识别结果的场景,如语音输入、实时字幕等。
|
|
48
52
|
|
|
49
|
-
####
|
|
53
|
+
#### VAD 模式(推荐)
|
|
54
|
+
|
|
55
|
+
自动检测语音开始和结束:
|
|
50
56
|
|
|
51
57
|
```ts
|
|
52
58
|
import { createASRClient } from "@amaster.ai/asr-client";
|
|
53
59
|
|
|
54
60
|
const client = createASRClient({
|
|
61
|
+
// 语言配置
|
|
62
|
+
language: "zh", // 支持 zh、yue、en、ja 等 25 种语言
|
|
63
|
+
|
|
64
|
+
// VAD 配置(自动检测语音)
|
|
65
|
+
enableVAD: true,
|
|
66
|
+
vadThreshold: 0.0, // 推荐设为 0.0,默认 0.2
|
|
67
|
+
vadSilenceDurationMs: 400, // 推荐设为 400ms(当前 SDK 实现的默认值即为 400ms)
|
|
68
|
+
|
|
69
|
+
// 回调函数
|
|
55
70
|
onReady() {
|
|
56
|
-
console.log("ASR
|
|
71
|
+
console.log("ASR 连接成功,会话已配置");
|
|
57
72
|
},
|
|
58
73
|
|
|
59
74
|
onSpeechStart() {
|
|
60
|
-
console.log("检测到说话开始");
|
|
75
|
+
console.log("🎤 检测到说话开始");
|
|
61
76
|
},
|
|
62
77
|
|
|
63
78
|
onSpeechEnd() {
|
|
64
|
-
console.log("检测到说话结束");
|
|
79
|
+
console.log("🛑 检测到说话结束");
|
|
65
80
|
},
|
|
66
81
|
|
|
67
82
|
onTranscript(text, isFinal) {
|
|
68
|
-
console.log(isFinal ? "最终结果:" : "实时结果:", text);
|
|
83
|
+
console.log(isFinal ? "✅ 最终结果:" : "📝 实时结果:", text);
|
|
84
|
+
},
|
|
85
|
+
|
|
86
|
+
onSessionFinished() {
|
|
87
|
+
console.log("👋 会话已结束");
|
|
69
88
|
},
|
|
70
89
|
|
|
71
90
|
onError(err) {
|
|
72
|
-
console.error("ASR 错误", err);
|
|
91
|
+
console.error("❌ ASR 错误", err);
|
|
73
92
|
},
|
|
74
93
|
|
|
75
94
|
onClose() {
|
|
76
|
-
console.log("连接已关闭");
|
|
95
|
+
console.log("🔌 连接已关闭");
|
|
77
96
|
},
|
|
78
97
|
});
|
|
79
|
-
```
|
|
80
98
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
```ts
|
|
99
|
+
// 1. 建立连接(自动发送 session.update)
|
|
84
100
|
await client.connect();
|
|
85
|
-
```
|
|
86
101
|
|
|
87
|
-
|
|
102
|
+
// 2. 开始录音
|
|
103
|
+
await client.startRecording();
|
|
88
104
|
|
|
89
|
-
|
|
105
|
+
// 3. 停止录音(VAD 模式下可选,会自动检测结束)
|
|
106
|
+
await client.stopRecording();
|
|
90
107
|
|
|
91
|
-
|
|
92
|
-
await client.
|
|
108
|
+
// 4. 关闭连接(自动发送 session.finish)
|
|
109
|
+
await client.close();
|
|
93
110
|
```
|
|
94
111
|
|
|
95
|
-
|
|
96
|
-
- 自动开始推送音频流
|
|
97
|
-
- 服务端会持续返回实时转写结果
|
|
112
|
+
#### Manual 模式(非 VAD)
|
|
98
113
|
|
|
99
|
-
|
|
114
|
+
手动控制识别时机,适合按住说话、松开识别的场景:
|
|
100
115
|
|
|
101
116
|
```ts
|
|
102
|
-
client
|
|
103
|
-
|
|
117
|
+
const client = createASRClient({
|
|
118
|
+
language: "zh",
|
|
119
|
+
enableVAD: false, // 关闭 VAD,使用手动模式
|
|
120
|
+
|
|
121
|
+
onReady() {
|
|
122
|
+
console.log("准备就绪");
|
|
123
|
+
},
|
|
104
124
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
125
|
+
onTranscript(text, isFinal) {
|
|
126
|
+
if (isFinal) {
|
|
127
|
+
console.log("识别结果:", text);
|
|
128
|
+
}
|
|
129
|
+
},
|
|
108
130
|
|
|
109
|
-
|
|
131
|
+
onAudioBufferCommitted() {
|
|
132
|
+
console.log("音频已提交,等待识别结果...");
|
|
133
|
+
},
|
|
134
|
+
});
|
|
110
135
|
|
|
111
|
-
|
|
112
|
-
client.
|
|
136
|
+
await client.connect();
|
|
137
|
+
await client.startRecording();
|
|
138
|
+
|
|
139
|
+
// 用户说话中...
|
|
140
|
+
|
|
141
|
+
await client.stopRecording(); // 触发 input_audio_buffer.commit
|
|
113
142
|
```
|
|
114
143
|
|
|
115
144
|
---
|
|
@@ -183,26 +212,73 @@ WebSocket 实时 ASR 客户端。
|
|
|
183
212
|
|
|
184
213
|
#### `ASRClientConfig`
|
|
185
214
|
|
|
186
|
-
| 参数
|
|
187
|
-
|
|
|
188
|
-
| `audioFormat`
|
|
189
|
-
| `sampleRate`
|
|
190
|
-
| `
|
|
191
|
-
| `
|
|
192
|
-
| `
|
|
193
|
-
| `
|
|
194
|
-
| `
|
|
195
|
-
| `
|
|
196
|
-
| `
|
|
215
|
+
| 参数 | 类型 | 默认值 | 说明 |
|
|
216
|
+
| ------------------------ | ------------------------------------------ | ------- | -------------------------------------------------------- |
|
|
217
|
+
| `audioFormat` | `"pcm" \| "opus"` | `"pcm"` | 音频格式 |
|
|
218
|
+
| `sampleRate` | `16000 \| 8000` | `16000` | 采样率。8000 会先升采样到 16000,可能引入微小延迟 |
|
|
219
|
+
| `language` | `ASRLanguage` | `"zh"` | 音频源语言,支持 25 种语言(见下方语言列表) |
|
|
220
|
+
| `enableVAD` | `boolean` | `true` | 是否启用 VAD 模式。`true`=自动检测,`false`=手动控制 |
|
|
221
|
+
| `vadThreshold` | `number` | `0.2` | VAD 检测阈值,推荐设为 `0.0`,范围 `[-1, 1]` |
|
|
222
|
+
| `vadSilenceDurationMs` | `number` | `400` | VAD 断句检测阈值(ms),推荐设为 `400`,范围 `[200, 6000]` |
|
|
223
|
+
| `getAccessToken` | `() => string \| null` | - | 获取访问令牌(用于 WebSocket 认证) |
|
|
224
|
+
| `onReady` | `() => void` | - | 会话创建并配置完成(收到 session.updated) |
|
|
225
|
+
| `onSpeechStart` | `() => void` | - | 检测到语音开始(VAD 模式) |
|
|
226
|
+
| `onSpeechEnd` | `() => void` | - | 检测到语音结束(VAD 模式) |
|
|
227
|
+
| `onTranscript` | `(text: string, isFinal: boolean) => void` | - | 转写回调,`isFinal` 表示是否为最终结果 |
|
|
228
|
+
| `onAudioBufferCommitted` | `() => void` | - | 音频缓冲区已提交(非 VAD 模式) |
|
|
229
|
+
| `onSessionFinished` | `() => void` | - | 会话已结束(收到 session.finished) |
|
|
230
|
+
| `onError` | `(error: Error) => void` | - | 错误回调 |
|
|
231
|
+
| `onClose` | `() => void` | - | 连接关闭回调 |
|
|
232
|
+
|
|
233
|
+
#### 支持的语言 (ASRLanguage)
|
|
234
|
+
|
|
235
|
+
| 代码 | 语言 | 代码 | 语言 |
|
|
236
|
+
| ----- | ------------------------------------ | ---- | -------- |
|
|
237
|
+
| `zh` | 中文(普通话、四川话、闽南语、吴语) | `it` | 意大利语 |
|
|
238
|
+
| `yue` | 粤语 | `es` | 西班牙语 |
|
|
239
|
+
| `en` | 英文 | `hi` | 印地语 |
|
|
240
|
+
| `ja` | 日语 | `id` | 印尼语 |
|
|
241
|
+
| `de` | 德语 | `th` | 泰语 |
|
|
242
|
+
| `ko` | 韩语 | `tr` | 土耳其语 |
|
|
243
|
+
| `ru` | 俄语 | `uk` | 乌克兰语 |
|
|
244
|
+
| `fr` | 法语 | `vi` | 越南语 |
|
|
245
|
+
| `pt` | 葡萄牙语 | `cs` | 捷克语 |
|
|
246
|
+
| `ar` | 阿拉伯语 | `da` | 丹麦语 |
|
|
247
|
+
| `fil` | 菲律宾语 | `fi` | 芬兰语 |
|
|
248
|
+
| `is` | 冰岛语 | `ms` | 马来语 |
|
|
249
|
+
| `no` | 挪威语 | `pl` | 波兰语 |
|
|
250
|
+
| `sv` | 瑞典语 | | |
|
|
197
251
|
|
|
198
252
|
#### `ASRClient`
|
|
199
253
|
|
|
200
254
|
```ts
|
|
201
255
|
interface ASRClient {
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
256
|
+
/** 建立 WebSocket 连接并发送 session.update */
|
|
257
|
+
connect(): Promise<void>;
|
|
258
|
+
|
|
259
|
+
/** 开始录音并流式发送音频 */
|
|
260
|
+
startRecording(): Promise<void>;
|
|
261
|
+
|
|
262
|
+
/**
|
|
263
|
+
* 停止录音
|
|
264
|
+
* - VAD 模式:停止发送音频
|
|
265
|
+
* - 非 VAD 模式:停止发送音频并触发识别(发送 input_audio_buffer.commit)
|
|
266
|
+
*/
|
|
267
|
+
stopRecording(): Promise<void>;
|
|
268
|
+
|
|
269
|
+
/**
|
|
270
|
+
* 关闭连接
|
|
271
|
+
* - 发送 session.finish
|
|
272
|
+
* - 等待 session.finished
|
|
273
|
+
* - 关闭 WebSocket
|
|
274
|
+
*/
|
|
275
|
+
close(): Promise<void>;
|
|
276
|
+
|
|
277
|
+
/** 是否正在录音 */
|
|
278
|
+
isRecording(): boolean;
|
|
279
|
+
|
|
280
|
+
/** 是否已连接到服务器 */
|
|
281
|
+
isConnected(): boolean;
|
|
206
282
|
}
|
|
207
283
|
```
|
|
208
284
|
|
|
@@ -273,11 +349,42 @@ const result = await asrHttpClient.stopRecording();
|
|
|
273
349
|
|
|
274
350
|
---
|
|
275
351
|
|
|
352
|
+
## VAD 模式 vs Manual 模式
|
|
353
|
+
|
|
354
|
+
| 特性 | VAD 模式(enableVAD: true) | Manual 模式(enableVAD: false) |
|
|
355
|
+
| -------- | -------------------------------------- | ------------------------------- |
|
|
356
|
+
| 语音检测 | 服务端自动检测 | 客户端手动控制 |
|
|
357
|
+
| 适用场景 | 连续对话、实时字幕 | 按住说话、语音消息 |
|
|
358
|
+
| 开始录音 | `startRecording()` | `startRecording()` |
|
|
359
|
+
| 停止录音 | 自动检测 / `stopRecording()` | `stopRecording()` 触发识别 |
|
|
360
|
+
| 事件触发 | `onSpeechStart` / `onSpeechEnd` | `onAudioBufferCommitted` |
|
|
361
|
+
| 配置参数 | `vadThreshold`, `vadSilenceDurationMs` | 无需 VAD 配置 |
|
|
362
|
+
|
|
363
|
+
### VAD 配置建议
|
|
364
|
+
|
|
365
|
+
```ts
|
|
366
|
+
// 快速响应场景(如实时字幕)
|
|
367
|
+
const client = createASRClient({
|
|
368
|
+
enableVAD: true,
|
|
369
|
+
vadThreshold: 0.0, // 最灵敏
|
|
370
|
+
vadSilenceDurationMs: 400, // 较短停顿即断句
|
|
371
|
+
});
|
|
372
|
+
|
|
373
|
+
// 长句输入场景(如文章朗读)
|
|
374
|
+
const client = createASRClient({
|
|
375
|
+
enableVAD: true,
|
|
376
|
+
vadThreshold: 0.2, // 默认灵敏度
|
|
377
|
+
vadSilenceDurationMs: 1200, // 较长停顿才断句
|
|
378
|
+
});
|
|
379
|
+
```
|
|
380
|
+
|
|
381
|
+
---
|
|
382
|
+
|
|
276
383
|
## 音频参数说明
|
|
277
384
|
|
|
278
385
|
### WebSocket ASR
|
|
279
386
|
|
|
280
|
-
- **采样率**:16000 Hz
|
|
387
|
+
- **采样率**:16000 Hz(推荐)或 8000 Hz
|
|
281
388
|
- **声道**:单声道
|
|
282
389
|
- **格式**:PCM16 → Base64
|
|
283
390
|
- **缓冲大小**:4096 frames
|
|
@@ -312,14 +419,27 @@ ASR 服务通常要求 16kHz PCM 输入,否则会影响识别效果或直接
|
|
|
312
419
|
- iOS Safari 需用户手势触发录音
|
|
313
420
|
- 后台会自动暂停音频采集
|
|
314
421
|
|
|
422
|
+
### Q: VAD 模式下为什么要等一段时间才返回最终结果?
|
|
423
|
+
|
|
424
|
+
VAD 需要检测到静音(silence)才会认为一句话结束,然后返回最终结果。`vadSilenceDurationMs` 配置了这个静音时长阈值。
|
|
425
|
+
|
|
426
|
+
### Q: 如何切换语言?
|
|
427
|
+
|
|
428
|
+
```ts
|
|
429
|
+
const client = createASRClient({
|
|
430
|
+
language: "en", // 英文识别
|
|
431
|
+
});
|
|
432
|
+
```
|
|
433
|
+
|
|
315
434
|
---
|
|
316
435
|
|
|
317
436
|
## 注意事项
|
|
318
437
|
|
|
319
438
|
- 麦克风采集必须在 HTTPS 页面(或 localhost)下进行,这是浏览器的安全限制
|
|
320
|
-
- 页面关闭前建议调用 `client.close()`
|
|
439
|
+
- 页面关闭前建议调用 `await client.close()` 优雅关闭
|
|
321
440
|
- 不建议在多个 ASR Client 实例中共享麦克风
|
|
322
441
|
- HTTP ASR 会自动将录音转为 WAV 格式上传
|
|
442
|
+
- 每个事件都会自动生成唯一的 `event_id`,符合协议规范
|
|
323
443
|
|
|
324
444
|
---
|
|
325
445
|
|
package/dist/index.cjs
CHANGED
|
@@ -28,10 +28,10 @@ module.exports = __toCommonJS(index_exports);
|
|
|
28
28
|
// src/asr-client.ts
|
|
29
29
|
var ASR_PATH = "/api/proxy/builtin/platform/qwen-asr-realtime/api-ws/v1/realtime";
|
|
30
30
|
async function createRealtimeRecorder() {
|
|
31
|
-
let stream;
|
|
32
|
-
let ctx;
|
|
33
|
-
let source;
|
|
34
|
-
let processor;
|
|
31
|
+
let stream = null;
|
|
32
|
+
let ctx = null;
|
|
33
|
+
let source = null;
|
|
34
|
+
let processor = null;
|
|
35
35
|
return {
|
|
36
36
|
async start(onAudio) {
|
|
37
37
|
stream = await navigator.mediaDevices.getUserMedia({
|
|
@@ -63,18 +63,144 @@ async function createRealtimeRecorder() {
|
|
|
63
63
|
stream?.getTracks().forEach((t) => t.stop());
|
|
64
64
|
source?.disconnect();
|
|
65
65
|
processor?.disconnect();
|
|
66
|
-
|
|
66
|
+
if (ctx) {
|
|
67
|
+
await ctx.close();
|
|
68
|
+
}
|
|
69
|
+
stream = null;
|
|
70
|
+
ctx = null;
|
|
71
|
+
source = null;
|
|
72
|
+
processor = null;
|
|
67
73
|
}
|
|
68
74
|
};
|
|
69
75
|
}
|
|
70
76
|
/**
 * Print a message to the console, prefixed with a bracketed tag.
 *
 * @param {*} message - Value to print.
 * @param {string} [type=""] - Tag rendered inside the square brackets.
 */
var log = function (message, type = "") {
  const tag = `[${type}]`;
  console.log(tag, message);
};
|
|
79
|
+
/** Sequence number appended to every generated ID; incremented once per call. */
var eventIdCounter = 0;

/**
 * Build a client event ID of the form `event_<epoch-ms>_<sequence>`.
 *
 * @returns {string} The newly generated event ID.
 */
function generateEventId() {
  const timestamp = Date.now();
  eventIdCounter += 1;
  return "event_" + timestamp + "_" + eventIdCounter;
}
|
|
73
83
|
function createASRClient(config) {
|
|
74
|
-
const {
|
|
84
|
+
const {
|
|
85
|
+
onReady,
|
|
86
|
+
onSpeechStart,
|
|
87
|
+
onSpeechEnd,
|
|
88
|
+
onTranscript,
|
|
89
|
+
onAudioBufferCommitted,
|
|
90
|
+
onSessionFinished,
|
|
91
|
+
onError,
|
|
92
|
+
onClose,
|
|
93
|
+
getAccessToken,
|
|
94
|
+
audioFormat = "pcm",
|
|
95
|
+
sampleRate = 16e3,
|
|
96
|
+
language = "zh",
|
|
97
|
+
enableVAD = true,
|
|
98
|
+
vadThreshold = 0.2,
|
|
99
|
+
vadSilenceDurationMs = 400
|
|
100
|
+
} = config;
|
|
75
101
|
let ws = null;
|
|
76
102
|
let recorder = null;
|
|
103
|
+
let isRecordingFlag = false;
|
|
104
|
+
let isClosing = false;
|
|
77
105
|
const path = ASR_PATH;
|
|
106
|
+
/**
 * Serialize a client event as JSON and write it to the WebSocket.
 *
 * @param {object} event - Event payload to send.
 * @throws {Error} When there is no socket or the socket is not in the OPEN state.
 */
function sendEvent(event) {
  const socketIsOpen = Boolean(ws) && ws.readyState === WebSocket.OPEN;
  if (!socketIsOpen) {
    throw new Error("WebSocket not connected");
  }
  const payload = JSON.stringify(event);
  ws.send(payload);
}
|
|
112
|
+
/**
 * Assemble the `session` object sent with `session.update`.
 *
 * `turn_detection` is a `server_vad` descriptor when VAD is enabled and an
 * explicit `null` otherwise.
 *
 * @returns {object} Session configuration built from the client options.
 */
function buildSessionConfig() {
  const turnDetection = enableVAD
    ? {
        type: "server_vad",
        threshold: vadThreshold,
        silence_duration_ms: vadSilenceDurationMs
      }
    : null;

  return {
    input_audio_format: audioFormat,
    sample_rate: sampleRate,
    input_audio_transcription: { language },
    turn_detection: turnDetection
  };
}
|
|
131
|
+
/**
 * Send a `session.update` event carrying the current session configuration.
 *
 * @throws {Error} Propagated from `sendEvent` when the socket is not open.
 */
function sendSessionUpdate() {
  sendEvent({
    event_id: generateEventId(),
    type: "session.update",
    session: buildSessionConfig()
  });
}
|
|
139
|
+
/**
 * Send one chunk of audio as an `input_audio_buffer.append` event.
 *
 * @param {*} audio - Audio payload, forwarded unchanged in the event's `audio` field.
 * @throws {Error} Propagated from `sendEvent` when the socket is not open.
 */
function sendAudioBufferAppend(audio) {
  sendEvent({
    event_id: generateEventId(),
    type: "input_audio_buffer.append",
    audio
  });
}
|
|
147
|
+
/**
 * Send an `input_audio_buffer.commit` event.
 *
 * @throws {Error} Propagated from `sendEvent` when the socket is not open.
 */
function sendAudioBufferCommit() {
  sendEvent({
    event_id: generateEventId(),
    type: "input_audio_buffer.commit"
  });
}
|
|
154
|
+
/**
 * Send a `session.finish` event.
 *
 * @throws {Error} Propagated from `sendEvent` when the socket is not open.
 */
function sendSessionFinish() {
  sendEvent({
    event_id: generateEventId(),
    type: "session.finish"
  });
}
|
|
161
|
+
/**
 * Dispatch one decoded server event to the matching user callback.
 *
 * Handled event types:
 * - `session.created`: replies with `session.update`; a send failure is
 *   reported through `onError` instead of being thrown.
 * - `session.updated`: `onReady`.
 * - `input_audio_buffer.speech_started` / `speech_stopped`: `onSpeechStart` / `onSpeechEnd`.
 * - `input_audio_buffer.committed`: `onAudioBufferCommitted`.
 * - `conversation.item.input_audio_transcription.text`: interim `onTranscript(text, false)`.
 * - `conversation.item.input_audio_transcription.completed`: final `onTranscript(text, true)`.
 * - `conversation.item.input_audio_transcription.failed`: `onError`.
 * - `conversation.item.created`: accepted silently.
 * - `session.finished`: `onSessionFinished`, then the socket is closed.
 * - `error`: `onError`.
 * Any other type is logged with `console.warn`.
 *
 * @param {{ type: string, text?: string, stash?: string, transcript?: string,
 *           error?: { message?: string } }} data - Parsed server message.
 */
function handleServerEvent(data) {
  switch (data.type) {
    case "session.created":
      try {
        sendSessionUpdate();
      } catch (err2) {
        onError?.(
          new Error(
            "Failed to send session.update: " + (err2 instanceof Error ? err2.message : String(err2))
          )
        );
      }
      break;
    case "session.updated":
      onReady?.();
      break;
    case "input_audio_buffer.speech_started":
      onSpeechStart?.();
      break;
    case "input_audio_buffer.speech_stopped":
      onSpeechEnd?.();
      break;
    case "input_audio_buffer.committed":
      onAudioBufferCommitted?.();
      break;
    case "conversation.item.created":
      // Informational protocol event; nothing to surface, but it is not
      // "unknown" either, so do not warn on every utterance.
      break;
    case "conversation.item.input_audio_transcription.text":
      onTranscript?.(data.text || data.stash || data.transcript || "", false);
      break;
    case "conversation.item.input_audio_transcription.completed":
      onTranscript?.(data.text || data.transcript || "", true);
      break;
    case "conversation.item.input_audio_transcription.failed":
      // A failed transcription would otherwise be lost as an "unknown event"
      // warning; report it like any other service error.
      onError?.(new Error(data.error?.message || "ASR transcription failed"));
      break;
    case "session.finished":
      onSessionFinished?.();
      ws?.close();
      break;
    case "error": {
      // Braces give the declaration its own scope instead of leaking it
      // across the whole switch.
      const err = new Error(data.error?.message || "ASR error");
      onError?.(err);
      break;
    }
    default:
      console.warn("[ASR] Unknown server event:", data.type);
  }
}
|
|
78
204
|
async function connect() {
|
|
79
205
|
let wsUrl = path;
|
|
80
206
|
if (getAccessToken) {
|
|
@@ -84,30 +210,34 @@ function createASRClient(config) {
|
|
|
84
210
|
wsUrl = `${path}${separator}token=${encodeURIComponent(token)}`;
|
|
85
211
|
}
|
|
86
212
|
}
|
|
213
|
+
if (typeof window !== "undefined" && window.location) {
|
|
214
|
+
const protocol = window.location.protocol === "https:" ? "wss:" : "ws:";
|
|
215
|
+
if (!wsUrl.startsWith("ws://") && !wsUrl.startsWith("wss://")) {
|
|
216
|
+
wsUrl = `${protocol}//${window.location.host}${wsUrl}`;
|
|
217
|
+
}
|
|
218
|
+
}
|
|
87
219
|
ws = new WebSocket(wsUrl);
|
|
88
220
|
return new Promise((resolve, reject) => {
|
|
221
|
+
if (!ws) {
|
|
222
|
+
reject(new Error("Failed to create WebSocket"));
|
|
223
|
+
return;
|
|
224
|
+
}
|
|
225
|
+
ws.onopen = () => {
|
|
226
|
+
log("WebSocket connected", "success");
|
|
227
|
+
};
|
|
89
228
|
ws.onmessage = (event) => {
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
onTranscript?.(data.text || data.stash || data.transcript || "", false);
|
|
103
|
-
}
|
|
104
|
-
if (data.type === "conversation.item.input_audio_transcription.completed") {
|
|
105
|
-
onTranscript?.(data.text || data.transcript || "", true);
|
|
106
|
-
}
|
|
107
|
-
if (data.type === "error") {
|
|
108
|
-
const err = new Error(data.error?.message || "ASR error");
|
|
109
|
-
onError?.(err);
|
|
110
|
-
reject(err);
|
|
229
|
+
try {
|
|
230
|
+
const data = JSON.parse(event.data);
|
|
231
|
+
handleServerEvent(data);
|
|
232
|
+
if (data.type === "session.updated") {
|
|
233
|
+
resolve();
|
|
234
|
+
}
|
|
235
|
+
} catch (err) {
|
|
236
|
+
const error = new Error(
|
|
237
|
+
"Failed to parse server message: " + (err instanceof Error ? err.message : String(err))
|
|
238
|
+
);
|
|
239
|
+
onError?.(error);
|
|
240
|
+
reject(error);
|
|
111
241
|
}
|
|
112
242
|
};
|
|
113
243
|
ws.onerror = () => {
|
|
@@ -116,8 +246,7 @@ function createASRClient(config) {
|
|
|
116
246
|
reject(err);
|
|
117
247
|
};
|
|
118
248
|
ws.onclose = () => {
|
|
119
|
-
|
|
120
|
-
recorder = null;
|
|
249
|
+
isRecordingFlag = false;
|
|
121
250
|
ws = null;
|
|
122
251
|
onClose?.();
|
|
123
252
|
};
|
|
@@ -127,37 +256,70 @@ function createASRClient(config) {
|
|
|
127
256
|
if (!ws || ws.readyState !== WebSocket.OPEN) {
|
|
128
257
|
throw new Error("WebSocket not connected");
|
|
129
258
|
}
|
|
259
|
+
if (isRecordingFlag) {
|
|
260
|
+
throw new Error("Already recording");
|
|
261
|
+
}
|
|
130
262
|
recorder = await createRealtimeRecorder();
|
|
263
|
+
isRecordingFlag = true;
|
|
131
264
|
await recorder.start((audio) => {
|
|
132
265
|
if (!ws || ws.readyState !== WebSocket.OPEN) return;
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
);
|
|
266
|
+
try {
|
|
267
|
+
sendAudioBufferAppend(audio);
|
|
268
|
+
} catch (err) {
|
|
269
|
+
console.error("[ASR] Failed to send audio:", err);
|
|
270
|
+
}
|
|
139
271
|
});
|
|
140
272
|
}
|
|
141
273
|
/**
 * Stop microphone capture and, in manual (non-VAD) mode, ask the server to
 * transcribe what was sent by committing the audio buffer.
 *
 * The recorder reference and the recording flag are cleared before awaiting
 * `recorder.stop()`, so overlapping calls (for example `stopRecording()`
 * followed immediately by `close()`) stop the recorder and commit only once.
 *
 * A leftover recorder is also released when the flag is already false. The
 * socket `onclose` handler resets the flag without stopping the recorder, so
 * returning early on the flag alone would leave the microphone capturing.
 *
 * Errors from stopping the recorder or sending the commit are logged, not thrown.
 *
 * @returns {Promise<void>}
 */
async function stopRecording() {
  const activeRecorder = recorder;
  if (!isRecordingFlag && !activeRecorder) {
    return;
  }

  // Claim the teardown synchronously so a concurrent call becomes a no-op.
  recorder = null;
  isRecordingFlag = false;

  try {
    await activeRecorder?.stop();
  } catch (err) {
    console.error("[ASR] Error stopping recorder:", err);
  }

  if (!enableVAD && ws?.readyState === WebSocket.OPEN) {
    try {
      sendAudioBufferCommit();
    } catch (err) {
      console.error("[ASR] Failed to send commit:", err);
    }
  }
}
|
|
151
|
-
function close() {
|
|
152
|
-
|
|
292
|
+
/**
 * Gracefully shut the client down.
 *
 * Steps: stop recording, send `session.finish` if the socket is open, wait
 * until the socket reports `close` or one second has passed (whichever comes
 * first), then close and drop the socket.
 *
 * Waiting on the `close` event lets the call return as soon as the connection
 * is actually gone instead of always sleeping for the full second.
 *
 * A call made while another `close()` is in progress returns immediately.
 * The in-progress flag is reset in `finally` so an unexpected error cannot
 * leave the client permanently "closing".
 *
 * @returns {Promise<void>}
 */
async function close() {
  if (isClosing) {
    return;
  }
  isClosing = true;
  try {
    await stopRecording();

    const socket = ws;
    if (socket?.readyState === WebSocket.OPEN) {
      try {
        sendSessionFinish();
        await new Promise((resolve) => {
          const settle = () => {
            clearTimeout(timer);
            socket.removeEventListener("close", settle);
            resolve();
          };
          // Upper bound on the wait in case the server never answers.
          const timer = setTimeout(settle, 1e3);
          socket.addEventListener("close", settle);
        });
      } catch (err) {
        console.error("[ASR] Failed to send session.finish:", err);
      }
    }

    ws?.close();
    ws = null;
  } finally {
    isClosing = false;
  }
}
|
|
310
|
+
/**
 * Report the client's recording flag.
 *
 * @returns {boolean} Current value of the internal recording flag.
 */
function isRecording() {
  const recordingNow = isRecordingFlag;
  return recordingNow;
}
|
|
313
|
+
/**
 * Report whether a WebSocket exists and is in the OPEN state.
 *
 * @returns {boolean} `true` only when the socket is present and open.
 */
function isConnected() {
  if (ws === null) {
    return false;
  }
  return ws.readyState === WebSocket.OPEN;
}
|
|
156
316
|
return {
|
|
157
317
|
connect,
|
|
158
318
|
startRecording,
|
|
159
319
|
stopRecording,
|
|
160
|
-
close
|
|
320
|
+
close,
|
|
321
|
+
isRecording,
|
|
322
|
+
isConnected
|
|
161
323
|
};
|
|
162
324
|
}
|
|
163
325
|
/**
 * Curried factory: bind shared auth options first, then create a client from
 * per-call options. Keys in `config` override the same keys in `authConfig`.
 *
 * @param {object} authConfig - Options applied to every client created by the returned function.
 * @returns {(config: object) => object} Function that creates a client from the merged options.
 */
var asr_client_default = function (authConfig) {
  return function (config) {
    const mergedConfig = { ...authConfig, ...config };
    return createASRClient(mergedConfig);
  };
};
|