@amaster.ai/asr-client 1.0.0-beta.7 → 1.0.0-beta.72

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,40 +1,300 @@
1
+ import { HttpClient } from '@amaster.ai/http-client';
2
+
1
3
  /**
2
- * ASR Realtime WebSocket Client
3
- */
4
- /**
5
- * Simple ASR: start listening and get transcript
6
- * @returns stop function
4
+ * ASR Realtime WebSocket Client for Qwen-ASR Realtime API
5
+ *
6
+ * WebSocket-based real-time speech recognition for streaming transcription.
7
+ * Follows the Qwen-ASR Realtime API protocol with proper event handling.
8
+ *
7
9
  * @example
8
- * const stop = await listen((text, isFinal) => console.log(text))
9
- * // later: stop()
10
+ * ```typescript
11
+ * const client = createASRClient({
12
+ * language: "zh",
13
+ * enableVAD: true,
14
+ * onReady() {
15
+ * console.log("ASR connected");
16
+ * },
17
+ * onTranscript(text, isFinal) {
18
+ * console.log(isFinal ? "[Final]" : "[Interim]", text);
19
+ * },
20
+ * onError(err) {
21
+ * console.error("ASR error:", err);
22
+ * },
23
+ * });
24
+ *
25
+ * await client.connect();
26
+ * await client.startRecording();
27
+ * // ... stop ...
28
+ * await client.stopRecording();
29
+ * await client.close();
30
+ * ```
10
31
  */
11
- declare function listen(onTranscript: (text: string, isFinal: boolean) => void): Promise<() => void>;
32
+ type ASRLanguage = "zh" | "yue" | "en" | "ja" | "de" | "ko" | "ru" | "fr" | "pt" | "ar" | "it" | "es" | "hi" | "id" | "th" | "tr" | "uk" | "vi" | "cs" | "da" | "fil" | "fi" | "is" | "ms" | "no" | "pl" | "sv";
33
+ type ClientEventType = "session.update" | "input_audio_buffer.append" | "input_audio_buffer.commit" | "session.finish";
34
+ type ServerEventType = "session.created" | "session.updated" | "input_audio_buffer.speech_started" | "input_audio_buffer.speech_stopped" | "input_audio_buffer.committed" | "conversation.item.input_audio_transcription.text" | "conversation.item.input_audio_transcription.completed" | "session.finished" | "error";
35
+ interface BaseEvent {
36
+ event_id: string;
37
+ type: ClientEventType | ServerEventType;
38
+ }
39
+ interface SessionUpdateEvent extends BaseEvent {
40
+ type: "session.update";
41
+ session: SessionConfig;
42
+ }
43
+ interface InputAudioBufferAppendEvent extends BaseEvent {
44
+ type: "input_audio_buffer.append";
45
+ audio: string;
46
+ }
47
+ interface InputAudioBufferCommitEvent extends BaseEvent {
48
+ type: "input_audio_buffer.commit";
49
+ }
50
+ interface SessionFinishEvent extends BaseEvent {
51
+ type: "session.finish";
52
+ }
53
+ type ClientEvent = SessionUpdateEvent | InputAudioBufferAppendEvent | InputAudioBufferCommitEvent | SessionFinishEvent;
54
+ interface SessionCreatedEvent extends BaseEvent {
55
+ type: "session.created";
56
+ session: {
57
+ id: string;
58
+ };
59
+ }
60
+ interface SessionUpdatedEvent extends BaseEvent {
61
+ type: "session.updated";
62
+ session: SessionConfig;
63
+ }
64
+ interface SpeechStartedEvent extends BaseEvent {
65
+ type: "input_audio_buffer.speech_started";
66
+ }
67
+ interface SpeechStoppedEvent extends BaseEvent {
68
+ type: "input_audio_buffer.speech_stopped";
69
+ }
70
+ interface InputAudioBufferCommittedEvent extends BaseEvent {
71
+ type: "input_audio_buffer.committed";
72
+ }
73
+ interface TranscriptionTextEvent extends BaseEvent {
74
+ type: "conversation.item.input_audio_transcription.text";
75
+ text?: string;
76
+ stash?: string;
77
+ transcript?: string;
78
+ }
79
+ interface TranscriptionCompletedEvent extends BaseEvent {
80
+ type: "conversation.item.input_audio_transcription.completed";
81
+ text?: string;
82
+ transcript?: string;
83
+ }
84
+ interface SessionFinishedEvent extends BaseEvent {
85
+ type: "session.finished";
86
+ }
87
+ interface ErrorEvent extends BaseEvent {
88
+ type: "error";
89
+ error: {
90
+ message: string;
91
+ code?: string;
92
+ };
93
+ }
94
+ type ServerEvent = SessionCreatedEvent | SessionUpdatedEvent | SpeechStartedEvent | SpeechStoppedEvent | InputAudioBufferCommittedEvent | TranscriptionTextEvent | TranscriptionCompletedEvent | SessionFinishedEvent | ErrorEvent;
95
+ interface TurnDetectionConfig {
96
+ type: "server_vad";
97
+ /** VAD检测阈值,推荐设为 0.0,默认值 0.2,范围 [-1, 1] */
98
+ threshold?: number;
99
+ /** VAD断句检测阈值(ms),推荐设为 400,默认值 800,范围 [200, 6000] */
100
+ silence_duration_ms?: number;
101
+ }
102
+ interface InputAudioTranscriptionConfig {
103
+ language?: ASRLanguage;
104
+ }
105
+ interface SessionConfig {
106
+ input_audio_format?: "pcm" | "opus";
107
+ sample_rate?: 16000 | 8000;
108
+ input_audio_transcription?: InputAudioTranscriptionConfig;
109
+ turn_detection?: TurnDetectionConfig | null;
110
+ }
12
111
  interface ASRClientConfig {
13
- /** Audio format, default 'pcm16' */
14
- audioFormat?: 'pcm16' | 'g711a' | 'g711u';
15
- /** Sample rate, default 16000 */
16
- sampleRate?: number;
17
- /** Called when connection is ready */
112
+ /**
113
+ * Audio format
114
+ * @default "pcm"
115
+ */
116
+ audioFormat?: "pcm" | "opus";
117
+ /**
118
+ * Sample rate in Hz
119
+ * @default 16000
120
+ * @description 支持 16000 和 8000。设置为 8000 时,服务端会先升采样到16000Hz再进行识别,可能引入微小延迟。
121
+ */
122
+ sampleRate?: 16000 | 8000;
123
+ /**
124
+ * Audio source language
125
+ * @default "zh"
126
+ * @description 支持多种语言,包括 zh(中文)、yue(粤语)、en(英文)、ja(日语)等
127
+ */
128
+ language?: ASRLanguage;
129
+ /**
130
+ * Enable VAD (Voice Activity Detection) mode
131
+ * @default true
132
+ * @description true = VAD模式(服务端自动检测语音开始/结束),false = Manual模式(客户端手动控制)
133
+ */
134
+ enableVAD?: boolean;
135
+ /**
136
+ * VAD detection threshold
137
+ * @default 0.2
138
+ * @description 推荐设为 0.0。取值范围 [-1, 1]。较低的阈值会提高 VAD 的灵敏度。
139
+ */
140
+ vadThreshold?: number;
141
+ /**
142
+ * VAD silence duration threshold in milliseconds
143
+ * @default 800
144
+ * @description 推荐设为 400。取值范围 [200, 6000]。静音持续时长超过该阈值将被认为是语句结束。
145
+ */
146
+ vadSilenceDurationMs?: number;
147
+ /**
148
+ * Get access token for WebSocket authentication
149
+ */
150
+ getAccessToken?: () => string | null;
151
+ /**
152
+ * Called when connection is ready (session.created received and session.update sent)
153
+ */
18
154
  onReady?: () => void;
19
- /** Called when speech is detected */
155
+ /**
156
+ * Called when speech is detected (VAD mode only)
157
+ */
20
158
  onSpeechStart?: () => void;
21
- /** Called when speech stops */
159
+ /**
160
+ * Called when speech stops (VAD mode only)
161
+ */
22
162
  onSpeechEnd?: () => void;
23
- /** Called on transcript result */
163
+ /**
164
+ * Called on transcript result
165
+ * @param text - Transcribed text
166
+ * @param isFinal - Whether this is the final result
167
+ */
24
168
  onTranscript?: (text: string, isFinal: boolean) => void;
25
- /** Called on error */
169
+ /**
170
+ * Called when audio buffer is committed (non-VAD mode only)
171
+ */
172
+ onAudioBufferCommitted?: () => void;
173
+ /**
174
+ * Called when session is finished
175
+ */
176
+ onSessionFinished?: () => void;
177
+ /**
178
+ * Called on error
179
+ */
26
180
  onError?: (error: Error) => void;
181
+ /**
182
+ * Called on close
183
+ */
184
+ onClose?: () => void;
27
185
  }
28
186
  interface ASRClient {
29
- /** Connect to ASR service */
187
+ /** Connect to ASR service and establish session */
30
188
  connect(): Promise<void>;
31
189
  /** Start recording from microphone */
32
190
  startRecording(): Promise<void>;
33
- /** Stop recording */
34
- stopRecording(): void;
35
- /** Close connection */
36
- close(): void;
191
+ /**
192
+ * Stop recording
193
+ * @description In non-VAD mode, this triggers recognition by sending input_audio_buffer.commit
194
+ */
195
+ stopRecording(): Promise<void>;
196
+ /**
197
+ * Close connection gracefully
198
+ * @description Sends session.finish and waits for session.finished before closing
199
+ */
200
+ close(): Promise<void>;
201
+ /**
202
+ * Check if currently recording
203
+ */
204
+ isRecording(): boolean;
205
+ /**
206
+ * Check if connected to server
207
+ */
208
+ isConnected(): boolean;
209
+ }
210
+ declare const _default$1: (authConfig: Pick<ASRClientConfig, "getAccessToken">) => (config: ASRClientConfig) => ASRClient;
211
+
212
+ /**
213
+ * HTTP ASR Client - Press-to-talk style speech recognition
214
+ *
215
+ * HTTP-based speech recognition suitable for press-to-talk scenarios where you hold to speak
216
+ * and release to recognize. Good for voice messages, voice search, etc.
217
+ *
218
+ * @example
219
+ * ```typescript
220
+ * const client = createASRHttpClient({
221
+ * onRecordingStart() {
222
+ * console.log("Recording started");
223
+ * },
224
+ * onRecordingStop() {
225
+ * console.log("Recording stopped");
226
+ * },
227
+ * onResult(text) {
228
+ * console.log("Recognized:", text);
229
+ * },
230
+ * onError(err) {
231
+ * console.error("ASR error:", err);
232
+ * },
233
+ * });
234
+ *
235
+ * // Hold to speak, release to recognize
236
+ * await client.startRecording();
237
+ * // ... stop ...
238
+ * await client.stopRecording(); // resolves with no value; the recognized text is delivered to onResult
239
+ * ```
240
+ */
241
+
242
+ type Status = "idle" | "recording" | "recognizing";
243
+ interface Recorder {
244
+ /** Start recording */
245
+ start(): Promise<void>;
246
+ /**
247
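+ * Stop recording. The returned promise carries no value; the base64-encoded WAV audio data is declared on the `onStop` callback of RecorderOptions, which is presumably where it is delivered (confirm against the implementation).
248
+ *
249
+ * @returns A promise that resolves with no value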
250
+ */
251
+ stop(): Promise<void>;
252
+ }
253
+ interface RecorderOptions {
254
+ /** Called when recording starts */
255
+ onStart?: () => void;
256
+ /**
257
+ * Called when recording stops, with base64-encoded WAV audio data. You can use this data to call the ASR API.
258
+ *
259
+ * @param base64 - Base64-encoded WAV audio data
260
+ * @returns void
261
+ */
262
+ onStop?: (base64: string) => void;
263
+ onError?: (error: Error) => void;
264
+ }
265
+ interface ASRHttpClientConfig {
266
+ http?: HttpClient;
267
+ /** Get access token */
268
+ getAccessToken?(): string | null;
269
+ /** Language, default 'zh' */
270
+ language?: string;
271
+ /** Sample rate, default 16000 */
272
+ sampleRate?: number;
273
+ /** Create custom recorder */
274
+ createRecorder?(options?: RecorderOptions): Promise<Recorder>;
275
+ /** Called when recording starts */
276
+ onRecordingStart?: () => void;
277
+ /** Called when recording stops */
278
+ onRecordingStop?: () => void;
279
+ /** Called with recognition result */
280
+ onResult?: (text: string) => void;
281
+ /** Called on error */
282
+ onError?: (error: Error) => void;
283
+ /** Called when status changes */
284
+ onStatusChange?: (status: Status) => void;
285
+ }
286
+ interface ASRHttpClient {
287
+ /** Start recording (press-to-talk) */
288
+ startRecording(): Promise<void>;
289
+ /** Stop recording. The promise resolves with no value; the recognition result is presumably reported through the onResult callback. */
290
+ stopRecording(): Promise<void>;
291
+ /** Record for specific duration then recognize */
292
+ recordAndRecognize(durationMs: number): Promise<void>;
293
+ /** Recognize audio supplied as a base64-encoded string */
294
+ recognizeFile(base64: string): Promise<string>;
295
+ /** Recognize audio from URL */
296
+ recognizeUrl(audioUrl: string): Promise<string>;
37
297
  }
38
- declare function createASRClient(config: ASRClientConfig): ASRClient;
298
+ declare const _default: (authConfig: Pick<ASRHttpClientConfig, "getAccessToken" | "http">) => (config: ASRHttpClientConfig) => ASRHttpClient;
39
299
 
40
- export { type ASRClient, type ASRClientConfig, createASRClient, createASRClient as createAsrClient, listen };
300
+ export { type ASRClient, type ASRClientConfig, type ASRHttpClient, type ASRHttpClientConfig, type ASRLanguage, type ClientEvent, type ErrorEvent, type InputAudioBufferAppendEvent, type InputAudioBufferCommitEvent, type InputAudioTranscriptionConfig, type ServerEvent, type SessionConfig, type SessionCreatedEvent, type SessionFinishEvent, type SessionFinishedEvent, type SessionUpdateEvent, type SessionUpdatedEvent, type TranscriptionCompletedEvent, type TranscriptionTextEvent, type TurnDetectionConfig, _default$1 as createASRClient, _default as createASRHttpClient };