@wovin/tranz 0.1.35 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -5
- package/dist/{audio.min.js → audio.js} +32 -18
- package/dist/index.d.ts +3 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/{index.min.js → index.js} +161 -29
- package/dist/providers.d.ts +1 -1
- package/dist/providers.d.ts.map +1 -1
- package/dist/{providers.min.js → providers.js} +68 -24
- package/dist/utils/audio/merge-results.d.ts +14 -12
- package/dist/utils/audio/merge-results.d.ts.map +1 -1
- package/dist/utils/transcription/format.d.ts +27 -0
- package/dist/utils/transcription/format.d.ts.map +1 -1
- package/dist/utils/transcription/providers.d.ts +30 -1
- package/dist/utils/transcription/providers.d.ts.map +1 -1
- package/dist/utils/transcription/transcribe.d.ts +5 -0
- package/dist/utils/transcription/transcribe.d.ts.map +1 -1
- package/package.json +10 -8
- package/src/audio.ts +25 -0
- package/src/index.ts +61 -0
- package/src/providers.ts +23 -0
- package/src/realtime.ts +58 -0
- package/src/utils/audio/index.ts +6 -0
- package/src/utils/audio/merge-results.ts +198 -0
- package/src/utils/audio/split.ts +504 -0
- package/src/utils/file-utils.ts +16 -0
- package/src/utils/transcription/format.ts +208 -0
- package/src/utils/transcription/mime-detection.ts +80 -0
- package/src/utils/transcription/providers.ts +572 -0
- package/src/utils/transcription/realtime.ts +821 -0
- package/src/utils/transcription/runtime.ts +40 -0
- package/src/utils/transcription/transcribe.ts +366 -0
- /package/dist/{realtime.min.js → realtime.js} +0 -0
|
@@ -0,0 +1,821 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Realtime transcription API for Mistral's WebSocket-based transcription service
|
|
3
|
+
*
|
|
4
|
+
* Provides a simple, event-driven interface for streaming audio transcription.
|
|
5
|
+
* Users provide audio as AsyncIterable<Uint8Array> and receive typed events.
|
|
6
|
+
*
|
|
7
|
+
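* Runtime support: Node.js / server-side only for now (uses the 'ws' package in Node.js). Browsers are currently rejected because the native WebSocket API cannot send the Authorization header.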
|
|
8
|
+
*
|
|
9
|
+
* @example
|
|
10
|
+
* ```typescript
|
|
11
|
+
* import { createRealtimeTranscriber } from '@wovin/tranz/realtime'
|
|
12
|
+
*
|
|
13
|
+
* const transcriber = createRealtimeTranscriber({
|
|
14
|
+
* apiKey: process.env.MISTRAL_API_KEY,
|
|
15
|
+
* })
|
|
16
|
+
*
|
|
17
|
+
* for await (const event of transcriber.transcribe(audioStream)) {
|
|
18
|
+
* if (event.type === 'transcription.text.delta') {
|
|
19
|
+
* process.stdout.write(event.text)
|
|
20
|
+
* }
|
|
21
|
+
* }
|
|
22
|
+
* ```
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
import { getWebSocketImpl } from "./runtime.js";
|
|
26
|
+
|
|
27
|
+
// ============================================================================
|
|
28
|
+
// Type Definitions
|
|
29
|
+
// ============================================================================
|
|
30
|
+
|
|
31
|
+
/**
 * Raw PCM sample encodings that can be declared in an {@link AudioFormat}.
 *
 * Each member's value is the string identifier used for that encoding.
 * `pcm_s16le` is the default applied when the caller does not pick one.
 */
export enum AudioEncoding {
  // 16-bit integer samples
  PcmS16le = "pcm_s16le",
  PcmS16be = "pcm_s16be",
  PcmU16le = "pcm_u16le",
  PcmU16be = "pcm_u16be",

  // 24-bit integer samples
  PcmS24le = "pcm_s24le",
  PcmS24be = "pcm_s24be",
  PcmU24le = "pcm_u24le",
  PcmU24be = "pcm_u24be",

  // 32-bit integer samples
  PcmS32le = "pcm_s32le",
  PcmS32be = "pcm_s32be",
  PcmU32le = "pcm_u32le",
  PcmU32be = "pcm_u32be",

  // Floating-point samples
  PcmF32le = "pcm_f32le",
  PcmF32be = "pcm_f32be",
  PcmF64le = "pcm_f64le",
  PcmF64be = "pcm_f64be",
}
|
|
52
|
+
|
|
53
|
+
/**
 * Describes the raw audio a caller will stream for realtime transcription.
 */
export interface AudioFormat {
  /** Sample encoding; `transcribe()` falls back to `pcm_s16le` when omitted from its options. */
  encoding: AudioEncoding;
  /** Samples per second in Hz; `transcribe()` falls back to 16000 when omitted from its options. */
  sampleRate: number;
}
|
|
62
|
+
|
|
63
|
+
/**
 * Settings accepted by `createRealtimeTranscriber()`.
 */
export interface RealtimeConfig {
  /** Mistral API key, sent as a Bearer token when the socket is opened. */
  apiKey: string;
  /** Model identifier; defaults to `voxtral-mini-transcribe-realtime-2602`. */
  model?: string;
  /** WebSocket base URL; defaults to `wss://api.mistral.ai`. */
  baseUrl?: string;
}
|
|
74
|
+
|
|
75
|
+
/**
 * Per-call options for `RealtimeTranscriber.transcribe()`.
 */
export interface TranscribeOptions {
  /** Overrides for the audio format; any field left out uses the default. */
  audioFormat?: Partial<AudioFormat>;
}
|
|
82
|
+
|
|
83
|
+
/**
 * Every event a realtime transcription can yield, discriminated by `type`.
 * Consumers receive these one at a time while the transcription runs.
 */
export type RealtimeEvent =
  | SessionCreatedEvent
  | SessionUpdatedEvent
  | TranscriptionTextDeltaEvent
  | TranscriptionLanguageEvent
  | TranscriptionSegmentEvent
  | TranscriptionDoneEvent
  | ErrorEvent;
|
|
95
|
+
|
|
96
|
+
/**
 * First event of a session: signals that the WebSocket connection is
 * established. Connection setup waits for this event before continuing.
 */
export interface SessionCreatedEvent {
  type: "session.created";
  session: {
    /** Identifier of the newly created session. */
    id: string;
    /** Audio format associated with the session. */
    audioFormat: AudioFormat;
  };
}
|
|
106
|
+
|
|
107
|
+
/**
 * Emitted when the session's audio format has been confirmed.
 */
export interface SessionUpdatedEvent {
  type: "session.updated";
  session: {
    /** The confirmed audio format. */
    audioFormat: AudioFormat;
  };
}
|
|
116
|
+
|
|
117
|
+
/**
 * Incremental piece of transcribed text. This is the main event to render
 * when showing a live transcript.
 */
export interface TranscriptionTextDeltaEvent {
  type: "transcription.text.delta";
  /** The newly transcribed chunk of text. */
  text: string;
}
|
|
125
|
+
|
|
126
|
+
/**
 * Emitted once the language of the audio has been detected.
 */
export interface TranscriptionLanguageEvent {
  type: "transcription.language";
  /** The detected language of the audio. */
  audioLanguage: string;
}
|
|
133
|
+
|
|
134
|
+
/**
 * A timestamped transcript segment.
 *
 * NOTE: the WebSocket realtime API does NOT emit this event; the type is
 * included for completeness only.
 */
export interface TranscriptionSegmentEvent {
  type: "transcription.segment";
  /** Segment start, when provided. */
  start?: number;
  /** Segment end, when provided. */
  end?: number;
  /** Text of the segment. */
  text: string;
  /** Speaker identifier, when provided. */
  speakerId?: string;
}
|
|
145
|
+
|
|
146
|
+
/**
 * Final event of a successful transcription; carries the complete transcript.
 * `transcribe()` stops yielding after this event.
 */
export interface TranscriptionDoneEvent {
  type: "transcription.done";
  /** The complete transcript. */
  text: string;
  /** Language of the transcript, when reported. */
  language?: string;
}
|
|
155
|
+
|
|
156
|
+
/**
 * Reports a failure, either received from the server or synthesized locally
 * (WebSocket errors, unparseable messages). `transcribe()` stops yielding
 * after this event.
 */
export interface ErrorEvent {
  type: "error";
  error: {
    /**
     * Error description. Usually a string, but it is not validated, so
     * narrow it before use. (`string | unknown` collapses to `unknown`, so
     * the type is spelled directly.)
     */
    message: unknown;
    /** Error code, when one is supplied. */
    code?: string;
  };
}
|
|
166
|
+
|
|
167
|
+
// ============================================================================
|
|
168
|
+
// Main API
|
|
169
|
+
// ============================================================================
|
|
170
|
+
|
|
171
|
+
/**
 * Handle returned by `createRealtimeTranscriber()`.
 */
export interface RealtimeTranscriber {
  /**
   * Stream audio to the service and receive transcription events as they
   * arrive. Iteration ends after a `transcription.done` or `error` event,
   * or when the connection closes.
   *
   * @param audioStream - Async source of raw audio chunks
   * @param options - Optional per-call settings such as the audio format
   * @returns Async iterable of transcription events
   *
   * @example
   * ```typescript
   * const transcriber = createRealtimeTranscriber({ apiKey: 'xxx' })
   *
   * for await (const event of transcriber.transcribe(audioStream)) {
   *   if (event.type === 'transcription.text.delta') {
   *     console.log(event.text)
   *   } else if (event.type === 'transcription.done') {
   *     console.log('Complete:', event.text)
   *     break
   *   }
   * }
   * ```
   */
  transcribe(
    audioStream: AsyncIterable<Uint8Array>,
    options?: TranscribeOptions
  ): AsyncIterable<RealtimeEvent>;
}
|
|
201
|
+
|
|
202
|
+
/**
 * Create a realtime transcriber instance.
 *
 * Each call to the returned `transcribe()` opens its own connection, sends
 * the audio in the background while yielding server events, and closes the
 * connection when iteration finishes or is abandoned by the consumer.
 *
 * @param config - Configuration including API key and optional model/baseUrl
 * @returns RealtimeTranscriber instance
 * @throws Error when called in a browser (window, document and navigator are
 *   all defined), because browsers are not supported yet
 *
 * @example
 * ```typescript
 * const transcriber = createRealtimeTranscriber({
 *   apiKey: process.env.MISTRAL_API_KEY,
 *   model: 'voxtral-mini-transcribe-realtime-2602', // optional
 *   baseUrl: 'wss://api.mistral.ai', // optional
 * })
 * ```
 */
export function createRealtimeTranscriber(
  config: RealtimeConfig
): RealtimeTranscriber {
  // Check if running in browser - not supported yet
  const isBrowser =
    typeof window !== "undefined" &&
    typeof document !== "undefined" &&
    typeof navigator !== "undefined";

  if (isBrowser) {
    throw new Error(
      "Realtime transcription is not yet supported in browsers. " +
        "Browser WebSocket API does not support authentication headers required by Mistral API. " +
        "Use this API in Node.js or server-side environments only. " +
        "See: https://github.com/wovin/tranz/issues"
    );
  }

  const model = config.model ?? "voxtral-mini-transcribe-realtime-2602";
  const baseUrl = config.baseUrl ?? "wss://api.mistral.ai";

  return {
    async *transcribe(
      audioStream: AsyncIterable<Uint8Array>,
      options?: TranscribeOptions
    ): AsyncIterable<RealtimeEvent> {
      // Merge default audio format with user options
      const audioFormat: AudioFormat = {
        encoding: options?.audioFormat?.encoding ?? AudioEncoding.PcmS16le,
        sampleRate: options?.audioFormat?.sampleRate ?? 16000,
      };

      const connection = await createConnection(
        config.apiKey,
        baseUrl,
        model,
        audioFormat
      );

      // Set once no more audio should be sent; the sender checks it per chunk.
      let stopRequested = false;
      // Outcome of the background sender. The error is recorded rather than
      // left on a dangling promise so it can never become an unhandled
      // rejection while this generator is busy yielding events.
      const sender: { failed: boolean; error: unknown } = {
        failed: false,
        error: undefined,
      };

      try {
        const sendAudioTask = (async () => {
          try {
            for await (const chunk of audioStream) {
              if (stopRequested || connection.isClosed) {
                break;
              }
              await connection.sendAudio(chunk);
            }
          } finally {
            // Always tell the server the audio is over (no-op once closed).
            await connection.endAudio();
          }
        })().catch((err: unknown) => {
          sender.failed = true;
          sender.error = err;
        });

        // Yield events as they arrive
        for await (const event of connection.events()) {
          yield event;

          if (event.type === "transcription.done" || event.type === "error") {
            break;
          }
        }

        // The transcription is over: stop sending further chunks instead of
        // draining a possibly endless (e.g. microphone) stream.
        stopRequested = true;
        await sendAudioTask;

        if (sender.failed) {
          throw sender.error;
        }
      } finally {
        // Also reached when the consumer abandons iteration early.
        stopRequested = true;
        await connection.close();

        // Clean up audio stream if possible (async generators expose return()).
        const maybeReturn = (
          audioStream as {
            return?: () => Promise<IteratorResult<Uint8Array>>;
          }
        ).return;
        if (typeof maybeReturn === "function") {
          await maybeReturn.call(audioStream);
        }
      }
    },
  };
}
|
|
300
|
+
|
|
301
|
+
// ============================================================================
|
|
302
|
+
// WebSocket Connection Implementation
|
|
303
|
+
// ============================================================================
|
|
304
|
+
|
|
305
|
+
/**
 * Internal wrapper around one open realtime WebSocket session.
 */
interface Connection {
  /** True once `close()` was called or the socket is closing/closed. */
  isClosed: boolean;
  /** Yields server events until the socket closes. */
  events(): AsyncGenerator<RealtimeEvent>;
  /** Sends one audio chunk; throws if the connection is already closed. */
  sendAudio(chunk: Uint8Array): Promise<void>;
  /** Signals the end of the audio; does nothing if already closed. */
  endAudio(): Promise<void>;
  /** Closes the socket and waits for its close event. */
  close(): Promise<void>;
}
|
|
312
|
+
|
|
313
|
+
/**
 * Open a WebSocket to the realtime endpoint and wrap it in a `Connection`.
 *
 * Resolves once `waitForSession` has seen the `session.created` message and
 * rejects if that handshake fails.
 *
 * NOTE(review): `audioFormat` is accepted but never used in this function, so
 * a caller-supplied format is not transmitted from here. Confirm whether a
 * session update message should be sent after the handshake.
 *
 * @param apiKey - API key, sent as a Bearer `Authorization` header under Node.js
 * @param baseUrl - WebSocket base URL
 * @param model - Model identifier, passed as a query parameter
 * @param audioFormat - Requested audio format (currently unused, see note)
 * @returns The ready-to-use connection
 */
async function createConnection(
  apiKey: string,
  baseUrl: string,
  model: string,
  audioFormat: AudioFormat
): Promise<Connection> {
  // WebSocket readyState values for the closing and closed states.
  const WS_CLOSING = 2;
  const WS_CLOSED = 3;

  const WebSocketImpl = await getWebSocketImpl();
  const wsUrl = buildWebSocketUrl(baseUrl, model, apiKey);

  // Detect if this is Node.js 'ws' package or browser WebSocket
  const isNodeWs = typeof process !== "undefined" && process.versions?.node;

  // Browser WebSocket doesn't support headers in constructor, Node.js ws does
  const websocket = (
    isNodeWs
      ? new (WebSocketImpl as any)(wsUrl, {
          headers: {
            Authorization: `Bearer ${apiKey}`,
          },
        })
      : new WebSocketImpl(wsUrl)
  ) as WebSocket;

  // Wait for connection and session creation; the event itself is not needed.
  await waitForSession(websocket);

  let closed = false;

  const connection: Connection = {
    get isClosed() {
      return (
        closed ||
        websocket.readyState === WS_CLOSING ||
        websocket.readyState === WS_CLOSED
      );
    },

    async *events(): AsyncGenerator<RealtimeEvent> {
      type QueueItem = {
        kind: "message" | "close" | "error";
        data?: unknown;
        error?: Error;
      };

      // Items that arrived while the consumer was busy.
      const queue: QueueItem[] = [];
      // Resolver of the promise the consumer is currently awaiting, if any.
      const waiter: { resolve: ((item: QueueItem) => void) | null } = {
        resolve: null,
      };
      let done = false;

      // Hand an item straight to a waiting consumer, otherwise buffer it.
      const push = (item: QueueItem) => {
        if (done) return;
        const resolve = waiter.resolve;
        if (resolve) {
          waiter.resolve = null;
          resolve(item);
          return;
        }
        queue.push(item);
      };

      const handleMessage = (event: MessageEvent) => {
        push({ kind: "message", data: event.data });
      };

      const handleClose = () => {
        closed = true;
        push({ kind: "close" });
      };

      const handleError = () => {
        push({
          kind: "error",
          error: new Error("WebSocket connection error"),
        });
      };

      websocket.addEventListener("message", handleMessage);
      websocket.addEventListener("close", handleClose);
      websocket.addEventListener("error", handleError);

      try {
        while (true) {
          const item =
            queue.shift() ??
            (await new Promise<QueueItem>((resolve) => {
              waiter.resolve = resolve;
            }));

          if (item.kind === "close") break;

          if (item.kind === "error") {
            const error =
              item.error ?? new Error("WebSocket connection error");
            const errorEvent: ErrorEvent = {
              type: "error",
              error: { message: error.message },
            };
            yield errorEvent;
            continue;
          }

          yield parseRealtimeEvent(item.data);
        }
      } finally {
        done = true;
        websocket.removeEventListener("message", handleMessage);
        websocket.removeEventListener("close", handleClose);
        websocket.removeEventListener("error", handleError);
        // Release a still-pending wait so it cannot dangle.
        if (waiter.resolve !== null) {
          const resolve = waiter.resolve;
          waiter.resolve = null;
          resolve({ kind: "close" });
        }
      }
    },

    async sendAudio(chunk: Uint8Array): Promise<void> {
      if (connection.isClosed) {
        throw new Error("Connection is closed");
      }

      await sendJson(websocket, {
        type: "input_audio.append",
        audio: arrayBufferToBase64(chunk),
      });
    },

    async endAudio(): Promise<void> {
      if (connection.isClosed) return;
      await sendJson(websocket, { type: "input_audio.end" });
    },

    async close(): Promise<void> {
      if (closed) return;
      closed = true;

      if (websocket.readyState === WS_CLOSED) return;

      // Resolve only after the socket reports that it has closed.
      await new Promise<void>((resolve) => {
        const finalize = () => {
          websocket.removeEventListener("close", finalize);
          resolve();
        };
        websocket.addEventListener("close", finalize);
        websocket.close(1000, "");
      });
    },
  };

  return connection;
}
|
|
465
|
+
|
|
466
|
+
/**
 * Build the realtime transcription endpoint URL.
 *
 * The endpoint path is appended to `baseUrl`, so a base with a path prefix
 * (e.g. `wss://host/proxy`) keeps that prefix whether or not it ends in `/`.
 * Plain relative resolution would replace the last segment of a base that
 * lacks the trailing slash.
 *
 * @param baseUrl - WebSocket base URL, e.g. `wss://api.mistral.ai`
 * @param model - Model identifier, added as the `model` query parameter
 * @param apiKey - Unused here; kept so the signature stays stable
 * @returns The absolute endpoint URL as a string
 * @throws TypeError when `baseUrl` is not a valid absolute URL
 */
function buildWebSocketUrl(
  baseUrl: string,
  model: string,
  apiKey: string
): string {
  const base = new URL(baseUrl);
  if (!base.pathname.endsWith("/")) {
    // Make the base a "directory" so its last segment survives resolution.
    base.pathname += "/";
  }

  const url = new URL("v1/audio/transcriptions/realtime", base);
  url.searchParams.set("model", model);
  return url.toString();
}
|
|
475
|
+
|
|
476
|
+
/**
 * Wait for the `session.created` message that completes the handshake.
 *
 * Messages of any other type are ignored, except `error`, which rejects.
 * All listeners and the timer are removed before the promise settles.
 *
 * @param ws - The freshly created WebSocket
 * @param timeoutMs - How long to wait before giving up (default: 10000 ms)
 * @returns The `session.created` event
 * @throws Error on timeout, on an `error` message, or if the socket closes
 *   or errors before the session is created
 */
async function waitForSession(
  ws: WebSocket,
  timeoutMs: number = 10000
): Promise<SessionCreatedEvent> {
  return new Promise<SessionCreatedEvent>((resolve, reject) => {
    const detach = () => {
      clearTimeout(timer);
      ws.removeEventListener("message", handleMessage);
      ws.removeEventListener("close", handleClose);
      ws.removeEventListener("error", handleError);
    };

    // Reject the handshake, optionally closing a socket that is still open.
    const fail = (reason: unknown, closeSocket: boolean) => {
      detach();
      if (closeSocket) ws.close();
      reject(reason);
    };

    const handleMessage = (event: MessageEvent) => {
      try {
        const parsed = parseRealtimeEvent(event.data);
        if (parsed.type === "session.created") {
          detach();
          resolve(parsed as SessionCreatedEvent);
        } else if (parsed.type === "error") {
          fail(
            new Error(
              `Realtime transcription error: ${JSON.stringify(parsed.error)}`
            ),
            true
          );
        }
      } catch (err) {
        fail(err, true);
      }
    };

    // The socket is already closed or failing here, so it is not closed again.
    const handleClose = () => {
      fail(new Error("WebSocket closed during handshake"), false);
    };

    const handleError = () => {
      fail(new Error("WebSocket error during handshake"), false);
    };

    const timer = setTimeout(() => {
      fail(new Error("Timeout waiting for session creation"), true);
    }, timeoutMs);

    ws.addEventListener("message", handleMessage);
    ws.addEventListener("close", handleClose);
    ws.addEventListener("error", handleError);
  });
}
|
|
528
|
+
|
|
529
|
+
/**
 * Turn a raw WebSocket message into a `RealtimeEvent`.
 *
 * Never throws: malformed input is reported as an `error` event instead.
 * Only the presence of a string `type` field is checked; the remaining
 * fields are trusted as received.
 *
 * @param data - Message payload: a string, or binary data holding UTF-8 JSON
 * @returns The parsed event, or an `error` event describing the problem
 */
function parseRealtimeEvent(data: unknown): RealtimeEvent {
  try {
    const text =
      typeof data === "string"
        ? data
        : new TextDecoder().decode(data as ArrayBuffer);
    const payload: unknown = JSON.parse(text);

    // Valid JSON that is not an object (e.g. `null`) has no `type` either;
    // report it as such rather than as a parse failure.
    const hasStringType =
      typeof payload === "object" &&
      payload !== null &&
      typeof (payload as { type?: unknown }).type === "string";

    if (!hasStringType) {
      return {
        type: "error",
        error: { message: "Invalid event: missing type" },
      };
    }

    return payload as RealtimeEvent;
  } catch (err) {
    return {
      type: "error",
      error: { message: `Failed to parse event: ${err}` },
    };
  }
}
|
|
550
|
+
|
|
551
|
+
/**
 * Serialize `payload` as JSON and send it over the socket.
 *
 * The Node.js 'ws' package reports the send result through a completion
 * callback, which is awaited. A native WebSocket ignores extra arguments
 * and never invokes a callback, so waiting for one would never settle; for
 * those sockets the message is sent and the promise resolves immediately.
 *
 * @param ws - Socket to send on
 * @param payload - JSON-serializable value
 * @throws Whatever `send` throws or reports through its callback
 */
async function sendJson(ws: WebSocket, payload: unknown): Promise<void> {
  const message = JSON.stringify(payload);

  // 'ws' sockets are EventEmitters and expose `on()`; native sockets are
  // EventTargets and do not. That is used to detect callback support.
  const supportsSendCallback = typeof (ws as any).on === "function";

  if (!supportsSendCallback) {
    ws.send(message);
    return;
  }

  await new Promise<void>((resolve, reject) => {
    (ws as any).send(message, (err: Error | null | undefined) => {
      if (err) reject(err);
      else resolve();
    });
  });
}
|
|
575
|
+
|
|
576
|
+
/**
 * Base64-encode a byte array.
 *
 * Uses `Buffer` when it exists, wrapping the bytes without copying them.
 * Otherwise falls back to `btoa`, building the intermediate binary string
 * in bounded chunks rather than one string per byte.
 *
 * @param buffer - Bytes to encode (views into a larger buffer are respected)
 * @returns Standard base64 text, padded with `=`
 * @throws Error when neither `Buffer` nor `btoa` is available
 */
function arrayBufferToBase64(buffer: Uint8Array): string {
  // Node.js (and other runtimes that provide Buffer)
  if (typeof Buffer !== "undefined") {
    return Buffer.from(
      buffer.buffer,
      buffer.byteOffset,
      buffer.byteLength
    ).toString("base64");
  }

  // Browser
  if (typeof btoa !== "undefined") {
    // Stay well below engine limits on the number of call arguments.
    const CHUNK_SIZE = 0x8000;
    let binary = "";
    for (let offset = 0; offset < buffer.length; offset += CHUNK_SIZE) {
      binary += String.fromCharCode(
        ...buffer.subarray(offset, offset + CHUNK_SIZE)
      );
    }
    return btoa(binary);
  }

  throw new Error("No base64 encoding available");
}
|
|
592
|
+
|
|
593
|
+
// ============================================================================
|
|
594
|
+
// Helper Functions
|
|
595
|
+
// ============================================================================
|
|
596
|
+
|
|
597
|
+
/**
 * What the audio capture helpers return: the audio itself plus a way to
 * end the capture.
 */
export interface AudioCaptureResult {
  /** Async generator producing captured audio chunks. */
  stream: AsyncGenerator<Uint8Array, void, unknown>;
  /** Stops the capture. */
  stop: () => void;
}
|
|
606
|
+
|
|
607
|
+
/**
 * Capture audio from microphone using SoX `rec` command (Node.js only)
 *
 * Yields PCM 16-bit signed mono audio chunks, as produced by `rec`, suitable
 * for realtime transcription.
 *
 * **Requirements:**
 * - SoX audio tools must be installed
 *   - macOS: `brew install sox`
 *   - Linux: `sudo apt install sox`
 *
 * **Errors:** if `rec` cannot be started (for example because SoX is not
 * installed), the error is thrown from `stream` while it is being iterated.
 * This function never terminates the host process.
 *
 * **Note:** This is Node.js only. For browser audio capture, use `captureAudioFromBrowser()`
 *
 * @param sampleRate - Sample rate in Hz (default: 16000)
 * @returns Object with audio stream and stop function
 * @throws Error when not running under Node.js
 *
 * @example
 * ```typescript
 * const { stream, stop } = await captureAudioFromMicrophone(16000)
 *
 * try {
 *   for await (const event of transcriber.transcribe(stream)) {
 *     // ... handle events
 *   }
 * } finally {
 *   stop() // Clean up audio capture
 * }
 * ```
 */
export async function captureAudioFromMicrophone(
  sampleRate: number = 16000
): Promise<AudioCaptureResult> {
  // Check if we're in Node.js
  if (typeof process === "undefined" || !process.versions?.node) {
    throw new Error(
      "captureAudioFromMicrophone() is Node.js only. Use captureAudioFromBrowser() in browsers."
    );
  }

  // Dynamic import for Node.js child_process (ES module compatible)
  const { spawn } = await import("node:child_process");

  const recorder = spawn(
    "rec",
    [
      "-q", // Quiet mode
      "-t",
      "raw", // Raw PCM output
      "-b",
      "16", // 16-bit samples
      "-e",
      "signed-integer", // Signed PCM
      "-r",
      String(sampleRate), // Sample rate
      "-c",
      "1", // Mono (1 channel)
      "-", // Output to stdout
    ],
    { stdio: ["ignore", "pipe", "ignore"] }
  );

  // Remember a spawn failure so the stream can report it to its consumer.
  // Exiting the process or throwing inside the listener would take down the
  // host application instead.
  let spawnError: Error | undefined;
  recorder.on("error", (err: Error) => {
    const code = (err as Error & { code?: unknown }).code;
    spawnError =
      code === "ENOENT"
        ? new Error(
            "'rec' command not found. Please install SoX " +
              "(macOS: brew install sox, Linux: sudo apt install sox)"
          )
        : err;
  });

  const killRecorder = () => {
    if (!recorder.killed) {
      recorder.kill("SIGTERM");
    }
  };

  const stream = (async function* (): AsyncGenerator<Uint8Array, void, unknown> {
    try {
      if (spawnError) throw spawnError;
      if (!recorder.stdout) {
        throw new Error("Failed to create audio capture stream");
      }

      try {
        for await (const chunk of recorder.stdout) {
          yield new Uint8Array(chunk as Buffer);
        }
      } catch (err) {
        // Prefer the root cause over a secondary stream error.
        throw spawnError ?? err;
      }

      // stdout may simply end when the process never started.
      if (spawnError) throw spawnError;
    } finally {
      killRecorder();
    }
  })();

  return { stream, stop: killRecorder };
}
|
|
703
|
+
|
|
704
|
+
/**
 * Capture audio from browser microphone using Web Audio API
 *
 * **CURRENTLY DISABLED** - Browser support is not available yet due to
 * WebSocket authentication limitations with Mistral API. The function
 * rejects immediately; the implementation below the `throw` is unreachable
 * and kept only for when browser support is enabled.
 *
 * @deprecated Browser realtime transcription is not yet supported.
 * Use captureAudioFromMicrophone() in Node.js instead.
 *
 * @param sampleRate - Target sample rate in Hz (default: 16000)
 * @returns Object with audio stream and stop function
 *
 * @throws Error - Always throws as browser mode is disabled
 *
 * @todo Enable when Mistral API supports browser WebSocket authentication
 * @todo Migrate to AudioWorklet for better performance
 * See: https://web.dev/patterns/media/microphone-process/
 */
export async function captureAudioFromBrowser(
  sampleRate: number = 16000
): Promise<AudioCaptureResult> {
  throw new Error(
    "Browser realtime transcription is not yet supported. " +
      "Browser WebSocket API does not support authentication headers required by Mistral API. " +
      "Use captureAudioFromMicrophone() in Node.js environments instead."
  );

  // Implementation disabled - kept for future when auth is resolved
  /* istanbul ignore next */
  if (typeof navigator === "undefined" || !navigator.mediaDevices) {
    throw new Error(
      "captureAudioFromBrowser() requires a browser environment with getUserMedia support"
    );
  }

  const mediaStream = await navigator.mediaDevices.getUserMedia({
    audio: {
      channelCount: 1,
      sampleRate,
      echoCancellation: true,
      noiseSuppression: true,
    },
  });

  const audioContext = new AudioContext({ sampleRate });
  const source = audioContext.createMediaStreamSource(mediaStream);
  const processor = audioContext.createScriptProcessor(4096, 1, 1);

  let stopped = false;
  // Encoded chunks waiting to be yielded.
  const pending: Uint8Array[] = [];
  // Wakes the generator while it waits for the next chunk (or for stop()).
  let wake: (() => void) | null = null;

  const notify = () => {
    if (wake) {
      const resume = wake;
      wake = null;
      resume();
    }
  };

  processor.onaudioprocess = (event) => {
    if (stopped) return;

    const inputData = event.inputBuffer.getChannelData(0);

    // Convert Float32 samples to PCM S16LE bytes in a single pass.
    const bytes = new Uint8Array(inputData.length * 2);
    const view = new DataView(bytes.buffer);
    for (let i = 0; i < inputData.length; i++) {
      const sample = Math.max(-1, Math.min(1, inputData[i]));
      view.setInt16(i * 2, sample < 0 ? sample * 0x8000 : sample * 0x7fff, true);
    }

    // Always queue the chunk, then wake the consumer, so a chunk that
    // arrives while the generator is waiting is not lost.
    pending.push(bytes);
    notify();
  };

  source.connect(processor);
  processor.connect(audioContext.destination);

  const stream = (async function* (): AsyncGenerator<Uint8Array, void, unknown> {
    try {
      while (!stopped) {
        const next = pending.shift();
        if (next !== undefined) {
          yield next;
        } else {
          await new Promise<void>((resolve) => {
            wake = resolve;
          });
        }
      }
    } finally {
      processor.disconnect();
      source.disconnect();
      mediaStream.getTracks().forEach((track) => track.stop());
      await audioContext.close();
    }
  })();

  const stop = () => {
    stopped = true;
    notify();
  };

  return { stream, stop };
}
|