@wovin/tranz 0.1.36 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,821 @@
1
+ /**
2
+ * Realtime transcription API for Mistral's WebSocket-based transcription service
3
+ *
4
+ * Provides a simple, event-driven interface for streaming audio transcription.
5
+ * Users provide audio as AsyncIterable<Uint8Array> and receive typed events.
6
+ *
7
+ * Runtime note: uses the 'ws' package in Node.js and a native WebSocket elsewhere (e.g. Deno). Browsers are currently rejected by createRealtimeTranscriber().
8
+ *
9
+ * @example
10
+ * ```typescript
11
+ * import { createRealtimeTranscriber } from '@wovin/tranz/realtime'
12
+ *
13
+ * const transcriber = createRealtimeTranscriber({
14
+ * apiKey: process.env.MISTRAL_API_KEY,
15
+ * })
16
+ *
17
+ * for await (const event of transcriber.transcribe(audioStream)) {
18
+ * if (event.type === 'transcription.text.delta') {
19
+ * process.stdout.write(event.text)
20
+ * }
21
+ * }
22
+ * ```
23
+ */
24
+
25
+ import { getWebSocketImpl } from "./runtime.js";
26
+
27
+ // ============================================================================
28
+ // Type Definitions
29
+ // ============================================================================
30
+
31
/**
 * Raw PCM sample encodings that can be requested for a realtime session.
 *
 * Each value is the wire identifier. Identifiers follow the pattern
 * `pcm_<kind><bits><endianness>`, e.g. `pcm_s16le`.
 */
export enum AudioEncoding {
  // 16-bit
  PcmS16le = "pcm_s16le",
  PcmS16be = "pcm_s16be",
  PcmU16le = "pcm_u16le",
  PcmU16be = "pcm_u16be",
  // 24-bit
  PcmS24le = "pcm_s24le",
  PcmS24be = "pcm_s24be",
  PcmU24le = "pcm_u24le",
  PcmU24be = "pcm_u24be",
  // 32-bit
  PcmS32le = "pcm_s32le",
  PcmS32be = "pcm_s32be",
  PcmU32le = "pcm_u32le",
  PcmU32be = "pcm_u32be",
  PcmF32le = "pcm_f32le",
  PcmF32be = "pcm_f32be",
  // 64-bit
  PcmF64le = "pcm_f64le",
  PcmF64be = "pcm_f64be",
}
52
+
53
/**
 * Describes the raw audio that will be streamed to the service.
 *
 * Both fields are required here; `transcribe()` fills in the defaults
 * (`pcm_s16le`, 16000 Hz) when the caller supplies a partial format.
 */
export interface AudioFormat {
  /** Sample encoding of the streamed audio. */
  encoding: AudioEncoding;
  /** Sampling rate in Hz. */
  sampleRate: number;
}
62
+
63
/**
 * Settings accepted by `createRealtimeTranscriber()`.
 */
export interface RealtimeConfig {
  /** Mistral API key, sent as a Bearer token when the socket is opened. */
  apiKey: string;
  /** Model ID; falls back to `voxtral-mini-transcribe-realtime-2602`. */
  model?: string;
  /** WebSocket base URL; falls back to `wss://api.mistral.ai`. */
  baseUrl?: string;
}
74
+
75
/**
 * Per-call options for `RealtimeTranscriber.transcribe()`.
 */
export interface TranscribeOptions {
  /** Overrides for the audio format; omitted fields keep their defaults. */
  audioFormat?: Partial<AudioFormat>;
}
82
+
83
/**
 * Every event that `transcribe()` can yield.
 *
 * The union is discriminated by the `type` field, so consumers can narrow
 * with a plain `event.type === "..."` comparison.
 */
export type RealtimeEvent =
  | SessionCreatedEvent
  | SessionUpdatedEvent
  | TranscriptionTextDeltaEvent
  | TranscriptionLanguageEvent
  | TranscriptionSegmentEvent
  | TranscriptionDoneEvent
  | ErrorEvent;
95
+
96
/**
 * First event of a session. The connection handshake waits for it before
 * the connection is handed to the caller.
 */
export interface SessionCreatedEvent {
  type: "session.created";
  session: {
    /** Session identifier. */
    id: string;
    /** Audio format associated with the session. */
    audioFormat: AudioFormat;
  };
}
106
+
107
/**
 * Reports the audio format the session is using after it has been confirmed.
 */
export interface SessionUpdatedEvent {
  type: "session.updated";
  session: {
    /** Audio format now in effect for the session. */
    audioFormat: AudioFormat;
  };
}
116
+
117
/**
 * Incremental piece of transcript text.
 *
 * This is the event to render for live captions: append `text` as each
 * delta arrives.
 */
export interface TranscriptionTextDeltaEvent {
  type: "transcription.text.delta";
  /** Newly transcribed text fragment. */
  text: string;
}
125
+
126
/**
 * Announces the language detected in the audio.
 */
export interface TranscriptionLanguageEvent {
  type: "transcription.language";
  /** Detected language of the audio. */
  audioLanguage: string;
}
133
+
134
/**
 * Timestamped transcript segment.
 *
 * NOTE: the WebSocket realtime API does NOT emit this event; the type is
 * kept so the event union is complete.
 */
export interface TranscriptionSegmentEvent {
  type: "transcription.segment";
  /** Segment start, when provided. */
  start?: number;
  /** Segment end, when provided. */
  end?: number;
  /** Transcript text of the segment. */
  text: string;
  /** Speaker identifier, when provided. */
  speakerId?: string;
}
145
+
146
/**
 * Terminal event of a successful transcription.
 *
 * `transcribe()` stops iterating once this event has been yielded.
 */
export interface TranscriptionDoneEvent {
  type: "transcription.done";
  /** The complete transcript. */
  text: string;
  /** Language of the transcript, when reported. */
  language?: string;
}
155
+
156
/**
 * Reports a failure: an error sent by the service, a WebSocket error, or a
 * message that could not be parsed.
 *
 * `transcribe()` stops iterating once this event has been yielded.
 */
export interface ErrorEvent {
  type: "error";
  error: {
    /**
     * Description of the failure. The declared type `string | unknown`
     * is equivalent to `unknown`, so narrow it before using it as a string.
     */
    message: string | unknown;
    /** Error code, when one is provided. */
    code?: string;
  };
}
166
+
167
+ // ============================================================================
168
+ // Main API
169
+ // ============================================================================
170
+
171
/**
 * Handle returned by `createRealtimeTranscriber()`.
 */
export interface RealtimeTranscriber {
  /**
   * Stream audio to the service and receive transcription events.
   *
   * Iteration ends after a `transcription.done` or `error` event, or when
   * the connection closes.
   *
   * @param audioStream - Source of raw audio chunks
   * @param options - Optional audio format overrides
   * @returns Events in the order they arrive
   *
   * @example
   * ```typescript
   * const transcriber = createRealtimeTranscriber({ apiKey: 'xxx' })
   *
   * for await (const event of transcriber.transcribe(audioStream)) {
   *   if (event.type === 'transcription.text.delta') {
   *     console.log(event.text)
   *   } else if (event.type === 'transcription.done') {
   *     console.log('Complete:', event.text)
   *     break
   *   }
   * }
   * ```
   */
  transcribe(
    audioStream: AsyncIterable<Uint8Array>,
    options?: TranscribeOptions
  ): AsyncIterable<RealtimeEvent>;
}
201
+
202
/**
 * Create a realtime transcriber bound to one API key, model and endpoint.
 *
 * Each call to `transcribe()` opens its own connection, streams the audio
 * in the background while yielding events, and closes the connection when
 * iteration finishes or is abandoned.
 *
 * @param config - API key plus optional model / base URL overrides
 * @returns RealtimeTranscriber instance
 * @throws Error when called in a browser (window, document and navigator
 *   are all defined), because the browser WebSocket API cannot send the
 *   required authentication header
 *
 * @example
 * ```typescript
 * const transcriber = createRealtimeTranscriber({
 *   apiKey: process.env.MISTRAL_API_KEY,
 *   model: 'voxtral-mini-transcribe-realtime-2602', // optional
 *   baseUrl: 'wss://api.mistral.ai', // optional
 * })
 * ```
 */
export function createRealtimeTranscriber(
  config: RealtimeConfig
): RealtimeTranscriber {
  // Browser environments are rejected up front - not supported yet.
  const isBrowser =
    typeof window !== "undefined" &&
    typeof document !== "undefined" &&
    typeof navigator !== "undefined";

  if (isBrowser) {
    throw new Error(
      "Realtime transcription is not yet supported in browsers. " +
        "Browser WebSocket API does not support authentication headers required by Mistral API. " +
        "Use this API in Node.js or server-side environments only. " +
        "See: https://github.com/wovin/tranz/issues"
    );
  }

  const model = config.model ?? "voxtral-mini-transcribe-realtime-2602";
  const baseUrl = config.baseUrl ?? "wss://api.mistral.ai";

  return {
    async *transcribe(
      audioStream: AsyncIterable<Uint8Array>,
      options?: TranscribeOptions
    ): AsyncIterable<RealtimeEvent> {
      // Fill in defaults for whatever the caller left out.
      const audioFormat: AudioFormat = {
        encoding: options?.audioFormat?.encoding ?? AudioEncoding.PcmS16le,
        sampleRate: options?.audioFormat?.sampleRate ?? 16000,
      };

      const connection = await createConnection(
        config.apiKey,
        baseUrl,
        model,
        audioFormat
      );

      // Tells the background sender to stop pulling audio. It is set as soon
      // as event iteration ends so an endless source (e.g. a microphone)
      // cannot keep the sender - and therefore this generator - alive.
      let stopRequested = false;

      try {
        const sendAudioTask = (async () => {
          try {
            for await (const chunk of audioStream) {
              if (stopRequested || connection.isClosed) {
                break;
              }
              await connection.sendAudio(chunk);
            }
          } finally {
            await connection.endAudio();
          }
        })();

        // The task is only awaited after the event loop below. Attach a
        // no-op handler now so a failure in the meantime is not reported as
        // an unhandled rejection; the `await` further down still rethrows it.
        sendAudioTask.catch(() => {});

        for await (const event of connection.events()) {
          yield event;

          if (event.type === "transcription.done" || event.type === "error") {
            break;
          }
        }

        // No more events will be consumed: stop sending, then surface any
        // error raised by the audio source or the socket.
        stopRequested = true;
        await sendAudioTask;
      } finally {
        // Also reached when the consumer abandons iteration early.
        stopRequested = true;
        await connection.close();

        // Give the audio source a chance to release its resources when it
        // is a generator-like object exposing return().
        const maybeReturn = (
          audioStream as {
            return?: () => Promise<IteratorResult<Uint8Array>>;
          }
        ).return;
        if (typeof maybeReturn === "function") {
          await maybeReturn.call(audioStream);
        }
      }
    },
  };
}
300
+
301
+ // ============================================================================
302
+ // WebSocket Connection Implementation
303
+ // ============================================================================
304
+
305
/**
 * Internal handle around one open realtime WebSocket session.
 */
interface Connection {
  /** True once the connection has been closed or the socket is closing/closed. */
  isClosed: boolean;
  /** Events received from the socket, in arrival order, until it closes. */
  events(): AsyncGenerator<RealtimeEvent>;
  /** Send one chunk of raw audio. */
  sendAudio(chunk: Uint8Array): Promise<void>;
  /** Tell the service that no more audio will follow. */
  endAudio(): Promise<void>;
  /** Close the socket and wait for it to finish closing. */
  close(): Promise<void>;
}
312
+
313
/**
 * Open a WebSocket to the realtime endpoint, wait for the session to be
 * created, and wrap the socket in a `Connection`.
 *
 * NOTE(review): `audioFormat` is accepted but never referenced in this
 * function, so the requested format is not transmitted from here - confirm
 * whether a session update message is expected by the service.
 *
 * @param apiKey - API key, sent as a Bearer Authorization header under Node.js
 * @param baseUrl - WebSocket base URL
 * @param model - Model ID, passed as a query parameter
 * @param audioFormat - Requested audio format (currently unused, see note)
 * @returns A connection whose session has been created
 * @throws Error when the handshake fails or times out (from waitForSession)
 */
async function createConnection(
  apiKey: string,
  baseUrl: string,
  model: string,
  audioFormat: AudioFormat
): Promise<Connection> {
  // WebSocket readyState values (same numbers in browsers and 'ws').
  const WS_CLOSING = 2;
  const WS_CLOSED = 3;

  const WebSocketImpl = await getWebSocketImpl();
  const wsUrl = buildWebSocketUrl(baseUrl, model, apiKey);

  // Detect if this is Node.js 'ws' package or browser WebSocket
  const isNodeWs = typeof process !== "undefined" && process.versions?.node;

  // Browser WebSocket doesn't support headers in constructor, Node.js ws does
  const ws = isNodeWs
    ? // eslint-disable-next-line @typescript-eslint/no-explicit-any -- 'ws' takes an options argument the DOM constructor type lacks
      new (WebSocketImpl as any)(wsUrl, {
        headers: {
          Authorization: `Bearer ${apiKey}`,
        },
      })
    : new WebSocketImpl(wsUrl);
  const websocket = ws as WebSocket;

  // Resolves once "session.created" arrives; rejects on error/close/timeout.
  await waitForSession(websocket);

  let closed = false;

  const connection: Connection = {
    get isClosed() {
      return (
        closed ||
        websocket.readyState === WS_CLOSING ||
        websocket.readyState === WS_CLOSED
      );
    },

    async *events(): AsyncGenerator<RealtimeEvent> {
      // A socket that is already fully closed will never emit "close"
      // again, so waiting for one would block forever.
      if (websocket.readyState === WS_CLOSED) {
        closed = true;
        return;
      }

      type QueueItem = {
        kind: "message" | "close" | "error";
        data?: unknown;
        error?: Error;
      };
      const queue: QueueItem[] = [];
      // Set while the loop below is parked waiting for the next item.
      let resolver: ((item: QueueItem) => void) | null = null;
      let done = false;

      // Hand the item straight to a parked consumer, otherwise buffer it.
      const push = (item: QueueItem) => {
        if (done) return;
        if (resolver) {
          const resolve = resolver;
          resolver = null;
          resolve(item);
          return;
        }
        queue.push(item);
      };

      const handleMessage = (event: MessageEvent) => {
        push({ kind: "message", data: event.data });
      };

      const handleClose = () => {
        closed = true;
        push({ kind: "close" });
      };

      const handleError = () => {
        push({
          kind: "error",
          error: new Error("WebSocket connection error"),
        });
      };

      websocket.addEventListener("message", handleMessage);
      websocket.addEventListener("close", handleClose);
      websocket.addEventListener("error", handleError);

      try {
        while (true) {
          const item =
            queue.length > 0
              ? queue.shift()!
              : await new Promise<QueueItem>((resolve) => {
                  resolver = resolve;
                });

          if (item.kind === "close") break;

          if (item.kind === "error") {
            const error =
              item.error ?? new Error("WebSocket connection error");
            yield {
              type: "error",
              error: { message: error.message },
            } as ErrorEvent;
            continue;
          }

          yield parseRealtimeEvent(item.data);
        }
      } finally {
        // Runs on normal completion and when the consumer stops early.
        done = true;
        websocket.removeEventListener("message", handleMessage);
        websocket.removeEventListener("close", handleClose);
        websocket.removeEventListener("error", handleError);
        if (resolver !== null) {
          const resolve = resolver;
          resolver = null;
          resolve({ kind: "close" });
        }
      }
    },

    async sendAudio(chunk: Uint8Array): Promise<void> {
      if (connection.isClosed) {
        throw new Error("Connection is closed");
      }

      await sendJson(websocket, {
        type: "input_audio.append",
        audio: arrayBufferToBase64(chunk),
      });
    },

    async endAudio(): Promise<void> {
      if (connection.isClosed) return;
      await sendJson(websocket, { type: "input_audio.end" });
    },

    async close(): Promise<void> {
      if (closed) return;
      closed = true;

      if (websocket.readyState === WS_CLOSED) return;

      // Resolve only once the socket reports that it has closed.
      await new Promise<void>((resolve) => {
        const finalize = () => {
          websocket.removeEventListener("close", finalize);
          resolve();
        };
        websocket.addEventListener("close", finalize);
        websocket.close(1000, "");
      });
    },
  };

  return connection;
}
465
+
466
/**
 * Build the realtime transcription endpoint URL.
 *
 * The endpoint path is appended to `baseUrl`, keeping any path prefix the
 * base already has (e.g. a reverse-proxy mount point), whether or not the
 * base ends with a slash.
 *
 * @param baseUrl - WebSocket base URL, e.g. `wss://api.mistral.ai`
 * @param model - Model ID, added as the `model` query parameter
 * @param apiKey - Not used here: the key is never placed in the URL
 * @returns The absolute endpoint URL as a string
 * @throws TypeError when `baseUrl` is not a valid absolute URL
 */
function buildWebSocketUrl(
  baseUrl: string,
  model: string,
  apiKey: string
): string {
  const base = new URL(baseUrl);
  // Relative resolution replaces the last path segment unless the base path
  // ends with "/", which would silently drop a prefix such as "/mistral".
  if (!base.pathname.endsWith("/")) {
    base.pathname += "/";
  }

  const url = new URL("v1/audio/transcriptions/realtime", base);
  url.searchParams.set("model", model);
  return url.toString();
}
475
+
476
/**
 * Wait until the server announces the session on a freshly opened socket.
 *
 * Resolves with the first "session.created" event. Messages of any other
 * type except "error" are ignored. All listeners and the timer are removed
 * before the promise settles.
 *
 * @param ws - Socket that is connecting or has just connected
 * @param timeoutMs - How long to wait before giving up (default: 10000)
 * @returns The "session.created" event
 * @throws Error on timeout, on an "error" event, or when the socket errors
 *   or closes first; socket error/close details are appended to the
 *   message when the event carries them
 */
async function waitForSession(
  ws: WebSocket,
  timeoutMs: number = 10000
): Promise<SessionCreatedEvent> {
  return new Promise((resolve, reject) => {
    let timer: ReturnType<typeof setTimeout> | undefined;

    const cleanup = () => {
      clearTimeout(timer);
      ws.removeEventListener("message", handleMessage);
      ws.removeEventListener("close", handleClose);
      ws.removeEventListener("error", handleError);
    };

    // Settle as failed; optionally close the socket when it is still open.
    const fail = (reason: unknown, closeSocket: boolean) => {
      cleanup();
      if (closeSocket) ws.close();
      reject(reason);
    };

    const handleMessage = (event: MessageEvent) => {
      try {
        const parsed = parseRealtimeEvent(event.data);
        if (parsed.type === "session.created") {
          cleanup();
          resolve(parsed as SessionCreatedEvent);
        } else if (parsed.type === "error") {
          fail(
            new Error(
              `Realtime transcription error: ${JSON.stringify(parsed.error)}`
            ),
            true
          );
        }
      } catch (err) {
        fail(err, true);
      }
    };

    const handleClose = (event?: unknown) => {
      const info = event as { code?: unknown; reason?: unknown } | undefined;
      let detail = "";
      if (typeof info?.code === "number") {
        const reason =
          typeof info.reason === "string" && info.reason.length > 0
            ? `: ${info.reason}`
            : "";
        detail = ` (code ${info.code}${reason})`;
      }
      fail(new Error(`WebSocket closed during handshake${detail}`), false);
    };

    const handleError = (event?: unknown) => {
      // Error events may carry the cause as `message` or as `error.message`
      // (e.g. a rejected upgrade); keep it so failures can be diagnosed.
      const info = event as
        | { message?: unknown; error?: { message?: unknown } | null }
        | undefined;
      const message = info?.error?.message ?? info?.message;
      const detail =
        typeof message === "string" && message.length > 0 ? `: ${message}` : "";
      fail(new Error(`WebSocket error during handshake${detail}`), false);
    };

    timer = setTimeout(() => {
      fail(new Error("Timeout waiting for session creation"), true);
    }, timeoutMs);

    ws.addEventListener("message", handleMessage);
    ws.addEventListener("close", handleClose);
    ws.addEventListener("error", handleError);
  });
}
528
+
529
/**
 * Decode one raw WebSocket message into a realtime event.
 *
 * Never throws: anything that cannot be decoded or does not look like an
 * event is reported as an `error` event instead.
 *
 * @param data - Message payload: a string, or binary data that
 *   `TextDecoder` can decode
 * @returns The parsed event. The payload is trusted once it is an object
 *   with a string `type`; its other fields are not validated.
 */
function parseRealtimeEvent(data: unknown): RealtimeEvent {
  let payload: unknown;
  try {
    const text =
      typeof data === "string"
        ? data
        : new TextDecoder().decode(data as ArrayBuffer);
    payload = JSON.parse(text);
  } catch (err) {
    return {
      type: "error",
      error: { message: `Failed to parse event: ${err}` },
    };
  }

  // Valid JSON is not necessarily an event: `null`, numbers, strings and
  // objects without a string `type` are all rejected the same way.
  const hasType =
    typeof payload === "object" &&
    payload !== null &&
    typeof (payload as { type?: unknown }).type === "string";

  if (!hasType) {
    return {
      type: "error",
      error: { message: "Invalid event: missing type" },
    };
  }

  return payload as RealtimeEvent;
}
550
+
551
/**
 * Serialize `payload` as JSON and send it over the socket.
 *
 * The Node.js 'ws' package reports delivery through a callback, so the
 * returned promise settles when that callback fires. A native WebSocket
 * ignores extra arguments and never calls a callback, so for it the promise
 * resolves as soon as `send()` returns. The two are told apart by the
 * EventEmitter-style `on` method, which only the 'ws' socket has.
 *
 * @param ws - Open socket (Node.js 'ws' or native WebSocket)
 * @param payload - Any JSON-serializable value
 * @throws Whatever `JSON.stringify` or `send()` throws, or the error passed
 *   to the 'ws' send callback
 */
async function sendJson(ws: WebSocket, payload: unknown): Promise<void> {
  const message = JSON.stringify(payload);

  const candidate: unknown = ws;
  const usesSendCallback =
    typeof (candidate as { on?: unknown }).on === "function";

  if (!usesSendCallback) {
    // Native WebSocket: fire-and-forget, throws synchronously if not open.
    ws.send(message);
    return;
  }

  type CallbackSend = (
    data: string,
    callback: (err?: Error | null) => void
  ) => void;
  const sendWithCallback = (candidate as { send: unknown }).send as CallbackSend;

  await new Promise<void>((resolve, reject) => {
    sendWithCallback.call(ws, message, (err) => {
      if (err) reject(err);
      else resolve();
    });
  });
}
575
+
576
/**
 * Encode raw bytes as a base64 string.
 *
 * `Buffer` is tried first: Node.js also defines a global `btoa`, so testing
 * for `btoa` first would send every audio chunk through the slow
 * string-building path below.
 *
 * @param buffer - Bytes to encode (a view into a larger buffer is fine)
 * @returns Standard base64 text, padded with `=`
 * @throws Error when neither `Buffer` nor `btoa` is available
 */
function arrayBufferToBase64(buffer: Uint8Array): string {
  // Node.js: wrap the same memory (no copy), honouring the view's window.
  if (typeof Buffer !== "undefined") {
    return Buffer.from(
      buffer.buffer,
      buffer.byteOffset,
      buffer.byteLength
    ).toString("base64");
  }

  // Browser-like runtimes: btoa() wants a "binary string", one char per byte.
  if (typeof btoa !== "undefined") {
    // Convert in slices to stay below argument-count limits for large chunks.
    const SLICE = 0x8000;
    let binary = "";
    for (let offset = 0; offset < buffer.length; offset += SLICE) {
      binary += String.fromCharCode(...buffer.subarray(offset, offset + SLICE));
    }
    return btoa(binary);
  }

  throw new Error("No base64 encoding available");
}
592
+
593
+ // ============================================================================
594
+ // Helper Functions
595
+ // ============================================================================
596
+
597
/**
 * What the audio capture helpers return: the audio itself plus a way to
 * stop capturing.
 */
export interface AudioCaptureResult {
  /** Captured audio, delivered chunk by chunk. */
  stream: AsyncGenerator<Uint8Array, void, unknown>;
  /** Stops the capture. */
  stop: () => void;
}
606
+
607
/**
 * Capture audio from the microphone using the SoX `rec` command (Node.js only)
 *
 * Spawns `rec` asking for raw 16-bit signed-integer mono PCM on stdout and
 * yields each stdout chunk as a `Uint8Array`.
 *
 * **Requirements:**
 * - SoX audio tools must be installed
 *   - macOS: `brew install sox`
 *   - Linux: `sudo apt install sox`
 *
 * Failures to start `rec` (for example when SoX is not installed) are
 * raised from the returned stream when it is iterated; they do not
 * terminate the process.
 *
 * **Note:** This is Node.js only. For browser audio capture, use `captureAudioFromBrowser()`
 *
 * @param sampleRate - Sample rate in Hz (default: 16000)
 * @returns Object with audio stream and stop function
 * @throws Error when not running under Node.js
 * @throws RangeError when `sampleRate` is not a positive, finite number
 *
 * @example
 * ```typescript
 * const { stream, stop } = await captureAudioFromMicrophone(16000)
 *
 * try {
 *   for await (const event of transcriber.transcribe(stream)) {
 *     // ... handle events
 *   }
 * } finally {
 *   stop() // Clean up audio capture
 * }
 * ```
 */
export async function captureAudioFromMicrophone(
  sampleRate: number = 16000
): Promise<AudioCaptureResult> {
  if (typeof process === "undefined" || !process.versions?.node) {
    throw new Error(
      "captureAudioFromMicrophone() is Node.js only. Use captureAudioFromBrowser() in browsers."
    );
  }

  if (!Number.isFinite(sampleRate) || sampleRate <= 0) {
    throw new RangeError(
      `sampleRate must be a positive number of Hz, got ${sampleRate}`
    );
  }

  // Dynamic import for Node.js child_process (ES module compatible)
  const { spawn } = await import("node:child_process");

  const recorder = spawn(
    "rec",
    [
      "-q", // Quiet mode
      "-t",
      "raw", // Raw PCM output
      "-b",
      "16", // 16-bit samples
      "-e",
      "signed-integer", // Signed PCM
      "-r",
      String(sampleRate), // Sample rate
      "-c",
      "1", // Mono (1 channel)
      "-", // Output to stdout
    ],
    { stdio: ["ignore", "pipe", "ignore"] }
  );

  // Spawn failures arrive asynchronously as an "error" event. Remember the
  // failure and let the stream raise it, rather than exiting the process or
  // throwing from inside the event handler.
  let captureError: Error | undefined;
  recorder.on("error", (err: Error & { code?: string }) => {
    captureError =
      err.code === "ENOENT"
        ? new Error(
            "'rec' command not found. Please install SoX:" +
              "\n  macOS: brew install sox" +
              "\n  Linux: sudo apt install sox"
          )
        : err;
  });

  const killRecorder = () => {
    if (!recorder.killed) {
      recorder.kill("SIGTERM");
    }
  };

  const stream = (async function* () {
    try {
      if (!recorder.stdout) {
        throw new Error("Failed to create audio capture stream");
      }
      try {
        for await (const chunk of recorder.stdout) {
          yield new Uint8Array(chunk as Buffer);
        }
      } catch (err) {
        // Prefer the recorder's own failure over a secondary pipe error.
        throw captureError ?? err;
      }
      // stdout may simply end when the recorder never started.
      if (captureError) {
        throw captureError;
      }
    } finally {
      killRecorder();
    }
  })();

  return { stream, stop: killRecorder };
}
703
+
704
/**
 * Capture audio from browser microphone using Web Audio API
 *
 * **CURRENTLY DISABLED** - Browser support is not available yet due to
 * WebSocket authentication limitations with Mistral API. The function
 * throws on its first statement; the implementation after it is
 * unreachable and kept only so it can be re-enabled later.
 *
 * @deprecated Browser realtime transcription is not yet supported.
 * Use captureAudioFromMicrophone() in Node.js instead.
 *
 * @param sampleRate - Target sample rate in Hz (default: 16000)
 * @returns Object with audio stream and stop function
 *
 * @throws Error - Always throws as browser mode is disabled
 *
 * @todo Enable when Mistral API supports browser WebSocket authentication
 * @todo Migrate to AudioWorklet for better performance
 * See: https://web.dev/patterns/media/microphone-process/
 */
export async function captureAudioFromBrowser(
  sampleRate: number = 16000
): Promise<AudioCaptureResult> {
  throw new Error(
    "Browser realtime transcription is not yet supported. " +
      "Browser WebSocket API does not support authentication headers required by Mistral API. " +
      "Use captureAudioFromMicrophone() in Node.js environments instead."
  );

  // Implementation disabled - kept for future when auth is resolved
  /* istanbul ignore next */
  if (typeof navigator === "undefined" || !navigator.mediaDevices) {
    throw new Error(
      "captureAudioFromBrowser() requires a browser environment with getUserMedia support"
    );
  }

  const mediaStream = await navigator.mediaDevices.getUserMedia({
    audio: {
      channelCount: 1,
      sampleRate,
      echoCancellation: true,
      noiseSuppression: true,
    },
  });

  const audioContext = new AudioContext({ sampleRate });
  const source = audioContext.createMediaStreamSource(mediaStream);
  const processor = audioContext.createScriptProcessor(4096, 1, 1);

  let stopped = false;
  // Encoded chunks waiting to be yielded. Every chunk goes through this
  // queue so nothing is lost while the generator is parked.
  const pending: Uint8Array[] = [];
  // Wakes the generator when it is waiting for audio or for stop().
  let wake: (() => void) | null = null;

  const notify = () => {
    if (wake) {
      const resume = wake;
      wake = null;
      resume();
    }
  };

  processor.onaudioprocess = (event) => {
    if (stopped) return;

    const inputData = event.inputBuffer.getChannelData(0);
    const pcm16 = new Int16Array(inputData.length);

    // Convert Float32 to PCM S16
    for (let i = 0; i < inputData.length; i++) {
      const sample = Math.max(-1, Math.min(1, inputData[i]));
      pcm16[i] = sample < 0 ? sample * 0x8000 : sample * 0x7fff;
    }

    // Serialize to bytes, low byte first (little-endian)
    const uint8 = new Uint8Array(pcm16.length * 2);
    for (let i = 0; i < pcm16.length; i++) {
      uint8[i * 2] = pcm16[i] & 0xff;
      uint8[i * 2 + 1] = (pcm16[i] >> 8) & 0xff;
    }

    pending.push(uint8);
    notify();
  };

  source.connect(processor);
  processor.connect(audioContext.destination);

  const stream = (async function* () {
    try {
      while (!stopped) {
        const next = pending.shift();
        if (next !== undefined) {
          yield next;
        } else {
          await new Promise<void>((resolve) => {
            wake = resolve;
          });
        }
      }
    } finally {
      processor.disconnect();
      source.disconnect();
      mediaStream.getTracks().forEach((track) => track.stop());
      await audioContext.close();
    }
  })();

  const stop = () => {
    stopped = true;
    notify();
  };

  return { stream, stop };
}