@absolutejs/voice 0.0.20 → 0.0.22-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/README.md +884 -4
  2. package/dist/angular/index.d.ts +1 -0
  3. package/dist/angular/index.js +759 -3
  4. package/dist/angular/voice-controller.service.d.ts +27 -0
  5. package/dist/angular/voice-stream.service.d.ts +6 -0
  6. package/dist/audioConditioning.d.ts +3 -0
  7. package/dist/client/actions.d.ts +48 -0
  8. package/dist/client/audioPlayer.d.ts +40 -0
  9. package/dist/client/connection.d.ts +5 -0
  10. package/dist/client/controller.d.ts +2 -0
  11. package/dist/client/duplex.d.ts +3 -0
  12. package/dist/client/htmxBootstrap.js +660 -167
  13. package/dist/client/index.d.ts +3 -0
  14. package/dist/client/index.js +991 -6
  15. package/dist/client/microphone.d.ts +4 -2
  16. package/dist/correction.d.ts +33 -0
  17. package/dist/fileStore.d.ts +27 -0
  18. package/dist/index.d.ts +15 -0
  19. package/dist/index.js +3721 -298
  20. package/dist/ops.d.ts +100 -0
  21. package/dist/presets.d.ts +13 -0
  22. package/dist/react/index.d.ts +1 -0
  23. package/dist/react/index.js +728 -3
  24. package/dist/react/useVoiceController.d.ts +26 -0
  25. package/dist/react/useVoiceStream.d.ts +7 -0
  26. package/dist/routing.d.ts +3 -0
  27. package/dist/runtimeOps.d.ts +23 -0
  28. package/dist/store.d.ts +2 -2
  29. package/dist/svelte/index.d.ts +1 -0
  30. package/dist/svelte/index.js +691 -3
  31. package/dist/telephony/response.d.ts +7 -0
  32. package/dist/telephony/twilio.d.ts +116 -0
  33. package/dist/testing/benchmark.d.ts +93 -2
  34. package/dist/testing/corrected.d.ts +41 -0
  35. package/dist/testing/duplex.d.ts +59 -0
  36. package/dist/testing/fixtures.d.ts +18 -2
  37. package/dist/testing/index.d.ts +5 -0
  38. package/dist/testing/index.js +6247 -402
  39. package/dist/testing/review.d.ts +143 -0
  40. package/dist/testing/sessionBenchmark.d.ts +92 -2
  41. package/dist/testing/stt.d.ts +3 -1
  42. package/dist/testing/telephony.d.ts +70 -0
  43. package/dist/testing/tts.d.ts +73 -0
  44. package/dist/turnDetection.d.ts +5 -1
  45. package/dist/turnProfiles.d.ts +6 -0
  46. package/dist/types.d.ts +487 -10
  47. package/dist/vue/index.d.ts +1 -0
  48. package/dist/vue/index.js +750 -3
  49. package/dist/vue/useVoiceController.d.ts +30 -0
  50. package/dist/vue/useVoiceStream.d.ts +11 -0
  51. package/fixtures/README.md +9 -0
  52. package/fixtures/manifest.json +59 -1
  53. package/fixtures/pcm/dialogue-three-clean.pcm +0 -0
  54. package/fixtures/pcm/dialogue-three-mixed.pcm +0 -0
  55. package/fixtures/pcm/dialogue-two-clean.pcm +0 -0
  56. package/fixtures/pcm/dialogue-two-noisy.pcm +0 -0
  57. package/package.json +135 -1
@@ -76,24 +76,30 @@ var WS_NORMAL_CLOSURE = 1000;
76
76
  var DEFAULT_MAX_RECONNECT_ATTEMPTS = 10;
77
77
  var DEFAULT_PING_INTERVAL = 30000;
78
78
  var RECONNECT_DELAY_MS = 500;
79
+ var DEFAULT_SCENARIO_QUERY_PARAM = "scenarioId";
79
80
  var noop = () => {};
80
81
  var noopUnsubscribe = () => noop;
81
82
  var NOOP_CONNECTION = {
83
+ start: () => {},
82
84
  close: noop,
83
85
  endTurn: noop,
84
86
  getReadyState: () => WS_CLOSED,
87
+ getScenarioId: () => "",
85
88
  getSessionId: () => "",
86
89
  send: noop,
87
90
  sendAudio: noop,
88
91
  subscribe: noopUnsubscribe
89
92
  };
90
93
  var createSessionId = () => crypto.randomUUID();
91
/**
 * Builds the WebSocket URL for a voice session from the current page location.
 *
 * The scheme is "wss:" when the page is served over "https:" and "ws:"
 * otherwise; host and port are taken from window.location.
 *
 * @param path - Path appended to the origin.
 * @param sessionId - Always sent as the "sessionId" query parameter.
 * @param scenarioId - Optional; when truthy it is sent under DEFAULT_SCENARIO_QUERY_PARAM.
 * @returns The serialized URL string.
 */
var buildWsUrl = (path, sessionId, scenarioId) => {
  const pageLocation = window.location;
  const scheme = pageLocation.protocol === "https:" ? "wss:" : "ws:";
  const authority = pageLocation.port ? `${pageLocation.hostname}:${pageLocation.port}` : `${pageLocation.hostname}`;
  const target = new URL(`${scheme}//${authority}${path}`);
  target.searchParams.set("sessionId", sessionId);
  if (!scenarioId) {
    return target.toString();
  }
  target.searchParams.set(DEFAULT_SCENARIO_QUERY_PARAM, scenarioId);
  return target.toString();
};
99
105
  var isVoiceServerMessage = (value) => {
@@ -101,6 +107,7 @@ var isVoiceServerMessage = (value) => {
101
107
  return false;
102
108
  }
103
109
  switch (value.type) {
110
+ case "audio":
104
111
  case "assistant":
105
112
  case "complete":
106
113
  case "error":
@@ -136,6 +143,7 @@ var createVoiceConnection = (path, options = {}) => {
136
143
  const state = {
137
144
  isConnected: false,
138
145
  pendingMessages: [],
146
+ scenarioId: options.scenarioId ?? null,
139
147
  pingInterval: null,
140
148
  reconnectAttempts: 0,
141
149
  reconnectTimeout: null,
@@ -173,13 +181,14 @@ var createVoiceConnection = (path, options = {}) => {
173
181
  }, RECONNECT_DELAY_MS);
174
182
  };
175
183
  const connect = () => {
176
- const ws = new WebSocket(buildWsUrl(path, state.sessionId));
184
+ const ws = new WebSocket(buildWsUrl(path, state.sessionId, state.scenarioId));
177
185
  ws.binaryType = "arraybuffer";
178
186
  ws.onopen = () => {
179
187
  state.isConnected = true;
180
188
  state.reconnectAttempts = 0;
181
189
  flushPendingMessages();
182
190
  listeners.forEach((listener) => listener({
191
+ scenarioId: state.scenarioId ?? undefined,
183
192
  sessionId: state.sessionId,
184
193
  status: "active",
185
194
  type: "session"
@@ -197,6 +206,7 @@ var createVoiceConnection = (path, options = {}) => {
197
206
  }
198
207
  if (parsed.type === "session") {
199
208
  state.sessionId = parsed.sessionId;
209
+ state.scenarioId = parsed.scenarioId ?? state.scenarioId;
200
210
  }
201
211
  listeners.forEach((listener) => listener(parsed));
202
212
  };
@@ -220,6 +230,19 @@ var createVoiceConnection = (path, options = {}) => {
220
230
  const send = (message) => {
221
231
  sendSerialized(JSON.stringify(message));
222
232
  };
233
+ const start = (input = {}) => {
234
+ if (input.sessionId) {
235
+ state.sessionId = input.sessionId;
236
+ }
237
+ if (input.scenarioId) {
238
+ state.scenarioId = input.scenarioId;
239
+ }
240
+ send({
241
+ type: "start",
242
+ sessionId: state.sessionId,
243
+ scenarioId: state.scenarioId ?? undefined
244
+ });
245
+ };
223
246
  const sendAudio = (audio) => {
224
247
  sendSerialized(audio);
225
248
  };
@@ -243,15 +266,363 @@ var createVoiceConnection = (path, options = {}) => {
243
266
  };
244
267
  connect();
245
268
  return {
269
+ start,
246
270
  close,
247
271
  endTurn,
248
272
  getReadyState: () => state.ws?.readyState ?? WS_CLOSED,
273
+ getScenarioId: () => state.scenarioId ?? "",
249
274
  getSessionId: () => state.sessionId,
250
275
  send,
251
276
  sendAudio,
252
277
  subscribe
253
278
  };
254
279
  };
280
+ // src/client/audioPlayer.ts
281
// Default scheduling lead, in milliseconds, used by createVoiceAudioPlayer when
// options.lookaheadMs is not given: a chunk is never scheduled to start sooner
// than the audio context's current time plus this lead.
+ var DEFAULT_LOOKAHEAD_MS = 15;
282
/**
 * Produces a fresh idle snapshot for the audio player: nothing playing,
 * no error, no chunks processed or queued, and no latency measurements yet.
 *
 * @returns A new state object on every call.
 */
var createInitialState = () => {
  const idleSnapshot = {
    activeSourceCount: 0,
    error: null,
    isActive: false,
    isPlaying: false,
    lastInterruptLatencyMs: undefined,
    lastPlaybackStopLatencyMs: undefined,
    processedChunkCount: 0,
    queuedChunkCount: 0
  };
  return idleSnapshot;
};
292
/**
 * Locates an AudioContext constructor for the current environment.
 *
 * When a `window` global exists, prefers window.AudioContext and falls back
 * to the prefixed window.webkitAudioContext. Without `window`, uses a global
 * AudioContext if one is defined.
 *
 * @returns The constructor, or undefined when none is available.
 */
var getAudioContextCtor = () => {
  const hasWindow = typeof window !== "undefined";
  if (hasWindow) {
    return window.AudioContext ?? window.webkitAudioContext;
  }
  if (typeof AudioContext === "undefined") {
    return undefined;
  }
  return AudioContext;
};
298
/**
 * Decodes one raw little-endian 16-bit PCM chunk into an AudioBuffer.
 *
 * Samples are read as interleaved frames (frame-major, channel-minor) and
 * scaled to floats by dividing by 32768.
 *
 * @param audioContext - Object providing createBuffer(channels, frames, sampleRate).
 * @param chunk - `{ chunk, format }`, where `chunk` holds the PCM bytes
 *   (Uint8Array, or an ArrayBuffer which is wrapped) and `format` carries
 *   `container`, `encoding`, `channels` and `sampleRateHz`.
 * @returns The buffer created by audioContext.createBuffer, filled per channel.
 * @throws {Error} When the format is not raw / pcm_s16le.
 */
var decodePCM16LEChunk = (audioContext, chunk) => {
  const format = chunk.format;
  if (format.container !== "raw" || format.encoding !== "pcm_s16le") {
    throw new Error(`Unsupported assistant audio format: ${format.container}/${format.encoding}`);
  }
  // Accept a bare ArrayBuffer as well as a typed-array view over the bytes.
  const bytes = chunk.chunk instanceof ArrayBuffer ? new Uint8Array(chunk.chunk) : chunk.chunk;
  // A missing or non-numeric channel count would otherwise become NaN and be
  // passed straight to createBuffer; treat anything below 1 as mono.
  const declaredChannels = Math.floor(Number(format.channels));
  const channels = Number.isFinite(declaredChannels) && declaredChannels >= 1 ? declaredChannels : 1;
  const sampleCount = Math.floor(bytes.byteLength / 2);
  // Always request at least one frame, even for an empty chunk.
  const frameCount = Math.max(1, Math.floor(sampleCount / channels));
  const audioBuffer = audioContext.createBuffer(channels, frameCount, format.sampleRateHz);
  const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
  for (let channelIndex = 0; channelIndex < channels; channelIndex += 1) {
    const channelData = audioBuffer.getChannelData(channelIndex);
    for (let frameIndex = 0; frameIndex < frameCount; frameIndex += 1) {
      const sampleOffset = (frameIndex * channels + channelIndex) * 2;
      // Samples that would read past the end of the data are written as silence.
      if (sampleOffset + 1 >= bytes.byteLength) {
        channelData[frameIndex] = 0;
        continue;
      }
      channelData[frameIndex] = view.getInt16(sampleOffset, true) / 32768;
    }
  }
  return audioBuffer;
};
323
/**
 * Creates a player that schedules assistant audio chunks from `source` onto
 * a Web Audio context, back to back, and exposes observable playback state.
 *
 * @param source - Object with an `assistantAudio` array of chunks (each
 *   decodable by decodePCM16LEChunk) and `subscribe(listener)` returning an
 *   unsubscribe function.
 * @param options - Optional settings:
 *   - `lookaheadMs`: minimum scheduling lead (defaults to DEFAULT_LOOKAHEAD_MS);
 *   - `createAudioContext`: factory used instead of the platform constructor;
 *   - `autoStart`: start playback on the first source update that has audio.
 * @returns A player with `start`, `pause`, `interrupt`, `close`, `subscribe`,
 *   `getSnapshot` and read-only getters mirroring the snapshot fields.
 */
var createVoiceAudioPlayer = (source, options = {}) => {
  const subscribers = new Set();
  // Buffer source nodes that have been scheduled and have not ended yet.
  const sourceNodes = new Set();
  const lookaheadSeconds = (options.lookaheadMs ?? DEFAULT_LOOKAHEAD_MS) / 1000;
  let state = createInitialState();
  let audioContext = null;
  let outputNode = null;
  // Context time at which the last scheduled chunk finishes.
  let queueEndTime = 0;
  // Tail of the promise chain that serializes sync() runs.
  let syncPromise = Promise.resolve();
  let interruptStartedAt = null;
  let interruptPromise = null;
  let resolveInterruptPromise = null;
  let interruptFallbackTimer = null;
  const notify = () => {
    for (const subscriber of subscribers) {
      subscriber();
    }
  };
  // Replaces the snapshot with a merged copy and notifies every subscriber.
  const setState = (next) => {
    state = {
      ...state,
      ...next
    };
    notify();
  };
  const clearError = () => {
    if (state.error !== null) {
      setState({ error: null });
    }
  };
  const clearInterruptTimer = () => {
    if (interruptFallbackTimer !== null) {
      clearTimeout(interruptFallbackTimer);
      interruptFallbackTimer = null;
    }
  };
  // Completes a pending interrupt: records the latency and settles the promise.
  const resolveInterrupt = (latencyMs) => {
    clearInterruptTimer();
    interruptStartedAt = null;
    setState({
      activeSourceCount: sourceNodes.size,
      isPlaying: false,
      lastInterruptLatencyMs: latencyMs,
      lastPlaybackStopLatencyMs: state.lastPlaybackStopLatencyMs ?? latencyMs
    });
    resolveInterruptPromise?.();
    resolveInterruptPromise = null;
    interruptPromise = null;
  };
  // Sum of the context's reported base and output latency, in milliseconds.
  const estimateOutputStopLatencyMs = (context) => {
    if (!context) {
      return 0;
    }
    return Math.max(0, ((context.baseLatency ?? 0) + (context.outputLatency ?? 0)) * 1000);
  };
  // Sets the output gain, via setValueAtTime when the gain param provides it.
  const setOutputGain = (context, gainValue) => {
    if (!outputNode) {
      return;
    }
    if (outputNode.gain.setValueAtTime) {
      outputNode.gain.setValueAtTime(gainValue, context?.currentTime ?? 0);
      return;
    }
    outputNode.gain.value = gainValue;
  };
  const restoreOutputGain = (context) => setOutputGain(context, 1);
  const muteOutputGain = (context) => setOutputGain(context, 0);
  // Finishes an in-flight interrupt once every source node has ended.
  const maybeResolveInterrupt = () => {
    if (interruptStartedAt === null || sourceNodes.size > 0) {
      return;
    }
    resolveInterrupt(Date.now() - interruptStartedAt);
  };
  // Lazily creates the audio context and, when supported, a gain output node.
  const ensureAudioContext = async () => {
    if (audioContext) {
      return audioContext;
    }
    if (options.createAudioContext) {
      audioContext = options.createAudioContext();
    } else {
      const AudioContextCtor = getAudioContextCtor();
      if (!AudioContextCtor) {
        throw new Error("Assistant audio playback requires AudioContext support.");
      }
      audioContext = new AudioContextCtor();
    }
    if (audioContext.createGain) {
      outputNode = audioContext.createGain();
      outputNode.connect?.(audioContext.destination);
    }
    queueEndTime = audioContext.currentTime;
    return audioContext;
  };
  // Decodes one chunk and schedules it right after the previously queued audio.
  const scheduleChunk = async (chunk) => {
    const context = await ensureAudioContext();
    const buffer = decodePCM16LEChunk(context, chunk);
    const node = context.createBufferSource();
    node.buffer = buffer;
    node.connect(outputNode ?? context.destination);
    node.onended = () => {
      sourceNodes.delete(node);
      node.disconnect?.();
      setState({
        activeSourceCount: sourceNodes.size,
        isPlaying: sourceNodes.size > 0 && state.isActive
      });
      maybeResolveInterrupt();
    };
    // Never start earlier than "now + lookahead", nor before the queue drains.
    const startAt = Math.max(context.currentTime + lookaheadSeconds, queueEndTime);
    queueEndTime = startAt + buffer.duration;
    sourceNodes.add(node);
    setState({
      activeSourceCount: sourceNodes.size,
      isPlaying: true
    });
    node.start(startAt);
  };
  // Stops every scheduled node; with forceClear also drops them immediately
  // instead of waiting for their onended callbacks.
  const stopQueuedPlayback = (stopOptions) => {
    for (const node of [...sourceNodes]) {
      node.stop?.();
    }
    queueEndTime = audioContext ? audioContext.currentTime : 0;
    if (stopOptions?.forceClear) {
      for (const node of sourceNodes) {
        node.disconnect?.();
      }
      sourceNodes.clear();
      maybeResolveInterrupt();
    }
  };
  // Schedules every chunk the source has gained since the last successful sync.
  const sync = async () => {
    if (!state.isActive) {
      return;
    }
    const firstUnprocessed = state.processedChunkCount;
    const nextChunks = source.assistantAudio.slice(firstUnprocessed);
    if (nextChunks.length === 0) {
      return;
    }
    try {
      clearError();
      for (const chunk of nextChunks) {
        await scheduleChunk(chunk);
      }
      setState({
        // Count only the chunks actually scheduled here; re-reading the
        // source length could mark chunks appended meanwhile as processed.
        processedChunkCount: firstUnprocessed + nextChunks.length,
        queuedChunkCount: state.queuedChunkCount + nextChunks.length
      });
    } catch (error) {
      setState({
        error: error instanceof Error ? error.message : String(error)
      });
    }
  };
  // Chains sync() runs so they never overlap, whether the previous one
  // fulfilled or rejected.
  const queueSync = () => {
    syncPromise = syncPromise.then(() => sync(), () => sync());
    return syncPromise;
  };
  const unsubscribeSource = source.subscribe(() => {
    if (options.autoStart && !state.isActive && source.assistantAudio.length > 0) {
      // start() records the failure in state.error before rethrowing; nobody
      // awaits this call, so swallow the rejection here to avoid an
      // unhandled promise rejection.
      player.start().catch(() => {});
      return;
    }
    if (state.isActive) {
      queueSync();
    }
  });
  const player = {
    // Detaches from the source, stops all audio and closes the audio context.
    close: async () => {
      unsubscribeSource();
      stopQueuedPlayback({ forceClear: true });
      clearInterruptTimer();
      resolveInterruptPromise?.();
      resolveInterruptPromise = null;
      interruptPromise = null;
      interruptStartedAt = null;
      if (audioContext && audioContext.state !== "closed") {
        await audioContext.close();
      }
      audioContext = null;
      outputNode?.disconnect?.();
      outputNode = null;
      queueEndTime = 0;
      setState({
        activeSourceCount: 0,
        isActive: false,
        isPlaying: false
      });
    },
    get activeSourceCount() {
      return state.activeSourceCount;
    },
    get error() {
      return state.error;
    },
    getSnapshot: () => state,
    get isActive() {
      return state.isActive;
    },
    get isPlaying() {
      return state.isPlaying;
    },
    // Mutes the output at once, stops queued nodes, and resolves when every
    // node has ended or the 250 ms fallback timer fires.
    interrupt: async () => {
      const startedAt = Date.now();
      const context = await ensureAudioContext();
      interruptStartedAt = startedAt;
      muteOutputGain(context);
      const playbackStopLatencyMs = Date.now() - startedAt + estimateOutputStopLatencyMs(context);
      setState({
        isActive: false,
        isPlaying: sourceNodes.size > 0,
        lastPlaybackStopLatencyMs: playbackStopLatencyMs
      });
      if (sourceNodes.size === 0) {
        resolveInterrupt(playbackStopLatencyMs);
        return;
      }
      if (!interruptPromise) {
        interruptPromise = new Promise((resolve) => {
          resolveInterruptPromise = resolve;
        });
      }
      clearInterruptTimer();
      // Fallback: drop the nodes and finish the interrupt if they have not
      // all ended within 250 ms.
      interruptFallbackTimer = setTimeout(() => {
        for (const node of sourceNodes) {
          node.disconnect?.();
        }
        sourceNodes.clear();
        resolveInterrupt(Date.now() - startedAt);
      }, 250);
      stopQueuedPlayback();
      await interruptPromise;
    },
    get lastInterruptLatencyMs() {
      return state.lastInterruptLatencyMs;
    },
    get lastPlaybackStopLatencyMs() {
      return state.lastPlaybackStopLatencyMs;
    },
    // Suspends the audio context (if one exists) and marks the player inactive.
    pause: async () => {
      if (!audioContext) {
        setState({
          activeSourceCount: 0,
          isActive: false,
          isPlaying: false
        });
        return;
      }
      await audioContext.suspend();
      setState({
        activeSourceCount: sourceNodes.size,
        isActive: false,
        isPlaying: false
      });
    },
    get processedChunkCount() {
      return state.processedChunkCount;
    },
    get queuedChunkCount() {
      return state.queuedChunkCount;
    },
    // Activates playback: restores gain, resumes a suspended context and
    // schedules pending chunks. On failure the error is stored in state and rethrown.
    start: async () => {
      try {
        clearError();
        const context = await ensureAudioContext();
        restoreOutputGain(context);
        if (context.state === "suspended") {
          await context.resume();
        }
        setState({
          activeSourceCount: sourceNodes.size,
          isActive: true,
          isPlaying: context.state === "running"
        });
        await queueSync();
      } catch (error) {
        setState({
          error: error instanceof Error ? error.message : String(error),
          isActive: false,
          isPlaying: false
        });
        throw error;
      }
    },
    // Registers a listener called after every state change; returns an unsubscribe function.
    subscribe: (subscriber) => {
      subscribers.add(subscriber);
      return () => {
        subscribers.delete(subscriber);
      };
    }
  };
  return player;
};
625
/**
 * Thin wrapper that forwards both arguments to decodePCM16LEChunk and
 * returns its result unchanged.
 */
var decodeVoiceAudioChunk = (audioContext, chunk) => {
  return decodePCM16LEChunk(audioContext, chunk);
};
255
626
  // src/client/actions.ts
256
627
  var normalizeErrorMessage = (value) => {
257
628
  if (typeof value === "string" && value.trim()) {
@@ -282,6 +653,14 @@ var normalizeErrorMessage = (value) => {
282
653
  };
283
654
  var serverMessageToAction = (message) => {
284
655
  switch (message.type) {
656
+ case "audio":
657
+ return {
658
+ chunk: Uint8Array.from(atob(message.chunkBase64), (char) => char.charCodeAt(0)),
659
+ format: message.format,
660
+ receivedAt: message.receivedAt,
661
+ turnId: message.turnId,
662
+ type: "audio"
663
+ };
285
664
  case "assistant":
286
665
  return {
287
666
  text: message.text,
@@ -310,6 +689,7 @@ var serverMessageToAction = (message) => {
310
689
  case "session":
311
690
  return {
312
691
  sessionId: message.sessionId,
692
+ scenarioId: message.scenarioId,
313
693
  status: message.status,
314
694
  type: "session"
315
695
  };
@@ -324,23 +704,39 @@ var serverMessageToAction = (message) => {
324
704
  };
325
705
 
326
706
  // src/client/store.ts
327
/**
 * Produces the starting snapshot for the voice stream store: disconnected,
 * status "idle", no ids, no error, and empty audio/text/turn collections.
 *
 * @returns A new state object (with fresh arrays) on every call.
 */
var createInitialState2 = () => {
  const idleStoreState = {
    assistantAudio: [],
    assistantTexts: [],
    error: null,
    isConnected: false,
    scenarioId: null,
    partial: "",
    sessionId: null,
    status: "idle",
    turns: []
  };
  return idleStoreState;
};
336
718
  var createVoiceStreamStore = () => {
337
- let state = createInitialState();
719
+ let state = createInitialState2();
338
720
  const subscribers = new Set;
339
721
  const notify = () => {
340
722
  subscribers.forEach((subscriber) => subscriber());
341
723
  };
342
724
  const dispatch = (action) => {
343
725
  switch (action.type) {
726
+ case "audio":
727
+ state = {
728
+ ...state,
729
+ assistantAudio: [
730
+ ...state.assistantAudio,
731
+ {
732
+ chunk: action.chunk,
733
+ format: action.format,
734
+ receivedAt: action.receivedAt,
735
+ turnId: action.turnId
736
+ }
737
+ ]
738
+ };
739
+ break;
344
740
  case "assistant":
345
741
  state = {
346
742
  ...state,
@@ -389,6 +785,7 @@ var createVoiceStreamStore = () => {
389
785
  state = {
390
786
  ...state,
391
787
  error: null,
788
+ scenarioId: action.scenarioId ?? state.scenarioId,
392
789
  isConnected: action.status === "active",
393
790
  sessionId: action.sessionId,
394
791
  status: action.status
@@ -422,6 +819,12 @@ var createVoiceStream = (path, options = {}) => {
422
819
  const connection = createVoiceConnection(path, options);
423
820
  const store = createVoiceStreamStore();
424
821
  const subscribers = new Set;
822
+ const start = (input) => Promise.resolve().then(() => {
823
+ if (!input?.sessionId && !input?.scenarioId) {
824
+ return;
825
+ }
826
+ connection.start(input);
827
+ });
425
828
  const notify = () => {
426
829
  subscribers.forEach((subscriber) => subscriber());
427
830
  };
@@ -454,6 +857,10 @@ var createVoiceStream = (path, options = {}) => {
454
857
  get isConnected() {
455
858
  return store.getSnapshot().isConnected;
456
859
  },
860
+ get scenarioId() {
861
+ return store.getSnapshot().scenarioId;
862
+ },
863
+ start,
457
864
  get partial() {
458
865
  return store.getSnapshot().partial;
459
866
  },
@@ -469,6 +876,9 @@ var createVoiceStream = (path, options = {}) => {
469
876
  get assistantTexts() {
470
877
  return store.getSnapshot().assistantTexts;
471
878
  },
879
+ get assistantAudio() {
880
+ return store.getSnapshot().assistantAudio;
881
+ },
472
882
  sendAudio(audio) {
473
883
  connection.sendAudio(audio);
474
884
  },
@@ -527,6 +937,7 @@ var bindVoiceHTMX = (stream, options) => {
527
937
  unsubscribe();
528
938
  };
529
939
  };
940
+
530
941
  // src/client/microphone.ts
531
942
  var clampSample = (value) => Math.max(-1, Math.min(1, value));
532
943
  var floatTo16BitPCM = (input) => {
@@ -537,6 +948,22 @@ var floatTo16BitPCM = (input) => {
537
948
  }
538
949
  return new Uint8Array(output.buffer);
539
950
  };
951
/**
 * Computes a 0..1 input level for a chunk of 16-bit little-endian PCM.
 *
 * The level is the RMS of the samples (each scaled by 1/32768), multiplied
 * by 5.5 and clamped to the range [0, 1].
 *
 * @param audio - PCM bytes as a Uint8Array; anything else is passed to
 *   `new Uint8Array(audio)` first (e.g. an ArrayBuffer).
 * @returns 0 when fewer than two bytes are available, otherwise the clamped level.
 */
var getPcmLevel = (audio) => {
  const bytes = audio instanceof Uint8Array ? audio : new Uint8Array(audio);
  // A trailing odd byte cannot form a sample and is ignored.
  const sampleCount = Math.floor(bytes.byteLength / 2);
  if (sampleCount === 0) {
    return 0;
  }
  // DataView reads work at any byteOffset and in explicit little-endian order;
  // an Int16Array view throws a RangeError when byteOffset is odd.
  const view = new DataView(bytes.buffer, bytes.byteOffset, sampleCount * 2);
  let sumSquares = 0;
  for (let sampleIndex = 0; sampleIndex < sampleCount; sampleIndex += 1) {
    const normalized = view.getInt16(sampleIndex * 2, true) / 32768;
    sumSquares += normalized * normalized;
  }
  return Math.min(1, Math.max(0, Math.sqrt(sumSquares / sampleCount) * 5.5));
};
540
967
  var downsampleBuffer = (input, sourceRate, targetRate) => {
541
968
  if (sourceRate === targetRate) {
542
969
  return input;
@@ -584,7 +1011,9 @@ var createMicrophoneCapture = (options) => {
584
1011
  processorNode.onaudioprocess = (event) => {
585
1012
  const channel = event.inputBuffer.getChannelData(0);
586
1013
  const downsampled = downsampleBuffer(channel, audioContext?.sampleRate ?? 48000, options.sampleRateHz ?? 16000);
587
- options.onAudio(floatTo16BitPCM(downsampled));
1014
+ const pcm = floatTo16BitPCM(downsampled);
1015
+ options.onLevel?.(getPcmLevel(pcm));
1016
+ options.onAudio(pcm);
588
1017
  };
589
1018
  sourceNode.connect(processorNode);
590
1019
  processorNode.connect(audioContext.destination);
@@ -594,6 +1023,7 @@ var createMicrophoneCapture = (options) => {
594
1023
  sourceNode?.disconnect();
595
1024
  mediaStream?.getTracks().forEach((track) => track.stop());
596
1025
  audioContext?.close();
1026
+ options.onLevel?.(0);
597
1027
  audioContext = null;
598
1028
  mediaStream = null;
599
1029
  processorNode = null;
@@ -601,9 +1031,564 @@ var createMicrophoneCapture = (options) => {
601
1031
  };
602
1032
  return { start, stop };
603
1033
  };
1034
+
1035
+ // src/audioConditioning.ts
1036
// Fallback values applied by resolveAudioConditioningConfig when a field is
// not supplied. Levels are compared against the RMS computed by computeRms,
// i.e. on a scale where full-scale 16-bit audio is 1.
// RMS level that conditionAudioChunk tries to bring a chunk up (or down) to.
+ var DEFAULT_TARGET_LEVEL = 0.08;
1037
// Upper bound on the gain requested to reach the target level.
+ var DEFAULT_MAX_GAIN = 3;
1038
// Chunks whose RMS is below this value are treated as gated.
+ var DEFAULT_NOISE_GATE_THRESHOLD = 0.006;
1039
// Multiplier applied to gated chunks.
+ var DEFAULT_NOISE_GATE_ATTENUATION = 0.15;
1040
/**
 * Views a chunk of 16-bit PCM bytes as an Int16Array.
 *
 * @param audio - An ArrayBuffer, or a typed-array view exposing `buffer`,
 *   `byteOffset` and `byteLength`.
 * @returns An Int16Array covering floor(byteLength / 2) samples. It shares
 *   memory with the input when the input is 2-byte aligned; otherwise it is
 *   backed by an aligned copy of the bytes.
 */
var toInt16Array = (audio) => {
  if (audio instanceof ArrayBuffer) {
    return new Int16Array(audio, 0, Math.floor(audio.byteLength / 2));
  }
  const sampleCount = Math.floor(audio.byteLength / 2);
  if (audio.byteOffset % 2 === 0) {
    return new Int16Array(audio.buffer, audio.byteOffset, sampleCount);
  }
  // An Int16Array cannot start at an odd byte offset (RangeError), so copy
  // the sample bytes into a fresh, aligned buffer first.
  const aligned = new Uint8Array(sampleCount * 2);
  aligned.set(new Uint8Array(audio.buffer, audio.byteOffset, sampleCount * 2));
  return new Int16Array(aligned.buffer);
};
1046
/**
 * Root-mean-square level of 16-bit samples, with each sample scaled by
 * 1/32768 before squaring.
 *
 * @param samples - Indexable collection of sample values with a `length`.
 * @returns 0 for an empty collection, otherwise the RMS.
 */
var computeRms = (samples) => {
  const total = samples.length;
  if (total === 0) {
    return 0;
  }
  let energy = 0;
  for (let position = 0; position < total; position += 1) {
    const scaled = samples[position] / 32768;
    energy += scaled * scaled;
  }
  return Math.sqrt(energy / total);
};
1057
/**
 * Normalizes an optional audio-conditioning config.
 *
 * @param config - Partial config, or nothing.
 * @returns undefined when `config` is falsy or `config.enabled === false`;
 *   otherwise a complete config with `enabled: true` and every null/undefined
 *   field replaced by its DEFAULT_* fallback.
 */
var resolveAudioConditioningConfig = (config) => {
  const isDisabled = !config || config.enabled === false;
  if (isDisabled) {
    return undefined;
  }
  const maxGain = config.maxGain ?? DEFAULT_MAX_GAIN;
  const noiseGateAttenuation = config.noiseGateAttenuation ?? DEFAULT_NOISE_GATE_ATTENUATION;
  const noiseGateThreshold = config.noiseGateThreshold ?? DEFAULT_NOISE_GATE_THRESHOLD;
  const targetLevel = config.targetLevel ?? DEFAULT_TARGET_LEVEL;
  return {
    enabled: true,
    maxGain,
    noiseGateAttenuation,
    noiseGateThreshold,
    targetLevel
  };
};
1069
/**
 * Applies gain normalization and a simple noise gate to one PCM chunk.
 *
 * The chunk's RMS decides the gate: below `noiseGateThreshold` the gate
 * factor is `noiseGateAttenuation`, otherwise 1. The gain aims for
 * `targetLevel`, is capped at `maxGain`, floored at 0.25, and then
 * multiplied by the gate factor. Results are rounded and clamped to the
 * 16-bit range.
 *
 * @param audio - PCM bytes accepted by toInt16Array.
 * @param config - Resolved conditioning config; falsy disables processing.
 * @returns `audio` itself when there is no config or no samples, otherwise
 *   a new Uint8Array over the conditioned samples.
 */
var conditionAudioChunk = (audio, config) => {
  if (!config) {
    return audio;
  }
  const samples = toInt16Array(audio);
  const total = samples.length;
  if (total === 0) {
    return audio;
  }
  const rms = computeRms(samples);
  const isGated = rms < config.noiseGateThreshold;
  const gateFactor = isGated ? config.noiseGateAttenuation : 1;
  // Floor the reference level so the division below never divides by zero.
  const referenceLevel = Math.max(rms * gateFactor, 0.000001);
  const requestedGain = Math.min(config.maxGain, config.targetLevel / referenceLevel);
  const appliedGain = Math.max(0.25, requestedGain) * gateFactor;
  const conditioned = new Int16Array(total);
  let position = 0;
  while (position < total) {
    const scaled = Math.round(samples[position] * appliedGain);
    conditioned[position] = Math.max(-32768, Math.min(32767, scaled));
    position += 1;
  }
  return new Uint8Array(conditioned.buffer);
};
1089
+
1090
+ // src/turnProfiles.ts
1091
// Base turn-detection timings per turn profile. `silenceMs` and
// `transcriptStabilityMs` are durations in milliseconds.
// NOTE(review): `speechThreshold` looks like a level on the same 0..1 scale as
// the RMS values used elsewhere — confirm against the turn detector.
var TURN_PROFILE_DEFAULTS = {
  balanced: {
    qualityProfile: "general",
    silenceMs: 1400,
    speechThreshold: 0.012,
    transcriptStabilityMs: 1000
  },
  fast: {
    qualityProfile: "general",
    silenceMs: 700,
    speechThreshold: 0.015,
    transcriptStabilityMs: 450
  },
  "long-form": {
    qualityProfile: "general",
    silenceMs: 2200,
    speechThreshold: 0.01,
    transcriptStabilityMs: 1500
  }
};
// Per-quality-profile overrides; a field present here wins over the turn
// profile's value. "general" overrides nothing.
var QUALITY_PROFILE_DEFAULTS = {
  general: {},
  "accent-heavy": {
    silenceMs: 1200,
    speechThreshold: 0.01,
    transcriptStabilityMs: 1200
  },
  "noisy-room": {
    silenceMs: 2000,
    speechThreshold: 0.02,
    transcriptStabilityMs: 1600
  },
  "short-command": {
    silenceMs: 500,
    speechThreshold: 0.016,
    transcriptStabilityMs: 420
  }
};
var DEFAULT_TURN_PROFILE = "fast";
var DEFAULT_QUALITY_PROFILE = "general";
/**
 * Resolves a partial turn-detection config into concrete values.
 *
 * Precedence for each timing field: explicit value in `config`, then the
 * quality profile's override, then the turn profile's base value.
 *
 * A profile or quality-profile name that is not an own key of its table
 * (including inherited names such as "constructor") falls back to the
 * default name instead of throwing or yielding undefined fields.
 *
 * @param config - Optional `{ profile, qualityProfile, silenceMs,
 *   speechThreshold, transcriptStabilityMs }`.
 * @returns `{ profile, qualityProfile, silenceMs, speechThreshold, transcriptStabilityMs }`.
 */
var resolveTurnDetectionConfig = (config) => {
  const isOwnKey = (table, key) => Object.prototype.hasOwnProperty.call(table, key);
  const requestedProfile = config?.profile ?? DEFAULT_TURN_PROFILE;
  const requestedQuality = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
  const profile = isOwnKey(TURN_PROFILE_DEFAULTS, requestedProfile) ? requestedProfile : DEFAULT_TURN_PROFILE;
  const qualityProfile = isOwnKey(QUALITY_PROFILE_DEFAULTS, requestedQuality) ? requestedQuality : DEFAULT_QUALITY_PROFILE;
  const preset = TURN_PROFILE_DEFAULTS[profile];
  const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
  return {
    profile,
    qualityProfile,
    silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
    speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
    transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
  };
};
1144
+
1145
+ // src/presets.ts
1146
/**
 * Raw inputs for every named runtime preset, keyed by preset name.
 * resolveVoiceRuntimePreset turns one of these entries into a resolved preset.
 *
 * The table is assembled with small local factories so that every preset
 * receives its own fresh sub-objects; nothing is shared between entries.
 */
var PRESET_INPUTS = (() => {
  // Every preset captures mono audio at 16 kHz.
  const monoCapture = () => ({
    channelCount: 1,
    sampleRateHz: 16000
  });
  // Every preset reconnects; only the attempt budget and ping interval vary.
  const reconnecting = (maxReconnectAttempts, pingInterval) => ({
    maxReconnectAttempts,
    pingInterval,
    reconnect: true
  });
  // Conditioning is always enabled where present; the target level is 0.08
  // unless a preset states otherwise.
  const conditioning = ({ maxGain, noiseGateAttenuation, noiseGateThreshold, targetLevel = 0.08 }) => ({
    enabled: true,
    maxGain,
    noiseGateAttenuation,
    noiseGateThreshold,
    targetLevel
  });
  const detection = (qualityProfile, profile, overrides = {}) => ({
    qualityProfile,
    profile,
    ...overrides
  });
  return {
    chat: {
      audioConditioning: conditioning({ maxGain: 2.5, noiseGateAttenuation: 0, noiseGateThreshold: 0.004 }),
      capture: monoCapture(),
      connection: reconnecting(10, 30000),
      sttLifecycle: "continuous",
      turnDetection: detection("short-command", "balanced")
    },
    default: {
      capture: monoCapture(),
      connection: reconnecting(10, 30000),
      sttLifecycle: "continuous",
      turnDetection: detection("general", "fast")
    },
    dictation: {
      audioConditioning: conditioning({ maxGain: 2.25, noiseGateAttenuation: 0.05, noiseGateThreshold: 0.003 }),
      capture: monoCapture(),
      connection: reconnecting(12, 30000),
      sttLifecycle: "continuous",
      turnDetection: detection("accent-heavy", "long-form")
    },
    "guided-intake": {
      audioConditioning: conditioning({ maxGain: 2.5, noiseGateAttenuation: 0, noiseGateThreshold: 0.004 }),
      capture: monoCapture(),
      connection: reconnecting(12, 30000),
      sttLifecycle: "turn-scoped",
      turnDetection: detection("accent-heavy", "long-form")
    },
    "noisy-room": {
      audioConditioning: conditioning({
        maxGain: 3,
        noiseGateAttenuation: 0.12,
        noiseGateThreshold: 0.006,
        targetLevel: 0.085
      }),
      capture: monoCapture(),
      connection: reconnecting(14, 45000),
      sttLifecycle: "continuous",
      turnDetection: detection("noisy-room", "long-form", {
        silenceMs: 2100,
        speechThreshold: 0.02,
        transcriptStabilityMs: 1650
      })
    },
    "pstn-balanced": {
      audioConditioning: conditioning({ maxGain: 2.8, noiseGateAttenuation: 0.07, noiseGateThreshold: 0.005 }),
      capture: monoCapture(),
      connection: reconnecting(14, 45000),
      sttLifecycle: "continuous",
      turnDetection: detection("noisy-room", "long-form", {
        silenceMs: 660,
        speechThreshold: 0.012,
        transcriptStabilityMs: 300
      })
    },
    "pstn-fast": {
      audioConditioning: conditioning({ maxGain: 2.75, noiseGateAttenuation: 0.06, noiseGateThreshold: 0.005 }),
      capture: monoCapture(),
      connection: reconnecting(14, 45000),
      sttLifecycle: "continuous",
      turnDetection: detection("noisy-room", "long-form", {
        silenceMs: 620,
        speechThreshold: 0.012,
        transcriptStabilityMs: 280
      })
    },
    reliability: {
      audioConditioning: conditioning({ maxGain: 2.9, noiseGateAttenuation: 0.08, noiseGateThreshold: 0.005 }),
      capture: monoCapture(),
      connection: reconnecting(14, 45000),
      sttLifecycle: "continuous",
      turnDetection: detection("noisy-room", "long-form")
    }
  };
})();
1334
/**
 * Resolves a named preset from PRESET_INPUTS into a complete runtime preset.
 *
 * @param name - Preset name; undefined or null selects "default".
 * @returns `{ audioConditioning, capture, connection, name, sttLifecycle,
 *   turnDetection }`, where audioConditioning and turnDetection are passed
 *   through their resolvers, capture falls back to mono / 16000 Hz,
 *   connection is a shallow copy, and sttLifecycle falls back to "continuous".
 * @throws {TypeError} When `name` is not an own key of PRESET_INPUTS.
 */
var resolveVoiceRuntimePreset = (name = "default") => {
  // The default parameter only covers undefined; treat null the same way.
  const presetName = name ?? "default";
  // Own-key check so inherited names such as "toString" are rejected too,
  // with a message that says what went wrong.
  if (!Object.prototype.hasOwnProperty.call(PRESET_INPUTS, presetName)) {
    const knownNames = Object.keys(PRESET_INPUTS).join(", ");
    throw new TypeError(`Unknown voice runtime preset "${String(presetName)}". Expected one of: ${knownNames}.`);
  }
  const preset = PRESET_INPUTS[presetName];
  return {
    audioConditioning: resolveAudioConditioningConfig(preset.audioConditioning),
    capture: {
      channelCount: preset.capture?.channelCount ?? 1,
      sampleRateHz: preset.capture?.sampleRateHz ?? 16000
    },
    connection: {
      ...preset.connection
    },
    name: presetName,
    sttLifecycle: preset.sttLifecycle ?? "continuous",
    turnDetection: resolveTurnDetectionConfig(preset.turnDetection)
  };
};
1350
+
1351
+ // src/client/controller.ts
1352
/**
 * Builds the controller's first state snapshot from a voice stream.
 *
 * The list fields (`assistantAudio`, `assistantTexts`, `turns`) are shallow
 * copies; recording always starts out as off with no recording error.
 *
 * @param {object} stream - Voice stream whose current fields are read.
 * @returns {object} Fresh state object.
 */
var createInitialState3 = (stream) => {
  const {
    assistantAudio,
    assistantTexts,
    error,
    isConnected,
    partial,
    sessionId,
    scenarioId,
    status,
    turns
  } = stream;
  return {
    assistantAudio: [...assistantAudio],
    assistantTexts: [...assistantTexts],
    error,
    isConnected,
    isRecording: false,
    partial,
    recordingError: null,
    sessionId,
    scenarioId,
    status,
    turns: [...turns]
  };
};
1365
/**
 * Creates a voice controller that pairs a voice stream with microphone capture
 * and exposes an immutable-snapshot store (`subscribe` / `getSnapshot`).
 *
 * @param {string} path - Passed through to `createVoiceStream`.
 * @param {object} [options] - Reads `preset`, `connection`, `capture`
 *   (`channelCount`, `sampleRateHz`, `onLevel`) and `autoStopOnComplete`.
 * @returns {object} Controller with live getters over the current state plus
 *   recording, audio and lifecycle methods.
 */
var createVoiceController = (path, options = {}) => {
  const preset = resolveVoiceRuntimePreset(options.preset);
  // Caller-supplied connection settings win over the preset's.
  const connectionOptions = {
    ...preset.connection,
    ...options.connection
  };
  const stream = createVoiceStream(path, connectionOptions);
  const subscribers = new Set();
  let capture = null;
  let state = createInitialState3(stream);

  const notify = () => {
    subscribers.forEach((subscriber) => {
      subscriber();
    });
  };

  // State is replaced, never mutated, so snapshot identity signals change.
  const patchState = (changes) => {
    state = {
      ...state,
      ...changes
    };
  };

  const releaseCapture = () => {
    capture?.stop();
    capture = null;
  };

  const readStreamFields = () => ({
    assistantAudio: [...stream.assistantAudio],
    assistantTexts: [...stream.assistantTexts],
    error: stream.error,
    isConnected: stream.isConnected,
    partial: stream.partial,
    sessionId: stream.sessionId,
    scenarioId: stream.scenarioId,
    status: stream.status,
    turns: [...stream.turns]
  });

  const sync = () => {
    patchState(readStreamFields());
    const finishedWhileRecording = state.status === "completed" && state.isRecording;
    // Auto-stop is on unless the caller explicitly passes `false`.
    if (options.autoStopOnComplete !== false && finishedWhileRecording) {
      releaseCapture();
      patchState({ isRecording: false });
    }
    notify();
  };

  const unsubscribeStream = stream.subscribe(sync);
  sync();

  // Lazily creates the microphone capture and reuses it until it is released.
  const ensureCapture = () => {
    if (!capture) {
      capture = createMicrophoneCapture({
        channelCount: options.capture?.channelCount ?? preset.capture.channelCount,
        onLevel: options.capture?.onLevel,
        onAudio: (audio) => stream.sendAudio(audio),
        sampleRateHz: options.capture?.sampleRateHz ?? preset.capture.sampleRateHz
      });
    }
    return capture;
  };

  const stopRecording = () => {
    releaseCapture();
    patchState({ isRecording: false });
    notify();
  };

  const startRecording = async () => {
    if (state.isRecording) {
      return;
    }
    try {
      patchState({ recordingError: null });
      notify();
      await ensureCapture().start();
      patchState({ isRecording: true });
      notify();
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      // Drop the failed capture so the next attempt builds a fresh one.
      capture = null;
      patchState({
        isRecording: false,
        recordingError: message
      });
      notify();
      throw error;
    }
  };

  const close = () => {
    unsubscribeStream();
    stopRecording();
    stream.close();
  };

  return {
    bindHTMX(bindingOptions) {
      return bindVoiceHTMX(stream, bindingOptions);
    },
    close,
    endTurn() {
      return stream.endTurn();
    },
    get error() {
      return state.error;
    },
    getServerSnapshot() {
      return state;
    },
    getSnapshot() {
      return state;
    },
    get isConnected() {
      return state.isConnected;
    },
    get isRecording() {
      return state.isRecording;
    },
    get partial() {
      return state.partial;
    },
    get recordingError() {
      return state.recordingError;
    },
    sendAudio(audio) {
      return stream.sendAudio(audio);
    },
    get sessionId() {
      return state.sessionId;
    },
    get scenarioId() {
      return state.scenarioId;
    },
    startRecording,
    get status() {
      return state.status;
    },
    stopRecording,
    subscribe(subscriber) {
      subscribers.add(subscriber);
      return () => {
        subscribers.delete(subscriber);
      };
    },
    async toggleRecording() {
      if (!state.isRecording) {
        await startRecording();
        return;
      }
      stopRecording();
    },
    get turns() {
      return state.turns;
    },
    get assistantTexts() {
      return state.assistantTexts;
    },
    get assistantAudio() {
      return state.assistantAudio;
    }
  };
};
1516
+ // src/client/duplex.ts
1517
// Input level at or above which barge-in fires when no threshold is supplied.
var DEFAULT_INTERRUPT_THRESHOLD = 0.08;

/**
 * Decides whether an input level should interrupt assistant playback.
 *
 * @param {number} level - Measured input level.
 * @param {object} [options] - Reads `enabled` (defaults to true) and
 *   `interruptThreshold` (defaults to `DEFAULT_INTERRUPT_THRESHOLD`).
 * @returns {*} The falsy `enabled` value when barge-in is disabled, otherwise
 *   whether `level` reaches the threshold.
 */
var shouldInterruptForLevel = (level, options = {}) => {
  const isEnabled = options.enabled ?? true;
  if (!isEnabled) {
    return isEnabled;
  }
  const threshold = options.interruptThreshold ?? DEFAULT_INTERRUPT_THRESHOLD;
  return level >= threshold;
};
1519
/**
 * Wires barge-in between a voice controller and an assistant audio player:
 * playback is interrupted when a partial transcript first appears, when the
 * input level crosses the threshold, or when audio is sent manually.
 *
 * @param {object} controller - Provides `partial`, `subscribe` and `sendAudio`.
 * @param {object} player - Provides `isPlaying` and `interrupt()`.
 * @param {object} [options] - Reads `enabled`, `interruptOnPartial` and the
 *   fields consumed by `shouldInterruptForLevel`.
 * @returns {{close: Function, handleLevel: Function, sendAudio: Function}}
 */
var bindVoiceBargeIn = (controller, player, options = {}) => {
  let previousPartial = controller.partial;

  const stopPlayback = () => {
    if (player.isPlaying && options.enabled !== false) {
      player.interrupt();
    }
  };

  const unsubscribe = controller.subscribe(() => {
    // Only the transition from "no partial" to "some partial" counts, and only
    // when partial-driven interruption has not been switched off.
    const partialJustAppeared = options.interruptOnPartial !== false && !previousPartial && controller.partial;
    if (partialJustAppeared) {
      stopPlayback();
    }
    previousPartial = controller.partial;
  });

  return {
    close() {
      unsubscribe();
    },
    handleLevel(level) {
      if (!shouldInterruptForLevel(level, options)) {
        return;
      }
      stopPlayback();
    },
    sendAudio(audio) {
      // Outgoing audio always tries to cut playback before it is forwarded.
      stopPlayback();
      controller.sendAudio(audio);
    }
  };
};
1552
/**
 * Creates a voice controller combined with an assistant audio player and a
 * barge-in binding between the two.
 *
 * @param {string} path - Passed through to `createVoiceController`.
 * @param {object} [options] - Controller options, plus `audioPlayer` (passed to
 *   `createVoiceAudioPlayer`) and `bargeIn` (passed to `bindVoiceBargeIn`).
 * @returns {object} Every property of the underlying controller, with `close`
 *   and `sendAudio` replaced by duplex-aware versions and `audioPlayer` /
 *   `interruptAssistant` added.
 */
var createVoiceDuplexController = (path, options = {}) => {
  // Assigned after the controller exists; `onLevel` may only fire later.
  let bargeInBinding = null;
  const controller = createVoiceController(path, {
    ...options,
    capture: {
      ...options.capture,
      onLevel: (level) => {
        bargeInBinding?.handleLevel(level);
        options.capture?.onLevel?.(level);
      }
    }
  });
  const audioPlayer = createVoiceAudioPlayer(controller, options.audioPlayer);
  bargeInBinding = bindVoiceBargeIn(controller, audioPlayer, options.bargeIn);

  const close = () => {
    bargeInBinding?.close();
    bargeInBinding = null;
    audioPlayer.close();
    controller.close();
  };
  const interruptAssistant = async () => {
    await audioPlayer.interrupt();
  };
  // Routed through the barge-in binding; a no-op once the controller is closed.
  const sendAudio = (audio) => {
    bargeInBinding?.sendAudio(audio);
  };

  // Copy property descriptors instead of spreading: `{ ...controller }` would
  // invoke each getter once and freeze `partial`, `status`, `isRecording`,
  // `turns`, etc. at their creation-time values.
  return Object.defineProperties({}, {
    ...Object.getOwnPropertyDescriptors(controller),
    ...Object.getOwnPropertyDescriptors({
      audioPlayer,
      close,
      interruptAssistant,
      sendAudio
    })
  });
};
604
1584
  export {
1585
+ decodeVoiceAudioChunk,
605
1586
  createVoiceStream,
1587
+ createVoiceDuplexController,
1588
+ createVoiceController,
606
1589
  createVoiceConnection,
1590
+ createVoiceAudioPlayer,
607
1591
  createMicrophoneCapture,
608
- bindVoiceHTMX
1592
+ bindVoiceHTMX,
1593
+ bindVoiceBargeIn
609
1594
  };