@absolutejs/voice 0.0.20 → 0.0.22-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/README.md +884 -4
  2. package/dist/angular/index.d.ts +1 -0
  3. package/dist/angular/index.js +759 -3
  4. package/dist/angular/voice-controller.service.d.ts +27 -0
  5. package/dist/angular/voice-stream.service.d.ts +6 -0
  6. package/dist/audioConditioning.d.ts +3 -0
  7. package/dist/client/actions.d.ts +48 -0
  8. package/dist/client/audioPlayer.d.ts +40 -0
  9. package/dist/client/connection.d.ts +5 -0
  10. package/dist/client/controller.d.ts +2 -0
  11. package/dist/client/duplex.d.ts +3 -0
  12. package/dist/client/htmxBootstrap.js +660 -167
  13. package/dist/client/index.d.ts +3 -0
  14. package/dist/client/index.js +991 -6
  15. package/dist/client/microphone.d.ts +4 -2
  16. package/dist/correction.d.ts +33 -0
  17. package/dist/fileStore.d.ts +27 -0
  18. package/dist/index.d.ts +15 -0
  19. package/dist/index.js +3721 -298
  20. package/dist/ops.d.ts +100 -0
  21. package/dist/presets.d.ts +13 -0
  22. package/dist/react/index.d.ts +1 -0
  23. package/dist/react/index.js +728 -3
  24. package/dist/react/useVoiceController.d.ts +26 -0
  25. package/dist/react/useVoiceStream.d.ts +7 -0
  26. package/dist/routing.d.ts +3 -0
  27. package/dist/runtimeOps.d.ts +23 -0
  28. package/dist/store.d.ts +2 -2
  29. package/dist/svelte/index.d.ts +1 -0
  30. package/dist/svelte/index.js +691 -3
  31. package/dist/telephony/response.d.ts +7 -0
  32. package/dist/telephony/twilio.d.ts +116 -0
  33. package/dist/testing/benchmark.d.ts +93 -2
  34. package/dist/testing/corrected.d.ts +41 -0
  35. package/dist/testing/duplex.d.ts +59 -0
  36. package/dist/testing/fixtures.d.ts +18 -2
  37. package/dist/testing/index.d.ts +5 -0
  38. package/dist/testing/index.js +6247 -402
  39. package/dist/testing/review.d.ts +143 -0
  40. package/dist/testing/sessionBenchmark.d.ts +92 -2
  41. package/dist/testing/stt.d.ts +3 -1
  42. package/dist/testing/telephony.d.ts +70 -0
  43. package/dist/testing/tts.d.ts +73 -0
  44. package/dist/turnDetection.d.ts +5 -1
  45. package/dist/turnProfiles.d.ts +6 -0
  46. package/dist/types.d.ts +487 -10
  47. package/dist/vue/index.d.ts +1 -0
  48. package/dist/vue/index.js +750 -3
  49. package/dist/vue/useVoiceController.d.ts +30 -0
  50. package/dist/vue/useVoiceStream.d.ts +11 -0
  51. package/fixtures/README.md +9 -0
  52. package/fixtures/manifest.json +59 -1
  53. package/fixtures/pcm/dialogue-three-clean.pcm +0 -0
  54. package/fixtures/pcm/dialogue-three-mixed.pcm +0 -0
  55. package/fixtures/pcm/dialogue-two-clean.pcm +0 -0
  56. package/fixtures/pcm/dialogue-two-noisy.pcm +0 -0
  57. package/package.json +135 -1
package/README.md CHANGED
@@ -25,17 +25,39 @@ Optional framework entrypoints:
25
25
 
26
26
  ```ts
27
27
  import { Elysia } from 'elysia';
28
- import { voice, createVoiceMemoryStore } from '@absolutejs/voice';
28
+ import {
29
+ voice,
30
+ createVoiceMemoryStore,
31
+ createPhraseHintCorrectionHandler
32
+ } from '@absolutejs/voice';
29
33
  import { deepgram } from '@absolutejs/voice-deepgram';
30
34
 
31
35
  const app = new Elysia()
32
36
  .use(
33
37
  voice({
34
38
  path: '/voice',
39
+ preset: 'guided-intake',
40
+ lexicon: [
41
+ {
42
+ text: 'AbsoluteJS',
43
+ aliases: ['absoloot js'],
44
+ pronunciation: 'ab-so-lute jay ess'
45
+ }
46
+ ],
47
+ phraseHints: [
48
+ { text: 'AbsoluteJS', aliases: ['absolute js'] },
49
+ { text: 'Joe Johnston', aliases: ['joe johnson'] }
50
+ ],
51
+ correctTurn: createPhraseHintCorrectionHandler(),
35
52
  onComplete: async ({ session }) => {
36
53
  console.log(session.turns);
37
54
  },
38
55
  async onTurn({ turn }) {
56
+ console.log('turn quality:', {
57
+ source: turn.quality?.source,
58
+ fallbackUsed: turn.quality?.fallbackUsed,
59
+ confidence: turn.quality?.averageConfidence
60
+ });
39
61
  return {
40
62
  assistantText: `You said: ${turn.text}`
41
63
  };
@@ -51,6 +73,602 @@ const app = new Elysia()
51
73
 
52
74
  `createVoiceMemoryStore()` is dev-only. Real deployments should provide a shared store backed by Redis, Postgres, or equivalent.
53
75
 
76
+ ## TTS
77
+
78
+ `@absolutejs/voice` now supports optional assistant audio streaming on the same session path. If you provide a `tts` adapter, `assistantText` responses are still sent as text, and the synthesized PCM chunks are streamed as `audio` messages alongside them.
79
+
80
+ ```ts
81
+ import { voice, createVoiceMemoryStore } from '@absolutejs/voice';
82
+ import { deepgram } from '@absolutejs/voice-deepgram';
83
+ import { elevenlabs } from '@absolutejs/voice-elevenlabs';
84
+
85
+ app.use(
86
+ voice({
87
+ path: '/voice',
88
+ session: createVoiceMemoryStore(),
89
+ stt: deepgram({
90
+ apiKey: process.env.DEEPGRAM_API_KEY!,
91
+ model: 'flux-general-en'
92
+ }),
93
+ tts: elevenlabs({
94
+ apiKey: process.env.ELEVENLABS_API_KEY!,
95
+ voiceId: process.env.ELEVENLABS_VOICE_ID!
96
+ }),
97
+ onTurn: async ({ turn }) => ({
98
+ assistantText: `You said: ${turn.text}`
99
+ }),
100
+ onComplete: async () => {}
101
+ })
102
+ );
103
+ ```
104
+
105
+ Client state now exposes `assistantAudio` on the stream/controller helpers, so apps can buffer or play synthesized chunks without inventing a second transport.
106
+
107
+ If you want a minimal browser playback path, use the client audio player:
108
+
109
+ ```ts
110
+ import {
111
+ createVoiceAudioPlayer,
112
+ createVoiceController
113
+ } from '@absolutejs/voice/client';
114
+
115
+ const voice = createVoiceController('/voice', {
116
+ preset: 'chat'
117
+ });
118
+ const player = createVoiceAudioPlayer(voice);
119
+
120
+ await player.start(); // call from a user gesture
121
+ await player.interrupt(); // flush queued assistant playback for barge-in
122
+ ```
123
+
124
+ `createVoiceAudioPlayer()` subscribes to `assistantAudio`, decodes raw `pcm_s16le` chunks, and queues them in WebAudio. It also exposes `interrupt()`, `lastInterruptLatencyMs`, and `lastPlaybackStopLatencyMs` so apps can flush assistant playback during barge-in and inspect how long it took for queued playback to fully stop.
125
+
126
+ For a higher-level client path, use the duplex helper:
127
+
128
+ ```ts
129
+ import { createVoiceDuplexController } from '@absolutejs/voice/client';
130
+
131
+ const voice = createVoiceDuplexController('/voice', {
132
+ bargeIn: {
133
+ interruptThreshold: 0.08
134
+ },
135
+ preset: 'chat'
136
+ });
137
+
138
+ await voice.audioPlayer.start();
139
+ await voice.startRecording();
140
+ ```
141
+
142
+ `createVoiceDuplexController()` composes the controller and audio player and automatically interrupts assistant playback when:
143
+
144
+ - microphone input crosses the configured barge-in threshold
145
+ - partial user speech starts arriving
146
+ - manual `sendAudio(...)` is called while assistant audio is playing
147
+
148
+ ## Duplex Benchmarks
149
+
150
+ The first duplex benchmark lane measures package-level barge-in interruption on the client path. It records scenario pass/fail plus local interruption latency for:
151
+
152
+ - manual `sendAudio(...)`
153
+ - partial transcript start
154
+ - input-level threshold crossing
155
+
156
+ Run it with:
157
+
158
+ ```bash
159
+ bun run bench:duplex
160
+ ```
161
+
162
+ That writes:
163
+
164
+ - `benchmark-results/duplex-barge-in.json`
165
+
166
+ ## Telephony
167
+
168
+ `@absolutejs/voice` now includes a first PSTN bridge layer for Twilio Media Streams. It converts inbound `audio/x-mulaw` 8 kHz frames into the PCM format the voice session expects, and converts assistant PCM audio back into outbound Twilio media events.
169
+
170
+ Minimal usage:
171
+
172
+ ```ts
173
+ import { createTwilioMediaStreamBridge, createTwilioVoiceResponse, createVoiceMemoryStore } from '@absolutejs/voice';
174
+ import { deepgram } from '@absolutejs/voice-deepgram';
175
+ import { elevenlabs } from '@absolutejs/voice-elevenlabs';
176
+
177
+ const twiml = createTwilioVoiceResponse({
178
+ streamUrl: 'wss://example.com/voice/twilio',
179
+ parameters: {
180
+ sessionId: 'call-123',
181
+ scenarioId: 'phone-intake'
182
+ },
183
+ track: 'both_tracks'
184
+ });
185
+
186
+ const bridge = createTwilioMediaStreamBridge(twilioSocket, {
187
+ context: {},
188
+ onComplete: async () => {},
189
+ onTurn: async ({ turn }) => ({
190
+ assistantText: `You said: ${turn.text}`
191
+ }),
192
+ session: createVoiceMemoryStore(),
193
+ stt: deepgram({
194
+ apiKey: process.env.DEEPGRAM_API_KEY!,
195
+ model: 'flux-general-en'
196
+ }),
197
+ tts: elevenlabs({
198
+ apiKey: process.env.ELEVENLABS_API_KEY!,
199
+ voiceId: process.env.ELEVENLABS_VOICE_ID!
200
+ })
201
+ });
202
+
203
+ await bridge.handleMessage(startMessageFromTwilio);
204
+ await bridge.handleMessage(mediaMessageFromTwilio);
205
+ ```
206
+
207
+ The bridge also sends Twilio `clear` events on new inbound media after assistant audio has started streaming, so telephony barge-in can stop queued outbound playback.
208
+
209
+ You can benchmark the package-level Twilio bridge path with:
210
+
211
+ ```bash
212
+ bun run bench:telephony:run
213
+ ```
214
+
215
+ That writes:
216
+ - `benchmark-results/telephony-twilio-bridge.json`
217
+ - `benchmark-results/telephony-run-manifest.json`
218
+
219
+ For a live vendor-backed duplex smoke benchmark on the real TTS adapters, run:
220
+
221
+ ```bash
222
+ bun run bench:duplex:live:run
223
+ ```
224
+
225
+ That writes fresh results to the `benchmark-results/duplex-live-*.json` files listed after the telephony shootout outputs below.
226
+
227
+ For a live vendor-backed telephony smoke benchmark through the Twilio bridge path, run:
228
+
229
+ ```bash
230
+ bun run bench:telephony:live:run
231
+ ```
232
+
233
+ That writes:
234
+ - `benchmark-results/telephony-live-deepgram-elevenlabs.json`
235
+ - `benchmark-results/telephony-live-run-manifest.json`
236
+
237
+ For a repeated live telephony stability read, run:
238
+
239
+ ```bash
240
+ bun run bench:telephony:live:series
241
+ ```
242
+
243
+ That writes:
244
+ - `benchmark-results/telephony-live-series-summary-runs-3.json`
245
+
246
+ For a live Deepgram telephony model shootout on the same PSTN path, run:
247
+
248
+ ```bash
249
+ bun run bench:telephony:live:shootout
250
+ ```
251
+
252
+ That writes:
253
+ - `benchmark-results/telephony-live-flux-general-en.json`
254
+ - `benchmark-results/telephony-live-nova-3-phone.json`
255
+ - `benchmark-results/telephony-live-shootout-manifest.json`
256
+
257
+ - `benchmark-results/duplex-live-elevenlabs.json`
258
+ - `benchmark-results/duplex-live-openai.json`
259
+ - `benchmark-results/duplex-live-all.json`
260
+ - `benchmark-results/duplex-live-run-manifest.json`
261
+
262
+ For a browser-run duplex benchmark that uses a real headless Chrome `AudioContext` instead of the fake Node-side playback context, run:
263
+
264
+ ```bash
265
+ bun run bench:duplex:browser:run
266
+ ```
267
+
268
+ That writes fresh results to:
269
+
270
+ - `benchmark-results/duplex-browser-elevenlabs.json`
271
+ - `benchmark-results/duplex-browser-openai.json`
272
+ - `benchmark-results/duplex-browser-all.json`
273
+ - `benchmark-results/duplex-browser-run-manifest.json`
274
+
275
+ To measure browser duplex stability across repeated runs, use:
276
+
277
+ ```bash
278
+ bun run bench:duplex:browser:series
279
+ ```
280
+
281
+ That writes:
282
+
283
+ - `benchmark-results/duplex-browser-series-summary-runs-3.json`
284
+ - per-run provider artifacts like `benchmark-results/duplex-browser-elevenlabs-series-run-1.json`
285
+
286
+ For repeated interrupt-and-resume across several consecutive assistant turns, run:
287
+
288
+ ```bash
289
+ bun run bench:duplex:browser:overlap:run
290
+ ```
291
+
292
+ That writes:
293
+
294
+ - `benchmark-results/duplex-browser-overlap-elevenlabs.json`
295
+ - `benchmark-results/duplex-browser-overlap-openai.json`
296
+ - `benchmark-results/duplex-browser-overlap-all.json`
297
+ - `benchmark-results/duplex-browser-overlap-run-manifest.json`
298
+
299
+ To measure overlap stability across repeated live browser runs, use:
300
+
301
+ ```bash
302
+ bun run bench:duplex:browser:overlap:series
303
+ ```
304
+
305
+ That writes:
306
+
307
+ - `benchmark-results/duplex-browser-overlap-series-summary-runs-3.json`
308
+ - per-run provider artifacts like `benchmark-results/duplex-browser-overlap-elevenlabs-series-run-1.json`
309
+
310
+ ## TTS Benchmarks
311
+
312
+ `@absolutejs/voice` now includes a first TTS benchmark harness for streaming output adapters. The initial metrics are:
313
+
314
+ - `firstAudioLatencyMs`
315
+ - `elapsedMs`
316
+ - `audioChunkCount`
317
+ - `totalAudioBytes`
318
+ - estimated PCM `audioDurationMs`
319
+ - interruption responsiveness via `interruptionLatencyMs`
320
+
321
+ Run the full TTS suite with one command:
322
+
323
+ ```bash
324
+ bun run bench:tts:run
325
+ ```
326
+
327
+ That writes fresh results to:
328
+
329
+ - `benchmark-results/tts-all.json`
330
+ - `benchmark-results/tts-elevenlabs.json`
331
+ - `benchmark-results/tts-openai.json`
332
+ - `benchmark-results/tts-run-manifest.json`
333
+
334
+ To measure interruption/cancel responsiveness separately:
335
+
336
+ ```bash
337
+ bun run bench:tts:interrupt:run
338
+ ```
339
+
340
+ That writes fresh interruption results to:
341
+
342
+ - `benchmark-results/tts-all-interrupt.json`
343
+ - `benchmark-results/tts-elevenlabs-interrupt.json`
344
+ - `benchmark-results/tts-openai-interrupt.json`
345
+ - `benchmark-results/tts-interrupt-run-manifest.json`
346
+
347
+ ## Recommended Production Path
348
+
349
+ The current best-performing path in the bundled benchmarks is:
350
+
351
+ - `deepgram-flux` as primary STT
352
+ - route-level `lexicon` for pronunciation/domain entries
353
+ - route-level `phraseHints`
354
+ - route-level `correctTurn` using `createPhraseHintCorrectionHandler()`
355
+
356
+ That combination outperformed the raw vendor-only paths in the package benchmarks because it lets AbsoluteJS repair domain-specific terms after strong base transcription instead of depending on a second STT vendor to rescue hard turns.
357
+
358
+ Minimal production-oriented example:
359
+
360
+ ```ts
361
+ import {
362
+ createVoiceMemoryStore,
363
+ createPhraseHintCorrectionHandler,
364
+ resolveVoiceSTTRoutingStrategy,
365
+ voice
366
+ } from '@absolutejs/voice';
367
+ import { deepgram } from '@absolutejs/voice-deepgram';
368
+
369
+ app.use(
370
+ voice({
371
+ path: '/voice/intake',
372
+ preset: 'reliability',
373
+ lexicon: [
374
+ {
375
+ text: 'AbsoluteJS',
376
+ aliases: ['absoloot js'],
377
+ pronunciation: 'ab-so-lute jay ess'
378
+ }
379
+ ],
380
+ phraseHints: [
381
+ { text: 'AbsoluteJS', aliases: ['absolute js'] },
382
+ { text: 'Joe Johnston', aliases: ['joe johnson'] },
383
+ {
384
+ text: 'beneath well thatched trees that shed the rain like a roof',
385
+ aliases: ['beneath wealth', 'shelter beneath wealth']
386
+ }
387
+ ],
388
+ correctTurn: createPhraseHintCorrectionHandler(),
389
+ session: createVoiceMemoryStore(),
390
+ stt: deepgram({
391
+ apiKey: process.env.DEEPGRAM_API_KEY!,
392
+ model: 'flux-general-en'
393
+ }),
394
+ onTurn: async ({ turn }) => ({
395
+ assistantText: `Captured: ${turn.text}`
396
+ }),
397
+ onComplete: async () => {}
398
+ })
399
+ );
400
+ ```
401
+
402
+ `phraseHints` are user-controlled route config, not hidden framework magic. They are there so the app can teach the voice route its domain vocabulary.
403
+
404
+ ## Best Vs Cheap STT
405
+
406
+ `@absolutejs/voice` now exposes an explicit package-level routing split so apps can choose between the strongest benchmarked path and a cheaper/raw path without inventing their own policy layer.
407
+
408
+ ```ts
409
+ import {
410
+ createVoiceMemoryStore,
411
+ createVoiceSTTRoutingCorrectionHandler,
412
+ resolveVoiceSTTRoutingStrategy,
413
+ voice
414
+ } from '@absolutejs/voice';
415
+ import { deepgram } from '@absolutejs/voice-deepgram';
416
+
417
+ const strategy = resolveVoiceSTTRoutingStrategy('best');
418
+
419
+ app.use(
420
+ voice({
421
+ path: '/voice/stt',
422
+ preset: strategy.preset,
423
+ phraseHints: [{ text: 'Joe Johnston', aliases: ['joe johnson'] }],
424
+ correctTurn: createVoiceSTTRoutingCorrectionHandler(strategy.correctionMode),
425
+ session: createVoiceMemoryStore(),
426
+ sttLifecycle: strategy.sttLifecycle,
427
+ stt: deepgram({
428
+ apiKey: process.env.DEEPGRAM_API_KEY!,
429
+ model: 'flux-general-en'
430
+ })
431
+ })
432
+ );
433
+ ```
434
+
435
+ - `best` maps to the current strongest in-package path: Deepgram Flux plus generic deterministic correction.
436
+ - `low-cost` maps to a cheaper/raw package path: one primary STT pass with no correction hook.
437
+ - session benchmarks now include per-turn cost telemetry fields like `averageRelativeCostUnits`, `averagePrimaryAudioMs`, and `averageFallbackReplayAudioMs`.
438
+ - use `bun run bench:stt:routing:run` to benchmark both in parallel and write fresh results to:
439
+ - `benchmark-results/sessions-best-stt-runs-3.json`
440
+ - `benchmark-results/sessions-cheap-stt-runs-3.json`
441
+ - `benchmark-results/stt-routing-run-manifest.json`
442
+
443
+ ## Presets
444
+
445
+ Voice now ships named runtime presets so apps can start from a useful baseline instead of hand-tuning silence and capture settings every time.
446
+
447
+ - `default`
448
+ - `chat`
449
+ - `guided-intake`
450
+ - `dictation`
451
+ - `noisy-room`
452
+ - `reliability`
453
+
454
+ On the server:
455
+
456
+ ```ts
457
+ voice({
458
+ path: '/voice/intake',
459
+ preset: 'guided-intake',
460
+ session: createVoiceMemoryStore(),
461
+ stt: deepgram({
462
+ apiKey: process.env.DEEPGRAM_API_KEY!,
463
+ model: 'nova-3'
464
+ }),
465
+ onTurn: async ({ turn }) => ({
466
+ assistantText: `Captured: ${turn.text}`
467
+ }),
468
+ onComplete: async () => {}
469
+ });
470
+ ```
471
+
472
+ On the client:
473
+
474
+ ```ts
475
+ import { createVoiceController } from '@absolutejs/voice/client';
476
+
477
+ const voice = createVoiceController('/voice/intake', {
478
+ preset: 'guided-intake'
479
+ });
480
+
481
+ await voice.startRecording();
482
+ voice.endTurn();
483
+ voice.stopRecording();
484
+ ```
485
+
486
+ Presets are still overridable. If you need to tune for a specific route, layer `turnDetection` or `audioConditioning` on top of the preset instead of replacing the whole setup.
487
+
488
+ Presets are not the same thing as phrase hints:
489
+
490
+ - presets tune framework-owned behavior like silence windows, reconnect defaults, and audio conditioning
491
+ - `lexicon` tunes pronunciation-aware domain entries that should reach STT/TTS adapters directly
492
+ - `phraseHints` tune app/domain vocabulary like company names, product names, legal phrases, or subscriber-specific jargon
493
+
494
+ In practice:
495
+
496
+ - use a preset to choose the runtime shape (`guided-intake`, `reliability`, `noisy-room`)
497
+ - use `lexicon` when pronunciation matters and you want adapter-consumable entries
498
+ - use `phraseHints` to teach the route what words matter for your business
499
+ - use `correctTurn` when you want deterministic post-STT repair before the turn is committed
500
+
501
+ ## Framework Helpers
502
+
503
+ The package now exposes higher-level controller helpers as well as the lower-level stream primitives.
504
+
505
+ - `@absolutejs/voice/client`
506
+ - `createVoiceController()`
507
+ - `createVoiceStream()`
508
+ - `bindVoiceHTMX()`
509
+ - `@absolutejs/voice/react`
510
+ - `useVoiceController()`
511
+ - `useVoiceStream()`
512
+ - `@absolutejs/voice/vue`
513
+ - `useVoiceController()`
514
+ - `useVoiceStream()`
515
+ - `@absolutejs/voice/svelte`
516
+ - `createVoiceController()`
517
+ - `createVoiceStream()`
518
+ - `@absolutejs/voice/angular`
519
+ - `VoiceControllerService`
520
+ - `VoiceStreamService`
521
+
522
+ The controller helpers abstract the common browser boilerplate:
523
+
524
+ - microphone capture
525
+ - start / stop / toggle recording
526
+ - stream subscription state
527
+ - HTMX session syncing
528
+
529
+ They do not hide the underlying transport. You still choose the route path and preset explicitly.
530
+
531
+ ## Lexicon, Phrase Hints, And Correction
532
+
533
+ `lexicon` is a route-level input for pronunciation-aware domain entries.
534
+
535
+ It can be:
536
+
537
+ - a static array for known names, products, and jargon
538
+ - a resolver function when entries depend on the tenant, subscriber, or scenario
539
+
540
+ ```ts
541
+ voice({
542
+ path: '/voice/intake',
543
+ lexicon: async ({ context }) => {
544
+ return [
545
+ {
546
+ text: 'AbsoluteJS',
547
+ aliases: ['absoloot js'],
548
+ pronunciation: 'ab-so-lute jay ess'
549
+ },
550
+ {
551
+ text: 'Eden Treaty',
552
+ aliases: ['eden tree tea'],
553
+ pronunciation: 'ee-den tree-tee'
554
+ }
555
+ ];
556
+ },
557
+ session: createVoiceMemoryStore(),
558
+ stt: deepgram({
559
+ apiKey: process.env.DEEPGRAM_API_KEY!,
560
+ model: 'flux-general-en'
561
+ }),
562
+ onTurn: async ({ turn }) => ({
563
+ assistantText: turn.text
564
+ }),
565
+ onComplete: async () => {}
566
+ });
567
+ ```
568
+
569
+ How the package uses it:
570
+
571
+ - adapters receive `lexicon` at open time and translate it into vendor-native hinting surfaces when possible
572
+ - STT adapters can use the canonical text plus aliases to bias recognition
573
+ - future TTS adapters can use the same entries for pronunciation-aware speech output
574
+
575
+ `phraseHints` are a separate route-level input that the application owns.
576
+
577
+ They can be:
578
+
579
+ - a static array for known domain vocabulary
580
+ - a resolver function when hints depend on the authenticated user, tenant, scenario, or subscriber record
581
+
582
+ ```ts
583
+ voice({
584
+ path: '/voice/intake',
585
+ preset: 'reliability',
586
+ phraseHints: async ({ context, scenarioId, sessionId }) => {
587
+ return [
588
+ { text: 'AbsoluteJS', aliases: ['absolute js'] },
589
+ { text: 'Eden Treaty', aliases: ['eden treaty'] },
590
+ { text: 'Joe Johnston', aliases: ['joe johnson'] }
591
+ ];
592
+ },
593
+ correctTurn: createPhraseHintCorrectionHandler(),
594
+ session: createVoiceMemoryStore(),
595
+ stt: deepgram({
596
+ apiKey: process.env.DEEPGRAM_API_KEY!,
597
+ model: 'flux-general-en'
598
+ }),
599
+ onTurn: async ({ turn }) => ({
600
+ assistantText: turn.text
601
+ }),
602
+ onComplete: async () => {}
603
+ });
604
+ ```
605
+
606
+ How the package uses them:
607
+
608
+ - adapters receive `lexicon` and `phraseHints` at open time
609
+ - adapters can translate `phraseHints` into vendor-native hinting surfaces
610
+ - the correction layer can use the same hints after STT to repair domain terms before commit
611
+
612
+ Current built-in correction helper:
613
+
614
+ ```ts
615
+ import { createPhraseHintCorrectionHandler } from '@absolutejs/voice';
616
+
617
+ const correctTurn = createPhraseHintCorrectionHandler();
618
+ ```
619
+
620
+ This helper is intentionally deterministic. It is for phrase normalization and domain repair, not for hiding an LLM behind your turn commit. If you need something more advanced, provide your own `correctTurn` handler.
621
+
622
+ ### React
623
+
624
+ ```tsx
625
+ import { useVoiceController } from '@absolutejs/voice/react';
626
+
627
+ export function VoiceWidget() {
628
+ const voice = useVoiceController('/voice/intake', {
629
+ preset: 'guided-intake'
630
+ });
631
+
632
+ return (
633
+ <button onClick={() => void voice.toggleRecording()}>
634
+ {voice.isRecording ? 'Stop microphone' : 'Start microphone'}
635
+ </button>
636
+ );
637
+ }
638
+ ```
639
+
640
+ ### Vue
641
+
642
+ ```ts
643
+ import { useVoiceController } from '@absolutejs/voice/vue';
644
+
645
+ const voice = useVoiceController('/voice/intake', {
646
+ preset: 'guided-intake'
647
+ });
648
+ ```
649
+
650
+ ### Svelte
651
+
652
+ ```ts
653
+ import { createVoiceController } from '@absolutejs/voice/svelte';
654
+
655
+ const voice = createVoiceController('/voice/intake', {
656
+ preset: 'guided-intake'
657
+ });
658
+ ```
659
+
660
+ ### Angular
661
+
662
+ ```ts
663
+ import { VoiceControllerService } from '@absolutejs/voice/angular';
664
+
665
+ constructor(private readonly voice: VoiceControllerService) {}
666
+
667
+ controller = this.voice.connect('/voice/intake', {
668
+ preset: 'guided-intake'
669
+ });
670
+ ```
671
+
54
672
  ## HTMX
55
673
 
56
674
  Voice now mirrors the AI plugin's HTMX pattern with plugin-owned renderers and a plugin-owned fragment route.
@@ -91,14 +709,148 @@ The plugin exposes `GET /voice/intake/htmx/session?sessionId=...` by default. Th
91
709
  On the client, bind the browser voice stream to a hidden HTMX refresh element:
92
710
 
93
711
  ```ts
94
- import { bindVoiceHTMX, createVoiceStream } from '@absolutejs/voice/client';
712
+ import { createVoiceController } from '@absolutejs/voice/client';
95
713
 
96
- const voice = createVoiceStream('/voice/intake');
97
- bindVoiceHTMX(voice, { element: '#voice-htmx-sync' });
714
+ const voice = createVoiceController('/voice/intake', {
715
+ preset: 'guided-intake'
716
+ });
717
+ voice.bindHTMX({ element: '#voice-htmx-sync' });
98
718
  ```
99
719
 
100
720
  That keeps HTMX pages declarative without inventing custom fragment endpoints for core voice session UI.
101
721
 
722
+ ## Competitive Benchmarking
723
+
724
+ The package includes a competitive benchmark harness for STT quality and responsiveness.
725
+
726
+ Run:
727
+
728
+ ```bash
729
+ bun run bench:vs
730
+ ```
731
+
732
+ Use profiles to focus where you want to win:
733
+
734
+ - `bun run bench:vs all` (default)
735
+ - `bun run bench:vs all accents`
736
+ - `bun run bench:vs all code-switch`
737
+ - `bun run bench:vs all jargon`
738
+ - `bun run bench:vs all multilingual`
739
+ - `bun run bench:vs all multi-speaker`
740
+ - `bun run bench:vs all telephony`
741
+ - `bun run bench:vs all clean`
742
+ - `bun run bench:vs all noisy`
743
+ - `bun run bench:vs deepgram accents`
744
+ - `bun run bench:vs deepgram-flux accents` (compares the Flux candidate; by default includes Vapi output if configured)
745
+ - `bun run bench:vs deepgram-nova accents`
746
+
747
+ Current benchmark guidance:
748
+
749
+ - use `deepgram-flux` as the primary conversational STT path
750
+ - prefer route-level `phraseHints` plus `correctTurn` over cross-vendor fallback for domain-specific accuracy
751
+ - use fallback vendors only when your own traffic proves they beat the package-level correction path
752
+ - do not treat `openai` as the default STT path unless your own benchmarks prove it for your traffic
753
+
754
+ If you use a VAPI baseline file, you can run a direct model comparison:
755
+
756
+ ```bash
757
+ bun run bench:vs:deepgram-flux
758
+ ```
759
+
760
+ To benchmark Nova vs Flux back-to-back, set the model explicitly:
761
+
762
+ ```bash
763
+ DEEPGRAM_MODEL=flux-general-en bun run bench:deepgram:accents
764
+ DEEPGRAM_MODEL=nova-3 bun run bench:deepgram:accents
765
+ ```
766
+
767
+ To stress the STT path with synthesized narrowband phone audio, as well as the jargon and multi-speaker lanes:
768
+
769
+ ```bash
770
+ bun run bench:telephony
771
+ bun run bench:telephony:run
772
+ bun run bench:deepgram:telephony
773
+ bun run bench:deepgram:corrected:telephony
774
+ bun run bench:jargon
775
+ bun run bench:deepgram:jargon
776
+ bun run bench:deepgram:corrected:audit:jargon
777
+ bun run bench:multi-speaker:run
778
+ bun run bench:multi-speaker:analyze
779
+ bun run bench:deepgram:multi-speaker
780
+ ```
781
+
782
+ To compare against Vapi or other providers, provide a baseline JSON file:
783
+
784
+ ```bash
785
+ bun run bench:vs all accents --compare /path/to/vapi-baseline.json
786
+ ```
787
+
788
+ Expected benchmark payload:
789
+
790
+ ```json
791
+ {
792
+ "source": "vapi",
793
+ "results": [
794
+ {
795
+ "adapterId": "vapi-baseline",
796
+ "summary": {
797
+ "passRate": 0.0,
798
+ "averageWordErrorRate": 1.0,
799
+ "averageTermRecall": 0.0,
800
+ "averageElapsedMs": 0,
801
+ "averageTimeToEndOfTurnMs": 0,
802
+ "averageTimeToFirstFinalMs": 0,
803
+ "averageTimeToFirstPartialMs": 0,
804
+ "wordAccuracyRate": 0.0
805
+ }
806
+ }
807
+ ]
808
+ }
809
+ ```
810
+
811
+ For a fast parse-only validation of arguments:
812
+
813
+ ```bash
814
+ bun run ./scripts/benchmark-vs.ts --dry-run
815
+ ```
816
+
817
+ The harness prints:
818
+
819
+ - pass rate and recall deltas per adapter
820
+ - weighted scorecard (`passRate`, term recall, word accuracy)
821
+ - optional competitor deltas (Vapi)
822
+ - a markdown report beside the JSON output, for example:
823
+ - `benchmark-results/vs-all-telephony.json`
824
+ - `benchmark-results/vs-all-telephony.md`
825
+
826
+ For package-level multi-turn behavior, use the session benchmark harness instead of raw STT-only benchmarking:
827
+
828
+ ```bash
829
+ bun run bench:sessions
830
+ bun run bench:deepgram:sessions
831
+ bun run bench:deepgram:soak:sessions
832
+ bun run bench:deepgram:hybrid:sessions
833
+ bun run bench:deepgram:corrected:sessions
834
+ bun run bench:deepgram:corrected:soak:sessions
835
+ bun run bench:stt:routing:run
836
+ bun run bench:assemblyai:sessions
837
+ bun run bench:openai:sessions
838
+ bun run bench:soak:run
839
+ ```
840
+
841
+ That harness runs the adapter through `VoiceSession` itself, so the output reflects reconnect handling, turn commit stability, and duplicate-turn protection rather than only raw transcript quality.
842
+
843
+ `bench:soak:run` is the STT-5 runner. It executes the long-session soak lane for raw Deepgram Flux, corrected Deepgram, and the reconnect resilience suite in parallel, then writes fresh JSON into `benchmark-results/` without the runs deleting each other's output.
844
+
845
+ `bench:stt:routing:run` is the STT-7 runner. It benchmarks the package’s current `best` vs `low-cost` session strategies in parallel, clears stale outputs first, and writes a manifest so the cost-aware summaries are guaranteed fresh.
846
+
847
+ `bench:deepgram:corrected:sessions` exercises the current recommended package-level production path:
848
+
849
+ - Deepgram Flux as primary STT
850
+ - phrase hints routed through the adapter layer
851
+ - committed-turn correction via `createPhraseHintCorrectionHandler()`
852
+ - core turn dedupe, reconnect, and transcript selection still owned by `@absolutejs/voice`
853
+
102
854
  ## Adapter Contract
103
855
 
104
856
  Adapters normalize vendor behavior into a core event model so the plugin never branches on vendor names.
@@ -185,6 +937,134 @@ Default reconnect strategy is `resume-last-turn`.
185
937
 
186
938
  If an adapter does not emit native end-of-turn events, core falls back to silence detection with a default `700ms` threshold.
187
939
 
940
+ ## STT Fallback
941
+
942
+ You can pair a primary vendor with an optional fallback vendor per route when you need extra reliability for accents, edge environments, or short commands.
943
+
944
+ ```ts
945
+ voice({
946
+ path: '/voice/intake',
947
+ preset: 'default',
948
+ session: createVoiceMemoryStore(),
949
+ stt: deepgram({ apiKey: process.env.DEEPGRAM_API_KEY!, model: 'nova-3' }),
950
+ sttFallback: {
951
+ adapter: assemblyai({ apiKey: process.env.ASSEMBLYAI_API_KEY! }),
952
+ trigger: 'empty-or-low-confidence',
953
+ confidenceThreshold: 0.65,
954
+ minTextLength: 2,
955
+ replayWindowMs: 8000,
956
+ settleMs: 220,
957
+ maxAttemptsPerTurn: 1
958
+ },
959
+ onTurn: async ({ turn }) => {
960
+ return { assistantText: `Captured: ${turn.text}` };
961
+ },
962
+ onComplete: async () => {}
963
+ });
964
+ ```
965
+
966
+ Fallback triggers are evaluated at commit time:
967
+
968
+ - `empty-turn`: the committed turn is empty (fewer than `minTextLength` words), so the fallback is attempted
969
+ - `low-confidence`: average transcript confidence is below `confidenceThreshold`
970
+ - `empty-or-low-confidence`: the fallback is attempted when either of the two conditions above is met
971
+
972
+ The fallback adapter receives the same window of turn audio as the primary (default `8s`, configurable with `replayWindowMs`) and runs at most `maxAttemptsPerTurn` times per turn.
973
+
974
+ ## Benchmark Fixture Sources
975
+
976
+ Bundled fixtures cover the current in-repo English benchmark suite. For multilingual and code-switch evaluation, add external fixture directories and let the benchmark scripts merge them automatically.
977
+
978
+ The public corpus builder currently assembles:
979
+
980
+ - FLEURS multilingual dev clips
981
+ - BSC Catalan-Spanish code-switch evaluation clips
982
+ - CoSHE Hindi-English code-switch evaluation clips
983
+
984
+ Set either:
985
+
986
+ - `VOICE_FIXTURE_DIR=/abs/path/to/fixtures`
987
+ - `VOICE_FIXTURE_DIRS=/abs/path/one,/abs/path/two`
988
+
989
+ Each fixture directory must include:
990
+
991
+ - `manifest.json`
992
+ - `pcm/*.pcm`
993
+
994
+ Each manifest entry can include:
995
+
996
+ - `language`
997
+ - `tags`
998
+ Use `multilingual`, `bilingual`, or `code-switch` to route fixtures into the multilingual benchmark lane.
999
+
1000
+ Benchmark commands:
1001
+
1002
+ ```bash
1003
+ bun run bench:multilingual
1004
+ bun run bench:code-switch
1005
+ bun run bench:code-switch:series
1006
+ bun run bench:code-switch:ca-es
1007
+ bun run bench:code-switch:ca-es:series
1008
+ bun run bench:code-switch:ca-es:corts:series
1009
+ bun run bench:code-switch:ca-es:parlament:series
1010
+ bun run bench:code-switch:hi-en
1011
+ bun run bench:code-switch:hi-en:series
1012
+ bun run bench:deepgram:multilingual
1013
+ bun run bench:deepgram:code-switch
1014
+ bun run bench:deepgram:code-switch:series
1015
+ bun run bench:deepgram:code-switch:ca-es
1016
+ bun run bench:deepgram:code-switch:ca-es:series
1017
+ bun run bench:deepgram:code-switch:ca-es:corts:series
1018
+ bun run bench:deepgram:code-switch:ca-es:parlament:series
1019
+ bun run bench:deepgram:code-switch:ca-es:nova3-multi:series
1020
+ bun run bench:deepgram:code-switch:ca-es:nova3-ca:series
1021
+ bun run bench:deepgram:code-switch:ca-es:nova3-es:series
1022
+ bun run bench:deepgram:code-switch:ca-es:nova2-ca:series
1023
+ bun run bench:deepgram:code-switch:ca-es:nova2-es:series
1024
+ bun run bench:deepgram:code-switch:ca-es:best:corrected:series
1025
+ bun run bench:deepgram:code-switch:ca-es:parlament:debug
1026
+ bun run bench:deepgram:code-switch:corrected:ca-es
1027
+ bun run bench:deepgram:code-switch:corrected:ca-es:series
1028
+ bun run bench:deepgram:code-switch:corrected:ca-es:corts:series
1029
+ bun run bench:deepgram:code-switch:corrected:ca-es:parlament:series
1030
+ bun run bench:deepgram:code-switch:hi-en
1031
+ bun run bench:deepgram:code-switch:hi-en:series
1032
+ bun run bench:deepgram:code-switch:corrected:hi-en
1033
+ bun run bench:deepgram:code-switch:corrected:hi-en:series
1034
+ bun run bench:deepgram:code-switch:corrected
1035
+ bun run bench:deepgram:code-switch:corrected:series
1036
+ bun run bench:assemblyai:multilingual
1037
+ bun run bench:assemblyai:code-switch
1038
+ bun run bench:openai:multilingual
1039
+ bun run bench:openai:code-switch
1040
+ bun run bench:openai:code-switch:series
1041
+ bun run bench:openai:code-switch:ca-es
1042
+ bun run bench:openai:code-switch:ca-es:series
1043
+ bun run bench:openai:code-switch:corrected:ca-es
1044
+ bun run bench:openai:code-switch:corrected:ca-es:series
1045
+ bun run bench:openai:code-switch:hi-en
1046
+ bun run bench:openai:code-switch:hi-en:series
1047
+ bun run bench:openai:code-switch:corrected:hi-en
1048
+ bun run bench:openai:code-switch:corrected:hi-en:series
1049
+ bun run bench:openai:code-switch:corrected
1050
+ bun run bench:openai:code-switch:corrected:series
1051
+ ```
1052
+
1053
+ Current benchmark direction:
1054
+
1055
+ - `openai` is the strongest adapter on the current public multilingual corpus
1056
+ - `deepgram` remains the strongest browser-English path
1057
+ - raw code-switch remains a weaker surface for every adapter and should be benchmarked separately with `bench:code-switch`
1058
+ - jargon-heavy/domain-heavy English terms now have their own profile; use `bench:jargon` for the cross-adapter read and `bench:deepgram:corrected:audit:jargon` to compare `raw` vs `generic` vs `experimental` vs `benchmarkSeeded`
1059
+ - code-switch should be treated as language-pair-specific, not one universal lane; `ca-es` and `hi-en` now have dedicated series commands
1060
+ - `ca-es` also has a dedicated Deepgram model/language shootout lane so you can compare `nova-3`/`nova-2` with `multi`, `ca`, and `es` routing without overwriting results
1061
+ - current best `ca-es` base path is `deepgram` `nova-3` with `language=ca`; the short runner script uses that path for corrected series
1062
+ - `ca-es` is also split by source now: `corts_valencianes` and `parlament_parla` can be benchmarked independently, and `parlament_parla` has a dedicated transcript dump script
1063
+ - corrected code-switch runs now have dedicated lexicon-driven series commands so raw and corrected stability can be compared directly
1064
+ - multi-speaker diarization is now its own benchmark surface; use `bench:multi-speaker:run` for the parallel cross-adapter plus Deepgram-specific read
1065
+ - when tuning diarization specifically, use `bench:multi-speaker:analyze` to split Deepgram into clean vs noisy handoff lanes, include a corrected noisy read, and emit a speaker-pattern debug dump
1066
+ - use the `:series` commands when you need stability rather than a single-pass snapshot
1067
+
188
1068
  ## Client Primitives
189
1069
 
190
1070
  Browser and framework helpers sit on top of the same connection core: