@absolutejs/voice 0.0.20 → 0.0.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/README.md +387 -4
  2. package/dist/angular/index.d.ts +1 -0
  3. package/dist/angular/index.js +669 -3
  4. package/dist/angular/voice-controller.service.d.ts +21 -0
  5. package/dist/audioConditioning.d.ts +3 -0
  6. package/dist/client/actions.d.ts +7 -0
  7. package/dist/client/connection.d.ts +5 -0
  8. package/dist/client/controller.d.ts +2 -0
  9. package/dist/client/htmxBootstrap.js +576 -167
  10. package/dist/client/index.d.ts +1 -0
  11. package/dist/client/index.js +486 -3
  12. package/dist/client/microphone.d.ts +4 -2
  13. package/dist/correction.d.ts +16 -0
  14. package/dist/index.d.ts +4 -0
  15. package/dist/index.js +1314 -283
  16. package/dist/presets.d.ts +13 -0
  17. package/dist/react/index.d.ts +1 -0
  18. package/dist/react/index.js +642 -3
  19. package/dist/react/useVoiceController.d.ts +20 -0
  20. package/dist/react/useVoiceStream.d.ts +1 -0
  21. package/dist/store.d.ts +2 -2
  22. package/dist/svelte/index.d.ts +1 -0
  23. package/dist/svelte/index.js +607 -3
  24. package/dist/testing/benchmark.d.ts +36 -0
  25. package/dist/testing/index.js +1453 -241
  26. package/dist/testing/sessionBenchmark.d.ts +67 -2
  27. package/dist/testing/stt.d.ts +1 -0
  28. package/dist/turnDetection.d.ts +5 -1
  29. package/dist/turnProfiles.d.ts +6 -0
  30. package/dist/types.d.ts +198 -8
  31. package/dist/vue/index.d.ts +1 -0
  32. package/dist/vue/index.js +660 -3
  33. package/dist/vue/useVoiceController.d.ts +19 -0
  34. package/fixtures/README.md +9 -0
  35. package/fixtures/manifest.json +59 -1
  36. package/fixtures/pcm/dialogue-three-clean.pcm +0 -0
  37. package/fixtures/pcm/dialogue-three-mixed.pcm +0 -0
  38. package/fixtures/pcm/dialogue-two-clean.pcm +0 -0
  39. package/fixtures/pcm/dialogue-two-noisy.pcm +0 -0
  40. package/package.json +21 -1
package/README.md CHANGED
@@ -25,17 +25,32 @@ Optional framework entrypoints:
25
25
 
26
26
  ```ts
27
27
  import { Elysia } from 'elysia';
28
- import { voice, createVoiceMemoryStore } from '@absolutejs/voice';
28
+ import {
29
+ voice,
30
+ createVoiceMemoryStore,
31
+ createPhraseHintCorrectionHandler
32
+ } from '@absolutejs/voice';
29
33
  import { deepgram } from '@absolutejs/voice-deepgram';
30
34
 
31
35
  const app = new Elysia()
32
36
  .use(
33
37
  voice({
34
38
  path: '/voice',
39
+ preset: 'guided-intake',
40
+ phraseHints: [
41
+ { text: 'AbsoluteJS', aliases: ['absolute js'] },
42
+ { text: 'Joe Johnston', aliases: ['joe johnson'] }
43
+ ],
44
+ correctTurn: createPhraseHintCorrectionHandler(),
35
45
  onComplete: async ({ session }) => {
36
46
  console.log(session.turns);
37
47
  },
38
48
  async onTurn({ turn }) {
49
+ console.log('turn quality:', {
50
+ source: turn.quality?.source,
51
+ fallbackUsed: turn.quality?.fallbackUsed,
52
+ confidence: turn.quality?.averageConfidence
53
+ });
39
54
  return {
40
55
  assistantText: `You said: ${turn.text}`
41
56
  };
@@ -51,6 +66,237 @@ const app = new Elysia()
51
66
 
52
67
  `createVoiceMemoryStore()` is dev-only. Real deployments should provide a shared store backed by Redis, Postgres, or equivalent.
53
68
 
69
+ ## Recommended Production Path
70
+
71
+ The current best-performing path in the bundled benchmarks is:
72
+
73
+ - `deepgram-flux` as primary STT
74
+ - route-level `phraseHints`
75
+ - route-level `correctTurn` using `createPhraseHintCorrectionHandler()`
76
+
77
+ That combination outperformed the raw vendor-only paths in the package benchmarks because it lets AbsoluteJS repair domain-specific terms after strong base transcription instead of depending on a second STT vendor to rescue hard turns.
78
+
79
+ Minimal production-oriented example (`createVoiceMemoryStore()` keeps the snippet short; swap in a shared store for real deployments):
80
+
81
+ ```ts
82
+ import {
83
+ createPhraseHintCorrectionHandler,
84
+ voice
85
+ } from '@absolutejs/voice';
86
+ import { deepgram } from '@absolutejs/voice-deepgram';
87
+
88
+ app.use(
89
+ voice({
90
+ path: '/voice/intake',
91
+ preset: 'reliability',
92
+ phraseHints: [
93
+ { text: 'AbsoluteJS', aliases: ['absolute js'] },
94
+ { text: 'Joe Johnston', aliases: ['joe johnson'] },
95
+ {
96
+ text: 'beneath well thatched trees that shed the rain like a roof',
97
+ aliases: ['beneath wealth', 'shelter beneath wealth']
98
+ }
99
+ ],
100
+ correctTurn: createPhraseHintCorrectionHandler(),
101
+ session: createVoiceMemoryStore(),
102
+ stt: deepgram({
103
+ apiKey: process.env.DEEPGRAM_API_KEY!,
104
+ model: 'flux-general-en'
105
+ }),
106
+ onTurn: async ({ turn }) => ({
107
+ assistantText: `Captured: ${turn.text}`
108
+ }),
109
+ onComplete: async () => {}
110
+ })
111
+ );
112
+ ```
113
+
114
+ `phraseHints` are user-controlled route config, not hidden framework magic. They are there so the app can teach the voice route its domain vocabulary.
115
+
116
+ ## Presets
117
+
118
+ Voice now ships named runtime presets so apps can start from a useful baseline instead of hand-tuning silence and capture settings every time.
119
+
120
+ - `default`
121
+ - `chat`
122
+ - `guided-intake`
123
+ - `dictation`
124
+ - `noisy-room`
125
+ - `reliability`
126
+
127
+ On the server:
128
+
129
+ ```ts
130
+ voice({
131
+ path: '/voice/intake',
132
+ preset: 'guided-intake',
133
+ session: createVoiceMemoryStore(),
134
+ stt: deepgram({
135
+ apiKey: process.env.DEEPGRAM_API_KEY!,
136
+ model: 'nova-3'
137
+ }),
138
+ onTurn: async ({ turn }) => ({
139
+ assistantText: `Captured: ${turn.text}`
140
+ }),
141
+ onComplete: async () => {}
142
+ });
143
+ ```
144
+
145
+ On the client:
146
+
147
+ ```ts
148
+ import { createVoiceController } from '@absolutejs/voice/client';
149
+
150
+ const voice = createVoiceController('/voice/intake', {
151
+ preset: 'guided-intake'
152
+ });
153
+
154
+ await voice.startRecording();
155
+ voice.endTurn();
156
+ voice.stopRecording();
157
+ ```
158
+
159
+ Presets are still overridable. If you need to tune for a specific route, layer `turnDetection` or `audioConditioning` on top of the preset instead of replacing the whole setup.
160
+
161
+ Presets are not the same thing as phrase hints:
162
+
163
+ - presets tune framework-owned behavior like silence windows, reconnect defaults, and audio conditioning
164
+ - `phraseHints` tune app/domain vocabulary like company names, product names, legal phrases, or subscriber-specific jargon
165
+
166
+ In practice:
167
+
168
+ - use a preset to choose the runtime shape (`guided-intake`, `reliability`, `noisy-room`)
169
+ - use `phraseHints` to teach the route what words matter for your business
170
+ - use `correctTurn` when you want deterministic post-STT repair before the turn is committed
171
+
172
+ ## Framework Helpers
173
+
174
+ The package now exposes higher-level controller helpers as well as the lower-level stream primitives.
175
+
176
+ - `@absolutejs/voice/client`
177
+ - `createVoiceController()`
178
+ - `createVoiceStream()`
179
+ - `bindVoiceHTMX()`
180
+ - `@absolutejs/voice/react`
181
+ - `useVoiceController()`
182
+ - `useVoiceStream()`
183
+ - `@absolutejs/voice/vue`
184
+ - `useVoiceController()`
185
+ - `useVoiceStream()`
186
+ - `@absolutejs/voice/svelte`
187
+ - `createVoiceController()`
188
+ - `createVoiceStream()`
189
+ - `@absolutejs/voice/angular`
190
+ - `VoiceControllerService`
191
+ - `VoiceStreamService`
192
+
193
+ The controller helpers abstract the common browser boilerplate:
194
+
195
+ - microphone capture
196
+ - start / stop / toggle recording
197
+ - stream subscription state
198
+ - HTMX session syncing
199
+
200
+ They do not hide the underlying transport. You still choose the route path and preset explicitly.
201
+
202
+ ## Phrase Hints And Correction
203
+
204
+ `phraseHints` are a route-level input that the application owns.
205
+
206
+ They can be:
207
+
208
+ - a static array for known domain vocabulary
209
+ - a resolver function when hints depend on the authenticated user, tenant, scenario, or subscriber record
210
+
211
+ ```ts
212
+ voice({
213
+ path: '/voice/intake',
214
+ preset: 'reliability',
215
+ phraseHints: async ({ context, scenarioId, sessionId }) => {
216
+ return [
217
+ { text: 'AbsoluteJS', aliases: ['absolute js'] },
218
+ { text: 'Eden Treaty', aliases: ['eden treaty'] },
219
+ { text: 'Joe Johnston', aliases: ['joe johnson'] }
220
+ ];
221
+ },
222
+ correctTurn: createPhraseHintCorrectionHandler(),
223
+ session: createVoiceMemoryStore(),
224
+ stt: deepgram({
225
+ apiKey: process.env.DEEPGRAM_API_KEY!,
226
+ model: 'flux-general-en'
227
+ }),
228
+ onTurn: async ({ turn }) => ({
229
+ assistantText: turn.text
230
+ }),
231
+ onComplete: async () => {}
232
+ });
233
+ ```
234
+
235
+ How the package uses them:
236
+
237
+ - adapters receive `phraseHints` at open time and can translate them into vendor-native hinting surfaces
238
+ - the correction layer can use the same hints after STT to repair domain terms before commit
239
+
240
+ Current built-in correction helper:
241
+
242
+ ```ts
243
+ import { createPhraseHintCorrectionHandler } from '@absolutejs/voice';
244
+
245
+ const correctTurn = createPhraseHintCorrectionHandler();
246
+ ```
247
+
248
+ This helper is intentionally deterministic. It is for phrase normalization and domain repair, not for hiding an LLM behind your turn commit. If you need something more advanced, provide your own `correctTurn` handler.
249
+
250
+ ### React
251
+
252
+ ```tsx
253
+ import { useVoiceController } from '@absolutejs/voice/react';
254
+
255
+ export function VoiceWidget() {
256
+ const voice = useVoiceController('/voice/intake', {
257
+ preset: 'guided-intake'
258
+ });
259
+
260
+ return (
261
+ <button onClick={() => void voice.toggleRecording()}>
262
+ {voice.isRecording ? 'Stop microphone' : 'Start microphone'}
263
+ </button>
264
+ );
265
+ }
266
+ ```
267
+
268
+ ### Vue
269
+
270
+ ```ts
271
+ import { useVoiceController } from '@absolutejs/voice/vue';
272
+
273
+ const voice = useVoiceController('/voice/intake', {
274
+ preset: 'guided-intake'
275
+ });
276
+ ```
277
+
278
+ ### Svelte
279
+
280
+ ```ts
281
+ import { createVoiceController } from '@absolutejs/voice/svelte';
282
+
283
+ const voice = createVoiceController('/voice/intake', {
284
+ preset: 'guided-intake'
285
+ });
286
+ ```
287
+
288
+ ### Angular
289
+
290
+ ```ts
291
+ import { VoiceControllerService } from '@absolutejs/voice/angular';
292
+
293
+ constructor(private readonly voice: VoiceControllerService) {}
294
+
295
+ controller = this.voice.connect('/voice/intake', {
296
+ preset: 'guided-intake'
297
+ });
298
+ ```
299
+
54
300
  ## HTMX
55
301
 
56
302
  Voice now mirrors the AI plugin's HTMX pattern with plugin-owned renderers and a plugin-owned fragment route.
@@ -91,14 +337,117 @@ The plugin exposes `GET /voice/intake/htmx/session?sessionId=...` by default. Th
91
337
  On the client, bind the browser voice stream to a hidden HTMX refresh element:
92
338
 
93
339
  ```ts
94
- import { bindVoiceHTMX, createVoiceStream } from '@absolutejs/voice/client';
340
+ import { createVoiceController } from '@absolutejs/voice/client';
95
341
 
96
- const voice = createVoiceStream('/voice/intake');
97
- bindVoiceHTMX(voice, { element: '#voice-htmx-sync' });
342
+ const voice = createVoiceController('/voice/intake', {
343
+ preset: 'guided-intake'
344
+ });
345
+ voice.bindHTMX({ element: '#voice-htmx-sync' });
98
346
  ```
99
347
 
100
348
  That keeps HTMX pages declarative without inventing custom fragment endpoints for core voice session UI.
101
349
 
350
+ ## Competitive Benchmarking
351
+
352
+ The package includes a competitive benchmark harness for STT quality and responsiveness.
353
+
354
+ Run:
355
+
356
+ ```bash
357
+ bun run bench:vs
358
+ ```
359
+
360
+ Use profiles to focus where you want to win:
361
+
362
+ - `bun run bench:vs all` (default)
363
+ - `bun run bench:vs all accents`
364
+ - `bun run bench:vs all clean`
365
+ - `bun run bench:vs all noisy`
366
+ - `bun run bench:vs deepgram accents`
367
+ - `bun run bench:vs deepgram-flux accents` (benchmarks the Flux candidate; Vapi output is included by default if configured)
368
+ - `bun run bench:vs deepgram-nova accents`
369
+
370
+ Current benchmark guidance:
371
+
372
+ - use `deepgram-flux` as the primary conversational STT path
373
+ - prefer route-level `phraseHints` plus `correctTurn` over cross-vendor fallback for domain-specific accuracy
374
+ - use fallback vendors only when your own traffic proves they beat the package-level correction path
375
+ - do not treat `openai` as the default STT path unless your own benchmarks prove it for your traffic
376
+
377
+ If you use a Vapi baseline file, you can run a direct model comparison:
378
+
379
+ ```bash
380
+ bun run bench:vs:deepgram-flux
381
+ ```
382
+
383
+ To benchmark Nova vs Flux back-to-back, set the model explicitly:
384
+
385
+ ```bash
386
+ DEEPGRAM_MODEL=flux-general-en bun run bench:deepgram:accents
387
+ DEEPGRAM_MODEL=nova-3 bun run bench:deepgram:accents
388
+ ```
389
+
390
+ To compare against Vapi or other providers, provide a baseline JSON file:
391
+
392
+ ```bash
393
+ bun run bench:vs all accents --compare /path/to/vapi-baseline.json
394
+ ```
395
+
396
+ Expected benchmark payload:
397
+
398
+ ```json
399
+ {
400
+ "source": "vapi",
401
+ "results": [
402
+ {
403
+ "adapterId": "vapi-baseline",
404
+ "summary": {
405
+ "passRate": 0.0,
406
+ "averageWordErrorRate": 1.0,
407
+ "averageTermRecall": 0.0,
408
+ "averageElapsedMs": 0,
409
+ "averageTimeToEndOfTurnMs": 0,
410
+ "averageTimeToFirstFinalMs": 0,
411
+ "averageTimeToFirstPartialMs": 0,
412
+ "wordAccuracyRate": 0.0
413
+ }
414
+ }
415
+ ]
416
+ }
417
+ ```
418
+
419
+ For a fast parse-only validation of arguments:
420
+
421
+ ```bash
422
+ bun run ./scripts/benchmark-vs.ts --dry-run
423
+ ```
424
+
425
+ The harness prints:
426
+
427
+ - pass rate and recall deltas per adapter
428
+ - weighted scorecard (`passRate`, term recall, word accuracy)
429
+ - optional competitor deltas (Vapi)
430
+
431
+ For package-level multi-turn behavior, use the session benchmark harness instead of raw STT-only benchmarking:
432
+
433
+ ```bash
434
+ bun run bench:sessions
435
+ bun run bench:deepgram:sessions
436
+ bun run bench:deepgram:hybrid:sessions
437
+ bun run bench:deepgram:corrected:sessions
438
+ bun run bench:assemblyai:sessions
439
+ bun run bench:openai:sessions
440
+ ```
441
+
442
+ That harness runs the adapter through `VoiceSession` itself, so the output reflects reconnect handling, turn commit stability, and duplicate-turn protection rather than only raw transcript quality.
443
+
444
+ `bench:deepgram:corrected:sessions` exercises the current recommended package-level production path:
445
+
446
+ - Deepgram Flux as primary STT
447
+ - phrase hints routed through the adapter layer
448
+ - committed-turn correction via `createPhraseHintCorrectionHandler()`
449
+ - core turn dedupe, reconnect, and transcript selection still owned by `@absolutejs/voice`
450
+
102
451
  ## Adapter Contract
103
452
 
104
453
  Adapters normalize vendor behavior into a core event model so the plugin never branches on vendor names.
@@ -185,6 +534,40 @@ Default reconnect strategy is `resume-last-turn`.
185
534
 
186
535
  If an adapter does not emit native end-of-turn events, core falls back to silence detection with a default `700ms` threshold.
187
536
 
537
+ ## STT Fallback
538
+
539
+ You can pair a primary vendor with an optional fallback vendor per route when you need extra reliability for accents, edge environments, or short commands.
540
+
541
+ ```ts
542
+ voice({
543
+ path: '/voice/intake',
544
+ preset: 'default',
545
+ session: createVoiceMemoryStore(),
546
+ stt: deepgram({ apiKey: process.env.DEEPGRAM_API_KEY!, model: 'nova-3' }),
547
+ sttFallback: {
548
+ adapter: assemblyai({ apiKey: process.env.ASSEMBLYAI_API_KEY! }),
549
+ trigger: 'empty-or-low-confidence',
550
+ confidenceThreshold: 0.65,
551
+ minTextLength: 2,
552
+ replayWindowMs: 8000,
553
+ settleMs: 220,
554
+ maxAttemptsPerTurn: 1
555
+ },
556
+ onTurn: async ({ turn }) => {
557
+ return { assistantText: `Captured: ${turn.text}` };
558
+ },
559
+ onComplete: async () => {}
560
+ });
561
+ ```
562
+
563
+ Fallback triggers are evaluated at commit time:
564
+
565
+ - `empty-turn`: the committed turn is empty (fewer than `minTextLength` words), so fallback is attempted
566
+ - `low-confidence`: average transcript confidence is below `confidenceThreshold`
567
+ - `empty-or-low-confidence`: fallback is attempted when either of the two conditions above is met
568
+
569
+ The fallback adapter receives the same window of turn audio as the primary (default `8s`, configurable with `replayWindowMs`) and runs at most `maxAttemptsPerTurn` times per turn.
570
+
188
571
  ## Client Primitives
189
572
 
190
573
  Browser and framework helpers sit on top of the same connection core:
@@ -1 +1,2 @@
1
1
  export { VoiceStreamService } from './voice-stream.service';
2
+ export { VoiceControllerService } from './voice-controller.service';