@absolutejs/voice 0.0.20 → 0.0.22-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/README.md +884 -4
  2. package/dist/angular/index.d.ts +1 -0
  3. package/dist/angular/index.js +759 -3
  4. package/dist/angular/voice-controller.service.d.ts +27 -0
  5. package/dist/angular/voice-stream.service.d.ts +6 -0
  6. package/dist/audioConditioning.d.ts +3 -0
  7. package/dist/client/actions.d.ts +48 -0
  8. package/dist/client/audioPlayer.d.ts +40 -0
  9. package/dist/client/connection.d.ts +5 -0
  10. package/dist/client/controller.d.ts +2 -0
  11. package/dist/client/duplex.d.ts +3 -0
  12. package/dist/client/htmxBootstrap.js +660 -167
  13. package/dist/client/index.d.ts +3 -0
  14. package/dist/client/index.js +991 -6
  15. package/dist/client/microphone.d.ts +4 -2
  16. package/dist/correction.d.ts +33 -0
  17. package/dist/fileStore.d.ts +27 -0
  18. package/dist/index.d.ts +15 -0
  19. package/dist/index.js +3721 -298
  20. package/dist/ops.d.ts +100 -0
  21. package/dist/presets.d.ts +13 -0
  22. package/dist/react/index.d.ts +1 -0
  23. package/dist/react/index.js +728 -3
  24. package/dist/react/useVoiceController.d.ts +26 -0
  25. package/dist/react/useVoiceStream.d.ts +7 -0
  26. package/dist/routing.d.ts +3 -0
  27. package/dist/runtimeOps.d.ts +23 -0
  28. package/dist/store.d.ts +2 -2
  29. package/dist/svelte/index.d.ts +1 -0
  30. package/dist/svelte/index.js +691 -3
  31. package/dist/telephony/response.d.ts +7 -0
  32. package/dist/telephony/twilio.d.ts +116 -0
  33. package/dist/testing/benchmark.d.ts +93 -2
  34. package/dist/testing/corrected.d.ts +41 -0
  35. package/dist/testing/duplex.d.ts +59 -0
  36. package/dist/testing/fixtures.d.ts +18 -2
  37. package/dist/testing/index.d.ts +5 -0
  38. package/dist/testing/index.js +6247 -402
  39. package/dist/testing/review.d.ts +143 -0
  40. package/dist/testing/sessionBenchmark.d.ts +92 -2
  41. package/dist/testing/stt.d.ts +3 -1
  42. package/dist/testing/telephony.d.ts +70 -0
  43. package/dist/testing/tts.d.ts +73 -0
  44. package/dist/turnDetection.d.ts +5 -1
  45. package/dist/turnProfiles.d.ts +6 -0
  46. package/dist/types.d.ts +487 -10
  47. package/dist/vue/index.d.ts +1 -0
  48. package/dist/vue/index.js +750 -3
  49. package/dist/vue/useVoiceController.d.ts +30 -0
  50. package/dist/vue/useVoiceStream.d.ts +11 -0
  51. package/fixtures/README.md +9 -0
  52. package/fixtures/manifest.json +59 -1
  53. package/fixtures/pcm/dialogue-three-clean.pcm +0 -0
  54. package/fixtures/pcm/dialogue-three-mixed.pcm +0 -0
  55. package/fixtures/pcm/dialogue-two-clean.pcm +0 -0
  56. package/fixtures/pcm/dialogue-two-noisy.pcm +0 -0
  57. package/package.json +135 -1
package/dist/types.d.ts CHANGED
@@ -1,21 +1,99 @@
1
1
  import type { SessionStore } from '@absolutejs/absolute';
2
+ import type { StoredVoiceIntegrationEvent, StoredVoiceOpsTask, VoiceIntegrationEventStore, VoiceOpsTask, VoiceOpsTaskStore } from './ops';
3
+ import type { StoredVoiceCallReviewArtifact, VoiceCallReviewArtifact, VoiceCallReviewStore } from './testing/review';
2
4
  export type AudioFormat = {
3
5
  container: 'raw';
4
- encoding: 'pcm_s16le';
6
+ encoding: 'alaw' | 'mulaw' | 'pcm_s16le';
5
7
  sampleRateHz: number;
6
8
  channels: 1 | 2;
7
9
  };
8
10
  export type AudioChunk = ArrayBuffer | ArrayBufferView;
11
+ export type VoiceLanguageStrategy = {
12
+ mode: 'auto-detect';
13
+ allowedLanguages?: string[];
14
+ } | {
15
+ mode: 'fixed';
16
+ primaryLanguage: string;
17
+ secondaryLanguages?: string[];
18
+ } | {
19
+ mode: 'allow-switching';
20
+ primaryLanguage?: string;
21
+ secondaryLanguages: string[];
22
+ };
23
+ export type VoicePhraseHint = {
24
+ text: string;
25
+ aliases?: string[];
26
+ boost?: number;
27
+ metadata?: Record<string, unknown>;
28
+ };
29
+ export type VoiceCorrectionRiskTier = 'safe' | 'balanced' | 'risky';
30
+ export type VoiceDomainTerm = {
31
+ text: string;
32
+ aliases?: string[];
33
+ boost?: number;
34
+ language?: string;
35
+ metadata?: Record<string, unknown>;
36
+ pronunciation?: string;
37
+ };
38
+ export type VoiceLexiconEntry = {
39
+ text: string;
40
+ aliases?: string[];
41
+ language?: string;
42
+ metadata?: Record<string, unknown>;
43
+ pronunciation?: string;
44
+ };
9
45
  export type Transcript = {
10
46
  id: string;
11
47
  text: string;
12
48
  isFinal: boolean;
13
49
  confidence?: number;
14
50
  language?: string;
51
+ speaker?: string | number;
15
52
  startedAtMs?: number;
16
53
  endedAtMs?: number;
17
54
  vendor?: string;
18
55
  };
56
+ export type VoiceTranscriptQuality = {
57
+ averageConfidence?: number;
58
+ confidenceSampleCount: number;
59
+ correction?: VoiceTurnCorrectionDiagnostics;
60
+ cost?: VoiceTurnCostEstimate;
61
+ fallbackUsed: boolean;
62
+ finalTranscriptCount: number;
63
+ fallback?: VoiceFallbackDiagnostics;
64
+ partialTranscriptCount: number;
65
+ selectedTranscriptCount: number;
66
+ source: 'fallback' | 'primary';
67
+ };
68
+ export type VoiceTurnCorrectionDiagnostics = {
69
+ attempted: boolean;
70
+ changed: boolean;
71
+ correctedText: string;
72
+ metadata?: Record<string, unknown>;
73
+ originalText: string;
74
+ provider?: string;
75
+ reason?: string;
76
+ };
77
+ export type VoiceTurnCostEstimate = {
78
+ estimatedRelativeCostUnits: number;
79
+ fallbackAttemptCount: number;
80
+ fallbackReplayAudioMs: number;
81
+ primaryAudioMs: number;
82
+ totalBillableAudioMs: number;
83
+ };
84
+ export type VoiceFallbackSelectionReason = 'fallback-empty' | 'primary-empty' | 'word-count-margin' | 'confidence-margin' | 'word-count-tiebreak' | 'kept-primary';
85
+ export type VoiceFallbackDiagnostics = {
86
+ attempted: boolean;
87
+ fallbackConfidence?: number;
88
+ fallbackText?: string;
89
+ fallbackWordCount?: number;
90
+ primaryConfidence: number;
91
+ primaryText: string;
92
+ primaryWordCount: number;
93
+ selected: boolean;
94
+ selectionReason: VoiceFallbackSelectionReason;
95
+ trigger: 'empty-turn' | 'low-confidence' | 'empty-or-low-confidence' | 'always';
96
+ };
19
97
  export type VoicePartialEvent = {
20
98
  type: 'partial';
21
99
  transcript: Transcript;
@@ -58,6 +136,9 @@ export type STTAdapterSession = {
58
136
  export type STTAdapterOpenOptions = {
59
137
  sessionId: string;
60
138
  format: AudioFormat;
139
+ languageStrategy?: VoiceLanguageStrategy;
140
+ lexicon?: VoiceLexiconEntry[];
141
+ phraseHints?: VoicePhraseHint[];
61
142
  signal?: AbortSignal;
62
143
  };
63
144
  export type STTAdapter<TOptions extends STTAdapterOpenOptions = STTAdapterOpenOptions> = {
@@ -82,6 +163,7 @@ export type TTSAdapterSession = {
82
163
  };
83
164
  export type TTSAdapterOpenOptions = {
84
165
  sessionId: string;
166
+ lexicon?: VoiceLexiconEntry[];
85
167
  signal?: AbortSignal;
86
168
  };
87
169
  export type TTSAdapter<TOptions extends TTSAdapterOpenOptions = TTSAdapterOpenOptions> = {
@@ -99,6 +181,9 @@ export type RealtimeAdapterSession = {
99
181
  export type RealtimeAdapterOpenOptions = {
100
182
  sessionId: string;
101
183
  format: AudioFormat;
184
+ languageStrategy?: VoiceLanguageStrategy;
185
+ lexicon?: VoiceLexiconEntry[];
186
+ phraseHints?: VoicePhraseHint[];
102
187
  signal?: AbortSignal;
103
188
  };
104
189
  export type RealtimeAdapter<TOptions extends RealtimeAdapterOpenOptions = RealtimeAdapterOpenOptions> = {
@@ -109,11 +194,23 @@ export type VoiceSessionStatus = 'active' | 'reconnecting' | 'completed' | 'fail
109
194
  export type VoiceTurnRecord<TResult = unknown> = {
110
195
  id: string;
111
196
  text: string;
197
+ quality?: VoiceTranscriptQuality;
112
198
  transcripts: Transcript[];
113
199
  assistantText?: string;
114
200
  committedAt: number;
115
201
  result?: TResult;
116
202
  };
203
+ export type VoiceCostTelemetryConfig<TContext = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown> = {
204
+ fallbackPassCostUnit?: number;
205
+ onTurnCost?: (input: {
206
+ api: VoiceSessionHandle<TContext, TSession, TResult>;
207
+ context: TContext;
208
+ estimate: VoiceTurnCostEstimate;
209
+ session: TSession;
210
+ turn: VoiceTurnRecord<TResult>;
211
+ }) => Promise<void> | void;
212
+ primaryPassCostUnit?: number;
213
+ };
117
214
  export type VoiceSessionRecord<TMeta = Record<string, never>, TResult = unknown> = {
118
215
  id: string;
119
216
  createdAt: number;
@@ -123,8 +220,13 @@ export type VoiceSessionRecord<TMeta = Record<string, never>, TResult = unknown>
123
220
  currentTurn: {
124
221
  transcripts: Transcript[];
125
222
  partialText: string;
223
+ partialStartedAt?: number;
224
+ partialEndedAt?: number;
126
225
  finalText: string;
127
226
  lastAudioAt?: number;
227
+ lastSpeechAt?: number;
228
+ lastTranscriptAt?: number;
229
+ silenceStartedAt?: number;
128
230
  };
129
231
  turns: VoiceTurnRecord<TResult>[];
130
232
  committedTurnIds: string[];
@@ -132,7 +234,15 @@ export type VoiceSessionRecord<TMeta = Record<string, never>, TResult = unknown>
132
234
  attempts: number;
133
235
  lastDisconnectAt?: number;
134
236
  };
237
+ lastCommittedTurn?: {
238
+ signature: string;
239
+ text: string;
240
+ transcriptIds: string[];
241
+ committedAt: number;
242
+ };
243
+ call?: VoiceCallLifecycleState;
135
244
  metadata?: TMeta;
245
+ scenarioId?: string;
136
246
  };
137
247
  export type VoiceSessionSummary = {
138
248
  id: string;
@@ -141,6 +251,22 @@ export type VoiceSessionSummary = {
141
251
  status: VoiceSessionStatus;
142
252
  turnCount: number;
143
253
  };
254
+ export type VoiceCallDisposition = 'completed' | 'transferred' | 'escalated' | 'voicemail' | 'no-answer' | 'failed' | 'closed';
255
+ export type VoiceCallLifecycleEvent = {
256
+ at: number;
257
+ type: 'start' | 'end' | 'transfer' | 'escalation' | 'voicemail' | 'no-answer';
258
+ disposition?: VoiceCallDisposition;
259
+ metadata?: Record<string, unknown>;
260
+ reason?: string;
261
+ target?: string;
262
+ };
263
+ export type VoiceCallLifecycleState = {
264
+ disposition?: VoiceCallDisposition;
265
+ endedAt?: number;
266
+ events: VoiceCallLifecycleEvent[];
267
+ lastEventAt: number;
268
+ startedAt: number;
269
+ };
144
270
  export type VoiceSessionStore<TSession extends VoiceSessionRecord = VoiceSessionRecord> = SessionStore<TSession, VoiceSessionSummary>;
145
271
  export type VoiceLogger = {
146
272
  debug?: (message: string, meta?: Record<string, unknown>) => void;
@@ -153,6 +279,59 @@ export type VoiceReconnectConfig = {
153
279
  timeout?: number;
154
280
  maxAttempts?: number;
155
281
  };
282
+ export type VoiceRuntimePreset = 'default' | 'chat' | 'guided-intake' | 'dictation' | 'noisy-room' | 'pstn-balanced' | 'pstn-fast' | 'reliability';
283
+ export type VoiceSTTLifecycle = 'continuous' | 'turn-scoped';
284
+ export type VoiceTurnProfile = 'fast' | 'balanced' | 'long-form';
285
+ export type VoiceTurnQualityProfile = 'general' | 'accent-heavy' | 'noisy-room' | 'short-command';
286
+ export type VoiceTurnFallbackTrigger = 'empty-turn' | 'low-confidence' | 'empty-or-low-confidence' | 'always';
287
+ export type VoiceSTTFallbackConfig = {
288
+ adapter: STTAdapter;
289
+ trigger?: VoiceTurnFallbackTrigger;
290
+ confidenceThreshold?: number;
291
+ minTextLength?: number;
292
+ replayWindowMs?: number;
293
+ settleMs?: number;
294
+ completionTimeoutMs?: number;
295
+ maxAttemptsPerTurn?: number;
296
+ };
297
+ export type VoiceResolvedSTTFallbackConfig = {
298
+ adapter: STTAdapter;
299
+ trigger: VoiceTurnFallbackTrigger;
300
+ confidenceThreshold: number;
301
+ minTextLength: number;
302
+ replayWindowMs: number;
303
+ settleMs: number;
304
+ completionTimeoutMs: number;
305
+ maxAttemptsPerTurn: number;
306
+ };
307
+ export type VoiceTurnDetectionConfig = {
308
+ profile?: VoiceTurnProfile;
309
+ qualityProfile?: VoiceTurnQualityProfile;
310
+ silenceMs?: number;
311
+ speechThreshold?: number;
312
+ transcriptStabilityMs?: number;
313
+ };
314
+ export type VoiceResolvedTurnDetectionConfig = {
315
+ qualityProfile: VoiceTurnQualityProfile;
316
+ profile: VoiceTurnProfile;
317
+ silenceMs: number;
318
+ speechThreshold: number;
319
+ transcriptStabilityMs: number;
320
+ };
321
+ export type VoiceAudioConditioningConfig = {
322
+ enabled?: boolean;
323
+ targetLevel?: number;
324
+ maxGain?: number;
325
+ noiseGateThreshold?: number;
326
+ noiseGateAttenuation?: number;
327
+ };
328
+ export type VoiceResolvedAudioConditioningConfig = {
329
+ enabled: true;
330
+ targetLevel: number;
331
+ maxGain: number;
332
+ noiseGateThreshold: number;
333
+ noiseGateAttenuation: number;
334
+ };
156
335
  export type VoiceSocket = {
157
336
  send: (data: string | Uint8Array | ArrayBuffer) => void | Promise<void>;
158
337
  close: (code?: number, reason?: string) => void | Promise<void>;
@@ -164,7 +343,26 @@ export type VoiceSessionHandle<TContext = unknown, TSession extends VoiceSession
164
343
  commitTurn: (reason?: VoiceEndOfTurnEvent['reason']) => Promise<void>;
165
344
  disconnect: (event?: VoiceCloseEvent) => Promise<void>;
166
345
  complete: (result?: TResult) => Promise<void>;
346
+ escalate: (input: {
347
+ metadata?: Record<string, unknown>;
348
+ reason: string;
349
+ result?: TResult;
350
+ }) => Promise<void>;
167
351
  fail: (error: unknown) => Promise<void>;
352
+ markNoAnswer: (input?: {
353
+ metadata?: Record<string, unknown>;
354
+ result?: TResult;
355
+ }) => Promise<void>;
356
+ markVoicemail: (input?: {
357
+ metadata?: Record<string, unknown>;
358
+ result?: TResult;
359
+ }) => Promise<void>;
360
+ transfer: (input: {
361
+ metadata?: Record<string, unknown>;
362
+ reason?: string;
363
+ result?: TResult;
364
+ target: string;
365
+ }) => Promise<void>;
168
366
  close: (reason?: string) => Promise<void>;
169
367
  snapshot: () => Promise<TSession>;
170
368
  };
@@ -172,7 +370,48 @@ export type VoiceRouteResult<TResult = unknown> = {
172
370
  complete?: boolean;
173
371
  result?: TResult;
174
372
  assistantText?: string;
373
+ transfer?: {
374
+ metadata?: Record<string, unknown>;
375
+ reason?: string;
376
+ target: string;
377
+ };
378
+ escalate?: {
379
+ metadata?: Record<string, unknown>;
380
+ reason: string;
381
+ };
382
+ voicemail?: {
383
+ metadata?: Record<string, unknown>;
384
+ };
385
+ noAnswer?: {
386
+ metadata?: Record<string, unknown>;
387
+ };
175
388
  };
389
+ export type VoiceTurnCorrectionResult = string | {
390
+ text: string;
391
+ reason?: string;
392
+ provider?: string;
393
+ metadata?: Record<string, unknown>;
394
+ };
395
+ export type VoiceTurnCorrectionHandler<TContext = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown> = (input: {
396
+ api: VoiceSessionHandle<TContext, TSession, TResult>;
397
+ context: TContext;
398
+ fallback?: VoiceFallbackDiagnostics;
399
+ lexicon: VoiceLexiconEntry[];
400
+ phraseHints: VoicePhraseHint[];
401
+ session: TSession;
402
+ text: string;
403
+ transcripts: Transcript[];
404
+ }) => Promise<VoiceTurnCorrectionResult | void> | VoiceTurnCorrectionResult | void;
405
+ export type VoicePhraseHintResolver<TContext = unknown> = (input: {
406
+ context: TContext;
407
+ scenarioId?: string;
408
+ sessionId: string;
409
+ }) => Promise<VoicePhraseHint[] | void> | VoicePhraseHint[] | void;
410
+ export type VoiceLexiconResolver<TContext = unknown> = (input: {
411
+ context: TContext;
412
+ scenarioId?: string;
413
+ sessionId: string;
414
+ }) => Promise<VoiceLexiconEntry[] | void> | VoiceLexiconEntry[] | void;
176
415
  export type VoiceOnTurnObjectHandler<TContext = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown> = (input: {
177
416
  context: TContext;
178
417
  session: TSession;
@@ -181,11 +420,26 @@ export type VoiceOnTurnObjectHandler<TContext = unknown, TSession extends VoiceS
181
420
  }) => Promise<VoiceRouteResult<TResult> | void> | VoiceRouteResult<TResult> | void;
182
421
  export type VoiceOnTurnHandler<TContext = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown> = VoiceOnTurnObjectHandler<TContext, TSession, TResult> | ((session: TSession, turn: VoiceTurnRecord, api: VoiceSessionHandle<TContext, TSession, TResult>, context: TContext) => Promise<VoiceRouteResult<TResult> | void> | VoiceRouteResult<TResult> | void);
183
422
  export type VoiceRouteConfig<TContext = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown> = {
423
+ onCallStart?: (input: {
424
+ context: TContext;
425
+ session: TSession;
426
+ api: VoiceSessionHandle<TContext, TSession, TResult>;
427
+ }) => Promise<void> | void;
428
+ onCallEnd?: (input: {
429
+ api: VoiceSessionHandle<TContext, TSession, TResult>;
430
+ context: TContext;
431
+ disposition: VoiceCallDisposition;
432
+ metadata?: Record<string, unknown>;
433
+ reason?: string;
434
+ session: TSession;
435
+ target?: string;
436
+ }) => Promise<void> | void;
184
437
  onSession?: (input: {
185
438
  context: TContext;
186
439
  session: TSession;
187
440
  api: VoiceSessionHandle<TContext, TSession, TResult>;
188
441
  }) => Promise<void> | void;
442
+ correctTurn?: VoiceTurnCorrectionHandler<TContext, TSession, TResult>;
189
443
  onTurn: VoiceOnTurnHandler<TContext, TSession, TResult>;
190
444
  onComplete: (input: {
191
445
  context: TContext;
@@ -199,34 +453,111 @@ export type VoiceRouteConfig<TContext = unknown, TSession extends VoiceSessionRe
199
453
  error: unknown;
200
454
  api?: VoiceSessionHandle<TContext, TSession, TResult>;
201
455
  }) => Promise<void> | void;
456
+ onEscalation?: (input: {
457
+ api: VoiceSessionHandle<TContext, TSession, TResult>;
458
+ context: TContext;
459
+ metadata?: Record<string, unknown>;
460
+ reason: string;
461
+ session: TSession;
462
+ }) => Promise<void> | void;
463
+ onNoAnswer?: (input: {
464
+ api: VoiceSessionHandle<TContext, TSession, TResult>;
465
+ context: TContext;
466
+ metadata?: Record<string, unknown>;
467
+ session: TSession;
468
+ }) => Promise<void> | void;
469
+ onTransfer?: (input: {
470
+ api: VoiceSessionHandle<TContext, TSession, TResult>;
471
+ context: TContext;
472
+ metadata?: Record<string, unknown>;
473
+ reason?: string;
474
+ session: TSession;
475
+ target: string;
476
+ }) => Promise<void> | void;
477
+ onVoicemail?: (input: {
478
+ api: VoiceSessionHandle<TContext, TSession, TResult>;
479
+ context: TContext;
480
+ metadata?: Record<string, unknown>;
481
+ session: TSession;
482
+ }) => Promise<void> | void;
483
+ };
484
+ export type VoiceRuntimeOpsConfig<TContext = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown> = {
485
+ buildReview?: (input: {
486
+ api: VoiceSessionHandle<TContext, TSession, TResult>;
487
+ context: TContext;
488
+ disposition: VoiceCallDisposition;
489
+ metadata?: Record<string, unknown>;
490
+ reason?: string;
491
+ result?: TResult;
492
+ session: TSession;
493
+ target?: string;
494
+ }) => Promise<VoiceCallReviewArtifact | StoredVoiceCallReviewArtifact | void> | VoiceCallReviewArtifact | StoredVoiceCallReviewArtifact | void;
495
+ createTaskFromReview?: (input: {
496
+ api: VoiceSessionHandle<TContext, TSession, TResult>;
497
+ context: TContext;
498
+ disposition: VoiceCallDisposition;
499
+ review: StoredVoiceCallReviewArtifact;
500
+ session: TSession;
501
+ }) => Promise<Omit<VoiceOpsTask, 'id'> | VoiceOpsTask | StoredVoiceOpsTask | null | void> | Omit<VoiceOpsTask, 'id'> | VoiceOpsTask | StoredVoiceOpsTask | null | void;
502
+ events?: VoiceIntegrationEventStore;
503
+ onEvent?: (input: {
504
+ api: VoiceSessionHandle<TContext, TSession, TResult>;
505
+ context: TContext;
506
+ event: StoredVoiceIntegrationEvent;
507
+ session: TSession;
508
+ }) => Promise<void> | void;
509
+ reviews?: VoiceCallReviewStore;
510
+ tasks?: VoiceOpsTaskStore;
202
511
  };
203
512
  export type VoiceNormalizedRouteConfig<TContext = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown> = Omit<VoiceRouteConfig<TContext, TSession, TResult>, 'onTurn'> & {
204
513
  onTurn: VoiceOnTurnObjectHandler<TContext, TSession, TResult>;
205
514
  };
515
+ export type VoiceScenario = {
516
+ id: string;
517
+ name?: string;
518
+ description?: string;
519
+ metadata?: Record<string, unknown>;
520
+ };
521
+ export type VoiceExpectedSpeakerTurn = {
522
+ speaker: string;
523
+ text: string;
524
+ };
206
525
  export type VoicePluginConfig<TContext = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown> = {
526
+ costTelemetry?: VoiceCostTelemetryConfig<TContext, TSession, TResult>;
207
527
  path: string;
528
+ languageStrategy?: VoiceLanguageStrategy;
529
+ lexicon?: VoiceLexiconEntry[] | VoiceLexiconResolver<TContext>;
530
+ phraseHints?: VoicePhraseHint[] | VoicePhraseHintResolver<TContext>;
531
+ preset?: VoiceRuntimePreset;
208
532
  stt: STTAdapter;
533
+ sttFallback?: VoiceSTTFallbackConfig;
534
+ sttLifecycle?: VoiceSTTLifecycle;
209
535
  tts?: TTSAdapter;
210
536
  session: VoiceSessionStore<NoInfer<TSession>>;
211
537
  reconnect?: VoiceReconnectConfig;
212
- turnDetection?: {
213
- silenceMs?: number;
214
- speechThreshold?: number;
215
- };
538
+ turnDetection?: VoiceTurnDetectionConfig;
539
+ audioConditioning?: VoiceAudioConditioningConfig;
216
540
  logger?: VoiceLogger;
217
541
  htmx?: boolean | VoiceHTMXConfig<TSession, NoInfer<TResult>>;
542
+ ops?: VoiceRuntimeOpsConfig<TContext, TSession, TResult>;
218
543
  } & VoiceRouteConfig<TContext, TSession, TResult>;
219
544
  export type CreateVoiceSessionOptions<TContext = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown> = {
545
+ costTelemetry?: VoiceCostTelemetryConfig<TContext, TSession, TResult>;
220
546
  id: string;
221
547
  context: TContext;
222
548
  socket: VoiceSocket;
223
549
  stt: STTAdapter;
550
+ tts?: TTSAdapter;
551
+ languageStrategy?: VoiceLanguageStrategy;
552
+ lexicon?: VoiceLexiconEntry[];
553
+ sttFallback?: VoiceResolvedSTTFallbackConfig;
224
554
  store: VoiceSessionStore<TSession>;
225
555
  reconnect: Required<VoiceReconnectConfig>;
226
- turnDetection: {
227
- silenceMs: number;
228
- speechThreshold: number;
229
- };
556
+ phraseHints?: VoicePhraseHint[];
557
+ scenarioId?: string;
558
+ sttLifecycle: VoiceSTTLifecycle;
559
+ turnDetection: VoiceResolvedTurnDetectionConfig;
560
+ audioConditioning?: VoiceResolvedAudioConditioningConfig;
230
561
  route: VoiceNormalizedRouteConfig<TContext, TSession, TResult>;
231
562
  logger?: VoiceLogger;
232
563
  };
@@ -234,6 +565,7 @@ export type CreateVoiceSession = <TContext = unknown, TSession extends VoiceSess
234
565
  export type VoiceClientStartMessage = {
235
566
  type: 'start';
236
567
  sessionId?: string;
568
+ scenarioId?: string;
237
569
  };
238
570
  export type VoiceClientEndTurnMessage = {
239
571
  type: 'end_turn';
@@ -250,6 +582,7 @@ export type VoiceServerSessionMessage = {
250
582
  type: 'session';
251
583
  sessionId: string;
252
584
  status: VoiceSessionStatus;
585
+ scenarioId?: string;
253
586
  };
254
587
  export type VoiceServerPartialMessage = {
255
588
  type: 'partial';
@@ -268,6 +601,13 @@ export type VoiceServerAssistantMessage = {
268
601
  text: string;
269
602
  turnId?: string;
270
603
  };
604
+ export type VoiceServerAudioMessage = {
605
+ type: 'audio';
606
+ chunkBase64: string;
607
+ format: AudioFormat;
608
+ receivedAt: number;
609
+ turnId?: string;
610
+ };
271
611
  export type VoiceServerCompleteMessage = {
272
612
  type: 'complete';
273
613
  sessionId: string;
@@ -280,17 +620,54 @@ export type VoiceServerErrorMessage = {
280
620
  export type VoiceServerPongMessage = {
281
621
  type: 'pong';
282
622
  };
283
- export type VoiceServerMessage<TResult = unknown> = VoiceServerSessionMessage | VoiceServerPartialMessage | VoiceServerFinalMessage | VoiceServerTurnMessage<TResult> | VoiceServerAssistantMessage | VoiceServerCompleteMessage | VoiceServerErrorMessage | VoiceServerPongMessage;
623
+ export type VoiceServerMessage<TResult = unknown> = VoiceServerSessionMessage | VoiceServerPartialMessage | VoiceServerFinalMessage | VoiceServerTurnMessage<TResult> | VoiceServerAssistantMessage | VoiceServerAudioMessage | VoiceServerCompleteMessage | VoiceServerErrorMessage | VoiceServerPongMessage;
284
624
  export type VoiceConnectionOptions = {
285
625
  protocols?: string[];
626
+ scenarioId?: string;
286
627
  reconnect?: boolean;
287
628
  maxReconnectAttempts?: number;
288
629
  pingInterval?: number;
289
630
  sessionId?: string;
290
631
  };
632
+ export type VoiceCaptureOptions = {
633
+ channelCount?: 1 | 2;
634
+ onLevel?: (level: number) => void;
635
+ sampleRateHz?: number;
636
+ };
637
+ export type VoiceControllerOptions = {
638
+ preset?: VoiceRuntimePreset;
639
+ connection?: VoiceConnectionOptions;
640
+ capture?: VoiceCaptureOptions;
641
+ autoStopOnComplete?: boolean;
642
+ };
643
+ export type VoiceBargeInOptions = {
644
+ enabled?: boolean;
645
+ interruptOnPartial?: boolean;
646
+ interruptThreshold?: number;
647
+ };
648
+ export type VoiceAudioPlayerOptions = {
649
+ autoStart?: boolean;
650
+ createAudioContext?: () => AudioContext;
651
+ lookaheadMs?: number;
652
+ };
653
+ export type VoiceDuplexControllerOptions = VoiceControllerOptions & {
654
+ audioPlayer?: VoiceAudioPlayerOptions;
655
+ bargeIn?: VoiceBargeInOptions;
656
+ };
657
+ export type VoiceSTTRoutingGoal = 'best' | 'low-cost';
658
+ export type VoiceSTTRoutingCorrectionMode = 'generic' | 'none' | 'risky-turn';
659
+ export type VoiceSTTRoutingStrategy = {
660
+ benchmarkSessionTarget: 'deepgram-corrected' | 'deepgram-flux';
661
+ correctionMode: VoiceSTTRoutingCorrectionMode;
662
+ goal: VoiceSTTRoutingGoal;
663
+ notes: string[];
664
+ preset: VoiceRuntimePreset;
665
+ sttLifecycle: VoiceSTTLifecycle;
666
+ };
291
667
  export type VoiceHTMXRenderInput<TResult = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord> = {
292
668
  assistantTexts: string[];
293
669
  partial: string;
670
+ scenarioId?: string;
294
671
  result?: TResult;
295
672
  session?: TSession;
296
673
  sessionId?: string;
@@ -322,15 +699,26 @@ export type VoiceHTMXOptions<TSession extends VoiceSessionRecord = VoiceSessionR
322
699
  export type VoiceHTMXConfig<TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown> = VoiceHTMXRenderer<TSession, TResult> | VoiceHTMXOptions<TSession, TResult>;
323
700
  export type VoiceStreamState<TResult = unknown> = {
324
701
  sessionId: string | null;
702
+ scenarioId: string | null;
325
703
  status: VoiceSessionStatus | 'idle';
326
704
  partial: string;
327
705
  turns: VoiceTurnRecord<TResult>[];
328
706
  assistantTexts: string[];
707
+ assistantAudio: Array<{
708
+ chunk: Uint8Array;
709
+ format: AudioFormat;
710
+ receivedAt: number;
711
+ turnId?: string;
712
+ }>;
329
713
  error: string | null;
330
714
  isConnected: boolean;
331
715
  };
332
716
  export type VoiceStream<TResult = unknown> = {
333
717
  close: () => void;
718
+ start: (input?: {
719
+ scenarioId?: string;
720
+ sessionId?: string;
721
+ }) => Promise<void>;
334
722
  endTurn: () => void;
335
723
  error: string | null;
336
724
  getServerSnapshot: () => VoiceStreamState<TResult>;
@@ -339,10 +727,92 @@ export type VoiceStream<TResult = unknown> = {
339
727
  partial: string;
340
728
  sendAudio: (audio: Uint8Array | ArrayBuffer) => void;
341
729
  sessionId: string | null;
730
+ scenarioId: string | null;
731
+ status: VoiceSessionStatus | 'idle';
732
+ subscribe: (subscriber: () => void) => () => void;
733
+ turns: VoiceTurnRecord<TResult>[];
734
+ assistantTexts: string[];
735
+ assistantAudio: Array<{
736
+ chunk: Uint8Array;
737
+ format: AudioFormat;
738
+ receivedAt: number;
739
+ turnId?: string;
740
+ }>;
741
+ };
742
+ export type VoiceControllerState<TResult = unknown> = VoiceStreamState<TResult> & {
743
+ isRecording: boolean;
744
+ recordingError: string | null;
745
+ };
746
+ export type VoiceAudioPlayerState = {
747
+ activeSourceCount: number;
748
+ error: string | null;
749
+ isActive: boolean;
750
+ isPlaying: boolean;
751
+ lastInterruptLatencyMs?: number;
752
+ lastPlaybackStopLatencyMs?: number;
753
+ processedChunkCount: number;
754
+ queuedChunkCount: number;
755
+ };
756
+ export type VoiceAudioPlayerSource = {
757
+ assistantAudio: VoiceStreamState['assistantAudio'];
758
+ subscribe: (subscriber: () => void) => () => void;
759
+ };
760
+ export type VoiceAudioPlayer = {
761
+ close: () => Promise<void>;
762
+ error: string | null;
763
+ getSnapshot: () => VoiceAudioPlayerState;
764
+ activeSourceCount: number;
765
+ isActive: boolean;
766
+ isPlaying: boolean;
767
+ interrupt: () => Promise<void>;
768
+ lastInterruptLatencyMs?: number;
769
+ lastPlaybackStopLatencyMs?: number;
770
+ pause: () => Promise<void>;
771
+ processedChunkCount: number;
772
+ queuedChunkCount: number;
773
+ start: () => Promise<void>;
774
+ subscribe: (subscriber: () => void) => () => void;
775
+ };
776
+ export type VoiceBargeInBinding = {
777
+ close: () => void;
778
+ handleLevel: (level: number) => void;
779
+ sendAudio: (audio: Uint8Array | ArrayBuffer) => void;
780
+ };
781
+ export type VoiceController<TResult = unknown> = {
782
+ bindHTMX: (options: VoiceHTMXBindingOptions) => () => void;
783
+ close: () => void;
784
+ endTurn: () => void;
785
+ start: (input?: {
786
+ scenarioId?: string;
787
+ sessionId?: string;
788
+ }) => Promise<void>;
789
+ error: string | null;
790
+ getServerSnapshot: () => VoiceControllerState<TResult>;
791
+ getSnapshot: () => VoiceControllerState<TResult>;
792
+ isConnected: boolean;
793
+ isRecording: boolean;
794
+ partial: string;
795
+ recordingError: string | null;
796
+ sendAudio: (audio: Uint8Array | ArrayBuffer) => void;
797
+ sessionId: string | null;
798
+ scenarioId: string | null;
799
+ startRecording: () => Promise<void>;
342
800
  status: VoiceSessionStatus | 'idle';
801
+ stopRecording: () => void;
343
802
  subscribe: (subscriber: () => void) => () => void;
803
+ toggleRecording: () => Promise<void>;
344
804
  turns: VoiceTurnRecord<TResult>[];
345
805
  assistantTexts: string[];
806
+ assistantAudio: Array<{
807
+ chunk: Uint8Array;
808
+ format: AudioFormat;
809
+ receivedAt: number;
810
+ turnId?: string;
811
+ }>;
812
+ };
813
+ export type VoiceDuplexController<TResult = unknown> = VoiceController<TResult> & {
814
+ audioPlayer: VoiceAudioPlayer;
815
+ interruptAssistant: () => Promise<void>;
346
816
  };
347
817
  export type VoiceHTMXBindingOptions = {
348
818
  element: Element | string;
@@ -353,6 +823,7 @@ export type VoiceHTMXBindingOptions = {
353
823
  export type VoiceStoreAction<TResult = unknown> = {
354
824
  type: 'session';
355
825
  sessionId: string;
826
+ scenarioId?: string;
356
827
  status: VoiceSessionStatus;
357
828
  } | {
358
829
  type: 'partial';
@@ -366,6 +837,12 @@ export type VoiceStoreAction<TResult = unknown> = {
366
837
  } | {
367
838
  type: 'assistant';
368
839
  text: string;
840
+ } | {
841
+ type: 'audio';
842
+ chunk: Uint8Array;
843
+ format: AudioFormat;
844
+ receivedAt: number;
845
+ turnId?: string;
369
846
  } | {
370
847
  type: 'complete';
371
848
  sessionId: string;