@absolutejs/voice 0.0.20 → 0.0.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/README.md +387 -4
  2. package/dist/angular/index.d.ts +1 -0
  3. package/dist/angular/index.js +669 -3
  4. package/dist/angular/voice-controller.service.d.ts +21 -0
  5. package/dist/audioConditioning.d.ts +3 -0
  6. package/dist/client/actions.d.ts +7 -0
  7. package/dist/client/connection.d.ts +5 -0
  8. package/dist/client/controller.d.ts +2 -0
  9. package/dist/client/htmxBootstrap.js +576 -167
  10. package/dist/client/index.d.ts +1 -0
  11. package/dist/client/index.js +486 -3
  12. package/dist/client/microphone.d.ts +4 -2
  13. package/dist/correction.d.ts +16 -0
  14. package/dist/index.d.ts +4 -0
  15. package/dist/index.js +1314 -283
  16. package/dist/presets.d.ts +13 -0
  17. package/dist/react/index.d.ts +1 -0
  18. package/dist/react/index.js +642 -3
  19. package/dist/react/useVoiceController.d.ts +20 -0
  20. package/dist/react/useVoiceStream.d.ts +1 -0
  21. package/dist/store.d.ts +2 -2
  22. package/dist/svelte/index.d.ts +1 -0
  23. package/dist/svelte/index.js +607 -3
  24. package/dist/testing/benchmark.d.ts +36 -0
  25. package/dist/testing/index.js +1453 -241
  26. package/dist/testing/sessionBenchmark.d.ts +67 -2
  27. package/dist/testing/stt.d.ts +1 -0
  28. package/dist/turnDetection.d.ts +5 -1
  29. package/dist/turnProfiles.d.ts +6 -0
  30. package/dist/types.d.ts +198 -8
  31. package/dist/vue/index.d.ts +1 -0
  32. package/dist/vue/index.js +660 -3
  33. package/dist/vue/useVoiceController.d.ts +19 -0
  34. package/fixtures/README.md +9 -0
  35. package/fixtures/manifest.json +59 -1
  36. package/fixtures/pcm/dialogue-three-clean.pcm +0 -0
  37. package/fixtures/pcm/dialogue-three-mixed.pcm +0 -0
  38. package/fixtures/pcm/dialogue-two-clean.pcm +0 -0
  39. package/fixtures/pcm/dialogue-two-noisy.pcm +0 -0
  40. package/package.json +21 -1
package/dist/index.js CHANGED
@@ -69,6 +69,61 @@ var __decorateElement = (array, flags, name, decorators, target, extra) => {
69
69
  return k || __decoratorMetadata(array, target), desc && __defProp(target, name, desc), p ? k ^ 4 ? extra : desc : target;
70
70
  };
71
71
 
72
+ // src/audioConditioning.ts
73
// Fallback values that resolveAudioConditioningConfig substitutes for omitted fields.

// Level the gain stage aims for; compared against an RMS of samples normalized by 32768.
var DEFAULT_TARGET_LEVEL = 0.08;

// Upper bound on the gain multiplier.
var DEFAULT_MAX_GAIN = 3;

// Chunks whose RMS falls below this level are treated as gated.
var DEFAULT_NOISE_GATE_THRESHOLD = 0.006;

// Multiplier applied to gated chunks.
var DEFAULT_NOISE_GATE_ATTENUATION = 0.15;
77
/**
 * Reinterprets raw audio bytes as signed 16-bit samples (platform byte order).
 *
 * A trailing odd byte is ignored. Aligned inputs are viewed in place (no copy).
 * Byte views that start at an odd offset cannot be viewed as Int16Array
 * directly (the constructor throws RangeError), so those are copied first.
 *
 * @param {ArrayBuffer | ArrayBufferView} audio - Raw sample bytes.
 * @returns {Int16Array} The samples contained in `audio`.
 */
var toInt16Array = (audio) => {
  const sampleCount = Math.floor(audio.byteLength / 2);
  if (audio instanceof ArrayBuffer) {
    return new Int16Array(audio, 0, sampleCount);
  }
  if (audio.byteOffset % 2 !== 0) {
    // Misaligned view (e.g. a subarray starting on an odd byte): copy the
    // relevant bytes into a fresh, aligned buffer before reinterpreting.
    const start = audio.byteOffset;
    return new Int16Array(audio.buffer.slice(start, start + sampleCount * 2));
  }
  return new Int16Array(audio.buffer, audio.byteOffset, sampleCount);
};
83
/**
 * Root-mean-square level of 16-bit samples, each normalized by 32768.
 *
 * @param {ArrayLike<number>} samples - Sample values.
 * @returns {number} RMS level, or 0 when there are no samples.
 */
var computeRms = (samples) => {
  const count = samples.length;
  if (count === 0) {
    return 0;
  }
  let energy = 0;
  for (let position = 0; position < count; position += 1) {
    const unit = samples[position] / 32768;
    energy += unit * unit;
  }
  return Math.sqrt(energy / count);
};
94
/**
 * Expands a partial audio-conditioning config into a fully populated one.
 *
 * @param {object} [config] - Partial settings; a missing config or
 *   `enabled: false` disables conditioning.
 * @returns {object | undefined} Settings with every field filled in
 *   (null/undefined fields take the module defaults), or undefined when
 *   conditioning is disabled.
 */
var resolveAudioConditioningConfig = (config) => {
  if (config && config.enabled !== false) {
    const maxGain = config.maxGain ?? DEFAULT_MAX_GAIN;
    const noiseGateAttenuation = config.noiseGateAttenuation ?? DEFAULT_NOISE_GATE_ATTENUATION;
    const noiseGateThreshold = config.noiseGateThreshold ?? DEFAULT_NOISE_GATE_THRESHOLD;
    const targetLevel = config.targetLevel ?? DEFAULT_TARGET_LEVEL;
    return {
      enabled: true,
      maxGain,
      noiseGateAttenuation,
      noiseGateThreshold,
      targetLevel
    };
  }
  return undefined;
};
106
/**
 * Applies noise gating and gain normalization to one chunk of 16-bit audio.
 *
 * @param {ArrayBuffer | ArrayBufferView} audio - Raw sample bytes.
 * @param {object} [config] - Resolved conditioning settings; when falsy the
 *   chunk is returned untouched.
 * @returns {ArrayBuffer | ArrayBufferView | Uint8Array} The original `audio`
 *   when there is nothing to do, otherwise a new Uint8Array over the
 *   conditioned samples.
 */
var conditionAudioChunk = (audio, config) => {
  if (!config) {
    return audio;
  }
  const samples = toInt16Array(audio);
  if (samples.length === 0) {
    return audio;
  }
  const level = computeRms(samples);
  // Chunks quieter than the gate threshold are attenuated instead of boosted.
  const gateFactor = level < config.noiseGateThreshold ? config.noiseGateAttenuation : 1;
  // Floor the reference level so the division below never divides by zero.
  const referenceLevel = Math.max(level * gateFactor, 0.000001);
  const cappedGain = Math.min(config.maxGain, config.targetLevel / referenceLevel);
  const scale = Math.max(0.25, cappedGain) * gateFactor;
  const conditioned = new Int16Array(samples.length);
  samples.forEach((sample, position) => {
    const scaled = Math.round(sample * scale);
    // Clamp to the signed 16-bit range before storing.
    conditioned[position] = Math.max(-32768, Math.min(32767, scaled));
  });
  return new Uint8Array(conditioned.buffer);
};
126
+
72
127
  // src/plugin.ts
73
128
  import { Elysia } from "elysia";
74
129
  import { resolve } from "path";
@@ -118,6 +173,10 @@ var defaultMetrics = (input) => {
118
173
  '<span class="voice-metric-label">Session</span>',
119
174
  `<span class="voice-metric-value">${escapeHtml(input.sessionId)}</span>`,
120
175
  "</div>",
176
+ input.session?.scenarioId ? `<div class="voice-metric">
177
+ <span class="voice-metric-label">Scenario</span>
178
+ <span class="voice-metric-value">${escapeHtml(input.session.scenarioId)}</span>
179
+ </div>` : "",
121
180
  '<div class="voice-metric">',
122
181
  '<span class="voice-metric-label">Status</span>',
123
182
  `<span class="voice-metric-value">${escapeHtml(input.status)}</span>`,
@@ -207,24 +266,245 @@ var resolveLogger = (logger) => ({
207
266
  ...logger
208
267
  });
209
268
 
269
+ // src/turnProfiles.ts
270
// Baseline turn-detection timings per turn profile.
// NOTE(review): the `qualityProfile` field on each entry is not read by
// resolveTurnDetectionConfig below — confirm whether other code relies on it.
var TURN_PROFILE_DEFAULTS = {
  balanced: {
    qualityProfile: "general",
    silenceMs: 1400,
    speechThreshold: 0.012,
    transcriptStabilityMs: 1000
  },
  fast: {
    qualityProfile: "general",
    silenceMs: 700,
    speechThreshold: 0.015,
    transcriptStabilityMs: 450
  },
  "long-form": {
    qualityProfile: "general",
    silenceMs: 2200,
    speechThreshold: 0.01,
    transcriptStabilityMs: 1500
  }
};

// Per-quality-profile overrides; a field present here wins over the turn profile.
var QUALITY_PROFILE_DEFAULTS = {
  general: {},
  "accent-heavy": {
    silenceMs: 1200,
    speechThreshold: 0.01,
    transcriptStabilityMs: 1200
  },
  "noisy-room": {
    silenceMs: 2000,
    speechThreshold: 0.02,
    transcriptStabilityMs: 1600
  },
  "short-command": {
    silenceMs: 500,
    speechThreshold: 0.016,
    transcriptStabilityMs: 420
  }
};
var DEFAULT_TURN_PROFILE = "fast";
var DEFAULT_QUALITY_PROFILE = "general";

/**
 * Resolves turn-detection settings from a partial config.
 *
 * Precedence per field: explicit value on `config`, then the quality profile,
 * then the turn profile.
 *
 * Profile names are validated against the tables' own keys. Previously an
 * inherited key such as "constructor" resolved through the prototype chain and
 * silently produced `undefined` thresholds, while other unknown names crashed
 * with an opaque property-access TypeError. Both now fail with a descriptive
 * TypeError. This also applies when every threshold is overridden explicitly.
 *
 * @param {object} [config] - Optional `profile`, `qualityProfile`, `silenceMs`,
 *   `speechThreshold`, `transcriptStabilityMs`.
 * @returns {{profile: string, qualityProfile: string, silenceMs: number,
 *   speechThreshold: number, transcriptStabilityMs: number}}
 * @throws {TypeError} When `profile` or `qualityProfile` is not a known name.
 */
var resolveTurnDetectionConfig = (config) => {
  const profile = config?.profile ?? DEFAULT_TURN_PROFILE;
  const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
  if (!Object.hasOwn(TURN_PROFILE_DEFAULTS, profile)) {
    throw new TypeError(`Unknown turn profile "${String(profile)}". Expected one of: ${Object.keys(TURN_PROFILE_DEFAULTS).join(", ")}`);
  }
  if (!Object.hasOwn(QUALITY_PROFILE_DEFAULTS, qualityProfile)) {
    throw new TypeError(`Unknown quality profile "${String(qualityProfile)}". Expected one of: ${Object.keys(QUALITY_PROFILE_DEFAULTS).join(", ")}`);
  }
  const preset = TURN_PROFILE_DEFAULTS[profile];
  const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
  return {
    profile,
    qualityProfile,
    silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
    speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
    transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
  };
};
323
+
324
+ // src/presets.ts
325
// Raw (unresolved) settings for each named voice runtime preset.
// Built inside an IIFE so the small factory helpers stay out of module scope;
// every preset receives its own fresh nested objects.
var PRESET_INPUTS = (() => {
  const conditioning = (maxGain, noiseGateAttenuation, noiseGateThreshold, targetLevel) => ({
    enabled: true,
    maxGain,
    noiseGateAttenuation,
    noiseGateThreshold,
    targetLevel
  });
  const monoCapture = () => ({
    channelCount: 1,
    sampleRateHz: 16000
  });
  const reconnecting = (maxReconnectAttempts, pingInterval) => ({
    maxReconnectAttempts,
    pingInterval,
    reconnect: true
  });
  const turns = (qualityProfile, profile, overrides) => ({
    qualityProfile,
    profile,
    ...overrides
  });
  return {
    chat: {
      audioConditioning: conditioning(2.5, 0, 0.004, 0.08),
      capture: monoCapture(),
      connection: reconnecting(10, 30000),
      sttLifecycle: "continuous",
      turnDetection: turns("short-command", "balanced")
    },
    // The default preset deliberately carries no audioConditioning entry.
    default: {
      capture: monoCapture(),
      connection: reconnecting(10, 30000),
      sttLifecycle: "continuous",
      turnDetection: turns("general", "fast")
    },
    dictation: {
      audioConditioning: conditioning(2.25, 0.05, 0.003, 0.08),
      capture: monoCapture(),
      connection: reconnecting(12, 30000),
      sttLifecycle: "continuous",
      turnDetection: turns("accent-heavy", "long-form")
    },
    "guided-intake": {
      audioConditioning: conditioning(2.5, 0, 0.004, 0.08),
      capture: monoCapture(),
      connection: reconnecting(12, 30000),
      sttLifecycle: "turn-scoped",
      turnDetection: turns("accent-heavy", "long-form")
    },
    "noisy-room": {
      audioConditioning: conditioning(3, 0.12, 0.006, 0.085),
      capture: monoCapture(),
      connection: reconnecting(14, 45000),
      sttLifecycle: "continuous",
      turnDetection: turns("noisy-room", "long-form", {
        silenceMs: 2100,
        speechThreshold: 0.02,
        transcriptStabilityMs: 1650
      })
    },
    reliability: {
      audioConditioning: conditioning(2.9, 0.08, 0.005, 0.08),
      capture: monoCapture(),
      connection: reconnecting(14, 45000),
      sttLifecycle: "continuous",
      turnDetection: turns("noisy-room", "long-form")
    }
  };
})();
461
/**
 * Resolves a named runtime preset into fully populated settings.
 *
 * The name is validated against PRESET_INPUTS' own keys. Previously an unknown
 * name crashed with an opaque "cannot read properties of undefined" TypeError,
 * and an inherited key such as "toString" silently produced a preset built
 * entirely from fallbacks. Both now raise a descriptive TypeError.
 *
 * @param {string} [name="default"] - Preset name.
 * @returns {object} `{ audioConditioning, capture, connection, name,
 *   sttLifecycle, turnDetection }`; `audioConditioning` is undefined when the
 *   preset does not enable conditioning.
 * @throws {TypeError} When `name` is not a known preset.
 */
var resolveVoiceRuntimePreset = (name = "default") => {
  if (!Object.hasOwn(PRESET_INPUTS, name)) {
    throw new TypeError(`Unknown voice runtime preset "${String(name)}". Expected one of: ${Object.keys(PRESET_INPUTS).join(", ")}`);
  }
  const preset = PRESET_INPUTS[name];
  return {
    audioConditioning: resolveAudioConditioningConfig(preset.audioConditioning),
    capture: {
      channelCount: preset.capture?.channelCount ?? 1,
      sampleRateHz: preset.capture?.sampleRateHz ?? 16000
    },
    // Shallow copy so callers cannot mutate the shared preset table.
    connection: {
      ...preset.connection
    },
    name,
    sttLifecycle: preset.sttLifecycle ?? "continuous",
    turnDetection: resolveTurnDetectionConfig(preset.turnDetection)
  };
};
477
+
210
478
  // src/store.ts
211
479
  var createId = () => crypto.randomUUID();
212
/**
 * Builds a fresh, active session record with an empty current turn.
 *
 * @param {string} id - Session identifier.
 * @param {string} [scenarioId] - Optional scenario identifier stored on the record.
 * @returns {object} A new record; every nested array/object is freshly allocated.
 */
var createVoiceSessionRecord = (id, scenarioId) => {
  // Timing fields are present but unset until the turn receives audio/transcripts.
  const currentTurn = {
    finalText: "",
    lastSpeechAt: undefined,
    lastTranscriptAt: undefined,
    partialEndedAt: undefined,
    partialStartedAt: undefined,
    partialText: "",
    silenceStartedAt: undefined,
    transcripts: []
  };
  // Placeholder meaning "no turn committed yet".
  const lastCommittedTurn = {
    committedAt: 0,
    signature: "",
    text: "",
    transcriptIds: []
  };
  return {
    committedTurnIds: [],
    createdAt: Date.now(),
    currentTurn,
    id,
    scenarioId,
    reconnect: { attempts: 0 },
    status: "active",
    transcripts: [],
    turns: [],
    lastCommittedTurn
  };
};
226
/**
 * Produces a brand-new session record that keeps only the previous record's metadata.
 *
 * @param {string} id - Session identifier.
 * @param {object} [existing] - Previous record, if any; only `metadata` is carried over.
 * @param {string} [scenarioId] - Optional scenario identifier for the new record.
 * @returns {object} The fresh record with `metadata` set (undefined when absent).
 */
var resetVoiceSessionRecord = (id, existing, scenarioId) => {
  const fresh = createVoiceSessionRecord(id, scenarioId);
  fresh.metadata = existing?.metadata;
  return fresh;
};
230
510
  var toVoiceSessionSummary = (session) => ({
@@ -261,6 +541,61 @@ var measureAudioLevel = (audio) => {
261
541
  return Math.sqrt(sumSquares / samples.length);
262
542
  };
263
543
  var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
544
// Counts space-separated tokens; expects already-normalized text
// (consecutive spaces would each add an empty token to the count).
var countWords = (value) => {
  if (value.length > 0) {
    return value.split(" ").length;
  }
  return 0;
};
545
/**
 * Chooses between two transcript texts after normalizing both.
 *
 * Rules, in order: a non-empty text beats an empty one; a text that contains
 * the other wins (the current text on equality); otherwise the incoming text
 * wins only when it has strictly more words.
 *
 * @param {string} currentText - Text held so far.
 * @param {string} nextText - Newly received text.
 * @returns {string} The preferred, normalized text.
 */
var selectPreferredTranscriptText = (currentText, nextText) => {
  const existing = normalizeText(currentText);
  const incoming = normalizeText(nextText);
  if (!existing) {
    return incoming;
  }
  if (!incoming) {
    return existing;
  }
  // Substring containment also covers the case where both texts are equal.
  if (existing.includes(incoming)) {
    return existing;
  }
  if (incoming.includes(existing)) {
    return incoming;
  }
  return countWords(incoming) > countWords(existing) ? incoming : existing;
};
565
/**
 * Appends `nextText` to `currentText`, dropping the longest run of words that
 * ends the current text and also starts the next one.
 *
 * @param {string} currentText - Text spoken first.
 * @param {string} nextText - Text spoken afterwards.
 * @returns {string} The merged, normalized text.
 */
var mergeSequentialTranscriptText = (currentText, nextText) => {
  const head = normalizeText(currentText);
  const tail = normalizeText(nextText);
  if (!head) {
    return tail;
  }
  if (!tail) {
    return head;
  }
  const headWords = head.split(" ");
  const tailWords = tail.split(" ");
  // Try the largest possible overlap first so the longest shared run wins.
  let overlap = Math.min(headWords.length, tailWords.length);
  while (overlap > 0) {
    const offset = headWords.length - overlap;
    const aligned = tailWords.slice(0, overlap).every((word, position) => word === headWords[offset + position]);
    if (aligned) {
      return headWords.concat(tailWords.slice(overlap)).join(" ");
    }
    overlap -= 1;
  }
  return `${head} ${tail}`.trim();
};
586
/**
 * Counts how many leading words two texts share after normalization.
 *
 * @param {string} currentText - First text.
 * @param {string} nextText - Second text.
 * @returns {number} Number of identical words at the start of both texts.
 */
var countCommonPrefixWords = (currentText, nextText) => {
  const left = normalizeText(currentText).split(" ").filter(Boolean);
  const right = normalizeText(nextText).split(" ").filter(Boolean);
  const limit = Math.min(left.length, right.length);
  let shared = 0;
  while (shared < limit && left[shared] === right[shared]) {
    shared += 1;
  }
  return shared;
};
264
599
  var mergeTranscriptTexts = (transcripts) => {
265
600
  const merged = [];
266
601
  for (const transcript of transcripts) {
@@ -284,24 +619,141 @@ var mergeTranscriptTexts = (transcripts) => {
284
619
  }
285
620
  return merged.join(" ").trim();
286
621
  };
287
/**
 * Builds the text of a turn from its final transcripts and the current partial.
 *
 * When the partial started at least 250 ms after the last timed final ended and
 * shares no leading words with the final text, the two are concatenated (with
 * overlap removal). Otherwise the preferred one of the two texts is returned.
 *
 * @param {Array<object>} transcripts - Final transcripts of the turn.
 * @param {string} partialText - Latest partial text.
 * @param {{partialStartedAtMs?: number}} [options] - Timing of the partial.
 * @returns {string} The turn text.
 */
var buildTurnText = (transcripts, partialText, options = {}) => {
  const finalText = mergeTranscriptTexts(transcripts);
  const nextPartial = normalizeText(partialText);
  // Walk backwards to find the end time of the most recent transcript that has one.
  let lastFinalEndedAtMs;
  for (let position = transcripts.length - 1; position >= 0; position -= 1) {
    if (typeof transcripts[position].endedAtMs === "number") {
      lastFinalEndedAtMs = transcripts[position].endedAtMs;
      break;
    }
  }
  const partialStartedAtMs = options.partialStartedAtMs;
  const hasBothTexts = Boolean(finalText && nextPartial);
  const hasTimings = typeof lastFinalEndedAtMs === "number" && typeof partialStartedAtMs === "number";
  if (hasBothTexts && hasTimings && partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
    return mergeSequentialTranscriptText(finalText, nextPartial);
  }
  return selectPreferredTranscriptText(finalText, nextPartial);
};
294
631
 
295
632
  // src/session.ts
296
633
  var DEFAULT_RECONNECT_TIMEOUT = 30000;
297
634
  var DEFAULT_MAX_RECONNECT_ATTEMPTS = 10;
635
// Fallback for turnDetection.transcriptStabilityMs.
var DEFAULT_TRANSCRIPT_STABILITY_MS = 450;

// Fallbacks for the optional sttFallback settings
// (replayWindowMs, settleMs, completionTimeoutMs, confidenceThreshold,
// minTextLength, maxAttemptsPerTurn respectively).
var DEFAULT_FALLBACK_REPLAY_MS = 8000;
var DEFAULT_FALLBACK_SETTLE_MS = 220;
var DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS = 2500;
var DEFAULT_FALLBACK_CONFIDENCE_THRESHOLD = 0.6;
var DEFAULT_FALLBACK_MIN_TEXT_LENGTH = 2;
var DEFAULT_FALLBACK_MAX_ATTEMPTS_PER_TURN = 1;

// NOTE(review): not referenced in the visible part of this file; presumably the
// window for treating a repeated commit as a duplicate — confirm at the use site.
var DEFAULT_DUPLICATE_TURN_WINDOW_MS = 5000;

// Margins used by selectBetterTurnText when comparing primary and fallback text.
var FALLBACK_CONFIDENCE_SELECTION_DELTA = 0.05;
var FALLBACK_WORD_COUNT_SELECTION_MARGIN_RATIO = 0.12;

// Audio format used for duration math and for opening the fallback adapter.
var DEFAULT_FORMAT = {
  channels: 1,
  container: "raw",
  encoding: "pcm_s16le",
  sampleRateHz: 16000
};
298
651
  var toError = (value) => value instanceof Error ? value : new Error(String(value));
299
652
  var createEmptyCurrentTurn = () => ({
300
653
  finalText: "",
654
+ lastSpeechAt: undefined,
655
+ lastTranscriptAt: undefined,
656
+ partialEndedAt: undefined,
657
+ partialStartedAt: undefined,
301
658
  partialText: "",
659
+ silenceStartedAt: undefined,
302
660
  transcripts: []
303
661
  });
304
662
  var cloneTranscript = (transcript) => ({ ...transcript });
663
// Counts whitespace-separated words; tolerates leading, trailing and repeated whitespace.
var countWords2 = (text) => {
  const tokens = text.trim().split(/\s+/);
  return tokens.filter(Boolean).length;
};
664
// Trims the text and collapses every internal whitespace run to one space.
var normalizeText2 = (text) => {
  const trimmed = text.trim();
  return trimmed.replace(/\s+/g, " ");
};
665
// Duration in milliseconds of one raw chunk, computed from DEFAULT_FORMAT's
// sample rate and channel count at 2 bytes per sample (pcm_s16le).
var getAudioChunkDurationMs = (chunk) => {
  const bytesPerSecond = DEFAULT_FORMAT.sampleRateHz * DEFAULT_FORMAT.channels * 2;
  return chunk.byteLength / bytesPerSecond * 1000;
};
666
// Total duration in milliseconds of a list of raw audio chunks.
var getBufferedAudioDurationMs = (chunks) => {
  let totalMs = 0;
  chunks.forEach((chunk) => {
    totalMs = totalMs + getAudioChunkDurationMs(chunk);
  });
  return totalMs;
};
667
/**
 * Averages the numeric `confidence` values found on the given transcripts.
 *
 * @param {Iterable<object>} transcripts - Transcripts; entries without a
 *   numeric `confidence` are ignored.
 * @returns {number} The mean confidence, or 0 when no entry has one.
 */
var calculateMeanConfidence = (transcripts) => {
  const scores = Array.from(transcripts, (transcript) => transcript.confidence).filter((score) => typeof score === "number");
  if (scores.length === 0) {
    return 0;
  }
  const sum = scores.reduce((running, score) => running + score, 0);
  return sum / scores.length;
};
681
/**
 * Summarizes the transcripts selected for a turn.
 *
 * @param {Array<object>} transcripts - Transcripts chosen for the turn.
 * @param {string} source - Label of where the text came from; stored as-is.
 * @param {boolean} fallbackUsed - Stored as-is.
 * @param {object} [fallbackDiagnostics] - Stored under `fallback`.
 * @param {object} [correctionDiagnostics] - Stored under `correction`.
 * @returns {object} Quality summary; `averageConfidence` is undefined when no
 *   transcript carries a numeric confidence.
 */
var createTurnQuality = (transcripts, source, fallbackUsed, fallbackDiagnostics, correctionDiagnostics) => {
  let confidenceSum = 0;
  let confidenceSampleCount = 0;
  let finalTranscriptCount = 0;
  let partialTranscriptCount = 0;
  for (const transcript of transcripts) {
    if (typeof transcript.confidence === "number") {
      confidenceSum += transcript.confidence;
      confidenceSampleCount += 1;
    }
    if (transcript.isFinal) {
      finalTranscriptCount += 1;
    } else {
      partialTranscriptCount += 1;
    }
  }
  const averageConfidence = confidenceSampleCount > 0 ? confidenceSum / confidenceSampleCount : undefined;
  return {
    averageConfidence,
    confidenceSampleCount,
    correction: correctionDiagnostics,
    fallback: fallbackDiagnostics,
    fallbackUsed,
    finalTranscriptCount,
    partialTranscriptCount,
    selectedTranscriptCount: transcripts.length,
    source
  };
};
696
// Normalizes correction text the same way as any other transcript text.
var normalizeCorrectionText = (text) => {
  return normalizeText2(text);
};
697
/**
 * Decides whether the fallback speech-to-text pass should run for a turn.
 *
 * Triggers: "always" runs unconditionally; "empty-turn" runs when the text has
 * too few words; "low-confidence" runs when the mean confidence is below the
 * threshold; any other trigger runs when either condition holds. A mean
 * confidence of 0 (no numeric confidence available) never counts as low.
 *
 * NOTE(review): `minTextLength` is compared against a word count, not a
 * character count — confirm that this is the intended unit.
 *
 * @param {{text: string, transcripts: Array<object>}} candidate - Primary result.
 * @param {{trigger: string, minTextLength: number, confidenceThreshold: number}} config
 * @returns {boolean} True when the fallback should be attempted.
 */
var isFallbackNeeded = (candidate, config) => {
  const wordCount = countWords2(normalizeText2(candidate.text));
  const isTooShort = () => wordCount < config.minTextLength;
  const isLowConfidence = () => {
    const meanConfidence = calculateMeanConfidence(candidate.transcripts);
    return meanConfidence > 0 && meanConfidence < config.confidenceThreshold;
  };
  switch (config.trigger) {
    case "always":
      return true;
    case "empty-turn":
      return isTooShort();
    case "low-confidence":
      return isLowConfidence();
    default:
      return isLowConfidence() || isTooShort();
  }
};
712
/**
 * Picks between the primary and the fallback transcription of a turn.
 *
 * Order of checks: an empty side loses; a relative word-count difference of at
 * least FALLBACK_WORD_COUNT_SELECTION_MARGIN_RATIO favors the longer text; a
 * confidence lead larger than FALLBACK_CONFIDENCE_SELECTION_DELTA favors the
 * more confident side; a longer fallback breaks the remaining tie; otherwise
 * the primary is kept.
 *
 * @param {{text: string, wordCount: number, confidence: number}} candidate - Primary result.
 * @param {{text: string, wordCount: number, confidence: number}} fallback - Fallback result.
 * @returns {{reason: string, winner: object}} The chosen side and why.
 */
var selectBetterTurnText = (candidate, fallback) => {
  const decide = (reason, winner) => ({ reason, winner });
  if (!fallback.text) {
    return decide("fallback-empty", candidate);
  }
  if (!candidate.text) {
    return decide("primary-empty", fallback);
  }
  const wordCountDelta = fallback.wordCount - candidate.wordCount;
  // The trailing 1 keeps the divisor positive when both counts are zero.
  const largestWordCount = Math.max(candidate.wordCount, fallback.wordCount, 1);
  const wordCountDeltaRatio = Math.abs(wordCountDelta) / largestWordCount;
  if (wordCountDeltaRatio >= FALLBACK_WORD_COUNT_SELECTION_MARGIN_RATIO && wordCountDelta !== 0) {
    return decide("word-count-margin", wordCountDelta > 0 ? fallback : candidate);
  }
  if (fallback.confidence > candidate.confidence + FALLBACK_CONFIDENCE_SELECTION_DELTA) {
    return decide("confidence-margin", fallback);
  }
  if (candidate.confidence > fallback.confidence + FALLBACK_CONFIDENCE_SELECTION_DELTA) {
    return decide("kept-primary", candidate);
  }
  if (fallback.wordCount > candidate.wordCount) {
    return decide("word-count-tiebreak", fallback);
  }
  return decide("kept-primary", candidate);
};
305
757
  var setTurnResult = (session, turnId, input) => {
306
758
  session.turns = session.turns.map((turn) => turn.id === turnId ? {
307
759
  ...turn,
@@ -318,12 +770,55 @@ var createVoiceSession = (options) => {
318
770
  };
319
771
  const turnDetection = {
320
772
  silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
321
- speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD
773
+ speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
774
+ transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
322
775
  };
776
+ const sttFallback = options.sttFallback ? {
777
+ adapter: options.sttFallback.adapter,
778
+ completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
779
+ confidenceThreshold: options.sttFallback.confidenceThreshold ?? DEFAULT_FALLBACK_CONFIDENCE_THRESHOLD,
780
+ maxAttemptsPerTurn: options.sttFallback.maxAttemptsPerTurn ?? DEFAULT_FALLBACK_MAX_ATTEMPTS_PER_TURN,
781
+ minTextLength: options.sttFallback.minTextLength ?? DEFAULT_FALLBACK_MIN_TEXT_LENGTH,
782
+ replayWindowMs: options.sttFallback.replayWindowMs ?? DEFAULT_FALLBACK_REPLAY_MS,
783
+ settleMs: options.sttFallback.settleMs ?? DEFAULT_FALLBACK_SETTLE_MS,
784
+ trigger: options.sttFallback.trigger ?? "empty-or-low-confidence"
785
+ } : undefined;
786
+ const phraseHints = options.phraseHints ?? [];
323
787
  let socket = options.socket;
324
788
  let sttSession = null;
325
789
  let silenceTimer = null;
326
790
  let speechDetected = false;
791
+ let operationQueue = Promise.resolve();
792
+ let adapterGenerationCounter = 0;
793
+ let activeAdapterGeneration = 0;
794
+ const currentTurnAudio = [];
795
+ let fallbackAttemptsForCurrentTurn = 0;
796
+ const pruneTurnAudio = () => {
797
+ const replayWindowMs = sttFallback?.replayWindowMs ?? DEFAULT_FALLBACK_REPLAY_MS;
798
+ const cutoffAt = Date.now() - replayWindowMs;
799
+ let index = 0;
800
+ while (index < currentTurnAudio.length && currentTurnAudio[index].recordedAt < cutoffAt) {
801
+ index += 1;
802
+ }
803
+ if (index > 0) {
804
+ currentTurnAudio.splice(0, index);
805
+ }
806
+ };
807
+ const pushTurnAudio = (audio) => {
808
+ const chunk = audio instanceof ArrayBuffer ? new Uint8Array(audio.slice(0)) : new Uint8Array(audio.buffer.slice(audio.byteOffset, audio.byteOffset + audio.byteLength));
809
+ currentTurnAudio.push({
810
+ chunk,
811
+ recordedAt: Date.now()
812
+ });
813
+ pruneTurnAudio();
814
+ };
815
+ const getFallbackWindowAudio = () => {
816
+ if (!sttFallback?.adapter) {
817
+ return [];
818
+ }
819
+ pruneTurnAudio();
820
+ return currentTurnAudio.map((audio) => audio.chunk);
821
+ };
327
822
  const clearSilenceTimer = () => {
328
823
  if (!silenceTimer) {
329
824
  return;
@@ -349,12 +844,28 @@ var createVoiceSession = (options) => {
349
844
  await options.store.set(options.id, session);
350
845
  return session;
351
846
  };
847
+ const runSerial = (phase, operation) => {
848
+ const result = operationQueue.then(async () => {
849
+ logger.debug("voice session operation", {
850
+ phase,
851
+ sessionId: options.id
852
+ });
853
+ return await operation();
854
+ });
855
+ operationQueue = result.then(() => {
856
+ return;
857
+ }, () => {
858
+ return;
859
+ });
860
+ return result;
861
+ };
352
862
  const closeAdapter = async (reason) => {
353
863
  if (!sttSession) {
354
864
  return;
355
865
  }
356
866
  const activeSession = sttSession;
357
867
  sttSession = null;
868
+ activeAdapterGeneration = 0;
358
869
  try {
359
870
  await activeSession.close(reason);
360
871
  } catch (error) {
@@ -364,13 +875,87 @@ var createVoiceSession = (options) => {
364
875
  });
365
876
  }
366
877
  };
367
- const scheduleSilenceCommit = () => {
368
- if (silenceTimer) {
878
+ const scheduleTurnCommit = (delayMs, reason, reset = true) => {
879
+ if (!reset && silenceTimer) {
369
880
  return;
370
881
  }
882
+ if (reset) {
883
+ clearSilenceTimer();
884
+ }
371
885
  silenceTimer = setTimeout(() => {
372
- api.commitTurn("silence");
373
- }, turnDetection.silenceMs);
886
+ silenceTimer = null;
887
+ api.commitTurn(reason);
888
+ }, delayMs);
889
+ };
890
+ const scheduleSilenceCommit = (delayMs = turnDetection.silenceMs, reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
891
+ const requestTurnCommit = async (reason) => {
892
+ const session = await readSession();
893
+ const text = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
894
+ partialEndedAtMs: session.currentTurn.partialEndedAt,
895
+ partialStartedAtMs: session.currentTurn.partialStartedAt
896
+ });
897
+ if (!text) {
898
+ return;
899
+ }
900
+ const transcriptStabilityAge = session.currentTurn.lastTranscriptAt !== undefined ? Date.now() - session.currentTurn.lastTranscriptAt : undefined;
901
+ if (reason !== "manual" && typeof transcriptStabilityAge === "number" && transcriptStabilityAge < turnDetection.transcriptStabilityMs) {
902
+ scheduleTurnCommit(turnDetection.transcriptStabilityMs - transcriptStabilityAge, reason);
903
+ return;
904
+ }
905
+ await commitTurnInternal(reason);
906
+ };
907
+ const failInternal = async (error) => {
908
+ clearSilenceTimer();
909
+ const session = await writeSession((currentSession) => {
910
+ currentSession.lastActivityAt = Date.now();
911
+ currentSession.status = "failed";
912
+ });
913
+ const resolvedError = toError(error);
914
+ await send({
915
+ message: resolvedError.message,
916
+ recoverable: false,
917
+ type: "error"
918
+ });
919
+ await closeAdapter("failed");
920
+ speechDetected = false;
921
+ rewindFallbackTurnAudio();
922
+ await options.route.onError?.({
923
+ api,
924
+ context: options.context,
925
+ error: resolvedError,
926
+ session,
927
+ sessionId: options.id
928
+ });
929
+ };
930
+ const completeInternal = async (result) => {
931
+ clearSilenceTimer();
932
+ const session = await writeSession((currentSession) => {
933
+ if (currentSession.status === "completed") {
934
+ return;
935
+ }
936
+ currentSession.lastActivityAt = Date.now();
937
+ currentSession.status = "completed";
938
+ if (result !== undefined && currentSession.turns.length > 0) {
939
+ const lastTurn = currentSession.turns.at(-1);
940
+ if (lastTurn) {
941
+ setTurnResult(currentSession, lastTurn.id, {
942
+ result
943
+ });
944
+ }
945
+ }
946
+ });
947
+ await send({
948
+ sessionId: options.id,
949
+ type: "complete"
950
+ });
951
+ await closeAdapter("complete");
952
+ speechDetected = false;
953
+ rewindFallbackTurnAudio();
954
+ await options.route.onComplete({
955
+ api,
956
+ context: options.context,
957
+ session
958
+ });
374
959
  };
375
960
  const handleError = async (event) => {
376
961
  await send({
@@ -379,18 +964,273 @@ var createVoiceSession = (options) => {
379
964
  type: "error"
380
965
  });
381
966
  if (!event.recoverable) {
382
- await api.fail(event.error);
967
+ await failInternal(event.error);
383
968
  }
384
969
  };
385
970
  const handleClose = async (event) => {
386
971
  if (event.recoverable === false) {
387
- await api.fail(new Error(event.reason ?? "Speech-to-text session closed"));
972
+ await failInternal(new Error(event.reason ?? "Speech-to-text session closed"));
973
+ return;
974
+ }
975
+ if (!event.reason) {
976
+ await closeAdapter("provider stream closed");
977
+ return;
978
+ }
979
+ await closeAdapter(event.reason);
980
+ };
981
+ const rewindFallbackTurnAudio = () => {
982
+ fallbackAttemptsForCurrentTurn = 0;
983
+ currentTurnAudio.length = 0;
984
+ };
985
+ const runFallbackTranscription = async (primaryText, primaryTranscripts) => {
986
+ if (!sttFallback?.adapter || fallbackAttemptsForCurrentTurn >= sttFallback.maxAttemptsPerTurn) {
987
+ return null;
988
+ }
989
+ const candidate = {
990
+ text: primaryText,
991
+ transcripts: primaryTranscripts
992
+ };
993
+ if (!isFallbackNeeded(candidate, sttFallback)) {
994
+ return null;
995
+ }
996
+ fallbackAttemptsForCurrentTurn += 1;
997
+ const replayAudio = getFallbackWindowAudio();
998
+ if (replayAudio.length === 0) {
999
+ return null;
1000
+ }
1001
+ let fallbackSession = null;
1002
+ const fallbackTranscripts = [];
1003
+ let fallbackClosed = false;
1004
+ let fallbackEndOfTurnReceived = false;
1005
+ let fallbackFinalReceived = false;
1006
+ let lastFallbackTranscriptAt = 0;
1007
+ try {
1008
+ fallbackSession = await sttFallback.adapter.open({
1009
+ format: DEFAULT_FORMAT,
1010
+ phraseHints,
1011
+ sessionId: `${options.id}:fallback:${fallbackAttemptsForCurrentTurn}`
1012
+ });
1013
+ } catch (error) {
1014
+ logger.warn("voice stt fallback open failed", {
1015
+ error: toError(error).message,
1016
+ sessionId: options.id
1017
+ });
1018
+ return null;
1019
+ }
1020
+ const unsubscribers = [
1021
+ fallbackSession.on("final", ({ transcript }) => {
1022
+ fallbackFinalReceived = true;
1023
+ lastFallbackTranscriptAt = Date.now();
1024
+ fallbackTranscripts.push(cloneTranscript(transcript));
1025
+ }),
1026
+ fallbackSession.on("partial", ({ transcript }) => {
1027
+ lastFallbackTranscriptAt = Date.now();
1028
+ fallbackTranscripts.push(cloneTranscript(transcript));
1029
+ }),
1030
+ fallbackSession.on("endOfTurn", () => {
1031
+ fallbackEndOfTurnReceived = true;
1032
+ }),
1033
+ fallbackSession.on("error", (event) => {
1034
+ logger.warn("voice stt fallback error", {
1035
+ error: toError(event.error).message,
1036
+ sessionId: options.id
1037
+ });
1038
+ }),
1039
+ fallbackSession.on("close", () => {
1040
+ fallbackClosed = true;
1041
+ })
1042
+ ];
1043
+ const closeFallback = async (reason) => {
1044
+ if (!fallbackSession) {
1045
+ return;
1046
+ }
1047
+ try {
1048
+ await fallbackSession.close(reason);
1049
+ } catch (error) {
1050
+ logger.warn("voice stt fallback close failed", {
1051
+ error: toError(error).message,
1052
+ sessionId: options.id
1053
+ });
1054
+ } finally {
1055
+ fallbackSession = null;
1056
+ }
1057
+ };
1058
+ try {
1059
+ for (const chunk of replayAudio) {
1060
+ await fallbackSession.send(chunk);
1061
+ }
1062
+ const replayDurationMs = getBufferedAudioDurationMs(replayAudio);
1063
+ const completionTimeoutMs = Math.max(sttFallback.completionTimeoutMs, Math.min(4000, Math.max(sttFallback.settleMs * 4, Math.round(replayDurationMs * 0.18))));
1064
+ const waitStartedAt = Date.now();
1065
+ while (Date.now() - waitStartedAt < completionTimeoutMs) {
1066
+ const idleMs = lastFallbackTranscriptAt > 0 ? Date.now() - lastFallbackTranscriptAt : Date.now() - waitStartedAt;
1067
+ if (fallbackEndOfTurnReceived && idleMs >= sttFallback.settleMs) {
1068
+ break;
1069
+ }
1070
+ if (fallbackFinalReceived && idleMs >= sttFallback.settleMs) {
1071
+ break;
1072
+ }
1073
+ if (fallbackClosed && (lastFallbackTranscriptAt === 0 || idleMs >= sttFallback.settleMs)) {
1074
+ break;
1075
+ }
1076
+ await Bun.sleep(Math.min(75, Math.max(25, sttFallback.settleMs / 2)));
1077
+ }
1078
+ } catch (error) {
1079
+ logger.warn("voice stt fallback failed", {
1080
+ error: toError(error).message,
1081
+ sessionId: options.id
1082
+ });
1083
+ } finally {
1084
+ await closeFallback("fallback-complete");
1085
+ for (const unsubscribe of unsubscribers) {
1086
+ unsubscribe();
1087
+ }
1088
+ }
1089
+ if (fallbackTranscripts.length === 0) {
1090
+ return null;
1091
+ }
1092
+ const fallbackText = buildTurnText(fallbackTranscripts, "", {});
1093
+ const fallbackConfidence = calculateMeanConfidence(fallbackTranscripts);
1094
+ const fallbackCandidate = {
1095
+ confidence: fallbackConfidence,
1096
+ text: fallbackText,
1097
+ wordCount: countWords2(normalizeText2(fallbackText))
1098
+ };
1099
+ const primaryCandidate = {
1100
+ confidence: calculateMeanConfidence(primaryTranscripts),
1101
+ text: primaryText,
1102
+ wordCount: countWords2(normalizeText2(primaryText))
1103
+ };
1104
+ const selection = selectBetterTurnText(primaryCandidate, fallbackCandidate);
1105
+ const diagnostics = {
1106
+ attempted: true,
1107
+ fallbackConfidence: fallbackCandidate.confidence,
1108
+ fallbackText: fallbackCandidate.text,
1109
+ fallbackWordCount: fallbackCandidate.wordCount,
1110
+ primaryConfidence: primaryCandidate.confidence,
1111
+ primaryText,
1112
+ primaryWordCount: primaryCandidate.wordCount,
1113
+ selected: selection.winner.text === fallbackCandidate.text,
1114
+ selectionReason: selection.reason,
1115
+ trigger: sttFallback.trigger
1116
+ };
1117
+ if (selection.winner.text === primaryCandidate.text) {
1118
+ return {
1119
+ diagnostics,
1120
+ fallbackUsed: false,
1121
+ source: "primary",
1122
+ text: primaryText,
1123
+ transcripts: primaryTranscripts.map((transcript) => ({
1124
+ ...transcript,
1125
+ isFinal: true
1126
+ }))
1127
+ };
1128
+ }
1129
+ const candidateTranscripts = fallbackText === fallbackCandidate.text ? fallbackTranscripts : [];
1130
+ return {
1131
+ diagnostics,
1132
+ fallbackUsed: true,
1133
+ source: "fallback",
1134
+ text: selection.winner.text,
1135
+ transcripts: candidateTranscripts.length > 0 ? candidateTranscripts.map((transcript) => ({
1136
+ ...transcript,
1137
+ isFinal: true
1138
+ })) : [{ id: createId(), isFinal: false, text: selection.winner.text }]
1139
+ };
1140
+ };
1141
/**
 * Collects the ids that identify a turn's transcripts.
 *
 * @param {Array<{id: string, isFinal?: boolean}>} transcripts - transcripts of one turn.
 * @returns {string[]} ids of the final transcripts, or the ids of every
 *   transcript when none of them is marked final.
 */
const getFinalTranscriptIds = (transcripts) => {
  const finalIds = [];
  const allIds = [];
  for (const transcript of transcripts) {
    allIds.push(transcript.id);
    if (transcript.isFinal) {
      finalIds.push(transcript.id);
    }
  }
  return finalIds.length > 0 ? finalIds : allIds;
};
1146
/**
 * Runs the route's optional `correctTurn` hook over the text of a turn.
 *
 * The hook may return a plain string (the corrected text), an object with
 * optional `text`, `metadata`, `provider` and `reason` fields, or nothing.
 * Any result that carries no usable text keeps the original text.
 *
 * @param {object} input
 * @param {string} input.text - turn text before correction.
 * @param {Array<object>} input.transcripts - transcripts backing the turn; cloned before being handed to the hook.
 * @param {object} input.session - session record forwarded to the hook.
 * @param {object} [input.fallbackDiagnostics] - forwarded to the hook as `fallback`.
 * @returns {Promise<{diagnostics: object, text: string} | undefined>}
 *   `undefined` when the route defines no `correctTurn` hook.
 */
const runTurnCorrection = async (input) => {
  if (!options.route.correctTurn) {
    return;
  }
  const originalText = input.text;
  const result = await options.route.correctTurn({
    api,
    context: options.context,
    fallback: input.fallbackDiagnostics,
    phraseHints,
    session: input.session,
    text: originalText,
    transcripts: input.transcripts.map(cloneTranscript)
  });
  // `typeof null` is "object", so null must be excluded before reading the
  // diagnostic fields; otherwise a hook that returns null throws here.
  const details = typeof result === "object" && result !== null ? result : undefined;
  const nextText = typeof result === "string" ? result : typeof result?.text === "string" ? result.text : originalText;
  const correctedText = normalizeCorrectionText(nextText);
  const normalizedOriginal = normalizeCorrectionText(originalText);
  const hasCorrection = correctedText.length > 0;
  return {
    diagnostics: {
      attempted: true,
      changed: hasCorrection && correctedText !== normalizedOriginal,
      correctedText: hasCorrection ? correctedText : normalizedOriginal,
      metadata: details?.metadata,
      originalText,
      provider: details?.provider,
      reason: details?.reason
    },
    // An empty correction never erases the turn; fall back to the input text.
    text: hasCorrection ? correctedText : originalText
  };
};
1176
/**
 * Makes sure a session record carries a `lastCommittedTurn` entry, seeding an
 * empty one (zero timestamp, empty signature/text/ids) when it is missing.
 *
 * @param {object} session - session record; mutated in place.
 * @returns {object} the same session object.
 */
const ensureCommittedTurnGuard = (session) => {
  session.lastCommittedTurn ||= {
    committedAt: 0,
    signature: "",
    text: "",
    transcriptIds: []
  };
  return session;
};
1187
/**
 * Builds the string used to recognise a repeated commit of the same turn:
 * the normalized text and the comma-joined transcript ids, separated by `|`.
 *
 * @param {object} session - session whose current turn supplies the ids when no override is given.
 * @param {string} finalText - text of the turn.
 * @param {string[]} [transcriptIdsOverride] - ids to use instead of the current turn's ids.
 * @returns {string} the turn signature.
 */
const buildTurnSignature = (session, finalText, transcriptIdsOverride) => {
  const transcriptIds = transcriptIdsOverride ?? getFinalTranscriptIds(session.currentTurn.transcripts);
  const normalizedText = normalizeText2(finalText);
  return [normalizedText, transcriptIds.join(",")].join("|");
};
1191
/**
 * Decides whether committing `finalText` would repeat the turn recorded in
 * `session.lastCommittedTurn`.
 *
 * A commit is only ever a duplicate while the previous commit is younger than
 * DEFAULT_DUPLICATE_TURN_WINDOW_MS. Inside that window it is a duplicate when
 * either the normalized text is unchanged and no audio arrived after the
 * previous commit, or the signature matches and every final transcript of the
 * current turn was already part of the previous commit.
 *
 * @param {object} session - session record holding `currentTurn` and `lastCommittedTurn`.
 * @param {string} finalText - text about to be committed.
 * @returns {boolean} true when the commit should be dropped.
 */
const isDuplicateTurnCommit = (session, finalText) => {
  const currentSignature = buildTurnSignature(session, finalText);
  const previous = session.lastCommittedTurn;
  const withinWindow = previous && previous.committedAt > 0 && Date.now() - previous.committedAt < DEFAULT_DUPLICATE_TURN_WINDOW_MS;
  const previousText = normalizeText2(previous?.text ?? "");
  const textUnchanged = normalizeText2(finalText) === previousText;
  const noAudioSinceCommit = (session.currentTurn.lastAudioAt ?? 0) <= (previous?.committedAt ?? 0);
  if (!withinWindow) {
    return false;
  }
  if (textUnchanged && noAudioSinceCommit) {
    return true;
  }
  if (currentSignature !== (previous?.signature ?? "")) {
    return false;
  }
  const committedIds = new Set(previous?.transcriptIds ?? []);
  return session.currentTurn.transcripts.every((transcript) => !transcript.isFinal || committedIds.has(transcript.id));
};
1213
/**
 * Records the turn that was just committed on `session.lastCommittedTurn`:
 * commit time, signature, normalized text and the transcript ids.
 * Existing fields of the previous entry are kept unless overwritten.
 *
 * @param {object} session - session record; mutated in place.
 * @param {string} finalText - committed text.
 * @param {Array<object>} committedTranscripts - transcripts stored with the turn.
 */
const markTurnCommitted = (session, finalText, committedTranscripts) => {
  const previous = session.lastCommittedTurn ?? {};
  const committedAt = Date.now();
  const signature = buildTurnSignature(session, finalText, getFinalTranscriptIds(committedTranscripts));
  const text = normalizeText2(finalText);
  const transcriptIds = getFinalTranscriptIds(committedTranscripts);
  session.lastCommittedTurn = { ...previous, committedAt, signature, text, transcriptIds };
};
390
1222
  const handlePartial = async (transcript) => {
391
1223
  await writeSession((session) => {
392
- session.currentTurn.lastAudioAt = Date.now();
393
- session.currentTurn.partialText = buildTurnText(session.currentTurn.transcripts, transcript.text);
1224
+ const nextPartialStartedAt = transcript.startedAtMs ?? session.currentTurn.partialStartedAt;
1225
+ const nextPartialEndedAt = transcript.endedAtMs ?? session.currentTurn.partialEndedAt;
1226
+ const preferredPartial = selectPreferredTranscriptText(session.currentTurn.partialText, transcript.text);
1227
+ session.currentTurn.lastTranscriptAt = Date.now();
1228
+ session.currentTurn.partialStartedAt = nextPartialStartedAt;
1229
+ session.currentTurn.partialEndedAt = nextPartialEndedAt;
1230
+ session.currentTurn.partialText = buildTurnText(session.currentTurn.transcripts, preferredPartial, {
1231
+ partialEndedAtMs: nextPartialEndedAt,
1232
+ partialStartedAtMs: nextPartialStartedAt
1233
+ });
394
1234
  session.lastActivityAt = Date.now();
395
1235
  session.status = "active";
396
1236
  });
@@ -412,8 +1252,11 @@ var createVoiceSession = (options) => {
412
1252
  cloneTranscript(transcript)
413
1253
  ];
414
1254
  }
415
- session.currentTurn.finalText = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText);
416
- session.currentTurn.lastAudioAt = Date.now();
1255
+ session.currentTurn.finalText = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
1256
+ partialEndedAtMs: session.currentTurn.partialEndedAt,
1257
+ partialStartedAtMs: session.currentTurn.partialStartedAt
1258
+ });
1259
+ session.currentTurn.lastTranscriptAt = Date.now();
417
1260
  session.lastActivityAt = Date.now();
418
1261
  session.status = "active";
419
1262
  });
@@ -422,36 +1265,60 @@ var createVoiceSession = (options) => {
422
1265
  type: "final"
423
1266
  });
424
1267
  };
1268
/**
 * Re-arms the silence commit timer for a turn that still has text pending
 * (called from the connect path once the session record is loaded).
 *
 * With no pending text, speech detection is switched off and nothing is
 * scheduled. Otherwise the delay is whatever remains of the silence window
 * and of the transcript-stability window, never below zero.
 *
 * @param {object} session - session record whose `currentTurn` is inspected.
 */
const resumePendingTurnCommit = (session) => {
  const turn = session.currentTurn;
  const pendingText = buildTurnText(turn.transcripts, turn.partialText, {
    partialEndedAtMs: turn.partialEndedAt,
    partialStartedAtMs: turn.partialStartedAt
  });
  speechDetected = Boolean(pendingText);
  if (!speechDetected) {
    return;
  }
  // Prefer the start of silence; fall back to the last speech timestamp.
  let audioAge = 0;
  if (turn.silenceStartedAt !== undefined) {
    audioAge = Date.now() - turn.silenceStartedAt;
  } else if (turn.lastSpeechAt !== undefined) {
    audioAge = Date.now() - turn.lastSpeechAt;
  }
  // Without a transcript timestamp the stability window counts as elapsed.
  let transcriptAge = turnDetection.transcriptStabilityMs;
  if (turn.lastTranscriptAt !== undefined) {
    transcriptAge = Date.now() - turn.lastTranscriptAt;
  }
  const remainingSilence = turnDetection.silenceMs - audioAge;
  const remainingStability = turnDetection.transcriptStabilityMs - transcriptAge;
  scheduleSilenceCommit(Math.max(0, remainingSilence, remainingStability));
};
425
1283
/**
 * Returns the open speech-to-text session, opening one on first use.
 *
 * A newly opened session gets a fresh generation number. Every adapter event
 * is queued through `runSerial` and is ignored when, by the time it runs, the
 * active generation is no longer the one the listener was registered for.
 *
 * @returns {Promise<object>} the active STT session.
 */
const ensureAdapter = async () => {
  if (sttSession) {
    return sttSession;
  }
  const adapter = await options.stt.open({
    format: DEFAULT_FORMAT,
    phraseHints,
    sessionId: options.id
  });
  adapterGenerationCounter += 1;
  const generation = adapterGenerationCounter;
  sttSession = adapter;
  activeAdapterGeneration = generation;
  // Queue an adapter callback; skip it if this adapter has been superseded.
  const enqueue = (phase, handler) => {
    runSerial(phase, async () => {
      if (activeAdapterGeneration === generation) {
        await handler();
      }
    });
  };
  adapter.on("partial", (event) => {
    enqueue("adapter.partial", () => handlePartial(event.transcript));
  });
  adapter.on("final", (event) => {
    enqueue("adapter.final", () => handleFinal(event.transcript));
  });
  adapter.on("endOfTurn", (event) => {
    enqueue("adapter.endOfTurn", async () => {
      clearSilenceTimer();
      await requestTurnCommit(event.reason);
    });
  });
  adapter.on("error", (event) => {
    enqueue("adapter.error", () => handleError(event));
  });
  adapter.on("close", (event) => {
    enqueue("adapter.close", () => handleClose(event));
  });
  return adapter;
};
456
1323
  const completeTurn = async (session, turn) => {
457
1324
  const output = await options.route.onTurn({
@@ -480,207 +1347,267 @@ var createVoiceSession = (options) => {
480
1347
  });
481
1348
  }
482
1349
  if (output?.complete) {
483
- await api.complete(output.result);
1350
+ await completeInternal(output.result);
484
1351
  }
485
1352
  };
486
- const api = {
487
- id: options.id,
488
- close: async (reason) => {
489
- clearSilenceTimer();
490
- await closeAdapter(reason);
491
- await Promise.resolve(socket.close(1000, reason));
492
- },
493
- commitTurn: async (reason = "manual") => {
494
- clearSilenceTimer();
495
- const session = await readSession();
496
- if (session.status === "completed" || session.status === "failed") {
497
- return;
498
- }
499
- const text = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText);
500
- if (!text) {
501
- return;
1353
/**
 * Commits the current turn of the session.
 *
 * Steps, in order: build the turn text from the current transcripts and
 * partial; let the fallback transcription replace text/transcripts when it
 * returns a selection; run the optional turn correction; stop when the text
 * is empty or the commit is a duplicate; for non-manual commits whose last
 * transcript is younger than the stability window, reschedule instead of
 * committing; otherwise persist the turn, send a "turn" message, close the
 * adapter when the STT lifecycle is "turn-scoped", and hand the turn to
 * `completeTurn`.
 *
 * @param {string} [reason="manual"] - why the commit was requested.
 * @returns {Promise<void>}
 */
const commitTurnInternal = async (reason = "manual") => {
  clearSilenceTimer();
  const session = await readSession();
  if (["completed", "failed"].includes(session.status)) {
    return;
  }
  const { currentTurn } = session;
  // Single synthetic, non-final transcript used when nothing else backs the text.
  const placeholderFor = (text) => [{ id: createId(), isFinal: false, text }];
  const primaryText = buildTurnText(currentTurn.transcripts, currentTurn.partialText, {
    partialEndedAtMs: currentTurn.partialEndedAt,
    partialStartedAtMs: currentTurn.partialStartedAt
  });
  let transcripts = currentTurn.transcripts.length ? currentTurn.transcripts.map(cloneTranscript) : [];
  let finalText = primaryText;
  // Measured before the awaits below, so it reflects the age at commit request time.
  let transcriptStabilityAge;
  if (currentTurn.lastTranscriptAt !== undefined) {
    transcriptStabilityAge = Date.now() - currentTurn.lastTranscriptAt;
  }
  const fallback = await runFallbackTranscription(primaryText, currentTurn.transcripts);
  const source = fallback?.source ?? "primary";
  const fallbackUsed = fallback?.fallbackUsed ?? false;
  const fallbackDiagnostics = fallback?.diagnostics;
  if (fallback) {
    finalText = fallback.text;
    if (fallback.transcripts.length) {
      transcripts = fallback.transcripts.map(cloneTranscript);
    } else if (!transcripts.length) {
      transcripts = placeholderFor(finalText);
    }
    if (fallback.fallbackUsed) {
      logger.info("voice fallback turn selected", {
        reason,
        sessionId: options.id,
        text: finalText
      });
    }
  }
  const correction = await runTurnCorrection({
    fallbackDiagnostics,
    fallbackUsed,
    session,
    source,
    text: finalText,
    transcripts
  });
  const correctionDiagnostics = correction?.diagnostics;
  if (correction) {
    finalText = correction.text;
  }
  if (!finalText) {
    return;
  }
  if (isDuplicateTurnCommit(session, finalText)) {
    logger.debug("voice turn commit deduped", {
      reason,
      sessionId: options.id
    });
    return;
  }
  const isAutomatic = reason !== "manual";
  if (typeof transcriptStabilityAge === "number" && transcriptStabilityAge < turnDetection.transcriptStabilityMs && isAutomatic) {
    scheduleTurnCommit(turnDetection.transcriptStabilityMs - transcriptStabilityAge, reason, false);
    return;
  }
  const turn = {
    committedAt: Date.now(),
    id: createId(),
    text: finalText,
    quality: createTurnQuality(transcripts, source, fallbackUsed, fallbackDiagnostics, correctionDiagnostics),
    transcripts: transcripts.length > 0 ? transcripts : placeholderFor(finalText)
  };
  const updatedSession = await writeSession((record) => {
    record.committedTurnIds = [...record.committedTurnIds, turn.id];
    record.currentTurn = createEmptyCurrentTurn();
    record.lastActivityAt = Date.now();
    record.status = "active";
    record.turns = [...record.turns, turn];
    markTurnCommitted(record, finalText, transcripts);
  });
  speechDetected = false;
  rewindFallbackTurnAudio();
  logger.info("voice turn committed", {
    reason,
    sessionId: options.id,
    turnId: turn.id
  });
  await send({
    turn,
    type: "turn"
  });
  if (options.sttLifecycle === "turn-scoped") {
    await closeAdapter("turn-commit");
  }
  await completeTurn(updatedSession, turn);
};
1453
/**
 * Attaches a socket to the session and brings the stored record up to date.
 *
 * Order of work: load (or create) the session record, reset it when the
 * requested scenario differs from the stored one, apply the reconnect policy
 * when the stored status is "reconnecting", persist the record, announce it
 * with a "session" message, fire `onSession` for new or reset sessions, and
 * finally either replay "complete" for a completed session or resume any
 * pending turn commit and open the STT adapter.
 *
 * @param {object} nextSocket - socket that becomes the session's transport.
 * @returns {Promise<void>}
 */
const connectInternal = async (nextSocket) => {
  socket = nextSocket;
  const existingSession = await options.store.get(options.id);
  // Decide this before touching the record: `session` aliases
  // `existingSession`, so assigning the new scenario id first would make the
  // comparison always false and the reset unreachable.
  const scenarioChanged = Boolean(existingSession?.scenarioId && options.scenarioId && existingSession.scenarioId !== options.scenarioId);
  let session;
  if (scenarioChanged) {
    session = resetVoiceSessionRecord(options.id, existingSession, options.scenarioId);
  } else {
    session = existingSession ?? createVoiceSessionRecord(options.id, options.scenarioId);
    if (options.scenarioId && session.scenarioId !== options.scenarioId) {
      session.scenarioId = options.scenarioId;
    }
  }
  ensureCommittedTurnGuard(session);
  let shouldFireOnSession = !existingSession || scenarioChanged;
  rewindFallbackTurnAudio();
  if (existingSession?.status === "reconnecting") {
    const nextAttempts = existingSession.reconnect.attempts + 1;
    const reconnectExpired = existingSession.reconnect.lastDisconnectAt !== undefined && Date.now() - existingSession.reconnect.lastDisconnectAt > reconnect.timeout;
    const tooManyAttempts = nextAttempts > reconnect.maxAttempts;
    const policyExhausted = reconnectExpired || tooManyAttempts;
    if (reconnect.strategy === "fail" && policyExhausted) {
      await failInternal(new Error("Voice session reconnect policy exhausted"));
      return;
    }
    if (reconnect.strategy === "restart" && policyExhausted) {
      session = resetVoiceSessionRecord(options.id, existingSession, options.scenarioId);
      shouldFireOnSession = true;
    } else if (!scenarioChanged) {
      // Resume the stored record. A scenario change keeps the reset record
      // instead, so turns from the old scenario are not carried over.
      session = {
        ...existingSession,
        reconnect: {
          ...existingSession.reconnect,
          attempts: nextAttempts
        },
        status: "active"
      };
    }
  }
  await options.store.set(options.id, session);
  await send({
    sessionId: options.id,
    status: session.status,
    scenarioId: session.scenarioId,
    type: "session"
  });
  if (shouldFireOnSession) {
    await options.route.onSession?.({
      api,
      context: options.context,
      session
    });
  }
  if (session.status === "completed") {
    await send({
      sessionId: options.id,
      type: "complete"
    });
    return;
  }
  resumePendingTurnCommit(session);
  await ensureAdapter();
};
1513
/**
 * Handles the socket going away.
 *
 * The silence timer and STT adapter are shut down first. Under the "fail"
 * reconnect strategy the session is failed; otherwise a session that is not
 * already completed or failed is marked "reconnecting" with the disconnect
 * time recorded, and speech detection is switched off.
 *
 * @param {{reason?: string}} [event] - close event details, if any.
 * @returns {Promise<void>}
 */
const disconnectInternal = async (event) => {
  clearSilenceTimer();
  await closeAdapter(event?.reason);
  rewindFallbackTurnAudio();
  if (reconnect.strategy === "fail") {
    const failure = new Error(event?.reason ?? "Voice socket disconnected");
    await failInternal(failure);
    return;
  }
  await writeSession((record) => {
    const isTerminal = record.status === "completed" || record.status === "failed";
    if (!isTerminal) {
      record.lastActivityAt = Date.now();
      record.reconnect.lastDisconnectAt = Date.now();
      record.status = "reconnecting";
    }
  });
  speechDetected = false;
};
1531
/**
 * Processes one incoming audio chunk.
 *
 * The chunk is conditioned, its level measured against the speech threshold,
 * the turn timestamps are updated, the chunk is buffered for fallback replay
 * while speech is (or was) detected, the silence timer is cleared on speech
 * or scheduled on silence when the turn already has text, and finally the
 * conditioned chunk is forwarded to the STT adapter. Chunks for a completed
 * or failed session are ignored.
 *
 * @param {*} audio - raw audio chunk as received from the client.
 * @returns {Promise<void>}
 */
const receiveAudioInternal = async (audio) => {
  const session = await readSession();
  if (session.status === "completed" || session.status === "failed") {
    return;
  }
  const adapter = await ensureAdapter();
  const conditionedAudio = conditionAudioChunk(audio, options.audioConditioning);
  const audioLevel = measureAudioLevel(conditionedAudio);
  const isSpeech = audioLevel >= turnDetection.speechThreshold;
  // Captured before `speechDetected` is updated further down.
  const shouldStoreAudio = speechDetected || isSpeech;
  await writeSession((record) => {
    const turn = record.currentTurn;
    turn.lastAudioAt = Date.now();
    record.lastActivityAt = Date.now();
    record.status = "active";
    if (isSpeech) {
      turn.lastSpeechAt = Date.now();
      turn.silenceStartedAt = undefined;
    } else if (speechDetected && turn.silenceStartedAt === undefined) {
      // First silent chunk after speech marks the start of the silence window.
      turn.silenceStartedAt = Date.now();
    }
  });
  if (shouldStoreAudio) {
    pushTurnAudio(conditionedAudio);
  }
  if (isSpeech) {
    speechDetected = true;
    clearSilenceTimer();
  } else if (speechDetected) {
    const latest = await readSession();
    const pendingText = buildTurnText(latest.currentTurn.transcripts, latest.currentTurn.partialText, {
      partialEndedAtMs: latest.currentTurn.partialEndedAt,
      partialStartedAtMs: latest.currentTurn.partialStartedAt
    });
    if (Boolean(pendingText)) {
      scheduleSilenceCommit(turnDetection.silenceMs, false);
    }
  }
  await adapter.send(conditionedAudio);
};
1569
/**
 * Public session handle. Every operation is queued through `runSerial` under
 * its own phase label and delegates to the matching `*Internal` function.
 */
const api = {
  id: options.id,
  // Stops the silence timer, closes the STT adapter, then closes the socket with code 1000.
  close: async (reason) => {
    await runSerial("api.close", async () => {
      clearSilenceTimer();
      await closeAdapter(reason);
      await Promise.resolve(socket.close(1000, reason));
    });
  },
  commitTurn: async (reason = "manual") => {
    return runSerial("api.commitTurn", async () => {
      await commitTurnInternal(reason);
    });
  },
  complete: async (result) => {
    return runSerial("api.complete", async () => {
      await completeInternal(result);
    });
  },
  connect: async (nextSocket) => {
    return runSerial("api.connect", async () => {
      await connectInternal(nextSocket);
    });
  },
  disconnect: async (event) => {
    return runSerial("api.disconnect", async () => {
      await disconnectInternal(event);
    });
  },
  fail: async (error) => {
    return runSerial("api.fail", async () => {
      await failInternal(error);
    });
  },
  receiveAudio: async (audio) => {
    return runSerial("api.receiveAudio", async () => {
      await receiveAudioInternal(audio);
    });
  },
  // Reads the session record from inside the serial queue.
  snapshot: async () => {
    return runSerial("api.snapshot", async () => readSession());
  }
};
680
1598
  return api;
681
1599
  };
682
1600
 
683
1601
  // src/plugin.ts
1602
/**
 * Reads the scenario requested in a connection query.
 *
 * @param {object} [query] - parsed query parameters.
 * @returns {string | null} the trimmed `scenarioId`, else the trimmed `mode`,
 *   else null when neither is a non-blank string.
 */
var resolveQueryScenario = (query) => {
  for (const key of ["scenarioId", "mode"]) {
    const candidate = query?.[key];
    if (typeof candidate !== "string") {
      continue;
    }
    const trimmed = candidate.trim();
    if (trimmed) {
      return trimmed;
    }
  }
  return null;
};
684
1611
  var HTMX_BOOTSTRAP_DIST_CANDIDATES = [
685
1612
  resolve(import.meta.dir, "client", "htmxBootstrap.js"),
686
1613
  resolve(import.meta.dir, "..", "dist", "client", "htmxBootstrap.js")
@@ -727,6 +1654,21 @@ ${log}` : ""}`);
727
1654
  };
728
1655
  })();
729
1656
  var isArrayBufferView = (value) => typeof value === "object" && value !== null && ArrayBuffer.isView(value);
1657
/**
 * Fills in the defaults of an STT fallback configuration.
 *
 * @param {object} [config] - user-supplied fallback settings.
 * @returns {object | undefined} the settings with every missing (null or
 *   undefined) option replaced by its default, or undefined when no
 *   configuration was given.
 */
var resolveSTTFallbackConfig = (config) => {
  if (!config) {
    return;
  }
  const defaults = {
    completionTimeoutMs: 2500,
    confidenceThreshold: 0.6,
    maxAttemptsPerTurn: 1,
    minTextLength: 2,
    replayWindowMs: 8000,
    settleMs: 220,
    trigger: "empty-or-low-confidence"
  };
  const resolved = { adapter: config.adapter };
  for (const [option, fallbackValue] of Object.entries(defaults)) {
    resolved[option] = config[option] ?? fallbackValue;
  }
  return resolved;
};
730
1672
  var isVoiceClientMessage = (value) => {
731
1673
  if (!value || typeof value !== "object" || !("type" in value)) {
732
1674
  return false;
@@ -739,7 +1681,7 @@ var isVoiceClientMessage = (value) => {
739
1681
  case "ping":
740
1682
  return true;
741
1683
  case "start":
742
- return !("sessionId" in value) || typeof value.sessionId === "string";
1684
+ return (!("sessionId" in value) || typeof value.sessionId === "string") && (!("scenarioId" in value) || typeof value.scenarioId === "string");
743
1685
  default:
744
1686
  return false;
745
1687
  }
@@ -759,14 +1701,16 @@ var parseClientMessage = (raw) => {
759
1701
  return null;
760
1702
  };
761
1703
/**
 * Resolves the session id and scenario id bound to a socket and stores the
 * result in `runtime.socketSessions`.
 *
 * State already recorded for the socket wins over the connection query. The
 * query belongs to the upgrade request and never changes, so giving it
 * priority would undo a later rebinding of the socket (for example a session
 * switch recorded by a "start" message) on the very next message. The query
 * only seeds the first resolution; a random id is generated when it carries
 * no usable `sessionId`.
 *
 * @param {{socketSessions: WeakMap<object, {sessionId: string, scenarioId: string | null}>}} runtime
 * @param {object} ws - socket whose `data.query` may carry `sessionId` and a scenario.
 * @returns {{sessionId: string, scenarioId: string | null}} the resolved binding.
 */
var resolveSessionId = (runtime, ws) => {
  const query = ws.data && typeof ws.data === "object" && "query" in ws.data ? ws.data.query : undefined;
  const existing = runtime.socketSessions.get(ws);
  const querySessionId = typeof query?.sessionId === "string" ? query.sessionId.trim() : "";
  const resolved = {
    sessionId: existing?.sessionId ?? (querySessionId || createId()),
    scenarioId: existing?.scenarioId ?? resolveQueryScenario(query) ?? null
  };
  runtime.socketSessions.set(ws, resolved);
  return resolved;
};
771
1715
  var toAudioChunk = (raw) => {
772
1716
  if (raw instanceof ArrayBuffer) {
@@ -792,6 +1736,38 @@ var normalizeOnTurn = (handler) => {
792
1736
  }
793
1737
  return handler;
794
1738
  };
1739
/**
 * Derives the per-session runtime options from the plugin configuration.
 *
 * Values given in `config` win over the resolved preset. Reconnect settings
 * default to 10 attempts, the "resume-last-turn" strategy and a 30000 ms
 * timeout. Turn detection merges the preset with the explicit overrides
 * before being resolved.
 *
 * @param {object} config - plugin configuration.
 * @returns {object} options with `audioConditioning`, `sttFallback`, `logger`,
 *   `reconnect`, `sttLifecycle` and `turnDetection`.
 */
var resolveSessionOptions = (config) => {
  const preset = resolveVoiceRuntimePreset(config.preset);
  const audioConditioning = config.audioConditioning !== undefined
    ? resolveAudioConditioningConfig(config.audioConditioning)
    : preset.audioConditioning;
  const sttFallback = resolveSTTFallbackConfig(config.sttFallback);
  const { maxAttempts, strategy, timeout } = config.reconnect ?? {};
  const reconnect = {
    maxAttempts: maxAttempts ?? 10,
    strategy: strategy ?? "resume-last-turn",
    timeout: timeout ?? 30000
  };
  const sttLifecycle = config.sttLifecycle ?? preset.sttLifecycle;
  const turnDetection = resolveTurnDetectionConfig({
    ...preset.turnDetection,
    ...config.turnDetection
  });
  return {
    audioConditioning,
    sttFallback,
    logger: config.logger,
    reconnect,
    sttLifecycle,
    turnDetection
  };
};
1757
/**
 * Cleans a list of phrase hints.
 *
 * Each hint keeps its own properties, gets its `text` trimmed, and has its
 * `aliases` reduced to non-blank strings. Hints whose text is blank are
 * dropped. Malformed entries are dropped as well instead of throwing: null
 * entries and hints whose `text` is not a string are skipped, and an
 * `aliases` value that is not an array is treated as absent.
 *
 * @param {Array<object> | null | undefined} hints - raw hints.
 * @returns {Array<object>} normalized hints; empty when `hints` is nullish.
 */
var normalizePhraseHints = (hints) => {
  const normalized = [];
  for (const hint of hints ?? []) {
    const text = typeof hint?.text === "string" ? hint.text.trim() : "";
    if (text.length === 0) {
      continue;
    }
    const aliases = Array.isArray(hint.aliases)
      ? hint.aliases.filter((value) => typeof value === "string" && value.trim().length > 0)
      : undefined;
    normalized.push({ ...hint, aliases, text });
  }
  return normalized;
};
1762
/**
 * Produces the normalized phrase hints for a session.
 *
 * `config.phraseHints` may be absent (no hints), a list of hints, or a
 * function that receives `input` and returns (or resolves to) the list.
 *
 * @param {object} config - plugin configuration.
 * @param {object} input - value handed to a `phraseHints` function.
 * @returns {Promise<Array<object>>} normalized hints.
 */
var resolvePhraseHints = async (config, input) => {
  if (!config.phraseHints) {
    return [];
  }
  const isResolver = typeof config.phraseHints === "function";
  // Invoke through `config` so the resolver keeps its original receiver.
  const rawHints = isResolver ? await config.phraseHints(input) : config.phraseHints;
  return normalizePhraseHints(rawHints);
};
795
1771
  var voice = (config) => {
796
1772
  const runtime = {
797
1773
  activeSessions: new Map,
@@ -799,11 +1775,42 @@ var voice = (config) => {
799
1775
  socketSessions: new WeakMap
800
1776
  };
801
1777
  const onTurn = normalizeOnTurn(config.onTurn);
1778
+ const sessionOptions = resolveSessionOptions(config);
802
1779
  const htmxOptions = config.htmx && typeof config.htmx === "object" ? config.htmx : undefined;
803
1780
  const htmxRoute = htmxOptions?.route ?? `${config.path}/htmx/session`;
804
1781
  const htmxBootstrapRoute = htmxOptions?.bootstrapRoute ?? `${config.path}/htmx/bootstrap.js`;
805
1782
  const htmxRenderers = resolveVoiceHTMXRenderers(config.htmx && config.htmx !== true ? config.htmx : undefined);
806
1783
  const htmxTargets = resolveVoiceHTMXTargets(htmxOptions?.targets);
1784
/**
 * Creates the voice session bound to a socket.
 *
 * Phrase hints are resolved first (with the socket data as context), then the
 * session is built from the resolved session options, the route callbacks of
 * the plugin configuration, and a socket adapter wrapping `ws`.
 *
 * @param {object} ws - socket; its `data` is used as the session context.
 * @param {string} sessionId - id of the session to create.
 * @param {string | null} scenarioId - scenario to attach to the session.
 * @returns {Promise<object>} the created session.
 */
const createManagedSession = async (ws, sessionId, scenarioId) => {
  const context = ws.data;
  const phraseHints = await resolvePhraseHints(config, {
    context,
    scenarioId,
    sessionId
  });
  const { audioConditioning, logger, reconnect, sttFallback, sttLifecycle, turnDetection } = sessionOptions;
  const route = {
    correctTurn: config.correctTurn,
    onComplete: config.onComplete,
    onError: config.onError,
    onSession: config.onSession,
    onTurn
  };
  return createVoiceSession({
    audioConditioning,
    context,
    id: sessionId,
    logger,
    phraseHints,
    reconnect,
    route,
    scenarioId,
    socket: createSocketAdapter(ws),
    store: config.session,
    stt: config.stt,
    sttFallback,
    sttLifecycle,
    turnDetection
  });
};
807
1814
  const htmxRoutes = () => {
808
1815
  if (!config.htmx) {
809
1816
  return new Elysia;
@@ -833,12 +1840,12 @@ var voice = (config) => {
833
1840
  };
834
1841
  return new Elysia({ name: "absolutejs-voice" }).ws(config.path, {
835
1842
  close: async (ws, code, reason) => {
836
- const sessionId = runtime.socketSessions.get(ws);
837
- if (!sessionId) {
1843
+ const socketState = runtime.socketSessions.get(ws);
1844
+ if (!socketState) {
838
1845
  return;
839
1846
  }
840
- const session = runtime.activeSessions.get(sessionId);
841
- runtime.activeSessions.delete(sessionId);
1847
+ const session = runtime.activeSessions.get(socketState.sessionId);
1848
+ runtime.activeSessions.delete(socketState.sessionId);
842
1849
  if (session) {
843
1850
  await session.disconnect({
844
1851
  code,
@@ -849,8 +1856,8 @@ var voice = (config) => {
849
1856
  }
850
1857
  },
851
1858
  message: async (ws, raw) => {
852
- const sessionId = resolveSessionId(runtime, ws);
853
- const current = runtime.activeSessions.get(sessionId);
1859
+ const sessionState = resolveSessionId(runtime, ws);
1860
+ const current = runtime.activeSessions.get(sessionState.sessionId);
854
1861
  const message = parseClientMessage(raw);
855
1862
  if (message) {
856
1863
  if (message.type === "ping") {
@@ -861,10 +1868,27 @@ var voice = (config) => {
861
1868
  }
862
1869
  if (message.type === "close" && current) {
863
1870
  await current.close(message.reason);
864
- runtime.activeSessions.delete(sessionId);
1871
+ runtime.activeSessions.delete(sessionState.sessionId);
1872
+ }
1873
+ if (message.type === "start" && message.sessionId && message.sessionId !== sessionState.sessionId) {
1874
+ const currentSession = runtime.activeSessions.get(sessionState.sessionId);
1875
+ if (currentSession) {
1876
+ await currentSession.close("session-switch");
1877
+ runtime.activeSessions.delete(sessionState.sessionId);
1878
+ }
1879
+ sessionState.sessionId = message.sessionId;
1880
+ runtime.socketSessions.set(ws, {
1881
+ ...sessionState,
1882
+ sessionId: message.sessionId,
1883
+ scenarioId: sessionState.scenarioId
1884
+ });
865
1885
  }
866
- if (message.type === "start" && message.sessionId && message.sessionId !== sessionId) {
867
- runtime.socketSessions.set(ws, message.sessionId);
1886
+ if (message.type === "start" && message.scenarioId) {
1887
+ sessionState.scenarioId = message.scenarioId;
1888
+ runtime.socketSessions.set(ws, {
1889
+ ...sessionState,
1890
+ scenarioId: message.scenarioId
1891
+ });
868
1892
  }
869
1893
  return;
870
1894
  }
@@ -872,66 +1896,22 @@ var voice = (config) => {
872
1896
  if (!audio) {
873
1897
  return;
874
1898
  }
875
- const session = current ?? createVoiceSession({
876
- context: ws.data,
877
- id: sessionId,
878
- logger: config.logger,
879
- reconnect: {
880
- maxAttempts: config.reconnect?.maxAttempts ?? 10,
881
- strategy: config.reconnect?.strategy ?? "resume-last-turn",
882
- timeout: config.reconnect?.timeout ?? 30000
883
- },
884
- route: {
885
- onComplete: config.onComplete,
886
- onError: config.onError,
887
- onSession: config.onSession,
888
- onTurn
889
- },
890
- socket: createSocketAdapter(ws),
891
- store: config.session,
892
- stt: config.stt,
893
- turnDetection: {
894
- silenceMs: config.turnDetection?.silenceMs ?? 700,
895
- speechThreshold: config.turnDetection?.speechThreshold ?? 0.015
896
- }
897
- });
1899
+ const session = current ?? await createManagedSession(ws, sessionState.sessionId, sessionState.scenarioId ?? undefined);
898
1900
  if (!current) {
899
- runtime.activeSessions.set(sessionId, session);
1901
+ runtime.activeSessions.set(sessionState.sessionId, session);
900
1902
  await session.connect(createSocketAdapter(ws));
901
1903
  }
902
1904
  await session.receiveAudio(audio);
903
1905
  },
904
1906
  open: async (ws) => {
905
- const sessionId = resolveSessionId(runtime, ws);
906
- const existing = runtime.activeSessions.get(sessionId);
1907
+ const sessionState = resolveSessionId(runtime, ws);
1908
+ const existing = runtime.activeSessions.get(sessionState.sessionId);
907
1909
  if (existing) {
908
1910
  await existing.close("superseded");
909
- runtime.activeSessions.delete(sessionId);
1911
+ runtime.activeSessions.delete(sessionState.sessionId);
910
1912
  }
911
- const session = createVoiceSession({
912
- context: ws.data,
913
- id: sessionId,
914
- logger: config.logger,
915
- reconnect: {
916
- maxAttempts: config.reconnect?.maxAttempts ?? 10,
917
- strategy: config.reconnect?.strategy ?? "resume-last-turn",
918
- timeout: config.reconnect?.timeout ?? 30000
919
- },
920
- route: {
921
- onComplete: config.onComplete,
922
- onError: config.onError,
923
- onSession: config.onSession,
924
- onTurn
925
- },
926
- socket: createSocketAdapter(ws),
927
- store: config.session,
928
- stt: config.stt,
929
- turnDetection: {
930
- silenceMs: config.turnDetection?.silenceMs ?? 700,
931
- speechThreshold: config.turnDetection?.speechThreshold ?? 0.015
932
- }
933
- });
934
- runtime.activeSessions.set(sessionId, session);
1913
+ const session = await createManagedSession(ws, sessionState.sessionId, sessionState.scenarioId ?? undefined);
1914
+ runtime.activeSessions.set(sessionState.sessionId, session);
935
1915
  await session.connect(createSocketAdapter(ws));
936
1916
  }
937
1917
  }).use(htmxRoutes());
@@ -957,10 +1937,61 @@ var createVoiceMemoryStore = () => {
957
1937
  };
958
1938
  return { get, getOrCreate, list, remove, set };
959
1939
  };
1940
+ // src/correction.ts
1941
/**
 * Escapes every RegExp syntax character in `value` so the result can be
 * embedded in a pattern and match the original text literally.
 *
 * @param {string} value - Raw text to embed in a regular expression.
 * @returns {string} `value` with each metacharacter backslash-escaped.
 */
var escapeRegExp = (value) => value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

/**
 * Returns the usable aliases of a phrase hint: trimmed, with blank entries
 * dropped, ordered longest first so a longer alias is tried before any
 * shorter alias it contains.
 *
 * @param {{ aliases?: string[] }} hint - Phrase hint whose aliases are read.
 * @returns {string[]} Cleaned aliases, longest first.
 */
var normalizeHintAliases = (hint) => (hint.aliases ?? [])
  .map((alias) => alias.trim())
  .filter((alias) => alias.length > 0)
  .sort((left, right) => right.length - left.length);

/**
 * Rewrites every whole-token, case-insensitive occurrence of a hint alias in
 * `text` with that hint's canonical `text`.
 *
 * @param {string} text - Transcript text to correct.
 * @param {Array<{ text: string, aliases?: string[] }> | null | undefined} phraseHints
 *   Hints to apply, in order; a nullish value is treated as "no hints".
 * @returns {{ changed: boolean, matches: Array<{ alias: string, hint: object }>, text: string }}
 *   The corrected text, whether it differs from the input, and one entry per
 *   alias that matched at least once.
 */
var applyPhraseHintCorrections = (text, phraseHints) => {
  // Token boundaries are expressed as "not adjacent to a letter, mark, digit
  // or underscore" instead of `\b`. `\b` is ASCII-only and requires a word
  // character at the alias edge, so aliases such as "c++" or "café" could
  // never match with it.
  const wordChar = "[\\p{L}\\p{M}\\p{N}_]";
  const matches = [];
  let corrected = text;

  for (const hint of phraseHints ?? []) {
    for (const alias of normalizeHintAliases(hint)) {
      const matcher = new RegExp(
        `(?<!${wordChar})${escapeRegExp(alias)}(?!${wordChar})`,
        "giu"
      );
      let replaced = false;
      // A replacer function keeps `$` sequences in the canonical text literal
      // (a string replacement would expand `$$`, `$&`, etc.) and lets a single
      // pass both detect and apply the correction.
      corrected = corrected.replace(matcher, () => {
        replaced = true;
        return hint.text;
      });
      if (replaced) {
        matches.push({ alias, hint });
      }
    }
  }

  return {
    changed: corrected !== text,
    matches,
    text: corrected
  };
};
1965
/**
 * Builds an async turn-correction handler backed by phrase-hint aliases.
 *
 * The returned handler runs `applyPhraseHintCorrections` on the turn text and
 * resolves to `undefined` when nothing changed; otherwise it resolves to a
 * correction object carrying the new text plus the aliases/hints that matched.
 *
 * @param {{ provider?: string, reason?: string }} [options]
 *   `provider` defaults to "@absolutejs/voice", `reason` to
 *   "phrase-hint-correction".
 * @returns {(input: { phraseHints: object[], text: string }) => Promise<object | undefined>}
 */
var createPhraseHintCorrectionHandler = (options = {}) => {
  const provider = options.provider ?? "@absolutejs/voice";
  const reason = options.reason ?? "phrase-hint-correction";

  return async (input) => {
    const { phraseHints, text } = input;
    const outcome = applyPhraseHintCorrections(text, phraseHints);

    if (outcome.changed) {
      let metadata;
      if (outcome.matches.length > 0) {
        metadata = {
          matchedAliases: outcome.matches.map(({ alias }) => alias),
          matchedHints: outcome.matches.map(({ hint }) => hint.text)
        };
      }
      return {
        metadata,
        provider,
        reason,
        text: outcome.text
      };
    }

    return undefined;
  };
};
960
1984
  export {
961
1985
  voice,
1986
+ resolveVoiceRuntimePreset,
1987
+ resolveTurnDetectionConfig,
1988
+ resolveAudioConditioningConfig,
962
1989
  createVoiceSessionRecord,
963
1990
  createVoiceSession,
964
1991
  createVoiceMemoryStore,
965
- createId
1992
+ createPhraseHintCorrectionHandler,
1993
+ createId,
1994
+ conditionAudioChunk,
1995
+ applyPhraseHintCorrections,
1996
+ TURN_PROFILE_DEFAULTS
966
1997
  };