@absolutejs/voice 0.0.20 → 0.0.22-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/README.md +884 -4
  2. package/dist/angular/index.d.ts +1 -0
  3. package/dist/angular/index.js +759 -3
  4. package/dist/angular/voice-controller.service.d.ts +27 -0
  5. package/dist/angular/voice-stream.service.d.ts +6 -0
  6. package/dist/audioConditioning.d.ts +3 -0
  7. package/dist/client/actions.d.ts +48 -0
  8. package/dist/client/audioPlayer.d.ts +40 -0
  9. package/dist/client/connection.d.ts +5 -0
  10. package/dist/client/controller.d.ts +2 -0
  11. package/dist/client/duplex.d.ts +3 -0
  12. package/dist/client/htmxBootstrap.js +660 -167
  13. package/dist/client/index.d.ts +3 -0
  14. package/dist/client/index.js +991 -6
  15. package/dist/client/microphone.d.ts +4 -2
  16. package/dist/correction.d.ts +33 -0
  17. package/dist/fileStore.d.ts +27 -0
  18. package/dist/index.d.ts +15 -0
  19. package/dist/index.js +3721 -298
  20. package/dist/ops.d.ts +100 -0
  21. package/dist/presets.d.ts +13 -0
  22. package/dist/react/index.d.ts +1 -0
  23. package/dist/react/index.js +728 -3
  24. package/dist/react/useVoiceController.d.ts +26 -0
  25. package/dist/react/useVoiceStream.d.ts +7 -0
  26. package/dist/routing.d.ts +3 -0
  27. package/dist/runtimeOps.d.ts +23 -0
  28. package/dist/store.d.ts +2 -2
  29. package/dist/svelte/index.d.ts +1 -0
  30. package/dist/svelte/index.js +691 -3
  31. package/dist/telephony/response.d.ts +7 -0
  32. package/dist/telephony/twilio.d.ts +116 -0
  33. package/dist/testing/benchmark.d.ts +93 -2
  34. package/dist/testing/corrected.d.ts +41 -0
  35. package/dist/testing/duplex.d.ts +59 -0
  36. package/dist/testing/fixtures.d.ts +18 -2
  37. package/dist/testing/index.d.ts +5 -0
  38. package/dist/testing/index.js +6247 -402
  39. package/dist/testing/review.d.ts +143 -0
  40. package/dist/testing/sessionBenchmark.d.ts +92 -2
  41. package/dist/testing/stt.d.ts +3 -1
  42. package/dist/testing/telephony.d.ts +70 -0
  43. package/dist/testing/tts.d.ts +73 -0
  44. package/dist/turnDetection.d.ts +5 -1
  45. package/dist/turnProfiles.d.ts +6 -0
  46. package/dist/types.d.ts +487 -10
  47. package/dist/vue/index.d.ts +1 -0
  48. package/dist/vue/index.js +750 -3
  49. package/dist/vue/useVoiceController.d.ts +30 -0
  50. package/dist/vue/useVoiceStream.d.ts +11 -0
  51. package/fixtures/README.md +9 -0
  52. package/fixtures/manifest.json +59 -1
  53. package/fixtures/pcm/dialogue-three-clean.pcm +0 -0
  54. package/fixtures/pcm/dialogue-three-mixed.pcm +0 -0
  55. package/fixtures/pcm/dialogue-two-clean.pcm +0 -0
  56. package/fixtures/pcm/dialogue-two-noisy.pcm +0 -0
  57. package/package.json +135 -1
package/dist/index.js CHANGED
@@ -69,6 +69,61 @@ var __decorateElement = (array, flags, name, decorators, target, extra) => {
69
69
  return k || __decoratorMetadata(array, target), desc && __defProp(target, name, desc), p ? k ^ 4 ? extra : desc : target;
70
70
  };
71
71
 
72
+ // src/audioConditioning.ts
73
// Fallback tuning values that resolveAudioConditioningConfig applies to any
// field the caller leaves unset. Levels are on the normalized scale produced
// by computeRms (each 16-bit sample divided by 32768).
var DEFAULT_NOISE_GATE_THRESHOLD = 0.006; // chunks whose RMS is below this are treated as gated
var DEFAULT_NOISE_GATE_ATTENUATION = 0.15; // multiplier applied to gated chunks
var DEFAULT_TARGET_LEVEL = 0.08; // RMS level the computed gain aims for
var DEFAULT_MAX_GAIN = 3; // upper bound on the computed gain
77
/**
 * Interpret raw bytes as 16-bit samples in platform byte order.
 *
 * @param {ArrayBuffer | ArrayBufferView} audio - Raw bytes; a trailing odd byte is ignored.
 * @returns {Int16Array} A view over the caller's memory when the input is
 *   2-byte aligned, otherwise a copy of the bytes.
 */
var toInt16Array = (audio) => {
  if (audio instanceof ArrayBuffer) {
    return new Int16Array(audio, 0, Math.floor(audio.byteLength / 2));
  }
  const sampleCount = Math.floor(audio.byteLength / 2);
  if (audio.byteOffset % Int16Array.BYTES_PER_ELEMENT === 0) {
    return new Int16Array(audio.buffer, audio.byteOffset, sampleCount);
  }
  // An Int16Array cannot start at an odd byte offset (the constructor throws
  // a RangeError), which happens for byte views sliced out of a larger
  // buffer. Copy the bytes into a fresh, aligned buffer instead.
  const byteCount = sampleCount * Int16Array.BYTES_PER_ELEMENT;
  const aligned = new Uint8Array(byteCount);
  aligned.set(new Uint8Array(audio.buffer, audio.byteOffset, byteCount));
  return new Int16Array(aligned.buffer);
};
83
/**
 * Root-mean-square level of 16-bit samples, with each sample divided by
 * 32768 before squaring.
 *
 * @param {ArrayLike<number>} samples - Signed 16-bit sample values.
 * @returns {number} The RMS level, or 0 when there are no samples.
 */
var computeRms = (samples) => {
  const count = samples.length;
  if (count === 0) {
    return 0;
  }
  let energy = 0;
  for (let position = 0; position < count; position += 1) {
    const unit = samples[position] / 32768;
    energy += unit * unit;
  }
  return Math.sqrt(energy / count);
};
94
/**
 * Fill in defaults for an audio-conditioning config.
 *
 * @param {object | null | undefined} config - Partial config; may carry `enabled: false`.
 * @returns {object | undefined} A fully populated config with `enabled: true`,
 *   or undefined when no config is given or it is explicitly disabled.
 */
var resolveAudioConditioningConfig = (config) => {
  const isDisabled = !config || config.enabled === false;
  if (isDisabled) {
    return undefined;
  }
  const resolved = { enabled: true };
  // Only null/undefined fall back; an explicit 0 is a valid setting.
  resolved.maxGain = config.maxGain ?? DEFAULT_MAX_GAIN;
  resolved.noiseGateAttenuation = config.noiseGateAttenuation ?? DEFAULT_NOISE_GATE_ATTENUATION;
  resolved.noiseGateThreshold = config.noiseGateThreshold ?? DEFAULT_NOISE_GATE_THRESHOLD;
  resolved.targetLevel = config.targetLevel ?? DEFAULT_TARGET_LEVEL;
  return resolved;
};
106
/**
 * Apply the noise gate and gain to one chunk of 16-bit audio.
 *
 * @param {ArrayBuffer | ArrayBufferView} audio - Raw chunk bytes.
 * @param {object | undefined} config - Resolved conditioning config; when
 *   falsy the chunk is returned untouched.
 * @returns {ArrayBuffer | ArrayBufferView | Uint8Array} The original `audio`
 *   when there is nothing to do, otherwise a new Uint8Array over the
 *   conditioned samples.
 */
var conditionAudioChunk = (audio, config) => {
  if (!config) {
    return audio;
  }
  const samples = toInt16Array(audio);
  if (samples.length === 0) {
    return audio;
  }
  const level = computeRms(samples);
  const isGated = level < config.noiseGateThreshold;
  const gateFactor = isGated ? config.noiseGateAttenuation : 1;
  // Floor the reference level so the division below never divides by zero.
  const referenceLevel = Math.max(level * gateFactor, 0.000001);
  const desiredGain = Math.min(config.maxGain, config.targetLevel / referenceLevel);
  // The gain is floored at 0.25 before the gate factor is applied.
  const appliedGain = Math.max(0.25, desiredGain) * gateFactor;
  const conditioned = Int16Array.from(samples, (sample) => {
    const scaled = Math.round(sample * appliedGain);
    // Clamp to the signed 16-bit range.
    return Math.max(-32768, Math.min(32767, scaled));
  });
  return new Uint8Array(conditioned.buffer);
};
126
+
72
127
  // src/plugin.ts
73
128
  import { Elysia } from "elysia";
74
129
  import { resolve } from "path";
@@ -118,6 +173,10 @@ var defaultMetrics = (input) => {
118
173
  '<span class="voice-metric-label">Session</span>',
119
174
  `<span class="voice-metric-value">${escapeHtml(input.sessionId)}</span>`,
120
175
  "</div>",
176
+ input.session?.scenarioId ? `<div class="voice-metric">
177
+ <span class="voice-metric-label">Scenario</span>
178
+ <span class="voice-metric-value">${escapeHtml(input.session.scenarioId)}</span>
179
+ </div>` : "",
121
180
  '<div class="voice-metric">',
122
181
  '<span class="voice-metric-label">Status</span>',
123
182
  `<span class="voice-metric-value">${escapeHtml(input.status)}</span>`,
@@ -207,24 +266,1224 @@ var resolveLogger = (logger) => ({
207
266
  ...logger
208
267
  });
209
268
 
269
+ // src/turnProfiles.ts
270
// Baseline turn-detection timings per turn profile. Every profile carries
// the "general" quality profile plus its own three timing values.
var TURN_PROFILE_DEFAULTS = (() => {
  const timing = (silenceMs, speechThreshold, transcriptStabilityMs) => ({
    qualityProfile: "general",
    silenceMs,
    speechThreshold,
    transcriptStabilityMs
  });
  return {
    balanced: timing(1400, 0.012, 1000),
    fast: timing(700, 0.015, 450),
    "long-form": timing(2200, 0.01, 1500)
  };
})();
290
// Per-quality-profile timing overrides. "general" overrides nothing, so the
// turn profile's own values apply for it.
var QUALITY_PROFILE_DEFAULTS = (() => {
  const override = (silenceMs, speechThreshold, transcriptStabilityMs) => ({
    silenceMs,
    speechThreshold,
    transcriptStabilityMs
  });
  return {
    general: {},
    "accent-heavy": override(1200, 0.01, 1200),
    "noisy-room": override(2000, 0.02, 1600),
    "short-command": override(500, 0.016, 420)
  };
})();
308
// Profile names resolveTurnDetectionConfig falls back to when the caller's
// config omits `qualityProfile` / `profile`.
var DEFAULT_QUALITY_PROFILE = "general";
var DEFAULT_TURN_PROFILE = "fast";
310
/**
 * Resolve a partial turn-detection config into concrete timing values.
 *
 * Each timing field is taken from the first source that defines it:
 * the explicit config value, then the quality profile, then the turn profile.
 *
 * @param {object | undefined} config - Optional partial config.
 * @returns {object} `{ profile, qualityProfile, silenceMs, speechThreshold, transcriptStabilityMs }`.
 */
var resolveTurnDetectionConfig = (config) => {
  const profile = config?.profile ?? DEFAULT_TURN_PROFILE;
  const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
  const profileDefaults = TURN_PROFILE_DEFAULTS[profile];
  const qualityOverrides = QUALITY_PROFILE_DEFAULTS[qualityProfile];
  const pick = (field) => config?.[field] ?? qualityOverrides[field] ?? profileDefaults[field];
  return {
    profile,
    qualityProfile,
    silenceMs: pick("silenceMs"),
    speechThreshold: pick("speechThreshold"),
    transcriptStabilityMs: pick("transcriptStabilityMs")
  };
};
323
+
324
+ // src/presets.ts
325
// Raw inputs for each named runtime preset, consumed by
// resolveVoiceRuntimePreset. The factories return a fresh object on every
// call so that no two presets share a nested object.
var PRESET_INPUTS = (() => {
  const conditioning = (maxGain, noiseGateAttenuation, noiseGateThreshold, targetLevel) => ({
    enabled: true,
    maxGain,
    noiseGateAttenuation,
    noiseGateThreshold,
    targetLevel
  });
  const monoCapture = () => ({
    channelCount: 1,
    sampleRateHz: 16000
  });
  const reconnecting = (maxReconnectAttempts, pingInterval) => ({
    maxReconnectAttempts,
    pingInterval,
    reconnect: true
  });
  const turns = (qualityProfile, profile, overrides = {}) => ({
    qualityProfile,
    profile,
    ...overrides
  });
  return {
    chat: {
      audioConditioning: conditioning(2.5, 0, 0.004, 0.08),
      capture: monoCapture(),
      connection: reconnecting(10, 30000),
      sttLifecycle: "continuous",
      turnDetection: turns("short-command", "balanced")
    },
    // The default preset is the only one without an audioConditioning entry.
    default: {
      capture: monoCapture(),
      connection: reconnecting(10, 30000),
      sttLifecycle: "continuous",
      turnDetection: turns("general", "fast")
    },
    dictation: {
      audioConditioning: conditioning(2.25, 0.05, 0.003, 0.08),
      capture: monoCapture(),
      connection: reconnecting(12, 30000),
      sttLifecycle: "continuous",
      turnDetection: turns("accent-heavy", "long-form")
    },
    "guided-intake": {
      audioConditioning: conditioning(2.5, 0, 0.004, 0.08),
      capture: monoCapture(),
      connection: reconnecting(12, 30000),
      sttLifecycle: "turn-scoped",
      turnDetection: turns("accent-heavy", "long-form")
    },
    "noisy-room": {
      audioConditioning: conditioning(3, 0.12, 0.006, 0.085),
      capture: monoCapture(),
      connection: reconnecting(14, 45000),
      sttLifecycle: "continuous",
      turnDetection: turns("noisy-room", "long-form", {
        silenceMs: 2100,
        speechThreshold: 0.02,
        transcriptStabilityMs: 1650
      })
    },
    "pstn-balanced": {
      audioConditioning: conditioning(2.8, 0.07, 0.005, 0.08),
      capture: monoCapture(),
      connection: reconnecting(14, 45000),
      sttLifecycle: "continuous",
      turnDetection: turns("noisy-room", "long-form", {
        silenceMs: 660,
        speechThreshold: 0.012,
        transcriptStabilityMs: 300
      })
    },
    "pstn-fast": {
      audioConditioning: conditioning(2.75, 0.06, 0.005, 0.08),
      capture: monoCapture(),
      connection: reconnecting(14, 45000),
      sttLifecycle: "continuous",
      turnDetection: turns("noisy-room", "long-form", {
        silenceMs: 620,
        speechThreshold: 0.012,
        transcriptStabilityMs: 280
      })
    },
    reliability: {
      audioConditioning: conditioning(2.9, 0.08, 0.005, 0.08),
      capture: monoCapture(),
      connection: reconnecting(14, 45000),
      sttLifecycle: "continuous",
      turnDetection: turns("noisy-room", "long-form")
    }
  };
})();
513
/**
 * Resolve a named runtime preset into concrete settings.
 *
 * @param {string} [name="default"] - A key of PRESET_INPUTS.
 * @returns {object} `{ audioConditioning, capture, connection, name, sttLifecycle, turnDetection }`.
 * @throws {TypeError} When `name` is not one of the known presets.
 */
var resolveVoiceRuntimePreset = (name = "default") => {
  // Check own keys only: a bare lookup would fail with an opaque
  // "cannot read properties of undefined" for typos, and would resolve
  // inherited keys such as "constructor" into a bogus preset.
  if (!Object.hasOwn(PRESET_INPUTS, name)) {
    const known = Object.keys(PRESET_INPUTS).join(", ");
    throw new TypeError(`Unknown voice runtime preset "${String(name)}". Expected one of: ${known}.`);
  }
  const preset = PRESET_INPUTS[name];
  return {
    audioConditioning: resolveAudioConditioningConfig(preset.audioConditioning),
    capture: {
      channelCount: preset.capture?.channelCount ?? 1,
      sampleRateHz: preset.capture?.sampleRateHz ?? 16000
    },
    // Copy so callers cannot mutate the shared preset table.
    connection: {
      ...preset.connection
    },
    name,
    sttLifecycle: preset.sttLifecycle ?? "continuous",
    turnDetection: resolveTurnDetectionConfig(preset.turnDetection)
  };
};
529
+
530
+ // src/ops.ts
531
/**
 * Return a copy of `task` with `entry` appended to its history and
 * `updatedAt` refreshed. The input task is not mutated.
 *
 * @param {object} task - Task record; `history` may be missing.
 * @param {object} entry - History entry; `at` defaults to the current time.
 * @returns {object} The updated task copy.
 */
var ensureTaskHistory = (task, entry) => {
  // Read the clock once so a defaulted entry timestamp and `updatedAt`
  // always agree; two separate reads can straddle a millisecond boundary.
  const timestamp = Date.now();
  return {
    ...task,
    history: [
      ...task.history ?? [],
      {
        ...entry,
        at: entry.at ?? timestamp
      }
    ],
    updatedAt: timestamp
  };
};
542
/**
 * Copy a task and set its `id`.
 *
 * @param {string} id - Identifier to set.
 * @param {object} task - Task fields; not mutated.
 * @returns {object} A shallow copy of `task` whose `id` is `id`.
 */
var withVoiceOpsTaskId = (id, task) => {
  const identified = { ...task };
  identified.id = id;
  return identified;
};
546
/**
 * Copy an integration event and set its `id`.
 *
 * @param {string} id - Identifier to set.
 * @param {object} event - Event fields; not mutated.
 * @returns {object} A shallow copy of `event` whose `id` is `id`.
 */
var withVoiceIntegrationEventId = (id, event) => {
  const identified = { ...event };
  identified.id = id;
  return identified;
};
550
/**
 * Derive a follow-up ops task from a call review.
 *
 * @param {object} review - Review with `id`, `summary.outcome`, and optional
 *   `generatedAt` / `postCall` (`summary`, `target`, `recommendedAction`).
 * @returns {object | null} An open task for the outcomes "voicemail",
 *   "no-answer", "escalated", "transferred" and "failed"; null for any other
 *   outcome.
 */
var buildVoiceOpsTaskFromReview = (review) => {
  const createdAt = review.generatedAt ?? Date.now();
  const postCall = review.postCall;
  const outcome = review.summary.outcome;
  const base = {
    createdAt,
    history: [
      {
        actor: "system",
        at: createdAt,
        detail: postCall?.summary,
        type: "created"
      }
    ],
    id: `${review.id}:ops`,
    intakeId: review.id,
    outcome,
    recommendedAction: postCall?.recommendedAction ?? "Review the voice artifact and decide the next operator action.",
    reviewId: review.id,
    status: "open",
    target: postCall?.target,
    updatedAt: createdAt
  };
  // outcome -> [fallback description, task kind, title builder]
  const variants = new Map([
    [
      "voicemail",
      [
        "Caller reached voicemail and needs a callback follow-up.",
        "callback",
        () => postCall?.target ? `Call back voicemail from ${postCall.target}` : "Call back voicemail lead"
      ]
    ],
    [
      "no-answer",
      [
        "Live contact was not established and should be retried.",
        "callback",
        () => "Retry no-answer call"
      ]
    ],
    [
      "escalated",
      [
        "The automated path escalated this call for human review.",
        "escalation",
        () => "Review escalated call"
      ]
    ],
    [
      "transferred",
      [
        "The call was transferred and should be verified downstream.",
        "transfer-check",
        () => postCall?.target ? `Verify transfer to ${postCall.target}` : "Verify call transfer"
      ]
    ],
    [
      "failed",
      [
        "The call failed and needs operator review before retry.",
        "retry-review",
        () => "Inspect failed call before retry"
      ]
    ]
  ]);
  const variant = variants.get(outcome);
  if (!variant) {
    return null;
  }
  const [fallbackDescription, kind, buildTitle] = variant;
  return {
    ...base,
    description: postCall?.summary ?? fallbackDescription,
    kind,
    title: buildTitle()
  };
};
611
/**
 * Assign a task to an owner and record the assignment in its history.
 *
 * @param {object} task - Task to assign; not mutated.
 * @param {string} owner - Assignee name; trimmed, and "ops" when blank.
 * @param {object} [input] - Optional `actor` and `at` for the history entry.
 * @returns {object} The updated task copy.
 */
var assignVoiceOpsTask = (task, owner, input = {}) => {
  const trimmedOwner = owner.trim();
  const assignee = trimmedOwner || "ops";
  const assignedTask = {
    ...task,
    assignee
  };
  const historyEntry = {
    actor: input.actor ?? assignee,
    at: input.at,
    detail: `Assigned to ${assignee}`,
    type: "assigned"
  };
  return ensureTaskHistory(assignedTask, historyEntry);
};
623
/**
 * Move a task to "in-progress" and record a "started" history entry.
 *
 * @param {object} task - Task to start; not mutated.
 * @param {object} [input] - Optional `actor`, `at`, and `detail`.
 * @returns {object} The updated task copy.
 */
var startVoiceOpsTask = (task, input = {}) => {
  const startedTask = {
    ...task,
    status: "in-progress"
  };
  const historyEntry = {
    actor: input.actor ?? task.assignee ?? "ops",
    at: input.at,
    detail: input.detail ?? "Work started",
    type: "started"
  };
  return ensureTaskHistory(startedTask, historyEntry);
};
632
/**
 * Move a task to "done" and record a "completed" history entry.
 *
 * @param {object} task - Task to complete; not mutated.
 * @param {object} [input] - Optional `actor`, `at`, and `detail`.
 * @returns {object} The updated task copy.
 */
var completeVoiceOpsTask = (task, input = {}) => {
  const completedTask = {
    ...task,
    status: "done"
  };
  const historyEntry = {
    actor: input.actor ?? task.assignee ?? "ops",
    at: input.at,
    detail: input.detail ?? "Marked done",
    type: "completed"
  };
  return ensureTaskHistory(completedTask, historyEntry);
};
641
/**
 * Move a task back to "open" and record a "reopened" history entry.
 *
 * @param {object} task - Task to reopen; not mutated.
 * @param {object} [input] - Optional `actor`, `at`, and `detail`.
 * @returns {object} The updated task copy.
 */
var reopenVoiceOpsTask = (task, input = {}) => {
  const reopenedTask = {
    ...task,
    status: "open"
  };
  const historyEntry = {
    actor: input.actor ?? task.assignee ?? "ops",
    at: input.at,
    detail: input.detail ?? "Task reopened",
    type: "reopened"
  };
  return ensureTaskHistory(reopenedTask, historyEntry);
};
650
/**
 * Return the tasks ordered newest first by `createdAt`.
 *
 * @param {Iterable<object>} tasks - Tasks to order; the input is not mutated.
 * @returns {object[]} A new, sorted array.
 */
var listVoiceOpsTasks = (tasks) => {
  const newestFirst = (left, right) => right.createdAt - left.createdAt;
  const ordered = [...tasks];
  ordered.sort(newestFirst);
  return ordered;
};
651
/**
 * Aggregate task counts by status, kind, outcome, target and assignee.
 *
 * @param {object[]} tasks - Tasks to summarize.
 * @returns {object} Status counters plus `[key, count]` pairs for each
 *   breakdown, sorted by descending count.
 */
var summarizeVoiceOpsTasks = (tasks) => {
  const kindCounts = new Map();
  const outcomeCounts = new Map();
  const assigneeCounts = new Map();
  const targetCounts = new Map();
  let open = 0;
  let inProgress = 0;
  let done = 0;
  const tally = (counts, key) => {
    counts.set(key, (counts.get(key) ?? 0) + 1);
  };
  const ranked = (counts) => [...counts.entries()].sort((left, right) => right[1] - left[1]);
  for (const task of tasks) {
    switch (task.status) {
      case "open":
        open += 1;
        break;
      case "in-progress":
        inProgress += 1;
        break;
      case "done":
        done += 1;
        break;
    }
    // Kind is always counted; the other breakdowns skip falsy values.
    tally(kindCounts, task.kind);
    if (task.outcome) {
      tally(outcomeCounts, task.outcome);
    }
    if (task.target) {
      tally(targetCounts, task.target);
    }
    if (task.assignee) {
      tally(assigneeCounts, task.assignee);
    }
  }
  return {
    byKind: ranked(kindCounts),
    byOutcome: ranked(outcomeCounts),
    done,
    inProgress,
    open,
    topAssignees: ranked(assigneeCounts),
    topTargets: ranked(targetCounts),
    total: tasks.length
  };
};
692
/**
 * Build an integration event envelope.
 *
 * @param {string} type - Event type.
 * @param {object} payload - Event payload, stored as given.
 * @param {object} [input] - Optional `createdAt` and `id`; they default to
 *   the current time and a random UUID.
 * @returns {object} `{ createdAt, id, payload, type }`.
 */
var createVoiceIntegrationEvent = (type, payload, input = {}) => {
  const createdAt = input.createdAt ?? Date.now();
  const id = input.id ?? crypto.randomUUID();
  return { createdAt, id, payload, type };
};
698
/**
 * Build the "call.completed" integration event for a session.
 *
 * @param {object} input - `session` plus optional `disposition` and
 *   `sessionSummary`; an explicit disposition wins over the session call's.
 * @returns {object} The event, with id `<sessionId>:call.completed`.
 */
var createVoiceCallCompletedEvent = (input) => {
  const { session } = input;
  const payload = {
    call: session.call,
    disposition: input.disposition ?? session.call?.disposition,
    scenarioId: session.scenarioId,
    sessionId: session.id,
    sessionSummary: input.sessionSummary,
    status: session.status,
    turnCount: session.turns.length
  };
  const identity = {
    id: `${session.id}:call.completed`
  };
  return createVoiceIntegrationEvent("call.completed", payload, identity);
};
709
/**
 * Build the "review.saved" integration event for a call review.
 *
 * @param {object} review - Review with `id`, `title`, `summary`, and optional `postCall`.
 * @returns {object} The event, with id `<reviewId>:review.saved`.
 */
var createVoiceReviewSavedEvent = (review) => {
  const { summary } = review;
  const payload = {
    elapsedMs: summary.elapsedMs,
    firstTurnLatencyMs: summary.firstTurnLatencyMs,
    outcome: summary.outcome,
    postCall: review.postCall,
    reviewId: review.id,
    title: review.title
  };
  const identity = {
    id: `${review.id}:review.saved`
  };
  return createVoiceIntegrationEvent("review.saved", payload, identity);
};
719
/**
 * Build the "task.created" integration event for an ops task.
 *
 * @param {object} task - The ops task.
 * @returns {object} The event, with id `<taskId>:task.created:<updatedAt>`.
 */
var createVoiceTaskCreatedEvent = (task) => {
  const payload = {
    assignee: task.assignee,
    kind: task.kind,
    outcome: task.outcome,
    recommendedAction: task.recommendedAction,
    reviewId: task.reviewId,
    status: task.status,
    target: task.target,
    taskId: task.id,
    title: task.title
  };
  const identity = {
    id: `${task.id}:task.created:${task.updatedAt}`
  };
  return createVoiceIntegrationEvent("task.created", payload, identity);
};
732
/**
 * Build the "task.updated" integration event for an ops task. Unlike the
 * created event, the payload also carries `history` and `updatedAt`.
 *
 * @param {object} task - The ops task.
 * @returns {object} The event, with id `<taskId>:task.updated:<updatedAt>`.
 */
var createVoiceTaskUpdatedEvent = (task) => {
  const payload = {
    assignee: task.assignee,
    history: task.history,
    kind: task.kind,
    outcome: task.outcome,
    recommendedAction: task.recommendedAction,
    reviewId: task.reviewId,
    status: task.status,
    target: task.target,
    taskId: task.id,
    title: task.title,
    updatedAt: task.updatedAt
  };
  const identity = {
    id: `${task.id}:task.updated:${task.updatedAt}`
  };
  return createVoiceIntegrationEvent("task.updated", payload, identity);
};
747
+
748
+ // src/testing/review.ts
749
/**
 * Round a metric to two decimal places.
 *
 * @param {unknown} value - Candidate metric.
 * @returns {number | undefined} The rounded number, or undefined when
 *   `value` is not of type number.
 */
var roundMetric = (value) => {
  if (typeof value !== "number") {
    return undefined;
  }
  return Math.round(value * 100) / 100;
};
750
/**
 * Format a labelled metric such as "latency: 12.35ms".
 *
 * @param {string} label - Metric label.
 * @param {unknown} value - Metric value.
 * @param {string} [unit="ms"] - Unit suffix.
 * @returns {string | undefined} The formatted text, or undefined when
 *   `value` is not of type number.
 */
var formatMetric = (label, value, unit = "ms") => {
  if (typeof value !== "number") {
    return undefined;
  }
  return `${label}: ${roundMetric(value)}${unit}`;
};
751
/**
 * Find the first timeline entry with the given event name.
 *
 * @param {object[]} timeline - Timeline entries.
 * @param {string} event - Event name to match.
 * @param {string} [source] - When given, the entry's source must match too.
 * @returns {object | undefined} The first match, or undefined.
 */
var findTimelineEvent = (timeline, event, source) => {
  const matches = (entry) => {
    if (entry.event !== event) {
      return false;
    }
    return source === undefined || entry.source === source;
  };
  return timeline.find(matches);
};
752
/**
 * Render one timeline entry as a single bullet line.
 *
 * @param {object} entry - Entry with `atMs`, `source`, `event` and optional
 *   `text`, `reason`, `bytes`, `confidence`, `name`.
 * @returns {string} The space-separated line; absent details are omitted.
 */
var formatTimelineText = (entry) => {
  const segments = [
    `- ${entry.atMs}ms`,
    `[${entry.source}]`,
    entry.event,
    ...(entry.text ? [`"${entry.text}"`] : []),
    ...(entry.reason ? [`reason=${entry.reason}`] : []),
    // Numeric details are kept even when they are 0.
    ...(typeof entry.bytes === "number" ? [`bytes=${entry.bytes}`] : []),
    ...(typeof entry.confidence === "number" ? [`confidence=${roundMetric(entry.confidence)}`] : []),
    ...(entry.name ? [`name=${entry.name}`] : [])
  ];
  return segments.join(" ");
};
771
/**
 * Whether an entry is one of the four per-chunk traffic events
 * ("inbound-media", "inbound-silence-pad", "stt-send", "tts-audio").
 *
 * @param {object} entry - Timeline entry.
 * @returns {boolean} True for those four event names.
 */
var isLowSignalTimelineEvent = (entry) => {
  switch (entry.event) {
    case "inbound-media":
    case "inbound-silence-pad":
    case "stt-send":
    case "tts-audio":
      return true;
    default:
      return false;
  }
};
772
/**
 * Aggregate the per-chunk traffic events of a timeline into one summary per
 * traffic category.
 *
 * @param {object[]} timeline - Timeline entries.
 * @returns {object[]} `{ audioMs, bytes, count, label }` records, in order of
 *   each category's first appearance. Other events are ignored.
 */
var summarizeTimelineTraffic = (timeline) => {
  const labelFor = (event) => {
    switch (event) {
      case "inbound-media":
        return "inbound media chunks";
      case "inbound-silence-pad":
        return "inbound silence padding";
      case "stt-send":
        return "STT audio sends";
      case "tts-audio":
        return "post-first TTS audio chunks";
      default:
        return undefined;
    }
  };
  const numberOrZero = (value) => (typeof value === "number" ? value : 0);
  const byLabel = new Map();
  for (const entry of timeline) {
    const label = labelFor(entry.event);
    if (!label) {
      continue;
    }
    let bucket = byLabel.get(label);
    if (bucket === undefined || bucket === null) {
      bucket = {
        audioMs: 0,
        bytes: 0,
        count: 0,
        label
      };
    }
    bucket.count += 1;
    bucket.bytes += numberOrZero(entry.bytes);
    bucket.audioMs = (bucket.audioMs ?? 0) + numberOrZero(entry.chunkDurationMs);
    byLabel.set(label, bucket);
  }
  return [...byLabel.values()];
};
792
/**
 * Render a timeline as text lines, collapsing runs of consecutive burst
 * events (low-signal traffic events and twilio "media") that share the same
 * event and source into a single "event xN" line.
 *
 * @param {object[]} timeline - Timeline entries, in display order.
 * @returns {string[]} One line per non-burst entry or per collapsed run.
 */
var compactTimeline = (timeline) => {
  const lines = [];
  const numberOrZero = (value) => (typeof value === "number" ? value : 0);
  let cursor = 0;
  while (cursor < timeline.length) {
    const head = timeline[cursor];
    if (!head) {
      break;
    }
    const isBurst = isLowSignalTimelineEvent(head) || head.event === "media" && head.source === "twilio";
    if (!isBurst) {
      lines.push(formatTimelineText(head));
      cursor += 1;
      continue;
    }
    // Extend the run while the following entries repeat the same event/source.
    let last = cursor;
    let byteTotal = numberOrZero(head.bytes);
    let audioTotalMs = numberOrZero(head.chunkDurationMs);
    for (let probe = cursor + 1; probe < timeline.length; probe += 1) {
      const candidate = timeline[probe];
      if (!candidate || candidate.event !== head.event || candidate.source !== head.source) {
        break;
      }
      byteTotal += numberOrZero(candidate.bytes);
      audioTotalMs += numberOrZero(candidate.chunkDurationMs);
      last = probe;
    }
    const endAt = timeline[last]?.atMs ?? head.atMs;
    const runLength = last - cursor + 1;
    const segments = [
      `- ${head.atMs}-${endAt}ms`,
      `[${head.source}]`,
      `${head.event} x${runLength}`
    ];
    if (byteTotal > 0) {
      segments.push(`bytes=${byteTotal}`);
    }
    if (audioTotalMs > 0) {
      segments.push(`audio=${roundMetric(audioTotalMs)}ms`);
    }
    lines.push(segments.join(" "));
    cursor = last + 1;
  }
  return lines;
};
840
/**
 * Copy a call-review artifact and set its `id`.
 *
 * @param {string} id - Identifier to set.
 * @param {object} artifact - Review fields; not mutated.
 * @returns {object} A shallow copy of `artifact` whose `id` is `id`.
 */
var withVoiceCallReviewId = (id, artifact) => {
  const identified = { ...artifact };
  identified.id = id;
  return identified;
};
844
/**
 * Convert a live-telephony benchmark report into a call-review artifact.
 * Only the first fixture result of the report is used.
 *
 * @param {object} report - Report with `fixtures`, and optional `trace`,
 *   `variant`, `ttsConfig`, `turnDetectionConfig`, `generatedAt`.
 * @param {object} [options] - Optional `preset` and `path` copied into the review.
 * @returns {object} The review artifact (config, summary, timeline, transcript, ...).
 * @throws {Error} When the report has no fixture results.
 */
var createVoiceCallReviewFromLiveTelephonyReport = (report, options = {}) => {
  const fixture = report.fixtures?.[0];
  if (!fixture) {
    throw new Error("Live telephony review requires at least one fixture result.");
  }
  const timeline = [...report.trace ?? []].sort((left, right) => left.atMs - right.atMs);
  const firstPartial = findTimelineEvent(timeline, "partial", "stt");
  const commitEvent = findTimelineEvent(timeline, "commit", "turn");
  const firstTtsAudio = findTimelineEvent(timeline, "tts-first-audio", "benchmark");
  const firstOutboundMedia = findTimelineEvent(timeline, "media", "twilio");
  const bargeInEvent = findTimelineEvent(timeline, "barge-in", "benchmark");
  const clearEvent = findTimelineEvent(timeline, "clear", "twilio");
  // Latest STT partial/final that carries non-empty text.
  const hasSttText = (entry) => entry.source === "stt" && (entry.event === "partial" || entry.event === "final") && typeof entry.text === "string" && entry.text.length > 0;
  const lastSttText = [...timeline].reverse().find(hasSttText)?.text ?? undefined;
  // A span is reported only when both of its endpoints were observed.
  const span = (label, from, to) => {
    if (typeof from?.atMs !== "number" || typeof to?.atMs !== "number") {
      return undefined;
    }
    return {
      label,
      valueMs: to.atMs - from.atMs
    };
  };
  const startToFirstPartial = typeof firstPartial?.atMs === "number" ? {
    label: "start to first partial",
    valueMs: firstPartial.atMs
  } : undefined;
  const latencyBreakdown = [
    startToFirstPartial,
    span("first partial to commit", firstPartial, commitEvent),
    span("commit to first TTS audio", commitEvent, firstTtsAudio),
    span("commit to first outbound media", commitEvent, firstOutboundMedia),
    span("barge-in to clear", bargeInEvent, clearEvent)
  ].filter((value) => value !== undefined && value.valueMs >= 0);
  const notes = [
    report.variant?.description,
    firstPartial?.text ? `First partial: "${firstPartial.text}"` : undefined,
    lastSttText ? `Last STT text: "${lastSttText}"` : undefined
  ].filter((value) => typeof value === "string" && value.length > 0);
  const sttConfig = report.variant ? {
    description: report.variant.description,
    id: report.variant.id,
    model: report.variant.model
  } : undefined;
  const summary = {
    clearLatencyMs: roundMetric(fixture.clearLatencyMs),
    elapsedMs: roundMetric(fixture.elapsedMs),
    firstOutboundMediaLatencyMs: roundMetric(fixture.firstOutboundMediaLatencyMs),
    firstTurnLatencyMs: roundMetric(fixture.firstTurnLatencyMs),
    markLatencyMs: roundMetric(fixture.markLatencyMs),
    outboundMediaCount: fixture.outboundMediaCount,
    pass: fixture.passes,
    termRecall: roundMetric(fixture.termRecall),
    turnCount: fixture.turnCount,
    wordErrorRate: roundMetric(fixture.wordErrorRate)
  };
  return {
    config: {
      preset: options.preset,
      stt: sttConfig,
      tts: report.ttsConfig,
      turnDetection: report.turnDetectionConfig
    },
    errors: fixture.errors ?? [],
    expectedText: fixture.expectedText,
    fixtureId: fixture.fixtureId,
    generatedAt: report.generatedAt,
    latencyBreakdown,
    notes,
    path: options.path,
    summary,
    title: fixture.title ?? "Voice Call Review",
    timeline,
    transcript: {
      actual: fixture.actualText,
      expected: fixture.expectedText
    }
  };
};
922
/**
 * Extract a human-readable message from an unknown error value.
 *
 * @param {unknown} error - A string, an Error, or anything else.
 * @returns {string} The string itself or the Error's message when it is not
 *   blank (returned untrimmed); otherwise "Unknown call error".
 */
var toErrorMessage = (error) => {
  const isNonBlank = (text) => text.trim().length > 0;
  if (typeof error === "string" && isNonBlank(error)) {
    return error;
  }
  if (error instanceof Error && isNonBlank(error.message)) {
    return error.message;
  }
  return "Unknown call error";
};
931
/**
 * Create a recorder that accumulates call events on a timeline and can
 * produce a call-review artifact from them at any point.
 *
 * @param {object} [options] - Optional `now` (clock returning milliseconds),
 *   `config`, `fixtureId`, `path`, and `title` copied into the review.
 * @returns {object} `{ finalize, recordError, recordTwilioInbound,
 *   recordTwilioOutbound, recordVoiceMessage }`.
 */
var createVoiceCallReviewRecorder = (options = {}) => {
  const now = options.now ?? (() => Date.now());
  const startedAt = now();
  const errors = [];
  const timeline = [];
  const committedTurns = [];
  const committedTurnIds = new Set;
  // Tracks whether any "tts-first-audio" event has been recorded, so audio
  // messages do not have to rescan the whole timeline for every chunk.
  let hasFirstTtsAudio = false;
  const push = (source, event, fields = {}) => {
    if (event === "tts-first-audio") {
      hasFirstTtsAudio = true;
    }
    timeline.push({
      // Clamp so a clock that steps backwards never yields a negative offset.
      atMs: Math.max(0, now() - startedAt),
      event,
      source,
      ...fields
    });
  };
  // Decoded size of a base64 string. Trailing "=" padding characters do not
  // carry data, so they are subtracted from the 3/4 estimate.
  const decodedBase64Length = (encoded) => {
    const padding = encoded.endsWith("==") ? 2 : encoded.endsWith("=") ? 1 : 0;
    return Math.max(0, Math.floor(encoded.length * 3 / 4) - padding);
  };
  const recordError = (error) => {
    const message = toErrorMessage(error);
    errors.push(message);
    push("turn", "error", {
      reason: message
    });
  };
  // Inbound and outbound Twilio events are recorded identically.
  const recordTwilio = (input) => {
    push("twilio", input.event, {
      bytes: input.bytes,
      chunkDurationMs: input.chunkDurationMs,
      name: input.name,
      reason: input.reason,
      text: input.text,
      track: input.track
    });
  };
  return {
    finalize: () => {
      const sortedTimeline = [...timeline].sort((left, right) => left.atMs - right.atMs);
      const firstPartial = findTimelineEvent(sortedTimeline, "partial", "stt");
      const commitEvent = findTimelineEvent(sortedTimeline, "commit", "turn");
      const firstTtsAudio = findTimelineEvent(sortedTimeline, "tts-first-audio", "benchmark");
      const firstOutboundMedia = findTimelineEvent(sortedTimeline, "media", "twilio");
      const bargeInEvent = findTimelineEvent(sortedTimeline, "barge-in", "benchmark");
      const clearEvent = findTimelineEvent(sortedTimeline, "clear", "twilio");
      const markEvent = findTimelineEvent(sortedTimeline, "mark", "twilio");
      const elapsedMs = sortedTimeline.at(-1)?.atMs ?? 0;
      const lastSttText = [...sortedTimeline].reverse().find((entry) => entry.source === "stt" && (entry.event === "partial" || entry.event === "final") && typeof entry.text === "string" && entry.text.length > 0)?.text ?? undefined;
      const latencyBreakdown = [
        typeof firstPartial?.atMs === "number" ? {
          label: "start to first partial",
          valueMs: firstPartial.atMs
        } : undefined,
        typeof firstPartial?.atMs === "number" && typeof commitEvent?.atMs === "number" ? {
          label: "first partial to commit",
          valueMs: commitEvent.atMs - firstPartial.atMs
        } : undefined,
        typeof commitEvent?.atMs === "number" && typeof firstTtsAudio?.atMs === "number" ? {
          label: "commit to first TTS audio",
          valueMs: firstTtsAudio.atMs - commitEvent.atMs
        } : undefined,
        typeof commitEvent?.atMs === "number" && typeof firstOutboundMedia?.atMs === "number" ? {
          label: "commit to first outbound media",
          valueMs: firstOutboundMedia.atMs - commitEvent.atMs
        } : undefined,
        typeof bargeInEvent?.atMs === "number" && typeof clearEvent?.atMs === "number" ? {
          label: "barge-in to clear",
          valueMs: clearEvent.atMs - bargeInEvent.atMs
        } : undefined
      ].filter((value) => value !== undefined && value.valueMs >= 0);
      return {
        config: options.config,
        // Snapshot: errors recorded after finalize() must not change a
        // review that was already produced (its `pass` flag is fixed here).
        errors: [...errors],
        fixtureId: options.fixtureId,
        generatedAt: now(),
        latencyBreakdown,
        notes: [
          firstPartial?.text ? `First partial: "${firstPartial.text}"` : undefined,
          lastSttText ? `Last STT text: "${lastSttText}"` : undefined
        ].filter((value) => typeof value === "string"),
        path: options.path,
        summary: {
          clearLatencyMs: roundMetric(typeof clearEvent?.atMs === "number" && typeof bargeInEvent?.atMs === "number" ? clearEvent.atMs - bargeInEvent.atMs : undefined),
          elapsedMs: roundMetric(elapsedMs),
          firstOutboundMediaLatencyMs: roundMetric(firstOutboundMedia?.atMs),
          firstTurnLatencyMs: roundMetric(commitEvent?.atMs),
          markLatencyMs: roundMetric(markEvent?.atMs),
          outboundMediaCount: sortedTimeline.filter((entry) => entry.source === "twilio" && entry.event === "media").length,
          pass: errors.length === 0,
          turnCount: committedTurns.length
        },
        title: options.title ?? "Voice Call Review",
        timeline: sortedTimeline,
        transcript: {
          actual: committedTurns.join(" ").trim()
        }
      };
    },
    recordError,
    recordTwilioInbound: recordTwilio,
    recordTwilioOutbound: recordTwilio,
    recordVoiceMessage: (message) => {
      switch (message.type) {
        case "partial":
        case "final":
          push("stt", message.type, {
            confidence: message.transcript.confidence,
            text: message.transcript.text
          });
          return;
        case "assistant":
          push("turn", "assistant", {
            text: message.text
          });
          return;
        case "audio":
          // The first audio chunk is labelled separately from later chunks.
          push("benchmark", hasFirstTtsAudio ? "tts-audio" : "tts-first-audio", {
            bytes: decodedBase64Length(message.chunkBase64)
          });
          return;
        case "turn":
          // A turn id is committed at most once, even if it is re-delivered.
          if (committedTurnIds.has(message.turn.id)) {
            return;
          }
          committedTurnIds.add(message.turn.id);
          committedTurns.push(message.turn.text);
          push("turn", "commit", {
            confidence: message.turn.quality?.averageConfidence,
            text: message.turn.text
          });
          return;
        case "error":
          // Same normalization as recordError, so a blank or missing message
          // still produces a readable error entry.
          recordError(message.message);
          return;
        case "complete":
          push("turn", "complete", {
            text: message.sessionId
          });
          return;
        case "session":
          push("turn", "session", {
            reason: message.status,
            text: message.sessionId
          });
          return;
        case "pong":
          push("benchmark", "pong");
          return;
      }
    }
  };
};
1089
/**
 * Renders the optional "Config" section of the markdown call review.
 *
 * @param {unknown} config - Configuration captured with the review, if any.
 * @returns {string} A fenced JSON block under a "## Config" heading, or an
 *   empty string when `config` is falsy.
 */
var renderConfigSection = (config) => {
  if (config) {
    const sectionLines = ["## Config", "", "```json"];
    sectionLines.push(JSON.stringify(config, null, 2));
    sectionLines.push("```");
    return sectionLines.join("\n");
  }
  return "";
};
1102
/**
 * Renders the "Timeline" section of the markdown call review.
 *
 * Entries flagged by `isLowSignalTimelineEvent` are dropped first; what is
 * left is turned into lines by `compactTimeline`.
 *
 * @param {Array<object>} timeline - Recorded timeline entries.
 * @returns {string} The section text; a placeholder sentence is emitted when
 *   no entry survives the filter.
 */
var renderTimeline = (timeline) => {
  const heading = "## Timeline";
  const focusedEntries = timeline.filter((entry) => !isLowSignalTimelineEvent(entry));
  if (focusedEntries.length > 0) {
    return [heading, "", ...compactTimeline(focusedEntries)].join("\n");
  }
  return [heading, "", "_No timeline events captured._"].join("\n");
};
1113
/**
 * Renders the "Transport Summary" section of the markdown call review.
 *
 * One bullet is produced per summary returned by `summarizeTimelineTraffic`:
 * the label and event count, followed by the byte total and the audio
 * duration when those are greater than zero.
 *
 * @param {Array<object>} timeline - Recorded timeline entries.
 * @returns {string} The section text, or an empty string when there are no
 *   traffic summaries.
 */
var renderTransportSummary = (timeline) => {
  const trafficSummaries = summarizeTimelineTraffic(timeline);
  if (trafficSummaries.length === 0) {
    return "";
  }
  const sectionLines = ["## Transport Summary", ""];
  for (const traffic of trafficSummaries) {
    const segments = [`- ${traffic.label}: ${traffic.count}`];
    if (traffic.bytes > 0) {
      segments.push(`${traffic.bytes} bytes`);
    }
    // audioMs may be absent; treat a missing value as "no audio".
    if ((traffic.audioMs ?? 0) > 0) {
      segments.push(`${roundMetric(traffic.audioMs)}ms audio`);
    }
    sectionLines.push(segments.join(", "));
  }
  return sectionLines.join("\n");
};
1134
/**
 * Renders the "Latency Breakdown" section of the markdown call review.
 *
 * @param {Array<{label: string, valueMs: number}>} breakdown - Latency entries.
 * @returns {string} One bullet per entry (value passed through `roundMetric`),
 *   or an empty string when the breakdown is empty.
 */
var renderLatencyBreakdown = (breakdown) => {
  if (breakdown.length === 0) {
    return "";
  }
  const bullets = breakdown.map(({ label, valueMs }) => `- ${label}: ${roundMetric(valueMs)}ms`);
  return ["## Latency Breakdown", "", ...bullets].join("\n");
};
1145
/**
 * Renders a voice call review artifact as a markdown document.
 *
 * Sections are emitted in this order: title, source/fixture lines, summary,
 * transcript, notes, latency breakdown, transport summary, errors, config and
 * timeline. Optional sections that render to an empty string contribute only
 * a blank line.
 *
 * @param {object} artifact - The review artifact (title, summary, transcript,
 *   notes, errors, latencyBreakdown, timeline and optional path, fixtureId
 *   and config).
 * @returns {string} The markdown document.
 */
var renderVoiceCallReviewMarkdown = (artifact) => {
  const isString = (value) => typeof value === "string";
  const { summary } = artifact;
  const summaryLines = [
    `- pass: ${summary.pass ? "yes" : "no"}`,
    formatMetric("first turn", summary.firstTurnLatencyMs),
    formatMetric("first outbound media", summary.firstOutboundMediaLatencyMs),
    formatMetric("mark", summary.markLatencyMs),
    formatMetric("clear", summary.clearLatencyMs),
    formatMetric("elapsed", summary.elapsedMs),
    typeof summary.wordErrorRate === "number" ? `- word error rate: ${summary.wordErrorRate}` : undefined,
    typeof summary.termRecall === "number" ? `- term recall: ${summary.termRecall}` : undefined,
    typeof summary.turnCount === "number" ? `- turn count: ${summary.turnCount}` : undefined,
    typeof summary.outboundMediaCount === "number" ? `- outbound media count: ${summary.outboundMediaCount}` : undefined
  ].filter(isString);
  const notes = artifact.notes.length ? ["## Notes", "", ...artifact.notes.map((note) => `- ${note}`)].join("\n") : "";
  const errors = artifact.errors.length ? ["## Errors", "", ...artifact.errors.map((error) => `- ${error}`)].join("\n") : "";
  const latency = renderLatencyBreakdown(artifact.latencyBreakdown);
  const transportSummary = renderTransportSummary(artifact.timeline);
  // Render the config section once; it was previously rendered twice, which
  // serialized the whole config to JSON two times per document.
  const configSection = renderConfigSection(artifact.config);
  return [
    `# ${artifact.title}`,
    "",
    artifact.path ? `Source: \`${artifact.path}\`` : undefined,
    artifact.fixtureId ? `Fixture: \`${artifact.fixtureId}\`` : undefined,
    "",
    "## Summary",
    "",
    ...summaryLines,
    "",
    "## Transcript",
    "",
    `- expected: ${artifact.transcript.expected ?? "_n/a_"}`,
    `- actual: ${artifact.transcript.actual}`,
    "",
    notes,
    notes ? "" : undefined,
    latency,
    latency ? "" : undefined,
    transportSummary,
    transportSummary ? "" : undefined,
    errors,
    errors ? "" : undefined,
    configSection,
    configSection ? "" : undefined,
    renderTimeline(artifact.timeline)
  ].filter(isString).join("\n");
};
1193
/**
 * Escapes the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) so a
 * string can be embedded in HTML text or attribute values.
 *
 * @param {string} value - Raw text.
 * @returns {string} The text with those characters replaced by entities.
 */
var escapeHtml2 = (value) => {
  const entityFor = (character) => {
    switch (character) {
      case "&":
        return "&amp;";
      case "<":
        return "&lt;";
      case ">":
        return "&gt;";
      case '"':
        return "&quot;";
      default:
        return "&#39;";
    }
  };
  // The character class only matches the five characters handled above.
  return value.replace(/[&<>"']/g, entityFor);
};
1194
/**
 * Renders a voice call review artifact as a standalone, dark-themed HTML page.
 *
 * All artifact-derived text is passed through `escapeHtml2` before it is
 * interpolated. Sections without data show a placeholder list item.
 *
 * @param {object} artifact - The review artifact (title, summary, transcript,
 *   notes, latencyBreakdown, timeline and optional config).
 * @returns {string} A complete HTML document.
 */
var renderVoiceCallReviewHTML = (artifact) => {
  // Only append the "ms" unit when a value exists; previously a missing
  // latency was rendered as "n/ams".
  const formatMs = (value) => value == null ? "n/a" : `${value}ms`;
  const notes = artifact.notes.map((note) => `<li>${escapeHtml2(note)}</li>`).join("");
  const latency = artifact.latencyBreakdown.map((entry) => `<li><strong>${escapeHtml2(entry.label)}:</strong> ${roundMetric(entry.valueMs)}ms</li>`).join("");
  const transport = summarizeTimelineTraffic(artifact.timeline).map((summary) => {
    const parts = [`${summary.count}`, "events"];
    if (summary.bytes > 0) {
      parts.push(`${summary.bytes} bytes`);
    }
    if ((summary.audioMs ?? 0) > 0) {
      parts.push(`${roundMetric(summary.audioMs)}ms audio`);
    }
    return `<li><strong>${escapeHtml2(summary.label)}:</strong> ${escapeHtml2(parts.join(", "))}</li>`;
  }).join("");
  // compactTimeline lines are stripped of a leading "- " bullet before being
  // wrapped in <li> elements.
  const timeline = compactTimeline(artifact.timeline.filter((entry) => !isLowSignalTimelineEvent(entry))).map((line) => `<li>${escapeHtml2(line.replace(/^- /u, ""))}</li>`).join("");
  return `<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>${escapeHtml2(artifact.title)}</title>
<style>
:root { color-scheme: dark; }
body { font-family: ui-sans-serif, system-ui, sans-serif; margin: 0; padding: 24px; background: #0b0d10; color: #f4f4f5; }
main { max-width: 980px; margin: 0 auto; display: grid; gap: 16px; }
section { background: #13161b; border: 1px solid #232833; border-radius: 16px; padding: 18px; }
h1, h2 { margin: 0 0 12px; }
ul { margin: 0; padding-left: 20px; display: grid; gap: 8px; }
code, pre { font-family: ui-monospace, SFMono-Regular, monospace; }
pre { white-space: pre-wrap; overflow-wrap: anywhere; background: #0f1217; border-radius: 12px; padding: 14px; border: 1px solid #232833; }
.grid { display: grid; gap: 16px; grid-template-columns: repeat(auto-fit, minmax(260px, 1fr)); }
.metric { display: grid; gap: 4px; }
.label { color: #a1a1aa; font-size: 0.82rem; text-transform: uppercase; letter-spacing: 0.08em; }
.value { font-size: 1.05rem; }
</style>
</head>
<body>
<main>
<section>
<h1>${escapeHtml2(artifact.title)}</h1>
<div class="grid">
<div class="metric"><div class="label">Pass</div><div class="value">${artifact.summary.pass ? "yes" : "no"}</div></div>
<div class="metric"><div class="label">First Turn</div><div class="value">${formatMs(artifact.summary.firstTurnLatencyMs)}</div></div>
<div class="metric"><div class="label">First Outbound Media</div><div class="value">${formatMs(artifact.summary.firstOutboundMediaLatencyMs)}</div></div>
<div class="metric"><div class="label">Turn Count</div><div class="value">${artifact.summary.turnCount ?? "n/a"}</div></div>
</div>
</section>
<section>
<h2>Transcript</h2>
<ul>
<li><strong>Expected:</strong> ${escapeHtml2(artifact.transcript.expected ?? "n/a")}</li>
<li><strong>Actual:</strong> ${escapeHtml2(artifact.transcript.actual || "n/a")}</li>
</ul>
</section>
<section>
<h2>Notes</h2>
<ul>${notes || "<li>No notes.</li>"}</ul>
</section>
<section>
<h2>Latency Breakdown</h2>
<ul>${latency || "<li>No latency data.</li>"}</ul>
</section>
<section>
<h2>Transport Summary</h2>
<ul>${transport || "<li>No transport data.</li>"}</ul>
</section>
<section>
<h2>Timeline</h2>
<ul>${timeline || "<li>No timeline events.</li>"}</ul>
</section>
<section>
<h2>Config</h2>
<pre>${escapeHtml2(JSON.stringify(artifact.config ?? {}, null, 2))}</pre>
</section>
</main>
</body>
</html>`;
};
1271
+
1272
+ // src/runtimeOps.ts
1273
/**
 * Builds the default title for a review generated from a session.
 *
 * @param {{id: string, scenarioId?: string}} session - The voice session.
 * @returns {string} A title naming the scenario id when it is truthy,
 *   otherwise the session id.
 */
var defaultReviewTitle = (session) => {
  const subject = session.scenarioId ? session.scenarioId : session.id;
  return `Voice call review: ${subject}`;
};
1274
/**
 * Produces the default post-call summary for a call disposition.
 *
 * @param {{disposition?: string, reason?: string, target?: string}} input
 * @returns {{label: string, recommendedAction: string, reason: (string|undefined), summary: string, target?: string}}
 *   Summary fields for the disposition. Unknown dispositions are described
 *   the same way as "completed". Only "transferred" carries a `target` key.
 */
var buildDefaultPostCallSummary = (input) => {
  // Shared shape; keeps the property order label, recommendedAction, reason, summary.
  const describe = (label, recommendedAction, summary) => ({
    label,
    recommendedAction,
    reason: input.reason,
    summary
  });
  const disposition = input.disposition;
  if (disposition === "transferred") {
    const target = input.target;
    const action = target ? `Confirm the handoff to ${target} completed successfully.` : "Confirm the transfer completed successfully.";
    const text = target ? `The call was transferred to ${target}.` : "The call was transferred.";
    return { ...describe("Transferred", action, text), target };
  }
  if (disposition === "escalated") {
    const text = input.reason ? `The call escalated because ${input.reason}.` : "The call escalated for operator review.";
    return describe("Escalated", "Review the escalated call and route it to a human operator.", text);
  }
  if (disposition === "voicemail") {
    return describe("Voicemail", "Queue a callback follow-up for this caller.", "The call reached voicemail and needs a callback.");
  }
  if (disposition === "no-answer") {
    return describe("No Answer", "Retry the call or create a callback task.", "The call did not reach a live respondent.");
  }
  if (disposition === "failed") {
    const text = input.reason ? `The call failed because ${input.reason}.` : "The call failed before a successful completion.";
    return describe("Failed", "Inspect the call review before retrying this flow.", text);
  }
  if (disposition === "closed") {
    return describe("Closed", "Inspect the review if this early closure was unexpected.", "The call closed before an explicit completion.");
  }
  // "completed" and every unrecognized disposition.
  return describe("Completed", "No follow-up action is required.", "The call completed successfully.");
};
1329
/**
 * Derives a call review artifact directly from a stored session.
 *
 * @param {{session: object, disposition?: string, reason?: string, target?: string, generatedAt?: number}} input
 *   `generatedAt` defaults to `Date.now()`.
 * @returns {object} A review whose transcript is the session's turn texts
 *   joined by spaces, whose timeline mirrors the session's call lifecycle
 *   events (if any), and which passes unless the disposition is "failed".
 */
var createVoiceCallReviewFromSession = (input) => {
  const session = input.session;
  const generatedAt = input.generatedAt ?? Date.now();
  const actual = session.turns.map((turn) => turn.text).join(" ").trim();
  // Fall back to the generation time when the session never recorded activity.
  const lastActivityAt = session.lastActivityAt ?? generatedAt;
  const elapsedMs = lastActivityAt - session.createdAt;

  const errors = [];
  if (input.disposition === "failed" && input.reason) {
    errors.push(input.reason);
  }

  const latencyBreakdown = [];
  if (typeof elapsedMs === "number" && elapsedMs >= 0) {
    latencyBreakdown.push({
      label: "Session elapsed",
      valueMs: elapsedMs
    });
  }

  const postCall = buildDefaultPostCallSummary({
    disposition: input.disposition,
    reason: input.reason,
    target: input.target
  });

  // Lifecycle event timestamps become offsets from session creation, floored at 0.
  const toTimelineEntry = (event) => ({
    atMs: Math.max(0, event.at - session.createdAt),
    event: `call-${event.type}`,
    reason: event.reason,
    source: "turn",
    text: event.target ?? event.disposition,
    track: event.target
  });
  const timeline = session.call == null ? [] : session.call.events.map(toTimelineEntry);

  return {
    errors,
    generatedAt,
    latencyBreakdown,
    notes: [],
    postCall,
    summary: {
      elapsedMs: elapsedMs >= 0 ? elapsedMs : undefined,
      outcome: input.disposition,
      pass: input.disposition !== "failed",
      turnCount: session.turns.length
    },
    title: defaultReviewTitle(session),
    timeline,
    transcript: {
      actual
    }
  };
};
1368
/**
 * Ensures a review carries an id before it is stored.
 *
 * @param {string} sessionId - Session the review belongs to.
 * @param {object} review - Review candidate.
 * @returns {object} The same review when it already has a non-empty string
 *   id; otherwise the result of `withVoiceCallReviewId` using the id
 *   `<sessionId>:review`.
 */
var asStoredReview = (sessionId, review) => {
  const hasUsableId = typeof review.id === "string" && review.id.length > 0;
  return hasUsableId ? review : withVoiceCallReviewId(`${sessionId}:review`, review);
};
1374
/**
 * Ensures an ops task carries an id before it is stored.
 *
 * @param {{id: string}} review - The stored review the task was derived from.
 * @param {object} task - Task candidate.
 * @returns {object} The same task when it already has a non-empty string id;
 *   otherwise the result of `withVoiceOpsTaskId` using the id
 *   `<review.id>:ops`.
 */
var asStoredTask = (review, task) => {
  const hasUsableId = "id" in task && typeof task.id === "string" && task.id.length > 0;
  return hasUsableId ? task : withVoiceOpsTaskId(`${review.id}:ops`, task);
};
1380
/**
 * Persists a runtime event and then notifies the configured listener.
 *
 * Both the `events` store and the `onEvent` callback are optional; the store
 * write is awaited before the callback is invoked.
 *
 * @param {{api: unknown, config: object, context: unknown, event: {id: string}, session: unknown}} input
 * @returns {Promise<void>}
 */
var emitRuntimeEvent = async (input) => {
  const { api, config, context, event, session } = input;
  await config.events?.set(event.id, event);
  const payload = { api, context, event, session };
  await config.onEvent?.(payload);
};
1389
/**
 * Records the operational artifacts for a finished call.
 *
 * Steps, in order:
 *   1. build a review (custom `buildReview` hook, else the session-derived
 *      default), store it and emit a review-saved event;
 *   2. build a task from the review (custom `createTaskFromReview` hook, else
 *      `buildVoiceOpsTaskFromReview`), store it and emit a task-created event;
 *   3. emit the call-completed event.
 *
 * @param {object} input - `api`, `config`, `context`, `disposition`,
 *   `metadata`, `reason`, `session` and `target`.
 * @returns {Promise<{review: object|undefined, task: object|undefined}|undefined>}
 *   `undefined` when no ops config is present.
 */
var recordVoiceRuntimeOps = async (input) => {
  const { api, config, context, disposition, reason, session, target } = input;
  if (!config) {
    return;
  }
  const emit = (event) => emitRuntimeEvent({
    api,
    config,
    context,
    event,
    session
  });

  const result = session.turns.at(-1)?.result;
  const customReview = await config.buildReview?.({
    api,
    context,
    disposition,
    metadata: input.metadata,
    reason,
    result,
    session,
    target
  });
  const reviewCandidate = customReview ?? createVoiceCallReviewFromSession({
    disposition,
    reason,
    session,
    target
  });
  const review = reviewCandidate ? asStoredReview(session.id, reviewCandidate) : undefined;

  let task;
  if (review) {
    await config.reviews?.set(review.id, review);
    await emit(createVoiceReviewSavedEvent(review));

    const customTask = await config.createTaskFromReview?.({
      api,
      context,
      disposition,
      review,
      session
    });
    const taskCandidate = customTask ?? buildVoiceOpsTaskFromReview(review) ?? undefined;
    if (taskCandidate) {
      task = asStoredTask(review, taskCandidate);
      await config.tasks?.set(task.id, task);
      await emit(createVoiceTaskCreatedEvent(task));
    }
  }

  // The completion event is emitted last, after any review/task events.
  await emit(createVoiceCallCompletedEvent({
    disposition,
    session
  }));
  return {
    review,
    task
  };
};
1456
+
210
1457
  // src/store.ts
211
1458
/**
 * Generates a new random identifier.
 *
 * @returns {string} The value returned by `crypto.randomUUID()`.
 */
var createId = () => {
  return crypto.randomUUID();
};
212
/**
 * Creates a fresh, active voice session record.
 *
 * @param {string} id - Session identifier.
 * @param {string} [scenarioId] - Optional scenario the session belongs to.
 * @returns {object} A record stamped with the current time, an empty current
 *   turn, no transcripts or turns, zero reconnect attempts and an empty
 *   `lastCommittedTurn` placeholder.
 */
var createVoiceSessionRecord = (id, scenarioId) => {
  const createdAt = Date.now();
  const currentTurn = {
    finalText: "",
    lastSpeechAt: undefined,
    lastTranscriptAt: undefined,
    partialEndedAt: undefined,
    partialStartedAt: undefined,
    partialText: "",
    silenceStartedAt: undefined,
    transcripts: []
  };
  const lastCommittedTurn = {
    committedAt: 0,
    signature: "",
    text: "",
    transcriptIds: []
  };
  return {
    committedTurnIds: [],
    createdAt,
    currentTurn,
    id,
    scenarioId,
    reconnect: { attempts: 0 },
    status: "active",
    transcripts: [],
    turns: [],
    lastCommittedTurn
  };
};
226
/**
 * Builds a replacement record for a session, keeping only its metadata.
 *
 * @param {string} id - Session identifier.
 * @param {{metadata?: unknown}|null|undefined} existing - Previous record, if any.
 * @param {string} [scenarioId] - Optional scenario identifier.
 * @returns {object} A fresh record from `createVoiceSessionRecord` with the
 *   previous record's `metadata` carried over.
 */
var resetVoiceSessionRecord = (id, existing, scenarioId) => {
  const freshRecord = createVoiceSessionRecord(id, scenarioId);
  return {
    ...freshRecord,
    metadata: existing?.metadata
  };
};
230
1489
  var toVoiceSessionSummary = (session) => ({
@@ -235,6 +1494,9 @@ var toVoiceSessionSummary = (session) => ({
235
1494
  turnCount: session.turns.length
236
1495
  });
237
1496
 
1497
+ // src/session.ts
1498
+ import { Buffer } from "buffer";
1499
+
238
1500
  // src/turnDetection.ts
239
1501
  var DEFAULT_SILENCE_MS = 700;
240
1502
  var DEFAULT_SPEECH_THRESHOLD = 0.015;
@@ -261,6 +1523,64 @@ var measureAudioLevel = (audio) => {
261
1523
  return Math.sqrt(sumSquares / samples.length);
262
1524
  };
263
1525
/**
 * Trims a string and collapses every run of whitespace into a single space.
 *
 * @param {string} value - Raw text.
 * @returns {string} The normalized text.
 */
var normalizeText = (value) => {
  const trimmed = value.trim();
  return trimmed.split(/\s+/).join(" ");
};
1526
/**
 * Counts the space-separated segments of a string.
 *
 * The string is split on single spaces only, so callers are expected to pass
 * text that is already normalized.
 *
 * @param {string} value - Text to count.
 * @returns {number} 0 for the empty string, otherwise the segment count.
 */
var countWords = (value) => {
  if (value.length === 0) {
    return 0;
  }
  return value.split(" ").length;
};
1527
/**
 * Chooses between two competing transcript texts for the same speech.
 *
 * After normalization: an empty side loses; a text that contains the other
 * wins; otherwise the text with more words wins, then the longer text on a
 * word-count tie, and the current text is kept in every remaining case.
 *
 * @param {string} currentText - Text held so far.
 * @param {string} nextText - Newly received text.
 * @returns {string} The normalized preferred text.
 */
var selectPreferredTranscriptText = (currentText, nextText) => {
  const existing = normalizeText(currentText);
  const incoming = normalizeText(nextText);
  if (!existing || !incoming) {
    return existing ? existing : incoming;
  }
  // `includes` also covers the case where both texts are identical.
  if (existing.includes(incoming)) {
    return existing;
  }
  if (incoming.includes(existing)) {
    return incoming;
  }
  const existingWordCount = countWords(existing);
  const incomingWordCount = countWords(incoming);
  if (incomingWordCount > existingWordCount) {
    return incoming;
  }
  const isLongerOnTie = incomingWordCount === existingWordCount && incoming.length > existing.length;
  return isLongerOnTie ? incoming : existing;
};
1550
/**
 * Appends one transcript text to another, de-duplicating overlapping words.
 *
 * The longest run of words that is both a suffix of the current text and a
 * prefix of the next text is emitted only once.
 *
 * @param {string} currentText - Text spoken first.
 * @param {string} nextText - Text spoken afterwards.
 * @returns {string} The merged, normalized text.
 */
var mergeSequentialTranscriptText = (currentText, nextText) => {
  const head = normalizeText(currentText);
  const tail = normalizeText(nextText);
  if (!head) {
    return tail;
  }
  if (!tail) {
    return head;
  }
  const headWords = head.split(" ");
  const tailWords = tail.split(" ");
  let overlap = Math.min(headWords.length, tailWords.length);
  // Try the largest possible overlap first and shrink until one matches.
  while (overlap > 0) {
    const offset = headWords.length - overlap;
    const matches = tailWords.slice(0, overlap).every((word, index) => word === headWords[offset + index]);
    if (matches) {
      return headWords.concat(tailWords.slice(overlap)).join(" ");
    }
    overlap -= 1;
  }
  return `${head} ${tail}`.trim();
};
1571
/**
 * Counts how many leading words two texts share.
 *
 * @param {string} currentText - First text.
 * @param {string} nextText - Second text.
 * @returns {number} Number of identical words at the start of both texts
 *   after normalization.
 */
var countCommonPrefixWords = (currentText, nextText) => {
  const leftWords = normalizeText(currentText).split(" ").filter(Boolean);
  const rightWords = normalizeText(nextText).split(" ").filter(Boolean);
  const limit = Math.min(leftWords.length, rightWords.length);
  let shared = 0;
  while (shared < limit && leftWords[shared] === rightWords[shared]) {
    shared += 1;
  }
  return shared;
};
264
1584
  var mergeTranscriptTexts = (transcripts) => {
265
1585
  const merged = [];
266
1586
  for (const transcript of transcripts) {
@@ -284,31 +1604,195 @@ var mergeTranscriptTexts = (transcripts) => {
284
1604
  }
285
1605
  return merged.join(" ").trim();
286
1606
  };
287
/**
 * Builds the text of the current turn from final transcripts and the pending
 * partial.
 *
 * When the partial started at least 250 ms after the last final transcript
 * ended and shares no leading words with the final text, it is treated as a
 * continuation and appended. Otherwise the preferred of the two texts is
 * returned.
 *
 * @param {Array<object>} transcripts - Final transcripts of the turn.
 * @param {string} partialText - Latest partial text.
 * @param {{partialStartedAtMs?: number}} [options] - Timing of the partial.
 * @returns {string} The turn text.
 */
var buildTurnText = (transcripts, partialText, options = {}) => {
  const sequentialGapMs = 250;
  const committedText = mergeTranscriptTexts(transcripts);
  const pendingPartial = normalizeText(partialText);

  // Walk backwards to the most recent transcript that reports an end time.
  let lastFinalEndedAtMs;
  for (let index = transcripts.length - 1; index >= 0; index -= 1) {
    const endedAtMs = transcripts[index].endedAtMs;
    if (typeof endedAtMs === "number") {
      lastFinalEndedAtMs = endedAtMs;
      break;
    }
  }

  if (committedText && pendingPartial && typeof lastFinalEndedAtMs === "number") {
    const partialStartedAtMs = options.partialStartedAtMs;
    const startsAfterGap = typeof partialStartedAtMs === "number" && partialStartedAtMs - lastFinalEndedAtMs >= sequentialGapMs;
    if (startsAfterGap && countCommonPrefixWords(committedText, pendingPartial) === 0) {
      return mergeSequentialTranscriptText(committedText, pendingPartial);
    }
  }
  return selectPreferredTranscriptText(committedText, pendingPartial);
};
294
1616
 
295
1617
  // src/session.ts
296
1618
// Reconnect defaults. NOTE(review): where these are consumed is outside this
// view; presumably milliseconds and an attempt count respectively. Confirm.
var DEFAULT_RECONNECT_TIMEOUT = 30000;
var DEFAULT_MAX_RECONNECT_ATTEMPTS = 10;
// Default for turnDetection.transcriptStabilityMs.
var DEFAULT_TRANSCRIPT_STABILITY_MS = 450;
// Defaults applied to the optional sttFallback configuration
// (replayWindowMs, settleMs, completionTimeoutMs, confidenceThreshold,
// minTextLength, maxAttemptsPerTurn).
var DEFAULT_FALLBACK_REPLAY_MS = 8000;
var DEFAULT_FALLBACK_SETTLE_MS = 220;
var DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS = 2500;
var DEFAULT_FALLBACK_CONFIDENCE_THRESHOLD = 0.6;
var DEFAULT_FALLBACK_MIN_TEXT_LENGTH = 2;
var DEFAULT_FALLBACK_MAX_ATTEMPTS_PER_TURN = 1;
// NOTE(review): usage not visible here; presumably the window in which a
// repeated turn counts as a duplicate. Confirm.
var DEFAULT_DUPLICATE_TURN_WINDOW_MS = 5000;
// Margins used by selectBetterTurnText when comparing primary and fallback text.
var FALLBACK_CONFIDENCE_SELECTION_DELTA = 0.05;
var FALLBACK_WORD_COUNT_SELECTION_MARGIN_RATIO = 0.12;
// Bounds used by getVendorCommitDelayMs when deriving the vendor commit delay.
var EXTENDED_VENDOR_COMMIT_SILENCE_THRESHOLD_MS = 200;
var MAX_VENDOR_COMMIT_GRACE_MS = 1200;
// Audio format used by getAudioChunkDurationMs to convert byte counts to time.
var DEFAULT_FORMAT = {
  channels: 1,
  container: "raw",
  encoding: "pcm_s16le",
  sampleRateHz: 16000
};
298
1638
/**
 * Coerces any thrown value into an Error.
 *
 * @param {unknown} value - The thrown value.
 * @returns {Error} The value itself when it already is an Error, otherwise a
 *   new Error whose message is `String(value)`.
 */
var toError = (value) => {
  if (value instanceof Error) {
    return value;
  }
  return new Error(String(value));
};
299
1639
/**
 * Creates the state object for a turn that has not received any speech yet.
 *
 * @returns {object} Empty final/partial text, no transcripts, and every
 *   timing marker explicitly set to `undefined`.
 */
var createEmptyCurrentTurn = () => {
  const emptyTurn = {
    finalText: "",
    lastSpeechAt: undefined,
    lastTranscriptAt: undefined,
    partialEndedAt: undefined,
    partialStartedAt: undefined,
    partialText: "",
    silenceStartedAt: undefined,
    transcripts: []
  };
  return emptyTurn;
};
304
1649
/**
 * Makes a shallow copy of a transcript object.
 *
 * @param {object} transcript - Transcript to copy.
 * @returns {object} A new object with the same own enumerable properties.
 */
var cloneTranscript = (transcript) => {
  const copy = { ...transcript };
  return copy;
};
305
- var setTurnResult = (session, turnId, input) => {
306
- session.turns = session.turns.map((turn) => turn.id === turnId ? {
307
- ...turn,
1650
/**
 * Encodes a binary chunk as a base64 string.
 *
 * @param {Uint8Array|ArrayBuffer|number[]} chunk - Bytes accepted by `Buffer.from`.
 * @returns {string} The base64 representation.
 */
var encodeBase64 = (chunk) => {
  const bytes = Buffer.from(chunk);
  return bytes.toString("base64");
};
1651
/**
 * Counts the whitespace-separated words in a string.
 *
 * @param {string} text - Text to count; need not be normalized.
 * @returns {number} Number of runs of non-whitespace characters.
 */
var countWords2 = (text) => {
  const words = text.match(/\S+/g);
  return words === null ? 0 : words.length;
};
1652
/**
 * Collapses every run of whitespace into a single space and trims the ends.
 *
 * @param {string} text - Raw text.
 * @returns {string} The normalized text.
 */
var normalizeText2 = (text) => {
  const collapsed = text.replace(/\s+/g, " ");
  return collapsed.trim();
};
1653
/**
 * Converts the byte size of an audio chunk into a duration.
 *
 * The conversion uses `DEFAULT_FORMAT` (sample rate and channel count) and a
 * fixed 2 bytes per sample.
 *
 * @param {{byteLength: number}} chunk - Audio bytes.
 * @returns {number} Duration in milliseconds.
 */
var getAudioChunkDurationMs = (chunk) => {
  const bytesPerSecond = DEFAULT_FORMAT.sampleRateHz * DEFAULT_FORMAT.channels * 2;
  const seconds = chunk.byteLength / bytesPerSecond;
  return seconds * 1000;
};
1654
/**
 * Sums the durations of a list of buffered audio chunks.
 *
 * @param {Array<{byteLength: number}>} chunks - Buffered audio chunks.
 * @returns {number} Total duration in milliseconds (0 for an empty list).
 */
var getBufferedAudioDurationMs = (chunks) => {
  let totalMs = 0;
  for (const chunk of chunks) {
    totalMs += getAudioChunkDurationMs(chunk);
  }
  return totalMs;
};
1655
/**
 * Averages the numeric confidence values of a list of transcripts.
 *
 * @param {Array<{confidence?: number}>} transcripts - Transcripts to inspect.
 * @returns {number} The mean of all numeric `confidence` values, or 0 when no
 *   transcript reports one.
 */
var calculateMeanConfidence = (transcripts) => {
  const confidences = [];
  for (const { confidence } of transcripts) {
    if (typeof confidence === "number") {
      confidences.push(confidence);
    }
  }
  if (confidences.length === 0) {
    return 0;
  }
  const sum = confidences.reduce((running, confidence) => running + confidence, 0);
  return sum / confidences.length;
};
1669
/**
 * Summarizes the quality signals of the transcripts selected for a turn.
 *
 * @param {Array<{confidence?: number, isFinal?: boolean}>} transcripts - Selected transcripts.
 * @param {string} source - Where the selected text came from.
 * @param {boolean} fallbackUsed - Whether the fallback result was used.
 * @param {unknown} fallbackDiagnostics - Stored under `fallback`.
 * @param {unknown} correctionDiagnostics - Stored under `correction`.
 * @param {unknown} costEstimate - Stored under `cost`.
 * @returns {object} Counts of final/partial/sampled transcripts plus the
 *   average confidence (`undefined` when no transcript reports one).
 */
var createTurnQuality = (transcripts, source, fallbackUsed, fallbackDiagnostics, correctionDiagnostics, costEstimate) => {
  let confidenceSum = 0;
  let confidenceSampleCount = 0;
  let finalTranscriptCount = 0;
  let partialTranscriptCount = 0;
  for (const transcript of transcripts) {
    if (typeof transcript.confidence === "number") {
      confidenceSum += transcript.confidence;
      confidenceSampleCount += 1;
    }
    if (transcript.isFinal) {
      finalTranscriptCount += 1;
    } else {
      partialTranscriptCount += 1;
    }
  }
  const averageConfidence = confidenceSampleCount > 0 ? confidenceSum / confidenceSampleCount : undefined;
  return {
    averageConfidence,
    confidenceSampleCount,
    correction: correctionDiagnostics,
    cost: costEstimate,
    fallback: fallbackDiagnostics,
    fallbackUsed,
    finalTranscriptCount,
    partialTranscriptCount,
    selectedTranscriptCount: transcripts.length,
    source
  };
};
1685
/**
 * Estimates the relative transcription cost of a turn.
 *
 * Audio durations are clamped to be non-negative. The cost is minutes of
 * audio multiplied by the per-pass cost unit; the primary unit defaults to 1
 * and the fallback unit defaults to the primary unit.
 *
 * @param {{primaryAudioMs: number, fallbackReplayAudioMs: number, fallbackAttemptCount: number, primaryPassCostUnit?: number, fallbackPassCostUnit?: number}} input
 * @returns {object} Clamped durations, their total, the attempt count and the
 *   estimated relative cost units.
 */
var createTurnCostEstimate = (input) => {
  const msPerMinute = 60000;
  const primaryAudioMs = Math.max(0, input.primaryAudioMs);
  const fallbackReplayAudioMs = Math.max(0, input.fallbackReplayAudioMs);
  const primaryCostUnit = input.primaryPassCostUnit ?? 1;
  const fallbackCostUnit = input.fallbackPassCostUnit ?? primaryCostUnit;
  const primaryCost = primaryAudioMs / msPerMinute * primaryCostUnit;
  const fallbackCost = fallbackReplayAudioMs / msPerMinute * fallbackCostUnit;
  return {
    estimatedRelativeCostUnits: primaryCost + fallbackCost,
    fallbackAttemptCount: input.fallbackAttemptCount,
    fallbackReplayAudioMs,
    primaryAudioMs,
    totalBillableAudioMs: primaryAudioMs + fallbackReplayAudioMs
  };
};
1698
/**
 * Normalizes text for correction handling by delegating to `normalizeText2`.
 *
 * @param {string} text - Raw text.
 * @returns {string} The normalized text.
 */
var normalizeCorrectionText = (text) => {
  return normalizeText2(text);
};
1699
/**
 * Decides whether the fallback STT pass should run for a turn candidate.
 *
 * Triggers:
 *   - "always": always run.
 *   - "empty-turn": run when the word count is below `minTextLength`.
 *   - "low-confidence": run when the mean confidence is above 0 and below
 *     `confidenceThreshold`.
 *   - anything else: run when either of the two conditions above holds.
 *
 * @param {{text: string, transcripts: Array<object>}} candidate - Primary result.
 * @param {{trigger: string, minTextLength: number, confidenceThreshold: number}} config
 * @returns {boolean} Whether a fallback pass is needed.
 */
var isFallbackNeeded = (candidate, config) => {
  const wordCount = countWords2(normalizeText2(candidate.text));
  switch (config.trigger) {
    case "always":
      return true;
    case "empty-turn":
      return wordCount < config.minTextLength;
    default:
      break;
  }
  const meanConfidence = calculateMeanConfidence(candidate.transcripts);
  // A mean of 0 means "no confidence data", which never counts as low confidence.
  const isLowConfidence = meanConfidence > 0 && meanConfidence < config.confidenceThreshold;
  if (config.trigger === "low-confidence") {
    return isLowConfidence;
  }
  return isLowConfidence || wordCount < config.minTextLength;
};
1714
/**
 * Picks the better of the primary and fallback turn candidates.
 *
 * Rules, in order: an empty fallback loses; an empty primary loses; a
 * word-count difference of at least
 * `FALLBACK_WORD_COUNT_SELECTION_MARGIN_RATIO` favors the longer text; a
 * confidence lead larger than `FALLBACK_CONFIDENCE_SELECTION_DELTA` favors
 * the more confident text; a fallback with more words wins the tie-break;
 * otherwise the primary is kept.
 *
 * @param {{text: string, wordCount: number, confidence: number}} candidate - Primary result.
 * @param {{text: string, wordCount: number, confidence: number}} fallback - Fallback result.
 * @returns {{reason: string, winner: object}} The winner and why it won.
 */
var selectBetterTurnText = (candidate, fallback) => {
  const decide = (reason, winner) => ({ reason, winner });
  if (!fallback.text) {
    return decide("fallback-empty", candidate);
  }
  if (!candidate.text) {
    return decide("primary-empty", fallback);
  }
  const wordCountDelta = fallback.wordCount - candidate.wordCount;
  // The trailing 1 prevents a division by zero when both counts are 0.
  const largestWordCount = Math.max(candidate.wordCount, fallback.wordCount, 1);
  const deltaRatio = Math.abs(wordCountDelta) / largestWordCount;
  if (wordCountDelta !== 0 && deltaRatio >= FALLBACK_WORD_COUNT_SELECTION_MARGIN_RATIO) {
    return decide("word-count-margin", wordCountDelta > 0 ? fallback : candidate);
  }
  if (fallback.confidence > candidate.confidence + FALLBACK_CONFIDENCE_SELECTION_DELTA) {
    return decide("confidence-margin", fallback);
  }
  if (candidate.confidence > fallback.confidence + FALLBACK_CONFIDENCE_SELECTION_DELTA) {
    return decide("kept-primary", candidate);
  }
  if (fallback.wordCount > candidate.wordCount) {
    return decide("word-count-tiebreak", fallback);
  }
  return decide("kept-primary", candidate);
};
1759
/**
 * Attaches assistant output to one committed turn of a session.
 *
 * `session.turns` is replaced with a new array; the matching turn is copied
 * with the new values, all other turns are reused as-is. A nullish
 * `assistantText` or `result` leaves the turn's existing value in place.
 *
 * @param {{turns: Array<object>}} session - Session to update (mutated).
 * @param {string} turnId - Id of the turn to update.
 * @param {{assistantText?: string, result?: unknown}} input - New values.
 * @returns {void}
 */
var setTurnResult = (session, turnId, input) => {
  const applyResult = (turn) => {
    if (turn.id !== turnId) {
      return turn;
    }
    return {
      ...turn,
      assistantText: input.assistantText ?? turn.assistantText,
      result: input.result ?? turn.result
    };
  };
  session.turns = session.turns.map(applyResult);
};
1766
/**
 * Returns the call lifecycle state of a session, creating it if needed.
 *
 * @param {{createdAt: number, call?: object}} session - Session (mutated when
 *   `call` is nullish).
 * @returns {object} `session.call`; when newly created it has no events and
 *   uses the session's creation time for both `startedAt` and `lastEventAt`.
 */
var ensureCallLifecycleState = (session) => {
  const startedAt = session.createdAt;
  if (session.call === undefined || session.call === null) {
    session.call = {
      events: [],
      lastEventAt: startedAt,
      startedAt
    };
  }
  return session.call;
};
1775
/**
 * Appends a lifecycle event to a session's call state.
 *
 * The event is stamped with the current time and the events list is replaced
 * with a new array. An "end" event additionally records the disposition and
 * the end time on the lifecycle state.
 *
 * @param {object} session - Session whose call state is updated (mutated).
 * @param {{type: string, disposition?: string, metadata?: unknown, reason?: string, target?: string}} input
 * @returns {object} The session's call lifecycle state.
 */
var pushCallLifecycleEvent = (session, input) => {
  const lifecycle = ensureCallLifecycleState(session);
  const at = Date.now();
  const { disposition, metadata, reason, target, type } = input;
  const lifecycleEvent = { at, disposition, metadata, reason, target, type };
  lifecycle.events = [...lifecycle.events, lifecycleEvent];
  lifecycle.lastEventAt = at;
  if (type === "end") {
    lifecycle.disposition = disposition;
    lifecycle.endedAt = at;
  }
  return lifecycle;
};
312
1796
  var createVoiceSession = (options) => {
313
1797
  const logger = resolveLogger(options.logger);
314
1798
  const reconnect = {
@@ -318,18 +1802,74 @@ var createVoiceSession = (options) => {
318
1802
  };
319
1803
  const turnDetection = {
320
1804
  silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
321
- speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD
1805
+ speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
1806
+ transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
322
1807
  };
1808
+ const sttFallback = options.sttFallback ? {
1809
+ adapter: options.sttFallback.adapter,
1810
+ completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
1811
+ confidenceThreshold: options.sttFallback.confidenceThreshold ?? DEFAULT_FALLBACK_CONFIDENCE_THRESHOLD,
1812
+ maxAttemptsPerTurn: options.sttFallback.maxAttemptsPerTurn ?? DEFAULT_FALLBACK_MAX_ATTEMPTS_PER_TURN,
1813
+ minTextLength: options.sttFallback.minTextLength ?? DEFAULT_FALLBACK_MIN_TEXT_LENGTH,
1814
+ replayWindowMs: options.sttFallback.replayWindowMs ?? DEFAULT_FALLBACK_REPLAY_MS,
1815
+ settleMs: options.sttFallback.settleMs ?? DEFAULT_FALLBACK_SETTLE_MS,
1816
+ trigger: options.sttFallback.trigger ?? "empty-or-low-confidence"
1817
+ } : undefined;
1818
+ const phraseHints = options.phraseHints ?? [];
1819
+ const lexicon = options.lexicon ?? [];
323
1820
  let socket = options.socket;
324
1821
  let sttSession = null;
1822
+ let ttsSession = null;
1823
+ let ttsSessionPromise = null;
325
1824
  let silenceTimer = null;
1825
+ let pendingCommitReason = null;
326
1826
  let speechDetected = false;
1827
+ let operationQueue = Promise.resolve();
1828
+ let adapterGenerationCounter = 0;
1829
+ let activeAdapterGeneration = 0;
1830
+ let activeTTSTurnId;
1831
+ const currentTurnAudio = [];
1832
+ let fallbackAttemptsForCurrentTurn = 0;
1833
+ let fallbackReplayAudioMsForCurrentTurn = 0;
1834
+ const pruneTurnAudio = () => {
1835
+ const replayWindowMs = sttFallback?.replayWindowMs ?? DEFAULT_FALLBACK_REPLAY_MS;
1836
+ const cutoffAt = Date.now() - replayWindowMs;
1837
+ let index = 0;
1838
+ while (index < currentTurnAudio.length && currentTurnAudio[index].recordedAt < cutoffAt) {
1839
+ index += 1;
1840
+ }
1841
+ if (index > 0) {
1842
+ currentTurnAudio.splice(0, index);
1843
+ }
1844
+ };
1845
+ const pushTurnAudio = (audio) => {
1846
+ const chunk = audio instanceof ArrayBuffer ? new Uint8Array(audio.slice(0)) : new Uint8Array(audio.buffer.slice(audio.byteOffset, audio.byteOffset + audio.byteLength));
1847
+ currentTurnAudio.push({
1848
+ chunk,
1849
+ recordedAt: Date.now()
1850
+ });
1851
+ pruneTurnAudio();
1852
+ };
1853
+ const getFallbackWindowAudio = () => {
1854
+ if (!sttFallback?.adapter) {
1855
+ return [];
1856
+ }
1857
+ pruneTurnAudio();
1858
+ return currentTurnAudio.map((audio) => audio.chunk);
1859
+ };
327
1860
  const clearSilenceTimer = () => {
328
1861
  if (!silenceTimer) {
329
1862
  return;
330
1863
  }
331
1864
  clearTimeout(silenceTimer);
332
1865
  silenceTimer = null;
1866
+ pendingCommitReason = null;
1867
+ };
1868
+ const getVendorCommitDelayMs = () => {
1869
+ if (turnDetection.silenceMs < EXTENDED_VENDOR_COMMIT_SILENCE_THRESHOLD_MS || turnDetection.transcriptStabilityMs < EXTENDED_VENDOR_COMMIT_SILENCE_THRESHOLD_MS) {
1870
+ return turnDetection.transcriptStabilityMs;
1871
+ }
1872
+ return Math.max(turnDetection.transcriptStabilityMs, Math.min(MAX_VENDOR_COMMIT_GRACE_MS, turnDetection.silenceMs * 2));
333
1873
  };
334
1874
  const send = async (message) => {
335
1875
  try {
@@ -349,12 +1889,28 @@ var createVoiceSession = (options) => {
349
1889
  await options.store.set(options.id, session);
350
1890
  return session;
351
1891
  };
1892
/**
 * Queues an operation behind every previously queued one.
 *
 * @param phase Label written to the debug log when the operation starts.
 * @param operation Function to run; may be sync or async.
 * @returns A promise for the operation's outcome (rejects if it throws).
 *
 * The queue itself never rejects, so one failed operation does not block the
 * operations queued after it.
 */
const runSerial = (phase, operation) => {
  const runOperation = async () => {
    logger.debug("voice session operation", { phase, sessionId: options.id });
    return await operation();
  };
  const settled = operationQueue.then(runOperation);
  const ignoreOutcome = () => {};
  operationQueue = settled.then(ignoreOutcome, ignoreOutcome);
  return settled;
};
352
1907
  const closeAdapter = async (reason) => {
353
1908
  if (!sttSession) {
354
1909
  return;
355
1910
  }
356
1911
  const activeSession = sttSession;
357
1912
  sttSession = null;
1913
+ activeAdapterGeneration = 0;
358
1914
  try {
359
1915
  await activeSession.close(reason);
360
1916
  } catch (error) {
@@ -364,13 +1920,255 @@ var createVoiceSession = (options) => {
364
1920
  });
365
1921
  }
366
1922
  };
367
- const scheduleSilenceCommit = () => {
368
- if (silenceTimer) {
1923
/**
 * Detaches the current TTS session from the closure state and closes it.
 *
 * State is reset before the close is awaited. A failing close is logged as a
 * warning and never rethrown.
 *
 * @param reason Passed through to the TTS session's close().
 */
const closeTTSSession = async (reason) => {
  const sessionToClose = ttsSession;
  ttsSession = null;
  ttsSessionPromise = null;
  activeTTSTurnId = undefined;
  if (sessionToClose) {
    try {
      await sessionToClose.close(reason);
    } catch (closeError) {
      logger.warn("voice tts adapter close failed", {
        error: toError(closeError).message,
        reason,
        sessionId: options.id
      });
    }
  }
};
1941
/**
 * Schedules `api.commitTurn(reason)` to run after `delayMs`.
 *
 * @param delayMs Delay before the commit fires.
 * @param reason Commit reason, also stored in pendingCommitReason while pending.
 * @param reset When true (default) any pending timer is replaced; when false
 *   an existing timer is left untouched and this call is a no-op.
 */
const scheduleTurnCommit = (delayMs, reason, reset = true) => {
  if (reset) {
    clearSilenceTimer();
  } else if (silenceTimer) {
    return;
  }
  pendingCommitReason = reason;
  const fireCommit = () => {
    // Clear the bookkeeping first so the commit sees no pending timer.
    silenceTimer = null;
    pendingCommitReason = null;
    api.commitTurn(reason);
  };
  silenceTimer = setTimeout(fireCommit, delayMs);
};
1955
/**
 * Schedules a "silence" turn commit; the delay defaults to turnDetection.silenceMs.
 */
const scheduleSilenceCommit = (delayMs = turnDetection.silenceMs, reset = true) => {
  return scheduleTurnCommit(delayMs, "silence", reset);
};
1956
/**
 * Decides whether a requested turn commit happens now or is deferred.
 *
 * - No turn text: nothing happens.
 * - "vendor": always deferred by getVendorCommitDelayMs().
 * - Any reason except "manual": deferred until the last transcript is at
 *   least turnDetection.transcriptStabilityMs old.
 * - Otherwise the turn is committed immediately.
 *
 * @param reason Commit reason supplied by the caller.
 */
const requestTurnCommit = async (reason) => {
  const session = await readSession();
  const turn = session.currentTurn;
  const text = buildTurnText(turn.transcripts, turn.partialText, {
    partialEndedAtMs: turn.partialEndedAt,
    partialStartedAtMs: turn.partialStartedAt
  });
  if (!text) {
    return;
  }
  if (reason === "vendor") {
    scheduleTurnCommit(getVendorCommitDelayMs(), reason);
    return;
  }
  const hasTranscriptTimestamp = turn.lastTranscriptAt !== undefined;
  if (reason !== "manual" && hasTranscriptTimestamp) {
    const transcriptAgeMs = Date.now() - turn.lastTranscriptAt;
    if (transcriptAgeMs < turnDetection.transcriptStabilityMs) {
      // Wait out the remainder of the stability window before committing.
      scheduleTurnCommit(turnDetection.transcriptStabilityMs - transcriptAgeMs, reason);
      return;
    }
  }
  await commitTurnInternal(reason);
};
1976
/**
 * Moves the session into the "failed" state and tears everything down.
 *
 * If the stored session is already "failed" nothing further happens. Otherwise
 * an "end" lifecycle event is recorded (unless the call already ended), an
 * unrecoverable error message is sent, the TTS and STT sessions are closed,
 * fallback audio state is reset, and the route's onError and onCallEnd hooks
 * are invoked in that order.
 *
 * @param error Value describing the failure; normalized through toError.
 */
const failInternal = async (error) => {
  clearSilenceTimer();
  let transitioned = false;
  const session = await writeSession((current) => {
    if (current.status === "failed") {
      return;
    }
    transitioned = true;
    current.lastActivityAt = Date.now();
    current.status = "failed";
    const callAlreadyEnded = Boolean(current.call?.endedAt);
    if (!callAlreadyEnded) {
      pushCallLifecycleEvent(current, {
        disposition: "failed",
        reason: toError(error).message,
        type: "end"
      });
    }
  });
  if (!transitioned) {
    return;
  }
  const failure = toError(error);
  await send({
    message: failure.message,
    recoverable: false,
    type: "error"
  });
  await closeTTSSession("failed");
  await closeAdapter("failed");
  speechDetected = false;
  rewindFallbackTurnAudio();
  await options.route.onError?.({
    api,
    context: options.context,
    error: failure,
    session,
    sessionId: options.id
  });
  await options.route.onCallEnd?.({
    api,
    context: options.context,
    disposition: "failed",
    reason: failure.message,
    session
  });
};
2022
/**
 * Moves the session into the "completed" state and runs the matching hooks.
 *
 * Nothing happens when the stored session is already completed or failed.
 * Otherwise: the optional result is stored on the last turn, an "end"
 * lifecycle event is recorded (unless the call already ended), a "complete"
 * message is sent, TTS/STT sessions are closed, and the disposition-specific
 * hook, onComplete (when enabled) and onCallEnd are invoked in that order.
 *
 * @param result Optional result stored on the most recent turn.
 * @param input Optional { disposition, invokeOnComplete, metadata, reason, target }.
 *   disposition defaults to "completed"; invokeOnComplete defaults to true
 *   only for the "completed" disposition.
 */
const completeInternal = async (result, input = {}) => {
  clearSilenceTimer();
  const disposition = input.disposition ?? "completed";
  const shouldInvokeOnComplete = input.invokeOnComplete ?? (disposition === "completed");
  let transitioned = false;
  const session = await writeSession((current) => {
    const alreadyFinished = current.status === "completed" || current.status === "failed";
    if (alreadyFinished) {
      return;
    }
    transitioned = true;
    current.lastActivityAt = Date.now();
    current.status = "completed";
    if (result !== undefined && current.turns.length > 0) {
      const lastTurn = current.turns.at(-1);
      if (lastTurn) {
        setTurnResult(current, lastTurn.id, {
          result
        });
      }
    }
    if (!current.call?.endedAt) {
      pushCallLifecycleEvent(current, {
        disposition,
        metadata: input.metadata,
        reason: input.reason,
        target: input.target,
        type: "end"
      });
    }
  });
  if (!transitioned) {
    return;
  }
  await send({
    sessionId: options.id,
    type: "complete"
  });
  await closeTTSSession("complete");
  await closeAdapter("complete");
  speechDetected = false;
  rewindFallbackTurnAudio();
  // At most one disposition-specific hook runs.
  switch (disposition) {
    case "transferred":
      if (input.target) {
        await options.route.onTransfer?.({
          api,
          context: options.context,
          metadata: input.metadata,
          reason: input.reason,
          session,
          target: input.target
        });
      }
      break;
    case "escalated":
      if (input.reason) {
        await options.route.onEscalation?.({
          api,
          context: options.context,
          metadata: input.metadata,
          reason: input.reason,
          session
        });
      }
      break;
    case "voicemail":
      await options.route.onVoicemail?.({
        api,
        context: options.context,
        metadata: input.metadata,
        session
      });
      break;
    case "no-answer":
      await options.route.onNoAnswer?.({
        api,
        context: options.context,
        metadata: input.metadata,
        session
      });
      break;
    default:
      break;
  }
  if (shouldInvokeOnComplete) {
    await options.route.onComplete({
      api,
      context: options.context,
      session
    });
  }
  await options.route.onCallEnd?.({
    api,
    context: options.context,
    disposition,
    metadata: input.metadata,
    reason: input.reason,
    session,
    target: input.target
  });
};
2115
/**
 * Records a "transfer" lifecycle event, then completes the session with the
 * "transferred" disposition (onComplete is not invoked).
 *
 * @param input { metadata, reason, result, target } describing the transfer.
 */
const transferInternal = async (input) => {
  const recordTransfer = (current) => {
    pushCallLifecycleEvent(current, {
      metadata: input.metadata,
      reason: input.reason,
      target: input.target,
      type: "transfer"
    });
  };
  await writeSession(recordTransfer);
  await completeInternal(input.result, {
    disposition: "transferred",
    invokeOnComplete: false,
    metadata: input.metadata,
    reason: input.reason,
    target: input.target
  });
};
2132
/**
 * Records an "escalation" lifecycle event, then completes the session with
 * the "escalated" disposition (onComplete is not invoked).
 *
 * @param input { metadata, reason, result } describing the escalation.
 */
const escalateInternal = async (input) => {
  const recordEscalation = (current) => {
    pushCallLifecycleEvent(current, {
      metadata: input.metadata,
      reason: input.reason,
      type: "escalation"
    });
  };
  await writeSession(recordEscalation);
  await completeInternal(input.result, {
    disposition: "escalated",
    invokeOnComplete: false,
    metadata: input.metadata,
    reason: input.reason
  });
};
2147
/**
 * Records a "no-answer" lifecycle event, then completes the session with the
 * "no-answer" disposition (onComplete is not invoked).
 *
 * @param input Optional { metadata, result }; may be omitted entirely.
 */
const markNoAnswerInternal = async (input) => {
  const recordNoAnswer = (current) => {
    pushCallLifecycleEvent(current, {
      metadata: input?.metadata,
      type: "no-answer"
    });
  };
  await writeSession(recordNoAnswer);
  await completeInternal(input?.result, {
    disposition: "no-answer",
    invokeOnComplete: false,
    metadata: input?.metadata
  });
};
2160
/**
 * Records a "voicemail" lifecycle event, then completes the session with the
 * "voicemail" disposition (onComplete is not invoked).
 *
 * @param input Optional { metadata, result }; may be omitted entirely.
 */
const markVoicemailInternal = async (input) => {
  const recordVoicemail = (current) => {
    pushCallLifecycleEvent(current, {
      metadata: input?.metadata,
      type: "voicemail"
    });
  };
  await writeSession(recordVoicemail);
  await completeInternal(input?.result, {
    disposition: "voicemail",
    invokeOnComplete: false,
    metadata: input?.metadata
  });
};
375
2173
  const handleError = async (event) => {
376
2174
  await send({
@@ -379,87 +2177,462 @@ var createVoiceSession = (options) => {
379
2177
  type: "error"
380
2178
  });
381
2179
  if (!event.recoverable) {
382
- await api.fail(event.error);
2180
+ await failInternal(event.error);
383
2181
  }
384
2182
  };
385
2183
/**
 * Reacts to the speech-to-text session closing.
 *
 * An explicitly unrecoverable close fails the whole session; any other close
 * only releases the adapter, using the provider's reason when it gave one.
 *
 * @param event Close event with optional `recoverable` and `reason`.
 */
const handleClose = async (event) => {
  if (event.recoverable === false) {
    const closeError = new Error(event.reason ?? "Speech-to-text session closed");
    await failInternal(closeError);
    return;
  }
  await closeAdapter(event.reason || "provider stream closed");
};
2194
/**
 * Resets all per-turn fallback bookkeeping: empties the buffered audio (in
 * place) and zeroes the attempt and replay-duration counters.
 */
const rewindFallbackTurnAudio = () => {
  currentTurnAudio.splice(0, currentTurnAudio.length);
  fallbackReplayAudioMsForCurrentTurn = 0;
  fallbackAttemptsForCurrentTurn = 0;
};
2199
/**
 * Replays the buffered turn audio through the fallback STT adapter and picks
 * the better of the primary and fallback transcriptions.
 *
 * Returns null when no fallback adapter is configured, the per-turn attempt
 * limit is reached, isFallbackNeeded() declines, there is no audio to replay,
 * the fallback session cannot be opened, or it produced no transcripts.
 *
 * @param primaryText Turn text produced by the primary adapter.
 * @param primaryTranscripts Transcripts produced by the primary adapter.
 * @returns null, or { diagnostics, fallbackUsed, source, text, transcripts }.
 */
const runFallbackTranscription = async (primaryText, primaryTranscripts) => {
  if (!sttFallback?.adapter || fallbackAttemptsForCurrentTurn >= sttFallback.maxAttemptsPerTurn) {
    return null;
  }
  const primaryInput = {
    text: primaryText,
    transcripts: primaryTranscripts
  };
  if (!isFallbackNeeded(primaryInput, sttFallback)) {
    return null;
  }
  // The attempt is counted even if there turns out to be no audio to replay.
  fallbackAttemptsForCurrentTurn += 1;
  const replayAudio = getFallbackWindowAudio();
  if (replayAudio.length === 0) {
    return null;
  }
  const fallbackTranscripts = [];
  const progress = {
    closed: false,
    endOfTurn: false,
    final: false,
    lastTranscriptAt: 0
  };
  let fallbackSession = null;
  try {
    fallbackSession = await sttFallback.adapter.open({
      format: DEFAULT_FORMAT,
      languageStrategy: options.languageStrategy,
      lexicon,
      phraseHints,
      sessionId: `${options.id}:fallback:${fallbackAttemptsForCurrentTurn}`
    });
  } catch (openError) {
    logger.warn("voice stt fallback open failed", {
      error: toError(openError).message,
      sessionId: options.id
    });
    return null;
  }
  // Partial and final transcripts are both collected.
  const recordTranscript = (transcript) => {
    progress.lastTranscriptAt = Date.now();
    fallbackTranscripts.push(cloneTranscript(transcript));
  };
  const unsubscribers = [
    fallbackSession.on("final", ({ transcript }) => {
      progress.final = true;
      recordTranscript(transcript);
    }),
    fallbackSession.on("partial", ({ transcript }) => {
      recordTranscript(transcript);
    }),
    fallbackSession.on("endOfTurn", () => {
      progress.endOfTurn = true;
    }),
    fallbackSession.on("error", (event) => {
      logger.warn("voice stt fallback error", {
        error: toError(event.error).message,
        sessionId: options.id
      });
    }),
    fallbackSession.on("close", () => {
      progress.closed = true;
    })
  ];
  try {
    for (const chunk of replayAudio) {
      await fallbackSession.send(chunk);
    }
    const replayDurationMs = getBufferedAudioDurationMs(replayAudio);
    fallbackReplayAudioMsForCurrentTurn += replayDurationMs;
    const adaptiveTimeoutMs = Math.min(4000, Math.max(sttFallback.settleMs * 4, Math.round(replayDurationMs * 0.18)));
    const completionTimeoutMs = Math.max(sttFallback.completionTimeoutMs, adaptiveTimeoutMs);
    const waitStartedAt = Date.now();
    while (Date.now() - waitStartedAt < completionTimeoutMs) {
      // Idle time counts from the last transcript, or from the start of the wait.
      const idleSince = progress.lastTranscriptAt > 0 ? progress.lastTranscriptAt : waitStartedAt;
      const idleMs = Date.now() - idleSince;
      const settled = idleMs >= sttFallback.settleMs;
      const finished = (progress.endOfTurn && settled)
        || (progress.final && settled)
        || (progress.closed && (progress.lastTranscriptAt === 0 || settled));
      if (finished) {
        break;
      }
      await Bun.sleep(Math.min(75, Math.max(25, sttFallback.settleMs / 2)));
    }
  } catch (replayError) {
    logger.warn("voice stt fallback failed", {
      error: toError(replayError).message,
      sessionId: options.id
    });
  } finally {
    // Always close the fallback session before detaching its listeners.
    if (fallbackSession) {
      try {
        await fallbackSession.close("fallback-complete");
      } catch (closeError) {
        logger.warn("voice stt fallback close failed", {
          error: toError(closeError).message,
          sessionId: options.id
        });
      } finally {
        fallbackSession = null;
      }
    }
    for (const unsubscribe of unsubscribers) {
      unsubscribe();
    }
  }
  if (fallbackTranscripts.length === 0) {
    return null;
  }
  const fallbackText = buildTurnText(fallbackTranscripts, "", {});
  const fallbackConfidence = calculateMeanConfidence(fallbackTranscripts);
  const fallbackCandidate = {
    confidence: fallbackConfidence,
    text: fallbackText,
    wordCount: countWords2(normalizeText2(fallbackText))
  };
  const primaryCandidate = {
    confidence: calculateMeanConfidence(primaryTranscripts),
    text: primaryText,
    wordCount: countWords2(normalizeText2(primaryText))
  };
  const selection = selectBetterTurnText(primaryCandidate, fallbackCandidate);
  const diagnostics = {
    attempted: true,
    fallbackConfidence: fallbackCandidate.confidence,
    fallbackText: fallbackCandidate.text,
    fallbackWordCount: fallbackCandidate.wordCount,
    primaryConfidence: primaryCandidate.confidence,
    primaryText,
    primaryWordCount: primaryCandidate.wordCount,
    selected: selection.winner.text === fallbackCandidate.text,
    selectionReason: selection.reason,
    trigger: sttFallback.trigger
  };
  const asFinal = (transcript) => ({
    ...transcript,
    isFinal: true
  });
  if (selection.winner.text === primaryCandidate.text) {
    return {
      diagnostics,
      fallbackUsed: false,
      source: "primary",
      text: primaryText,
      transcripts: primaryTranscripts.map(asFinal)
    };
  }
  // NOTE(review): fallbackCandidate.text is fallbackText, so this comparison
  // looks always-true and the synthetic-transcript branch below unreachable —
  // confirm whether selection.winner.text was the intended left-hand side.
  const candidateTranscripts = fallbackText === fallbackCandidate.text ? fallbackTranscripts : [];
  return {
    diagnostics,
    fallbackUsed: true,
    source: "fallback",
    text: selection.winner.text,
    transcripts: candidateTranscripts.length > 0
      ? candidateTranscripts.map(asFinal)
      : [{ id: createId(), isFinal: false, text: selection.winner.text }]
  };
};
2358
/**
 * Returns the ids of the final transcripts, or the ids of all transcripts
 * when none of them is final.
 *
 * @param transcripts Array of objects with `id` and `isFinal`.
 */
const getFinalTranscriptIds = (transcripts) => {
  const finalIds = [];
  const allIds = [];
  for (const transcript of transcripts) {
    allIds.push(transcript.id);
    if (transcript.isFinal) {
      finalIds.push(transcript.id);
    }
  }
  return finalIds.length > 0 ? finalIds : allIds;
};
2363
/**
 * Runs the route's optional `correctTurn` hook over a committed turn.
 *
 * @param input { fallbackDiagnostics, session, text, transcripts } for the turn.
 * @returns undefined when the route has no correctTurn hook; otherwise
 *   { diagnostics, text }, where text is the normalized correction or the
 *   original text when the hook produced nothing usable.
 *
 * The hook may return a string, an object with `text`/`metadata`/`provider`/
 * `reason`, or nothing. A `null` return is treated like "nothing": `typeof
 * null` is "object", so it must be excluded before reading detail fields.
 */
const runTurnCorrection = async (input) => {
  if (!options.route.correctTurn) {
    return;
  }
  const originalText = input.text;
  const result = await options.route.correctTurn({
    api,
    context: options.context,
    fallback: input.fallbackDiagnostics,
    lexicon,
    phraseHints,
    session: input.session,
    text: originalText,
    transcripts: input.transcripts.map(cloneTranscript)
  });
  const nextText = typeof result === "string" ? result : typeof result?.text === "string" ? result.text : originalText;
  const correctedText = normalizeCorrectionText(nextText);
  const normalizedOriginal = normalizeCorrectionText(originalText);
  // Only a non-null object carries metadata/provider/reason.
  const details = result !== null && typeof result === "object" ? result : undefined;
  return {
    diagnostics: {
      attempted: true,
      changed: correctedText.length > 0 && correctedText !== normalizedOriginal,
      correctedText: correctedText.length > 0 ? correctedText : normalizedOriginal,
      metadata: details?.metadata,
      originalText,
      provider: details?.provider,
      reason: details?.reason
    },
    text: correctedText.length > 0 ? correctedText : originalText
  };
};
2394
/**
 * Makes sure `session.lastCommittedTurn` exists, seeding an empty record when
 * it is missing, and returns the same session object.
 */
const ensureCommittedTurnGuard = (session) => {
  session.lastCommittedTurn ||= {
    committedAt: 0,
    signature: "",
    text: "",
    transcriptIds: []
  };
  return session;
};
2405
/**
 * Builds the duplicate-detection signature for a turn:
 * "<normalized text>|<comma-joined transcript ids>".
 *
 * @param session Session whose current turn supplies the ids by default.
 * @param finalText Turn text to normalize into the signature.
 * @param transcriptIdsOverride Optional ids to use instead of the current turn's.
 */
const buildTurnSignature = (session, finalText, transcriptIdsOverride) => {
  const transcriptIds = transcriptIdsOverride ?? getFinalTranscriptIds(session.currentTurn.transcripts);
  const normalizedText = normalizeText2(finalText);
  const joinedIds = transcriptIds.join(",");
  return `${normalizedText}|${joinedIds}`;
};
2409
/**
 * Tells whether committing `finalText` now would repeat the last committed turn.
 *
 * Only a commit within DEFAULT_DUPLICATE_TURN_WINDOW_MS of the previous one
 * can be a duplicate. Within that window it is a duplicate when either
 *  - the normalized text matches and no audio arrived since the last commit, or
 *  - the turn signature matches and every final transcript was already part
 *    of the last commit.
 *
 * @param session Session holding currentTurn and lastCommittedTurn.
 * @param finalText Text about to be committed.
 * @returns true when the commit should be treated as a duplicate.
 */
const isDuplicateTurnCommit = (session, finalText) => {
  const signature = buildTurnSignature(session, finalText);
  const previous = session.lastCommittedTurn;
  const isRecent = previous && previous.committedAt > 0 && Date.now() - previous.committedAt < DEFAULT_DUPLICATE_TURN_WINDOW_MS;
  if (!isRecent) {
    return false;
  }
  const sameText = normalizeText2(finalText) === normalizeText2(previous?.text ?? "");
  const noNewAudio = (session.currentTurn.lastAudioAt ?? 0) <= (previous?.committedAt ?? 0);
  if (sameText && noNewAudio) {
    return true;
  }
  if (signature !== (previous?.signature ?? "")) {
    return false;
  }
  const committedIds = new Set(previous?.transcriptIds ?? []);
  return session.currentTurn.transcripts.every((transcript) => !transcript.isFinal || committedIds.has(transcript.id));
};
2431
/**
 * Records the just-committed turn on the session so that later commits can be
 * checked against it for duplication.
 *
 * @param session Session to update (lastCommittedTurn is replaced).
 * @param finalText Text of the committed turn.
 * @param committedTranscripts Transcripts that made up the committed turn.
 */
const markTurnCommitted = (session, finalText, committedTranscripts) => {
  const committedIds = getFinalTranscriptIds(committedTranscripts);
  const previous = session.lastCommittedTurn ?? {};
  session.lastCommittedTurn = {
    ...previous,
    committedAt: Date.now(),
    signature: buildTurnSignature(session, finalText, committedIds),
    text: normalizeText2(finalText),
    transcriptIds: committedIds
  };
};
390
2440
/**
 * Applies a partial transcript from the STT adapter to the current turn and
 * forwards it to the client.
 *
 * A pending vendor commit is pushed back, since new transcript text arrived.
 *
 * @param transcript Partial transcript; startedAtMs/endedAtMs are optional.
 */
const handlePartial = async (transcript) => {
  await writeSession((current) => {
    const turn = current.currentTurn;
    // Keep the previous timing when this partial does not carry its own.
    const partialStartedAtMs = transcript.startedAtMs ?? turn.partialStartedAt;
    const partialEndedAtMs = transcript.endedAtMs ?? turn.partialEndedAt;
    const preferredPartial = selectPreferredTranscriptText(turn.partialText, transcript.text);
    turn.lastTranscriptAt = Date.now();
    turn.partialStartedAt = partialStartedAtMs;
    turn.partialEndedAt = partialEndedAtMs;
    turn.partialText = buildTurnText(turn.transcripts, preferredPartial, {
      partialEndedAtMs,
      partialStartedAtMs
    });
    current.lastActivityAt = Date.now();
    current.status = "active";
  });
  const vendorCommitPending = silenceTimer && pendingCommitReason === "vendor";
  if (vendorCommitPending) {
    scheduleTurnCommit(getVendorCommitDelayMs(), "vendor");
  }
  await send({
    transcript,
    type: "partial"
  });
};
402
2463
/**
 * Applies a final transcript from the STT adapter to the current turn and
 * forwards it to the client.
 *
 * A transcript whose id is already in the current turn is not stored again.
 * A pending vendor commit is pushed back, since new transcript text arrived.
 *
 * @param transcript Final transcript emitted by the adapter.
 */
const handleFinal = async (transcript) => {
  await writeSession((current) => {
    const turn = current.currentTurn;
    const isKnown = turn.transcripts.some((existing) => existing.id === transcript.id);
    if (!isKnown) {
      // Separate clones: the turn and the session each own their copy.
      turn.transcripts = [...turn.transcripts, cloneTranscript(transcript)];
      current.transcripts = [...current.transcripts, cloneTranscript(transcript)];
    }
    turn.finalText = buildTurnText(turn.transcripts, turn.partialText, {
      partialEndedAtMs: turn.partialEndedAt,
      partialStartedAtMs: turn.partialStartedAt
    });
    turn.lastTranscriptAt = Date.now();
    current.lastActivityAt = Date.now();
    current.status = "active";
  });
  const vendorCommitPending = silenceTimer && pendingCommitReason === "vendor";
  if (vendorCommitPending) {
    scheduleTurnCommit(getVendorCommitDelayMs(), "vendor");
  }
  await send({
    transcript,
    type: "final"
  });
};
2492
/**
 * Re-arms the silence commit for a session that already holds turn text.
 *
 * With no pending text, speechDetected is cleared and nothing is scheduled.
 * Otherwise the commit is scheduled for whatever remains of the silence and
 * transcript-stability windows, never less than 0 ms.
 *
 * @param session Session whose currentTurn timing fields are inspected.
 */
const resumePendingTurnCommit = (session) => {
  const turn = session.currentTurn;
  const pendingText = buildTurnText(turn.transcripts, turn.partialText, {
    partialEndedAtMs: turn.partialEndedAt,
    partialStartedAtMs: turn.partialStartedAt
  });
  speechDetected = Boolean(pendingText);
  if (!pendingText) {
    return;
  }
  // Prefer the recorded start of silence, then the last speech time.
  let audioAge = 0;
  if (turn.silenceStartedAt !== undefined) {
    audioAge = Date.now() - turn.silenceStartedAt;
  } else if (turn.lastSpeechAt !== undefined) {
    audioAge = Date.now() - turn.lastSpeechAt;
  }
  const transcriptAge = turn.lastTranscriptAt === undefined
    ? turnDetection.transcriptStabilityMs
    : Date.now() - turn.lastTranscriptAt;
  const remainingSilenceMs = turnDetection.silenceMs - audioAge;
  const remainingStabilityMs = turnDetection.transcriptStabilityMs - transcriptAge;
  scheduleSilenceCommit(Math.max(0, remainingSilenceMs, remainingStabilityMs));
};
425
2507
/**
 * Returns the open speech-to-text session, opening and wiring one if needed.
 *
 * Each opened session gets a generation number. Its events are funneled
 * through runSerial and ignored once that generation is no longer the active
 * one (closeAdapter resets the active generation to 0).
 *
 * A failure inside an event handler is logged as a warning. Previously it was
 * dropped silently: the promise returned by runSerial was discarded here, and
 * runSerial's own queue bookkeeping already marks its rejection as handled.
 *
 * @returns The active STT session.
 */
const ensureAdapter = async () => {
  if (sttSession) {
    return sttSession;
  }
  const openedSession = await options.stt.open({
    format: DEFAULT_FORMAT,
    languageStrategy: options.languageStrategy,
    lexicon,
    phraseHints,
    sessionId: options.id
  });
  const generation = ++adapterGenerationCounter;
  sttSession = openedSession;
  activeAdapterGeneration = generation;
  const runAdapterEvent = (phase, handler) => {
    runSerial(phase, async () => {
      // Drop events from a session that has been closed or replaced.
      if (activeAdapterGeneration !== generation) {
        return;
      }
      await handler();
    }).catch((error) => {
      logger.warn("voice adapter event failed", {
        error: toError(error).message,
        phase,
        sessionId: options.id
      });
    });
  };
  openedSession.on("partial", ({ transcript }) => {
    runAdapterEvent("adapter.partial", () => handlePartial(transcript));
  });
  openedSession.on("final", ({ transcript }) => {
    runAdapterEvent("adapter.final", () => handleFinal(transcript));
  });
  openedSession.on("endOfTurn", ({ reason }) => {
    runAdapterEvent("adapter.endOfTurn", async () => {
      clearSilenceTimer();
      await requestTurnCommit(reason);
    });
  });
  openedSession.on("error", (event) => {
    runAdapterEvent("adapter.error", () => handleError(event));
  });
  openedSession.on("close", (event) => {
    runAdapterEvent("adapter.close", () => handleClose(event));
  });
  return openedSession;
};
2549
/**
 * Returns the open text-to-speech session, opening and wiring one if needed.
 *
 * Returns null when no TTS adapter is configured. Concurrent callers share a
 * single in-flight open via ttsSessionPromise; a failed open clears that
 * promise so a later call can retry.
 *
 * Session events run through runSerial and are ignored once the session is no
 * longer the active one. A failure inside an event handler is logged as a
 * warning. Previously it was dropped silently, because the promise returned
 * by runSerial was discarded.
 *
 * @returns The active TTS session, or null without a TTS adapter.
 */
const ensureTTSSession = async () => {
  const ttsAdapter = options.tts;
  if (!ttsAdapter) {
    return null;
  }
  if (ttsSession) {
    return ttsSession;
  }
  if (ttsSessionPromise) {
    return ttsSessionPromise;
  }
  ttsSessionPromise = (async () => {
    const openedSession = await ttsAdapter.open({
      lexicon,
      sessionId: options.id
    });
    ttsSession = openedSession;
    const runTTSEvent = (phase, handler) => {
      runSerial(phase, async () => {
        // Ignore events from a session that has been closed or replaced.
        if (ttsSession !== openedSession) {
          return;
        }
        await handler();
      }).catch((error) => {
        logger.warn("voice tts event failed", {
          error: toError(error).message,
          phase,
          sessionId: options.id
        });
      });
    };
    openedSession.on("audio", ({ chunk, format, receivedAt }) => {
      runTTSEvent("tts.audio", async () => {
        // Copy into a standalone Uint8Array whatever byte container arrived.
        const normalizedChunk = chunk instanceof Uint8Array ? new Uint8Array(chunk) : chunk instanceof ArrayBuffer ? new Uint8Array(chunk.slice(0)) : new Uint8Array(chunk.buffer.slice(chunk.byteOffset, chunk.byteOffset + chunk.byteLength));
        await send({
          chunkBase64: encodeBase64(normalizedChunk),
          format,
          receivedAt,
          turnId: activeTTSTurnId,
          type: "audio"
        });
      });
    });
    openedSession.on("error", (event) => {
      runTTSEvent("tts.error", async () => {
        await send({
          message: toError(event.error).message,
          recoverable: event.recoverable,
          type: "error"
        });
      });
    });
    openedSession.on("close", () => {
      runTTSEvent("tts.close", async () => {
        ttsSession = null;
        ttsSessionPromise = null;
        activeTTSTurnId = undefined;
      });
    });
    return openedSession;
  })().catch((error) => {
    ttsSessionPromise = null;
    throw error;
  });
  return ttsSessionPromise;
};
2609
/**
 * Starts opening the TTS session in the background so that it is ready when
 * first needed. Does nothing without a TTS adapter or when a session is
 * already open or opening. A failed warm-up is only logged.
 */
const warmTTSSession = () => {
  const nothingToWarm = !options.tts || ttsSession || ttsSessionPromise;
  if (nothingToWarm) {
    return;
  }
  const logPrewarmFailure = (error) => {
    logger.warn("voice tts prewarm failed", {
      error: toError(error).message,
      sessionId: options.id
    });
  };
  ensureTTSSession().catch(logPrewarmFailure);
};
456
2620
  const completeTurn = async (session, turn) => {
457
- const output = await options.route.onTurn({
2621
+ const committedOutput = await options.route.onTurn({
458
2622
  api,
459
2623
  context: options.context,
460
2624
  session,
461
2625
  turn
462
2626
  });
2627
+ const output = {
2628
+ assistantText: committedOutput?.assistantText,
2629
+ complete: committedOutput?.complete,
2630
+ escalate: committedOutput?.escalate,
2631
+ noAnswer: committedOutput?.noAnswer,
2632
+ result: committedOutput?.result,
2633
+ transfer: committedOutput?.transfer,
2634
+ voicemail: committedOutput?.voicemail
2635
+ };
463
2636
  if (output?.assistantText) {
464
2637
  await writeSession((currentSession) => {
465
2638
  setTurnResult(currentSession, turn.id, {
@@ -471,7 +2644,20 @@ var createVoiceSession = (options) => {
471
2644
  turnId: turn.id,
472
2645
  type: "assistant"
473
2646
  });
474
- }
2647
+ try {
2648
+ const activeTTSSession = await ensureTTSSession();
2649
+ if (activeTTSSession) {
2650
+ activeTTSTurnId = turn.id;
2651
+ await activeTTSSession.send(output.assistantText);
2652
+ }
2653
+ } catch (error) {
2654
+ logger.warn("voice tts send failed", {
2655
+ error: toError(error).message,
2656
+ sessionId: options.id,
2657
+ turnId: turn.id
2658
+ });
2659
+ }
2660
+ }
475
2661
  if (output?.result !== undefined) {
476
2662
  await writeSession((currentSession) => {
477
2663
  setTurnResult(currentSession, turn.id, {
@@ -479,208 +2665,358 @@ var createVoiceSession = (options) => {
479
2665
  });
480
2666
  });
481
2667
  }
2668
+ if (output?.transfer) {
2669
+ await transferInternal({
2670
+ metadata: output.transfer.metadata,
2671
+ reason: output.transfer.reason,
2672
+ result: output.result,
2673
+ target: output.transfer.target
2674
+ });
2675
+ return;
2676
+ }
2677
+ if (output?.escalate) {
2678
+ await escalateInternal({
2679
+ metadata: output.escalate.metadata,
2680
+ reason: output.escalate.reason,
2681
+ result: output.result
2682
+ });
2683
+ return;
2684
+ }
2685
+ if (output?.voicemail) {
2686
+ await markVoicemailInternal({
2687
+ metadata: output.voicemail.metadata,
2688
+ result: output.result
2689
+ });
2690
+ return;
2691
+ }
2692
+ if (output?.noAnswer) {
2693
+ await markNoAnswerInternal({
2694
+ metadata: output.noAnswer.metadata,
2695
+ result: output.result
2696
+ });
2697
+ return;
2698
+ }
482
2699
  if (output?.complete) {
483
- await api.complete(output.result);
2700
+ await completeInternal(output.result);
484
2701
  }
485
2702
  };
486
- const api = {
487
- id: options.id,
488
- close: async (reason) => {
489
- clearSilenceTimer();
490
- await closeAdapter(reason);
491
- await Promise.resolve(socket.close(1000, reason));
492
- },
493
- commitTurn: async (reason = "manual") => {
494
- clearSilenceTimer();
495
- const session = await readSession();
496
- if (session.status === "completed" || session.status === "failed") {
497
- return;
498
- }
499
- const text = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText);
500
- if (!text) {
501
- return;
2703
+ const commitTurnInternal = async (reason = "manual") => {
2704
+ clearSilenceTimer();
2705
+ const session = await readSession();
2706
+ if (session.status === "completed" || session.status === "failed") {
2707
+ return;
2708
+ }
2709
+ const text = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
2710
+ partialEndedAtMs: session.currentTurn.partialEndedAt,
2711
+ partialStartedAtMs: session.currentTurn.partialStartedAt
2712
+ });
2713
+ let transcripts = session.currentTurn.transcripts.length ? session.currentTurn.transcripts.map(cloneTranscript) : [];
2714
+ let finalText = text;
2715
+ const transcriptStabilityAge = session.currentTurn.lastTranscriptAt !== undefined ? Date.now() - session.currentTurn.lastTranscriptAt : undefined;
2716
+ const fallbackSelection = await runFallbackTranscription(text, session.currentTurn.transcripts);
2717
+ const source = fallbackSelection?.source ?? "primary";
2718
+ const fallbackUsed = fallbackSelection?.fallbackUsed ?? false;
2719
+ const fallbackDiagnostics = fallbackSelection?.diagnostics;
2720
+ if (fallbackSelection) {
2721
+ finalText = fallbackSelection.text;
2722
+ transcripts = fallbackSelection.transcripts.length ? fallbackSelection.transcripts.map(cloneTranscript) : transcripts.length ? transcripts : [
2723
+ {
2724
+ id: createId(),
2725
+ isFinal: false,
2726
+ text: finalText
2727
+ }
2728
+ ];
2729
+ if (fallbackSelection.fallbackUsed) {
2730
+ logger.info("voice fallback turn selected", {
2731
+ reason,
2732
+ sessionId: options.id,
2733
+ text: finalText
2734
+ });
502
2735
  }
503
- const turn = {
504
- committedAt: Date.now(),
505
- id: createId(),
506
- text,
507
- transcripts: session.currentTurn.transcripts.length > 0 ? session.currentTurn.transcripts.map(cloneTranscript) : [
508
- {
509
- id: createId(),
510
- isFinal: false,
511
- text
512
- }
513
- ]
514
- };
515
- const updatedSession = await writeSession((currentSession) => {
516
- currentSession.committedTurnIds = [
517
- ...currentSession.committedTurnIds,
518
- turn.id
519
- ];
520
- currentSession.currentTurn = createEmptyCurrentTurn();
521
- currentSession.lastActivityAt = Date.now();
522
- currentSession.status = "active";
523
- currentSession.turns = [...currentSession.turns, turn];
524
- });
525
- speechDetected = false;
526
- logger.info("voice turn committed", {
2736
+ }
2737
+ const correctionSelection = await runTurnCorrection({
2738
+ fallbackDiagnostics,
2739
+ fallbackUsed,
2740
+ session,
2741
+ source,
2742
+ text: finalText,
2743
+ transcripts
2744
+ });
2745
+ const correctionDiagnostics = correctionSelection?.diagnostics;
2746
+ if (correctionSelection) {
2747
+ finalText = correctionSelection.text;
2748
+ }
2749
+ if (!finalText) {
2750
+ return;
2751
+ }
2752
+ if (isDuplicateTurnCommit(session, finalText)) {
2753
+ logger.debug("voice turn commit deduped", {
527
2754
  reason,
528
- sessionId: options.id,
529
- turnId: turn.id
530
- });
531
- await send({
532
- turn,
533
- type: "turn"
2755
+ sessionId: options.id
534
2756
  });
535
- await completeTurn(updatedSession, turn);
536
- },
537
- complete: async (result) => {
538
- clearSilenceTimer();
539
- const session = await writeSession((currentSession) => {
540
- if (currentSession.status === "completed") {
541
- return;
542
- }
543
- currentSession.lastActivityAt = Date.now();
544
- currentSession.status = "completed";
545
- if (result !== undefined && currentSession.turns.length > 0) {
546
- const lastTurn = currentSession.turns.at(-1);
547
- if (lastTurn) {
548
- setTurnResult(currentSession, lastTurn.id, {
549
- result
550
- });
551
- }
2757
+ return;
2758
+ }
2759
+ if (typeof transcriptStabilityAge === "number" && transcriptStabilityAge < turnDetection.transcriptStabilityMs && reason !== "manual") {
2760
+ scheduleTurnCommit(turnDetection.transcriptStabilityMs - transcriptStabilityAge, reason, false);
2761
+ return;
2762
+ }
2763
+ const costEstimate = createTurnCostEstimate({
2764
+ fallbackAttemptCount: fallbackAttemptsForCurrentTurn,
2765
+ fallbackPassCostUnit: options.costTelemetry?.fallbackPassCostUnit,
2766
+ fallbackReplayAudioMs: fallbackReplayAudioMsForCurrentTurn,
2767
+ primaryAudioMs: getBufferedAudioDurationMs(currentTurnAudio.map((audio) => audio.chunk)),
2768
+ primaryPassCostUnit: options.costTelemetry?.primaryPassCostUnit
2769
+ });
2770
+ const turn = {
2771
+ committedAt: Date.now(),
2772
+ id: createId(),
2773
+ text: finalText,
2774
+ quality: createTurnQuality(transcripts, source, fallbackUsed, fallbackDiagnostics, correctionDiagnostics, costEstimate),
2775
+ transcripts: transcripts.length > 0 ? transcripts : [
2776
+ {
2777
+ id: createId(),
2778
+ isFinal: false,
2779
+ text: finalText
552
2780
  }
2781
+ ]
2782
+ };
2783
+ const updatedSession = await writeSession((currentSession) => {
2784
+ currentSession.committedTurnIds = [
2785
+ ...currentSession.committedTurnIds,
2786
+ turn.id
2787
+ ];
2788
+ currentSession.currentTurn = createEmptyCurrentTurn();
2789
+ currentSession.lastActivityAt = Date.now();
2790
+ currentSession.status = "active";
2791
+ currentSession.turns = [...currentSession.turns, turn];
2792
+ markTurnCommitted(currentSession, finalText, transcripts);
2793
+ });
2794
+ speechDetected = false;
2795
+ rewindFallbackTurnAudio();
2796
+ logger.info("voice turn committed", {
2797
+ reason,
2798
+ sessionId: options.id,
2799
+ turnId: turn.id
2800
+ });
2801
+ await options.costTelemetry?.onTurnCost?.({
2802
+ api,
2803
+ context: options.context,
2804
+ estimate: costEstimate,
2805
+ session: updatedSession,
2806
+ turn
2807
+ });
2808
+ await send({
2809
+ turn,
2810
+ type: "turn"
2811
+ });
2812
+ if (options.sttLifecycle === "turn-scoped") {
2813
+ await closeAdapter("turn-commit");
2814
+ }
2815
+ await completeTurn(updatedSession, turn);
2816
+ };
2817
+ const connectInternal = async (nextSocket) => {
2818
+ socket = nextSocket;
2819
+ const existingSession = await options.store.get(options.id);
2820
+ let session = existingSession ?? createVoiceSessionRecord(options.id, options.scenarioId);
2821
+ if (options.scenarioId && session.scenarioId !== options.scenarioId) {
2822
+ session.scenarioId = options.scenarioId;
2823
+ }
2824
+ ensureCommittedTurnGuard(session);
2825
+ let shouldFireOnSession = !existingSession;
2826
+ if (existingSession?.scenarioId && options.scenarioId && existingSession.scenarioId !== options.scenarioId) {
2827
+ session = resetVoiceSessionRecord(options.id, existingSession, options.scenarioId);
2828
+ shouldFireOnSession = true;
2829
+ }
2830
+ rewindFallbackTurnAudio();
2831
+ if (existingSession?.status === "reconnecting") {
2832
+ const nextAttempts = existingSession.reconnect.attempts + 1;
2833
+ const reconnectExpired = existingSession.reconnect.lastDisconnectAt !== undefined && Date.now() - existingSession.reconnect.lastDisconnectAt > reconnect.timeout;
2834
+ const tooManyAttempts = nextAttempts > reconnect.maxAttempts;
2835
+ if (reconnect.strategy === "fail" && (reconnectExpired || tooManyAttempts)) {
2836
+ await failInternal(new Error("Voice session reconnect policy exhausted"));
2837
+ return;
2838
+ }
2839
+ if (reconnect.strategy === "restart" && (reconnectExpired || tooManyAttempts)) {
2840
+ session = resetVoiceSessionRecord(options.id, existingSession, options.scenarioId);
2841
+ shouldFireOnSession = true;
2842
+ } else {
2843
+ session = {
2844
+ ...existingSession,
2845
+ reconnect: {
2846
+ ...existingSession.reconnect,
2847
+ attempts: nextAttempts
2848
+ },
2849
+ status: "active"
2850
+ };
2851
+ }
2852
+ }
2853
+ if (shouldFireOnSession) {
2854
+ pushCallLifecycleEvent(session, {
2855
+ type: "start"
553
2856
  });
554
- await send({
555
- sessionId: options.id,
556
- type: "complete"
2857
+ }
2858
+ await options.store.set(options.id, session);
2859
+ await send({
2860
+ sessionId: options.id,
2861
+ status: session.status,
2862
+ scenarioId: session.scenarioId,
2863
+ type: "session"
2864
+ });
2865
+ if (shouldFireOnSession) {
2866
+ await options.route.onCallStart?.({
2867
+ api,
2868
+ context: options.context,
2869
+ session
557
2870
  });
558
- await closeAdapter("complete");
559
- speechDetected = false;
560
- await options.route.onComplete({
2871
+ await options.route.onSession?.({
561
2872
  api,
562
2873
  context: options.context,
563
2874
  session
564
2875
  });
565
- },
566
- connect: async (nextSocket) => {
567
- socket = nextSocket;
568
- const existingSession = await options.store.get(options.id);
569
- let session = existingSession ?? createVoiceSessionRecord(options.id);
570
- let shouldFireOnSession = !existingSession;
571
- if (existingSession?.status === "reconnecting") {
572
- const nextAttempts = existingSession.reconnect.attempts + 1;
573
- const reconnectExpired = existingSession.reconnect.lastDisconnectAt !== undefined && Date.now() - existingSession.reconnect.lastDisconnectAt > reconnect.timeout;
574
- const tooManyAttempts = nextAttempts > reconnect.maxAttempts;
575
- if (reconnect.strategy === "fail" && (reconnectExpired || tooManyAttempts)) {
576
- await api.fail(new Error("Voice session reconnect policy exhausted"));
577
- return;
578
- }
579
- if (reconnect.strategy === "restart" && (reconnectExpired || tooManyAttempts)) {
580
- session = resetVoiceSessionRecord(options.id, existingSession);
581
- shouldFireOnSession = true;
582
- } else {
583
- session = {
584
- ...existingSession,
585
- reconnect: {
586
- ...existingSession.reconnect,
587
- attempts: nextAttempts
588
- },
589
- status: "active"
590
- };
591
- }
592
- }
593
- await options.store.set(options.id, session);
2876
+ }
2877
+ if (session.status === "completed") {
594
2878
  await send({
595
2879
  sessionId: options.id,
596
- status: session.status,
597
- type: "session"
598
- });
599
- if (shouldFireOnSession) {
600
- await options.route.onSession?.({
601
- api,
602
- context: options.context,
603
- session
604
- });
605
- }
606
- if (session.status === "completed") {
607
- await send({
608
- sessionId: options.id,
609
- type: "complete"
610
- });
611
- return;
612
- }
613
- await ensureAdapter();
614
- },
615
- disconnect: async (event) => {
616
- clearSilenceTimer();
617
- await closeAdapter(event?.reason);
618
- if (reconnect.strategy === "fail") {
619
- await api.fail(new Error(event?.reason ?? "Voice socket disconnected"));
620
- return;
621
- }
622
- await writeSession((session) => {
623
- if (session.status === "completed" || session.status === "failed") {
624
- return;
625
- }
626
- session.lastActivityAt = Date.now();
627
- session.reconnect.lastDisconnectAt = Date.now();
628
- session.status = "reconnecting";
629
- });
630
- speechDetected = false;
631
- },
632
- fail: async (error) => {
633
- clearSilenceTimer();
634
- const session = await writeSession((currentSession) => {
635
- currentSession.lastActivityAt = Date.now();
636
- currentSession.status = "failed";
637
- });
638
- const resolvedError = toError(error);
639
- await send({
640
- message: resolvedError.message,
641
- recoverable: false,
642
- type: "error"
643
- });
644
- await closeAdapter("failed");
645
- speechDetected = false;
646
- await options.route.onError?.({
647
- api,
648
- context: options.context,
649
- error: resolvedError,
650
- session,
651
- sessionId: options.id
2880
+ type: "complete"
652
2881
  });
653
- },
654
- receiveAudio: async (audio) => {
655
- const session = await readSession();
2882
+ return;
2883
+ }
2884
+ resumePendingTurnCommit(session);
2885
+ await ensureAdapter();
2886
+ warmTTSSession();
2887
+ };
2888
+ const disconnectInternal = async (event) => {
2889
+ clearSilenceTimer();
2890
+ await closeTTSSession(event?.reason);
2891
+ await closeAdapter(event?.reason);
2892
+ rewindFallbackTurnAudio();
2893
+ if (reconnect.strategy === "fail") {
2894
+ await failInternal(new Error(event?.reason ?? "Voice socket disconnected"));
2895
+ return;
2896
+ }
2897
+ await writeSession((session) => {
656
2898
  if (session.status === "completed" || session.status === "failed") {
657
2899
  return;
658
2900
  }
659
- const adapter = await ensureAdapter();
660
- const audioLevel = measureAudioLevel(audio);
661
- await writeSession((currentSession) => {
662
- currentSession.currentTurn.lastAudioAt = Date.now();
663
- currentSession.lastActivityAt = Date.now();
664
- currentSession.status = "active";
665
- });
2901
+ session.lastActivityAt = Date.now();
2902
+ session.reconnect.lastDisconnectAt = Date.now();
2903
+ session.status = "reconnecting";
2904
+ });
2905
+ speechDetected = false;
2906
+ };
2907
+ const receiveAudioInternal = async (audio) => {
2908
+ const session = await readSession();
2909
+ if (session.status === "completed" || session.status === "failed") {
2910
+ return;
2911
+ }
2912
+ const adapter = await ensureAdapter();
2913
+ const conditionedAudio = conditionAudioChunk(audio, options.audioConditioning);
2914
+ const audioLevel = measureAudioLevel(conditionedAudio);
2915
+ const shouldStoreAudio = speechDetected || audioLevel >= turnDetection.speechThreshold;
2916
+ await writeSession((currentSession) => {
2917
+ currentSession.currentTurn.lastAudioAt = Date.now();
2918
+ currentSession.lastActivityAt = Date.now();
2919
+ currentSession.status = "active";
666
2920
  if (audioLevel >= turnDetection.speechThreshold) {
667
- speechDetected = true;
2921
+ currentSession.currentTurn.lastSpeechAt = Date.now();
2922
+ currentSession.currentTurn.silenceStartedAt = undefined;
2923
+ } else if (speechDetected && currentSession.currentTurn.silenceStartedAt === undefined) {
2924
+ currentSession.currentTurn.silenceStartedAt = Date.now();
2925
+ }
2926
+ });
2927
+ if (shouldStoreAudio) {
2928
+ pushTurnAudio(conditionedAudio);
2929
+ }
2930
+ if (audioLevel >= turnDetection.speechThreshold) {
2931
+ speechDetected = true;
2932
+ clearSilenceTimer();
2933
+ } else if (speechDetected) {
2934
+ const currentSession = await readSession();
2935
+ const hasTurnText = Boolean(buildTurnText(currentSession.currentTurn.transcripts, currentSession.currentTurn.partialText, {
2936
+ partialEndedAtMs: currentSession.currentTurn.partialEndedAt,
2937
+ partialStartedAtMs: currentSession.currentTurn.partialStartedAt
2938
+ }));
2939
+ if (hasTurnText) {
2940
+ scheduleSilenceCommit(turnDetection.silenceMs, false);
2941
+ }
2942
+ }
2943
+ await adapter.send(conditionedAudio);
2944
+ };
2945
+ const api = {
2946
+ id: options.id,
2947
+ close: async (reason) => {
2948
+ await runSerial("api.close", async () => {
2949
+ const session = await writeSession((currentSession) => {
2950
+ if (currentSession.status !== "completed" && currentSession.status !== "failed" && !currentSession.call?.endedAt) {
2951
+ currentSession.lastActivityAt = Date.now();
2952
+ currentSession.status = "completed";
2953
+ pushCallLifecycleEvent(currentSession, {
2954
+ disposition: "closed",
2955
+ reason,
2956
+ type: "end"
2957
+ });
2958
+ }
2959
+ });
668
2960
  clearSilenceTimer();
669
- } else if (speechDetected) {
670
- const currentSession = await readSession();
671
- const hasTurnText = Boolean(buildTurnText(currentSession.currentTurn.transcripts, currentSession.currentTurn.partialText));
672
- if (hasTurnText) {
673
- scheduleSilenceCommit();
2961
+ await closeTTSSession(reason);
2962
+ await closeAdapter(reason);
2963
+ await Promise.resolve(socket.close(1000, reason));
2964
+ if (session.call?.endedAt && session.call.disposition === "closed") {
2965
+ await options.route.onCallEnd?.({
2966
+ api,
2967
+ context: options.context,
2968
+ disposition: "closed",
2969
+ reason,
2970
+ session
2971
+ });
674
2972
  }
675
- }
676
- await adapter.send(audio);
2973
+ });
677
2974
  },
678
- snapshot: async () => readSession()
2975
+ commitTurn: async (reason = "manual") => runSerial("api.commitTurn", async () => {
2976
+ await commitTurnInternal(reason);
2977
+ }),
2978
+ complete: async (result) => runSerial("api.complete", async () => {
2979
+ await completeInternal(result);
2980
+ }),
2981
+ connect: async (nextSocket) => runSerial("api.connect", async () => {
2982
+ await connectInternal(nextSocket);
2983
+ }),
2984
+ disconnect: async (event) => runSerial("api.disconnect", async () => {
2985
+ await disconnectInternal(event);
2986
+ }),
2987
+ fail: async (error) => runSerial("api.fail", async () => {
2988
+ await failInternal(error);
2989
+ }),
2990
+ escalate: async (input) => runSerial("api.escalate", async () => {
2991
+ await escalateInternal(input);
2992
+ }),
2993
+ markNoAnswer: async (input) => runSerial("api.markNoAnswer", async () => {
2994
+ await markNoAnswerInternal(input);
2995
+ }),
2996
+ markVoicemail: async (input) => runSerial("api.markVoicemail", async () => {
2997
+ await markVoicemailInternal(input);
2998
+ }),
2999
+ receiveAudio: async (audio) => runSerial("api.receiveAudio", async () => {
3000
+ await receiveAudioInternal(audio);
3001
+ }),
3002
+ transfer: async (input) => runSerial("api.transfer", async () => {
3003
+ await transferInternal(input);
3004
+ }),
3005
+ snapshot: async () => runSerial("api.snapshot", async () => readSession())
679
3006
  };
680
3007
  return api;
681
3008
  };
682
3009
 
683
3010
  // src/plugin.ts
3011
+ var resolveQueryScenario = (query) => {
3012
+ if (typeof query?.scenarioId === "string" && query.scenarioId.trim()) {
3013
+ return query.scenarioId.trim();
3014
+ }
3015
+ if (typeof query?.mode === "string" && query.mode.trim()) {
3016
+ return query.mode.trim();
3017
+ }
3018
+ return null;
3019
+ };
684
3020
  var HTMX_BOOTSTRAP_DIST_CANDIDATES = [
685
3021
  resolve(import.meta.dir, "client", "htmxBootstrap.js"),
686
3022
  resolve(import.meta.dir, "..", "dist", "client", "htmxBootstrap.js")
@@ -727,6 +3063,21 @@ ${log}` : ""}`);
727
3063
  };
728
3064
  })();
729
3065
  var isArrayBufferView = (value) => typeof value === "object" && value !== null && ArrayBuffer.isView(value);
3066
+ var resolveSTTFallbackConfig = (config) => {
3067
+ if (!config) {
3068
+ return;
3069
+ }
3070
+ return {
3071
+ adapter: config.adapter,
3072
+ completionTimeoutMs: config.completionTimeoutMs ?? 2500,
3073
+ confidenceThreshold: config.confidenceThreshold ?? 0.6,
3074
+ maxAttemptsPerTurn: config.maxAttemptsPerTurn ?? 1,
3075
+ minTextLength: config.minTextLength ?? 2,
3076
+ replayWindowMs: config.replayWindowMs ?? 8000,
3077
+ settleMs: config.settleMs ?? 220,
3078
+ trigger: config.trigger ?? "empty-or-low-confidence"
3079
+ };
3080
+ };
730
3081
  var isVoiceClientMessage = (value) => {
731
3082
  if (!value || typeof value !== "object" || !("type" in value)) {
732
3083
  return false;
@@ -739,7 +3090,7 @@ var isVoiceClientMessage = (value) => {
739
3090
  case "ping":
740
3091
  return true;
741
3092
  case "start":
742
- return !("sessionId" in value) || typeof value.sessionId === "string";
3093
+ return (!("sessionId" in value) || typeof value.sessionId === "string") && (!("scenarioId" in value) || typeof value.scenarioId === "string");
743
3094
  default:
744
3095
  return false;
745
3096
  }
@@ -759,14 +3110,16 @@ var parseClientMessage = (raw) => {
759
3110
  return null;
760
3111
  };
761
3112
  var resolveSessionId = (runtime, ws) => {
762
- const existing = runtime.socketSessions.get(ws);
763
- if (existing) {
764
- return existing;
765
- }
766
3113
  const query = ws.data && typeof ws.data === "object" && "query" in ws.data ? ws.data.query : undefined;
767
- const providedSessionId = typeof query?.sessionId === "string" && query.sessionId.trim() ? query.sessionId.trim() : createId();
768
- runtime.socketSessions.set(ws, providedSessionId);
769
- return providedSessionId;
3114
+ const existing = runtime.socketSessions.get(ws);
3115
+ const providedSessionId = typeof query?.sessionId === "string" && query.sessionId.trim() ? query.sessionId.trim() : existing?.sessionId ?? createId();
3116
+ const scenarioId = resolveQueryScenario(query) ?? existing?.scenarioId ?? null;
3117
+ const resolved = {
3118
+ sessionId: providedSessionId,
3119
+ scenarioId
3120
+ };
3121
+ runtime.socketSessions.set(ws, resolved);
3122
+ return resolved;
770
3123
  };
771
3124
  var toAudioChunk = (raw) => {
772
3125
  if (raw instanceof ArrayBuffer) {
@@ -792,6 +3145,55 @@ var normalizeOnTurn = (handler) => {
792
3145
  }
793
3146
  return handler;
794
3147
  };
3148
+ var resolveSessionOptions = (config) => {
3149
+ const preset = resolveVoiceRuntimePreset(config.preset);
3150
+ return {
3151
+ audioConditioning: config.audioConditioning !== undefined ? resolveAudioConditioningConfig(config.audioConditioning) : preset.audioConditioning,
3152
+ costTelemetry: config.costTelemetry,
3153
+ sttFallback: resolveSTTFallbackConfig(config.sttFallback),
3154
+ logger: config.logger,
3155
+ reconnect: {
3156
+ maxAttempts: config.reconnect?.maxAttempts ?? 10,
3157
+ strategy: config.reconnect?.strategy ?? "resume-last-turn",
3158
+ timeout: config.reconnect?.timeout ?? 30000
3159
+ },
3160
+ sttLifecycle: config.sttLifecycle ?? preset.sttLifecycle,
3161
+ turnDetection: resolveTurnDetectionConfig({
3162
+ ...preset.turnDetection,
3163
+ ...config.turnDetection
3164
+ })
3165
+ };
3166
+ };
3167
+ var normalizePhraseHints = (hints) => (hints ?? []).map((hint) => ({
3168
+ ...hint,
3169
+ aliases: hint.aliases?.filter((value) => typeof value === "string" && value.trim().length > 0),
3170
+ text: hint.text.trim()
3171
+ })).filter((hint) => hint.text.length > 0);
3172
+ var normalizeLexicon = (entries) => (entries ?? []).map((entry) => ({
3173
+ ...entry,
3174
+ aliases: entry.aliases?.filter((value) => typeof value === "string" && value.trim().length > 0),
3175
+ language: typeof entry.language === "string" && entry.language.trim().length > 0 ? entry.language.trim() : undefined,
3176
+ pronunciation: typeof entry.pronunciation === "string" && entry.pronunciation.trim().length > 0 ? entry.pronunciation.trim() : undefined,
3177
+ text: entry.text.trim()
3178
+ })).filter((entry) => entry.text.length > 0);
3179
+ var resolvePhraseHints = async (config, input) => {
3180
+ if (!config.phraseHints) {
3181
+ return [];
3182
+ }
3183
+ if (typeof config.phraseHints === "function") {
3184
+ return normalizePhraseHints(await config.phraseHints(input));
3185
+ }
3186
+ return normalizePhraseHints(config.phraseHints);
3187
+ };
3188
+ var resolveLexicon = async (config, input) => {
3189
+ if (!config.lexicon) {
3190
+ return [];
3191
+ }
3192
+ if (typeof config.lexicon === "function") {
3193
+ return normalizeLexicon(await config.lexicon(input));
3194
+ }
3195
+ return normalizeLexicon(config.lexicon);
3196
+ };
795
3197
  var voice = (config) => {
796
3198
  const runtime = {
797
3199
  activeSessions: new Map,
@@ -799,11 +3201,79 @@ var voice = (config) => {
799
3201
  socketSessions: new WeakMap
800
3202
  };
801
3203
  const onTurn = normalizeOnTurn(config.onTurn);
3204
+ const sessionOptions = resolveSessionOptions(config);
802
3205
  const htmxOptions = config.htmx && typeof config.htmx === "object" ? config.htmx : undefined;
803
3206
  const htmxRoute = htmxOptions?.route ?? `${config.path}/htmx/session`;
804
3207
  const htmxBootstrapRoute = htmxOptions?.bootstrapRoute ?? `${config.path}/htmx/bootstrap.js`;
805
3208
  const htmxRenderers = resolveVoiceHTMXRenderers(config.htmx && config.htmx !== true ? config.htmx : undefined);
806
3209
  const htmxTargets = resolveVoiceHTMXTargets(htmxOptions?.targets);
3210
+ const createManagedSession = async (ws, sessionId, scenarioId) => {
3211
+ const context = ws.data;
3212
+ const phraseHints = await resolvePhraseHints(config, {
3213
+ context,
3214
+ scenarioId,
3215
+ sessionId
3216
+ });
3217
+ const lexicon = await resolveLexicon(config, {
3218
+ context,
3219
+ scenarioId,
3220
+ sessionId
3221
+ });
3222
+ return createVoiceSession({
3223
+ audioConditioning: sessionOptions.audioConditioning,
3224
+ context,
3225
+ id: sessionId,
3226
+ languageStrategy: config.languageStrategy,
3227
+ lexicon,
3228
+ logger: sessionOptions.logger,
3229
+ phraseHints,
3230
+ reconnect: sessionOptions.reconnect,
3231
+ route: {
3232
+ correctTurn: config.correctTurn,
3233
+ onCallEnd: async (input) => {
3234
+ let hookError;
3235
+ try {
3236
+ await config.onCallEnd?.(input);
3237
+ } catch (error) {
3238
+ hookError = error;
3239
+ }
3240
+ try {
3241
+ await recordVoiceRuntimeOps({
3242
+ api: input.api,
3243
+ config: config.ops,
3244
+ context: input.context,
3245
+ disposition: input.disposition,
3246
+ metadata: input.metadata,
3247
+ reason: input.reason,
3248
+ session: input.session,
3249
+ target: input.target
3250
+ });
3251
+ } finally {
3252
+ if (hookError) {
3253
+ throw hookError;
3254
+ }
3255
+ }
3256
+ },
3257
+ onCallStart: config.onCallStart,
3258
+ onComplete: config.onComplete,
3259
+ onEscalation: config.onEscalation,
3260
+ onError: config.onError,
3261
+ onNoAnswer: config.onNoAnswer,
3262
+ onSession: config.onSession,
3263
+ onTransfer: config.onTransfer,
3264
+ onTurn,
3265
+ onVoicemail: config.onVoicemail
3266
+ },
3267
+ scenarioId,
3268
+ socket: createSocketAdapter(ws),
3269
+ store: config.session,
3270
+ stt: config.stt,
3271
+ sttFallback: sessionOptions.sttFallback,
3272
+ sttLifecycle: sessionOptions.sttLifecycle,
3273
+ tts: config.tts,
3274
+ turnDetection: sessionOptions.turnDetection
3275
+ });
3276
+ };
807
3277
  const htmxRoutes = () => {
808
3278
  if (!config.htmx) {
809
3279
  return new Elysia;
@@ -833,12 +3303,12 @@ var voice = (config) => {
833
3303
  };
834
3304
  return new Elysia({ name: "absolutejs-voice" }).ws(config.path, {
835
3305
  close: async (ws, code, reason) => {
836
- const sessionId = runtime.socketSessions.get(ws);
837
- if (!sessionId) {
3306
+ const socketState = runtime.socketSessions.get(ws);
3307
+ if (!socketState) {
838
3308
  return;
839
3309
  }
840
- const session = runtime.activeSessions.get(sessionId);
841
- runtime.activeSessions.delete(sessionId);
3310
+ const session = runtime.activeSessions.get(socketState.sessionId);
3311
+ runtime.activeSessions.delete(socketState.sessionId);
842
3312
  if (session) {
843
3313
  await session.disconnect({
844
3314
  code,
@@ -849,8 +3319,8 @@ var voice = (config) => {
849
3319
  }
850
3320
  },
851
3321
  message: async (ws, raw) => {
852
- const sessionId = resolveSessionId(runtime, ws);
853
- const current = runtime.activeSessions.get(sessionId);
3322
+ const sessionState = resolveSessionId(runtime, ws);
3323
+ const current = runtime.activeSessions.get(sessionState.sessionId);
854
3324
  const message = parseClientMessage(raw);
855
3325
  if (message) {
856
3326
  if (message.type === "ping") {
@@ -861,10 +3331,27 @@ var voice = (config) => {
861
3331
  }
862
3332
  if (message.type === "close" && current) {
863
3333
  await current.close(message.reason);
864
- runtime.activeSessions.delete(sessionId);
3334
+ runtime.activeSessions.delete(sessionState.sessionId);
865
3335
  }
866
- if (message.type === "start" && message.sessionId && message.sessionId !== sessionId) {
867
- runtime.socketSessions.set(ws, message.sessionId);
3336
+ if (message.type === "start" && message.sessionId && message.sessionId !== sessionState.sessionId) {
3337
+ const currentSession = runtime.activeSessions.get(sessionState.sessionId);
3338
+ if (currentSession) {
3339
+ await currentSession.close("session-switch");
3340
+ runtime.activeSessions.delete(sessionState.sessionId);
3341
+ }
3342
+ sessionState.sessionId = message.sessionId;
3343
+ runtime.socketSessions.set(ws, {
3344
+ ...sessionState,
3345
+ sessionId: message.sessionId,
3346
+ scenarioId: sessionState.scenarioId
3347
+ });
3348
+ }
3349
+ if (message.type === "start" && message.scenarioId) {
3350
+ sessionState.scenarioId = message.scenarioId;
3351
+ runtime.socketSessions.set(ws, {
3352
+ ...sessionState,
3353
+ scenarioId: message.scenarioId
3354
+ });
868
3355
  }
869
3356
  return;
870
3357
  }
@@ -872,70 +3359,191 @@ var voice = (config) => {
872
3359
  if (!audio) {
873
3360
  return;
874
3361
  }
875
- const session = current ?? createVoiceSession({
876
- context: ws.data,
877
- id: sessionId,
878
- logger: config.logger,
879
- reconnect: {
880
- maxAttempts: config.reconnect?.maxAttempts ?? 10,
881
- strategy: config.reconnect?.strategy ?? "resume-last-turn",
882
- timeout: config.reconnect?.timeout ?? 30000
883
- },
884
- route: {
885
- onComplete: config.onComplete,
886
- onError: config.onError,
887
- onSession: config.onSession,
888
- onTurn
889
- },
890
- socket: createSocketAdapter(ws),
891
- store: config.session,
892
- stt: config.stt,
893
- turnDetection: {
894
- silenceMs: config.turnDetection?.silenceMs ?? 700,
895
- speechThreshold: config.turnDetection?.speechThreshold ?? 0.015
896
- }
897
- });
3362
+ const session = current ?? await createManagedSession(ws, sessionState.sessionId, sessionState.scenarioId ?? undefined);
898
3363
  if (!current) {
899
- runtime.activeSessions.set(sessionId, session);
3364
+ runtime.activeSessions.set(sessionState.sessionId, session);
900
3365
  await session.connect(createSocketAdapter(ws));
901
3366
  }
902
3367
  await session.receiveAudio(audio);
903
3368
  },
904
3369
  open: async (ws) => {
905
- const sessionId = resolveSessionId(runtime, ws);
906
- const existing = runtime.activeSessions.get(sessionId);
3370
+ const sessionState = resolveSessionId(runtime, ws);
3371
+ const existing = runtime.activeSessions.get(sessionState.sessionId);
907
3372
  if (existing) {
908
3373
  await existing.close("superseded");
909
- runtime.activeSessions.delete(sessionId);
910
- }
911
- const session = createVoiceSession({
912
- context: ws.data,
913
- id: sessionId,
914
- logger: config.logger,
915
- reconnect: {
916
- maxAttempts: config.reconnect?.maxAttempts ?? 10,
917
- strategy: config.reconnect?.strategy ?? "resume-last-turn",
918
- timeout: config.reconnect?.timeout ?? 30000
919
- },
920
- route: {
921
- onComplete: config.onComplete,
922
- onError: config.onError,
923
- onSession: config.onSession,
924
- onTurn
925
- },
926
- socket: createSocketAdapter(ws),
927
- store: config.session,
928
- stt: config.stt,
929
- turnDetection: {
930
- silenceMs: config.turnDetection?.silenceMs ?? 700,
931
- speechThreshold: config.turnDetection?.speechThreshold ?? 0.015
932
- }
933
- });
934
- runtime.activeSessions.set(sessionId, session);
3374
+ runtime.activeSessions.delete(sessionState.sessionId);
3375
+ }
3376
+ const session = await createManagedSession(ws, sessionState.sessionId, sessionState.scenarioId ?? undefined);
3377
+ runtime.activeSessions.set(sessionState.sessionId, session);
935
3378
  await session.connect(createSocketAdapter(ws));
936
3379
  }
937
3380
  }).use(htmxRoutes());
938
3381
  };
3382
+ // src/fileStore.ts
3383
+ import { mkdir, readFile, readdir, rename, rm, writeFile } from "fs/promises";
3384
+ import { join } from "path";
3385
/**
 * Lists the regular files ending in `.json` that sit directly inside a directory.
 *
 * @param directory - Directory to scan (not recursive).
 * @returns Joined paths of the matching files; an empty array when the
 *   directory does not exist.
 * @throws Any `readdir` error other than `ENOENT`.
 */
var listJsonFiles = async (directory) => {
  let dirents;
  try {
    dirents = await readdir(directory, { withFileTypes: true });
  } catch (error) {
    if (error.code !== "ENOENT") {
      throw error;
    }
    // A directory that was never created simply holds no records yet.
    return [];
  }
  const jsonPaths = [];
  for (const dirent of dirents) {
    if (dirent.isFile() && dirent.name.endsWith(".json")) {
      jsonPaths.push(join(directory, dirent.name));
    }
  }
  return jsonPaths;
};
3398
/** Builds the file name for a store id: the id percent-encoded with `encodeURIComponent`, plus `.json`. */
var encodeStoreId = (id) => {
  const encodedId = encodeURIComponent(id);
  return `${encodedId}.json`;
};

/** Joins a store directory with the encoded file name for `id`. */
var resolveFilePath = (directory, id) => {
  const fileName = encodeStoreId(id);
  return join(directory, fileName);
};

/** Reads a UTF-8 file and parses its contents as JSON; read and parse errors propagate. */
var readJsonFile = async (path) => {
  const contents = await readFile(path, "utf8");
  return JSON.parse(contents);
};
3401
/**
 * Serializes `value` as JSON and writes it to `path` via a temporary sibling
 * file that is renamed into place, so readers never see a half-written file.
 *
 * @param path - Final destination of the JSON file.
 * @param value - Value passed to `JSON.stringify`.
 * @param options - `directory` is created (recursively) before writing;
 *   `pretty === false` disables the default 2-space indentation.
 * @throws Whatever `mkdir`, `JSON.stringify`, `writeFile` or `rename` throws.
 */
var writeJsonFile = async (path, value, options) => {
  await mkdir(options.directory, {
    recursive: true
  });
  const tempPath = `${path}.${crypto.randomUUID()}.tmp`;
  const indent = options.pretty === false ? undefined : 2;
  try {
    await writeFile(tempPath, JSON.stringify(value, null, indent));
    await rename(tempPath, path);
  } catch (error) {
    // Do not leave the uniquely named temp file behind when the write or the
    // rename fails. Cleanup is best-effort: the original failure is what the
    // caller needs to see, so a secondary cleanup error is deliberately dropped.
    await rm(tempPath, { force: true }).catch(() => {});
    throw error;
  }
};
3409
/**
 * Creates a session store that keeps one JSON file per session id inside
 * `options.directory`.
 *
 * @param options - `directory` is where files live; the whole object is also
 *   forwarded to `writeJsonFile`.
 * @returns An object exposing `get`, `getOrCreate`, `list`, `remove` and `set`.
 */
var createVoiceFileSessionStore = (options) => {
  const pathFor = (id) => resolveFilePath(options.directory, id);
  const activityOf = (summary) => summary.lastActivityAt ?? summary.createdAt;

  // Resolves to undefined when no file exists for the id; other errors propagate.
  async function get(id) {
    const path = pathFor(id);
    try {
      return await readJsonFile(path);
    } catch (error) {
      if (error.code !== "ENOENT") {
        throw error;
      }
      return undefined;
    }
  }

  // Returns the stored session, or creates, persists and returns a new record.
  async function getOrCreate(id) {
    const stored = await get(id);
    if (stored) {
      return stored;
    }
    const created = createVoiceSessionRecord(id);
    await writeJsonFile(pathFor(id), created, options);
    return created;
  }

  async function set(id, value) {
    await writeJsonFile(pathFor(id), value, options);
  }

  // Summaries of every stored session, most recent activity first.
  async function list() {
    const files = await listJsonFiles(options.directory);
    const records = await Promise.all(files.map((file) => readJsonFile(file)));
    const summaries = records.map((record) => toVoiceSessionSummary(record));
    summaries.sort((first, second) => activityOf(second) - activityOf(first));
    return summaries;
  }

  // `force` makes removing a missing session a no-op.
  async function remove(id) {
    await rm(pathFor(id), { force: true });
  }

  return { get, getOrCreate, list, remove, set };
};
3445
/**
 * Creates a call-review store that keeps one JSON file per review id inside
 * `options.directory`.
 *
 * @param options - `directory` is where files live; the whole object is also
 *   forwarded to `writeJsonFile`.
 * @returns An object exposing `get`, `list`, `remove` and `set`.
 */
var createVoiceFileReviewStore = (options) => {
  const pathFor = (id) => resolveFilePath(options.directory, id);
  const generatedAtOf = (review) => review.generatedAt ?? 0;

  // Resolves to undefined when no file exists for the id; other errors propagate.
  async function get(id) {
    const path = pathFor(id);
    try {
      return await readJsonFile(path);
    } catch (error) {
      if (error.code !== "ENOENT") {
        throw error;
      }
      return undefined;
    }
  }

  // Every stored review, newest `generatedAt` first (missing values count as 0).
  async function list() {
    const files = await listJsonFiles(options.directory);
    const reviews = await Promise.all(files.map((file) => readJsonFile(file)));
    reviews.sort((left, right) => generatedAtOf(right) - generatedAtOf(left));
    return reviews;
  }

  // The artifact is passed through withVoiceCallReviewId before it is written.
  async function set(id, artifact) {
    const stored = withVoiceCallReviewId(id, artifact);
    await writeJsonFile(pathFor(id), stored, options);
  }

  // `force` makes removing a missing review a no-op.
  async function remove(id) {
    await rm(pathFor(id), { force: true });
  }

  return { get, list, remove, set };
};
3472
/**
 * Creates an ops-task store that keeps one JSON file per task id inside
 * `options.directory`.
 *
 * @param options - `directory` is where files live; the whole object is also
 *   forwarded to `writeJsonFile`.
 * @returns An object exposing `get`, `list`, `remove` and `set`.
 */
var createVoiceFileTaskStore = (options) => {
  const pathFor = (id) => resolveFilePath(options.directory, id);

  // Resolves to undefined when no file exists for the id; other errors propagate.
  async function get(id) {
    const path = pathFor(id);
    try {
      return await readJsonFile(path);
    } catch (error) {
      if (error.code !== "ENOENT") {
        throw error;
      }
      return undefined;
    }
  }

  // Every stored task, newest `createdAt` first.
  async function list() {
    const files = await listJsonFiles(options.directory);
    const tasks = await Promise.all(files.map((file) => readJsonFile(file)));
    tasks.sort((left, right) => right.createdAt - left.createdAt);
    return tasks;
  }

  // The task is passed through withVoiceOpsTaskId before it is written.
  async function set(id, task) {
    const stored = withVoiceOpsTaskId(id, task);
    await writeJsonFile(pathFor(id), stored, options);
  }

  // `force` makes removing a missing task a no-op.
  async function remove(id) {
    await rm(pathFor(id), { force: true });
  }

  return { get, list, remove, set };
};
3499
/**
 * Creates an integration-event store that keeps one JSON file per event id
 * inside `options.directory`.
 *
 * @param options - `directory` is where files live; the whole object is also
 *   forwarded to `writeJsonFile`.
 * @returns An object exposing `get`, `list`, `remove` and `set`.
 */
var createVoiceFileIntegrationEventStore = (options) => {
  const pathFor = (id) => resolveFilePath(options.directory, id);

  // Resolves to undefined when no file exists for the id; other errors propagate.
  async function get(id) {
    const path = pathFor(id);
    try {
      return await readJsonFile(path);
    } catch (error) {
      if (error.code !== "ENOENT") {
        throw error;
      }
      return undefined;
    }
  }

  // Every stored event, newest `createdAt` first.
  async function list() {
    const files = await listJsonFiles(options.directory);
    const events = await Promise.all(files.map((file) => readJsonFile(file)));
    events.sort((left, right) => right.createdAt - left.createdAt);
    return events;
  }

  // The event is passed through withVoiceIntegrationEventId before it is written.
  async function set(id, event) {
    const stored = withVoiceIntegrationEventId(id, event);
    await writeJsonFile(pathFor(id), stored, options);
  }

  // `force` makes removing a missing event a no-op.
  async function remove(id) {
    await rm(pathFor(id), { force: true });
  }

  return { get, list, remove, set };
};
3526
/**
 * Builds the four file-backed stores under one root directory, each in its
 * own sub-directory (`events`, `reviews`, `sessions`, `tasks`).
 *
 * @param options - Shared store options; `directory` is the root. Every other
 *   field is passed to each store unchanged.
 * @returns `{ events, reviews, session, tasks }`.
 */
var createVoiceFileRuntimeStorage = (options) => {
  // Same options, but with `directory` pointed at a sub-directory of the root.
  const scopedTo = (subdirectory) => ({
    ...options,
    directory: join(options.directory, subdirectory)
  });
  return {
    events: createVoiceFileIntegrationEventStore(scopedTo("events")),
    reviews: createVoiceFileReviewStore(scopedTo("reviews")),
    session: createVoiceFileSessionStore(scopedTo("sessions")),
    tasks: createVoiceFileTaskStore(scopedTo("tasks"))
  };
};
3544
/** Returns the review artifact as produced by `withVoiceCallReviewId(id, artifact)`. */
var createStoredVoiceCallReviewArtifact = (id, artifact) => {
  return withVoiceCallReviewId(id, artifact);
};

/** Returns the ops task as produced by `withVoiceOpsTaskId(id, task)`. */
var createStoredVoiceOpsTask = (id, task) => {
  return withVoiceOpsTaskId(id, task);
};

/** Returns the integration event as produced by `withVoiceIntegrationEventId(id, event)`. */
var createStoredVoiceIntegrationEvent = (id, event) => {
  return withVoiceIntegrationEventId(id, event);
};
939
3547
  // src/memoryStore.ts
940
3548
  var createVoiceMemoryStore = () => {
941
3549
  const sessions = new Map;
@@ -957,10 +3565,825 @@ var createVoiceMemoryStore = () => {
957
3565
  };
958
3566
  return { get, getOrCreate, list, remove, set };
959
3567
  };
3568
+ // src/correction.ts
3569
/** Backslash-escapes regular-expression syntax characters so `value` matches literally. */
var escapeRegExp = (value) => {
  return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
};

/**
 * Builds a global, case-insensitive, Unicode matcher for an alias. The
 * look-arounds reject matches that touch a letter, digit or apostrophe on
 * either side, so the alias only matches as a whole word or phrase.
 */
var buildAliasMatcher = (alias) => {
  const source = `(?<![\\p{L}\\p{N}'])${escapeRegExp(alias)}(?![\\p{L}\\p{N}'])`;
  return new RegExp(source, "giu");
};

// A word is a run of letters, digits and apostrophes.
var WORD_PATTERN = /[\p{L}\p{N}']+/gu;

/**
 * Lower-cases the text, turns every character that is not a letter, digit,
 * whitespace or apostrophe into a space, collapses whitespace runs and trims.
 */
var normalizeComparableText = (value) => {
  const lowered = value.toLowerCase();
  const withoutPunctuation = lowered.replace(/[^\p{L}\p{N}\s']/gu, " ");
  const collapsed = withoutPunctuation.replace(/\s+/g, " ");
  return collapsed.trim();
};

/** Domain terms are compared in the same normalized form as any other text. */
var normalizeDomainTerm = (value) => {
  return normalizeComparableText(value);
};
3574
/**
 * Splits text into the words matched by `WORD_PATTERN`, keeping their offsets.
 *
 * @param value - Text to tokenize.
 * @returns One `{ end, start, text }` entry per word, where `start`/`end` are
 *   offsets into `value` (`end` is exclusive).
 */
var tokenizeWithIndices = (value) => {
  const tokens = [];
  for (const { 0: text, index } of value.matchAll(WORD_PATTERN)) {
    const start = index ?? -1;
    // Skip a match that carries no position.
    if (start >= 0) {
      tokens.push({
        end: start + text.length,
        start,
        text
      });
    }
  }
  return tokens;
};
3591
/**
 * Computes the Levenshtein edit distance between two strings, comparing them
 * one UTF-16 code unit at a time.
 *
 * @param left - First string.
 * @param right - Second string.
 * @returns Minimum number of single-unit insertions, deletions and
 *   substitutions that turn `left` into `right`.
 */
var levenshteinDistance = (left, right) => {
  if (left === right) {
    return 0;
  }
  if (left.length === 0) {
    return right.length;
  }
  if (right.length === 0) {
    return left.length;
  }
  const width = right.length + 1;
  // Only two rows of the distance table are live at any time.
  let previousRow = Array.from({ length: width }, (_, column) => column);
  let currentRow = new Array(width);
  for (let row = 1; row <= left.length; row += 1) {
    currentRow[0] = row;
    for (let column = 1; column < width; column += 1) {
      const substitutionCost = left[row - 1] === right[column - 1] ? 0 : 1;
      const insertion = currentRow[column - 1] + 1;
      const deletion = previousRow[column] + 1;
      const substitution = previousRow[column - 1] + substitutionCost;
      currentRow[column] = Math.min(insertion, deletion, substitution);
    }
    // The finished row becomes the reference row for the next iteration.
    [previousRow, currentRow] = [currentRow, previousRow];
  }
  return previousRow[right.length];
};
3615
/**
 * Maps a risk tier to the largest normalized edit distance (distance divided
 * by the longer string's length) that still counts as a fuzzy match.
 *
 * @param riskTier - `"safe"`, `"balanced"` or `"risky"`.
 * @returns `-1` for `"safe"` (a non-negative score can never be at or below
 *   it), `0.14` for `"balanced"`, `0.2` for `"risky"`. Any other value also
 *   yields `-1`: without a default the function returned `undefined`, and a
 *   `score > undefined` comparison is always false, so a caller rejecting
 *   candidates that way would reject nothing.
 */
var resolveFuzzyThreshold = (riskTier) => {
  switch (riskTier) {
    case "safe":
      return -1;
    case "balanced":
      return 0.14;
    case "risky":
      return 0.2;
    default:
      // Unknown tier: take the conservative setting rather than no limit at all.
      return -1;
  }
};
3625
/**
 * Decides whether an alias has enough words to be fuzzy-matched at a tier.
 *
 * @param alias - Alias text; words are counted after normalization.
 * @param riskTier - `"safe"` never allows fuzzy matching; `"balanced"` needs
 *   at least three words; any other tier needs at least two.
 * @returns True when fuzzy matching may be attempted for this alias.
 */
var canUseTieredFuzzyAlias = (alias, riskTier) => {
  if (riskTier === "safe") {
    return false;
  }
  const words = normalizeComparableText(alias).split(" ").filter((word) => word.length > 0);
  const requiredWords = riskTier === "balanced" ? 3 : 2;
  return words.length >= requiredWords;
};
3632
/**
 * Finds the span of `text` that most closely resembles a multi-word alias.
 *
 * Candidate windows are runs of consecutive words whose length is within one
 * word of the alias and whose first word equals the alias's first word (after
 * normalization). Each window is scored by edit distance divided by the
 * longer of the two normalized strings; lower is better.
 *
 * @param text - Text to search.
 * @param alias - Alias to look for; aliases with fewer than two words never match.
 * @param riskTier - Tier passed to `resolveFuzzyThreshold` to get the maximum
 *   accepted score.
 * @returns `{ end, score, start }` for the best window (offsets into `text`),
 *   or undefined when nothing qualifies. Ties on score go to the longer span.
 */
var findFuzzyAliasMatch = (text, alias, riskTier) => {
  const tokens = tokenizeWithIndices(text);
  const aliasTokens = normalizeComparableText(alias).split(" ").filter((token) => token.length > 0);
  if (tokens.length === 0 || aliasTokens.length < 2) {
    return undefined;
  }
  // The threshold depends only on the tier, so resolve it once instead of
  // once per candidate window.
  const threshold = resolveFuzzyThreshold(riskTier);
  if (threshold < 0) {
    // Scores are never negative, so no window could be accepted.
    return undefined;
  }
  const minWindowLength = Math.max(1, aliasTokens.length - 1);
  const maxWindowLength = Math.min(tokens.length, aliasTokens.length + 1);
  const normalizedAlias = aliasTokens.join(" ");
  const normalizedAliasFirstToken = aliasTokens[0] ?? "";
  let bestMatch;
  for (let startIndex = 0; startIndex < tokens.length; startIndex += 1) {
    for (let windowLength = minWindowLength; windowLength <= maxWindowLength; windowLength += 1) {
      const endIndex = startIndex + windowLength - 1;
      if (endIndex >= tokens.length) {
        // Longer windows from this start would run past the text as well.
        break;
      }
      const windowTokens = tokens.slice(startIndex, endIndex + 1);
      const normalizedWindow = normalizeComparableText(windowTokens.map((token) => token.text).join(" "));
      if (!normalizedWindow) {
        continue;
      }
      const [windowFirstToken] = normalizedWindow.split(" ");
      if (windowFirstToken !== normalizedAliasFirstToken) {
        continue;
      }
      const distance = levenshteinDistance(normalizedWindow, normalizedAlias);
      const denominator = Math.max(normalizedWindow.length, normalizedAlias.length);
      const score = denominator > 0 ? distance / denominator : 0;
      if (score > threshold) {
        continue;
      }
      const candidate = {
        end: windowTokens[windowTokens.length - 1].end,
        score,
        start: windowTokens[0].start
      };
      const isBetterScore = !bestMatch || candidate.score < bestMatch.score;
      const isLongerTie = Boolean(bestMatch) && candidate.score === bestMatch.score && candidate.end - candidate.start > bestMatch.end - bestMatch.start;
      if (isBetterScore || isLongerTie) {
        bestMatch = candidate;
      }
    }
  }
  return bestMatch;
};
3676
/**
 * Returns a hint's aliases trimmed, with empty ones dropped, ordered longest
 * first. A hint without `aliases` yields an empty array.
 */
var normalizeHintAliases = (hint) => {
  const trimmedAliases = (hint.aliases ?? []).map((alias) => alias.trim());
  const nonEmptyAliases = trimmedAliases.filter((alias) => alias.length > 0);
  nonEmptyAliases.sort((left, right) => right.length - left.length);
  return nonEmptyAliases;
};
3677
/**
 * Applies phrase-hint corrections with the `"risky"` tier.
 *
 * @param text - Text to correct.
 * @param phraseHints - Hints forwarded to `applyRiskTieredPhraseHintCorrections`.
 * @returns The result of `applyRiskTieredPhraseHintCorrections`.
 */
var applyPhraseHintCorrections = (text, phraseHints) => {
  const options = { riskTier: "risky" };
  return applyRiskTieredPhraseHintCorrections(text, phraseHints, options);
};
3682
/**
 * Rewrites occurrences of hint aliases in `text` to the hint's canonical text.
 *
 * For each hint, aliases are tried in the order given by
 * `normalizeHintAliases`. The first alias that applies wins and the remaining
 * aliases of that hint are skipped:
 * - an exact whole-word match replaces every occurrence of the alias;
 * - otherwise, if the tier allows it for this alias, the single best fuzzy
 *   match is replaced.
 *
 * @param text - Text to correct.
 * @param phraseHints - Hints with `text` and optional `aliases`.
 * @param options - `riskTier` defaults to `"safe"`.
 * @returns `{ changed, matches, text }`: whether the text differs from the
 *   input, the `{ alias, hint }` pairs that were applied, and the corrected text.
 */
var applyRiskTieredPhraseHintCorrections = (text, phraseHints, options = {}) => {
  const riskTier = options.riskTier ?? "safe";
  const matches = [];
  let corrected = text;
  for (const hint of phraseHints) {
    for (const alias of normalizeHintAliases(hint)) {
      const matcher = buildAliasMatcher(alias);
      if (matcher.test(corrected)) {
        // Use a replacer function: a replacement *string* would have `$&`,
        // `$1`, `$$` etc. inside hint.text interpreted as substitution
        // patterns, corrupting hints that contain a dollar sign.
        corrected = corrected.replace(matcher, () => hint.text);
      } else {
        if (!canUseTieredFuzzyAlias(alias, riskTier)) {
          continue;
        }
        const fuzzyMatch = findFuzzyAliasMatch(corrected, alias, riskTier);
        if (!fuzzyMatch) {
          continue;
        }
        corrected = `${corrected.slice(0, fuzzyMatch.start)}${hint.text}${corrected.slice(fuzzyMatch.end)}`;
      }
      matches.push({
        alias,
        hint
      });
      // One applied alias per hint is enough.
      break;
    }
  }
  return {
    changed: corrected !== text,
    matches,
    text: corrected
  };
};
3718
/**
 * Removes aliases whose normalized form is empty or repeats an earlier alias.
 *
 * @param aliases - Candidate aliases.
 * @returns A new array holding the first alias (original spelling) for each
 *   distinct normalized form, in input order.
 */
var dedupeAliases = (aliases) => {
  const seenForms = new Set();
  return Array.from(aliases).filter((alias) => {
    const form = normalizeDomainTerm(alias);
    if (!form || seenForms.has(form)) {
      return false;
    }
    seenForms.add(form);
    return true;
  });
};
3731
/**
 * Judges whether an alias is distinctive enough to use at conservative tiers.
 *
 * @param alias - Alias text; judged in normalized form.
 * @returns False when the normalized alias is shorter than 4 characters;
 *   otherwise true when it is at least 7 characters long or has two or more words.
 */
var isSafeAlias = (alias) => {
  const normalized = normalizeDomainTerm(alias);
  if (normalized.length < 4) {
    return false;
  }
  if (normalized.length >= 7) {
    return true;
  }
  const wordCount = normalized.split(" ").filter((word) => word.length > 0).length;
  return wordCount >= 2;
};
3739
/**
 * Turns domain terms into phrase hints, dropping duplicate terms and
 * filtering each term's aliases according to the risk tier.
 *
 * Alias filtering (after de-duplication):
 * - `"risky"`: keep every alias;
 * - `"balanced"`: keep aliases accepted by `isSafeAlias`, plus aliases that
 *   normalize to the term's own text;
 * - anything else (default `"safe"`): keep only aliases accepted by `isSafeAlias`.
 *
 * @param terms - Terms with `text` and optional `aliases`, `boost`, `metadata`.
 * @param options - `riskTier` defaults to `"safe"`.
 * @returns One hint per distinct normalized term text; `aliases` is undefined
 *   when none survive.
 */
var createDomainPhraseHints = (terms, options = {}) => {
  const riskTier = options.riskTier ?? "safe";
  const seenTexts = new Set();
  const hints = [];
  for (const term of terms) {
    const normalizedText = normalizeDomainTerm(term.text);
    if (!normalizedText || seenTexts.has(normalizedText)) {
      continue;
    }
    let aliases = dedupeAliases(term.aliases ?? []);
    if (riskTier !== "risky") {
      const allowSelfAlias = riskTier === "balanced";
      aliases = aliases.filter((alias) => isSafeAlias(alias) || allowSelfAlias && normalizeDomainTerm(alias) === normalizedText);
    }
    hints.push({
      aliases: aliases.length > 0 ? aliases : undefined,
      boost: term.boost,
      metadata: term.metadata,
      text: term.text
    });
    seenTexts.add(normalizedText);
  }
  return hints;
};
3768
/**
 * Builds lexicon entries from domain terms, keeping the first term for each
 * distinct normalized text and de-duplicating its aliases.
 *
 * @param terms - Terms with `text` and optional `aliases`, `language`,
 *   `metadata`, `pronunciation`.
 * @returns Entries `{ aliases, language, metadata, pronunciation, text }` in
 *   input order; `aliases` is always an array.
 */
var createDomainLexicon = (terms) => {
  const seenTexts = new Set();
  const entries = [];
  for (const term of terms) {
    const normalizedText = normalizeDomainTerm(term.text);
    const isNewTerm = Boolean(normalizedText) && !seenTexts.has(normalizedText);
    if (isNewTerm) {
      const { language, metadata, pronunciation, text } = term;
      entries.push({
        aliases: dedupeAliases(term.aliases ?? []),
        language,
        metadata,
        pronunciation,
        text
      });
      seenTexts.add(normalizedText);
    }
  }
  return entries;
};
3787
/**
 * Averages the confidence values reported by a list of transcripts.
 *
 * @param transcripts - Objects that may carry a numeric `confidence`.
 * @returns The mean of the finite confidences, or undefined when there are
 *   none. `NaN` and infinite values are skipped: they pass a plain
 *   `typeof value === "number"` check and would otherwise make the whole
 *   average `NaN`/infinite.
 */
var averageTranscriptConfidence = (transcripts) => {
  let total = 0;
  let count = 0;
  for (const transcript of transcripts) {
    const value = transcript.confidence;
    if (Number.isFinite(value)) {
      total += value;
      count += 1;
    }
  }
  return count > 0 ? total / count : undefined;
};
3791
/**
 * Creates a correction handler that rewrites a turn's text using its phrase hints.
 *
 * @param options - `provider` (default `"@absolutejs/voice"`) and `reason`
 *   (default `"phrase-hint-correction"`) are copied onto every result.
 * @returns An async handler taking `{ phraseHints, text }`. It resolves to
 *   undefined when nothing changed, otherwise to
 *   `{ metadata, provider, reason, text }`.
 */
var createPhraseHintCorrectionHandler = (options = {}) => {
  const provider = options.provider ?? "@absolutejs/voice";
  const reason = options.reason ?? "phrase-hint-correction";
  return async ({ phraseHints, text }) => {
    const { changed, matches, text: correctedText } = applyPhraseHintCorrections(text, phraseHints);
    if (!changed) {
      return undefined;
    }
    let metadata;
    if (matches.length > 0) {
      metadata = {
        matchedAliases: matches.map((match) => match.alias),
        matchedHints: matches.map((match) => match.hint.text)
      };
    }
    return {
      metadata,
      provider,
      reason,
      text: correctedText
    };
  };
};
3810
/**
 * Projects lexicon entries onto the phrase-hint shape, keeping only
 * `aliases`, `metadata` and `text` from each entry.
 */
var lexiconToPhraseHints = (lexicon) => {
  return lexicon.map(({ aliases, metadata, text }) => ({ aliases, metadata, text }));
};
3815
/** Applies phrase-hint corrections using hints derived from the lexicon entries. */
var applyLexiconCorrections = (text, lexicon) => {
  const phraseHints = lexiconToPhraseHints(lexicon);
  return applyPhraseHintCorrections(text, phraseHints);
};
3816
/**
 * Creates a correction handler that rewrites a turn's text using its lexicon.
 *
 * @param options - `provider` (default `"@absolutejs/voice"`) and `reason`
 *   (default `"lexicon-correction"`) are copied onto every result.
 * @returns An async handler taking `{ lexicon, text }`. It resolves to
 *   undefined when nothing changed, otherwise to
 *   `{ metadata, provider, reason, text }`.
 */
var createLexiconCorrectionHandler = (options = {}) => {
  const provider = options.provider ?? "@absolutejs/voice";
  const reason = options.reason ?? "lexicon-correction";
  return async ({ lexicon, text }) => {
    const { changed, matches, text: correctedText } = applyLexiconCorrections(text, lexicon);
    if (!changed) {
      return undefined;
    }
    let metadata;
    if (matches.length > 0) {
      metadata = {
        matchedAliases: matches.map((match) => match.alias),
        matchedHints: matches.map((match) => match.hint.text)
      };
    }
    return {
      metadata,
      provider,
      reason,
      text: correctedText
    };
  };
};
3835
/**
 * Creates a correction handler that only touches turns whose transcripts are
 * not already confidently recognized.
 *
 * @param options - `provider` (default `"@absolutejs/voice"`), `reason`
 *   (default `"risky-turn-correction"`), `riskTier` (default `"balanced"`) and
 *   `maxAverageConfidence` (default `0.92`).
 * @returns An async handler taking `{ lexicon, phraseHints, text, transcripts }`.
 *   It resolves to undefined when the average confidence is known and above
 *   `maxAverageConfidence`, or when the corrections change nothing; otherwise
 *   to `{ metadata, provider, reason, text }`.
 */
var createRiskyTurnCorrectionHandler = (options = {}) => {
  const provider = options.provider ?? "@absolutejs/voice";
  const reason = options.reason ?? "risky-turn-correction";
  const riskTier = options.riskTier ?? "balanced";
  const maxAverageConfidence = options.maxAverageConfidence ?? 0.92;
  return async ({ lexicon, phraseHints, text, transcripts }) => {
    const averageConfidence = averageTranscriptConfidence(transcripts);
    const isConfident = averageConfidence !== undefined && averageConfidence > maxAverageConfidence;
    if (isConfident) {
      return undefined;
    }
    // Explicit phrase hints are tried before the hints derived from the lexicon.
    const combinedHints = [...phraseHints, ...lexiconToPhraseHints(lexicon)];
    const { changed, matches, text: correctedText } = applyRiskTieredPhraseHintCorrections(text, combinedHints, { riskTier });
    if (!changed) {
      return undefined;
    }
    const metadata = {
      averageConfidence,
      matchedAliases: matches.map((match) => match.alias),
      matchedHints: matches.map((match) => match.hint.text),
      riskTier
    };
    return {
      metadata,
      provider,
      reason,
      text: correctedText
    };
  };
};
3865
+
3866
+ // src/routing.ts
3867
/**
 * Describes the speech-to-text routing setup recommended for a goal.
 *
 * @param goal - `"low-cost"` selects the single-pass setup; every other value
 *   (default `"best"`) selects the corrected setup. The goal is echoed back.
 * @returns `{ benchmarkSessionTarget, correctionMode, goal, notes, preset, sttLifecycle }`.
 */
var resolveVoiceSTTRoutingStrategy = (goal = "best") => {
  if (goal !== "low-cost") {
    return {
      benchmarkSessionTarget: "deepgram-corrected",
      correctionMode: "generic",
      goal,
      notes: [
        "Uses the current best in-package path: Deepgram Flux with generic deterministic correction.",
        "Optimized for accuracy and robustness rather than minimum processing cost."
      ],
      preset: "reliability",
      sttLifecycle: "continuous"
    };
  }
  return {
    benchmarkSessionTarget: "deepgram-flux",
    correctionMode: "none",
    goal,
    notes: [
      "Uses the cheapest in-package path: one primary STT pass with no correction hook.",
      "Good for baseline throughput and lower post-processing overhead."
    ],
    preset: "default",
    sttLifecycle: "turn-scoped"
  };
};
3893
/**
 * Picks the correction handler for a routing correction mode.
 *
 * @param mode - `"none"` yields no handler, `"risky-turn"` yields the
 *   risky-turn handler, anything else (default `"generic"`) yields the
 *   phrase-hint handler. Handlers are created with their default options.
 * @returns A correction handler, or undefined for `"none"`.
 */
var createVoiceSTTRoutingCorrectionHandler = (mode = "generic") => {
  switch (mode) {
    case "none":
      return undefined;
    case "risky-turn":
      return createRiskyTurnCorrectionHandler();
    default:
      return createPhraseHintCorrectionHandler();
  }
};
3902
+ // src/telephony/twilio.ts
3903
+ import { Buffer as Buffer2 } from "buffer";
3904
+ var TWILIO_MULAW_SAMPLE_RATE = 8000;
3905
+ var VOICE_PCM_SAMPLE_RATE = 16000;
3906
/**
 * Escapes the five XML special characters (`&`, `"`, `'`, `<`, `>`) with
 * their predefined entities.
 */
var escapeXml = (value) => {
  const entities = {
    "&": "&amp;",
    '"': "&quot;",
    "'": "&apos;",
    "<": "&lt;",
    ">": "&gt;"
  };
  // A single pass is enough: no entity contains a character that still needs escaping
  // except `&`, and each input character is visited exactly once.
  return value.replace(/[&"'<>]/g, (character) => entities[character]);
};
3907
/**
 * Adapts a turn handler to the single-object calling convention.
 *
 * @param handler - A handler declaring more than one parameter is treated as
 *   positional `(session, turn, api, context)`; any other handler is assumed
 *   to already take one input object.
 * @returns The handler itself, or an async wrapper that unpacks
 *   `{ context, session, turn, api }` into positional arguments.
 */
var normalizeOnTurn2 = (handler) => {
  if (handler.length <= 1) {
    return handler;
  }
  return async ({ context, session, turn, api }) => handler(session, turn, api, context);
};
3914
/**
 * Fills in defaults for a speech-to-text fallback configuration.
 *
 * @param config - Partial fallback config, or a falsy value for "no fallback".
 * @returns Undefined when `config` is falsy; otherwise the config's `adapter`
 *   plus every tuning field, with null/undefined fields replaced by defaults.
 */
var resolveSTTFallbackConfig2 = (config) => {
  if (!config) {
    return undefined;
  }
  const defaults = {
    completionTimeoutMs: 2500,
    confidenceThreshold: 0.6,
    maxAttemptsPerTurn: 1,
    minTextLength: 2,
    replayWindowMs: 8000,
    settleMs: 220,
    trigger: "empty-or-low-confidence"
  };
  const resolved = { adapter: config.adapter };
  for (const [field, fallback] of Object.entries(defaults)) {
    // `??` keeps explicit zeros and empty strings supplied by the caller.
    resolved[field] = config[field] ?? fallback;
  }
  return resolved;
};
3929
/**
 * Cleans a list of phrase hints.
 *
 * @param hints - Hints to clean; null/undefined is treated as an empty list.
 * @returns Copies of the hints with `text` trimmed and `aliases` reduced to
 *   non-blank strings (left undefined when the hint has none). Hints whose
 *   trimmed text is empty are dropped.
 */
var normalizePhraseHints2 = (hints) => {
  const isNonBlankString = (value) => typeof value === "string" && value.trim().length > 0;
  return (hints ?? []).flatMap((hint) => {
    const aliases = hint.aliases?.filter(isNonBlankString);
    const text = hint.text.trim();
    return text.length > 0 ? [{ ...hint, aliases, text }] : [];
  });
};
3934
/**
 * Cleans a list of lexicon entries.
 *
 * @param entries - Entries to clean; null/undefined is treated as an empty list.
 * @returns Copies of the entries with `text` trimmed, `aliases` reduced to
 *   non-blank strings (undefined when absent), and `language` /
 *   `pronunciation` trimmed or set to undefined when they are not non-blank
 *   strings. Entries whose trimmed text is empty are dropped.
 */
var normalizeLexicon2 = (entries) => {
  const isNonBlankString = (value) => typeof value === "string" && value.trim().length > 0;
  const trimmedOrUndefined = (value) => isNonBlankString(value) ? value.trim() : undefined;
  return (entries ?? []).flatMap((entry) => {
    const normalized = {
      ...entry,
      aliases: entry.aliases?.filter(isNonBlankString),
      language: trimmedOrUndefined(entry.language),
      pronunciation: trimmedOrUndefined(entry.pronunciation),
      text: entry.text.trim()
    };
    return normalized.text.length > 0 ? [normalized] : [];
  });
};
3941
/** Rounds a number and clamps it to the signed 16-bit range [-32768, 32767]. */
var clamp16 = (value) => {
  const rounded = Math.round(value);
  return Math.max(-32768, Math.min(32767, rounded));
};

/**
 * Resamples 16-bit samples by linear interpolation between neighbours.
 *
 * @param input - Source samples.
 * @param inputRate - Sample rate of `input`.
 * @param outputRate - Desired sample rate.
 * @returns A new Int16Array: empty for empty input, a copy when the rates are
 *   equal, otherwise `round(input.length * outputRate / inputRate)` samples
 *   (at least one).
 */
var linearResample = (input, inputRate, outputRate) => {
  if (input.length === 0) {
    return new Int16Array(0);
  }
  if (inputRate === outputRate) {
    return new Int16Array(input);
  }
  const lastSourceIndex = input.length - 1;
  const step = inputRate / outputRate;
  const resampled = new Int16Array(Math.max(1, Math.round(input.length * outputRate / inputRate)));
  for (let index = 0; index < resampled.length; index += 1) {
    const position = index * step;
    const lowerIndex = Math.floor(position);
    const fraction = position - lowerIndex;
    // Both neighbours are pinned to the last sample at the end of the input.
    const lower = input[Math.min(lowerIndex, lastSourceIndex)] ?? 0;
    const upper = input[Math.min(lastSourceIndex, lowerIndex + 1)] ?? lower;
    resampled[index] = clamp16(lower + (upper - lower) * fraction);
  }
  return resampled;
};
3963
// Bias added to the magnitude before encoding and removed again after decoding.
var MULAW_BIAS = 132;
// Largest magnitude that is encoded; anything above is clipped to it.
var MULAW_CLIP = 32635;

/**
 * Encodes one linear sample as a mu-law byte.
 *
 * @param sample - Linear sample; clamped to the signed 16-bit range first.
 * @returns A value in 0..255 made of a sign bit, a 3-bit exponent and a 4-bit
 *   mantissa, with all bits inverted.
 */
var encodeMulawSample = (sample) => {
  const clamped = clamp16(sample);
  const sign = clamped < 0 ? 128 : 0;
  const magnitude = Math.min(MULAW_CLIP, Math.abs(clamped)) + MULAW_BIAS;
  // The exponent follows the position of the highest set bit, probing down from bit 14.
  let exponent = 7;
  let probe = 16384;
  while ((magnitude & probe) === 0 && exponent > 0) {
    exponent -= 1;
    probe >>= 1;
  }
  const mantissa = (magnitude >> (exponent + 3)) & 15;
  return ~(sign | (exponent << 4) | mantissa) & 255;
};

/**
 * Decodes one mu-law byte back to a linear sample.
 *
 * @param value - Mu-law byte (0..255).
 * @returns The decoded linear sample, negative when the sign bit is set.
 */
var decodeMulawSample = (value) => {
  const bits = ~value & 255;
  const isNegative = (bits & 128) !== 0;
  const exponent = (bits >> 4) & 7;
  const mantissa = bits & 15;
  const magnitude = (((mantissa << 3) + MULAW_BIAS) << exponent) - MULAW_BIAS;
  return isNegative ? -magnitude : magnitude;
};
3990
/**
 * Serializes 16-bit samples as little-endian bytes.
 *
 * @param samples - Samples to serialize; a missing entry is written as 0.
 * @returns A Uint8Array of `samples.length * 2` bytes.
 */
var int16ArrayToBytes = (samples) => {
  const bytes = new Uint8Array(samples.length * 2);
  const writer = new DataView(bytes.buffer);
  let offset = 0;
  for (let index = 0; index < samples.length; index += 1) {
    writer.setInt16(offset, samples[index] ?? 0, true);
    offset += 2;
  }
  return bytes;
};
3998
/**
 * Reads little-endian 16-bit samples out of a byte view.
 *
 * @param bytes - Typed-array view over the bytes; its `byteOffset` and
 *   `byteLength` are honoured.
 * @returns A new Int16Array with `floor(byteLength / 2)` samples; a trailing
 *   odd byte is ignored.
 */
var bytesToInt16Array = (bytes) => {
  const reader = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
  const samples = new Int16Array(Math.floor(bytes.byteLength / 2));
  for (let index = 0, offset = 0; index < samples.length; index += 1, offset += 2) {
    samples[index] = reader.getInt16(offset, true);
  }
  return samples;
};
4007
/**
 * Decodes a base64 mu-law payload into linear samples, one sample per byte.
 *
 * @param payload - Base64 string of mu-law bytes.
 * @returns An Int16Array with one decoded sample per payload byte.
 */
var decodeTwilioMulawBase64 = (payload) => {
  const mulawBytes = Uint8Array.from(Buffer2.from(payload, "base64"));
  return Int16Array.from(mulawBytes, (byte) => decodeMulawSample(byte));
};
4015
/**
 * Encodes linear samples as a base64 mu-law payload, one byte per sample.
 *
 * @param samples - Linear samples; a missing entry is encoded as 0.
 * @returns Base64 string of the mu-law bytes.
 */
var encodeTwilioMulawBase64 = (samples) => {
  const mulawBytes = Uint8Array.from(samples, (sample) => encodeMulawSample(sample ?? 0));
  return Buffer2.from(mulawBytes).toString("base64");
};
4022
/**
 * Converts an inbound base64 mu-law payload into little-endian 16-bit PCM
 * bytes, resampled from `TWILIO_MULAW_SAMPLE_RATE` to `VOICE_PCM_SAMPLE_RATE`.
 *
 * @param payload - Base64 mu-law payload.
 * @returns The resampled PCM as a Uint8Array.
 */
var transcodeTwilioInboundPayloadToPCM16 = (payload) => {
  const decoded = decodeTwilioMulawBase64(payload);
  return int16ArrayToBytes(linearResample(decoded, TWILIO_MULAW_SAMPLE_RATE, VOICE_PCM_SAMPLE_RATE));
};
4027
/**
 * Converts an outbound audio chunk into a base64 mu-law payload at
 * `TWILIO_MULAW_SAMPLE_RATE`.
 *
 * @param chunk - Raw audio bytes.
 * @param format - `{ container, encoding, channels, sampleRateHz }` describing `chunk`.
 *   - raw mono mu-law already at the target rate is base64-encoded unchanged;
 *   - `pcm_s16le` is downmixed to mono, resampled and mu-law encoded.
 * @returns Base64 string of mu-law bytes.
 * @throws Error when the encoding is neither the pass-through case nor `pcm_s16le`.
 */
var transcodePCMToTwilioOutboundPayload = (chunk, format) => {
  const isAlreadyTelephonyAudio = format.container === "raw" && format.encoding === "mulaw" && format.channels === 1 && format.sampleRateHz === TWILIO_MULAW_SAMPLE_RATE;
  if (isAlreadyTelephonyAudio) {
    return Buffer2.from(chunk).toString("base64");
  }
  if (format.encoding !== "pcm_s16le") {
    throw new Error(`Unsupported outbound telephony audio format: ${format.container}/${format.encoding}`);
  }
  const pcm = bytesToInt16Array(chunk);
  // Interleaved frames hold one sample per channel. A channel count that is
  // not a positive integer keeps the historical stereo assumption.
  const channelCount = Number.isInteger(format.channels) && format.channels > 0 ? format.channels : 2;
  let mono = pcm;
  if (channelCount > 1) {
    // Average every channel of a frame; for stereo this is (left + right) / 2.
    // Samples of a trailing partial frame are dropped.
    const frameCount = Math.floor(pcm.length / channelCount);
    mono = new Int16Array(frameCount);
    for (let frameIndex = 0; frameIndex < frameCount; frameIndex += 1) {
      const frameStart = frameIndex * channelCount;
      let sum = 0;
      for (let channel = 0; channel < channelCount; channel += 1) {
        sum += pcm[frameStart + channel] ?? 0;
      }
      mono[frameIndex] = clamp16(sum / channelCount);
    }
  }
  const telephony = linearResample(mono, format.sampleRateHz, TWILIO_MULAW_SAMPLE_RATE);
  return encodeTwilioMulawBase64(telephony);
};
4043
/**
 * Normalizes an incoming socket payload: strings are parsed as JSON, anything
 * else is returned unchanged.
 *
 * @throws SyntaxError when a string payload is not valid JSON.
 */
var parseTwilioMessage = (raw) => {
  return typeof raw === "string" ? JSON.parse(raw) : raw;
};
4049
/**
 * Wraps a media-stream socket so it can be used as a voice-session socket:
 * outgoing voice messages are observed, and audio/assistant messages are
 * translated into `media` / `mark` events on the underlying socket.
 *
 * @param socket - Underlying socket with `send` and `close`.
 * @param getState - Returns the current bridge state (read on every send).
 * @returns `{ close, send }`.
 */
var createTwilioSocketAdapter = (socket, getState) => {
  const sendEvent = (event) => socket.send(JSON.stringify(event));

  const close = async (code, reason) => {
    await socket.close(code, reason);
  };

  const send = async (data) => {
    // Only JSON text messages are understood; anything else is ignored.
    if (typeof data !== "string") {
      return;
    }
    const state = getState();
    const message = JSON.parse(data);

    // Observers see every voice message, even before a stream id is known.
    state.reviewRecorder?.recordVoiceMessage(message);
    await state.onVoiceMessage?.({
      callSid: state.callSid ?? undefined,
      message,
      sessionId: state.sessionId ?? "",
      streamSid: state.streamSid ?? undefined
    });

    // Nothing can be forwarded until the stream id has been set on the state.
    const { streamSid } = state;
    if (!streamSid) {
      return;
    }

    if (message.type === "audio") {
      const chunk = Uint8Array.from(Buffer2.from(message.chunkBase64, "base64"));
      const payload = transcodePCMToTwilioOutboundPayload(chunk, message.format);
      state.hasOutboundAudioSinceLastInbound = true;
      state.reviewRecorder?.recordTwilioOutbound({
        bytes: payload.length,
        event: "media",
        track: "outbound"
      });
      await sendEvent({
        event: "media",
        media: {
          payload
        },
        streamSid: state.streamSid
      });
      return;
    }

    if (message.type === "assistant" && message.turnId) {
      const name = `assistant:${message.turnId}`;
      state.reviewRecorder?.recordTwilioOutbound({
        event: "mark",
        name
      });
      await sendEvent({
        event: "mark",
        mark: {
          name
        },
        streamSid: state.streamSid
      });
    }
  };

  return { close, send };
};
4101
/**
 * Renders the XML response that connects a call to a media stream.
 *
 * @param options - `streamUrl` (required), optional `track` and `streamName`
 *   attributes, and optional `parameters` rendered as `<Parameter>` children
 *   (entries whose value is undefined are skipped). All values are XML-escaped.
 * @returns The XML document as a single-line string.
 */
var createTwilioVoiceResponse = (options) => {
  const parameterTags = [];
  for (const [name, value] of Object.entries(options.parameters ?? {})) {
    if (value !== undefined) {
      parameterTags.push(`<Parameter name="${escapeXml(name)}" value="${escapeXml(String(value))}" />`);
    }
  }
  const url = escapeXml(options.streamUrl);
  const trackAttribute = options.track ? ` track="${escapeXml(options.track)}"` : "";
  const nameAttribute = options.streamName ? ` name="${escapeXml(options.streamName)}"` : "";
  const streamOpenTag = `<Stream url="${url}"${trackAttribute}${nameAttribute}>`;
  return `<?xml version="1.0" encoding="UTF-8"?><Response><Connect>${streamOpenTag}${parameterTags.join("")}</Stream></Connect></Response>`;
};
4105
/**
 * Creates a bridge that feeds Twilio media-stream messages into a voice session.
 *
 * The voice session is created lazily on the first "start" or "media" message.
 * Creation is memoised through a single in-flight promise so that overlapping
 * `handleMessage` calls (e.g. "start" immediately followed by "media" while an
 * async lexicon/phrase-hint resolver is still pending) share one session
 * instead of each creating their own.
 *
 * @param socket  Socket used to send Twilio control frames (must expose `send`);
 *                it is also wrapped by `createTwilioSocketAdapter` for the session.
 * @param options Bridge configuration: preset, stt/tts, turn detection, audio
 *                conditioning, reconnect overrides, lexicon/phraseHints (values
 *                or async resolvers), route callbacks, optional review recording.
 * @returns An object with `close(reason)`, `getSessionId()`, `getStreamSid()`
 *          and `handleMessage(raw)`.
 */
var createTwilioMediaStreamBridge = (socket, options) => {
  const runtimePreset = resolveVoiceRuntimePreset(options.preset);
  // Explicit turn-detection options override the preset's values key by key.
  const turnDetection = resolveTurnDetectionConfig({
    ...runtimePreset.turnDetection,
    ...options.turnDetection
  });
  const audioConditioning = options.audioConditioning !== undefined ? resolveAudioConditioningConfig(options.audioConditioning) : runtimePreset.audioConditioning;
  const logger = resolveLogger(options.logger);
  const reconnect = {
    maxAttempts: options.reconnect?.maxAttempts ?? 10,
    strategy: options.reconnect?.strategy ?? "resume-last-turn",
    timeout: options.reconnect?.timeout ?? 30000
  };
  // Mutable per-call state; the socket adapter reads it through an accessor.
  const bridgeState = {
    callSid: null,
    // Nothing in this function sets this to true; presumably the socket adapter
    // does when it forwards assistant audio — confirm in createTwilioSocketAdapter.
    hasOutboundAudioSinceLastInbound: false,
    onVoiceMessage: options.onVoiceMessage,
    reviewRecorder: options.review ? createVoiceCallReviewRecorder({
      config: options.review.config ?? {
        preset: options.preset,
        stt: {
          kind: options.stt.kind
        },
        tts: options.tts ? {
          kind: options.tts.kind
        } : undefined,
        turnDetection
      },
      fixtureId: options.review.fixtureId,
      path: options.review.path,
      title: options.review.title
    }) : undefined,
    scenarioId: options.scenarioId ?? null,
    sessionId: options.sessionId ?? null,
    streamSid: null
  };
  let sessionHandle = null;
  // In-flight (or settled) session creation; guards against duplicate sessions.
  let sessionPromise = null;
  let reviewArtifactDelivered = false;
  // Resolves the lexicon, invoking it first when it is supplied as a function.
  const resolveLexicon2 = async () => {
    if (typeof options.lexicon === "function") {
      return normalizeLexicon2(await options.lexicon({
        context: options.context,
        scenarioId: bridgeState.scenarioId ?? undefined,
        sessionId: bridgeState.sessionId ?? ""
      }) ?? []);
    }
    return normalizeLexicon2(options.lexicon);
  };
  // Resolves phrase hints, invoking them first when supplied as a function.
  const resolvePhraseHints2 = async () => {
    if (typeof options.phraseHints === "function") {
      return normalizePhraseHints2(await options.phraseHints({
        context: options.context,
        scenarioId: bridgeState.scenarioId ?? undefined,
        sessionId: bridgeState.sessionId ?? ""
      }) ?? []);
    }
    return normalizePhraseHints2(options.phraseHints);
  };
  // Builds the voice session; only ever called through ensureSession.
  const createSession = async () => {
    // Fall back to a generated id when neither options nor "start" supplied one.
    bridgeState.sessionId ??= `phone-${Date.now().toString(36)}`;
    const lexicon = await resolveLexicon2();
    const phraseHints = await resolvePhraseHints2();
    const normalizedOnTurn = normalizeOnTurn2(options.onTurn);
    const route = {
      correctTurn: options.correctTurn,
      onComplete: options.onComplete,
      onError: options.onError,
      onSession: options.onSession,
      // Wraps the caller's onTurn so the turn and any assistant text are recorded.
      onTurn: async (input) => {
        bridgeState.reviewRecorder?.recordVoiceMessage({
          type: "turn",
          turn: input.turn
        });
        const result = await normalizedOnTurn(input);
        if (result?.assistantText) {
          bridgeState.reviewRecorder?.recordVoiceMessage({
            type: "assistant",
            text: result.assistantText,
            turnId: input.turn.id
          });
        }
        return result;
      }
    };
    const voiceSocket = createTwilioSocketAdapter(socket, () => bridgeState);
    sessionHandle = createVoiceSession({
      audioConditioning,
      context: options.context,
      costTelemetry: options.costTelemetry,
      id: bridgeState.sessionId,
      languageStrategy: options.languageStrategy,
      lexicon,
      logger,
      phraseHints,
      reconnect,
      route,
      scenarioId: bridgeState.scenarioId ?? undefined,
      socket: voiceSocket,
      store: options.session,
      stt: options.stt,
      sttFallback: resolveSTTFallbackConfig2(options.sttFallback),
      sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
      tts: options.tts,
      turnDetection
    });
    return sessionHandle;
  };
  // Returns the session, creating it at most once even under concurrent calls.
  const ensureSession = () => {
    sessionPromise ??= createSession().catch((error) => {
      // Forget the failed attempt so a later message can retry creation.
      sessionPromise = null;
      throw error;
    });
    return sessionPromise;
  };
  return {
    // Closes the session (if any) and delivers the review artifact at most once.
    close: async (reason) => {
      await sessionHandle?.close(reason);
      if (bridgeState.reviewRecorder && options.review?.onArtifact && !reviewArtifactDelivered) {
        reviewArtifactDelivered = true;
        await Promise.resolve(options.review.onArtifact(bridgeState.reviewRecorder.finalize()));
      }
    },
    getSessionId: () => bridgeState.sessionId,
    getStreamSid: () => bridgeState.streamSid,
    // Parses one raw message and dispatches on its `event`; unknown events are ignored.
    handleMessage: async (raw) => {
      const message = parseTwilioMessage(raw);
      switch (message.event) {
        case "connected":
          bridgeState.reviewRecorder?.recordTwilioInbound({
            event: "connected"
          });
          return;
        case "start": {
          bridgeState.streamSid = message.start.streamSid;
          bridgeState.callSid = message.start.callSid ?? null;
          // Non-blank custom parameters override the ids supplied via options.
          bridgeState.sessionId = message.start.customParameters?.sessionId?.trim() || bridgeState.sessionId;
          bridgeState.scenarioId = message.start.customParameters?.scenarioId?.trim() || bridgeState.scenarioId;
          bridgeState.reviewRecorder?.recordTwilioInbound({
            event: "start",
            reason: message.start.callSid,
            text: bridgeState.sessionId ?? undefined
          });
          await ensureSession();
          return;
        }
        case "media": {
          const activeSession = await ensureSession();
          bridgeState.reviewRecorder?.recordTwilioInbound({
            // Length of the payload string as received, not a decoded byte count.
            bytes: message.media.payload.length,
            event: "media",
            track: message.media.track
          });
          // Send a "clear" frame when inbound media arrives after outbound audio,
          // unless the caller disabled it with clearOnInboundMedia: false.
          if (options.clearOnInboundMedia !== false && bridgeState.hasOutboundAudioSinceLastInbound && bridgeState.streamSid) {
            bridgeState.reviewRecorder?.recordTwilioOutbound({
              event: "clear"
            });
            await Promise.resolve(socket.send(JSON.stringify({
              event: "clear",
              streamSid: bridgeState.streamSid
            })));
          }
          bridgeState.hasOutboundAudioSinceLastInbound = false;
          await activeSession.receiveAudio(transcodeTwilioInboundPayloadToPCM16(message.media.payload));
          return;
        }
        case "mark":
          bridgeState.reviewRecorder?.recordTwilioInbound({
            event: "mark",
            name: message.mark?.name
          });
          return;
        case "stop":
          bridgeState.reviewRecorder?.recordTwilioInbound({
            event: "stop",
            reason: message.stop?.callSid
          });
          await sessionHandle?.close("twilio-stop");
          return;
      }
    }
  };
};
4283
+ // src/telephony/response.ts
4284
/**
 * Collapses every run of whitespace into a single space and strips both ends.
 *
 * @param {string} value Text to normalise.
 * @returns {string} The single-spaced, trimmed text.
 */
var normalizeWhitespace = (value) => {
  const singleSpaced = value.replace(/\s+/g, " ");
  return singleSpaced.trim();
};
// Word cap used in lead-clause mode when the caller passes no `maxWords`.
var DEFAULT_MAX_WORDS = 12;
// Matches the whitespace that directly follows one of , . ; ! ? so that a
// split keeps the punctuation attached to the preceding clause.
var CLAUSE_BOUNDARY_PATTERN = /(?<=[,.;!?])\s+/u;
4287
/**
 * Limits text to at most `maxWords` whitespace-separated words.
 *
 * @param {string} text     Text to limit.
 * @param {number} maxWords Word cap; a non-finite or non-positive value disables it.
 * @returns {string} `text` unchanged when it already fits (or the cap is
 *                   disabled); otherwise the first `maxWords` words joined by
 *                   single spaces.
 */
var clampWords = (text, maxWords) => {
  const capIsUsable = Number.isFinite(maxWords) && maxWords > 0;
  if (!capIsUsable) {
    return text;
  }
  const tokens = text.split(/\s+/u).filter((token) => token.length > 0);
  // Within the cap the original string is returned as-is, spacing included.
  return tokens.length > maxWords ? tokens.slice(0, maxWords).join(" ") : text;
};
4297
/**
 * Limits text to at most `maxChars` UTF-16 code units, then trims whitespace.
 *
 * The cut never lands between the two halves of a surrogate pair: if it would,
 * the cut moves back one code unit so the result contains no lone surrogate.
 *
 * @param {string} text       Text to limit.
 * @param {number} [maxChars] Length cap in UTF-16 code units (`text.length`);
 *                            a missing, non-finite or non-positive value
 *                            disables it.
 * @returns {string} `text` unchanged when it already fits (or the cap is
 *                   disabled); otherwise the trimmed prefix.
 */
var clampChars = (text, maxChars) => {
  if (!Number.isFinite(maxChars) || maxChars <= 0) {
    return text;
  }
  if (text.length <= maxChars) {
    return text;
  }
  // slice() truncates fractional indexes toward zero; do the same explicitly.
  let cut = Math.trunc(maxChars);
  if (cut > 0) {
    const before = text.charCodeAt(cut - 1);
    const after = text.charCodeAt(cut);
    const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    if (splitsPair) {
      // Drop the dangling high surrogate rather than emit half a character.
      cut -= 1;
    }
  }
  return text.slice(0, cut).trim();
};
4306
/**
 * Makes sure text ends with sentence-final punctuation (`.`, `!` or `?`).
 *
 * Trailing whitespace and clause separators (`,`, `;`, `:`) are removed first,
 * so a clause such as "Sure," becomes "Sure." instead of "Sure,.".
 *
 * @param {string} text Text to finish.
 * @returns {string} Falsy input unchanged; an empty string when the input
 *                   holds only separators/whitespace; otherwise the text
 *                   ending in `.`, `!` or `?`.
 */
var ensureTerminalPunctuation = (text) => {
  if (!text) {
    return text;
  }
  const stem = text.replace(/[\s,;:]+$/u, "");
  if (!stem) {
    // Nothing but separators: there is no sentence to terminate.
    return stem;
  }
  return /[.!?]$/u.test(stem) ? stem : `${stem}.`;
};
4312
/**
 * Returns the first clause of the text, after dropping a short leading
 * "label:" prefix when one is present.
 *
 * A colon counts as a label separator when it sits within the first 24
 * characters and is not the last character. Clauses are split on whitespace
 * following `, . ; ! ?`, so the returned clause keeps its own punctuation.
 *
 * NOTE(review): any early colon qualifies, including one inside a time such as
 * "10:30" — confirm whether that is intended.
 *
 * @param {string} text Text to reduce.
 * @returns {string} The first clause, or an empty string for blank input.
 */
var extractLeadClause = (text) => {
  const collapsed = normalizeWhitespace(text);
  if (collapsed === "") {
    return collapsed;
  }
  const labelEnd = collapsed.indexOf(":");
  const hasShortLabel = labelEnd >= 0 && labelEnd < 24 && labelEnd < collapsed.length - 1;
  let body = collapsed;
  if (hasShortLabel) {
    body = normalizeWhitespace(collapsed.slice(labelEnd + 1));
  }
  const [firstClause] = body.split(CLAUSE_BOUNDARY_PATTERN).filter(Boolean);
  return firstClause ?? body;
};
4322
/**
 * Shapes assistant text for telephony output.
 *
 * In "full" mode the whitespace-normalised text is only limited by
 * `options.maxChars`. In the default "lead-clause" mode the text is reduced to
 * its lead clause, limited by `maxWords` (default `DEFAULT_MAX_WORDS`) and
 * `maxChars`, re-normalised, and given terminal punctuation.
 *
 * @param {string} text      Assistant text to shape.
 * @param {object} [options] Optional `mode` ("lead-clause" | "full"),
 *                           `maxWords` and `maxChars`.
 * @returns {string} The shaped text; an empty string for blank input.
 */
var shapeTelephonyAssistantText = (text, options = {}) => {
  const collapsed = normalizeWhitespace(text);
  if (!collapsed) {
    return collapsed;
  }
  const mode = options.mode ?? "lead-clause";
  if (mode === "full") {
    return clampChars(collapsed, options.maxChars);
  }
  const wordCap = options.maxWords ?? DEFAULT_MAX_WORDS;
  let shaped = extractLeadClause(collapsed);
  shaped = clampWords(shaped, wordCap);
  shaped = clampChars(shaped, options.maxChars);
  return ensureTerminalPunctuation(normalizeWhitespace(shaped));
};
960
4335
  export {
4336
+ withVoiceOpsTaskId,
4337
+ withVoiceIntegrationEventId,
961
4338
  voice,
4339
+ transcodeTwilioInboundPayloadToPCM16,
4340
+ transcodePCMToTwilioOutboundPayload,
4341
+ summarizeVoiceOpsTasks,
4342
+ startVoiceOpsTask,
4343
+ shapeTelephonyAssistantText,
4344
+ resolveVoiceSTTRoutingStrategy,
4345
+ resolveVoiceRuntimePreset,
4346
+ resolveTurnDetectionConfig,
4347
+ resolveAudioConditioningConfig,
4348
+ reopenVoiceOpsTask,
4349
+ renderVoiceCallReviewMarkdown,
4350
+ renderVoiceCallReviewHTML,
4351
+ recordVoiceRuntimeOps,
4352
+ listVoiceOpsTasks,
4353
+ encodeTwilioMulawBase64,
4354
+ decodeTwilioMulawBase64,
4355
+ createVoiceTaskUpdatedEvent,
4356
+ createVoiceTaskCreatedEvent,
962
4357
  createVoiceSessionRecord,
963
4358
  createVoiceSession,
4359
+ createVoiceSTTRoutingCorrectionHandler,
4360
+ createVoiceReviewSavedEvent,
964
4361
  createVoiceMemoryStore,
965
- createId
4362
+ createVoiceIntegrationEvent,
4363
+ createVoiceFileTaskStore,
4364
+ createVoiceFileSessionStore,
4365
+ createVoiceFileRuntimeStorage,
4366
+ createVoiceFileReviewStore,
4367
+ createVoiceFileIntegrationEventStore,
4368
+ createVoiceCallReviewRecorder,
4369
+ createVoiceCallReviewFromSession,
4370
+ createVoiceCallReviewFromLiveTelephonyReport,
4371
+ createVoiceCallCompletedEvent,
4372
+ createTwilioVoiceResponse,
4373
+ createTwilioMediaStreamBridge,
4374
+ createStoredVoiceOpsTask,
4375
+ createStoredVoiceIntegrationEvent,
4376
+ createStoredVoiceCallReviewArtifact,
4377
+ createRiskyTurnCorrectionHandler,
4378
+ createPhraseHintCorrectionHandler,
4379
+ createId,
4380
+ createDomainPhraseHints,
4381
+ createDomainLexicon,
4382
+ conditionAudioChunk,
4383
+ completeVoiceOpsTask,
4384
+ buildVoiceOpsTaskFromReview,
4385
+ assignVoiceOpsTask,
4386
+ applyRiskTieredPhraseHintCorrections,
4387
+ applyPhraseHintCorrections,
4388
+ TURN_PROFILE_DEFAULTS
966
4389
  };