@absolutejs/voice 0.0.22-beta.584 → 0.0.22-beta.586

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -391,22 +391,146 @@ var resolveLogger = (logger) => ({
391
391
  ...logger
392
392
  });
393
393
 
394
+ // src/core/turnDetection.ts
395
// Fallback silence window in milliseconds; createVoiceSession applies it when options.turnDetection.silenceMs is not provided.
+ var DEFAULT_SILENCE_MS = 700;
396
// Fallback speech threshold; createVoiceSession applies it when options.turnDetection.speechThreshold is not provided.
// NOTE(review): presumably compared against the 0..1 RMS value from measureAudioLevel - confirm at the call site.
+ var DEFAULT_SPEECH_THRESHOLD = 0.015;
397
// Default delay in milliseconds before a semantically vetoed silence commit is re-evaluated;
// shared by every turn profile preset and by createVoiceSession.
+ var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
398
/**
 * Exposes raw audio as a byte view without copying it.
 *
 * @param audio An ArrayBuffer, or any view exposing `buffer`, `byteOffset`
 *   and `byteLength` (typed arrays, DataView, Node Buffer).
 * @returns A Uint8Array over the same underlying memory.
 */
var toUint8Array = (audio) => {
  if (audio instanceof ArrayBuffer) {
    return new Uint8Array(audio);
  }
  return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
};
/**
 * Computes the root-mean-square level of 16-bit signed PCM audio.
 *
 * Each sample is divided by 32768 before squaring, so the result lies in
 * the range 0..1. A trailing odd byte is ignored.
 *
 * @param audio An ArrayBuffer or a view over one (see toUint8Array).
 * @returns The RMS level, or 0 when the input holds less than one sample.
 */
var measureAudioLevel = (audio) => {
  const bytes = toUint8Array(audio);
  const sampleCount = Math.floor(bytes.byteLength / 2);
  if (sampleCount === 0) {
    return 0;
  }
  // A DataView has no alignment requirement. Building an Int16Array on the
  // same memory throws a RangeError whenever byteOffset is odd, which happens
  // for views sliced out of a larger (pooled) buffer.
  const view = new DataView(bytes.buffer, bytes.byteOffset, sampleCount * 2);
  let sumSquares = 0;
  for (let index = 0; index < sampleCount; index += 1) {
    // Little-endian read: identical to what the previous Int16Array view
    // produced on little-endian hosts.
    const normalized = view.getInt16(index * 2, true) / 32768;
    sumSquares += normalized * normalized;
  }
  return Math.sqrt(sumSquares / sampleCount);
};
420
/**
 * Strips leading/trailing whitespace and collapses every internal run of
 * whitespace into a single space.
 */
var normalizeText = (value) => {
  const trimmed = value.trim();
  return trimmed.split(/\s+/).join(" ");
};
421
/**
 * Counts the space-separated tokens in a string; an empty string has none.
 * Consecutive spaces yield empty tokens, so the input is expected to be
 * normalized first.
 */
var countWords = (value) => {
  if (!(value.length > 0)) {
    return 0;
  }
  return value.split(" ").length;
};
422
/**
 * Chooses between two competing transcript texts for the same speech.
 *
 * Both inputs are normalized first. An empty side loses; a text that
 * contains the other wins; otherwise the text with more words wins, with
 * character length as the tie-breaker. The current text is kept when
 * nothing favours the next one.
 *
 * @param currentText The text held so far.
 * @param nextText The newly received text.
 * @returns The normalized text that should be kept.
 */
var selectPreferredTranscriptText = (currentText, nextText) => {
  const existing = normalizeText(currentText);
  const candidate = normalizeText(nextText);
  if (!existing || !candidate) {
    return existing || candidate;
  }
  // includes() also covers the case where both texts are equal.
  if (existing.includes(candidate)) {
    return existing;
  }
  if (candidate.includes(existing)) {
    return candidate;
  }
  const existingWords = countWords(existing);
  const candidateWords = countWords(candidate);
  const candidateWins =
    candidateWords > existingWords ||
    (candidateWords === existingWords && candidate.length > existing.length);
  return candidateWins ? candidate : existing;
};
445
/**
 * Appends one transcript text to another, dropping the longest run of words
 * that ends the first text and also begins the second.
 *
 * @param currentText The earlier text.
 * @param nextText The text that follows it.
 * @returns The normalized, space-joined combination.
 */
var mergeSequentialTranscriptText = (currentText, nextText) => {
  const head = normalizeText(currentText);
  const tail = normalizeText(nextText);
  if (!head) {
    return tail;
  }
  if (!tail) {
    return head;
  }
  const headWords = head.split(" ");
  const tailWords = tail.split(" ");
  // Try the largest possible overlap first and shrink until one matches.
  let overlap = Math.min(headWords.length, tailWords.length);
  while (overlap > 0) {
    const offset = headWords.length - overlap;
    const matches = tailWords
      .slice(0, overlap)
      .every((word, index) => word === headWords[offset + index]);
    if (matches) {
      return headWords.concat(tailWords.slice(overlap)).join(" ");
    }
    overlap -= 1;
  }
  return `${head} ${tail}`.trim();
};
466
/**
 * Counts how many leading words two texts share after normalization.
 *
 * @param currentText The first text.
 * @param nextText The second text.
 * @returns The number of identical words at the start of both texts.
 */
var countCommonPrefixWords = (currentText, nextText) => {
  const toWords = (text) => normalizeText(text).split(" ").filter(Boolean);
  const left = toWords(currentText);
  const right = toWords(nextText);
  // The shared prefix ends at the first differing position, or where the
  // shorter of the two lists runs out.
  const firstMismatch = left.findIndex(
    (word, index) => index >= right.length || word !== right[index]
  );
  return firstMismatch === -1 ? left.length : firstMismatch;
};
479
/**
 * Joins the `text` of consecutive transcripts into one string.
 *
 * Blank texts are skipped. A text already contained in the previously kept
 * segment is dropped, and a text that contains the previously kept segment
 * replaces it.
 *
 * @param transcripts Iterable of objects carrying a `text` string.
 * @returns The kept segments joined with single spaces.
 */
var mergeTranscriptTexts = (transcripts) => {
  const segments = [];
  for (const transcript of transcripts) {
    const candidate = normalizeText(transcript.text);
    if (!candidate) {
      continue;
    }
    const lastIndex = segments.length - 1;
    const last = segments[lastIndex];
    if (last === undefined) {
      segments.push(candidate);
    } else if (last.includes(candidate)) {
      // Repeat or fragment of what is already kept: nothing to add.
    } else if (candidate.includes(last)) {
      segments[lastIndex] = candidate;
    } else {
      segments.push(candidate);
    }
  }
  return segments.join(" ").trim();
};
502
/**
 * Builds the text of the current turn from final transcripts plus the
 * in-flight partial text.
 *
 * The partial is appended to the finals when it started at least 250 ms
 * after the last final with a numeric `endedAtMs` and shares no leading
 * words with the merged finals. Otherwise the preferred one of the two
 * texts is returned.
 *
 * @param transcripts Final transcripts (`text`, optional `endedAtMs`).
 * @param partialText Current partial transcript text.
 * @param options Optional `partialStartedAtMs` timestamp of the partial.
 * @returns The normalized turn text.
 */
var buildTurnText = (transcripts, partialText, options = {}) => {
  const finalText = mergeTranscriptTexts(transcripts);
  const nextPartial = normalizeText(partialText);
  // Walk backwards to the most recent final that carries an end timestamp.
  const finals = [...transcripts];
  let lastFinalEndedAtMs;
  for (let index = finals.length - 1; index >= 0; index -= 1) {
    const endedAtMs = finals[index].endedAtMs;
    if (typeof endedAtMs === "number") {
      lastFinalEndedAtMs = endedAtMs;
      break;
    }
  }
  const startsAfterGap = () =>
    typeof lastFinalEndedAtMs === "number" &&
    typeof options.partialStartedAtMs === "number" &&
    options.partialStartedAtMs - lastFinalEndedAtMs >= 250;
  const isContinuation =
    Boolean(finalText) &&
    Boolean(nextPartial) &&
    startsAfterGap() &&
    countCommonPrefixWords(finalText, nextPartial) === 0;
  return isContinuation
    ? mergeSequentialTranscriptText(finalText, nextPartial)
    : selectPreferredTranscriptText(finalText, nextPartial);
};
511
+
394
512
  // src/core/turnProfiles.ts
395
513
  var TURN_PROFILE_DEFAULTS = {
396
514
  balanced: {
397
515
  qualityProfile: "general",
516
+ semanticVetoMaxMs: 0,
517
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
398
518
  silenceMs: 1400,
399
519
  speechThreshold: 0.012,
400
520
  transcriptStabilityMs: 1000
401
521
  },
402
522
  fast: {
403
523
  qualityProfile: "general",
524
+ semanticVetoMaxMs: 0,
525
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
404
526
  silenceMs: 700,
405
527
  speechThreshold: 0.015,
406
528
  transcriptStabilityMs: 450
407
529
  },
408
530
  "long-form": {
409
531
  qualityProfile: "general",
532
+ semanticVetoMaxMs: 0,
533
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
410
534
  silenceMs: 2200,
411
535
  speechThreshold: 0.01,
412
536
  transcriptStabilityMs: 1500
@@ -440,6 +564,8 @@ var resolveTurnDetectionConfig = (config) => {
440
564
  return {
441
565
  profile,
442
566
  qualityProfile,
567
+ semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
568
+ semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
443
569
  silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
444
570
  speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
445
571
  transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
@@ -3454,123 +3580,6 @@ var createVoiceTwilioRedirectHandoffAdapter = (options) => ({
3454
3580
  }
3455
3581
  });
3456
3582
 
3457
- // src/core/turnDetection.ts
3458
- var DEFAULT_SILENCE_MS = 700;
3459
- var DEFAULT_SPEECH_THRESHOLD = 0.015;
3460
- var toUint8Array = (audio) => {
3461
- if (audio instanceof ArrayBuffer) {
3462
- return new Uint8Array(audio);
3463
- }
3464
- return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
3465
- };
3466
- var measureAudioLevel = (audio) => {
3467
- const bytes = toUint8Array(audio);
3468
- if (bytes.byteLength < 2) {
3469
- return 0;
3470
- }
3471
- const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
3472
- if (samples.length === 0) {
3473
- return 0;
3474
- }
3475
- let sumSquares = 0;
3476
- for (const sample of samples) {
3477
- const normalized = sample / 32768;
3478
- sumSquares += normalized * normalized;
3479
- }
3480
- return Math.sqrt(sumSquares / samples.length);
3481
- };
3482
- var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
3483
- var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
3484
- var selectPreferredTranscriptText = (currentText, nextText) => {
3485
- const current = normalizeText(currentText);
3486
- const next = normalizeText(nextText);
3487
- if (!current) {
3488
- return next;
3489
- }
3490
- if (!next) {
3491
- return current;
3492
- }
3493
- if (current === next || current.includes(next)) {
3494
- return current;
3495
- }
3496
- if (next.includes(current)) {
3497
- return next;
3498
- }
3499
- if (countWords(next) > countWords(current)) {
3500
- return next;
3501
- }
3502
- if (countWords(next) === countWords(current) && next.length > current.length) {
3503
- return next;
3504
- }
3505
- return current;
3506
- };
3507
- var mergeSequentialTranscriptText = (currentText, nextText) => {
3508
- const current = normalizeText(currentText);
3509
- const next = normalizeText(nextText);
3510
- if (!current) {
3511
- return next;
3512
- }
3513
- if (!next) {
3514
- return current;
3515
- }
3516
- const currentWords = current.split(" ");
3517
- const nextWords = next.split(" ");
3518
- const maxOverlap = Math.min(currentWords.length, nextWords.length);
3519
- for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
3520
- const currentSuffix = currentWords.slice(-overlap).join(" ");
3521
- const nextPrefix = nextWords.slice(0, overlap).join(" ");
3522
- if (currentSuffix === nextPrefix) {
3523
- return [...currentWords, ...nextWords.slice(overlap)].join(" ");
3524
- }
3525
- }
3526
- return `${current} ${next}`.trim();
3527
- };
3528
- var countCommonPrefixWords = (currentText, nextText) => {
3529
- const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
3530
- const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
3531
- const maxWords = Math.min(currentWords.length, nextWords.length);
3532
- let count = 0;
3533
- for (let index = 0;index < maxWords; index += 1) {
3534
- if (currentWords[index] !== nextWords[index]) {
3535
- break;
3536
- }
3537
- count += 1;
3538
- }
3539
- return count;
3540
- };
3541
- var mergeTranscriptTexts = (transcripts) => {
3542
- const merged = [];
3543
- for (const transcript of transcripts) {
3544
- const nextText = normalizeText(transcript.text);
3545
- if (!nextText) {
3546
- continue;
3547
- }
3548
- const previous = merged.at(-1);
3549
- if (!previous) {
3550
- merged.push(nextText);
3551
- continue;
3552
- }
3553
- if (nextText === previous || previous.includes(nextText)) {
3554
- continue;
3555
- }
3556
- if (nextText.includes(previous)) {
3557
- merged[merged.length - 1] = nextText;
3558
- continue;
3559
- }
3560
- merged.push(nextText);
3561
- }
3562
- return merged.join(" ").trim();
3563
- };
3564
- var buildTurnText = (transcripts, partialText, options = {}) => {
3565
- const finalText = mergeTranscriptTexts(transcripts);
3566
- const nextPartial = normalizeText(partialText);
3567
- const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
3568
- if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
3569
- return mergeSequentialTranscriptText(finalText, nextPartial);
3570
- }
3571
- return selectPreferredTranscriptText(finalText, nextPartial);
3572
- };
3573
-
3574
3583
  // src/core/types.ts
3575
3584
  var ttsAdapterSessionCanCancel = (session) => typeof session.cancel === "function";
3576
3585
 
@@ -3712,6 +3721,8 @@ var FALLBACK_CONFIDENCE_SELECTION_DELTA = 0.05;
3712
3721
  var FALLBACK_WORD_COUNT_SELECTION_MARGIN_RATIO = 0.12;
3713
3722
  var EXTENDED_VENDOR_COMMIT_SILENCE_THRESHOLD_MS = 200;
3714
3723
  var MAX_VENDOR_COMMIT_GRACE_MS = 1200;
3724
// Window in milliseconds used by handleClose: a close arriving within this window of the previous
// reconnect increments the reconnect counter, otherwise the counter restarts at 1.
+ var STT_RECONNECT_FLAP_WINDOW_MS = 4000;
3725
// Highest reconnect counter value for which handleClose still closes the adapter and returns;
// above it, handleClose continues to its regular close handling.
+ var MAX_STT_RECONNECTS_IN_FLAP_WINDOW = 3;
3715
3726
  var DEFAULT_FORMAT = {
3716
3727
  channels: 1,
3717
3728
  container: "raw",
@@ -3907,8 +3918,11 @@ var createVoiceSession = (options) => {
3907
3918
  const turnDetection = {
3908
3919
  silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
3909
3920
  speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
3910
- transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
3921
+ transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS,
3922
+ semanticVetoMaxMs: options.turnDetection.semanticVetoMaxMs ?? 0,
3923
+ semanticVetoRecheckMs: options.turnDetection.semanticVetoRecheckMs ?? DEFAULT_SEMANTIC_VETO_RECHECK_MS
3911
3924
  };
3925
+ let semanticVetoElapsedMs = 0;
3912
3926
  const sttFallback = options.sttFallback ? {
3913
3927
  adapter: options.sttFallback.adapter,
3914
3928
  completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
@@ -3949,6 +3963,8 @@ var createVoiceSession = (options) => {
3949
3963
  let operationQueue = Promise.resolve();
3950
3964
  let adapterGenerationCounter = 0;
3951
3965
  let activeAdapterGeneration = 0;
3966
+ let sttReconnectCount = 0;
3967
+ let lastSttReconnectAt = 0;
3952
3968
  let activeTTSTurnId;
3953
3969
  let assistantSpeechEndsAt = 0;
3954
3970
  let lastAssistantAudioAt = 0;
@@ -4423,10 +4439,51 @@ var createVoiceSession = (options) => {
4423
4439
  silenceTimer = setTimeout(() => {
4424
4440
  silenceTimer = null;
4425
4441
  pendingCommitReason = null;
4426
- api.commitTurn(reason);
4442
+ runScheduledCommit(reason);
4427
4443
  }, delayMs);
4428
4444
  };
4429
4445
  const scheduleSilenceCommit = (delayMs = turnDetection.silenceMs, reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
4446
/**
 * Decides whether a scheduled "silence" commit should be postponed because
 * the configured semantic turn detector reports the user is not finished.
 *
 * When it defers, it adds the extension to `semanticVetoElapsedMs` and
 * schedules a new commit through `scheduleTurnCommit`. The total extension
 * is capped by `turnDetection.semanticVetoMaxMs`.
 *
 * @param reason Commit reason; only "silence" commits can be deferred.
 * @returns true when the commit was rescheduled and the caller must not
 *   commit now; false when the commit should proceed.
 */
const shouldDeferSilenceCommit = async (reason) => {
  const vetoBudgetMs = turnDetection.semanticVetoMaxMs;
  // `!(budget > 0)` instead of `budget <= 0`, so a NaN budget disables the
  // veto rather than slipping past both this check and the cap comparison.
  if (reason !== "silence" || !(vetoBudgetMs > 0) || !options.semanticTurnDetector || semanticVetoElapsedMs >= vetoBudgetMs) {
    return false;
  }
  const session = await readSession();
  const { partialText, transcripts } = session.currentTurn;
  const userText = buildTurnText(transcripts, partialText, {
    partialEndedAtMs: session.currentTurn.partialEndedAt,
    partialStartedAtMs: session.currentTurn.partialStartedAt
  });
  // Nothing has been said in this turn, so there is nothing to evaluate.
  if (!userText) {
    return false;
  }
  const silenceMs = session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : turnDetection.silenceMs;
  let endOfTurn = true;
  try {
    // Promise.resolve accepts both synchronous and asynchronous detectors.
    const verdict = await Promise.resolve(options.semanticTurnDetector.evaluate({
      lastFinalTranscript: transcripts.at(-1),
      partialText,
      silenceMs,
      transcripts
    }));
    endOfTurn = verdict.endOfTurn;
  } catch {
    // Best effort: a failing detector must never block the commit.
    return false;
  }
  // Only an explicit `false` verdict vetoes the commit.
  if (endOfTurn !== false) {
    return false;
  }
  // NOTE(review): activity that arrives while the detector call is pending is
  // not re-checked here - confirm that callers tolerate a late reschedule.
  const remaining = vetoBudgetMs - semanticVetoElapsedMs;
  const boundedMs = Math.min(turnDetection.semanticVetoRecheckMs, remaining);
  // A NaN recheck interval would make the elapsed counter NaN and the cap
  // above unreachable, deferring forever; use the default interval instead.
  const extendMs = Math.max(1, Number.isNaN(boundedMs) ? Math.min(DEFAULT_SEMANTIC_VETO_RECHECK_MS, remaining) : boundedMs);
  semanticVetoElapsedMs += extendMs;
  scheduleTurnCommit(extendMs, reason);
  return true;
};
4481
/**
 * Runs a timer-driven commit: commits the turn through `api.commitTurn`
 * unless `shouldDeferSilenceCommit` has rescheduled it.
 *
 * @param reason Commit reason forwarded to both calls.
 */
const runScheduledCommit = async (reason) => {
  const deferred = await shouldDeferSilenceCommit(reason);
  if (!deferred) {
    await api.commitTurn(reason);
  }
};
4430
4487
  const requestTurnCommit = async (reason) => {
4431
4488
  const session = await readSession();
4432
4489
  const text = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
@@ -4794,6 +4851,27 @@ var createVoiceSession = (options) => {
4794
4851
  }
4795
4852
  };
4796
4853
  const handleClose = async (event) => {
4854
+ const session = await readSession();
4855
+ const callLive = session.status !== "completed" && session.status !== "failed";
4856
+ if (callLive && (options.stt || options.realtime)) {
4857
+ const now = Date.now();
4858
+ sttReconnectCount = now - lastSttReconnectAt < STT_RECONNECT_FLAP_WINDOW_MS ? sttReconnectCount + 1 : 1;
4859
+ lastSttReconnectAt = now;
4860
+ if (sttReconnectCount <= MAX_STT_RECONNECTS_IN_FLAP_WINDOW) {
4861
+ await appendTrace({
4862
+ payload: {
4863
+ action: "stt-reconnect",
4864
+ attempt: sttReconnectCount,
4865
+ reason: event.reason ?? "stt stream closed",
4866
+ recoverable: event.recoverable
4867
+ },
4868
+ session,
4869
+ type: "session.error"
4870
+ });
4871
+ await closeAdapter(event.reason ?? "stt stream closed; reconnecting");
4872
+ return;
4873
+ }
4874
+ }
4797
4875
  if (event.recoverable === false) {
4798
4876
  await failInternal(new Error(event.reason ?? "Speech-to-text session closed"));
4799
4877
  return;
@@ -5118,6 +5196,7 @@ var createVoiceSession = (options) => {
5118
5196
  });
5119
5197
  };
5120
5198
  const handleFinal = async (transcript) => {
5199
+ sttReconnectCount = 0;
5121
5200
  const session = await writeSession((session2) => {
5122
5201
  const alreadyPresent = session2.currentTurn.transcripts.some((existing) => existing.id === transcript.id);
5123
5202
  if (!alreadyPresent) {
@@ -5138,6 +5217,7 @@ var createVoiceSession = (options) => {
5138
5217
  session2.lastActivityAt = Date.now();
5139
5218
  session2.status = "active";
5140
5219
  });
5220
+ semanticVetoElapsedMs = 0;
5141
5221
  if (silenceTimer && pendingCommitReason === "vendor") {
5142
5222
  scheduleTurnCommit(getVendorCommitDelayMs(), "vendor");
5143
5223
  }
@@ -5841,6 +5921,7 @@ var createVoiceSession = (options) => {
5841
5921
  };
5842
5922
  const commitTurnInternal = async (reason = "manual") => {
5843
5923
  clearSilenceTimer();
5924
+ semanticVetoElapsedMs = 0;
5844
5925
  backchannelDriver?.reset();
5845
5926
  amdLastTurnCommitAt = Date.now();
5846
5927
  const session = await readSession();
@@ -42388,9 +42469,12 @@ var createVoiceConfiguration = (configuration) => configuration;
42388
42469
  var DEFAULT_SPEECH_THRESHOLD2 = 0.015;
42389
42470
  var DEFAULT_SILENCE_MS2 = 700;
42390
42471
  var DEFAULT_TRANSCRIPT_STABILITY_MS2 = 200;
42472
+ var DEFAULT_SEMANTIC_VETO_RECHECK_MS2 = 1200;
42391
42473
  var resolveTurnDetection = (input) => ({
42392
42474
  profile: input?.profile ?? "balanced",
42393
42475
  qualityProfile: input?.qualityProfile ?? "general",
42476
+ semanticVetoMaxMs: input?.semanticVetoMaxMs ?? 0,
42477
+ semanticVetoRecheckMs: input?.semanticVetoRecheckMs ?? DEFAULT_SEMANTIC_VETO_RECHECK_MS2,
42394
42478
  silenceMs: input?.silenceMs ?? DEFAULT_SILENCE_MS2,
42395
42479
  speechThreshold: input?.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD2,
42396
42480
  transcriptStabilityMs: input?.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS2
@@ -12243,22 +12243,146 @@ var resolveAudioConditioningConfig = (config) => {
12243
12243
  };
12244
12244
  };
12245
12245
 
12246
+ // src/core/turnDetection.ts
12247
// Default silence window in milliseconds.
// NOTE(review): no consumer is visible in this hunk - confirm where it is applied.
+ var DEFAULT_SILENCE_MS = 700;
12248
// Default speech threshold.
// NOTE(review): presumably compared against the 0..1 RMS value from measureAudioLevel - confirm at the call site.
+ var DEFAULT_SPEECH_THRESHOLD = 0.015;
12249
// Default semantic-veto recheck interval in milliseconds; used by every turn profile preset.
+ var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
12250
/**
 * Exposes raw audio as a byte view without copying it.
 *
 * @param audio An ArrayBuffer, or any view exposing `buffer`, `byteOffset`
 *   and `byteLength` (typed arrays, DataView, Node Buffer).
 * @returns A Uint8Array over the same underlying memory.
 */
var toUint8Array = (audio) => {
  if (audio instanceof ArrayBuffer) {
    return new Uint8Array(audio);
  }
  return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
};
/**
 * Computes the root-mean-square level of 16-bit signed PCM audio.
 *
 * Each sample is divided by 32768 before squaring, so the result lies in
 * the range 0..1. A trailing odd byte is ignored.
 *
 * @param audio An ArrayBuffer or a view over one (see toUint8Array).
 * @returns The RMS level, or 0 when the input holds less than one sample.
 */
var measureAudioLevel = (audio) => {
  const bytes = toUint8Array(audio);
  const sampleCount = Math.floor(bytes.byteLength / 2);
  if (sampleCount === 0) {
    return 0;
  }
  // A DataView has no alignment requirement. Building an Int16Array on the
  // same memory throws a RangeError whenever byteOffset is odd, which happens
  // for views sliced out of a larger (pooled) buffer.
  const view = new DataView(bytes.buffer, bytes.byteOffset, sampleCount * 2);
  let sumSquares = 0;
  for (let index = 0; index < sampleCount; index += 1) {
    // Little-endian read: identical to what the previous Int16Array view
    // produced on little-endian hosts.
    const normalized = view.getInt16(index * 2, true) / 32768;
    sumSquares += normalized * normalized;
  }
  return Math.sqrt(sumSquares / sampleCount);
};
12272
/**
 * Strips leading/trailing whitespace and collapses every internal run of
 * whitespace into a single space.
 */
var normalizeText = (value) => {
  const trimmed = value.trim();
  return trimmed.split(/\s+/).join(" ");
};
12273
/**
 * Counts the space-separated tokens in a string; an empty string has none.
 * Consecutive spaces yield empty tokens, so the input is expected to be
 * normalized first.
 */
var countWords = (value) => {
  if (!(value.length > 0)) {
    return 0;
  }
  return value.split(" ").length;
};
12274
/**
 * Chooses between two competing transcript texts for the same speech.
 *
 * Both inputs are normalized first. An empty side loses; a text that
 * contains the other wins; otherwise the text with more words wins, with
 * character length as the tie-breaker. The current text is kept when
 * nothing favours the next one.
 *
 * @param currentText The text held so far.
 * @param nextText The newly received text.
 * @returns The normalized text that should be kept.
 */
var selectPreferredTranscriptText = (currentText, nextText) => {
  const existing = normalizeText(currentText);
  const candidate = normalizeText(nextText);
  if (!existing || !candidate) {
    return existing || candidate;
  }
  // includes() also covers the case where both texts are equal.
  if (existing.includes(candidate)) {
    return existing;
  }
  if (candidate.includes(existing)) {
    return candidate;
  }
  const existingWords = countWords(existing);
  const candidateWords = countWords(candidate);
  const candidateWins =
    candidateWords > existingWords ||
    (candidateWords === existingWords && candidate.length > existing.length);
  return candidateWins ? candidate : existing;
};
12297
/**
 * Appends one transcript text to another, dropping the longest run of words
 * that ends the first text and also begins the second.
 *
 * @param currentText The earlier text.
 * @param nextText The text that follows it.
 * @returns The normalized, space-joined combination.
 */
var mergeSequentialTranscriptText = (currentText, nextText) => {
  const head = normalizeText(currentText);
  const tail = normalizeText(nextText);
  if (!head) {
    return tail;
  }
  if (!tail) {
    return head;
  }
  const headWords = head.split(" ");
  const tailWords = tail.split(" ");
  // Try the largest possible overlap first and shrink until one matches.
  let overlap = Math.min(headWords.length, tailWords.length);
  while (overlap > 0) {
    const offset = headWords.length - overlap;
    const matches = tailWords
      .slice(0, overlap)
      .every((word, index) => word === headWords[offset + index]);
    if (matches) {
      return headWords.concat(tailWords.slice(overlap)).join(" ");
    }
    overlap -= 1;
  }
  return `${head} ${tail}`.trim();
};
12318
/**
 * Counts how many leading words two texts share after normalization.
 *
 * @param currentText The first text.
 * @param nextText The second text.
 * @returns The number of identical words at the start of both texts.
 */
var countCommonPrefixWords = (currentText, nextText) => {
  const toWords = (text) => normalizeText(text).split(" ").filter(Boolean);
  const left = toWords(currentText);
  const right = toWords(nextText);
  // The shared prefix ends at the first differing position, or where the
  // shorter of the two lists runs out.
  const firstMismatch = left.findIndex(
    (word, index) => index >= right.length || word !== right[index]
  );
  return firstMismatch === -1 ? left.length : firstMismatch;
};
12331
/**
 * Joins the `text` of consecutive transcripts into one string.
 *
 * Blank texts are skipped. A text already contained in the previously kept
 * segment is dropped, and a text that contains the previously kept segment
 * replaces it.
 *
 * @param transcripts Iterable of objects carrying a `text` string.
 * @returns The kept segments joined with single spaces.
 */
var mergeTranscriptTexts = (transcripts) => {
  const segments = [];
  for (const transcript of transcripts) {
    const candidate = normalizeText(transcript.text);
    if (!candidate) {
      continue;
    }
    const lastIndex = segments.length - 1;
    const last = segments[lastIndex];
    if (last === undefined) {
      segments.push(candidate);
    } else if (last.includes(candidate)) {
      // Repeat or fragment of what is already kept: nothing to add.
    } else if (candidate.includes(last)) {
      segments[lastIndex] = candidate;
    } else {
      segments.push(candidate);
    }
  }
  return segments.join(" ").trim();
};
12354
/**
 * Builds the text of the current turn from final transcripts plus the
 * in-flight partial text.
 *
 * The partial is appended to the finals when it started at least 250 ms
 * after the last final with a numeric `endedAtMs` and shares no leading
 * words with the merged finals. Otherwise the preferred one of the two
 * texts is returned.
 *
 * @param transcripts Final transcripts (`text`, optional `endedAtMs`).
 * @param partialText Current partial transcript text.
 * @param options Optional `partialStartedAtMs` timestamp of the partial.
 * @returns The normalized turn text.
 */
var buildTurnText = (transcripts, partialText, options = {}) => {
  const finalText = mergeTranscriptTexts(transcripts);
  const nextPartial = normalizeText(partialText);
  // Walk backwards to the most recent final that carries an end timestamp.
  const finals = [...transcripts];
  let lastFinalEndedAtMs;
  for (let index = finals.length - 1; index >= 0; index -= 1) {
    const endedAtMs = finals[index].endedAtMs;
    if (typeof endedAtMs === "number") {
      lastFinalEndedAtMs = endedAtMs;
      break;
    }
  }
  const startsAfterGap = () =>
    typeof lastFinalEndedAtMs === "number" &&
    typeof options.partialStartedAtMs === "number" &&
    options.partialStartedAtMs - lastFinalEndedAtMs >= 250;
  const isContinuation =
    Boolean(finalText) &&
    Boolean(nextPartial) &&
    startsAfterGap() &&
    countCommonPrefixWords(finalText, nextPartial) === 0;
  return isContinuation
    ? mergeSequentialTranscriptText(finalText, nextPartial)
    : selectPreferredTranscriptText(finalText, nextPartial);
};
12363
+
12246
12364
  // src/core/turnProfiles.ts
12247
12365
  var TURN_PROFILE_DEFAULTS = {
12248
12366
  balanced: {
12249
12367
  qualityProfile: "general",
12368
+ semanticVetoMaxMs: 0,
12369
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
12250
12370
  silenceMs: 1400,
12251
12371
  speechThreshold: 0.012,
12252
12372
  transcriptStabilityMs: 1000
12253
12373
  },
12254
12374
  fast: {
12255
12375
  qualityProfile: "general",
12376
+ semanticVetoMaxMs: 0,
12377
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
12256
12378
  silenceMs: 700,
12257
12379
  speechThreshold: 0.015,
12258
12380
  transcriptStabilityMs: 450
12259
12381
  },
12260
12382
  "long-form": {
12261
12383
  qualityProfile: "general",
12384
+ semanticVetoMaxMs: 0,
12385
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
12262
12386
  silenceMs: 2200,
12263
12387
  speechThreshold: 0.01,
12264
12388
  transcriptStabilityMs: 1500
@@ -12292,6 +12416,8 @@ var resolveTurnDetectionConfig = (config) => {
12292
12416
  return {
12293
12417
  profile,
12294
12418
  qualityProfile,
12419
+ semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
12420
+ semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
12295
12421
  silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
12296
12422
  speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
12297
12423
  transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs