@absolutejs/voice 0.0.22-beta.598 → 0.0.22-beta.599

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -391,146 +391,25 @@ var resolveLogger = (logger) => ({
391
391
  ...logger
392
392
  });
393
393
 
394
- // src/core/turnDetection.ts
395
- var DEFAULT_SILENCE_MS = 700;
396
- var DEFAULT_SPEECH_THRESHOLD = 0.015;
397
- var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
398
- var toUint8Array = (audio) => {
399
- if (audio instanceof ArrayBuffer) {
400
- return new Uint8Array(audio);
401
- }
402
- return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
403
- };
404
- var measureAudioLevel = (audio) => {
405
- const bytes = toUint8Array(audio);
406
- if (bytes.byteLength < 2) {
407
- return 0;
408
- }
409
- const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
410
- if (samples.length === 0) {
411
- return 0;
412
- }
413
- let sumSquares = 0;
414
- for (const sample of samples) {
415
- const normalized = sample / 32768;
416
- sumSquares += normalized * normalized;
417
- }
418
- return Math.sqrt(sumSquares / samples.length);
419
- };
420
- var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
421
- var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
422
- var selectPreferredTranscriptText = (currentText, nextText) => {
423
- const current = normalizeText(currentText);
424
- const next = normalizeText(nextText);
425
- if (!current) {
426
- return next;
427
- }
428
- if (!next) {
429
- return current;
430
- }
431
- if (current === next || current.includes(next)) {
432
- return current;
433
- }
434
- if (next.includes(current)) {
435
- return next;
436
- }
437
- if (countWords(next) > countWords(current)) {
438
- return next;
439
- }
440
- if (countWords(next) === countWords(current) && next.length > current.length) {
441
- return next;
442
- }
443
- return current;
444
- };
445
- var mergeSequentialTranscriptText = (currentText, nextText) => {
446
- const current = normalizeText(currentText);
447
- const next = normalizeText(nextText);
448
- if (!current) {
449
- return next;
450
- }
451
- if (!next) {
452
- return current;
453
- }
454
- const currentWords = current.split(" ");
455
- const nextWords = next.split(" ");
456
- const maxOverlap = Math.min(currentWords.length, nextWords.length);
457
- for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
458
- const currentSuffix = currentWords.slice(-overlap).join(" ");
459
- const nextPrefix = nextWords.slice(0, overlap).join(" ");
460
- if (currentSuffix === nextPrefix) {
461
- return [...currentWords, ...nextWords.slice(overlap)].join(" ");
462
- }
463
- }
464
- return `${current} ${next}`.trim();
465
- };
466
- var countCommonPrefixWords = (currentText, nextText) => {
467
- const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
468
- const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
469
- const maxWords = Math.min(currentWords.length, nextWords.length);
470
- let count = 0;
471
- for (let index = 0;index < maxWords; index += 1) {
472
- if (currentWords[index] !== nextWords[index]) {
473
- break;
474
- }
475
- count += 1;
476
- }
477
- return count;
478
- };
479
- var mergeTranscriptTexts = (transcripts) => {
480
- const merged = [];
481
- for (const transcript of transcripts) {
482
- const nextText = normalizeText(transcript.text);
483
- if (!nextText) {
484
- continue;
485
- }
486
- const previous = merged.at(-1);
487
- if (!previous) {
488
- merged.push(nextText);
489
- continue;
490
- }
491
- if (nextText === previous || previous.includes(nextText)) {
492
- continue;
493
- }
494
- if (nextText.includes(previous)) {
495
- merged[merged.length - 1] = nextText;
496
- continue;
497
- }
498
- merged.push(nextText);
499
- }
500
- return merged.join(" ").trim();
501
- };
502
- var buildTurnText = (transcripts, partialText, options = {}) => {
503
- const finalText = mergeTranscriptTexts(transcripts);
504
- const nextPartial = normalizeText(partialText);
505
- const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
506
- if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
507
- return mergeSequentialTranscriptText(finalText, nextPartial);
508
- }
509
- return selectPreferredTranscriptText(finalText, nextPartial);
510
- };
511
-
512
394
  // src/core/turnProfiles.ts
513
395
  var TURN_PROFILE_DEFAULTS = {
514
396
  balanced: {
515
397
  qualityProfile: "general",
516
- semanticVetoMaxMs: 0,
517
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
398
+ minSilenceMs: 400,
518
399
  silenceMs: 1400,
519
400
  speechThreshold: 0.012,
520
401
  transcriptStabilityMs: 1000
521
402
  },
522
403
  fast: {
523
404
  qualityProfile: "general",
524
- semanticVetoMaxMs: 0,
525
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
405
+ minSilenceMs: 300,
526
406
  silenceMs: 700,
527
407
  speechThreshold: 0.015,
528
408
  transcriptStabilityMs: 450
529
409
  },
530
410
  "long-form": {
531
411
  qualityProfile: "general",
532
- semanticVetoMaxMs: 0,
533
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
412
+ minSilenceMs: 600,
534
413
  silenceMs: 2200,
535
414
  speechThreshold: 0.01,
536
415
  transcriptStabilityMs: 1500
@@ -561,12 +440,12 @@ var resolveTurnDetectionConfig = (config) => {
561
440
  const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
562
441
  const preset = TURN_PROFILE_DEFAULTS[profile];
563
442
  const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
443
+ const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
564
444
  return {
565
445
  profile,
566
446
  qualityProfile,
567
- semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
568
- semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
569
- silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
447
+ minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
448
+ silenceMs,
570
449
  speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
571
450
  transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
572
451
  };
@@ -3580,6 +3459,124 @@ var createVoiceTwilioRedirectHandoffAdapter = (options) => ({
3580
3459
  }
3581
3460
  });
3582
3461
 
3462
+ // src/core/turnDetection.ts
3463
+ var DEFAULT_SILENCE_MS = 700;
3464
+ var DEFAULT_SPEECH_THRESHOLD = 0.015;
3465
+ var DEFAULT_MIN_SILENCE_MS = 400;
3466
+ var toUint8Array = (audio) => {
3467
+ if (audio instanceof ArrayBuffer) {
3468
+ return new Uint8Array(audio);
3469
+ }
3470
+ return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
3471
+ };
3472
+ var measureAudioLevel = (audio) => {
3473
+ const bytes = toUint8Array(audio);
3474
+ if (bytes.byteLength < 2) {
3475
+ return 0;
3476
+ }
3477
+ const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
3478
+ if (samples.length === 0) {
3479
+ return 0;
3480
+ }
3481
+ let sumSquares = 0;
3482
+ for (const sample of samples) {
3483
+ const normalized = sample / 32768;
3484
+ sumSquares += normalized * normalized;
3485
+ }
3486
+ return Math.sqrt(sumSquares / samples.length);
3487
+ };
3488
+ var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
3489
+ var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
3490
+ var selectPreferredTranscriptText = (currentText, nextText) => {
3491
+ const current = normalizeText(currentText);
3492
+ const next = normalizeText(nextText);
3493
+ if (!current) {
3494
+ return next;
3495
+ }
3496
+ if (!next) {
3497
+ return current;
3498
+ }
3499
+ if (current === next || current.includes(next)) {
3500
+ return current;
3501
+ }
3502
+ if (next.includes(current)) {
3503
+ return next;
3504
+ }
3505
+ if (countWords(next) > countWords(current)) {
3506
+ return next;
3507
+ }
3508
+ if (countWords(next) === countWords(current) && next.length > current.length) {
3509
+ return next;
3510
+ }
3511
+ return current;
3512
+ };
3513
+ var mergeSequentialTranscriptText = (currentText, nextText) => {
3514
+ const current = normalizeText(currentText);
3515
+ const next = normalizeText(nextText);
3516
+ if (!current) {
3517
+ return next;
3518
+ }
3519
+ if (!next) {
3520
+ return current;
3521
+ }
3522
+ const currentWords = current.split(" ");
3523
+ const nextWords = next.split(" ");
3524
+ const maxOverlap = Math.min(currentWords.length, nextWords.length);
3525
+ for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
3526
+ const currentSuffix = currentWords.slice(-overlap).join(" ");
3527
+ const nextPrefix = nextWords.slice(0, overlap).join(" ");
3528
+ if (currentSuffix === nextPrefix) {
3529
+ return [...currentWords, ...nextWords.slice(overlap)].join(" ");
3530
+ }
3531
+ }
3532
+ return `${current} ${next}`.trim();
3533
+ };
3534
+ var countCommonPrefixWords = (currentText, nextText) => {
3535
+ const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
3536
+ const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
3537
+ const maxWords = Math.min(currentWords.length, nextWords.length);
3538
+ let count = 0;
3539
+ for (let index = 0;index < maxWords; index += 1) {
3540
+ if (currentWords[index] !== nextWords[index]) {
3541
+ break;
3542
+ }
3543
+ count += 1;
3544
+ }
3545
+ return count;
3546
+ };
3547
+ var mergeTranscriptTexts = (transcripts) => {
3548
+ const merged = [];
3549
+ for (const transcript of transcripts) {
3550
+ const nextText = normalizeText(transcript.text);
3551
+ if (!nextText) {
3552
+ continue;
3553
+ }
3554
+ const previous = merged.at(-1);
3555
+ if (!previous) {
3556
+ merged.push(nextText);
3557
+ continue;
3558
+ }
3559
+ if (nextText === previous || previous.includes(nextText)) {
3560
+ continue;
3561
+ }
3562
+ if (nextText.includes(previous)) {
3563
+ merged[merged.length - 1] = nextText;
3564
+ continue;
3565
+ }
3566
+ merged.push(nextText);
3567
+ }
3568
+ return merged.join(" ").trim();
3569
+ };
3570
+ var buildTurnText = (transcripts, partialText, options = {}) => {
3571
+ const finalText = mergeTranscriptTexts(transcripts);
3572
+ const nextPartial = normalizeText(partialText);
3573
+ const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
3574
+ if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
3575
+ return mergeSequentialTranscriptText(finalText, nextPartial);
3576
+ }
3577
+ return selectPreferredTranscriptText(finalText, nextPartial);
3578
+ };
3579
+
3583
3580
  // src/core/types.ts
3584
3581
  var ttsAdapterSessionCanCancel = (session) => typeof session.cancel === "function";
3585
3582
 
@@ -3926,14 +3923,22 @@ var createVoiceSession = (options) => {
3926
3923
  strategy: options.reconnect.strategy ?? "resume-last-turn",
3927
3924
  timeout: options.reconnect.timeout ?? DEFAULT_RECONNECT_TIMEOUT
3928
3925
  };
3926
+ const resolvedSilenceMs = options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS;
3929
3927
  const turnDetection = {
3930
- silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
3928
+ silenceMs: resolvedSilenceMs,
3929
+ minSilenceMs: Math.min(resolvedSilenceMs, options.turnDetection.minSilenceMs ?? DEFAULT_MIN_SILENCE_MS),
3931
3930
  speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
3932
- transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS,
3933
- semanticVetoMaxMs: options.turnDetection.semanticVetoMaxMs ?? 0,
3934
- semanticVetoRecheckMs: options.turnDetection.semanticVetoRecheckMs ?? DEFAULT_SEMANTIC_VETO_RECHECK_MS
3931
+ transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
3932
+ };
3933
+ let lastTurnCompleteConfidence = null;
3934
+ const adaptiveSilenceMs = () => {
3935
+ const { minSilenceMs, silenceMs } = turnDetection;
3936
+ if (lastTurnCompleteConfidence === null || silenceMs <= minSilenceMs) {
3937
+ return silenceMs;
3938
+ }
3939
+ const complete = Math.max(0, Math.min(1, lastTurnCompleteConfidence));
3940
+ return Math.round(minSilenceMs + (silenceMs - minSilenceMs) * (1 - complete));
3935
3941
  };
3936
- let semanticVetoElapsedMs = 0;
3937
3942
  const sttFallback = options.sttFallback ? {
3938
3943
  adapter: options.sttFallback.adapter,
3939
3944
  completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
@@ -4466,47 +4471,8 @@ var createVoiceSession = (options) => {
4466
4471
  runScheduledCommit(reason);
4467
4472
  }, delayMs);
4468
4473
  };
4469
- const scheduleSilenceCommit = (delayMs = turnDetection.silenceMs, reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
4470
- const shouldDeferSilenceCommit = async (reason) => {
4471
- if (reason !== "silence" || turnDetection.semanticVetoMaxMs <= 0 || !options.semanticTurnDetector || semanticVetoElapsedMs >= turnDetection.semanticVetoMaxMs) {
4472
- return false;
4473
- }
4474
- const session = await readSession();
4475
- const { partialText, transcripts } = session.currentTurn;
4476
- const userText = buildTurnText(transcripts, partialText, {
4477
- partialEndedAtMs: session.currentTurn.partialEndedAt,
4478
- partialStartedAtMs: session.currentTurn.partialStartedAt
4479
- });
4480
- if (!userText) {
4481
- return false;
4482
- }
4483
- const silenceMs = session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : turnDetection.silenceMs;
4484
- let endOfTurn = true;
4485
- try {
4486
- const verdict = await Promise.resolve(options.semanticTurnDetector.evaluate({
4487
- lastFinalTranscript: transcripts.at(-1),
4488
- partialText,
4489
- silenceMs,
4490
- transcripts,
4491
- ...getTurnAudioForDetector()
4492
- }));
4493
- endOfTurn = verdict.endOfTurn;
4494
- } catch {
4495
- return false;
4496
- }
4497
- if (endOfTurn !== false) {
4498
- return false;
4499
- }
4500
- const remaining = turnDetection.semanticVetoMaxMs - semanticVetoElapsedMs;
4501
- const extendMs = Math.max(1, Math.min(turnDetection.semanticVetoRecheckMs, remaining));
4502
- semanticVetoElapsedMs += extendMs;
4503
- scheduleTurnCommit(extendMs, reason);
4504
- return true;
4505
- };
4474
+ const scheduleSilenceCommit = (delayMs = adaptiveSilenceMs(), reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
4506
4475
  const runScheduledCommit = async (reason) => {
4507
- if (await shouldDeferSilenceCommit(reason)) {
4508
- return;
4509
- }
4510
4476
  await api.commitTurn(reason);
4511
4477
  };
4512
4478
  const requestTurnCommit = async (reason) => {
@@ -5246,7 +5212,7 @@ var createVoiceSession = (options) => {
5246
5212
  session2.lastActivityAt = Date.now();
5247
5213
  session2.status = "active";
5248
5214
  });
5249
- semanticVetoElapsedMs = 0;
5215
+ lastTurnCompleteConfidence = null;
5250
5216
  if (silenceTimer && pendingCommitReason === "vendor") {
5251
5217
  scheduleTurnCommit(getVendorCommitDelayMs(), "vendor");
5252
5218
  }
@@ -5276,6 +5242,12 @@ var createVoiceSession = (options) => {
5276
5242
  transcripts: session.currentTurn.transcripts,
5277
5243
  ...getTurnAudioForDetector()
5278
5244
  }));
5245
+ if (typeof verdict.confidence === "number") {
5246
+ lastTurnCompleteConfidence = verdict.confidence;
5247
+ if (silenceTimer && pendingCommitReason === "silence") {
5248
+ scheduleSilenceCommit();
5249
+ }
5250
+ }
5279
5251
  if (verdict.endOfTurn) {
5280
5252
  clearSilenceTimer();
5281
5253
  await requestTurnCommit("vendor");
@@ -5971,7 +5943,7 @@ var createVoiceSession = (options) => {
5971
5943
  };
5972
5944
  const commitTurnInternal = async (reason = "manual") => {
5973
5945
  clearSilenceTimer();
5974
- semanticVetoElapsedMs = 0;
5946
+ lastTurnCompleteConfidence = null;
5975
5947
  backchannelDriver?.reset();
5976
5948
  amdLastTurnCommitAt = Date.now();
5977
5949
  const session = await readSession();
@@ -42574,16 +42546,18 @@ var createVoiceConfiguration = (configuration) => configuration;
42574
42546
  var DEFAULT_SPEECH_THRESHOLD2 = 0.015;
42575
42547
  var DEFAULT_SILENCE_MS2 = 700;
42576
42548
  var DEFAULT_TRANSCRIPT_STABILITY_MS2 = 200;
42577
- var DEFAULT_SEMANTIC_VETO_RECHECK_MS2 = 1200;
42578
- var resolveTurnDetection = (input) => ({
42579
- profile: input?.profile ?? "balanced",
42580
- qualityProfile: input?.qualityProfile ?? "general",
42581
- semanticVetoMaxMs: input?.semanticVetoMaxMs ?? 0,
42582
- semanticVetoRecheckMs: input?.semanticVetoRecheckMs ?? DEFAULT_SEMANTIC_VETO_RECHECK_MS2,
42583
- silenceMs: input?.silenceMs ?? DEFAULT_SILENCE_MS2,
42584
- speechThreshold: input?.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD2,
42585
- transcriptStabilityMs: input?.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS2
42586
- });
42549
+ var DEFAULT_MIN_SILENCE_MS2 = 400;
42550
+ var resolveTurnDetection = (input) => {
42551
+ const silenceMs = input?.silenceMs ?? DEFAULT_SILENCE_MS2;
42552
+ return {
42553
+ profile: input?.profile ?? "balanced",
42554
+ qualityProfile: input?.qualityProfile ?? "general",
42555
+ minSilenceMs: Math.min(silenceMs, input?.minSilenceMs ?? DEFAULT_MIN_SILENCE_MS2),
42556
+ silenceMs,
42557
+ speechThreshold: input?.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD2,
42558
+ transcriptStabilityMs: input?.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS2
42559
+ };
42560
+ };
42587
42561
  var resolveReconnect = (input) => ({
42588
42562
  maxAttempts: input?.maxAttempts ?? 3,
42589
42563
  strategy: input?.strategy ?? "resume-last-turn",
@@ -12272,146 +12272,25 @@ var resolveAudioConditioningConfig = (config) => {
12272
12272
  };
12273
12273
  };
12274
12274
 
12275
- // src/core/turnDetection.ts
12276
- var DEFAULT_SILENCE_MS = 700;
12277
- var DEFAULT_SPEECH_THRESHOLD = 0.015;
12278
- var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
12279
- var toUint8Array = (audio) => {
12280
- if (audio instanceof ArrayBuffer) {
12281
- return new Uint8Array(audio);
12282
- }
12283
- return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
12284
- };
12285
- var measureAudioLevel = (audio) => {
12286
- const bytes = toUint8Array(audio);
12287
- if (bytes.byteLength < 2) {
12288
- return 0;
12289
- }
12290
- const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
12291
- if (samples.length === 0) {
12292
- return 0;
12293
- }
12294
- let sumSquares = 0;
12295
- for (const sample of samples) {
12296
- const normalized = sample / 32768;
12297
- sumSquares += normalized * normalized;
12298
- }
12299
- return Math.sqrt(sumSquares / samples.length);
12300
- };
12301
- var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
12302
- var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
12303
- var selectPreferredTranscriptText = (currentText, nextText) => {
12304
- const current = normalizeText(currentText);
12305
- const next = normalizeText(nextText);
12306
- if (!current) {
12307
- return next;
12308
- }
12309
- if (!next) {
12310
- return current;
12311
- }
12312
- if (current === next || current.includes(next)) {
12313
- return current;
12314
- }
12315
- if (next.includes(current)) {
12316
- return next;
12317
- }
12318
- if (countWords(next) > countWords(current)) {
12319
- return next;
12320
- }
12321
- if (countWords(next) === countWords(current) && next.length > current.length) {
12322
- return next;
12323
- }
12324
- return current;
12325
- };
12326
- var mergeSequentialTranscriptText = (currentText, nextText) => {
12327
- const current = normalizeText(currentText);
12328
- const next = normalizeText(nextText);
12329
- if (!current) {
12330
- return next;
12331
- }
12332
- if (!next) {
12333
- return current;
12334
- }
12335
- const currentWords = current.split(" ");
12336
- const nextWords = next.split(" ");
12337
- const maxOverlap = Math.min(currentWords.length, nextWords.length);
12338
- for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
12339
- const currentSuffix = currentWords.slice(-overlap).join(" ");
12340
- const nextPrefix = nextWords.slice(0, overlap).join(" ");
12341
- if (currentSuffix === nextPrefix) {
12342
- return [...currentWords, ...nextWords.slice(overlap)].join(" ");
12343
- }
12344
- }
12345
- return `${current} ${next}`.trim();
12346
- };
12347
- var countCommonPrefixWords = (currentText, nextText) => {
12348
- const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
12349
- const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
12350
- const maxWords = Math.min(currentWords.length, nextWords.length);
12351
- let count = 0;
12352
- for (let index = 0;index < maxWords; index += 1) {
12353
- if (currentWords[index] !== nextWords[index]) {
12354
- break;
12355
- }
12356
- count += 1;
12357
- }
12358
- return count;
12359
- };
12360
- var mergeTranscriptTexts = (transcripts) => {
12361
- const merged = [];
12362
- for (const transcript of transcripts) {
12363
- const nextText = normalizeText(transcript.text);
12364
- if (!nextText) {
12365
- continue;
12366
- }
12367
- const previous = merged.at(-1);
12368
- if (!previous) {
12369
- merged.push(nextText);
12370
- continue;
12371
- }
12372
- if (nextText === previous || previous.includes(nextText)) {
12373
- continue;
12374
- }
12375
- if (nextText.includes(previous)) {
12376
- merged[merged.length - 1] = nextText;
12377
- continue;
12378
- }
12379
- merged.push(nextText);
12380
- }
12381
- return merged.join(" ").trim();
12382
- };
12383
- var buildTurnText = (transcripts, partialText, options = {}) => {
12384
- const finalText = mergeTranscriptTexts(transcripts);
12385
- const nextPartial = normalizeText(partialText);
12386
- const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
12387
- if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
12388
- return mergeSequentialTranscriptText(finalText, nextPartial);
12389
- }
12390
- return selectPreferredTranscriptText(finalText, nextPartial);
12391
- };
12392
-
12393
12275
  // src/core/turnProfiles.ts
12394
12276
  var TURN_PROFILE_DEFAULTS = {
12395
12277
  balanced: {
12396
12278
  qualityProfile: "general",
12397
- semanticVetoMaxMs: 0,
12398
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
12279
+ minSilenceMs: 400,
12399
12280
  silenceMs: 1400,
12400
12281
  speechThreshold: 0.012,
12401
12282
  transcriptStabilityMs: 1000
12402
12283
  },
12403
12284
  fast: {
12404
12285
  qualityProfile: "general",
12405
- semanticVetoMaxMs: 0,
12406
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
12286
+ minSilenceMs: 300,
12407
12287
  silenceMs: 700,
12408
12288
  speechThreshold: 0.015,
12409
12289
  transcriptStabilityMs: 450
12410
12290
  },
12411
12291
  "long-form": {
12412
12292
  qualityProfile: "general",
12413
- semanticVetoMaxMs: 0,
12414
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
12293
+ minSilenceMs: 600,
12415
12294
  silenceMs: 2200,
12416
12295
  speechThreshold: 0.01,
12417
12296
  transcriptStabilityMs: 1500
@@ -12442,12 +12321,12 @@ var resolveTurnDetectionConfig = (config) => {
12442
12321
  const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
12443
12322
  const preset = TURN_PROFILE_DEFAULTS[profile];
12444
12323
  const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
12324
+ const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
12445
12325
  return {
12446
12326
  profile,
12447
12327
  qualityProfile,
12448
- semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
12449
- semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
12450
- silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
12328
+ minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
12329
+ silenceMs,
12451
12330
  speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
12452
12331
  transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
12453
12332
  };