@absolutejs/voice 0.0.22-beta.584 → 0.0.22-beta.585

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -391,22 +391,146 @@ var resolveLogger = (logger) => ({
391
391
  ...logger
392
392
  });
393
393
 
394
+ // src/core/turnDetection.ts
395
+ var DEFAULT_SILENCE_MS = 700;
396
+ var DEFAULT_SPEECH_THRESHOLD = 0.015;
397
+ var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
398
+ var toUint8Array = (audio) => {
399
+ if (audio instanceof ArrayBuffer) {
400
+ return new Uint8Array(audio);
401
+ }
402
+ return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
403
+ };
404
+ var measureAudioLevel = (audio) => {
405
+ const bytes = toUint8Array(audio);
406
+ if (bytes.byteLength < 2) {
407
+ return 0;
408
+ }
409
+ const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
410
+ if (samples.length === 0) {
411
+ return 0;
412
+ }
413
+ let sumSquares = 0;
414
+ for (const sample of samples) {
415
+ const normalized = sample / 32768;
416
+ sumSquares += normalized * normalized;
417
+ }
418
+ return Math.sqrt(sumSquares / samples.length);
419
+ };
420
+ var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
421
+ var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
422
+ var selectPreferredTranscriptText = (currentText, nextText) => {
423
+ const current = normalizeText(currentText);
424
+ const next = normalizeText(nextText);
425
+ if (!current) {
426
+ return next;
427
+ }
428
+ if (!next) {
429
+ return current;
430
+ }
431
+ if (current === next || current.includes(next)) {
432
+ return current;
433
+ }
434
+ if (next.includes(current)) {
435
+ return next;
436
+ }
437
+ if (countWords(next) > countWords(current)) {
438
+ return next;
439
+ }
440
+ if (countWords(next) === countWords(current) && next.length > current.length) {
441
+ return next;
442
+ }
443
+ return current;
444
+ };
445
+ var mergeSequentialTranscriptText = (currentText, nextText) => {
446
+ const current = normalizeText(currentText);
447
+ const next = normalizeText(nextText);
448
+ if (!current) {
449
+ return next;
450
+ }
451
+ if (!next) {
452
+ return current;
453
+ }
454
+ const currentWords = current.split(" ");
455
+ const nextWords = next.split(" ");
456
+ const maxOverlap = Math.min(currentWords.length, nextWords.length);
457
+ for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
458
+ const currentSuffix = currentWords.slice(-overlap).join(" ");
459
+ const nextPrefix = nextWords.slice(0, overlap).join(" ");
460
+ if (currentSuffix === nextPrefix) {
461
+ return [...currentWords, ...nextWords.slice(overlap)].join(" ");
462
+ }
463
+ }
464
+ return `${current} ${next}`.trim();
465
+ };
466
+ var countCommonPrefixWords = (currentText, nextText) => {
467
+ const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
468
+ const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
469
+ const maxWords = Math.min(currentWords.length, nextWords.length);
470
+ let count = 0;
471
+ for (let index = 0;index < maxWords; index += 1) {
472
+ if (currentWords[index] !== nextWords[index]) {
473
+ break;
474
+ }
475
+ count += 1;
476
+ }
477
+ return count;
478
+ };
479
+ var mergeTranscriptTexts = (transcripts) => {
480
+ const merged = [];
481
+ for (const transcript of transcripts) {
482
+ const nextText = normalizeText(transcript.text);
483
+ if (!nextText) {
484
+ continue;
485
+ }
486
+ const previous = merged.at(-1);
487
+ if (!previous) {
488
+ merged.push(nextText);
489
+ continue;
490
+ }
491
+ if (nextText === previous || previous.includes(nextText)) {
492
+ continue;
493
+ }
494
+ if (nextText.includes(previous)) {
495
+ merged[merged.length - 1] = nextText;
496
+ continue;
497
+ }
498
+ merged.push(nextText);
499
+ }
500
+ return merged.join(" ").trim();
501
+ };
502
+ var buildTurnText = (transcripts, partialText, options = {}) => {
503
+ const finalText = mergeTranscriptTexts(transcripts);
504
+ const nextPartial = normalizeText(partialText);
505
+ const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
506
+ if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
507
+ return mergeSequentialTranscriptText(finalText, nextPartial);
508
+ }
509
+ return selectPreferredTranscriptText(finalText, nextPartial);
510
+ };
511
+
394
512
  // src/core/turnProfiles.ts
395
513
  var TURN_PROFILE_DEFAULTS = {
396
514
  balanced: {
397
515
  qualityProfile: "general",
516
+ semanticVetoMaxMs: 0,
517
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
398
518
  silenceMs: 1400,
399
519
  speechThreshold: 0.012,
400
520
  transcriptStabilityMs: 1000
401
521
  },
402
522
  fast: {
403
523
  qualityProfile: "general",
524
+ semanticVetoMaxMs: 0,
525
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
404
526
  silenceMs: 700,
405
527
  speechThreshold: 0.015,
406
528
  transcriptStabilityMs: 450
407
529
  },
408
530
  "long-form": {
409
531
  qualityProfile: "general",
532
+ semanticVetoMaxMs: 0,
533
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
410
534
  silenceMs: 2200,
411
535
  speechThreshold: 0.01,
412
536
  transcriptStabilityMs: 1500
@@ -440,6 +564,8 @@ var resolveTurnDetectionConfig = (config) => {
440
564
  return {
441
565
  profile,
442
566
  qualityProfile,
567
+ semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
568
+ semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
443
569
  silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
444
570
  speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
445
571
  transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
@@ -3454,123 +3580,6 @@ var createVoiceTwilioRedirectHandoffAdapter = (options) => ({
3454
3580
  }
3455
3581
  });
3456
3582
 
3457
- // src/core/turnDetection.ts
3458
- var DEFAULT_SILENCE_MS = 700;
3459
- var DEFAULT_SPEECH_THRESHOLD = 0.015;
3460
- var toUint8Array = (audio) => {
3461
- if (audio instanceof ArrayBuffer) {
3462
- return new Uint8Array(audio);
3463
- }
3464
- return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
3465
- };
3466
- var measureAudioLevel = (audio) => {
3467
- const bytes = toUint8Array(audio);
3468
- if (bytes.byteLength < 2) {
3469
- return 0;
3470
- }
3471
- const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
3472
- if (samples.length === 0) {
3473
- return 0;
3474
- }
3475
- let sumSquares = 0;
3476
- for (const sample of samples) {
3477
- const normalized = sample / 32768;
3478
- sumSquares += normalized * normalized;
3479
- }
3480
- return Math.sqrt(sumSquares / samples.length);
3481
- };
3482
- var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
3483
- var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
3484
- var selectPreferredTranscriptText = (currentText, nextText) => {
3485
- const current = normalizeText(currentText);
3486
- const next = normalizeText(nextText);
3487
- if (!current) {
3488
- return next;
3489
- }
3490
- if (!next) {
3491
- return current;
3492
- }
3493
- if (current === next || current.includes(next)) {
3494
- return current;
3495
- }
3496
- if (next.includes(current)) {
3497
- return next;
3498
- }
3499
- if (countWords(next) > countWords(current)) {
3500
- return next;
3501
- }
3502
- if (countWords(next) === countWords(current) && next.length > current.length) {
3503
- return next;
3504
- }
3505
- return current;
3506
- };
3507
- var mergeSequentialTranscriptText = (currentText, nextText) => {
3508
- const current = normalizeText(currentText);
3509
- const next = normalizeText(nextText);
3510
- if (!current) {
3511
- return next;
3512
- }
3513
- if (!next) {
3514
- return current;
3515
- }
3516
- const currentWords = current.split(" ");
3517
- const nextWords = next.split(" ");
3518
- const maxOverlap = Math.min(currentWords.length, nextWords.length);
3519
- for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
3520
- const currentSuffix = currentWords.slice(-overlap).join(" ");
3521
- const nextPrefix = nextWords.slice(0, overlap).join(" ");
3522
- if (currentSuffix === nextPrefix) {
3523
- return [...currentWords, ...nextWords.slice(overlap)].join(" ");
3524
- }
3525
- }
3526
- return `${current} ${next}`.trim();
3527
- };
3528
- var countCommonPrefixWords = (currentText, nextText) => {
3529
- const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
3530
- const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
3531
- const maxWords = Math.min(currentWords.length, nextWords.length);
3532
- let count = 0;
3533
- for (let index = 0;index < maxWords; index += 1) {
3534
- if (currentWords[index] !== nextWords[index]) {
3535
- break;
3536
- }
3537
- count += 1;
3538
- }
3539
- return count;
3540
- };
3541
- var mergeTranscriptTexts = (transcripts) => {
3542
- const merged = [];
3543
- for (const transcript of transcripts) {
3544
- const nextText = normalizeText(transcript.text);
3545
- if (!nextText) {
3546
- continue;
3547
- }
3548
- const previous = merged.at(-1);
3549
- if (!previous) {
3550
- merged.push(nextText);
3551
- continue;
3552
- }
3553
- if (nextText === previous || previous.includes(nextText)) {
3554
- continue;
3555
- }
3556
- if (nextText.includes(previous)) {
3557
- merged[merged.length - 1] = nextText;
3558
- continue;
3559
- }
3560
- merged.push(nextText);
3561
- }
3562
- return merged.join(" ").trim();
3563
- };
3564
- var buildTurnText = (transcripts, partialText, options = {}) => {
3565
- const finalText = mergeTranscriptTexts(transcripts);
3566
- const nextPartial = normalizeText(partialText);
3567
- const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
3568
- if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
3569
- return mergeSequentialTranscriptText(finalText, nextPartial);
3570
- }
3571
- return selectPreferredTranscriptText(finalText, nextPartial);
3572
- };
3573
-
3574
3583
  // src/core/types.ts
3575
3584
  var ttsAdapterSessionCanCancel = (session) => typeof session.cancel === "function";
3576
3585
 
@@ -3907,8 +3916,11 @@ var createVoiceSession = (options) => {
3907
3916
  const turnDetection = {
3908
3917
  silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
3909
3918
  speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
3910
- transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
3919
+ transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS,
3920
+ semanticVetoMaxMs: options.turnDetection.semanticVetoMaxMs ?? 0,
3921
+ semanticVetoRecheckMs: options.turnDetection.semanticVetoRecheckMs ?? DEFAULT_SEMANTIC_VETO_RECHECK_MS
3911
3922
  };
3923
+ let semanticVetoElapsedMs = 0;
3912
3924
  const sttFallback = options.sttFallback ? {
3913
3925
  adapter: options.sttFallback.adapter,
3914
3926
  completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
@@ -4423,10 +4435,51 @@ var createVoiceSession = (options) => {
4423
4435
  silenceTimer = setTimeout(() => {
4424
4436
  silenceTimer = null;
4425
4437
  pendingCommitReason = null;
4426
- api.commitTurn(reason);
4438
+ runScheduledCommit(reason);
4427
4439
  }, delayMs);
4428
4440
  };
4429
4441
  const scheduleSilenceCommit = (delayMs = turnDetection.silenceMs, reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
4442
+ const shouldDeferSilenceCommit = async (reason) => {
4443
+ if (reason !== "silence" || turnDetection.semanticVetoMaxMs <= 0 || !options.semanticTurnDetector || semanticVetoElapsedMs >= turnDetection.semanticVetoMaxMs) {
4444
+ return false;
4445
+ }
4446
+ const session = await readSession();
4447
+ const { partialText, transcripts } = session.currentTurn;
4448
+ const userText = buildTurnText(transcripts, partialText, {
4449
+ partialEndedAtMs: session.currentTurn.partialEndedAt,
4450
+ partialStartedAtMs: session.currentTurn.partialStartedAt
4451
+ });
4452
+ if (!userText) {
4453
+ return false;
4454
+ }
4455
+ const silenceMs = session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : turnDetection.silenceMs;
4456
+ let endOfTurn = true;
4457
+ try {
4458
+ const verdict = await Promise.resolve(options.semanticTurnDetector.evaluate({
4459
+ lastFinalTranscript: transcripts.at(-1),
4460
+ partialText,
4461
+ silenceMs,
4462
+ transcripts
4463
+ }));
4464
+ endOfTurn = verdict.endOfTurn;
4465
+ } catch {
4466
+ return false;
4467
+ }
4468
+ if (endOfTurn !== false) {
4469
+ return false;
4470
+ }
4471
+ const remaining = turnDetection.semanticVetoMaxMs - semanticVetoElapsedMs;
4472
+ const extendMs = Math.max(1, Math.min(turnDetection.semanticVetoRecheckMs, remaining));
4473
+ semanticVetoElapsedMs += extendMs;
4474
+ scheduleTurnCommit(extendMs, reason);
4475
+ return true;
4476
+ };
4477
+ const runScheduledCommit = async (reason) => {
4478
+ if (await shouldDeferSilenceCommit(reason)) {
4479
+ return;
4480
+ }
4481
+ await api.commitTurn(reason);
4482
+ };
4430
4483
  const requestTurnCommit = async (reason) => {
4431
4484
  const session = await readSession();
4432
4485
  const text = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
@@ -5138,6 +5191,7 @@ var createVoiceSession = (options) => {
5138
5191
  session2.lastActivityAt = Date.now();
5139
5192
  session2.status = "active";
5140
5193
  });
5194
+ semanticVetoElapsedMs = 0;
5141
5195
  if (silenceTimer && pendingCommitReason === "vendor") {
5142
5196
  scheduleTurnCommit(getVendorCommitDelayMs(), "vendor");
5143
5197
  }
@@ -5841,6 +5895,7 @@ var createVoiceSession = (options) => {
5841
5895
  };
5842
5896
  const commitTurnInternal = async (reason = "manual") => {
5843
5897
  clearSilenceTimer();
5898
+ semanticVetoElapsedMs = 0;
5844
5899
  backchannelDriver?.reset();
5845
5900
  amdLastTurnCommitAt = Date.now();
5846
5901
  const session = await readSession();
@@ -42388,9 +42443,12 @@ var createVoiceConfiguration = (configuration) => configuration;
42388
42443
  var DEFAULT_SPEECH_THRESHOLD2 = 0.015;
42389
42444
  var DEFAULT_SILENCE_MS2 = 700;
42390
42445
  var DEFAULT_TRANSCRIPT_STABILITY_MS2 = 200;
42446
+ var DEFAULT_SEMANTIC_VETO_RECHECK_MS2 = 1200;
42391
42447
  var resolveTurnDetection = (input) => ({
42392
42448
  profile: input?.profile ?? "balanced",
42393
42449
  qualityProfile: input?.qualityProfile ?? "general",
42450
+ semanticVetoMaxMs: input?.semanticVetoMaxMs ?? 0,
42451
+ semanticVetoRecheckMs: input?.semanticVetoRecheckMs ?? DEFAULT_SEMANTIC_VETO_RECHECK_MS2,
42394
42452
  silenceMs: input?.silenceMs ?? DEFAULT_SILENCE_MS2,
42395
42453
  speechThreshold: input?.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD2,
42396
42454
  transcriptStabilityMs: input?.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS2
@@ -12243,22 +12243,146 @@ var resolveAudioConditioningConfig = (config) => {
12243
12243
  };
12244
12244
  };
12245
12245
 
12246
+ // src/core/turnDetection.ts
12247
+ var DEFAULT_SILENCE_MS = 700;
12248
+ var DEFAULT_SPEECH_THRESHOLD = 0.015;
12249
+ var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
12250
+ var toUint8Array = (audio) => {
12251
+ if (audio instanceof ArrayBuffer) {
12252
+ return new Uint8Array(audio);
12253
+ }
12254
+ return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
12255
+ };
12256
+ var measureAudioLevel = (audio) => {
12257
+ const bytes = toUint8Array(audio);
12258
+ if (bytes.byteLength < 2) {
12259
+ return 0;
12260
+ }
12261
+ const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
12262
+ if (samples.length === 0) {
12263
+ return 0;
12264
+ }
12265
+ let sumSquares = 0;
12266
+ for (const sample of samples) {
12267
+ const normalized = sample / 32768;
12268
+ sumSquares += normalized * normalized;
12269
+ }
12270
+ return Math.sqrt(sumSquares / samples.length);
12271
+ };
12272
+ var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
12273
+ var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
12274
+ var selectPreferredTranscriptText = (currentText, nextText) => {
12275
+ const current = normalizeText(currentText);
12276
+ const next = normalizeText(nextText);
12277
+ if (!current) {
12278
+ return next;
12279
+ }
12280
+ if (!next) {
12281
+ return current;
12282
+ }
12283
+ if (current === next || current.includes(next)) {
12284
+ return current;
12285
+ }
12286
+ if (next.includes(current)) {
12287
+ return next;
12288
+ }
12289
+ if (countWords(next) > countWords(current)) {
12290
+ return next;
12291
+ }
12292
+ if (countWords(next) === countWords(current) && next.length > current.length) {
12293
+ return next;
12294
+ }
12295
+ return current;
12296
+ };
12297
+ var mergeSequentialTranscriptText = (currentText, nextText) => {
12298
+ const current = normalizeText(currentText);
12299
+ const next = normalizeText(nextText);
12300
+ if (!current) {
12301
+ return next;
12302
+ }
12303
+ if (!next) {
12304
+ return current;
12305
+ }
12306
+ const currentWords = current.split(" ");
12307
+ const nextWords = next.split(" ");
12308
+ const maxOverlap = Math.min(currentWords.length, nextWords.length);
12309
+ for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
12310
+ const currentSuffix = currentWords.slice(-overlap).join(" ");
12311
+ const nextPrefix = nextWords.slice(0, overlap).join(" ");
12312
+ if (currentSuffix === nextPrefix) {
12313
+ return [...currentWords, ...nextWords.slice(overlap)].join(" ");
12314
+ }
12315
+ }
12316
+ return `${current} ${next}`.trim();
12317
+ };
12318
+ var countCommonPrefixWords = (currentText, nextText) => {
12319
+ const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
12320
+ const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
12321
+ const maxWords = Math.min(currentWords.length, nextWords.length);
12322
+ let count = 0;
12323
+ for (let index = 0;index < maxWords; index += 1) {
12324
+ if (currentWords[index] !== nextWords[index]) {
12325
+ break;
12326
+ }
12327
+ count += 1;
12328
+ }
12329
+ return count;
12330
+ };
12331
+ var mergeTranscriptTexts = (transcripts) => {
12332
+ const merged = [];
12333
+ for (const transcript of transcripts) {
12334
+ const nextText = normalizeText(transcript.text);
12335
+ if (!nextText) {
12336
+ continue;
12337
+ }
12338
+ const previous = merged.at(-1);
12339
+ if (!previous) {
12340
+ merged.push(nextText);
12341
+ continue;
12342
+ }
12343
+ if (nextText === previous || previous.includes(nextText)) {
12344
+ continue;
12345
+ }
12346
+ if (nextText.includes(previous)) {
12347
+ merged[merged.length - 1] = nextText;
12348
+ continue;
12349
+ }
12350
+ merged.push(nextText);
12351
+ }
12352
+ return merged.join(" ").trim();
12353
+ };
12354
+ var buildTurnText = (transcripts, partialText, options = {}) => {
12355
+ const finalText = mergeTranscriptTexts(transcripts);
12356
+ const nextPartial = normalizeText(partialText);
12357
+ const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
12358
+ if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
12359
+ return mergeSequentialTranscriptText(finalText, nextPartial);
12360
+ }
12361
+ return selectPreferredTranscriptText(finalText, nextPartial);
12362
+ };
12363
+
12246
12364
  // src/core/turnProfiles.ts
12247
12365
  var TURN_PROFILE_DEFAULTS = {
12248
12366
  balanced: {
12249
12367
  qualityProfile: "general",
12368
+ semanticVetoMaxMs: 0,
12369
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
12250
12370
  silenceMs: 1400,
12251
12371
  speechThreshold: 0.012,
12252
12372
  transcriptStabilityMs: 1000
12253
12373
  },
12254
12374
  fast: {
12255
12375
  qualityProfile: "general",
12376
+ semanticVetoMaxMs: 0,
12377
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
12256
12378
  silenceMs: 700,
12257
12379
  speechThreshold: 0.015,
12258
12380
  transcriptStabilityMs: 450
12259
12381
  },
12260
12382
  "long-form": {
12261
12383
  qualityProfile: "general",
12384
+ semanticVetoMaxMs: 0,
12385
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
12262
12386
  silenceMs: 2200,
12263
12387
  speechThreshold: 0.01,
12264
12388
  transcriptStabilityMs: 1500
@@ -12292,6 +12416,8 @@ var resolveTurnDetectionConfig = (config) => {
12292
12416
  return {
12293
12417
  profile,
12294
12418
  qualityProfile,
12419
+ semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
12420
+ semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
12295
12421
  silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
12296
12422
  speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
12297
12423
  transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
@@ -1380,22 +1380,146 @@ var resolveAudioConditioningConfig = (config) => {
1380
1380
  };
1381
1381
  };
1382
1382
 
1383
+ // src/core/turnDetection.ts
1384
+ var DEFAULT_SILENCE_MS = 700;
1385
+ var DEFAULT_SPEECH_THRESHOLD = 0.015;
1386
+ var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
1387
+ var toUint8Array = (audio) => {
1388
+ if (audio instanceof ArrayBuffer) {
1389
+ return new Uint8Array(audio);
1390
+ }
1391
+ return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
1392
+ };
1393
+ var measureAudioLevel = (audio) => {
1394
+ const bytes = toUint8Array(audio);
1395
+ if (bytes.byteLength < 2) {
1396
+ return 0;
1397
+ }
1398
+ const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
1399
+ if (samples.length === 0) {
1400
+ return 0;
1401
+ }
1402
+ let sumSquares = 0;
1403
+ for (const sample of samples) {
1404
+ const normalized = sample / 32768;
1405
+ sumSquares += normalized * normalized;
1406
+ }
1407
+ return Math.sqrt(sumSquares / samples.length);
1408
+ };
1409
+ var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
1410
+ var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
1411
+ var selectPreferredTranscriptText = (currentText, nextText) => {
1412
+ const current = normalizeText(currentText);
1413
+ const next = normalizeText(nextText);
1414
+ if (!current) {
1415
+ return next;
1416
+ }
1417
+ if (!next) {
1418
+ return current;
1419
+ }
1420
+ if (current === next || current.includes(next)) {
1421
+ return current;
1422
+ }
1423
+ if (next.includes(current)) {
1424
+ return next;
1425
+ }
1426
+ if (countWords(next) > countWords(current)) {
1427
+ return next;
1428
+ }
1429
+ if (countWords(next) === countWords(current) && next.length > current.length) {
1430
+ return next;
1431
+ }
1432
+ return current;
1433
+ };
1434
+ var mergeSequentialTranscriptText = (currentText, nextText) => {
1435
+ const current = normalizeText(currentText);
1436
+ const next = normalizeText(nextText);
1437
+ if (!current) {
1438
+ return next;
1439
+ }
1440
+ if (!next) {
1441
+ return current;
1442
+ }
1443
+ const currentWords = current.split(" ");
1444
+ const nextWords = next.split(" ");
1445
+ const maxOverlap = Math.min(currentWords.length, nextWords.length);
1446
+ for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
1447
+ const currentSuffix = currentWords.slice(-overlap).join(" ");
1448
+ const nextPrefix = nextWords.slice(0, overlap).join(" ");
1449
+ if (currentSuffix === nextPrefix) {
1450
+ return [...currentWords, ...nextWords.slice(overlap)].join(" ");
1451
+ }
1452
+ }
1453
+ return `${current} ${next}`.trim();
1454
+ };
1455
+ var countCommonPrefixWords = (currentText, nextText) => {
1456
+ const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
1457
+ const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
1458
+ const maxWords = Math.min(currentWords.length, nextWords.length);
1459
+ let count = 0;
1460
+ for (let index = 0;index < maxWords; index += 1) {
1461
+ if (currentWords[index] !== nextWords[index]) {
1462
+ break;
1463
+ }
1464
+ count += 1;
1465
+ }
1466
+ return count;
1467
+ };
1468
+ var mergeTranscriptTexts = (transcripts) => {
1469
+ const merged = [];
1470
+ for (const transcript of transcripts) {
1471
+ const nextText = normalizeText(transcript.text);
1472
+ if (!nextText) {
1473
+ continue;
1474
+ }
1475
+ const previous = merged.at(-1);
1476
+ if (!previous) {
1477
+ merged.push(nextText);
1478
+ continue;
1479
+ }
1480
+ if (nextText === previous || previous.includes(nextText)) {
1481
+ continue;
1482
+ }
1483
+ if (nextText.includes(previous)) {
1484
+ merged[merged.length - 1] = nextText;
1485
+ continue;
1486
+ }
1487
+ merged.push(nextText);
1488
+ }
1489
+ return merged.join(" ").trim();
1490
+ };
1491
+ var buildTurnText = (transcripts, partialText, options = {}) => {
1492
+ const finalText = mergeTranscriptTexts(transcripts);
1493
+ const nextPartial = normalizeText(partialText);
1494
+ const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
1495
+ if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
1496
+ return mergeSequentialTranscriptText(finalText, nextPartial);
1497
+ }
1498
+ return selectPreferredTranscriptText(finalText, nextPartial);
1499
+ };
1500
+
1383
1501
  // src/core/turnProfiles.ts
1384
1502
  var TURN_PROFILE_DEFAULTS = {
1385
1503
  balanced: {
1386
1504
  qualityProfile: "general",
1505
+ semanticVetoMaxMs: 0,
1506
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
1387
1507
  silenceMs: 1400,
1388
1508
  speechThreshold: 0.012,
1389
1509
  transcriptStabilityMs: 1000
1390
1510
  },
1391
1511
  fast: {
1392
1512
  qualityProfile: "general",
1513
+ semanticVetoMaxMs: 0,
1514
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
1393
1515
  silenceMs: 700,
1394
1516
  speechThreshold: 0.015,
1395
1517
  transcriptStabilityMs: 450
1396
1518
  },
1397
1519
  "long-form": {
1398
1520
  qualityProfile: "general",
1521
+ semanticVetoMaxMs: 0,
1522
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
1399
1523
  silenceMs: 2200,
1400
1524
  speechThreshold: 0.01,
1401
1525
  transcriptStabilityMs: 1500
@@ -1429,6 +1553,8 @@ var resolveTurnDetectionConfig = (config) => {
1429
1553
  return {
1430
1554
  profile,
1431
1555
  qualityProfile,
1556
+ semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
1557
+ semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
1432
1558
  silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
1433
1559
  speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
1434
1560
  transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs