@absolutejs/voice 0.0.22-beta.583 → 0.0.22-beta.585

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -391,22 +391,146 @@ var resolveLogger = (logger) => ({
391
391
  ...logger
392
392
  });
393
393
 
394
+ // src/core/turnDetection.ts
395
+ var DEFAULT_SILENCE_MS = 700;
396
+ var DEFAULT_SPEECH_THRESHOLD = 0.015;
397
+ var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
398
+ var toUint8Array = (audio) => {
399
+ if (audio instanceof ArrayBuffer) {
400
+ return new Uint8Array(audio);
401
+ }
402
+ return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
403
+ };
404
+ var measureAudioLevel = (audio) => {
405
+ const bytes = toUint8Array(audio);
406
+ if (bytes.byteLength < 2) {
407
+ return 0;
408
+ }
409
+ const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
410
+ if (samples.length === 0) {
411
+ return 0;
412
+ }
413
+ let sumSquares = 0;
414
+ for (const sample of samples) {
415
+ const normalized = sample / 32768;
416
+ sumSquares += normalized * normalized;
417
+ }
418
+ return Math.sqrt(sumSquares / samples.length);
419
+ };
420
+ var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
421
+ var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
422
+ var selectPreferredTranscriptText = (currentText, nextText) => {
423
+ const current = normalizeText(currentText);
424
+ const next = normalizeText(nextText);
425
+ if (!current) {
426
+ return next;
427
+ }
428
+ if (!next) {
429
+ return current;
430
+ }
431
+ if (current === next || current.includes(next)) {
432
+ return current;
433
+ }
434
+ if (next.includes(current)) {
435
+ return next;
436
+ }
437
+ if (countWords(next) > countWords(current)) {
438
+ return next;
439
+ }
440
+ if (countWords(next) === countWords(current) && next.length > current.length) {
441
+ return next;
442
+ }
443
+ return current;
444
+ };
445
+ var mergeSequentialTranscriptText = (currentText, nextText) => {
446
+ const current = normalizeText(currentText);
447
+ const next = normalizeText(nextText);
448
+ if (!current) {
449
+ return next;
450
+ }
451
+ if (!next) {
452
+ return current;
453
+ }
454
+ const currentWords = current.split(" ");
455
+ const nextWords = next.split(" ");
456
+ const maxOverlap = Math.min(currentWords.length, nextWords.length);
457
+ for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
458
+ const currentSuffix = currentWords.slice(-overlap).join(" ");
459
+ const nextPrefix = nextWords.slice(0, overlap).join(" ");
460
+ if (currentSuffix === nextPrefix) {
461
+ return [...currentWords, ...nextWords.slice(overlap)].join(" ");
462
+ }
463
+ }
464
+ return `${current} ${next}`.trim();
465
+ };
466
+ var countCommonPrefixWords = (currentText, nextText) => {
467
+ const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
468
+ const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
469
+ const maxWords = Math.min(currentWords.length, nextWords.length);
470
+ let count = 0;
471
+ for (let index = 0;index < maxWords; index += 1) {
472
+ if (currentWords[index] !== nextWords[index]) {
473
+ break;
474
+ }
475
+ count += 1;
476
+ }
477
+ return count;
478
+ };
479
+ var mergeTranscriptTexts = (transcripts) => {
480
+ const merged = [];
481
+ for (const transcript of transcripts) {
482
+ const nextText = normalizeText(transcript.text);
483
+ if (!nextText) {
484
+ continue;
485
+ }
486
+ const previous = merged.at(-1);
487
+ if (!previous) {
488
+ merged.push(nextText);
489
+ continue;
490
+ }
491
+ if (nextText === previous || previous.includes(nextText)) {
492
+ continue;
493
+ }
494
+ if (nextText.includes(previous)) {
495
+ merged[merged.length - 1] = nextText;
496
+ continue;
497
+ }
498
+ merged.push(nextText);
499
+ }
500
+ return merged.join(" ").trim();
501
+ };
502
+ var buildTurnText = (transcripts, partialText, options = {}) => {
503
+ const finalText = mergeTranscriptTexts(transcripts);
504
+ const nextPartial = normalizeText(partialText);
505
+ const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
506
+ if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
507
+ return mergeSequentialTranscriptText(finalText, nextPartial);
508
+ }
509
+ return selectPreferredTranscriptText(finalText, nextPartial);
510
+ };
511
+
394
512
  // src/core/turnProfiles.ts
395
513
  var TURN_PROFILE_DEFAULTS = {
396
514
  balanced: {
397
515
  qualityProfile: "general",
516
+ semanticVetoMaxMs: 0,
517
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
398
518
  silenceMs: 1400,
399
519
  speechThreshold: 0.012,
400
520
  transcriptStabilityMs: 1000
401
521
  },
402
522
  fast: {
403
523
  qualityProfile: "general",
524
+ semanticVetoMaxMs: 0,
525
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
404
526
  silenceMs: 700,
405
527
  speechThreshold: 0.015,
406
528
  transcriptStabilityMs: 450
407
529
  },
408
530
  "long-form": {
409
531
  qualityProfile: "general",
532
+ semanticVetoMaxMs: 0,
533
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
410
534
  silenceMs: 2200,
411
535
  speechThreshold: 0.01,
412
536
  transcriptStabilityMs: 1500
@@ -440,6 +564,8 @@ var resolveTurnDetectionConfig = (config) => {
440
564
  return {
441
565
  profile,
442
566
  qualityProfile,
567
+ semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
568
+ semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
443
569
  silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
444
570
  speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
445
571
  transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
@@ -3454,123 +3580,6 @@ var createVoiceTwilioRedirectHandoffAdapter = (options) => ({
3454
3580
  }
3455
3581
  });
3456
3582
 
3457
- // src/core/turnDetection.ts
3458
- var DEFAULT_SILENCE_MS = 700;
3459
- var DEFAULT_SPEECH_THRESHOLD = 0.015;
3460
- var toUint8Array = (audio) => {
3461
- if (audio instanceof ArrayBuffer) {
3462
- return new Uint8Array(audio);
3463
- }
3464
- return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
3465
- };
3466
- var measureAudioLevel = (audio) => {
3467
- const bytes = toUint8Array(audio);
3468
- if (bytes.byteLength < 2) {
3469
- return 0;
3470
- }
3471
- const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
3472
- if (samples.length === 0) {
3473
- return 0;
3474
- }
3475
- let sumSquares = 0;
3476
- for (const sample of samples) {
3477
- const normalized = sample / 32768;
3478
- sumSquares += normalized * normalized;
3479
- }
3480
- return Math.sqrt(sumSquares / samples.length);
3481
- };
3482
- var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
3483
- var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
3484
- var selectPreferredTranscriptText = (currentText, nextText) => {
3485
- const current = normalizeText(currentText);
3486
- const next = normalizeText(nextText);
3487
- if (!current) {
3488
- return next;
3489
- }
3490
- if (!next) {
3491
- return current;
3492
- }
3493
- if (current === next || current.includes(next)) {
3494
- return current;
3495
- }
3496
- if (next.includes(current)) {
3497
- return next;
3498
- }
3499
- if (countWords(next) > countWords(current)) {
3500
- return next;
3501
- }
3502
- if (countWords(next) === countWords(current) && next.length > current.length) {
3503
- return next;
3504
- }
3505
- return current;
3506
- };
3507
- var mergeSequentialTranscriptText = (currentText, nextText) => {
3508
- const current = normalizeText(currentText);
3509
- const next = normalizeText(nextText);
3510
- if (!current) {
3511
- return next;
3512
- }
3513
- if (!next) {
3514
- return current;
3515
- }
3516
- const currentWords = current.split(" ");
3517
- const nextWords = next.split(" ");
3518
- const maxOverlap = Math.min(currentWords.length, nextWords.length);
3519
- for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
3520
- const currentSuffix = currentWords.slice(-overlap).join(" ");
3521
- const nextPrefix = nextWords.slice(0, overlap).join(" ");
3522
- if (currentSuffix === nextPrefix) {
3523
- return [...currentWords, ...nextWords.slice(overlap)].join(" ");
3524
- }
3525
- }
3526
- return `${current} ${next}`.trim();
3527
- };
3528
- var countCommonPrefixWords = (currentText, nextText) => {
3529
- const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
3530
- const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
3531
- const maxWords = Math.min(currentWords.length, nextWords.length);
3532
- let count = 0;
3533
- for (let index = 0;index < maxWords; index += 1) {
3534
- if (currentWords[index] !== nextWords[index]) {
3535
- break;
3536
- }
3537
- count += 1;
3538
- }
3539
- return count;
3540
- };
3541
- var mergeTranscriptTexts = (transcripts) => {
3542
- const merged = [];
3543
- for (const transcript of transcripts) {
3544
- const nextText = normalizeText(transcript.text);
3545
- if (!nextText) {
3546
- continue;
3547
- }
3548
- const previous = merged.at(-1);
3549
- if (!previous) {
3550
- merged.push(nextText);
3551
- continue;
3552
- }
3553
- if (nextText === previous || previous.includes(nextText)) {
3554
- continue;
3555
- }
3556
- if (nextText.includes(previous)) {
3557
- merged[merged.length - 1] = nextText;
3558
- continue;
3559
- }
3560
- merged.push(nextText);
3561
- }
3562
- return merged.join(" ").trim();
3563
- };
3564
- var buildTurnText = (transcripts, partialText, options = {}) => {
3565
- const finalText = mergeTranscriptTexts(transcripts);
3566
- const nextPartial = normalizeText(partialText);
3567
- const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
3568
- if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
3569
- return mergeSequentialTranscriptText(finalText, nextPartial);
3570
- }
3571
- return selectPreferredTranscriptText(finalText, nextPartial);
3572
- };
3573
-
3574
3583
  // src/core/types.ts
3575
3584
  var ttsAdapterSessionCanCancel = (session) => typeof session.cancel === "function";
3576
3585
 
@@ -3907,8 +3916,11 @@ var createVoiceSession = (options) => {
3907
3916
  const turnDetection = {
3908
3917
  silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
3909
3918
  speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
3910
- transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
3919
+ transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS,
3920
+ semanticVetoMaxMs: options.turnDetection.semanticVetoMaxMs ?? 0,
3921
+ semanticVetoRecheckMs: options.turnDetection.semanticVetoRecheckMs ?? DEFAULT_SEMANTIC_VETO_RECHECK_MS
3911
3922
  };
3923
+ let semanticVetoElapsedMs = 0;
3912
3924
  const sttFallback = options.sttFallback ? {
3913
3925
  adapter: options.sttFallback.adapter,
3914
3926
  completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
@@ -4423,10 +4435,51 @@ var createVoiceSession = (options) => {
4423
4435
  silenceTimer = setTimeout(() => {
4424
4436
  silenceTimer = null;
4425
4437
  pendingCommitReason = null;
4426
- api.commitTurn(reason);
4438
+ runScheduledCommit(reason);
4427
4439
  }, delayMs);
4428
4440
  };
4429
4441
  const scheduleSilenceCommit = (delayMs = turnDetection.silenceMs, reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
4442
+ const shouldDeferSilenceCommit = async (reason) => {
4443
+ if (reason !== "silence" || turnDetection.semanticVetoMaxMs <= 0 || !options.semanticTurnDetector || semanticVetoElapsedMs >= turnDetection.semanticVetoMaxMs) {
4444
+ return false;
4445
+ }
4446
+ const session = await readSession();
4447
+ const { partialText, transcripts } = session.currentTurn;
4448
+ const userText = buildTurnText(transcripts, partialText, {
4449
+ partialEndedAtMs: session.currentTurn.partialEndedAt,
4450
+ partialStartedAtMs: session.currentTurn.partialStartedAt
4451
+ });
4452
+ if (!userText) {
4453
+ return false;
4454
+ }
4455
+ const silenceMs = session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : turnDetection.silenceMs;
4456
+ let endOfTurn = true;
4457
+ try {
4458
+ const verdict = await Promise.resolve(options.semanticTurnDetector.evaluate({
4459
+ lastFinalTranscript: transcripts.at(-1),
4460
+ partialText,
4461
+ silenceMs,
4462
+ transcripts
4463
+ }));
4464
+ endOfTurn = verdict.endOfTurn;
4465
+ } catch {
4466
+ return false;
4467
+ }
4468
+ if (endOfTurn !== false) {
4469
+ return false;
4470
+ }
4471
+ const remaining = turnDetection.semanticVetoMaxMs - semanticVetoElapsedMs;
4472
+ const extendMs = Math.max(1, Math.min(turnDetection.semanticVetoRecheckMs, remaining));
4473
+ semanticVetoElapsedMs += extendMs;
4474
+ scheduleTurnCommit(extendMs, reason);
4475
+ return true;
4476
+ };
4477
+ const runScheduledCommit = async (reason) => {
4478
+ if (await shouldDeferSilenceCommit(reason)) {
4479
+ return;
4480
+ }
4481
+ await api.commitTurn(reason);
4482
+ };
4430
4483
  const requestTurnCommit = async (reason) => {
4431
4484
  const session = await readSession();
4432
4485
  const text = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
@@ -5138,6 +5191,7 @@ var createVoiceSession = (options) => {
5138
5191
  session2.lastActivityAt = Date.now();
5139
5192
  session2.status = "active";
5140
5193
  });
5194
+ semanticVetoElapsedMs = 0;
5141
5195
  if (silenceTimer && pendingCommitReason === "vendor") {
5142
5196
  scheduleTurnCommit(getVendorCommitDelayMs(), "vendor");
5143
5197
  }
@@ -5841,6 +5895,7 @@ var createVoiceSession = (options) => {
5841
5895
  };
5842
5896
  const commitTurnInternal = async (reason = "manual") => {
5843
5897
  clearSilenceTimer();
5898
+ semanticVetoElapsedMs = 0;
5844
5899
  backchannelDriver?.reset();
5845
5900
  amdLastTurnCommitAt = Date.now();
5846
5901
  const session = await readSession();
@@ -40842,6 +40897,44 @@ Respond with only your spoken line. When your goal is met or you want to hang up
40842
40897
  persona: options.persona
40843
40898
  };
40844
40899
  };
40900
+ // src/core/hardenedFetch.ts
40901
+ var ATTEMPT_TIMEOUT_MS = 6000;
40902
+ var isBun = "Bun" in globalThis;
40903
+ var oneAttempt = async (baseFetch, input, init) => {
40904
+ const controller = new AbortController;
40905
+ const callerSignal = init?.signal ?? undefined;
40906
+ const onCallerAbort = () => controller.abort(callerSignal?.reason);
40907
+ if (callerSignal?.aborted)
40908
+ controller.abort(callerSignal.reason);
40909
+ else
40910
+ callerSignal?.addEventListener("abort", onCallerAbort, { once: true });
40911
+ const timer = setTimeout(() => {
40912
+ controller.abort(new Error(`fetch exceeded ${ATTEMPT_TIMEOUT_MS}ms before response headers (stale Bun keep-alive socket?)`));
40913
+ }, ATTEMPT_TIMEOUT_MS);
40914
+ const headers = new Headers(init?.headers);
40915
+ if (isBun)
40916
+ headers.set("Connection", "close");
40917
+ try {
40918
+ return await baseFetch(input, {
40919
+ ...init,
40920
+ headers,
40921
+ signal: controller.signal
40922
+ });
40923
+ } finally {
40924
+ clearTimeout(timer);
40925
+ callerSignal?.removeEventListener("abort", onCallerAbort);
40926
+ }
40927
+ };
40928
+ var hardenFetch = (baseFetch = globalThis.fetch) => Object.assign(async (input, init) => {
40929
+ try {
40930
+ return await oneAttempt(baseFetch, input, init);
40931
+ } catch (error) {
40932
+ if (init?.signal?.aborted)
40933
+ throw error;
40934
+ console.warn(`[voice] hardened fetch retrying on a fresh connection: ${error instanceof Error ? error.message : String(error)}`);
40935
+ return oneAttempt(baseFetch, input, init);
40936
+ }
40937
+ }, { preconnect: baseFetch.preconnect.bind(baseFetch) });
40845
40938
  // src/core/mcpToolset.ts
40846
40939
  var flattenContent = (result) => {
40847
40940
  const blocks = result.content ?? [];
@@ -42350,9 +42443,12 @@ var createVoiceConfiguration = (configuration) => configuration;
42350
42443
  var DEFAULT_SPEECH_THRESHOLD2 = 0.015;
42351
42444
  var DEFAULT_SILENCE_MS2 = 700;
42352
42445
  var DEFAULT_TRANSCRIPT_STABILITY_MS2 = 200;
42446
+ var DEFAULT_SEMANTIC_VETO_RECHECK_MS2 = 1200;
42353
42447
  var resolveTurnDetection = (input) => ({
42354
42448
  profile: input?.profile ?? "balanced",
42355
42449
  qualityProfile: input?.qualityProfile ?? "general",
42450
+ semanticVetoMaxMs: input?.semanticVetoMaxMs ?? 0,
42451
+ semanticVetoRecheckMs: input?.semanticVetoRecheckMs ?? DEFAULT_SEMANTIC_VETO_RECHECK_MS2,
42356
42452
  silenceMs: input?.silenceMs ?? DEFAULT_SILENCE_MS2,
42357
42453
  speechThreshold: input?.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD2,
42358
42454
  transcriptStabilityMs: input?.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS2
@@ -45390,7 +45486,7 @@ var consumeOpenAIResponsesStream = async (response, onTextDelta, abortOptions) =
45390
45486
  return { assistantText, toolCalls: finalizeToolCalls(calls), usage };
45391
45487
  };
45392
45488
  var createOpenAIVoiceAssistantModel = (options) => {
45393
- const fetchImpl = options.fetch ?? globalThis.fetch;
45489
+ const fetchImpl = hardenFetch(options.fetch);
45394
45490
  const baseUrl = options.baseUrl ?? "https://api.openai.com/v1";
45395
45491
  const model = options.model ?? "gpt-4.1-mini";
45396
45492
  const timeoutMs = options.timeoutMs ?? 60000;
@@ -45515,7 +45611,7 @@ var consumeAnthropicStream = async (response, onTextDelta) => {
45515
45611
  return { assistantText, toolCalls: finalizeToolCalls(calls), usage };
45516
45612
  };
45517
45613
  var createAnthropicVoiceAssistantModel = (options) => {
45518
- const fetchImpl = options.fetch ?? globalThis.fetch;
45614
+ const fetchImpl = hardenFetch(options.fetch);
45519
45615
  const baseUrl = options.baseUrl ?? "https://api.anthropic.com/v1";
45520
45616
  const model = options.model ?? "claude-sonnet-4-5";
45521
45617
  return {
@@ -45601,7 +45697,7 @@ var consumeGeminiStream = async (response, onTextDelta) => {
45601
45697
  return { assistantText, toolCalls, usage };
45602
45698
  };
45603
45699
  var createGeminiVoiceAssistantModel = (options) => {
45604
- const fetchImpl = options.fetch ?? globalThis.fetch;
45700
+ const fetchImpl = hardenFetch(options.fetch);
45605
45701
  const baseUrl = options.baseUrl ?? "https://generativelanguage.googleapis.com/v1beta";
45606
45702
  const model = options.model ?? "gemini-2.5-flash";
45607
45703
  const maxRetries = Math.max(0, options.maxRetries ?? 2);
@@ -52711,6 +52807,7 @@ export {
52711
52807
  importVoiceCampaignRecipients,
52712
52808
  heartbeatVoiceOpsTask,
52713
52809
  hasVoiceOpsTaskSLABreach,
52810
+ hardenFetch,
52714
52811
  getVoiceProofTargetLogicalFailure,
52715
52812
  getVoiceLiveOpsControlStatus,
52716
52813
  getVoiceCampaignDialerProofStatus,
@@ -12243,22 +12243,146 @@ var resolveAudioConditioningConfig = (config) => {
12243
12243
  };
12244
12244
  };
12245
12245
 
12246
+ // src/core/turnDetection.ts
12247
+ var DEFAULT_SILENCE_MS = 700;
12248
+ var DEFAULT_SPEECH_THRESHOLD = 0.015;
12249
+ var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
12250
+ var toUint8Array = (audio) => {
12251
+ if (audio instanceof ArrayBuffer) {
12252
+ return new Uint8Array(audio);
12253
+ }
12254
+ return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
12255
+ };
12256
+ var measureAudioLevel = (audio) => {
12257
+ const bytes = toUint8Array(audio);
12258
+ if (bytes.byteLength < 2) {
12259
+ return 0;
12260
+ }
12261
+ const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
12262
+ if (samples.length === 0) {
12263
+ return 0;
12264
+ }
12265
+ let sumSquares = 0;
12266
+ for (const sample of samples) {
12267
+ const normalized = sample / 32768;
12268
+ sumSquares += normalized * normalized;
12269
+ }
12270
+ return Math.sqrt(sumSquares / samples.length);
12271
+ };
12272
+ var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
12273
+ var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
12274
+ var selectPreferredTranscriptText = (currentText, nextText) => {
12275
+ const current = normalizeText(currentText);
12276
+ const next = normalizeText(nextText);
12277
+ if (!current) {
12278
+ return next;
12279
+ }
12280
+ if (!next) {
12281
+ return current;
12282
+ }
12283
+ if (current === next || current.includes(next)) {
12284
+ return current;
12285
+ }
12286
+ if (next.includes(current)) {
12287
+ return next;
12288
+ }
12289
+ if (countWords(next) > countWords(current)) {
12290
+ return next;
12291
+ }
12292
+ if (countWords(next) === countWords(current) && next.length > current.length) {
12293
+ return next;
12294
+ }
12295
+ return current;
12296
+ };
12297
+ var mergeSequentialTranscriptText = (currentText, nextText) => {
12298
+ const current = normalizeText(currentText);
12299
+ const next = normalizeText(nextText);
12300
+ if (!current) {
12301
+ return next;
12302
+ }
12303
+ if (!next) {
12304
+ return current;
12305
+ }
12306
+ const currentWords = current.split(" ");
12307
+ const nextWords = next.split(" ");
12308
+ const maxOverlap = Math.min(currentWords.length, nextWords.length);
12309
+ for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
12310
+ const currentSuffix = currentWords.slice(-overlap).join(" ");
12311
+ const nextPrefix = nextWords.slice(0, overlap).join(" ");
12312
+ if (currentSuffix === nextPrefix) {
12313
+ return [...currentWords, ...nextWords.slice(overlap)].join(" ");
12314
+ }
12315
+ }
12316
+ return `${current} ${next}`.trim();
12317
+ };
12318
+ var countCommonPrefixWords = (currentText, nextText) => {
12319
+ const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
12320
+ const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
12321
+ const maxWords = Math.min(currentWords.length, nextWords.length);
12322
+ let count = 0;
12323
+ for (let index = 0;index < maxWords; index += 1) {
12324
+ if (currentWords[index] !== nextWords[index]) {
12325
+ break;
12326
+ }
12327
+ count += 1;
12328
+ }
12329
+ return count;
12330
+ };
12331
+ var mergeTranscriptTexts = (transcripts) => {
12332
+ const merged = [];
12333
+ for (const transcript of transcripts) {
12334
+ const nextText = normalizeText(transcript.text);
12335
+ if (!nextText) {
12336
+ continue;
12337
+ }
12338
+ const previous = merged.at(-1);
12339
+ if (!previous) {
12340
+ merged.push(nextText);
12341
+ continue;
12342
+ }
12343
+ if (nextText === previous || previous.includes(nextText)) {
12344
+ continue;
12345
+ }
12346
+ if (nextText.includes(previous)) {
12347
+ merged[merged.length - 1] = nextText;
12348
+ continue;
12349
+ }
12350
+ merged.push(nextText);
12351
+ }
12352
+ return merged.join(" ").trim();
12353
+ };
12354
+ var buildTurnText = (transcripts, partialText, options = {}) => {
12355
+ const finalText = mergeTranscriptTexts(transcripts);
12356
+ const nextPartial = normalizeText(partialText);
12357
+ const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
12358
+ if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
12359
+ return mergeSequentialTranscriptText(finalText, nextPartial);
12360
+ }
12361
+ return selectPreferredTranscriptText(finalText, nextPartial);
12362
+ };
12363
+
12246
12364
  // src/core/turnProfiles.ts
12247
12365
  var TURN_PROFILE_DEFAULTS = {
12248
12366
  balanced: {
12249
12367
  qualityProfile: "general",
12368
+ semanticVetoMaxMs: 0,
12369
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
12250
12370
  silenceMs: 1400,
12251
12371
  speechThreshold: 0.012,
12252
12372
  transcriptStabilityMs: 1000
12253
12373
  },
12254
12374
  fast: {
12255
12375
  qualityProfile: "general",
12376
+ semanticVetoMaxMs: 0,
12377
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
12256
12378
  silenceMs: 700,
12257
12379
  speechThreshold: 0.015,
12258
12380
  transcriptStabilityMs: 450
12259
12381
  },
12260
12382
  "long-form": {
12261
12383
  qualityProfile: "general",
12384
+ semanticVetoMaxMs: 0,
12385
+ semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
12262
12386
  silenceMs: 2200,
12263
12387
  speechThreshold: 0.01,
12264
12388
  transcriptStabilityMs: 1500
@@ -12292,6 +12416,8 @@ var resolveTurnDetectionConfig = (config) => {
12292
12416
  return {
12293
12417
  profile,
12294
12418
  qualityProfile,
12419
+ semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
12420
+ semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
12295
12421
  silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
12296
12422
  speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
12297
12423
  transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs