@absolutejs/voice 0.0.22-beta.598 → 0.0.22-beta.599
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/angular/index.js +6 -127
- package/dist/client/htmxBootstrap.js +6 -12
- package/dist/client/index.js +6 -127
- package/dist/core/turnDetection.d.ts +1 -1
- package/dist/core/types.d.ts +2 -4
- package/dist/embed/index.js +6 -12
- package/dist/embed/voice-widget.js +8 -8
- package/dist/index.js +158 -184
- package/dist/react/index.js +6 -127
- package/dist/svelte/index.js +6 -127
- package/dist/testing/index.js +29 -57
- package/dist/vue/index.js +6 -127
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -391,146 +391,25 @@ var resolveLogger = (logger) => ({
|
|
|
391
391
|
...logger
|
|
392
392
|
});
|
|
393
393
|
|
|
394
|
-
// src/core/turnDetection.ts
|
|
395
|
-
var DEFAULT_SILENCE_MS = 700;
|
|
396
|
-
var DEFAULT_SPEECH_THRESHOLD = 0.015;
|
|
397
|
-
var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
|
|
398
|
-
var toUint8Array = (audio) => {
|
|
399
|
-
if (audio instanceof ArrayBuffer) {
|
|
400
|
-
return new Uint8Array(audio);
|
|
401
|
-
}
|
|
402
|
-
return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
|
|
403
|
-
};
|
|
404
|
-
var measureAudioLevel = (audio) => {
|
|
405
|
-
const bytes = toUint8Array(audio);
|
|
406
|
-
if (bytes.byteLength < 2) {
|
|
407
|
-
return 0;
|
|
408
|
-
}
|
|
409
|
-
const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
|
|
410
|
-
if (samples.length === 0) {
|
|
411
|
-
return 0;
|
|
412
|
-
}
|
|
413
|
-
let sumSquares = 0;
|
|
414
|
-
for (const sample of samples) {
|
|
415
|
-
const normalized = sample / 32768;
|
|
416
|
-
sumSquares += normalized * normalized;
|
|
417
|
-
}
|
|
418
|
-
return Math.sqrt(sumSquares / samples.length);
|
|
419
|
-
};
|
|
420
|
-
var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
|
|
421
|
-
var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
|
|
422
|
-
var selectPreferredTranscriptText = (currentText, nextText) => {
|
|
423
|
-
const current = normalizeText(currentText);
|
|
424
|
-
const next = normalizeText(nextText);
|
|
425
|
-
if (!current) {
|
|
426
|
-
return next;
|
|
427
|
-
}
|
|
428
|
-
if (!next) {
|
|
429
|
-
return current;
|
|
430
|
-
}
|
|
431
|
-
if (current === next || current.includes(next)) {
|
|
432
|
-
return current;
|
|
433
|
-
}
|
|
434
|
-
if (next.includes(current)) {
|
|
435
|
-
return next;
|
|
436
|
-
}
|
|
437
|
-
if (countWords(next) > countWords(current)) {
|
|
438
|
-
return next;
|
|
439
|
-
}
|
|
440
|
-
if (countWords(next) === countWords(current) && next.length > current.length) {
|
|
441
|
-
return next;
|
|
442
|
-
}
|
|
443
|
-
return current;
|
|
444
|
-
};
|
|
445
|
-
var mergeSequentialTranscriptText = (currentText, nextText) => {
|
|
446
|
-
const current = normalizeText(currentText);
|
|
447
|
-
const next = normalizeText(nextText);
|
|
448
|
-
if (!current) {
|
|
449
|
-
return next;
|
|
450
|
-
}
|
|
451
|
-
if (!next) {
|
|
452
|
-
return current;
|
|
453
|
-
}
|
|
454
|
-
const currentWords = current.split(" ");
|
|
455
|
-
const nextWords = next.split(" ");
|
|
456
|
-
const maxOverlap = Math.min(currentWords.length, nextWords.length);
|
|
457
|
-
for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
|
|
458
|
-
const currentSuffix = currentWords.slice(-overlap).join(" ");
|
|
459
|
-
const nextPrefix = nextWords.slice(0, overlap).join(" ");
|
|
460
|
-
if (currentSuffix === nextPrefix) {
|
|
461
|
-
return [...currentWords, ...nextWords.slice(overlap)].join(" ");
|
|
462
|
-
}
|
|
463
|
-
}
|
|
464
|
-
return `${current} ${next}`.trim();
|
|
465
|
-
};
|
|
466
|
-
var countCommonPrefixWords = (currentText, nextText) => {
|
|
467
|
-
const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
|
|
468
|
-
const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
|
|
469
|
-
const maxWords = Math.min(currentWords.length, nextWords.length);
|
|
470
|
-
let count = 0;
|
|
471
|
-
for (let index = 0;index < maxWords; index += 1) {
|
|
472
|
-
if (currentWords[index] !== nextWords[index]) {
|
|
473
|
-
break;
|
|
474
|
-
}
|
|
475
|
-
count += 1;
|
|
476
|
-
}
|
|
477
|
-
return count;
|
|
478
|
-
};
|
|
479
|
-
var mergeTranscriptTexts = (transcripts) => {
|
|
480
|
-
const merged = [];
|
|
481
|
-
for (const transcript of transcripts) {
|
|
482
|
-
const nextText = normalizeText(transcript.text);
|
|
483
|
-
if (!nextText) {
|
|
484
|
-
continue;
|
|
485
|
-
}
|
|
486
|
-
const previous = merged.at(-1);
|
|
487
|
-
if (!previous) {
|
|
488
|
-
merged.push(nextText);
|
|
489
|
-
continue;
|
|
490
|
-
}
|
|
491
|
-
if (nextText === previous || previous.includes(nextText)) {
|
|
492
|
-
continue;
|
|
493
|
-
}
|
|
494
|
-
if (nextText.includes(previous)) {
|
|
495
|
-
merged[merged.length - 1] = nextText;
|
|
496
|
-
continue;
|
|
497
|
-
}
|
|
498
|
-
merged.push(nextText);
|
|
499
|
-
}
|
|
500
|
-
return merged.join(" ").trim();
|
|
501
|
-
};
|
|
502
|
-
var buildTurnText = (transcripts, partialText, options = {}) => {
|
|
503
|
-
const finalText = mergeTranscriptTexts(transcripts);
|
|
504
|
-
const nextPartial = normalizeText(partialText);
|
|
505
|
-
const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
|
|
506
|
-
if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
|
|
507
|
-
return mergeSequentialTranscriptText(finalText, nextPartial);
|
|
508
|
-
}
|
|
509
|
-
return selectPreferredTranscriptText(finalText, nextPartial);
|
|
510
|
-
};
|
|
511
|
-
|
|
512
394
|
// src/core/turnProfiles.ts
|
|
513
395
|
var TURN_PROFILE_DEFAULTS = {
|
|
514
396
|
balanced: {
|
|
515
397
|
qualityProfile: "general",
|
|
516
|
-
|
|
517
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
398
|
+
minSilenceMs: 400,
|
|
518
399
|
silenceMs: 1400,
|
|
519
400
|
speechThreshold: 0.012,
|
|
520
401
|
transcriptStabilityMs: 1000
|
|
521
402
|
},
|
|
522
403
|
fast: {
|
|
523
404
|
qualityProfile: "general",
|
|
524
|
-
|
|
525
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
405
|
+
minSilenceMs: 300,
|
|
526
406
|
silenceMs: 700,
|
|
527
407
|
speechThreshold: 0.015,
|
|
528
408
|
transcriptStabilityMs: 450
|
|
529
409
|
},
|
|
530
410
|
"long-form": {
|
|
531
411
|
qualityProfile: "general",
|
|
532
|
-
|
|
533
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
412
|
+
minSilenceMs: 600,
|
|
534
413
|
silenceMs: 2200,
|
|
535
414
|
speechThreshold: 0.01,
|
|
536
415
|
transcriptStabilityMs: 1500
|
|
@@ -561,12 +440,12 @@ var resolveTurnDetectionConfig = (config) => {
|
|
|
561
440
|
const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
|
|
562
441
|
const preset = TURN_PROFILE_DEFAULTS[profile];
|
|
563
442
|
const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
|
|
443
|
+
const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
|
|
564
444
|
return {
|
|
565
445
|
profile,
|
|
566
446
|
qualityProfile,
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
|
|
447
|
+
minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
|
|
448
|
+
silenceMs,
|
|
570
449
|
speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
|
|
571
450
|
transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
|
|
572
451
|
};
|
|
@@ -3580,6 +3459,124 @@ var createVoiceTwilioRedirectHandoffAdapter = (options) => ({
|
|
|
3580
3459
|
}
|
|
3581
3460
|
});
|
|
3582
3461
|
|
|
3462
|
+
// src/core/turnDetection.ts
|
|
3463
|
+
var DEFAULT_SILENCE_MS = 700;
|
|
3464
|
+
var DEFAULT_SPEECH_THRESHOLD = 0.015;
|
|
3465
|
+
var DEFAULT_MIN_SILENCE_MS = 400;
|
|
3466
|
+
var toUint8Array = (audio) => {
|
|
3467
|
+
if (audio instanceof ArrayBuffer) {
|
|
3468
|
+
return new Uint8Array(audio);
|
|
3469
|
+
}
|
|
3470
|
+
return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
|
|
3471
|
+
};
|
|
3472
|
+
var measureAudioLevel = (audio) => {
|
|
3473
|
+
const bytes = toUint8Array(audio);
|
|
3474
|
+
if (bytes.byteLength < 2) {
|
|
3475
|
+
return 0;
|
|
3476
|
+
}
|
|
3477
|
+
const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
|
|
3478
|
+
if (samples.length === 0) {
|
|
3479
|
+
return 0;
|
|
3480
|
+
}
|
|
3481
|
+
let sumSquares = 0;
|
|
3482
|
+
for (const sample of samples) {
|
|
3483
|
+
const normalized = sample / 32768;
|
|
3484
|
+
sumSquares += normalized * normalized;
|
|
3485
|
+
}
|
|
3486
|
+
return Math.sqrt(sumSquares / samples.length);
|
|
3487
|
+
};
|
|
3488
|
+
var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
|
|
3489
|
+
var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
|
|
3490
|
+
var selectPreferredTranscriptText = (currentText, nextText) => {
|
|
3491
|
+
const current = normalizeText(currentText);
|
|
3492
|
+
const next = normalizeText(nextText);
|
|
3493
|
+
if (!current) {
|
|
3494
|
+
return next;
|
|
3495
|
+
}
|
|
3496
|
+
if (!next) {
|
|
3497
|
+
return current;
|
|
3498
|
+
}
|
|
3499
|
+
if (current === next || current.includes(next)) {
|
|
3500
|
+
return current;
|
|
3501
|
+
}
|
|
3502
|
+
if (next.includes(current)) {
|
|
3503
|
+
return next;
|
|
3504
|
+
}
|
|
3505
|
+
if (countWords(next) > countWords(current)) {
|
|
3506
|
+
return next;
|
|
3507
|
+
}
|
|
3508
|
+
if (countWords(next) === countWords(current) && next.length > current.length) {
|
|
3509
|
+
return next;
|
|
3510
|
+
}
|
|
3511
|
+
return current;
|
|
3512
|
+
};
|
|
3513
|
+
var mergeSequentialTranscriptText = (currentText, nextText) => {
|
|
3514
|
+
const current = normalizeText(currentText);
|
|
3515
|
+
const next = normalizeText(nextText);
|
|
3516
|
+
if (!current) {
|
|
3517
|
+
return next;
|
|
3518
|
+
}
|
|
3519
|
+
if (!next) {
|
|
3520
|
+
return current;
|
|
3521
|
+
}
|
|
3522
|
+
const currentWords = current.split(" ");
|
|
3523
|
+
const nextWords = next.split(" ");
|
|
3524
|
+
const maxOverlap = Math.min(currentWords.length, nextWords.length);
|
|
3525
|
+
for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
|
|
3526
|
+
const currentSuffix = currentWords.slice(-overlap).join(" ");
|
|
3527
|
+
const nextPrefix = nextWords.slice(0, overlap).join(" ");
|
|
3528
|
+
if (currentSuffix === nextPrefix) {
|
|
3529
|
+
return [...currentWords, ...nextWords.slice(overlap)].join(" ");
|
|
3530
|
+
}
|
|
3531
|
+
}
|
|
3532
|
+
return `${current} ${next}`.trim();
|
|
3533
|
+
};
|
|
3534
|
+
var countCommonPrefixWords = (currentText, nextText) => {
|
|
3535
|
+
const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
|
|
3536
|
+
const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
|
|
3537
|
+
const maxWords = Math.min(currentWords.length, nextWords.length);
|
|
3538
|
+
let count = 0;
|
|
3539
|
+
for (let index = 0;index < maxWords; index += 1) {
|
|
3540
|
+
if (currentWords[index] !== nextWords[index]) {
|
|
3541
|
+
break;
|
|
3542
|
+
}
|
|
3543
|
+
count += 1;
|
|
3544
|
+
}
|
|
3545
|
+
return count;
|
|
3546
|
+
};
|
|
3547
|
+
var mergeTranscriptTexts = (transcripts) => {
|
|
3548
|
+
const merged = [];
|
|
3549
|
+
for (const transcript of transcripts) {
|
|
3550
|
+
const nextText = normalizeText(transcript.text);
|
|
3551
|
+
if (!nextText) {
|
|
3552
|
+
continue;
|
|
3553
|
+
}
|
|
3554
|
+
const previous = merged.at(-1);
|
|
3555
|
+
if (!previous) {
|
|
3556
|
+
merged.push(nextText);
|
|
3557
|
+
continue;
|
|
3558
|
+
}
|
|
3559
|
+
if (nextText === previous || previous.includes(nextText)) {
|
|
3560
|
+
continue;
|
|
3561
|
+
}
|
|
3562
|
+
if (nextText.includes(previous)) {
|
|
3563
|
+
merged[merged.length - 1] = nextText;
|
|
3564
|
+
continue;
|
|
3565
|
+
}
|
|
3566
|
+
merged.push(nextText);
|
|
3567
|
+
}
|
|
3568
|
+
return merged.join(" ").trim();
|
|
3569
|
+
};
|
|
3570
|
+
var buildTurnText = (transcripts, partialText, options = {}) => {
|
|
3571
|
+
const finalText = mergeTranscriptTexts(transcripts);
|
|
3572
|
+
const nextPartial = normalizeText(partialText);
|
|
3573
|
+
const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
|
|
3574
|
+
if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
|
|
3575
|
+
return mergeSequentialTranscriptText(finalText, nextPartial);
|
|
3576
|
+
}
|
|
3577
|
+
return selectPreferredTranscriptText(finalText, nextPartial);
|
|
3578
|
+
};
|
|
3579
|
+
|
|
3583
3580
|
// src/core/types.ts
|
|
3584
3581
|
var ttsAdapterSessionCanCancel = (session) => typeof session.cancel === "function";
|
|
3585
3582
|
|
|
@@ -3926,14 +3923,22 @@ var createVoiceSession = (options) => {
|
|
|
3926
3923
|
strategy: options.reconnect.strategy ?? "resume-last-turn",
|
|
3927
3924
|
timeout: options.reconnect.timeout ?? DEFAULT_RECONNECT_TIMEOUT
|
|
3928
3925
|
};
|
|
3926
|
+
const resolvedSilenceMs = options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS;
|
|
3929
3927
|
const turnDetection = {
|
|
3930
|
-
silenceMs:
|
|
3928
|
+
silenceMs: resolvedSilenceMs,
|
|
3929
|
+
minSilenceMs: Math.min(resolvedSilenceMs, options.turnDetection.minSilenceMs ?? DEFAULT_MIN_SILENCE_MS),
|
|
3931
3930
|
speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
|
|
3932
|
-
transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
|
|
3933
|
-
|
|
3934
|
-
|
|
3931
|
+
transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
|
|
3932
|
+
};
|
|
3933
|
+
let lastTurnCompleteConfidence = null;
|
|
3934
|
+
const adaptiveSilenceMs = () => {
|
|
3935
|
+
const { minSilenceMs, silenceMs } = turnDetection;
|
|
3936
|
+
if (lastTurnCompleteConfidence === null || silenceMs <= minSilenceMs) {
|
|
3937
|
+
return silenceMs;
|
|
3938
|
+
}
|
|
3939
|
+
const complete = Math.max(0, Math.min(1, lastTurnCompleteConfidence));
|
|
3940
|
+
return Math.round(minSilenceMs + (silenceMs - minSilenceMs) * (1 - complete));
|
|
3935
3941
|
};
|
|
3936
|
-
let semanticVetoElapsedMs = 0;
|
|
3937
3942
|
const sttFallback = options.sttFallback ? {
|
|
3938
3943
|
adapter: options.sttFallback.adapter,
|
|
3939
3944
|
completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
|
|
@@ -4466,47 +4471,8 @@ var createVoiceSession = (options) => {
|
|
|
4466
4471
|
runScheduledCommit(reason);
|
|
4467
4472
|
}, delayMs);
|
|
4468
4473
|
};
|
|
4469
|
-
const scheduleSilenceCommit = (delayMs =
|
|
4470
|
-
const shouldDeferSilenceCommit = async (reason) => {
|
|
4471
|
-
if (reason !== "silence" || turnDetection.semanticVetoMaxMs <= 0 || !options.semanticTurnDetector || semanticVetoElapsedMs >= turnDetection.semanticVetoMaxMs) {
|
|
4472
|
-
return false;
|
|
4473
|
-
}
|
|
4474
|
-
const session = await readSession();
|
|
4475
|
-
const { partialText, transcripts } = session.currentTurn;
|
|
4476
|
-
const userText = buildTurnText(transcripts, partialText, {
|
|
4477
|
-
partialEndedAtMs: session.currentTurn.partialEndedAt,
|
|
4478
|
-
partialStartedAtMs: session.currentTurn.partialStartedAt
|
|
4479
|
-
});
|
|
4480
|
-
if (!userText) {
|
|
4481
|
-
return false;
|
|
4482
|
-
}
|
|
4483
|
-
const silenceMs = session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : turnDetection.silenceMs;
|
|
4484
|
-
let endOfTurn = true;
|
|
4485
|
-
try {
|
|
4486
|
-
const verdict = await Promise.resolve(options.semanticTurnDetector.evaluate({
|
|
4487
|
-
lastFinalTranscript: transcripts.at(-1),
|
|
4488
|
-
partialText,
|
|
4489
|
-
silenceMs,
|
|
4490
|
-
transcripts,
|
|
4491
|
-
...getTurnAudioForDetector()
|
|
4492
|
-
}));
|
|
4493
|
-
endOfTurn = verdict.endOfTurn;
|
|
4494
|
-
} catch {
|
|
4495
|
-
return false;
|
|
4496
|
-
}
|
|
4497
|
-
if (endOfTurn !== false) {
|
|
4498
|
-
return false;
|
|
4499
|
-
}
|
|
4500
|
-
const remaining = turnDetection.semanticVetoMaxMs - semanticVetoElapsedMs;
|
|
4501
|
-
const extendMs = Math.max(1, Math.min(turnDetection.semanticVetoRecheckMs, remaining));
|
|
4502
|
-
semanticVetoElapsedMs += extendMs;
|
|
4503
|
-
scheduleTurnCommit(extendMs, reason);
|
|
4504
|
-
return true;
|
|
4505
|
-
};
|
|
4474
|
+
const scheduleSilenceCommit = (delayMs = adaptiveSilenceMs(), reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
|
|
4506
4475
|
const runScheduledCommit = async (reason) => {
|
|
4507
|
-
if (await shouldDeferSilenceCommit(reason)) {
|
|
4508
|
-
return;
|
|
4509
|
-
}
|
|
4510
4476
|
await api.commitTurn(reason);
|
|
4511
4477
|
};
|
|
4512
4478
|
const requestTurnCommit = async (reason) => {
|
|
@@ -5246,7 +5212,7 @@ var createVoiceSession = (options) => {
|
|
|
5246
5212
|
session2.lastActivityAt = Date.now();
|
|
5247
5213
|
session2.status = "active";
|
|
5248
5214
|
});
|
|
5249
|
-
|
|
5215
|
+
lastTurnCompleteConfidence = null;
|
|
5250
5216
|
if (silenceTimer && pendingCommitReason === "vendor") {
|
|
5251
5217
|
scheduleTurnCommit(getVendorCommitDelayMs(), "vendor");
|
|
5252
5218
|
}
|
|
@@ -5276,6 +5242,12 @@ var createVoiceSession = (options) => {
|
|
|
5276
5242
|
transcripts: session.currentTurn.transcripts,
|
|
5277
5243
|
...getTurnAudioForDetector()
|
|
5278
5244
|
}));
|
|
5245
|
+
if (typeof verdict.confidence === "number") {
|
|
5246
|
+
lastTurnCompleteConfidence = verdict.confidence;
|
|
5247
|
+
if (silenceTimer && pendingCommitReason === "silence") {
|
|
5248
|
+
scheduleSilenceCommit();
|
|
5249
|
+
}
|
|
5250
|
+
}
|
|
5279
5251
|
if (verdict.endOfTurn) {
|
|
5280
5252
|
clearSilenceTimer();
|
|
5281
5253
|
await requestTurnCommit("vendor");
|
|
@@ -5971,7 +5943,7 @@ var createVoiceSession = (options) => {
|
|
|
5971
5943
|
};
|
|
5972
5944
|
const commitTurnInternal = async (reason = "manual") => {
|
|
5973
5945
|
clearSilenceTimer();
|
|
5974
|
-
|
|
5946
|
+
lastTurnCompleteConfidence = null;
|
|
5975
5947
|
backchannelDriver?.reset();
|
|
5976
5948
|
amdLastTurnCommitAt = Date.now();
|
|
5977
5949
|
const session = await readSession();
|
|
@@ -42574,16 +42546,18 @@ var createVoiceConfiguration = (configuration) => configuration;
|
|
|
42574
42546
|
var DEFAULT_SPEECH_THRESHOLD2 = 0.015;
|
|
42575
42547
|
var DEFAULT_SILENCE_MS2 = 700;
|
|
42576
42548
|
var DEFAULT_TRANSCRIPT_STABILITY_MS2 = 200;
|
|
42577
|
-
var
|
|
42578
|
-
var resolveTurnDetection = (input) =>
|
|
42579
|
-
|
|
42580
|
-
|
|
42581
|
-
|
|
42582
|
-
|
|
42583
|
-
|
|
42584
|
-
|
|
42585
|
-
|
|
42586
|
-
|
|
42549
|
+
var DEFAULT_MIN_SILENCE_MS2 = 400;
|
|
42550
|
+
var resolveTurnDetection = (input) => {
|
|
42551
|
+
const silenceMs = input?.silenceMs ?? DEFAULT_SILENCE_MS2;
|
|
42552
|
+
return {
|
|
42553
|
+
profile: input?.profile ?? "balanced",
|
|
42554
|
+
qualityProfile: input?.qualityProfile ?? "general",
|
|
42555
|
+
minSilenceMs: Math.min(silenceMs, input?.minSilenceMs ?? DEFAULT_MIN_SILENCE_MS2),
|
|
42556
|
+
silenceMs,
|
|
42557
|
+
speechThreshold: input?.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD2,
|
|
42558
|
+
transcriptStabilityMs: input?.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS2
|
|
42559
|
+
};
|
|
42560
|
+
};
|
|
42587
42561
|
var resolveReconnect = (input) => ({
|
|
42588
42562
|
maxAttempts: input?.maxAttempts ?? 3,
|
|
42589
42563
|
strategy: input?.strategy ?? "resume-last-turn",
|
package/dist/react/index.js
CHANGED
|
@@ -12272,146 +12272,25 @@ var resolveAudioConditioningConfig = (config) => {
|
|
|
12272
12272
|
};
|
|
12273
12273
|
};
|
|
12274
12274
|
|
|
12275
|
-
// src/core/turnDetection.ts
|
|
12276
|
-
var DEFAULT_SILENCE_MS = 700;
|
|
12277
|
-
var DEFAULT_SPEECH_THRESHOLD = 0.015;
|
|
12278
|
-
var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
|
|
12279
|
-
var toUint8Array = (audio) => {
|
|
12280
|
-
if (audio instanceof ArrayBuffer) {
|
|
12281
|
-
return new Uint8Array(audio);
|
|
12282
|
-
}
|
|
12283
|
-
return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
|
|
12284
|
-
};
|
|
12285
|
-
var measureAudioLevel = (audio) => {
|
|
12286
|
-
const bytes = toUint8Array(audio);
|
|
12287
|
-
if (bytes.byteLength < 2) {
|
|
12288
|
-
return 0;
|
|
12289
|
-
}
|
|
12290
|
-
const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
|
|
12291
|
-
if (samples.length === 0) {
|
|
12292
|
-
return 0;
|
|
12293
|
-
}
|
|
12294
|
-
let sumSquares = 0;
|
|
12295
|
-
for (const sample of samples) {
|
|
12296
|
-
const normalized = sample / 32768;
|
|
12297
|
-
sumSquares += normalized * normalized;
|
|
12298
|
-
}
|
|
12299
|
-
return Math.sqrt(sumSquares / samples.length);
|
|
12300
|
-
};
|
|
12301
|
-
var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
|
|
12302
|
-
var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
|
|
12303
|
-
var selectPreferredTranscriptText = (currentText, nextText) => {
|
|
12304
|
-
const current = normalizeText(currentText);
|
|
12305
|
-
const next = normalizeText(nextText);
|
|
12306
|
-
if (!current) {
|
|
12307
|
-
return next;
|
|
12308
|
-
}
|
|
12309
|
-
if (!next) {
|
|
12310
|
-
return current;
|
|
12311
|
-
}
|
|
12312
|
-
if (current === next || current.includes(next)) {
|
|
12313
|
-
return current;
|
|
12314
|
-
}
|
|
12315
|
-
if (next.includes(current)) {
|
|
12316
|
-
return next;
|
|
12317
|
-
}
|
|
12318
|
-
if (countWords(next) > countWords(current)) {
|
|
12319
|
-
return next;
|
|
12320
|
-
}
|
|
12321
|
-
if (countWords(next) === countWords(current) && next.length > current.length) {
|
|
12322
|
-
return next;
|
|
12323
|
-
}
|
|
12324
|
-
return current;
|
|
12325
|
-
};
|
|
12326
|
-
var mergeSequentialTranscriptText = (currentText, nextText) => {
|
|
12327
|
-
const current = normalizeText(currentText);
|
|
12328
|
-
const next = normalizeText(nextText);
|
|
12329
|
-
if (!current) {
|
|
12330
|
-
return next;
|
|
12331
|
-
}
|
|
12332
|
-
if (!next) {
|
|
12333
|
-
return current;
|
|
12334
|
-
}
|
|
12335
|
-
const currentWords = current.split(" ");
|
|
12336
|
-
const nextWords = next.split(" ");
|
|
12337
|
-
const maxOverlap = Math.min(currentWords.length, nextWords.length);
|
|
12338
|
-
for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
|
|
12339
|
-
const currentSuffix = currentWords.slice(-overlap).join(" ");
|
|
12340
|
-
const nextPrefix = nextWords.slice(0, overlap).join(" ");
|
|
12341
|
-
if (currentSuffix === nextPrefix) {
|
|
12342
|
-
return [...currentWords, ...nextWords.slice(overlap)].join(" ");
|
|
12343
|
-
}
|
|
12344
|
-
}
|
|
12345
|
-
return `${current} ${next}`.trim();
|
|
12346
|
-
};
|
|
12347
|
-
var countCommonPrefixWords = (currentText, nextText) => {
|
|
12348
|
-
const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
|
|
12349
|
-
const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
|
|
12350
|
-
const maxWords = Math.min(currentWords.length, nextWords.length);
|
|
12351
|
-
let count = 0;
|
|
12352
|
-
for (let index = 0;index < maxWords; index += 1) {
|
|
12353
|
-
if (currentWords[index] !== nextWords[index]) {
|
|
12354
|
-
break;
|
|
12355
|
-
}
|
|
12356
|
-
count += 1;
|
|
12357
|
-
}
|
|
12358
|
-
return count;
|
|
12359
|
-
};
|
|
12360
|
-
var mergeTranscriptTexts = (transcripts) => {
|
|
12361
|
-
const merged = [];
|
|
12362
|
-
for (const transcript of transcripts) {
|
|
12363
|
-
const nextText = normalizeText(transcript.text);
|
|
12364
|
-
if (!nextText) {
|
|
12365
|
-
continue;
|
|
12366
|
-
}
|
|
12367
|
-
const previous = merged.at(-1);
|
|
12368
|
-
if (!previous) {
|
|
12369
|
-
merged.push(nextText);
|
|
12370
|
-
continue;
|
|
12371
|
-
}
|
|
12372
|
-
if (nextText === previous || previous.includes(nextText)) {
|
|
12373
|
-
continue;
|
|
12374
|
-
}
|
|
12375
|
-
if (nextText.includes(previous)) {
|
|
12376
|
-
merged[merged.length - 1] = nextText;
|
|
12377
|
-
continue;
|
|
12378
|
-
}
|
|
12379
|
-
merged.push(nextText);
|
|
12380
|
-
}
|
|
12381
|
-
return merged.join(" ").trim();
|
|
12382
|
-
};
|
|
12383
|
-
var buildTurnText = (transcripts, partialText, options = {}) => {
|
|
12384
|
-
const finalText = mergeTranscriptTexts(transcripts);
|
|
12385
|
-
const nextPartial = normalizeText(partialText);
|
|
12386
|
-
const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
|
|
12387
|
-
if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
|
|
12388
|
-
return mergeSequentialTranscriptText(finalText, nextPartial);
|
|
12389
|
-
}
|
|
12390
|
-
return selectPreferredTranscriptText(finalText, nextPartial);
|
|
12391
|
-
};
|
|
12392
|
-
|
|
12393
12275
|
// src/core/turnProfiles.ts
|
|
12394
12276
|
var TURN_PROFILE_DEFAULTS = {
|
|
12395
12277
|
balanced: {
|
|
12396
12278
|
qualityProfile: "general",
|
|
12397
|
-
|
|
12398
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
12279
|
+
minSilenceMs: 400,
|
|
12399
12280
|
silenceMs: 1400,
|
|
12400
12281
|
speechThreshold: 0.012,
|
|
12401
12282
|
transcriptStabilityMs: 1000
|
|
12402
12283
|
},
|
|
12403
12284
|
fast: {
|
|
12404
12285
|
qualityProfile: "general",
|
|
12405
|
-
|
|
12406
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
12286
|
+
minSilenceMs: 300,
|
|
12407
12287
|
silenceMs: 700,
|
|
12408
12288
|
speechThreshold: 0.015,
|
|
12409
12289
|
transcriptStabilityMs: 450
|
|
12410
12290
|
},
|
|
12411
12291
|
"long-form": {
|
|
12412
12292
|
qualityProfile: "general",
|
|
12413
|
-
|
|
12414
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
12293
|
+
minSilenceMs: 600,
|
|
12415
12294
|
silenceMs: 2200,
|
|
12416
12295
|
speechThreshold: 0.01,
|
|
12417
12296
|
transcriptStabilityMs: 1500
|
|
@@ -12442,12 +12321,12 @@ var resolveTurnDetectionConfig = (config) => {
|
|
|
12442
12321
|
const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
|
|
12443
12322
|
const preset = TURN_PROFILE_DEFAULTS[profile];
|
|
12444
12323
|
const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
|
|
12324
|
+
const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
|
|
12445
12325
|
return {
|
|
12446
12326
|
profile,
|
|
12447
12327
|
qualityProfile,
|
|
12448
|
-
|
|
12449
|
-
|
|
12450
|
-
silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
|
|
12328
|
+
minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
|
|
12329
|
+
silenceMs,
|
|
12451
12330
|
speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
|
|
12452
12331
|
transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
|
|
12453
12332
|
};
|