@absolutejs/voice 0.0.22-beta.584 → 0.0.22-beta.586
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/angular/index.js +126 -0
- package/dist/client/htmxBootstrap.js +11 -0
- package/dist/client/index.js +126 -0
- package/dist/core/turnDetection.d.ts +1 -0
- package/dist/core/types.d.ts +4 -0
- package/dist/embed/index.js +11 -0
- package/dist/embed/voice-widget.js +8 -8
- package/dist/index.js +203 -119
- package/dist/react/index.js +126 -0
- package/dist/svelte/index.js +126 -0
- package/dist/testing/index.js +83 -2
- package/dist/vue/index.js +126 -0
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -391,22 +391,146 @@ var resolveLogger = (logger) => ({
|
|
|
391
391
|
...logger
|
|
392
392
|
});
|
|
393
393
|
|
|
394
|
+
// src/core/turnDetection.ts
|
|
395
|
+
var DEFAULT_SILENCE_MS = 700;
|
|
396
|
+
var DEFAULT_SPEECH_THRESHOLD = 0.015;
|
|
397
|
+
var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
|
|
398
|
+
var toUint8Array = (audio) => {
|
|
399
|
+
if (audio instanceof ArrayBuffer) {
|
|
400
|
+
return new Uint8Array(audio);
|
|
401
|
+
}
|
|
402
|
+
return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
|
|
403
|
+
};
|
|
404
|
+
var measureAudioLevel = (audio) => {
|
|
405
|
+
const bytes = toUint8Array(audio);
|
|
406
|
+
if (bytes.byteLength < 2) {
|
|
407
|
+
return 0;
|
|
408
|
+
}
|
|
409
|
+
const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
|
|
410
|
+
if (samples.length === 0) {
|
|
411
|
+
return 0;
|
|
412
|
+
}
|
|
413
|
+
let sumSquares = 0;
|
|
414
|
+
for (const sample of samples) {
|
|
415
|
+
const normalized = sample / 32768;
|
|
416
|
+
sumSquares += normalized * normalized;
|
|
417
|
+
}
|
|
418
|
+
return Math.sqrt(sumSquares / samples.length);
|
|
419
|
+
};
|
|
420
|
+
var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
|
|
421
|
+
var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
|
|
422
|
+
var selectPreferredTranscriptText = (currentText, nextText) => {
|
|
423
|
+
const current = normalizeText(currentText);
|
|
424
|
+
const next = normalizeText(nextText);
|
|
425
|
+
if (!current) {
|
|
426
|
+
return next;
|
|
427
|
+
}
|
|
428
|
+
if (!next) {
|
|
429
|
+
return current;
|
|
430
|
+
}
|
|
431
|
+
if (current === next || current.includes(next)) {
|
|
432
|
+
return current;
|
|
433
|
+
}
|
|
434
|
+
if (next.includes(current)) {
|
|
435
|
+
return next;
|
|
436
|
+
}
|
|
437
|
+
if (countWords(next) > countWords(current)) {
|
|
438
|
+
return next;
|
|
439
|
+
}
|
|
440
|
+
if (countWords(next) === countWords(current) && next.length > current.length) {
|
|
441
|
+
return next;
|
|
442
|
+
}
|
|
443
|
+
return current;
|
|
444
|
+
};
|
|
445
|
+
var mergeSequentialTranscriptText = (currentText, nextText) => {
|
|
446
|
+
const current = normalizeText(currentText);
|
|
447
|
+
const next = normalizeText(nextText);
|
|
448
|
+
if (!current) {
|
|
449
|
+
return next;
|
|
450
|
+
}
|
|
451
|
+
if (!next) {
|
|
452
|
+
return current;
|
|
453
|
+
}
|
|
454
|
+
const currentWords = current.split(" ");
|
|
455
|
+
const nextWords = next.split(" ");
|
|
456
|
+
const maxOverlap = Math.min(currentWords.length, nextWords.length);
|
|
457
|
+
for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
|
|
458
|
+
const currentSuffix = currentWords.slice(-overlap).join(" ");
|
|
459
|
+
const nextPrefix = nextWords.slice(0, overlap).join(" ");
|
|
460
|
+
if (currentSuffix === nextPrefix) {
|
|
461
|
+
return [...currentWords, ...nextWords.slice(overlap)].join(" ");
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
return `${current} ${next}`.trim();
|
|
465
|
+
};
|
|
466
|
+
var countCommonPrefixWords = (currentText, nextText) => {
|
|
467
|
+
const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
|
|
468
|
+
const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
|
|
469
|
+
const maxWords = Math.min(currentWords.length, nextWords.length);
|
|
470
|
+
let count = 0;
|
|
471
|
+
for (let index = 0;index < maxWords; index += 1) {
|
|
472
|
+
if (currentWords[index] !== nextWords[index]) {
|
|
473
|
+
break;
|
|
474
|
+
}
|
|
475
|
+
count += 1;
|
|
476
|
+
}
|
|
477
|
+
return count;
|
|
478
|
+
};
|
|
479
|
+
var mergeTranscriptTexts = (transcripts) => {
|
|
480
|
+
const merged = [];
|
|
481
|
+
for (const transcript of transcripts) {
|
|
482
|
+
const nextText = normalizeText(transcript.text);
|
|
483
|
+
if (!nextText) {
|
|
484
|
+
continue;
|
|
485
|
+
}
|
|
486
|
+
const previous = merged.at(-1);
|
|
487
|
+
if (!previous) {
|
|
488
|
+
merged.push(nextText);
|
|
489
|
+
continue;
|
|
490
|
+
}
|
|
491
|
+
if (nextText === previous || previous.includes(nextText)) {
|
|
492
|
+
continue;
|
|
493
|
+
}
|
|
494
|
+
if (nextText.includes(previous)) {
|
|
495
|
+
merged[merged.length - 1] = nextText;
|
|
496
|
+
continue;
|
|
497
|
+
}
|
|
498
|
+
merged.push(nextText);
|
|
499
|
+
}
|
|
500
|
+
return merged.join(" ").trim();
|
|
501
|
+
};
|
|
502
|
+
var buildTurnText = (transcripts, partialText, options = {}) => {
|
|
503
|
+
const finalText = mergeTranscriptTexts(transcripts);
|
|
504
|
+
const nextPartial = normalizeText(partialText);
|
|
505
|
+
const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
|
|
506
|
+
if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
|
|
507
|
+
return mergeSequentialTranscriptText(finalText, nextPartial);
|
|
508
|
+
}
|
|
509
|
+
return selectPreferredTranscriptText(finalText, nextPartial);
|
|
510
|
+
};
|
|
511
|
+
|
|
394
512
|
// src/core/turnProfiles.ts
|
|
395
513
|
var TURN_PROFILE_DEFAULTS = {
|
|
396
514
|
balanced: {
|
|
397
515
|
qualityProfile: "general",
|
|
516
|
+
semanticVetoMaxMs: 0,
|
|
517
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
398
518
|
silenceMs: 1400,
|
|
399
519
|
speechThreshold: 0.012,
|
|
400
520
|
transcriptStabilityMs: 1000
|
|
401
521
|
},
|
|
402
522
|
fast: {
|
|
403
523
|
qualityProfile: "general",
|
|
524
|
+
semanticVetoMaxMs: 0,
|
|
525
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
404
526
|
silenceMs: 700,
|
|
405
527
|
speechThreshold: 0.015,
|
|
406
528
|
transcriptStabilityMs: 450
|
|
407
529
|
},
|
|
408
530
|
"long-form": {
|
|
409
531
|
qualityProfile: "general",
|
|
532
|
+
semanticVetoMaxMs: 0,
|
|
533
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
410
534
|
silenceMs: 2200,
|
|
411
535
|
speechThreshold: 0.01,
|
|
412
536
|
transcriptStabilityMs: 1500
|
|
@@ -440,6 +564,8 @@ var resolveTurnDetectionConfig = (config) => {
|
|
|
440
564
|
return {
|
|
441
565
|
profile,
|
|
442
566
|
qualityProfile,
|
|
567
|
+
semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
|
|
568
|
+
semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
|
|
443
569
|
silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
|
|
444
570
|
speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
|
|
445
571
|
transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
|
|
@@ -3454,123 +3580,6 @@ var createVoiceTwilioRedirectHandoffAdapter = (options) => ({
|
|
|
3454
3580
|
}
|
|
3455
3581
|
});
|
|
3456
3582
|
|
|
3457
|
-
// src/core/turnDetection.ts
|
|
3458
|
-
var DEFAULT_SILENCE_MS = 700;
|
|
3459
|
-
var DEFAULT_SPEECH_THRESHOLD = 0.015;
|
|
3460
|
-
var toUint8Array = (audio) => {
|
|
3461
|
-
if (audio instanceof ArrayBuffer) {
|
|
3462
|
-
return new Uint8Array(audio);
|
|
3463
|
-
}
|
|
3464
|
-
return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
|
|
3465
|
-
};
|
|
3466
|
-
var measureAudioLevel = (audio) => {
|
|
3467
|
-
const bytes = toUint8Array(audio);
|
|
3468
|
-
if (bytes.byteLength < 2) {
|
|
3469
|
-
return 0;
|
|
3470
|
-
}
|
|
3471
|
-
const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
|
|
3472
|
-
if (samples.length === 0) {
|
|
3473
|
-
return 0;
|
|
3474
|
-
}
|
|
3475
|
-
let sumSquares = 0;
|
|
3476
|
-
for (const sample of samples) {
|
|
3477
|
-
const normalized = sample / 32768;
|
|
3478
|
-
sumSquares += normalized * normalized;
|
|
3479
|
-
}
|
|
3480
|
-
return Math.sqrt(sumSquares / samples.length);
|
|
3481
|
-
};
|
|
3482
|
-
var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
|
|
3483
|
-
var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
|
|
3484
|
-
var selectPreferredTranscriptText = (currentText, nextText) => {
|
|
3485
|
-
const current = normalizeText(currentText);
|
|
3486
|
-
const next = normalizeText(nextText);
|
|
3487
|
-
if (!current) {
|
|
3488
|
-
return next;
|
|
3489
|
-
}
|
|
3490
|
-
if (!next) {
|
|
3491
|
-
return current;
|
|
3492
|
-
}
|
|
3493
|
-
if (current === next || current.includes(next)) {
|
|
3494
|
-
return current;
|
|
3495
|
-
}
|
|
3496
|
-
if (next.includes(current)) {
|
|
3497
|
-
return next;
|
|
3498
|
-
}
|
|
3499
|
-
if (countWords(next) > countWords(current)) {
|
|
3500
|
-
return next;
|
|
3501
|
-
}
|
|
3502
|
-
if (countWords(next) === countWords(current) && next.length > current.length) {
|
|
3503
|
-
return next;
|
|
3504
|
-
}
|
|
3505
|
-
return current;
|
|
3506
|
-
};
|
|
3507
|
-
var mergeSequentialTranscriptText = (currentText, nextText) => {
|
|
3508
|
-
const current = normalizeText(currentText);
|
|
3509
|
-
const next = normalizeText(nextText);
|
|
3510
|
-
if (!current) {
|
|
3511
|
-
return next;
|
|
3512
|
-
}
|
|
3513
|
-
if (!next) {
|
|
3514
|
-
return current;
|
|
3515
|
-
}
|
|
3516
|
-
const currentWords = current.split(" ");
|
|
3517
|
-
const nextWords = next.split(" ");
|
|
3518
|
-
const maxOverlap = Math.min(currentWords.length, nextWords.length);
|
|
3519
|
-
for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
|
|
3520
|
-
const currentSuffix = currentWords.slice(-overlap).join(" ");
|
|
3521
|
-
const nextPrefix = nextWords.slice(0, overlap).join(" ");
|
|
3522
|
-
if (currentSuffix === nextPrefix) {
|
|
3523
|
-
return [...currentWords, ...nextWords.slice(overlap)].join(" ");
|
|
3524
|
-
}
|
|
3525
|
-
}
|
|
3526
|
-
return `${current} ${next}`.trim();
|
|
3527
|
-
};
|
|
3528
|
-
var countCommonPrefixWords = (currentText, nextText) => {
|
|
3529
|
-
const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
|
|
3530
|
-
const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
|
|
3531
|
-
const maxWords = Math.min(currentWords.length, nextWords.length);
|
|
3532
|
-
let count = 0;
|
|
3533
|
-
for (let index = 0;index < maxWords; index += 1) {
|
|
3534
|
-
if (currentWords[index] !== nextWords[index]) {
|
|
3535
|
-
break;
|
|
3536
|
-
}
|
|
3537
|
-
count += 1;
|
|
3538
|
-
}
|
|
3539
|
-
return count;
|
|
3540
|
-
};
|
|
3541
|
-
var mergeTranscriptTexts = (transcripts) => {
|
|
3542
|
-
const merged = [];
|
|
3543
|
-
for (const transcript of transcripts) {
|
|
3544
|
-
const nextText = normalizeText(transcript.text);
|
|
3545
|
-
if (!nextText) {
|
|
3546
|
-
continue;
|
|
3547
|
-
}
|
|
3548
|
-
const previous = merged.at(-1);
|
|
3549
|
-
if (!previous) {
|
|
3550
|
-
merged.push(nextText);
|
|
3551
|
-
continue;
|
|
3552
|
-
}
|
|
3553
|
-
if (nextText === previous || previous.includes(nextText)) {
|
|
3554
|
-
continue;
|
|
3555
|
-
}
|
|
3556
|
-
if (nextText.includes(previous)) {
|
|
3557
|
-
merged[merged.length - 1] = nextText;
|
|
3558
|
-
continue;
|
|
3559
|
-
}
|
|
3560
|
-
merged.push(nextText);
|
|
3561
|
-
}
|
|
3562
|
-
return merged.join(" ").trim();
|
|
3563
|
-
};
|
|
3564
|
-
var buildTurnText = (transcripts, partialText, options = {}) => {
|
|
3565
|
-
const finalText = mergeTranscriptTexts(transcripts);
|
|
3566
|
-
const nextPartial = normalizeText(partialText);
|
|
3567
|
-
const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
|
|
3568
|
-
if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
|
|
3569
|
-
return mergeSequentialTranscriptText(finalText, nextPartial);
|
|
3570
|
-
}
|
|
3571
|
-
return selectPreferredTranscriptText(finalText, nextPartial);
|
|
3572
|
-
};
|
|
3573
|
-
|
|
3574
3583
|
// src/core/types.ts
|
|
3575
3584
|
var ttsAdapterSessionCanCancel = (session) => typeof session.cancel === "function";
|
|
3576
3585
|
|
|
@@ -3712,6 +3721,8 @@ var FALLBACK_CONFIDENCE_SELECTION_DELTA = 0.05;
|
|
|
3712
3721
|
var FALLBACK_WORD_COUNT_SELECTION_MARGIN_RATIO = 0.12;
|
|
3713
3722
|
var EXTENDED_VENDOR_COMMIT_SILENCE_THRESHOLD_MS = 200;
|
|
3714
3723
|
var MAX_VENDOR_COMMIT_GRACE_MS = 1200;
|
|
3724
|
+
var STT_RECONNECT_FLAP_WINDOW_MS = 4000;
|
|
3725
|
+
var MAX_STT_RECONNECTS_IN_FLAP_WINDOW = 3;
|
|
3715
3726
|
var DEFAULT_FORMAT = {
|
|
3716
3727
|
channels: 1,
|
|
3717
3728
|
container: "raw",
|
|
@@ -3907,8 +3918,11 @@ var createVoiceSession = (options) => {
|
|
|
3907
3918
|
const turnDetection = {
|
|
3908
3919
|
silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
|
|
3909
3920
|
speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
|
|
3910
|
-
transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
|
|
3921
|
+
transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS,
|
|
3922
|
+
semanticVetoMaxMs: options.turnDetection.semanticVetoMaxMs ?? 0,
|
|
3923
|
+
semanticVetoRecheckMs: options.turnDetection.semanticVetoRecheckMs ?? DEFAULT_SEMANTIC_VETO_RECHECK_MS
|
|
3911
3924
|
};
|
|
3925
|
+
let semanticVetoElapsedMs = 0;
|
|
3912
3926
|
const sttFallback = options.sttFallback ? {
|
|
3913
3927
|
adapter: options.sttFallback.adapter,
|
|
3914
3928
|
completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
|
|
@@ -3949,6 +3963,8 @@ var createVoiceSession = (options) => {
|
|
|
3949
3963
|
let operationQueue = Promise.resolve();
|
|
3950
3964
|
let adapterGenerationCounter = 0;
|
|
3951
3965
|
let activeAdapterGeneration = 0;
|
|
3966
|
+
let sttReconnectCount = 0;
|
|
3967
|
+
let lastSttReconnectAt = 0;
|
|
3952
3968
|
let activeTTSTurnId;
|
|
3953
3969
|
let assistantSpeechEndsAt = 0;
|
|
3954
3970
|
let lastAssistantAudioAt = 0;
|
|
@@ -4423,10 +4439,51 @@ var createVoiceSession = (options) => {
|
|
|
4423
4439
|
silenceTimer = setTimeout(() => {
|
|
4424
4440
|
silenceTimer = null;
|
|
4425
4441
|
pendingCommitReason = null;
|
|
4426
|
-
|
|
4442
|
+
runScheduledCommit(reason);
|
|
4427
4443
|
}, delayMs);
|
|
4428
4444
|
};
|
|
4429
4445
|
const scheduleSilenceCommit = (delayMs = turnDetection.silenceMs, reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
|
|
4446
|
+
const shouldDeferSilenceCommit = async (reason) => {
|
|
4447
|
+
if (reason !== "silence" || turnDetection.semanticVetoMaxMs <= 0 || !options.semanticTurnDetector || semanticVetoElapsedMs >= turnDetection.semanticVetoMaxMs) {
|
|
4448
|
+
return false;
|
|
4449
|
+
}
|
|
4450
|
+
const session = await readSession();
|
|
4451
|
+
const { partialText, transcripts } = session.currentTurn;
|
|
4452
|
+
const userText = buildTurnText(transcripts, partialText, {
|
|
4453
|
+
partialEndedAtMs: session.currentTurn.partialEndedAt,
|
|
4454
|
+
partialStartedAtMs: session.currentTurn.partialStartedAt
|
|
4455
|
+
});
|
|
4456
|
+
if (!userText) {
|
|
4457
|
+
return false;
|
|
4458
|
+
}
|
|
4459
|
+
const silenceMs = session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : turnDetection.silenceMs;
|
|
4460
|
+
let endOfTurn = true;
|
|
4461
|
+
try {
|
|
4462
|
+
const verdict = await Promise.resolve(options.semanticTurnDetector.evaluate({
|
|
4463
|
+
lastFinalTranscript: transcripts.at(-1),
|
|
4464
|
+
partialText,
|
|
4465
|
+
silenceMs,
|
|
4466
|
+
transcripts
|
|
4467
|
+
}));
|
|
4468
|
+
endOfTurn = verdict.endOfTurn;
|
|
4469
|
+
} catch {
|
|
4470
|
+
return false;
|
|
4471
|
+
}
|
|
4472
|
+
if (endOfTurn !== false) {
|
|
4473
|
+
return false;
|
|
4474
|
+
}
|
|
4475
|
+
const remaining = turnDetection.semanticVetoMaxMs - semanticVetoElapsedMs;
|
|
4476
|
+
const extendMs = Math.max(1, Math.min(turnDetection.semanticVetoRecheckMs, remaining));
|
|
4477
|
+
semanticVetoElapsedMs += extendMs;
|
|
4478
|
+
scheduleTurnCommit(extendMs, reason);
|
|
4479
|
+
return true;
|
|
4480
|
+
};
|
|
4481
|
+
const runScheduledCommit = async (reason) => {
|
|
4482
|
+
if (await shouldDeferSilenceCommit(reason)) {
|
|
4483
|
+
return;
|
|
4484
|
+
}
|
|
4485
|
+
await api.commitTurn(reason);
|
|
4486
|
+
};
|
|
4430
4487
|
const requestTurnCommit = async (reason) => {
|
|
4431
4488
|
const session = await readSession();
|
|
4432
4489
|
const text = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
|
|
@@ -4794,6 +4851,27 @@ var createVoiceSession = (options) => {
|
|
|
4794
4851
|
}
|
|
4795
4852
|
};
|
|
4796
4853
|
const handleClose = async (event) => {
|
|
4854
|
+
const session = await readSession();
|
|
4855
|
+
const callLive = session.status !== "completed" && session.status !== "failed";
|
|
4856
|
+
if (callLive && (options.stt || options.realtime)) {
|
|
4857
|
+
const now = Date.now();
|
|
4858
|
+
sttReconnectCount = now - lastSttReconnectAt < STT_RECONNECT_FLAP_WINDOW_MS ? sttReconnectCount + 1 : 1;
|
|
4859
|
+
lastSttReconnectAt = now;
|
|
4860
|
+
if (sttReconnectCount <= MAX_STT_RECONNECTS_IN_FLAP_WINDOW) {
|
|
4861
|
+
await appendTrace({
|
|
4862
|
+
payload: {
|
|
4863
|
+
action: "stt-reconnect",
|
|
4864
|
+
attempt: sttReconnectCount,
|
|
4865
|
+
reason: event.reason ?? "stt stream closed",
|
|
4866
|
+
recoverable: event.recoverable
|
|
4867
|
+
},
|
|
4868
|
+
session,
|
|
4869
|
+
type: "session.error"
|
|
4870
|
+
});
|
|
4871
|
+
await closeAdapter(event.reason ?? "stt stream closed; reconnecting");
|
|
4872
|
+
return;
|
|
4873
|
+
}
|
|
4874
|
+
}
|
|
4797
4875
|
if (event.recoverable === false) {
|
|
4798
4876
|
await failInternal(new Error(event.reason ?? "Speech-to-text session closed"));
|
|
4799
4877
|
return;
|
|
@@ -5118,6 +5196,7 @@ var createVoiceSession = (options) => {
|
|
|
5118
5196
|
});
|
|
5119
5197
|
};
|
|
5120
5198
|
const handleFinal = async (transcript) => {
|
|
5199
|
+
sttReconnectCount = 0;
|
|
5121
5200
|
const session = await writeSession((session2) => {
|
|
5122
5201
|
const alreadyPresent = session2.currentTurn.transcripts.some((existing) => existing.id === transcript.id);
|
|
5123
5202
|
if (!alreadyPresent) {
|
|
@@ -5138,6 +5217,7 @@ var createVoiceSession = (options) => {
|
|
|
5138
5217
|
session2.lastActivityAt = Date.now();
|
|
5139
5218
|
session2.status = "active";
|
|
5140
5219
|
});
|
|
5220
|
+
semanticVetoElapsedMs = 0;
|
|
5141
5221
|
if (silenceTimer && pendingCommitReason === "vendor") {
|
|
5142
5222
|
scheduleTurnCommit(getVendorCommitDelayMs(), "vendor");
|
|
5143
5223
|
}
|
|
@@ -5841,6 +5921,7 @@ var createVoiceSession = (options) => {
|
|
|
5841
5921
|
};
|
|
5842
5922
|
const commitTurnInternal = async (reason = "manual") => {
|
|
5843
5923
|
clearSilenceTimer();
|
|
5924
|
+
semanticVetoElapsedMs = 0;
|
|
5844
5925
|
backchannelDriver?.reset();
|
|
5845
5926
|
amdLastTurnCommitAt = Date.now();
|
|
5846
5927
|
const session = await readSession();
|
|
@@ -42388,9 +42469,12 @@ var createVoiceConfiguration = (configuration) => configuration;
|
|
|
42388
42469
|
var DEFAULT_SPEECH_THRESHOLD2 = 0.015;
|
|
42389
42470
|
var DEFAULT_SILENCE_MS2 = 700;
|
|
42390
42471
|
var DEFAULT_TRANSCRIPT_STABILITY_MS2 = 200;
|
|
42472
|
+
var DEFAULT_SEMANTIC_VETO_RECHECK_MS2 = 1200;
|
|
42391
42473
|
var resolveTurnDetection = (input) => ({
|
|
42392
42474
|
profile: input?.profile ?? "balanced",
|
|
42393
42475
|
qualityProfile: input?.qualityProfile ?? "general",
|
|
42476
|
+
semanticVetoMaxMs: input?.semanticVetoMaxMs ?? 0,
|
|
42477
|
+
semanticVetoRecheckMs: input?.semanticVetoRecheckMs ?? DEFAULT_SEMANTIC_VETO_RECHECK_MS2,
|
|
42394
42478
|
silenceMs: input?.silenceMs ?? DEFAULT_SILENCE_MS2,
|
|
42395
42479
|
speechThreshold: input?.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD2,
|
|
42396
42480
|
transcriptStabilityMs: input?.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS2
|
package/dist/react/index.js
CHANGED
|
@@ -12243,22 +12243,146 @@ var resolveAudioConditioningConfig = (config) => {
|
|
|
12243
12243
|
};
|
|
12244
12244
|
};
|
|
12245
12245
|
|
|
12246
|
+
// src/core/turnDetection.ts
|
|
12247
|
+
var DEFAULT_SILENCE_MS = 700;
|
|
12248
|
+
var DEFAULT_SPEECH_THRESHOLD = 0.015;
|
|
12249
|
+
var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
|
|
12250
|
+
var toUint8Array = (audio) => {
|
|
12251
|
+
if (audio instanceof ArrayBuffer) {
|
|
12252
|
+
return new Uint8Array(audio);
|
|
12253
|
+
}
|
|
12254
|
+
return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
|
|
12255
|
+
};
|
|
12256
|
+
var measureAudioLevel = (audio) => {
|
|
12257
|
+
const bytes = toUint8Array(audio);
|
|
12258
|
+
if (bytes.byteLength < 2) {
|
|
12259
|
+
return 0;
|
|
12260
|
+
}
|
|
12261
|
+
const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
|
|
12262
|
+
if (samples.length === 0) {
|
|
12263
|
+
return 0;
|
|
12264
|
+
}
|
|
12265
|
+
let sumSquares = 0;
|
|
12266
|
+
for (const sample of samples) {
|
|
12267
|
+
const normalized = sample / 32768;
|
|
12268
|
+
sumSquares += normalized * normalized;
|
|
12269
|
+
}
|
|
12270
|
+
return Math.sqrt(sumSquares / samples.length);
|
|
12271
|
+
};
|
|
12272
|
+
var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
|
|
12273
|
+
var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
|
|
12274
|
+
var selectPreferredTranscriptText = (currentText, nextText) => {
|
|
12275
|
+
const current = normalizeText(currentText);
|
|
12276
|
+
const next = normalizeText(nextText);
|
|
12277
|
+
if (!current) {
|
|
12278
|
+
return next;
|
|
12279
|
+
}
|
|
12280
|
+
if (!next) {
|
|
12281
|
+
return current;
|
|
12282
|
+
}
|
|
12283
|
+
if (current === next || current.includes(next)) {
|
|
12284
|
+
return current;
|
|
12285
|
+
}
|
|
12286
|
+
if (next.includes(current)) {
|
|
12287
|
+
return next;
|
|
12288
|
+
}
|
|
12289
|
+
if (countWords(next) > countWords(current)) {
|
|
12290
|
+
return next;
|
|
12291
|
+
}
|
|
12292
|
+
if (countWords(next) === countWords(current) && next.length > current.length) {
|
|
12293
|
+
return next;
|
|
12294
|
+
}
|
|
12295
|
+
return current;
|
|
12296
|
+
};
|
|
12297
|
+
var mergeSequentialTranscriptText = (currentText, nextText) => {
|
|
12298
|
+
const current = normalizeText(currentText);
|
|
12299
|
+
const next = normalizeText(nextText);
|
|
12300
|
+
if (!current) {
|
|
12301
|
+
return next;
|
|
12302
|
+
}
|
|
12303
|
+
if (!next) {
|
|
12304
|
+
return current;
|
|
12305
|
+
}
|
|
12306
|
+
const currentWords = current.split(" ");
|
|
12307
|
+
const nextWords = next.split(" ");
|
|
12308
|
+
const maxOverlap = Math.min(currentWords.length, nextWords.length);
|
|
12309
|
+
for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
|
|
12310
|
+
const currentSuffix = currentWords.slice(-overlap).join(" ");
|
|
12311
|
+
const nextPrefix = nextWords.slice(0, overlap).join(" ");
|
|
12312
|
+
if (currentSuffix === nextPrefix) {
|
|
12313
|
+
return [...currentWords, ...nextWords.slice(overlap)].join(" ");
|
|
12314
|
+
}
|
|
12315
|
+
}
|
|
12316
|
+
return `${current} ${next}`.trim();
|
|
12317
|
+
};
|
|
12318
|
+
var countCommonPrefixWords = (currentText, nextText) => {
|
|
12319
|
+
const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
|
|
12320
|
+
const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
|
|
12321
|
+
const maxWords = Math.min(currentWords.length, nextWords.length);
|
|
12322
|
+
let count = 0;
|
|
12323
|
+
for (let index = 0;index < maxWords; index += 1) {
|
|
12324
|
+
if (currentWords[index] !== nextWords[index]) {
|
|
12325
|
+
break;
|
|
12326
|
+
}
|
|
12327
|
+
count += 1;
|
|
12328
|
+
}
|
|
12329
|
+
return count;
|
|
12330
|
+
};
|
|
12331
|
+
var mergeTranscriptTexts = (transcripts) => {
|
|
12332
|
+
const merged = [];
|
|
12333
|
+
for (const transcript of transcripts) {
|
|
12334
|
+
const nextText = normalizeText(transcript.text);
|
|
12335
|
+
if (!nextText) {
|
|
12336
|
+
continue;
|
|
12337
|
+
}
|
|
12338
|
+
const previous = merged.at(-1);
|
|
12339
|
+
if (!previous) {
|
|
12340
|
+
merged.push(nextText);
|
|
12341
|
+
continue;
|
|
12342
|
+
}
|
|
12343
|
+
if (nextText === previous || previous.includes(nextText)) {
|
|
12344
|
+
continue;
|
|
12345
|
+
}
|
|
12346
|
+
if (nextText.includes(previous)) {
|
|
12347
|
+
merged[merged.length - 1] = nextText;
|
|
12348
|
+
continue;
|
|
12349
|
+
}
|
|
12350
|
+
merged.push(nextText);
|
|
12351
|
+
}
|
|
12352
|
+
return merged.join(" ").trim();
|
|
12353
|
+
};
|
|
12354
|
+
var buildTurnText = (transcripts, partialText, options = {}) => {
|
|
12355
|
+
const finalText = mergeTranscriptTexts(transcripts);
|
|
12356
|
+
const nextPartial = normalizeText(partialText);
|
|
12357
|
+
const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
|
|
12358
|
+
if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
|
|
12359
|
+
return mergeSequentialTranscriptText(finalText, nextPartial);
|
|
12360
|
+
}
|
|
12361
|
+
return selectPreferredTranscriptText(finalText, nextPartial);
|
|
12362
|
+
};
|
|
12363
|
+
|
|
12246
12364
|
// src/core/turnProfiles.ts
|
|
12247
12365
|
var TURN_PROFILE_DEFAULTS = {
|
|
12248
12366
|
balanced: {
|
|
12249
12367
|
qualityProfile: "general",
|
|
12368
|
+
semanticVetoMaxMs: 0,
|
|
12369
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
12250
12370
|
silenceMs: 1400,
|
|
12251
12371
|
speechThreshold: 0.012,
|
|
12252
12372
|
transcriptStabilityMs: 1000
|
|
12253
12373
|
},
|
|
12254
12374
|
fast: {
|
|
12255
12375
|
qualityProfile: "general",
|
|
12376
|
+
semanticVetoMaxMs: 0,
|
|
12377
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
12256
12378
|
silenceMs: 700,
|
|
12257
12379
|
speechThreshold: 0.015,
|
|
12258
12380
|
transcriptStabilityMs: 450
|
|
12259
12381
|
},
|
|
12260
12382
|
"long-form": {
|
|
12261
12383
|
qualityProfile: "general",
|
|
12384
|
+
semanticVetoMaxMs: 0,
|
|
12385
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
12262
12386
|
silenceMs: 2200,
|
|
12263
12387
|
speechThreshold: 0.01,
|
|
12264
12388
|
transcriptStabilityMs: 1500
|
|
@@ -12292,6 +12416,8 @@ var resolveTurnDetectionConfig = (config) => {
|
|
|
12292
12416
|
return {
|
|
12293
12417
|
profile,
|
|
12294
12418
|
qualityProfile,
|
|
12419
|
+
semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
|
|
12420
|
+
semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
|
|
12295
12421
|
silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
|
|
12296
12422
|
speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
|
|
12297
12423
|
transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
|