@absolutejs/voice 0.0.22-beta.584 → 0.0.22-beta.585
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/angular/index.js +126 -0
- package/dist/client/htmxBootstrap.js +11 -0
- package/dist/client/index.js +126 -0
- package/dist/core/turnDetection.d.ts +1 -0
- package/dist/core/types.d.ts +4 -0
- package/dist/embed/index.js +11 -0
- package/dist/embed/voice-widget.js +8 -8
- package/dist/index.js +177 -119
- package/dist/react/index.js +126 -0
- package/dist/svelte/index.js +126 -0
- package/dist/testing/index.js +57 -2
- package/dist/vue/index.js +126 -0
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -391,22 +391,146 @@ var resolveLogger = (logger) => ({
|
|
|
391
391
|
...logger
|
|
392
392
|
});
|
|
393
393
|
|
|
394
|
+
// src/core/turnDetection.ts
|
|
395
|
+
var DEFAULT_SILENCE_MS = 700;
|
|
396
|
+
var DEFAULT_SPEECH_THRESHOLD = 0.015;
|
|
397
|
+
var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
|
|
398
|
+
var toUint8Array = (audio) => {
|
|
399
|
+
if (audio instanceof ArrayBuffer) {
|
|
400
|
+
return new Uint8Array(audio);
|
|
401
|
+
}
|
|
402
|
+
return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
|
|
403
|
+
};
|
|
404
|
+
var measureAudioLevel = (audio) => {
|
|
405
|
+
const bytes = toUint8Array(audio);
|
|
406
|
+
if (bytes.byteLength < 2) {
|
|
407
|
+
return 0;
|
|
408
|
+
}
|
|
409
|
+
const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
|
|
410
|
+
if (samples.length === 0) {
|
|
411
|
+
return 0;
|
|
412
|
+
}
|
|
413
|
+
let sumSquares = 0;
|
|
414
|
+
for (const sample of samples) {
|
|
415
|
+
const normalized = sample / 32768;
|
|
416
|
+
sumSquares += normalized * normalized;
|
|
417
|
+
}
|
|
418
|
+
return Math.sqrt(sumSquares / samples.length);
|
|
419
|
+
};
|
|
420
|
+
var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
|
|
421
|
+
var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
|
|
422
|
+
var selectPreferredTranscriptText = (currentText, nextText) => {
|
|
423
|
+
const current = normalizeText(currentText);
|
|
424
|
+
const next = normalizeText(nextText);
|
|
425
|
+
if (!current) {
|
|
426
|
+
return next;
|
|
427
|
+
}
|
|
428
|
+
if (!next) {
|
|
429
|
+
return current;
|
|
430
|
+
}
|
|
431
|
+
if (current === next || current.includes(next)) {
|
|
432
|
+
return current;
|
|
433
|
+
}
|
|
434
|
+
if (next.includes(current)) {
|
|
435
|
+
return next;
|
|
436
|
+
}
|
|
437
|
+
if (countWords(next) > countWords(current)) {
|
|
438
|
+
return next;
|
|
439
|
+
}
|
|
440
|
+
if (countWords(next) === countWords(current) && next.length > current.length) {
|
|
441
|
+
return next;
|
|
442
|
+
}
|
|
443
|
+
return current;
|
|
444
|
+
};
|
|
445
|
+
var mergeSequentialTranscriptText = (currentText, nextText) => {
|
|
446
|
+
const current = normalizeText(currentText);
|
|
447
|
+
const next = normalizeText(nextText);
|
|
448
|
+
if (!current) {
|
|
449
|
+
return next;
|
|
450
|
+
}
|
|
451
|
+
if (!next) {
|
|
452
|
+
return current;
|
|
453
|
+
}
|
|
454
|
+
const currentWords = current.split(" ");
|
|
455
|
+
const nextWords = next.split(" ");
|
|
456
|
+
const maxOverlap = Math.min(currentWords.length, nextWords.length);
|
|
457
|
+
for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
|
|
458
|
+
const currentSuffix = currentWords.slice(-overlap).join(" ");
|
|
459
|
+
const nextPrefix = nextWords.slice(0, overlap).join(" ");
|
|
460
|
+
if (currentSuffix === nextPrefix) {
|
|
461
|
+
return [...currentWords, ...nextWords.slice(overlap)].join(" ");
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
return `${current} ${next}`.trim();
|
|
465
|
+
};
|
|
466
|
+
var countCommonPrefixWords = (currentText, nextText) => {
|
|
467
|
+
const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
|
|
468
|
+
const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
|
|
469
|
+
const maxWords = Math.min(currentWords.length, nextWords.length);
|
|
470
|
+
let count = 0;
|
|
471
|
+
for (let index = 0;index < maxWords; index += 1) {
|
|
472
|
+
if (currentWords[index] !== nextWords[index]) {
|
|
473
|
+
break;
|
|
474
|
+
}
|
|
475
|
+
count += 1;
|
|
476
|
+
}
|
|
477
|
+
return count;
|
|
478
|
+
};
|
|
479
|
+
var mergeTranscriptTexts = (transcripts) => {
|
|
480
|
+
const merged = [];
|
|
481
|
+
for (const transcript of transcripts) {
|
|
482
|
+
const nextText = normalizeText(transcript.text);
|
|
483
|
+
if (!nextText) {
|
|
484
|
+
continue;
|
|
485
|
+
}
|
|
486
|
+
const previous = merged.at(-1);
|
|
487
|
+
if (!previous) {
|
|
488
|
+
merged.push(nextText);
|
|
489
|
+
continue;
|
|
490
|
+
}
|
|
491
|
+
if (nextText === previous || previous.includes(nextText)) {
|
|
492
|
+
continue;
|
|
493
|
+
}
|
|
494
|
+
if (nextText.includes(previous)) {
|
|
495
|
+
merged[merged.length - 1] = nextText;
|
|
496
|
+
continue;
|
|
497
|
+
}
|
|
498
|
+
merged.push(nextText);
|
|
499
|
+
}
|
|
500
|
+
return merged.join(" ").trim();
|
|
501
|
+
};
|
|
502
|
+
var buildTurnText = (transcripts, partialText, options = {}) => {
|
|
503
|
+
const finalText = mergeTranscriptTexts(transcripts);
|
|
504
|
+
const nextPartial = normalizeText(partialText);
|
|
505
|
+
const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
|
|
506
|
+
if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
|
|
507
|
+
return mergeSequentialTranscriptText(finalText, nextPartial);
|
|
508
|
+
}
|
|
509
|
+
return selectPreferredTranscriptText(finalText, nextPartial);
|
|
510
|
+
};
|
|
511
|
+
|
|
394
512
|
// src/core/turnProfiles.ts
|
|
395
513
|
var TURN_PROFILE_DEFAULTS = {
|
|
396
514
|
balanced: {
|
|
397
515
|
qualityProfile: "general",
|
|
516
|
+
semanticVetoMaxMs: 0,
|
|
517
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
398
518
|
silenceMs: 1400,
|
|
399
519
|
speechThreshold: 0.012,
|
|
400
520
|
transcriptStabilityMs: 1000
|
|
401
521
|
},
|
|
402
522
|
fast: {
|
|
403
523
|
qualityProfile: "general",
|
|
524
|
+
semanticVetoMaxMs: 0,
|
|
525
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
404
526
|
silenceMs: 700,
|
|
405
527
|
speechThreshold: 0.015,
|
|
406
528
|
transcriptStabilityMs: 450
|
|
407
529
|
},
|
|
408
530
|
"long-form": {
|
|
409
531
|
qualityProfile: "general",
|
|
532
|
+
semanticVetoMaxMs: 0,
|
|
533
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
410
534
|
silenceMs: 2200,
|
|
411
535
|
speechThreshold: 0.01,
|
|
412
536
|
transcriptStabilityMs: 1500
|
|
@@ -440,6 +564,8 @@ var resolveTurnDetectionConfig = (config) => {
|
|
|
440
564
|
return {
|
|
441
565
|
profile,
|
|
442
566
|
qualityProfile,
|
|
567
|
+
semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
|
|
568
|
+
semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
|
|
443
569
|
silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
|
|
444
570
|
speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
|
|
445
571
|
transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
|
|
@@ -3454,123 +3580,6 @@ var createVoiceTwilioRedirectHandoffAdapter = (options) => ({
|
|
|
3454
3580
|
}
|
|
3455
3581
|
});
|
|
3456
3582
|
|
|
3457
|
-
// src/core/turnDetection.ts
|
|
3458
|
-
var DEFAULT_SILENCE_MS = 700;
|
|
3459
|
-
var DEFAULT_SPEECH_THRESHOLD = 0.015;
|
|
3460
|
-
var toUint8Array = (audio) => {
|
|
3461
|
-
if (audio instanceof ArrayBuffer) {
|
|
3462
|
-
return new Uint8Array(audio);
|
|
3463
|
-
}
|
|
3464
|
-
return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
|
|
3465
|
-
};
|
|
3466
|
-
var measureAudioLevel = (audio) => {
|
|
3467
|
-
const bytes = toUint8Array(audio);
|
|
3468
|
-
if (bytes.byteLength < 2) {
|
|
3469
|
-
return 0;
|
|
3470
|
-
}
|
|
3471
|
-
const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
|
|
3472
|
-
if (samples.length === 0) {
|
|
3473
|
-
return 0;
|
|
3474
|
-
}
|
|
3475
|
-
let sumSquares = 0;
|
|
3476
|
-
for (const sample of samples) {
|
|
3477
|
-
const normalized = sample / 32768;
|
|
3478
|
-
sumSquares += normalized * normalized;
|
|
3479
|
-
}
|
|
3480
|
-
return Math.sqrt(sumSquares / samples.length);
|
|
3481
|
-
};
|
|
3482
|
-
var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
|
|
3483
|
-
var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
|
|
3484
|
-
var selectPreferredTranscriptText = (currentText, nextText) => {
|
|
3485
|
-
const current = normalizeText(currentText);
|
|
3486
|
-
const next = normalizeText(nextText);
|
|
3487
|
-
if (!current) {
|
|
3488
|
-
return next;
|
|
3489
|
-
}
|
|
3490
|
-
if (!next) {
|
|
3491
|
-
return current;
|
|
3492
|
-
}
|
|
3493
|
-
if (current === next || current.includes(next)) {
|
|
3494
|
-
return current;
|
|
3495
|
-
}
|
|
3496
|
-
if (next.includes(current)) {
|
|
3497
|
-
return next;
|
|
3498
|
-
}
|
|
3499
|
-
if (countWords(next) > countWords(current)) {
|
|
3500
|
-
return next;
|
|
3501
|
-
}
|
|
3502
|
-
if (countWords(next) === countWords(current) && next.length > current.length) {
|
|
3503
|
-
return next;
|
|
3504
|
-
}
|
|
3505
|
-
return current;
|
|
3506
|
-
};
|
|
3507
|
-
var mergeSequentialTranscriptText = (currentText, nextText) => {
|
|
3508
|
-
const current = normalizeText(currentText);
|
|
3509
|
-
const next = normalizeText(nextText);
|
|
3510
|
-
if (!current) {
|
|
3511
|
-
return next;
|
|
3512
|
-
}
|
|
3513
|
-
if (!next) {
|
|
3514
|
-
return current;
|
|
3515
|
-
}
|
|
3516
|
-
const currentWords = current.split(" ");
|
|
3517
|
-
const nextWords = next.split(" ");
|
|
3518
|
-
const maxOverlap = Math.min(currentWords.length, nextWords.length);
|
|
3519
|
-
for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
|
|
3520
|
-
const currentSuffix = currentWords.slice(-overlap).join(" ");
|
|
3521
|
-
const nextPrefix = nextWords.slice(0, overlap).join(" ");
|
|
3522
|
-
if (currentSuffix === nextPrefix) {
|
|
3523
|
-
return [...currentWords, ...nextWords.slice(overlap)].join(" ");
|
|
3524
|
-
}
|
|
3525
|
-
}
|
|
3526
|
-
return `${current} ${next}`.trim();
|
|
3527
|
-
};
|
|
3528
|
-
var countCommonPrefixWords = (currentText, nextText) => {
|
|
3529
|
-
const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
|
|
3530
|
-
const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
|
|
3531
|
-
const maxWords = Math.min(currentWords.length, nextWords.length);
|
|
3532
|
-
let count = 0;
|
|
3533
|
-
for (let index = 0;index < maxWords; index += 1) {
|
|
3534
|
-
if (currentWords[index] !== nextWords[index]) {
|
|
3535
|
-
break;
|
|
3536
|
-
}
|
|
3537
|
-
count += 1;
|
|
3538
|
-
}
|
|
3539
|
-
return count;
|
|
3540
|
-
};
|
|
3541
|
-
var mergeTranscriptTexts = (transcripts) => {
|
|
3542
|
-
const merged = [];
|
|
3543
|
-
for (const transcript of transcripts) {
|
|
3544
|
-
const nextText = normalizeText(transcript.text);
|
|
3545
|
-
if (!nextText) {
|
|
3546
|
-
continue;
|
|
3547
|
-
}
|
|
3548
|
-
const previous = merged.at(-1);
|
|
3549
|
-
if (!previous) {
|
|
3550
|
-
merged.push(nextText);
|
|
3551
|
-
continue;
|
|
3552
|
-
}
|
|
3553
|
-
if (nextText === previous || previous.includes(nextText)) {
|
|
3554
|
-
continue;
|
|
3555
|
-
}
|
|
3556
|
-
if (nextText.includes(previous)) {
|
|
3557
|
-
merged[merged.length - 1] = nextText;
|
|
3558
|
-
continue;
|
|
3559
|
-
}
|
|
3560
|
-
merged.push(nextText);
|
|
3561
|
-
}
|
|
3562
|
-
return merged.join(" ").trim();
|
|
3563
|
-
};
|
|
3564
|
-
var buildTurnText = (transcripts, partialText, options = {}) => {
|
|
3565
|
-
const finalText = mergeTranscriptTexts(transcripts);
|
|
3566
|
-
const nextPartial = normalizeText(partialText);
|
|
3567
|
-
const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
|
|
3568
|
-
if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
|
|
3569
|
-
return mergeSequentialTranscriptText(finalText, nextPartial);
|
|
3570
|
-
}
|
|
3571
|
-
return selectPreferredTranscriptText(finalText, nextPartial);
|
|
3572
|
-
};
|
|
3573
|
-
|
|
3574
3583
|
// src/core/types.ts
|
|
3575
3584
|
var ttsAdapterSessionCanCancel = (session) => typeof session.cancel === "function";
|
|
3576
3585
|
|
|
@@ -3907,8 +3916,11 @@ var createVoiceSession = (options) => {
|
|
|
3907
3916
|
const turnDetection = {
|
|
3908
3917
|
silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
|
|
3909
3918
|
speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
|
|
3910
|
-
transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
|
|
3919
|
+
transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS,
|
|
3920
|
+
semanticVetoMaxMs: options.turnDetection.semanticVetoMaxMs ?? 0,
|
|
3921
|
+
semanticVetoRecheckMs: options.turnDetection.semanticVetoRecheckMs ?? DEFAULT_SEMANTIC_VETO_RECHECK_MS
|
|
3911
3922
|
};
|
|
3923
|
+
let semanticVetoElapsedMs = 0;
|
|
3912
3924
|
const sttFallback = options.sttFallback ? {
|
|
3913
3925
|
adapter: options.sttFallback.adapter,
|
|
3914
3926
|
completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
|
|
@@ -4423,10 +4435,51 @@ var createVoiceSession = (options) => {
|
|
|
4423
4435
|
silenceTimer = setTimeout(() => {
|
|
4424
4436
|
silenceTimer = null;
|
|
4425
4437
|
pendingCommitReason = null;
|
|
4426
|
-
|
|
4438
|
+
runScheduledCommit(reason);
|
|
4427
4439
|
}, delayMs);
|
|
4428
4440
|
};
|
|
4429
4441
|
const scheduleSilenceCommit = (delayMs = turnDetection.silenceMs, reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
|
|
4442
|
+
const shouldDeferSilenceCommit = async (reason) => {
|
|
4443
|
+
if (reason !== "silence" || turnDetection.semanticVetoMaxMs <= 0 || !options.semanticTurnDetector || semanticVetoElapsedMs >= turnDetection.semanticVetoMaxMs) {
|
|
4444
|
+
return false;
|
|
4445
|
+
}
|
|
4446
|
+
const session = await readSession();
|
|
4447
|
+
const { partialText, transcripts } = session.currentTurn;
|
|
4448
|
+
const userText = buildTurnText(transcripts, partialText, {
|
|
4449
|
+
partialEndedAtMs: session.currentTurn.partialEndedAt,
|
|
4450
|
+
partialStartedAtMs: session.currentTurn.partialStartedAt
|
|
4451
|
+
});
|
|
4452
|
+
if (!userText) {
|
|
4453
|
+
return false;
|
|
4454
|
+
}
|
|
4455
|
+
const silenceMs = session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : turnDetection.silenceMs;
|
|
4456
|
+
let endOfTurn = true;
|
|
4457
|
+
try {
|
|
4458
|
+
const verdict = await Promise.resolve(options.semanticTurnDetector.evaluate({
|
|
4459
|
+
lastFinalTranscript: transcripts.at(-1),
|
|
4460
|
+
partialText,
|
|
4461
|
+
silenceMs,
|
|
4462
|
+
transcripts
|
|
4463
|
+
}));
|
|
4464
|
+
endOfTurn = verdict.endOfTurn;
|
|
4465
|
+
} catch {
|
|
4466
|
+
return false;
|
|
4467
|
+
}
|
|
4468
|
+
if (endOfTurn !== false) {
|
|
4469
|
+
return false;
|
|
4470
|
+
}
|
|
4471
|
+
const remaining = turnDetection.semanticVetoMaxMs - semanticVetoElapsedMs;
|
|
4472
|
+
const extendMs = Math.max(1, Math.min(turnDetection.semanticVetoRecheckMs, remaining));
|
|
4473
|
+
semanticVetoElapsedMs += extendMs;
|
|
4474
|
+
scheduleTurnCommit(extendMs, reason);
|
|
4475
|
+
return true;
|
|
4476
|
+
};
|
|
4477
|
+
const runScheduledCommit = async (reason) => {
|
|
4478
|
+
if (await shouldDeferSilenceCommit(reason)) {
|
|
4479
|
+
return;
|
|
4480
|
+
}
|
|
4481
|
+
await api.commitTurn(reason);
|
|
4482
|
+
};
|
|
4430
4483
|
const requestTurnCommit = async (reason) => {
|
|
4431
4484
|
const session = await readSession();
|
|
4432
4485
|
const text = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
|
|
@@ -5138,6 +5191,7 @@ var createVoiceSession = (options) => {
|
|
|
5138
5191
|
session2.lastActivityAt = Date.now();
|
|
5139
5192
|
session2.status = "active";
|
|
5140
5193
|
});
|
|
5194
|
+
semanticVetoElapsedMs = 0;
|
|
5141
5195
|
if (silenceTimer && pendingCommitReason === "vendor") {
|
|
5142
5196
|
scheduleTurnCommit(getVendorCommitDelayMs(), "vendor");
|
|
5143
5197
|
}
|
|
@@ -5841,6 +5895,7 @@ var createVoiceSession = (options) => {
|
|
|
5841
5895
|
};
|
|
5842
5896
|
const commitTurnInternal = async (reason = "manual") => {
|
|
5843
5897
|
clearSilenceTimer();
|
|
5898
|
+
semanticVetoElapsedMs = 0;
|
|
5844
5899
|
backchannelDriver?.reset();
|
|
5845
5900
|
amdLastTurnCommitAt = Date.now();
|
|
5846
5901
|
const session = await readSession();
|
|
@@ -42388,9 +42443,12 @@ var createVoiceConfiguration = (configuration) => configuration;
|
|
|
42388
42443
|
var DEFAULT_SPEECH_THRESHOLD2 = 0.015;
|
|
42389
42444
|
var DEFAULT_SILENCE_MS2 = 700;
|
|
42390
42445
|
var DEFAULT_TRANSCRIPT_STABILITY_MS2 = 200;
|
|
42446
|
+
var DEFAULT_SEMANTIC_VETO_RECHECK_MS2 = 1200;
|
|
42391
42447
|
var resolveTurnDetection = (input) => ({
|
|
42392
42448
|
profile: input?.profile ?? "balanced",
|
|
42393
42449
|
qualityProfile: input?.qualityProfile ?? "general",
|
|
42450
|
+
semanticVetoMaxMs: input?.semanticVetoMaxMs ?? 0,
|
|
42451
|
+
semanticVetoRecheckMs: input?.semanticVetoRecheckMs ?? DEFAULT_SEMANTIC_VETO_RECHECK_MS2,
|
|
42394
42452
|
silenceMs: input?.silenceMs ?? DEFAULT_SILENCE_MS2,
|
|
42395
42453
|
speechThreshold: input?.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD2,
|
|
42396
42454
|
transcriptStabilityMs: input?.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS2
|
package/dist/react/index.js
CHANGED
|
@@ -12243,22 +12243,146 @@ var resolveAudioConditioningConfig = (config) => {
|
|
|
12243
12243
|
};
|
|
12244
12244
|
};
|
|
12245
12245
|
|
|
12246
|
+
// src/core/turnDetection.ts
|
|
12247
|
+
var DEFAULT_SILENCE_MS = 700;
|
|
12248
|
+
var DEFAULT_SPEECH_THRESHOLD = 0.015;
|
|
12249
|
+
var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
|
|
12250
|
+
var toUint8Array = (audio) => {
|
|
12251
|
+
if (audio instanceof ArrayBuffer) {
|
|
12252
|
+
return new Uint8Array(audio);
|
|
12253
|
+
}
|
|
12254
|
+
return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
|
|
12255
|
+
};
|
|
12256
|
+
var measureAudioLevel = (audio) => {
|
|
12257
|
+
const bytes = toUint8Array(audio);
|
|
12258
|
+
if (bytes.byteLength < 2) {
|
|
12259
|
+
return 0;
|
|
12260
|
+
}
|
|
12261
|
+
const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
|
|
12262
|
+
if (samples.length === 0) {
|
|
12263
|
+
return 0;
|
|
12264
|
+
}
|
|
12265
|
+
let sumSquares = 0;
|
|
12266
|
+
for (const sample of samples) {
|
|
12267
|
+
const normalized = sample / 32768;
|
|
12268
|
+
sumSquares += normalized * normalized;
|
|
12269
|
+
}
|
|
12270
|
+
return Math.sqrt(sumSquares / samples.length);
|
|
12271
|
+
};
|
|
12272
|
+
var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
|
|
12273
|
+
var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
|
|
12274
|
+
var selectPreferredTranscriptText = (currentText, nextText) => {
|
|
12275
|
+
const current = normalizeText(currentText);
|
|
12276
|
+
const next = normalizeText(nextText);
|
|
12277
|
+
if (!current) {
|
|
12278
|
+
return next;
|
|
12279
|
+
}
|
|
12280
|
+
if (!next) {
|
|
12281
|
+
return current;
|
|
12282
|
+
}
|
|
12283
|
+
if (current === next || current.includes(next)) {
|
|
12284
|
+
return current;
|
|
12285
|
+
}
|
|
12286
|
+
if (next.includes(current)) {
|
|
12287
|
+
return next;
|
|
12288
|
+
}
|
|
12289
|
+
if (countWords(next) > countWords(current)) {
|
|
12290
|
+
return next;
|
|
12291
|
+
}
|
|
12292
|
+
if (countWords(next) === countWords(current) && next.length > current.length) {
|
|
12293
|
+
return next;
|
|
12294
|
+
}
|
|
12295
|
+
return current;
|
|
12296
|
+
};
|
|
12297
|
+
var mergeSequentialTranscriptText = (currentText, nextText) => {
|
|
12298
|
+
const current = normalizeText(currentText);
|
|
12299
|
+
const next = normalizeText(nextText);
|
|
12300
|
+
if (!current) {
|
|
12301
|
+
return next;
|
|
12302
|
+
}
|
|
12303
|
+
if (!next) {
|
|
12304
|
+
return current;
|
|
12305
|
+
}
|
|
12306
|
+
const currentWords = current.split(" ");
|
|
12307
|
+
const nextWords = next.split(" ");
|
|
12308
|
+
const maxOverlap = Math.min(currentWords.length, nextWords.length);
|
|
12309
|
+
for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
|
|
12310
|
+
const currentSuffix = currentWords.slice(-overlap).join(" ");
|
|
12311
|
+
const nextPrefix = nextWords.slice(0, overlap).join(" ");
|
|
12312
|
+
if (currentSuffix === nextPrefix) {
|
|
12313
|
+
return [...currentWords, ...nextWords.slice(overlap)].join(" ");
|
|
12314
|
+
}
|
|
12315
|
+
}
|
|
12316
|
+
return `${current} ${next}`.trim();
|
|
12317
|
+
};
|
|
12318
|
+
var countCommonPrefixWords = (currentText, nextText) => {
|
|
12319
|
+
const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
|
|
12320
|
+
const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
|
|
12321
|
+
const maxWords = Math.min(currentWords.length, nextWords.length);
|
|
12322
|
+
let count = 0;
|
|
12323
|
+
for (let index = 0;index < maxWords; index += 1) {
|
|
12324
|
+
if (currentWords[index] !== nextWords[index]) {
|
|
12325
|
+
break;
|
|
12326
|
+
}
|
|
12327
|
+
count += 1;
|
|
12328
|
+
}
|
|
12329
|
+
return count;
|
|
12330
|
+
};
|
|
12331
|
+
var mergeTranscriptTexts = (transcripts) => {
|
|
12332
|
+
const merged = [];
|
|
12333
|
+
for (const transcript of transcripts) {
|
|
12334
|
+
const nextText = normalizeText(transcript.text);
|
|
12335
|
+
if (!nextText) {
|
|
12336
|
+
continue;
|
|
12337
|
+
}
|
|
12338
|
+
const previous = merged.at(-1);
|
|
12339
|
+
if (!previous) {
|
|
12340
|
+
merged.push(nextText);
|
|
12341
|
+
continue;
|
|
12342
|
+
}
|
|
12343
|
+
if (nextText === previous || previous.includes(nextText)) {
|
|
12344
|
+
continue;
|
|
12345
|
+
}
|
|
12346
|
+
if (nextText.includes(previous)) {
|
|
12347
|
+
merged[merged.length - 1] = nextText;
|
|
12348
|
+
continue;
|
|
12349
|
+
}
|
|
12350
|
+
merged.push(nextText);
|
|
12351
|
+
}
|
|
12352
|
+
return merged.join(" ").trim();
|
|
12353
|
+
};
|
|
12354
|
+
var buildTurnText = (transcripts, partialText, options = {}) => {
|
|
12355
|
+
const finalText = mergeTranscriptTexts(transcripts);
|
|
12356
|
+
const nextPartial = normalizeText(partialText);
|
|
12357
|
+
const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
|
|
12358
|
+
if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
|
|
12359
|
+
return mergeSequentialTranscriptText(finalText, nextPartial);
|
|
12360
|
+
}
|
|
12361
|
+
return selectPreferredTranscriptText(finalText, nextPartial);
|
|
12362
|
+
};
|
|
12363
|
+
|
|
12246
12364
|
// src/core/turnProfiles.ts
|
|
12247
12365
|
var TURN_PROFILE_DEFAULTS = {
|
|
12248
12366
|
balanced: {
|
|
12249
12367
|
qualityProfile: "general",
|
|
12368
|
+
semanticVetoMaxMs: 0,
|
|
12369
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
12250
12370
|
silenceMs: 1400,
|
|
12251
12371
|
speechThreshold: 0.012,
|
|
12252
12372
|
transcriptStabilityMs: 1000
|
|
12253
12373
|
},
|
|
12254
12374
|
fast: {
|
|
12255
12375
|
qualityProfile: "general",
|
|
12376
|
+
semanticVetoMaxMs: 0,
|
|
12377
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
12256
12378
|
silenceMs: 700,
|
|
12257
12379
|
speechThreshold: 0.015,
|
|
12258
12380
|
transcriptStabilityMs: 450
|
|
12259
12381
|
},
|
|
12260
12382
|
"long-form": {
|
|
12261
12383
|
qualityProfile: "general",
|
|
12384
|
+
semanticVetoMaxMs: 0,
|
|
12385
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
12262
12386
|
silenceMs: 2200,
|
|
12263
12387
|
speechThreshold: 0.01,
|
|
12264
12388
|
transcriptStabilityMs: 1500
|
|
@@ -12292,6 +12416,8 @@ var resolveTurnDetectionConfig = (config) => {
|
|
|
12292
12416
|
return {
|
|
12293
12417
|
profile,
|
|
12294
12418
|
qualityProfile,
|
|
12419
|
+
semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
|
|
12420
|
+
semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
|
|
12295
12421
|
silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
|
|
12296
12422
|
speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
|
|
12297
12423
|
transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
|
package/dist/svelte/index.js
CHANGED
|
@@ -1380,22 +1380,146 @@ var resolveAudioConditioningConfig = (config) => {
|
|
|
1380
1380
|
};
|
|
1381
1381
|
};
|
|
1382
1382
|
|
|
1383
|
+
// src/core/turnDetection.ts
|
|
1384
|
+
var DEFAULT_SILENCE_MS = 700;
|
|
1385
|
+
var DEFAULT_SPEECH_THRESHOLD = 0.015;
|
|
1386
|
+
var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
|
|
1387
|
+
var toUint8Array = (audio) => {
|
|
1388
|
+
if (audio instanceof ArrayBuffer) {
|
|
1389
|
+
return new Uint8Array(audio);
|
|
1390
|
+
}
|
|
1391
|
+
return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
|
|
1392
|
+
};
|
|
1393
|
+
var measureAudioLevel = (audio) => {
|
|
1394
|
+
const bytes = toUint8Array(audio);
|
|
1395
|
+
if (bytes.byteLength < 2) {
|
|
1396
|
+
return 0;
|
|
1397
|
+
}
|
|
1398
|
+
const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
|
|
1399
|
+
if (samples.length === 0) {
|
|
1400
|
+
return 0;
|
|
1401
|
+
}
|
|
1402
|
+
let sumSquares = 0;
|
|
1403
|
+
for (const sample of samples) {
|
|
1404
|
+
const normalized = sample / 32768;
|
|
1405
|
+
sumSquares += normalized * normalized;
|
|
1406
|
+
}
|
|
1407
|
+
return Math.sqrt(sumSquares / samples.length);
|
|
1408
|
+
};
|
|
1409
|
+
var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
|
|
1410
|
+
var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
|
|
1411
|
+
var selectPreferredTranscriptText = (currentText, nextText) => {
|
|
1412
|
+
const current = normalizeText(currentText);
|
|
1413
|
+
const next = normalizeText(nextText);
|
|
1414
|
+
if (!current) {
|
|
1415
|
+
return next;
|
|
1416
|
+
}
|
|
1417
|
+
if (!next) {
|
|
1418
|
+
return current;
|
|
1419
|
+
}
|
|
1420
|
+
if (current === next || current.includes(next)) {
|
|
1421
|
+
return current;
|
|
1422
|
+
}
|
|
1423
|
+
if (next.includes(current)) {
|
|
1424
|
+
return next;
|
|
1425
|
+
}
|
|
1426
|
+
if (countWords(next) > countWords(current)) {
|
|
1427
|
+
return next;
|
|
1428
|
+
}
|
|
1429
|
+
if (countWords(next) === countWords(current) && next.length > current.length) {
|
|
1430
|
+
return next;
|
|
1431
|
+
}
|
|
1432
|
+
return current;
|
|
1433
|
+
};
|
|
1434
|
+
var mergeSequentialTranscriptText = (currentText, nextText) => {
|
|
1435
|
+
const current = normalizeText(currentText);
|
|
1436
|
+
const next = normalizeText(nextText);
|
|
1437
|
+
if (!current) {
|
|
1438
|
+
return next;
|
|
1439
|
+
}
|
|
1440
|
+
if (!next) {
|
|
1441
|
+
return current;
|
|
1442
|
+
}
|
|
1443
|
+
const currentWords = current.split(" ");
|
|
1444
|
+
const nextWords = next.split(" ");
|
|
1445
|
+
const maxOverlap = Math.min(currentWords.length, nextWords.length);
|
|
1446
|
+
for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
|
|
1447
|
+
const currentSuffix = currentWords.slice(-overlap).join(" ");
|
|
1448
|
+
const nextPrefix = nextWords.slice(0, overlap).join(" ");
|
|
1449
|
+
if (currentSuffix === nextPrefix) {
|
|
1450
|
+
return [...currentWords, ...nextWords.slice(overlap)].join(" ");
|
|
1451
|
+
}
|
|
1452
|
+
}
|
|
1453
|
+
return `${current} ${next}`.trim();
|
|
1454
|
+
};
|
|
1455
|
+
var countCommonPrefixWords = (currentText, nextText) => {
|
|
1456
|
+
const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
|
|
1457
|
+
const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
|
|
1458
|
+
const maxWords = Math.min(currentWords.length, nextWords.length);
|
|
1459
|
+
let count = 0;
|
|
1460
|
+
for (let index = 0;index < maxWords; index += 1) {
|
|
1461
|
+
if (currentWords[index] !== nextWords[index]) {
|
|
1462
|
+
break;
|
|
1463
|
+
}
|
|
1464
|
+
count += 1;
|
|
1465
|
+
}
|
|
1466
|
+
return count;
|
|
1467
|
+
};
|
|
1468
|
+
var mergeTranscriptTexts = (transcripts) => {
|
|
1469
|
+
const merged = [];
|
|
1470
|
+
for (const transcript of transcripts) {
|
|
1471
|
+
const nextText = normalizeText(transcript.text);
|
|
1472
|
+
if (!nextText) {
|
|
1473
|
+
continue;
|
|
1474
|
+
}
|
|
1475
|
+
const previous = merged.at(-1);
|
|
1476
|
+
if (!previous) {
|
|
1477
|
+
merged.push(nextText);
|
|
1478
|
+
continue;
|
|
1479
|
+
}
|
|
1480
|
+
if (nextText === previous || previous.includes(nextText)) {
|
|
1481
|
+
continue;
|
|
1482
|
+
}
|
|
1483
|
+
if (nextText.includes(previous)) {
|
|
1484
|
+
merged[merged.length - 1] = nextText;
|
|
1485
|
+
continue;
|
|
1486
|
+
}
|
|
1487
|
+
merged.push(nextText);
|
|
1488
|
+
}
|
|
1489
|
+
return merged.join(" ").trim();
|
|
1490
|
+
};
|
|
1491
|
+
var buildTurnText = (transcripts, partialText, options = {}) => {
|
|
1492
|
+
const finalText = mergeTranscriptTexts(transcripts);
|
|
1493
|
+
const nextPartial = normalizeText(partialText);
|
|
1494
|
+
const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
|
|
1495
|
+
if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
|
|
1496
|
+
return mergeSequentialTranscriptText(finalText, nextPartial);
|
|
1497
|
+
}
|
|
1498
|
+
return selectPreferredTranscriptText(finalText, nextPartial);
|
|
1499
|
+
};
|
|
1500
|
+
|
|
1383
1501
|
// src/core/turnProfiles.ts
|
|
1384
1502
|
var TURN_PROFILE_DEFAULTS = {
|
|
1385
1503
|
balanced: {
|
|
1386
1504
|
qualityProfile: "general",
|
|
1505
|
+
semanticVetoMaxMs: 0,
|
|
1506
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
1387
1507
|
silenceMs: 1400,
|
|
1388
1508
|
speechThreshold: 0.012,
|
|
1389
1509
|
transcriptStabilityMs: 1000
|
|
1390
1510
|
},
|
|
1391
1511
|
fast: {
|
|
1392
1512
|
qualityProfile: "general",
|
|
1513
|
+
semanticVetoMaxMs: 0,
|
|
1514
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
1393
1515
|
silenceMs: 700,
|
|
1394
1516
|
speechThreshold: 0.015,
|
|
1395
1517
|
transcriptStabilityMs: 450
|
|
1396
1518
|
},
|
|
1397
1519
|
"long-form": {
|
|
1398
1520
|
qualityProfile: "general",
|
|
1521
|
+
semanticVetoMaxMs: 0,
|
|
1522
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
1399
1523
|
silenceMs: 2200,
|
|
1400
1524
|
speechThreshold: 0.01,
|
|
1401
1525
|
transcriptStabilityMs: 1500
|
|
@@ -1429,6 +1553,8 @@ var resolveTurnDetectionConfig = (config) => {
|
|
|
1429
1553
|
return {
|
|
1430
1554
|
profile,
|
|
1431
1555
|
qualityProfile,
|
|
1556
|
+
semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
|
|
1557
|
+
semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
|
|
1432
1558
|
silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
|
|
1433
1559
|
speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
|
|
1434
1560
|
transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
|