@absolutejs/voice 0.0.22-beta.583 → 0.0.22-beta.585
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/angular/index.js +126 -0
- package/dist/client/htmxBootstrap.js +11 -0
- package/dist/client/index.js +126 -0
- package/dist/core/hardenedFetch.d.ts +3 -0
- package/dist/core/turnDetection.d.ts +1 -0
- package/dist/core/types.d.ts +4 -0
- package/dist/embed/index.js +11 -0
- package/dist/embed/voice-widget.js +8 -8
- package/dist/index.d.ts +1 -0
- package/dist/index.js +219 -122
- package/dist/react/index.js +126 -0
- package/dist/svelte/index.js +126 -0
- package/dist/testing/index.js +99 -5
- package/dist/vue/index.js +126 -0
- package/package.json +1 -1
package/dist/svelte/index.js
CHANGED
|
@@ -1380,22 +1380,146 @@ var resolveAudioConditioningConfig = (config) => {
|
|
|
1380
1380
|
};
|
|
1381
1381
|
};
|
|
1382
1382
|
|
|
1383
|
+
// src/core/turnDetection.ts
|
|
1384
|
+
// Module-level turn-detection defaults.

// Default silence duration, in milliseconds.
var DEFAULT_SILENCE_MS = 700;

// Default speech threshold (compare with the per-profile speechThreshold values).
var DEFAULT_SPEECH_THRESHOLD = 0.015;

// Default semanticVetoRecheckMs; every TURN_PROFILE_DEFAULTS preset uses this value.
var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
|
|
1387
|
+
/**
 * Exposes raw audio as a byte view without copying.
 *
 * @param {ArrayBuffer | ArrayBufferView} audio - An ArrayBuffer, or any view
 *   exposing `buffer`, `byteOffset` and `byteLength`.
 * @returns {Uint8Array} A byte view over the same memory.
 */
var toUint8Array = (audio) => {
  if (audio instanceof ArrayBuffer) {
    return new Uint8Array(audio);
  }
  return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
};

/**
 * Computes the root-mean-square level of audio interpreted as signed 16-bit
 * samples (host byte order, as read by Int16Array), each divided by 32768.
 *
 * @param {ArrayBuffer | ArrayBufferView} audio - Raw sample bytes. A trailing
 *   odd byte is ignored.
 * @returns {number} The RMS level, or 0 when there is not one whole sample.
 */
var measureAudioLevel = (audio) => {
  const bytes = toUint8Array(audio);
  const sampleCount = Math.floor(bytes.byteLength / 2);
  if (sampleCount === 0) {
    return 0;
  }
  // Int16Array requires a byteOffset that is a multiple of 2 and throws a
  // RangeError otherwise. Views sliced out of a larger buffer can start on an
  // odd byte, so copy those into a fresh (offset 0) buffer first.
  const aligned = bytes.byteOffset % 2 === 0 ? bytes : bytes.slice(0, sampleCount * 2);
  const samples = new Int16Array(aligned.buffer, aligned.byteOffset, sampleCount);
  let sumSquares = 0;
  for (const sample of samples) {
    const normalized = sample / 32768;
    sumSquares += normalized * normalized;
  }
  return Math.sqrt(sumSquares / sampleCount);
};
|
|
1409
|
+
// Trims the value and collapses each run of whitespace into a single space.
var normalizeText = (value) => value.trim().split(/\s+/).join(" ");
|
|
1410
|
+
// Number of single-space-separated pieces in the value; an empty string has none.
var countWords = (value) => (value.length === 0 ? 0 : value.split(" ").length);
|
|
1411
|
+
/**
 * Picks one of two transcript texts after normalizing both.
 *
 * Order of preference: the only non-empty text; the text that contains the
 * other; the text with more words; on a word-count tie, the longer text;
 * otherwise the current text.
 *
 * @param {string} currentText - Text held so far.
 * @param {string} nextText - Newly received text.
 * @returns {string} The normalized text that was selected.
 */
var selectPreferredTranscriptText = (currentText, nextText) => {
  const existing = normalizeText(currentText);
  const candidate = normalizeText(nextText);
  if (!existing || !candidate) {
    return existing || candidate;
  }
  // `includes` also covers the case where both texts are equal.
  if (existing.includes(candidate)) {
    return existing;
  }
  if (candidate.includes(existing)) {
    return candidate;
  }
  const wordDelta = countWords(candidate) - countWords(existing);
  if (wordDelta > 0) {
    return candidate;
  }
  const candidateWinsTie = wordDelta === 0 && candidate.length > existing.length;
  return candidateWinsTie ? candidate : existing;
};
|
|
1434
|
+
/**
 * Appends `nextText` to `currentText`, dropping the longest run of words that
 * both ends the current text and starts the next one.
 *
 * @param {string} currentText - Leading text.
 * @param {string} nextText - Text that follows it.
 * @returns {string} The normalized, joined text.
 */
var mergeSequentialTranscriptText = (currentText, nextText) => {
  const head = normalizeText(currentText);
  const tail = normalizeText(nextText);
  if (!head || !tail) {
    return head || tail;
  }
  const headWords = head.split(" ");
  const tailWords = tail.split(" ");
  // Try the largest possible overlap first and shrink until one matches.
  let size = Math.min(headWords.length, tailWords.length);
  while (size > 0) {
    const start = headWords.length - size;
    const overlaps = tailWords
      .slice(0, size)
      .every((word, offset) => word === headWords[start + offset]);
    if (overlaps) {
      return headWords.concat(tailWords.slice(size)).join(" ");
    }
    size -= 1;
  }
  return `${head} ${tail}`.trim();
};
|
|
1455
|
+
/**
 * Counts how many leading words two texts share after normalization.
 *
 * @param {string} currentText - First text.
 * @param {string} nextText - Second text.
 * @returns {number} Number of identical words at the start of both texts.
 */
var countCommonPrefixWords = (currentText, nextText) => {
  const toWords = (text) => normalizeText(text).split(" ").filter(Boolean);
  const left = toWords(currentText);
  const right = toWords(nextText);
  const limit = Math.min(left.length, right.length);
  let shared = 0;
  while (shared < limit && left[shared] === right[shared]) {
    shared += 1;
  }
  return shared;
};
|
|
1468
|
+
/**
 * Joins the `text` of each transcript into one string, comparing every entry
 * with the most recently kept segment only.
 *
 * An entry is dropped when it is empty or already contained in that segment.
 * It replaces the segment when it contains it. Otherwise it is appended.
 *
 * @param {Iterable<{ text: string }>} transcripts - Transcripts in order.
 * @returns {string} The kept segments joined by single spaces.
 */
var mergeTranscriptTexts = (transcripts) => {
  const segments = [];
  for (const { text } of transcripts) {
    const candidate = normalizeText(text);
    if (!candidate) {
      continue;
    }
    const lastIndex = segments.length - 1;
    if (lastIndex < 0) {
      segments.push(candidate);
      continue;
    }
    const last = segments[lastIndex];
    // `includes` also covers an exact repeat of the previous segment.
    if (last.includes(candidate)) {
      continue;
    }
    if (candidate.includes(last)) {
      segments[lastIndex] = candidate;
    } else {
      segments.push(candidate);
    }
  }
  return segments.join(" ").trim();
};
|
|
1491
|
+
/**
 * Builds the text of a turn from its transcripts and the current partial text.
 *
 * The partial is appended to the merged transcript text when all of these hold:
 * both texts are non-empty, the last transcript with a numeric `endedAtMs` ended
 * at least 250 ms before `options.partialStartedAtMs`, and the two texts share no
 * leading words. Otherwise one of the two texts is selected.
 *
 * @param {Array<{ text: string, endedAtMs?: number }>} transcripts - Turn transcripts.
 * @param {string} partialText - Current partial text.
 * @param {{ partialStartedAtMs?: number }} [options] - Timing of the partial.
 * @returns {string} The resulting turn text.
 */
var buildTurnText = (transcripts, partialText, options = {}) => {
  const committed = mergeTranscriptTexts(transcripts);
  const pending = normalizeText(partialText);
  const timed = [...transcripts].filter((transcript) => typeof transcript.endedAtMs === "number");
  const lastEndedAtMs = timed.at(-1)?.endedAtMs;
  const partialFollowsFinal = () => {
    if (!committed || !pending) {
      return false;
    }
    if (typeof lastEndedAtMs !== "number") {
      return false;
    }
    const partialStartedAtMs = options.partialStartedAtMs;
    if (typeof partialStartedAtMs !== "number") {
      return false;
    }
    // Negated `>=` so a NaN gap is rejected.
    if (!(partialStartedAtMs - lastEndedAtMs >= 250)) {
      return false;
    }
    return countCommonPrefixWords(committed, pending) === 0;
  };
  return partialFollowsFinal()
    ? mergeSequentialTranscriptText(committed, pending)
    : selectPreferredTranscriptText(committed, pending);
};
|
|
1500
|
+
|
|
1383
1501
|
// src/core/turnProfiles.ts
|
|
1384
1502
|
var TURN_PROFILE_DEFAULTS = {
|
|
1385
1503
|
balanced: {
|
|
1386
1504
|
qualityProfile: "general",
|
|
1505
|
+
semanticVetoMaxMs: 0,
|
|
1506
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
1387
1507
|
silenceMs: 1400,
|
|
1388
1508
|
speechThreshold: 0.012,
|
|
1389
1509
|
transcriptStabilityMs: 1000
|
|
1390
1510
|
},
|
|
1391
1511
|
fast: {
|
|
1392
1512
|
qualityProfile: "general",
|
|
1513
|
+
semanticVetoMaxMs: 0,
|
|
1514
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
1393
1515
|
silenceMs: 700,
|
|
1394
1516
|
speechThreshold: 0.015,
|
|
1395
1517
|
transcriptStabilityMs: 450
|
|
1396
1518
|
},
|
|
1397
1519
|
"long-form": {
|
|
1398
1520
|
qualityProfile: "general",
|
|
1521
|
+
semanticVetoMaxMs: 0,
|
|
1522
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
1399
1523
|
silenceMs: 2200,
|
|
1400
1524
|
speechThreshold: 0.01,
|
|
1401
1525
|
transcriptStabilityMs: 1500
|
|
@@ -1429,6 +1553,8 @@ var resolveTurnDetectionConfig = (config) => {
|
|
|
1429
1553
|
return {
|
|
1430
1554
|
profile,
|
|
1431
1555
|
qualityProfile,
|
|
1556
|
+
semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
|
|
1557
|
+
semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
|
|
1432
1558
|
silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
|
|
1433
1559
|
speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
|
|
1434
1560
|
transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
|
package/dist/testing/index.js
CHANGED
|
@@ -86,6 +86,7 @@ var __require = import.meta.require;
|
|
|
86
86
|
// src/core/turnDetection.ts
|
|
87
87
|
// Module-level turn-detection defaults.

// Fallback for turnDetection.silenceMs in createVoiceSession (milliseconds).
var DEFAULT_SILENCE_MS = 700;

// Fallback for turnDetection.speechThreshold in createVoiceSession.
var DEFAULT_SPEECH_THRESHOLD = 0.015;

// Fallback for turnDetection.semanticVetoRecheckMs (milliseconds).
var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
|
|
89
90
|
var toUint8Array = (audio) => {
|
|
90
91
|
if (audio instanceof ArrayBuffer) {
|
|
91
92
|
return new Uint8Array(audio);
|
|
@@ -3133,18 +3134,24 @@ var resolveAudioConditioningConfig = (config) => {
|
|
|
3133
3134
|
var TURN_PROFILE_DEFAULTS = {
|
|
3134
3135
|
balanced: {
|
|
3135
3136
|
qualityProfile: "general",
|
|
3137
|
+
semanticVetoMaxMs: 0,
|
|
3138
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
3136
3139
|
silenceMs: 1400,
|
|
3137
3140
|
speechThreshold: 0.012,
|
|
3138
3141
|
transcriptStabilityMs: 1000
|
|
3139
3142
|
},
|
|
3140
3143
|
fast: {
|
|
3141
3144
|
qualityProfile: "general",
|
|
3145
|
+
semanticVetoMaxMs: 0,
|
|
3146
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
3142
3147
|
silenceMs: 700,
|
|
3143
3148
|
speechThreshold: 0.015,
|
|
3144
3149
|
transcriptStabilityMs: 450
|
|
3145
3150
|
},
|
|
3146
3151
|
"long-form": {
|
|
3147
3152
|
qualityProfile: "general",
|
|
3153
|
+
semanticVetoMaxMs: 0,
|
|
3154
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
3148
3155
|
silenceMs: 2200,
|
|
3149
3156
|
speechThreshold: 0.01,
|
|
3150
3157
|
transcriptStabilityMs: 1500
|
|
@@ -3178,6 +3185,8 @@ var resolveTurnDetectionConfig = (config) => {
|
|
|
3178
3185
|
return {
|
|
3179
3186
|
profile,
|
|
3180
3187
|
qualityProfile,
|
|
3188
|
+
semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
|
|
3189
|
+
semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
|
|
3181
3190
|
silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
|
|
3182
3191
|
speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
|
|
3183
3192
|
transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
|
|
@@ -4210,6 +4219,45 @@ var startVoiceTimer = (sessionId) => {
|
|
|
4210
4219
|
};
|
|
4211
4220
|
var voiceTimingEnabled = () => timingEnabled();
|
|
4212
4221
|
|
|
4222
|
+
// src/core/hardenedFetch.ts
|
|
4223
|
+
// Longest time, in milliseconds, one attempt may wait for `baseFetch` to settle.
var ATTEMPT_TIMEOUT_MS = 6000;

// True when a `Bun` global exists.
var isBun = "Bun" in globalThis;

/**
 * Performs a single fetch attempt with its own AbortController.
 *
 * The attempt is aborted when the caller's `init.signal` aborts, or when
 * `baseFetch` has not settled within ATTEMPT_TIMEOUT_MS. Under Bun a
 * `Connection: close` header is added to the request.
 *
 * @param {typeof fetch} baseFetch - Underlying fetch implementation.
 * @param {RequestInfo | URL} input - Request target.
 * @param {RequestInit} [init] - Request options.
 * @returns {Promise<Response>} Whatever `baseFetch` resolves to.
 */
var oneAttempt = async (baseFetch, input, init) => {
  const controller = new AbortController();
  const callerSignal = init?.signal ?? undefined;
  const onCallerAbort = () => controller.abort(callerSignal?.reason);
  if (callerSignal?.aborted) {
    controller.abort(callerSignal.reason);
  } else {
    callerSignal?.addEventListener("abort", onCallerAbort, { once: true });
  }
  const timer = setTimeout(() => {
    controller.abort(new Error(`fetch exceeded ${ATTEMPT_TIMEOUT_MS}ms before response headers (stale Bun keep-alive socket?)`));
  }, ATTEMPT_TIMEOUT_MS);
  // Passing `headers` in init replaces the headers of a Request input, so when
  // the caller gave no init.headers, start from the Request's own headers.
  const inputIsRequest = typeof Request !== "undefined" && input instanceof Request;
  const headers = new Headers(init?.headers ?? (inputIsRequest ? input.headers : undefined));
  if (isBun) {
    headers.set("Connection", "close");
  }
  try {
    return await baseFetch(input, {
      ...init,
      headers,
      signal: controller.signal
    });
  } catch (error) {
    // A failed attempt is finished, so stop forwarding caller aborts to it.
    // On success the `{ once: true }` listener stays attached so a later
    // caller abort still reaches the signal the response was fetched with.
    callerSignal?.removeEventListener("abort", onCallerAbort);
    throw error;
  } finally {
    clearTimeout(timer);
  }
};

/**
 * Wraps a fetch implementation so each call gets a per-attempt timeout and one
 * retry.
 *
 * If the first attempt rejects and the caller's signal is not aborted, a
 * warning is logged and the request is attempted once more. A `preconnect`
 * function on `baseFetch` is re-exposed on the wrapper, bound to `baseFetch`.
 * Implementations without one simply get no `preconnect` property.
 *
 * @param {typeof fetch} [baseFetch=globalThis.fetch] - Fetch to wrap.
 * @returns {typeof fetch} The hardened fetch function.
 */
var hardenFetch = (baseFetch = globalThis.fetch) => {
  const hardened = async (input, init) => {
    try {
      return await oneAttempt(baseFetch, input, init);
    } catch (error) {
      if (init?.signal?.aborted) {
        throw error;
      }
      console.warn(`[voice] hardened fetch retrying on a fresh connection: ${error instanceof Error ? error.message : String(error)}`);
      return oneAttempt(baseFetch, input, init);
    }
  };
  // Only some fetch implementations expose `preconnect`; binding a missing
  // one would throw a TypeError.
  const extras = typeof baseFetch.preconnect === "function"
    ? { preconnect: baseFetch.preconnect.bind(baseFetch) }
    : {};
  return Object.assign(hardened, extras);
};
|
|
4260
|
+
|
|
4213
4261
|
// src/core/modelAdapters.ts
|
|
4214
4262
|
var isVoiceProviderRoutingPolicyPreset = (value) => value === "balanced" || value === "cost-cap" || value === "cost-first" || value === "latency-first" || value === "quality-first";
|
|
4215
4263
|
var resolveVoiceProviderRoutingPolicyPreset = (preset, options = {}) => {
|
|
@@ -4914,7 +4962,7 @@ var consumeOpenAIResponsesStream = async (response, onTextDelta, abortOptions) =
|
|
|
4914
4962
|
return { assistantText, toolCalls: finalizeToolCalls(calls), usage };
|
|
4915
4963
|
};
|
|
4916
4964
|
var createOpenAIVoiceAssistantModel = (options) => {
|
|
4917
|
-
const fetchImpl = options.fetch
|
|
4965
|
+
const fetchImpl = hardenFetch(options.fetch);
|
|
4918
4966
|
const baseUrl = options.baseUrl ?? "https://api.openai.com/v1";
|
|
4919
4967
|
const model = options.model ?? "gpt-4.1-mini";
|
|
4920
4968
|
const timeoutMs = options.timeoutMs ?? 60000;
|
|
@@ -5039,7 +5087,7 @@ var consumeAnthropicStream = async (response, onTextDelta) => {
|
|
|
5039
5087
|
return { assistantText, toolCalls: finalizeToolCalls(calls), usage };
|
|
5040
5088
|
};
|
|
5041
5089
|
var createAnthropicVoiceAssistantModel = (options) => {
|
|
5042
|
-
const fetchImpl = options.fetch
|
|
5090
|
+
const fetchImpl = hardenFetch(options.fetch);
|
|
5043
5091
|
const baseUrl = options.baseUrl ?? "https://api.anthropic.com/v1";
|
|
5044
5092
|
const model = options.model ?? "claude-sonnet-4-5";
|
|
5045
5093
|
return {
|
|
@@ -5125,7 +5173,7 @@ var consumeGeminiStream = async (response, onTextDelta) => {
|
|
|
5125
5173
|
return { assistantText, toolCalls, usage };
|
|
5126
5174
|
};
|
|
5127
5175
|
var createGeminiVoiceAssistantModel = (options) => {
|
|
5128
|
-
const fetchImpl = options.fetch
|
|
5176
|
+
const fetchImpl = hardenFetch(options.fetch);
|
|
5129
5177
|
const baseUrl = options.baseUrl ?? "https://generativelanguage.googleapis.com/v1beta";
|
|
5130
5178
|
const model = options.model ?? "gemini-2.5-flash";
|
|
5131
5179
|
const maxRetries = Math.max(0, options.maxRetries ?? 2);
|
|
@@ -6066,8 +6114,11 @@ var createVoiceSession = (options) => {
|
|
|
6066
6114
|
const turnDetection = {
|
|
6067
6115
|
silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
|
|
6068
6116
|
speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
|
|
6069
|
-
transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
|
|
6117
|
+
transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS,
|
|
6118
|
+
semanticVetoMaxMs: options.turnDetection.semanticVetoMaxMs ?? 0,
|
|
6119
|
+
semanticVetoRecheckMs: options.turnDetection.semanticVetoRecheckMs ?? DEFAULT_SEMANTIC_VETO_RECHECK_MS
|
|
6070
6120
|
};
|
|
6121
|
+
let semanticVetoElapsedMs = 0;
|
|
6071
6122
|
const sttFallback = options.sttFallback ? {
|
|
6072
6123
|
adapter: options.sttFallback.adapter,
|
|
6073
6124
|
completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
|
|
@@ -6582,10 +6633,51 @@ var createVoiceSession = (options) => {
|
|
|
6582
6633
|
silenceTimer = setTimeout(() => {
|
|
6583
6634
|
silenceTimer = null;
|
|
6584
6635
|
pendingCommitReason = null;
|
|
6585
|
-
|
|
6636
|
+
runScheduledCommit(reason);
|
|
6586
6637
|
}, delayMs);
|
|
6587
6638
|
};
|
|
6588
6639
|
const scheduleSilenceCommit = (delayMs = turnDetection.silenceMs, reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
|
|
6640
|
+
// Decides whether a scheduled commit should be postponed.
//
// Only "silence" commits are considered, and only while a semantic turn
// detector is configured and semanticVetoElapsedMs is below
// turnDetection.semanticVetoMaxMs. When the detector reports
// `endOfTurn === false` for a non-empty turn, the commit is rescheduled and the
// function resolves to true. In every other case, including a detector that
// throws, it resolves to false so the caller commits.
const shouldDeferSilenceCommit = async (reason) => {
  const vetoUnavailable = reason !== "silence" || turnDetection.semanticVetoMaxMs <= 0 || !options.semanticTurnDetector;
  if (vetoUnavailable || semanticVetoElapsedMs >= turnDetection.semanticVetoMaxMs) {
    return false;
  }
  const session = await readSession();
  const turn = session.currentTurn;
  const { partialText, transcripts } = turn;
  const turnText = buildTurnText(transcripts, partialText, {
    partialEndedAtMs: turn.partialEndedAt,
    partialStartedAtMs: turn.partialStartedAt
  });
  if (!turnText) {
    return false;
  }
  const silenceMs = turn.silenceStartedAt === undefined
    ? turnDetection.silenceMs
    : Date.now() - turn.silenceStartedAt;
  let detectorWantsMore;
  try {
    const verdict = await Promise.resolve(options.semanticTurnDetector.evaluate({
      lastFinalTranscript: transcripts.at(-1),
      partialText,
      silenceMs,
      transcripts
    }));
    // Read inside the try so a missing verdict is treated like a detector failure.
    detectorWantsMore = verdict.endOfTurn === false;
  } catch {
    return false;
  }
  if (!detectorWantsMore) {
    return false;
  }
  // Extend by the recheck interval, capped by the remaining veto budget and
  // never shorter than 1 ms.
  const budgetLeftMs = turnDetection.semanticVetoMaxMs - semanticVetoElapsedMs;
  const extendMs = Math.max(1, Math.min(turnDetection.semanticVetoRecheckMs, budgetLeftMs));
  semanticVetoElapsedMs += extendMs;
  scheduleTurnCommit(extendMs, reason);
  return true;
};
|
|
6675
|
+
// Runs a commit whose timer has fired, unless shouldDeferSilenceCommit postponed it.
const runScheduledCommit = async (reason) => {
  const deferred = await shouldDeferSilenceCommit(reason);
  if (!deferred) {
    await api.commitTurn(reason);
  }
};
|
|
6589
6681
|
const requestTurnCommit = async (reason) => {
|
|
6590
6682
|
const session = await readSession();
|
|
6591
6683
|
const text = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
|
|
@@ -7297,6 +7389,7 @@ var createVoiceSession = (options) => {
|
|
|
7297
7389
|
session2.lastActivityAt = Date.now();
|
|
7298
7390
|
session2.status = "active";
|
|
7299
7391
|
});
|
|
7392
|
+
semanticVetoElapsedMs = 0;
|
|
7300
7393
|
if (silenceTimer && pendingCommitReason === "vendor") {
|
|
7301
7394
|
scheduleTurnCommit(getVendorCommitDelayMs(), "vendor");
|
|
7302
7395
|
}
|
|
@@ -8000,6 +8093,7 @@ var createVoiceSession = (options) => {
|
|
|
8000
8093
|
};
|
|
8001
8094
|
const commitTurnInternal = async (reason = "manual") => {
|
|
8002
8095
|
clearSilenceTimer();
|
|
8096
|
+
semanticVetoElapsedMs = 0;
|
|
8003
8097
|
backchannelDriver?.reset();
|
|
8004
8098
|
amdLastTurnCommitAt = Date.now();
|
|
8005
8099
|
const session = await readSession();
|
package/dist/vue/index.js
CHANGED
|
@@ -11660,22 +11660,146 @@ var resolveAudioConditioningConfig = (config) => {
|
|
|
11660
11660
|
};
|
|
11661
11661
|
};
|
|
11662
11662
|
|
|
11663
|
+
// src/core/turnDetection.ts
|
|
11664
|
+
// Module-level turn-detection defaults.

// Default silence duration, in milliseconds.
var DEFAULT_SILENCE_MS = 700;

// Default speech threshold (compare with the per-profile speechThreshold values).
var DEFAULT_SPEECH_THRESHOLD = 0.015;

// Default semanticVetoRecheckMs; every TURN_PROFILE_DEFAULTS preset uses this value.
var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
|
|
11667
|
+
/**
 * Exposes raw audio as a byte view without copying.
 *
 * @param {ArrayBuffer | ArrayBufferView} audio - An ArrayBuffer, or any view
 *   exposing `buffer`, `byteOffset` and `byteLength`.
 * @returns {Uint8Array} A byte view over the same memory.
 */
var toUint8Array = (audio) => {
  if (audio instanceof ArrayBuffer) {
    return new Uint8Array(audio);
  }
  return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
};

/**
 * Computes the root-mean-square level of audio interpreted as signed 16-bit
 * samples (host byte order, as read by Int16Array), each divided by 32768.
 *
 * @param {ArrayBuffer | ArrayBufferView} audio - Raw sample bytes. A trailing
 *   odd byte is ignored.
 * @returns {number} The RMS level, or 0 when there is not one whole sample.
 */
var measureAudioLevel = (audio) => {
  const bytes = toUint8Array(audio);
  const sampleCount = Math.floor(bytes.byteLength / 2);
  if (sampleCount === 0) {
    return 0;
  }
  // Int16Array requires a byteOffset that is a multiple of 2 and throws a
  // RangeError otherwise. Views sliced out of a larger buffer can start on an
  // odd byte, so copy those into a fresh (offset 0) buffer first.
  const aligned = bytes.byteOffset % 2 === 0 ? bytes : bytes.slice(0, sampleCount * 2);
  const samples = new Int16Array(aligned.buffer, aligned.byteOffset, sampleCount);
  let sumSquares = 0;
  for (const sample of samples) {
    const normalized = sample / 32768;
    sumSquares += normalized * normalized;
  }
  return Math.sqrt(sumSquares / sampleCount);
};
|
|
11689
|
+
// Trims the value and collapses each run of whitespace into a single space.
var normalizeText = (value) => value.trim().split(/\s+/).join(" ");
|
|
11690
|
+
// Number of single-space-separated pieces in the value; an empty string has none.
var countWords = (value) => (value.length === 0 ? 0 : value.split(" ").length);
|
|
11691
|
+
/**
 * Picks one of two transcript texts after normalizing both.
 *
 * Order of preference: the only non-empty text; the text that contains the
 * other; the text with more words; on a word-count tie, the longer text;
 * otherwise the current text.
 *
 * @param {string} currentText - Text held so far.
 * @param {string} nextText - Newly received text.
 * @returns {string} The normalized text that was selected.
 */
var selectPreferredTranscriptText = (currentText, nextText) => {
  const existing = normalizeText(currentText);
  const candidate = normalizeText(nextText);
  if (!existing || !candidate) {
    return existing || candidate;
  }
  // `includes` also covers the case where both texts are equal.
  if (existing.includes(candidate)) {
    return existing;
  }
  if (candidate.includes(existing)) {
    return candidate;
  }
  const wordDelta = countWords(candidate) - countWords(existing);
  if (wordDelta > 0) {
    return candidate;
  }
  const candidateWinsTie = wordDelta === 0 && candidate.length > existing.length;
  return candidateWinsTie ? candidate : existing;
};
|
|
11714
|
+
/**
 * Appends `nextText` to `currentText`, dropping the longest run of words that
 * both ends the current text and starts the next one.
 *
 * @param {string} currentText - Leading text.
 * @param {string} nextText - Text that follows it.
 * @returns {string} The normalized, joined text.
 */
var mergeSequentialTranscriptText = (currentText, nextText) => {
  const head = normalizeText(currentText);
  const tail = normalizeText(nextText);
  if (!head || !tail) {
    return head || tail;
  }
  const headWords = head.split(" ");
  const tailWords = tail.split(" ");
  // Try the largest possible overlap first and shrink until one matches.
  let size = Math.min(headWords.length, tailWords.length);
  while (size > 0) {
    const start = headWords.length - size;
    const overlaps = tailWords
      .slice(0, size)
      .every((word, offset) => word === headWords[start + offset]);
    if (overlaps) {
      return headWords.concat(tailWords.slice(size)).join(" ");
    }
    size -= 1;
  }
  return `${head} ${tail}`.trim();
};
|
|
11735
|
+
/**
 * Counts how many leading words two texts share after normalization.
 *
 * @param {string} currentText - First text.
 * @param {string} nextText - Second text.
 * @returns {number} Number of identical words at the start of both texts.
 */
var countCommonPrefixWords = (currentText, nextText) => {
  const toWords = (text) => normalizeText(text).split(" ").filter(Boolean);
  const left = toWords(currentText);
  const right = toWords(nextText);
  const limit = Math.min(left.length, right.length);
  let shared = 0;
  while (shared < limit && left[shared] === right[shared]) {
    shared += 1;
  }
  return shared;
};
|
|
11748
|
+
/**
 * Joins the `text` of each transcript into one string, comparing every entry
 * with the most recently kept segment only.
 *
 * An entry is dropped when it is empty or already contained in that segment.
 * It replaces the segment when it contains it. Otherwise it is appended.
 *
 * @param {Iterable<{ text: string }>} transcripts - Transcripts in order.
 * @returns {string} The kept segments joined by single spaces.
 */
var mergeTranscriptTexts = (transcripts) => {
  const segments = [];
  for (const { text } of transcripts) {
    const candidate = normalizeText(text);
    if (!candidate) {
      continue;
    }
    const lastIndex = segments.length - 1;
    if (lastIndex < 0) {
      segments.push(candidate);
      continue;
    }
    const last = segments[lastIndex];
    // `includes` also covers an exact repeat of the previous segment.
    if (last.includes(candidate)) {
      continue;
    }
    if (candidate.includes(last)) {
      segments[lastIndex] = candidate;
    } else {
      segments.push(candidate);
    }
  }
  return segments.join(" ").trim();
};
|
|
11771
|
+
/**
 * Builds the text of a turn from its transcripts and the current partial text.
 *
 * The partial is appended to the merged transcript text when all of these hold:
 * both texts are non-empty, the last transcript with a numeric `endedAtMs` ended
 * at least 250 ms before `options.partialStartedAtMs`, and the two texts share no
 * leading words. Otherwise one of the two texts is selected.
 *
 * @param {Array<{ text: string, endedAtMs?: number }>} transcripts - Turn transcripts.
 * @param {string} partialText - Current partial text.
 * @param {{ partialStartedAtMs?: number }} [options] - Timing of the partial.
 * @returns {string} The resulting turn text.
 */
var buildTurnText = (transcripts, partialText, options = {}) => {
  const committed = mergeTranscriptTexts(transcripts);
  const pending = normalizeText(partialText);
  const timed = [...transcripts].filter((transcript) => typeof transcript.endedAtMs === "number");
  const lastEndedAtMs = timed.at(-1)?.endedAtMs;
  const partialFollowsFinal = () => {
    if (!committed || !pending) {
      return false;
    }
    if (typeof lastEndedAtMs !== "number") {
      return false;
    }
    const partialStartedAtMs = options.partialStartedAtMs;
    if (typeof partialStartedAtMs !== "number") {
      return false;
    }
    // Negated `>=` so a NaN gap is rejected.
    if (!(partialStartedAtMs - lastEndedAtMs >= 250)) {
      return false;
    }
    return countCommonPrefixWords(committed, pending) === 0;
  };
  return partialFollowsFinal()
    ? mergeSequentialTranscriptText(committed, pending)
    : selectPreferredTranscriptText(committed, pending);
};
|
|
11780
|
+
|
|
11663
11781
|
// src/core/turnProfiles.ts
|
|
11664
11782
|
var TURN_PROFILE_DEFAULTS = {
|
|
11665
11783
|
balanced: {
|
|
11666
11784
|
qualityProfile: "general",
|
|
11785
|
+
semanticVetoMaxMs: 0,
|
|
11786
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
11667
11787
|
silenceMs: 1400,
|
|
11668
11788
|
speechThreshold: 0.012,
|
|
11669
11789
|
transcriptStabilityMs: 1000
|
|
11670
11790
|
},
|
|
11671
11791
|
fast: {
|
|
11672
11792
|
qualityProfile: "general",
|
|
11793
|
+
semanticVetoMaxMs: 0,
|
|
11794
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
11673
11795
|
silenceMs: 700,
|
|
11674
11796
|
speechThreshold: 0.015,
|
|
11675
11797
|
transcriptStabilityMs: 450
|
|
11676
11798
|
},
|
|
11677
11799
|
"long-form": {
|
|
11678
11800
|
qualityProfile: "general",
|
|
11801
|
+
semanticVetoMaxMs: 0,
|
|
11802
|
+
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
11679
11803
|
silenceMs: 2200,
|
|
11680
11804
|
speechThreshold: 0.01,
|
|
11681
11805
|
transcriptStabilityMs: 1500
|
|
@@ -11709,6 +11833,8 @@ var resolveTurnDetectionConfig = (config) => {
|
|
|
11709
11833
|
return {
|
|
11710
11834
|
profile,
|
|
11711
11835
|
qualityProfile,
|
|
11836
|
+
semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
|
|
11837
|
+
semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
|
|
11712
11838
|
silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
|
|
11713
11839
|
speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
|
|
11714
11840
|
transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
|