@absolutejs/voice 0.0.22-beta.597 → 0.0.22-beta.599
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/angular/index.js +6 -127
- package/dist/client/htmxBootstrap.js +6 -12
- package/dist/client/index.js +6 -127
- package/dist/core/semanticTurn.d.ts +11 -1
- package/dist/core/turnDetection.d.ts +1 -1
- package/dist/core/types.d.ts +2 -4
- package/dist/embed/index.js +6 -12
- package/dist/embed/voice-widget.js +8 -8
- package/dist/index.js +171 -184
- package/dist/react/index.js +6 -127
- package/dist/svelte/index.js +6 -127
- package/dist/testing/index.js +42 -57
- package/dist/vue/index.js +6 -127
- package/package.json +1 -1
package/dist/react/index.js
CHANGED
|
@@ -12272,146 +12272,25 @@ var resolveAudioConditioningConfig = (config) => {
|
|
|
12272
12272
|
};
|
|
12273
12273
|
};
|
|
12274
12274
|
|
|
12275
|
-
// src/core/turnDetection.ts
|
|
12276
|
-
var DEFAULT_SILENCE_MS = 700;
|
|
12277
|
-
var DEFAULT_SPEECH_THRESHOLD = 0.015;
|
|
12278
|
-
var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
|
|
12279
|
-
var toUint8Array = (audio) => {
|
|
12280
|
-
if (audio instanceof ArrayBuffer) {
|
|
12281
|
-
return new Uint8Array(audio);
|
|
12282
|
-
}
|
|
12283
|
-
return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
|
|
12284
|
-
};
|
|
12285
|
-
var measureAudioLevel = (audio) => {
|
|
12286
|
-
const bytes = toUint8Array(audio);
|
|
12287
|
-
if (bytes.byteLength < 2) {
|
|
12288
|
-
return 0;
|
|
12289
|
-
}
|
|
12290
|
-
const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
|
|
12291
|
-
if (samples.length === 0) {
|
|
12292
|
-
return 0;
|
|
12293
|
-
}
|
|
12294
|
-
let sumSquares = 0;
|
|
12295
|
-
for (const sample of samples) {
|
|
12296
|
-
const normalized = sample / 32768;
|
|
12297
|
-
sumSquares += normalized * normalized;
|
|
12298
|
-
}
|
|
12299
|
-
return Math.sqrt(sumSquares / samples.length);
|
|
12300
|
-
};
|
|
12301
|
-
var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
|
|
12302
|
-
var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
|
|
12303
|
-
var selectPreferredTranscriptText = (currentText, nextText) => {
|
|
12304
|
-
const current = normalizeText(currentText);
|
|
12305
|
-
const next = normalizeText(nextText);
|
|
12306
|
-
if (!current) {
|
|
12307
|
-
return next;
|
|
12308
|
-
}
|
|
12309
|
-
if (!next) {
|
|
12310
|
-
return current;
|
|
12311
|
-
}
|
|
12312
|
-
if (current === next || current.includes(next)) {
|
|
12313
|
-
return current;
|
|
12314
|
-
}
|
|
12315
|
-
if (next.includes(current)) {
|
|
12316
|
-
return next;
|
|
12317
|
-
}
|
|
12318
|
-
if (countWords(next) > countWords(current)) {
|
|
12319
|
-
return next;
|
|
12320
|
-
}
|
|
12321
|
-
if (countWords(next) === countWords(current) && next.length > current.length) {
|
|
12322
|
-
return next;
|
|
12323
|
-
}
|
|
12324
|
-
return current;
|
|
12325
|
-
};
|
|
12326
|
-
var mergeSequentialTranscriptText = (currentText, nextText) => {
|
|
12327
|
-
const current = normalizeText(currentText);
|
|
12328
|
-
const next = normalizeText(nextText);
|
|
12329
|
-
if (!current) {
|
|
12330
|
-
return next;
|
|
12331
|
-
}
|
|
12332
|
-
if (!next) {
|
|
12333
|
-
return current;
|
|
12334
|
-
}
|
|
12335
|
-
const currentWords = current.split(" ");
|
|
12336
|
-
const nextWords = next.split(" ");
|
|
12337
|
-
const maxOverlap = Math.min(currentWords.length, nextWords.length);
|
|
12338
|
-
for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
|
|
12339
|
-
const currentSuffix = currentWords.slice(-overlap).join(" ");
|
|
12340
|
-
const nextPrefix = nextWords.slice(0, overlap).join(" ");
|
|
12341
|
-
if (currentSuffix === nextPrefix) {
|
|
12342
|
-
return [...currentWords, ...nextWords.slice(overlap)].join(" ");
|
|
12343
|
-
}
|
|
12344
|
-
}
|
|
12345
|
-
return `${current} ${next}`.trim();
|
|
12346
|
-
};
|
|
12347
|
-
var countCommonPrefixWords = (currentText, nextText) => {
|
|
12348
|
-
const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
|
|
12349
|
-
const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
|
|
12350
|
-
const maxWords = Math.min(currentWords.length, nextWords.length);
|
|
12351
|
-
let count = 0;
|
|
12352
|
-
for (let index = 0;index < maxWords; index += 1) {
|
|
12353
|
-
if (currentWords[index] !== nextWords[index]) {
|
|
12354
|
-
break;
|
|
12355
|
-
}
|
|
12356
|
-
count += 1;
|
|
12357
|
-
}
|
|
12358
|
-
return count;
|
|
12359
|
-
};
|
|
12360
|
-
var mergeTranscriptTexts = (transcripts) => {
|
|
12361
|
-
const merged = [];
|
|
12362
|
-
for (const transcript of transcripts) {
|
|
12363
|
-
const nextText = normalizeText(transcript.text);
|
|
12364
|
-
if (!nextText) {
|
|
12365
|
-
continue;
|
|
12366
|
-
}
|
|
12367
|
-
const previous = merged.at(-1);
|
|
12368
|
-
if (!previous) {
|
|
12369
|
-
merged.push(nextText);
|
|
12370
|
-
continue;
|
|
12371
|
-
}
|
|
12372
|
-
if (nextText === previous || previous.includes(nextText)) {
|
|
12373
|
-
continue;
|
|
12374
|
-
}
|
|
12375
|
-
if (nextText.includes(previous)) {
|
|
12376
|
-
merged[merged.length - 1] = nextText;
|
|
12377
|
-
continue;
|
|
12378
|
-
}
|
|
12379
|
-
merged.push(nextText);
|
|
12380
|
-
}
|
|
12381
|
-
return merged.join(" ").trim();
|
|
12382
|
-
};
|
|
12383
|
-
var buildTurnText = (transcripts, partialText, options = {}) => {
|
|
12384
|
-
const finalText = mergeTranscriptTexts(transcripts);
|
|
12385
|
-
const nextPartial = normalizeText(partialText);
|
|
12386
|
-
const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
|
|
12387
|
-
if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
|
|
12388
|
-
return mergeSequentialTranscriptText(finalText, nextPartial);
|
|
12389
|
-
}
|
|
12390
|
-
return selectPreferredTranscriptText(finalText, nextPartial);
|
|
12391
|
-
};
|
|
12392
|
-
|
|
12393
12275
|
// src/core/turnProfiles.ts
|
|
12394
12276
|
var TURN_PROFILE_DEFAULTS = {
|
|
12395
12277
|
balanced: {
|
|
12396
12278
|
qualityProfile: "general",
|
|
12397
|
-
|
|
12398
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
12279
|
+
minSilenceMs: 400,
|
|
12399
12280
|
silenceMs: 1400,
|
|
12400
12281
|
speechThreshold: 0.012,
|
|
12401
12282
|
transcriptStabilityMs: 1000
|
|
12402
12283
|
},
|
|
12403
12284
|
fast: {
|
|
12404
12285
|
qualityProfile: "general",
|
|
12405
|
-
|
|
12406
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
12286
|
+
minSilenceMs: 300,
|
|
12407
12287
|
silenceMs: 700,
|
|
12408
12288
|
speechThreshold: 0.015,
|
|
12409
12289
|
transcriptStabilityMs: 450
|
|
12410
12290
|
},
|
|
12411
12291
|
"long-form": {
|
|
12412
12292
|
qualityProfile: "general",
|
|
12413
|
-
|
|
12414
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
12293
|
+
minSilenceMs: 600,
|
|
12415
12294
|
silenceMs: 2200,
|
|
12416
12295
|
speechThreshold: 0.01,
|
|
12417
12296
|
transcriptStabilityMs: 1500
|
|
@@ -12442,12 +12321,12 @@ var resolveTurnDetectionConfig = (config) => {
|
|
|
12442
12321
|
const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
|
|
12443
12322
|
const preset = TURN_PROFILE_DEFAULTS[profile];
|
|
12444
12323
|
const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
|
|
12324
|
+
const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
|
|
12445
12325
|
return {
|
|
12446
12326
|
profile,
|
|
12447
12327
|
qualityProfile,
|
|
12448
|
-
|
|
12449
|
-
|
|
12450
|
-
silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
|
|
12328
|
+
minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
|
|
12329
|
+
silenceMs,
|
|
12451
12330
|
speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
|
|
12452
12331
|
transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
|
|
12453
12332
|
};
|
package/dist/svelte/index.js
CHANGED
|
@@ -1409,146 +1409,25 @@ var resolveAudioConditioningConfig = (config) => {
|
|
|
1409
1409
|
};
|
|
1410
1410
|
};
|
|
1411
1411
|
|
|
1412
|
-
// src/core/turnDetection.ts
|
|
1413
|
-
var DEFAULT_SILENCE_MS = 700;
|
|
1414
|
-
var DEFAULT_SPEECH_THRESHOLD = 0.015;
|
|
1415
|
-
var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
|
|
1416
|
-
var toUint8Array = (audio) => {
|
|
1417
|
-
if (audio instanceof ArrayBuffer) {
|
|
1418
|
-
return new Uint8Array(audio);
|
|
1419
|
-
}
|
|
1420
|
-
return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
|
|
1421
|
-
};
|
|
1422
|
-
var measureAudioLevel = (audio) => {
|
|
1423
|
-
const bytes = toUint8Array(audio);
|
|
1424
|
-
if (bytes.byteLength < 2) {
|
|
1425
|
-
return 0;
|
|
1426
|
-
}
|
|
1427
|
-
const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
|
|
1428
|
-
if (samples.length === 0) {
|
|
1429
|
-
return 0;
|
|
1430
|
-
}
|
|
1431
|
-
let sumSquares = 0;
|
|
1432
|
-
for (const sample of samples) {
|
|
1433
|
-
const normalized = sample / 32768;
|
|
1434
|
-
sumSquares += normalized * normalized;
|
|
1435
|
-
}
|
|
1436
|
-
return Math.sqrt(sumSquares / samples.length);
|
|
1437
|
-
};
|
|
1438
|
-
var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
|
|
1439
|
-
var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
|
|
1440
|
-
var selectPreferredTranscriptText = (currentText, nextText) => {
|
|
1441
|
-
const current = normalizeText(currentText);
|
|
1442
|
-
const next = normalizeText(nextText);
|
|
1443
|
-
if (!current) {
|
|
1444
|
-
return next;
|
|
1445
|
-
}
|
|
1446
|
-
if (!next) {
|
|
1447
|
-
return current;
|
|
1448
|
-
}
|
|
1449
|
-
if (current === next || current.includes(next)) {
|
|
1450
|
-
return current;
|
|
1451
|
-
}
|
|
1452
|
-
if (next.includes(current)) {
|
|
1453
|
-
return next;
|
|
1454
|
-
}
|
|
1455
|
-
if (countWords(next) > countWords(current)) {
|
|
1456
|
-
return next;
|
|
1457
|
-
}
|
|
1458
|
-
if (countWords(next) === countWords(current) && next.length > current.length) {
|
|
1459
|
-
return next;
|
|
1460
|
-
}
|
|
1461
|
-
return current;
|
|
1462
|
-
};
|
|
1463
|
-
var mergeSequentialTranscriptText = (currentText, nextText) => {
|
|
1464
|
-
const current = normalizeText(currentText);
|
|
1465
|
-
const next = normalizeText(nextText);
|
|
1466
|
-
if (!current) {
|
|
1467
|
-
return next;
|
|
1468
|
-
}
|
|
1469
|
-
if (!next) {
|
|
1470
|
-
return current;
|
|
1471
|
-
}
|
|
1472
|
-
const currentWords = current.split(" ");
|
|
1473
|
-
const nextWords = next.split(" ");
|
|
1474
|
-
const maxOverlap = Math.min(currentWords.length, nextWords.length);
|
|
1475
|
-
for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
|
|
1476
|
-
const currentSuffix = currentWords.slice(-overlap).join(" ");
|
|
1477
|
-
const nextPrefix = nextWords.slice(0, overlap).join(" ");
|
|
1478
|
-
if (currentSuffix === nextPrefix) {
|
|
1479
|
-
return [...currentWords, ...nextWords.slice(overlap)].join(" ");
|
|
1480
|
-
}
|
|
1481
|
-
}
|
|
1482
|
-
return `${current} ${next}`.trim();
|
|
1483
|
-
};
|
|
1484
|
-
var countCommonPrefixWords = (currentText, nextText) => {
|
|
1485
|
-
const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
|
|
1486
|
-
const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
|
|
1487
|
-
const maxWords = Math.min(currentWords.length, nextWords.length);
|
|
1488
|
-
let count = 0;
|
|
1489
|
-
for (let index = 0;index < maxWords; index += 1) {
|
|
1490
|
-
if (currentWords[index] !== nextWords[index]) {
|
|
1491
|
-
break;
|
|
1492
|
-
}
|
|
1493
|
-
count += 1;
|
|
1494
|
-
}
|
|
1495
|
-
return count;
|
|
1496
|
-
};
|
|
1497
|
-
var mergeTranscriptTexts = (transcripts) => {
|
|
1498
|
-
const merged = [];
|
|
1499
|
-
for (const transcript of transcripts) {
|
|
1500
|
-
const nextText = normalizeText(transcript.text);
|
|
1501
|
-
if (!nextText) {
|
|
1502
|
-
continue;
|
|
1503
|
-
}
|
|
1504
|
-
const previous = merged.at(-1);
|
|
1505
|
-
if (!previous) {
|
|
1506
|
-
merged.push(nextText);
|
|
1507
|
-
continue;
|
|
1508
|
-
}
|
|
1509
|
-
if (nextText === previous || previous.includes(nextText)) {
|
|
1510
|
-
continue;
|
|
1511
|
-
}
|
|
1512
|
-
if (nextText.includes(previous)) {
|
|
1513
|
-
merged[merged.length - 1] = nextText;
|
|
1514
|
-
continue;
|
|
1515
|
-
}
|
|
1516
|
-
merged.push(nextText);
|
|
1517
|
-
}
|
|
1518
|
-
return merged.join(" ").trim();
|
|
1519
|
-
};
|
|
1520
|
-
var buildTurnText = (transcripts, partialText, options = {}) => {
|
|
1521
|
-
const finalText = mergeTranscriptTexts(transcripts);
|
|
1522
|
-
const nextPartial = normalizeText(partialText);
|
|
1523
|
-
const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
|
|
1524
|
-
if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
|
|
1525
|
-
return mergeSequentialTranscriptText(finalText, nextPartial);
|
|
1526
|
-
}
|
|
1527
|
-
return selectPreferredTranscriptText(finalText, nextPartial);
|
|
1528
|
-
};
|
|
1529
|
-
|
|
1530
1412
|
// src/core/turnProfiles.ts
|
|
1531
1413
|
var TURN_PROFILE_DEFAULTS = {
|
|
1532
1414
|
balanced: {
|
|
1533
1415
|
qualityProfile: "general",
|
|
1534
|
-
|
|
1535
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
1416
|
+
minSilenceMs: 400,
|
|
1536
1417
|
silenceMs: 1400,
|
|
1537
1418
|
speechThreshold: 0.012,
|
|
1538
1419
|
transcriptStabilityMs: 1000
|
|
1539
1420
|
},
|
|
1540
1421
|
fast: {
|
|
1541
1422
|
qualityProfile: "general",
|
|
1542
|
-
|
|
1543
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
1423
|
+
minSilenceMs: 300,
|
|
1544
1424
|
silenceMs: 700,
|
|
1545
1425
|
speechThreshold: 0.015,
|
|
1546
1426
|
transcriptStabilityMs: 450
|
|
1547
1427
|
},
|
|
1548
1428
|
"long-form": {
|
|
1549
1429
|
qualityProfile: "general",
|
|
1550
|
-
|
|
1551
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
1430
|
+
minSilenceMs: 600,
|
|
1552
1431
|
silenceMs: 2200,
|
|
1553
1432
|
speechThreshold: 0.01,
|
|
1554
1433
|
transcriptStabilityMs: 1500
|
|
@@ -1579,12 +1458,12 @@ var resolveTurnDetectionConfig = (config) => {
|
|
|
1579
1458
|
const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
|
|
1580
1459
|
const preset = TURN_PROFILE_DEFAULTS[profile];
|
|
1581
1460
|
const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
|
|
1461
|
+
const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
|
|
1582
1462
|
return {
|
|
1583
1463
|
profile,
|
|
1584
1464
|
qualityProfile,
|
|
1585
|
-
|
|
1586
|
-
|
|
1587
|
-
silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
|
|
1465
|
+
minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
|
|
1466
|
+
silenceMs,
|
|
1588
1467
|
speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
|
|
1589
1468
|
transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
|
|
1590
1469
|
};
|
package/dist/testing/index.js
CHANGED
|
@@ -86,7 +86,7 @@ var __require = import.meta.require;
|
|
|
86
86
|
// src/core/turnDetection.ts
|
|
87
87
|
var DEFAULT_SILENCE_MS = 700;
|
|
88
88
|
var DEFAULT_SPEECH_THRESHOLD = 0.015;
|
|
89
|
-
var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
|
|
89
|
+
var DEFAULT_MIN_SILENCE_MS = 400;
|
|
90
90
|
var toUint8Array = (audio) => {
|
|
91
91
|
if (audio instanceof ArrayBuffer) {
|
|
92
92
|
return new Uint8Array(audio);
|
|
@@ -3163,24 +3163,21 @@ var resolveAudioConditioningConfig = (config) => {
|
|
|
3163
3163
|
var TURN_PROFILE_DEFAULTS = {
|
|
3164
3164
|
balanced: {
|
|
3165
3165
|
qualityProfile: "general",
|
|
3166
|
-
|
|
3167
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
3166
|
+
minSilenceMs: 400,
|
|
3168
3167
|
silenceMs: 1400,
|
|
3169
3168
|
speechThreshold: 0.012,
|
|
3170
3169
|
transcriptStabilityMs: 1000
|
|
3171
3170
|
},
|
|
3172
3171
|
fast: {
|
|
3173
3172
|
qualityProfile: "general",
|
|
3174
|
-
|
|
3175
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
3173
|
+
minSilenceMs: 300,
|
|
3176
3174
|
silenceMs: 700,
|
|
3177
3175
|
speechThreshold: 0.015,
|
|
3178
3176
|
transcriptStabilityMs: 450
|
|
3179
3177
|
},
|
|
3180
3178
|
"long-form": {
|
|
3181
3179
|
qualityProfile: "general",
|
|
3182
|
-
|
|
3183
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
3180
|
+
minSilenceMs: 600,
|
|
3184
3181
|
silenceMs: 2200,
|
|
3185
3182
|
speechThreshold: 0.01,
|
|
3186
3183
|
transcriptStabilityMs: 1500
|
|
@@ -3211,12 +3208,12 @@ var resolveTurnDetectionConfig = (config) => {
|
|
|
3211
3208
|
const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
|
|
3212
3209
|
const preset = TURN_PROFILE_DEFAULTS[profile];
|
|
3213
3210
|
const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
|
|
3211
|
+
const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
|
|
3214
3212
|
return {
|
|
3215
3213
|
profile,
|
|
3216
3214
|
qualityProfile,
|
|
3217
|
-
|
|
3218
|
-
|
|
3219
|
-
silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
|
|
3215
|
+
minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
|
|
3216
|
+
silenceMs,
|
|
3220
3217
|
speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
|
|
3221
3218
|
transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
|
|
3222
3219
|
};
|
|
@@ -6153,14 +6150,22 @@ var createVoiceSession = (options) => {
|
|
|
6153
6150
|
strategy: options.reconnect.strategy ?? "resume-last-turn",
|
|
6154
6151
|
timeout: options.reconnect.timeout ?? DEFAULT_RECONNECT_TIMEOUT
|
|
6155
6152
|
};
|
|
6153
|
+
const resolvedSilenceMs = options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS;
|
|
6156
6154
|
const turnDetection = {
|
|
6157
|
-
silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
|
|
6155
|
+
silenceMs: resolvedSilenceMs,
|
|
6156
|
+
minSilenceMs: Math.min(resolvedSilenceMs, options.turnDetection.minSilenceMs ?? DEFAULT_MIN_SILENCE_MS),
|
|
6158
6157
|
speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
|
|
6159
|
-
transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
|
|
6160
|
-
|
|
6161
|
-
|
|
6158
|
+
transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
|
|
6159
|
+
};
|
|
6160
|
+
let lastTurnCompleteConfidence = null;
|
|
6161
|
+
const adaptiveSilenceMs = () => {
|
|
6162
|
+
const { minSilenceMs, silenceMs } = turnDetection;
|
|
6163
|
+
if (lastTurnCompleteConfidence === null || silenceMs <= minSilenceMs) {
|
|
6164
|
+
return silenceMs;
|
|
6165
|
+
}
|
|
6166
|
+
const complete = Math.max(0, Math.min(1, lastTurnCompleteConfidence));
|
|
6167
|
+
return Math.round(minSilenceMs + (silenceMs - minSilenceMs) * (1 - complete));
|
|
6162
6168
|
};
|
|
6163
|
-
let semanticVetoElapsedMs = 0;
|
|
6164
6169
|
const sttFallback = options.sttFallback ? {
|
|
6165
6170
|
adapter: options.sttFallback.adapter,
|
|
6166
6171
|
completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
|
|
@@ -6364,6 +6369,17 @@ var createVoiceSession = (options) => {
|
|
|
6364
6369
|
pruneTurnAudio();
|
|
6365
6370
|
return currentTurnAudio.map((audio) => audio.chunk);
|
|
6366
6371
|
};
|
|
6372
|
+
const turnAudioInputFormat = recordingConfig?.userInputFormat ?? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT;
|
|
6373
|
+
const getTurnAudioForDetector = () => {
|
|
6374
|
+
if (!options.semanticTurnDetector || currentTurnAudio.length === 0) {
|
|
6375
|
+
return { turnAudio: undefined, turnAudioFormat: undefined };
|
|
6376
|
+
}
|
|
6377
|
+
const turnAudio = currentTurnAudio.map((audio) => {
|
|
6378
|
+
const c = audio.chunk;
|
|
6379
|
+
return c instanceof ArrayBuffer ? new Uint8Array(c) : new Uint8Array(c.buffer, c.byteOffset, c.byteLength);
|
|
6380
|
+
});
|
|
6381
|
+
return { turnAudio, turnAudioFormat: turnAudioInputFormat };
|
|
6382
|
+
};
|
|
6367
6383
|
const clearSilenceTimer = () => {
|
|
6368
6384
|
if (!silenceTimer) {
|
|
6369
6385
|
return;
|
|
@@ -6682,46 +6698,8 @@ var createVoiceSession = (options) => {
|
|
|
6682
6698
|
runScheduledCommit(reason);
|
|
6683
6699
|
}, delayMs);
|
|
6684
6700
|
};
|
|
6685
|
-
const scheduleSilenceCommit = (delayMs =
|
|
6686
|
-
const shouldDeferSilenceCommit = async (reason) => {
|
|
6687
|
-
if (reason !== "silence" || turnDetection.semanticVetoMaxMs <= 0 || !options.semanticTurnDetector || semanticVetoElapsedMs >= turnDetection.semanticVetoMaxMs) {
|
|
6688
|
-
return false;
|
|
6689
|
-
}
|
|
6690
|
-
const session = await readSession();
|
|
6691
|
-
const { partialText, transcripts } = session.currentTurn;
|
|
6692
|
-
const userText = buildTurnText(transcripts, partialText, {
|
|
6693
|
-
partialEndedAtMs: session.currentTurn.partialEndedAt,
|
|
6694
|
-
partialStartedAtMs: session.currentTurn.partialStartedAt
|
|
6695
|
-
});
|
|
6696
|
-
if (!userText) {
|
|
6697
|
-
return false;
|
|
6698
|
-
}
|
|
6699
|
-
const silenceMs = session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : turnDetection.silenceMs;
|
|
6700
|
-
let endOfTurn = true;
|
|
6701
|
-
try {
|
|
6702
|
-
const verdict = await Promise.resolve(options.semanticTurnDetector.evaluate({
|
|
6703
|
-
lastFinalTranscript: transcripts.at(-1),
|
|
6704
|
-
partialText,
|
|
6705
|
-
silenceMs,
|
|
6706
|
-
transcripts
|
|
6707
|
-
}));
|
|
6708
|
-
endOfTurn = verdict.endOfTurn;
|
|
6709
|
-
} catch {
|
|
6710
|
-
return false;
|
|
6711
|
-
}
|
|
6712
|
-
if (endOfTurn !== false) {
|
|
6713
|
-
return false;
|
|
6714
|
-
}
|
|
6715
|
-
const remaining = turnDetection.semanticVetoMaxMs - semanticVetoElapsedMs;
|
|
6716
|
-
const extendMs = Math.max(1, Math.min(turnDetection.semanticVetoRecheckMs, remaining));
|
|
6717
|
-
semanticVetoElapsedMs += extendMs;
|
|
6718
|
-
scheduleTurnCommit(extendMs, reason);
|
|
6719
|
-
return true;
|
|
6720
|
-
};
|
|
6701
|
+
const scheduleSilenceCommit = (delayMs = adaptiveSilenceMs(), reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
|
|
6721
6702
|
const runScheduledCommit = async (reason) => {
|
|
6722
|
-
if (await shouldDeferSilenceCommit(reason)) {
|
|
6723
|
-
return;
|
|
6724
|
-
}
|
|
6725
6703
|
await api.commitTurn(reason);
|
|
6726
6704
|
};
|
|
6727
6705
|
const requestTurnCommit = async (reason) => {
|
|
@@ -7461,7 +7439,7 @@ var createVoiceSession = (options) => {
|
|
|
7461
7439
|
session2.lastActivityAt = Date.now();
|
|
7462
7440
|
session2.status = "active";
|
|
7463
7441
|
});
|
|
7464
|
-
|
|
7442
|
+
lastTurnCompleteConfidence = null;
|
|
7465
7443
|
if (silenceTimer && pendingCommitReason === "vendor") {
|
|
7466
7444
|
scheduleTurnCommit(getVendorCommitDelayMs(), "vendor");
|
|
7467
7445
|
}
|
|
@@ -7488,8 +7466,15 @@ var createVoiceSession = (options) => {
|
|
|
7488
7466
|
lastFinalTranscript: transcript,
|
|
7489
7467
|
partialText: session.currentTurn.partialText,
|
|
7490
7468
|
silenceMs: session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : 0,
|
|
7491
|
-
transcripts: session.currentTurn.transcripts
|
|
7469
|
+
transcripts: session.currentTurn.transcripts,
|
|
7470
|
+
...getTurnAudioForDetector()
|
|
7492
7471
|
}));
|
|
7472
|
+
if (typeof verdict.confidence === "number") {
|
|
7473
|
+
lastTurnCompleteConfidence = verdict.confidence;
|
|
7474
|
+
if (silenceTimer && pendingCommitReason === "silence") {
|
|
7475
|
+
scheduleSilenceCommit();
|
|
7476
|
+
}
|
|
7477
|
+
}
|
|
7493
7478
|
if (verdict.endOfTurn) {
|
|
7494
7479
|
clearSilenceTimer();
|
|
7495
7480
|
await requestTurnCommit("vendor");
|
|
@@ -8185,7 +8170,7 @@ var createVoiceSession = (options) => {
|
|
|
8185
8170
|
};
|
|
8186
8171
|
const commitTurnInternal = async (reason = "manual") => {
|
|
8187
8172
|
clearSilenceTimer();
|
|
8188
|
-
|
|
8173
|
+
lastTurnCompleteConfidence = null;
|
|
8189
8174
|
backchannelDriver?.reset();
|
|
8190
8175
|
amdLastTurnCommitAt = Date.now();
|
|
8191
8176
|
const session = await readSession();
|
package/dist/vue/index.js
CHANGED
|
@@ -11689,146 +11689,25 @@ var resolveAudioConditioningConfig = (config) => {
|
|
|
11689
11689
|
};
|
|
11690
11690
|
};
|
|
11691
11691
|
|
|
11692
|
-
// src/core/turnDetection.ts
|
|
11693
|
-
var DEFAULT_SILENCE_MS = 700;
|
|
11694
|
-
var DEFAULT_SPEECH_THRESHOLD = 0.015;
|
|
11695
|
-
var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
|
|
11696
|
-
var toUint8Array = (audio) => {
|
|
11697
|
-
if (audio instanceof ArrayBuffer) {
|
|
11698
|
-
return new Uint8Array(audio);
|
|
11699
|
-
}
|
|
11700
|
-
return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
|
|
11701
|
-
};
|
|
11702
|
-
var measureAudioLevel = (audio) => {
|
|
11703
|
-
const bytes = toUint8Array(audio);
|
|
11704
|
-
if (bytes.byteLength < 2) {
|
|
11705
|
-
return 0;
|
|
11706
|
-
}
|
|
11707
|
-
const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
|
|
11708
|
-
if (samples.length === 0) {
|
|
11709
|
-
return 0;
|
|
11710
|
-
}
|
|
11711
|
-
let sumSquares = 0;
|
|
11712
|
-
for (const sample of samples) {
|
|
11713
|
-
const normalized = sample / 32768;
|
|
11714
|
-
sumSquares += normalized * normalized;
|
|
11715
|
-
}
|
|
11716
|
-
return Math.sqrt(sumSquares / samples.length);
|
|
11717
|
-
};
|
|
11718
|
-
var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
|
|
11719
|
-
var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
|
|
11720
|
-
var selectPreferredTranscriptText = (currentText, nextText) => {
|
|
11721
|
-
const current = normalizeText(currentText);
|
|
11722
|
-
const next = normalizeText(nextText);
|
|
11723
|
-
if (!current) {
|
|
11724
|
-
return next;
|
|
11725
|
-
}
|
|
11726
|
-
if (!next) {
|
|
11727
|
-
return current;
|
|
11728
|
-
}
|
|
11729
|
-
if (current === next || current.includes(next)) {
|
|
11730
|
-
return current;
|
|
11731
|
-
}
|
|
11732
|
-
if (next.includes(current)) {
|
|
11733
|
-
return next;
|
|
11734
|
-
}
|
|
11735
|
-
if (countWords(next) > countWords(current)) {
|
|
11736
|
-
return next;
|
|
11737
|
-
}
|
|
11738
|
-
if (countWords(next) === countWords(current) && next.length > current.length) {
|
|
11739
|
-
return next;
|
|
11740
|
-
}
|
|
11741
|
-
return current;
|
|
11742
|
-
};
|
|
11743
|
-
var mergeSequentialTranscriptText = (currentText, nextText) => {
|
|
11744
|
-
const current = normalizeText(currentText);
|
|
11745
|
-
const next = normalizeText(nextText);
|
|
11746
|
-
if (!current) {
|
|
11747
|
-
return next;
|
|
11748
|
-
}
|
|
11749
|
-
if (!next) {
|
|
11750
|
-
return current;
|
|
11751
|
-
}
|
|
11752
|
-
const currentWords = current.split(" ");
|
|
11753
|
-
const nextWords = next.split(" ");
|
|
11754
|
-
const maxOverlap = Math.min(currentWords.length, nextWords.length);
|
|
11755
|
-
for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
|
|
11756
|
-
const currentSuffix = currentWords.slice(-overlap).join(" ");
|
|
11757
|
-
const nextPrefix = nextWords.slice(0, overlap).join(" ");
|
|
11758
|
-
if (currentSuffix === nextPrefix) {
|
|
11759
|
-
return [...currentWords, ...nextWords.slice(overlap)].join(" ");
|
|
11760
|
-
}
|
|
11761
|
-
}
|
|
11762
|
-
return `${current} ${next}`.trim();
|
|
11763
|
-
};
|
|
11764
|
-
var countCommonPrefixWords = (currentText, nextText) => {
|
|
11765
|
-
const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
|
|
11766
|
-
const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
|
|
11767
|
-
const maxWords = Math.min(currentWords.length, nextWords.length);
|
|
11768
|
-
let count = 0;
|
|
11769
|
-
for (let index = 0;index < maxWords; index += 1) {
|
|
11770
|
-
if (currentWords[index] !== nextWords[index]) {
|
|
11771
|
-
break;
|
|
11772
|
-
}
|
|
11773
|
-
count += 1;
|
|
11774
|
-
}
|
|
11775
|
-
return count;
|
|
11776
|
-
};
|
|
11777
|
-
var mergeTranscriptTexts = (transcripts) => {
|
|
11778
|
-
const merged = [];
|
|
11779
|
-
for (const transcript of transcripts) {
|
|
11780
|
-
const nextText = normalizeText(transcript.text);
|
|
11781
|
-
if (!nextText) {
|
|
11782
|
-
continue;
|
|
11783
|
-
}
|
|
11784
|
-
const previous = merged.at(-1);
|
|
11785
|
-
if (!previous) {
|
|
11786
|
-
merged.push(nextText);
|
|
11787
|
-
continue;
|
|
11788
|
-
}
|
|
11789
|
-
if (nextText === previous || previous.includes(nextText)) {
|
|
11790
|
-
continue;
|
|
11791
|
-
}
|
|
11792
|
-
if (nextText.includes(previous)) {
|
|
11793
|
-
merged[merged.length - 1] = nextText;
|
|
11794
|
-
continue;
|
|
11795
|
-
}
|
|
11796
|
-
merged.push(nextText);
|
|
11797
|
-
}
|
|
11798
|
-
return merged.join(" ").trim();
|
|
11799
|
-
};
|
|
11800
|
-
var buildTurnText = (transcripts, partialText, options = {}) => {
|
|
11801
|
-
const finalText = mergeTranscriptTexts(transcripts);
|
|
11802
|
-
const nextPartial = normalizeText(partialText);
|
|
11803
|
-
const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
|
|
11804
|
-
if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
|
|
11805
|
-
return mergeSequentialTranscriptText(finalText, nextPartial);
|
|
11806
|
-
}
|
|
11807
|
-
return selectPreferredTranscriptText(finalText, nextPartial);
|
|
11808
|
-
};
|
|
11809
|
-
|
|
11810
11692
|
// src/core/turnProfiles.ts
|
|
11811
11693
|
var TURN_PROFILE_DEFAULTS = {
|
|
11812
11694
|
balanced: {
|
|
11813
11695
|
qualityProfile: "general",
|
|
11814
|
-
|
|
11815
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
11696
|
+
minSilenceMs: 400,
|
|
11816
11697
|
silenceMs: 1400,
|
|
11817
11698
|
speechThreshold: 0.012,
|
|
11818
11699
|
transcriptStabilityMs: 1000
|
|
11819
11700
|
},
|
|
11820
11701
|
fast: {
|
|
11821
11702
|
qualityProfile: "general",
|
|
11822
|
-
|
|
11823
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
11703
|
+
minSilenceMs: 300,
|
|
11824
11704
|
silenceMs: 700,
|
|
11825
11705
|
speechThreshold: 0.015,
|
|
11826
11706
|
transcriptStabilityMs: 450
|
|
11827
11707
|
},
|
|
11828
11708
|
"long-form": {
|
|
11829
11709
|
qualityProfile: "general",
|
|
11830
|
-
|
|
11831
|
-
semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
|
|
11710
|
+
minSilenceMs: 600,
|
|
11832
11711
|
silenceMs: 2200,
|
|
11833
11712
|
speechThreshold: 0.01,
|
|
11834
11713
|
transcriptStabilityMs: 1500
|
|
@@ -11859,12 +11738,12 @@ var resolveTurnDetectionConfig = (config) => {
|
|
|
11859
11738
|
const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
|
|
11860
11739
|
const preset = TURN_PROFILE_DEFAULTS[profile];
|
|
11861
11740
|
const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
|
|
11741
|
+
const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
|
|
11862
11742
|
return {
|
|
11863
11743
|
profile,
|
|
11864
11744
|
qualityProfile,
|
|
11865
|
-
|
|
11866
|
-
|
|
11867
|
-
silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
|
|
11745
|
+
minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
|
|
11746
|
+
silenceMs,
|
|
11868
11747
|
speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
|
|
11869
11748
|
transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
|
|
11870
11749
|
};
|