@absolutejs/voice 0.0.22-beta.597 → 0.0.22-beta.599

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12272,146 +12272,25 @@ var resolveAudioConditioningConfig = (config) => {
12272
12272
  };
12273
12273
  };
12274
12274
 
12275
- // src/core/turnDetection.ts
12276
- var DEFAULT_SILENCE_MS = 700;
12277
- var DEFAULT_SPEECH_THRESHOLD = 0.015;
12278
- var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
12279
- var toUint8Array = (audio) => {
12280
- if (audio instanceof ArrayBuffer) {
12281
- return new Uint8Array(audio);
12282
- }
12283
- return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
12284
- };
12285
- var measureAudioLevel = (audio) => {
12286
- const bytes = toUint8Array(audio);
12287
- if (bytes.byteLength < 2) {
12288
- return 0;
12289
- }
12290
- const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
12291
- if (samples.length === 0) {
12292
- return 0;
12293
- }
12294
- let sumSquares = 0;
12295
- for (const sample of samples) {
12296
- const normalized = sample / 32768;
12297
- sumSquares += normalized * normalized;
12298
- }
12299
- return Math.sqrt(sumSquares / samples.length);
12300
- };
12301
- var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
12302
- var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
12303
- var selectPreferredTranscriptText = (currentText, nextText) => {
12304
- const current = normalizeText(currentText);
12305
- const next = normalizeText(nextText);
12306
- if (!current) {
12307
- return next;
12308
- }
12309
- if (!next) {
12310
- return current;
12311
- }
12312
- if (current === next || current.includes(next)) {
12313
- return current;
12314
- }
12315
- if (next.includes(current)) {
12316
- return next;
12317
- }
12318
- if (countWords(next) > countWords(current)) {
12319
- return next;
12320
- }
12321
- if (countWords(next) === countWords(current) && next.length > current.length) {
12322
- return next;
12323
- }
12324
- return current;
12325
- };
12326
- var mergeSequentialTranscriptText = (currentText, nextText) => {
12327
- const current = normalizeText(currentText);
12328
- const next = normalizeText(nextText);
12329
- if (!current) {
12330
- return next;
12331
- }
12332
- if (!next) {
12333
- return current;
12334
- }
12335
- const currentWords = current.split(" ");
12336
- const nextWords = next.split(" ");
12337
- const maxOverlap = Math.min(currentWords.length, nextWords.length);
12338
- for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
12339
- const currentSuffix = currentWords.slice(-overlap).join(" ");
12340
- const nextPrefix = nextWords.slice(0, overlap).join(" ");
12341
- if (currentSuffix === nextPrefix) {
12342
- return [...currentWords, ...nextWords.slice(overlap)].join(" ");
12343
- }
12344
- }
12345
- return `${current} ${next}`.trim();
12346
- };
12347
- var countCommonPrefixWords = (currentText, nextText) => {
12348
- const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
12349
- const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
12350
- const maxWords = Math.min(currentWords.length, nextWords.length);
12351
- let count = 0;
12352
- for (let index = 0;index < maxWords; index += 1) {
12353
- if (currentWords[index] !== nextWords[index]) {
12354
- break;
12355
- }
12356
- count += 1;
12357
- }
12358
- return count;
12359
- };
12360
- var mergeTranscriptTexts = (transcripts) => {
12361
- const merged = [];
12362
- for (const transcript of transcripts) {
12363
- const nextText = normalizeText(transcript.text);
12364
- if (!nextText) {
12365
- continue;
12366
- }
12367
- const previous = merged.at(-1);
12368
- if (!previous) {
12369
- merged.push(nextText);
12370
- continue;
12371
- }
12372
- if (nextText === previous || previous.includes(nextText)) {
12373
- continue;
12374
- }
12375
- if (nextText.includes(previous)) {
12376
- merged[merged.length - 1] = nextText;
12377
- continue;
12378
- }
12379
- merged.push(nextText);
12380
- }
12381
- return merged.join(" ").trim();
12382
- };
12383
- var buildTurnText = (transcripts, partialText, options = {}) => {
12384
- const finalText = mergeTranscriptTexts(transcripts);
12385
- const nextPartial = normalizeText(partialText);
12386
- const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
12387
- if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
12388
- return mergeSequentialTranscriptText(finalText, nextPartial);
12389
- }
12390
- return selectPreferredTranscriptText(finalText, nextPartial);
12391
- };
12392
-
12393
12275
  // src/core/turnProfiles.ts
12394
12276
  var TURN_PROFILE_DEFAULTS = {
12395
12277
  balanced: {
12396
12278
  qualityProfile: "general",
12397
- semanticVetoMaxMs: 0,
12398
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
12279
+ minSilenceMs: 400,
12399
12280
  silenceMs: 1400,
12400
12281
  speechThreshold: 0.012,
12401
12282
  transcriptStabilityMs: 1000
12402
12283
  },
12403
12284
  fast: {
12404
12285
  qualityProfile: "general",
12405
- semanticVetoMaxMs: 0,
12406
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
12286
+ minSilenceMs: 300,
12407
12287
  silenceMs: 700,
12408
12288
  speechThreshold: 0.015,
12409
12289
  transcriptStabilityMs: 450
12410
12290
  },
12411
12291
  "long-form": {
12412
12292
  qualityProfile: "general",
12413
- semanticVetoMaxMs: 0,
12414
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
12293
+ minSilenceMs: 600,
12415
12294
  silenceMs: 2200,
12416
12295
  speechThreshold: 0.01,
12417
12296
  transcriptStabilityMs: 1500
@@ -12442,12 +12321,12 @@ var resolveTurnDetectionConfig = (config) => {
12442
12321
  const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
12443
12322
  const preset = TURN_PROFILE_DEFAULTS[profile];
12444
12323
  const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
12324
+ const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
12445
12325
  return {
12446
12326
  profile,
12447
12327
  qualityProfile,
12448
- semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
12449
- semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
12450
- silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
12328
+ minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
12329
+ silenceMs,
12451
12330
  speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
12452
12331
  transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
12453
12332
  };
@@ -1409,146 +1409,25 @@ var resolveAudioConditioningConfig = (config) => {
1409
1409
  };
1410
1410
  };
1411
1411
 
1412
- // src/core/turnDetection.ts
1413
- var DEFAULT_SILENCE_MS = 700;
1414
- var DEFAULT_SPEECH_THRESHOLD = 0.015;
1415
- var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
1416
- var toUint8Array = (audio) => {
1417
- if (audio instanceof ArrayBuffer) {
1418
- return new Uint8Array(audio);
1419
- }
1420
- return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
1421
- };
1422
- var measureAudioLevel = (audio) => {
1423
- const bytes = toUint8Array(audio);
1424
- if (bytes.byteLength < 2) {
1425
- return 0;
1426
- }
1427
- const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
1428
- if (samples.length === 0) {
1429
- return 0;
1430
- }
1431
- let sumSquares = 0;
1432
- for (const sample of samples) {
1433
- const normalized = sample / 32768;
1434
- sumSquares += normalized * normalized;
1435
- }
1436
- return Math.sqrt(sumSquares / samples.length);
1437
- };
1438
- var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
1439
- var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
1440
- var selectPreferredTranscriptText = (currentText, nextText) => {
1441
- const current = normalizeText(currentText);
1442
- const next = normalizeText(nextText);
1443
- if (!current) {
1444
- return next;
1445
- }
1446
- if (!next) {
1447
- return current;
1448
- }
1449
- if (current === next || current.includes(next)) {
1450
- return current;
1451
- }
1452
- if (next.includes(current)) {
1453
- return next;
1454
- }
1455
- if (countWords(next) > countWords(current)) {
1456
- return next;
1457
- }
1458
- if (countWords(next) === countWords(current) && next.length > current.length) {
1459
- return next;
1460
- }
1461
- return current;
1462
- };
1463
- var mergeSequentialTranscriptText = (currentText, nextText) => {
1464
- const current = normalizeText(currentText);
1465
- const next = normalizeText(nextText);
1466
- if (!current) {
1467
- return next;
1468
- }
1469
- if (!next) {
1470
- return current;
1471
- }
1472
- const currentWords = current.split(" ");
1473
- const nextWords = next.split(" ");
1474
- const maxOverlap = Math.min(currentWords.length, nextWords.length);
1475
- for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
1476
- const currentSuffix = currentWords.slice(-overlap).join(" ");
1477
- const nextPrefix = nextWords.slice(0, overlap).join(" ");
1478
- if (currentSuffix === nextPrefix) {
1479
- return [...currentWords, ...nextWords.slice(overlap)].join(" ");
1480
- }
1481
- }
1482
- return `${current} ${next}`.trim();
1483
- };
1484
- var countCommonPrefixWords = (currentText, nextText) => {
1485
- const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
1486
- const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
1487
- const maxWords = Math.min(currentWords.length, nextWords.length);
1488
- let count = 0;
1489
- for (let index = 0;index < maxWords; index += 1) {
1490
- if (currentWords[index] !== nextWords[index]) {
1491
- break;
1492
- }
1493
- count += 1;
1494
- }
1495
- return count;
1496
- };
1497
- var mergeTranscriptTexts = (transcripts) => {
1498
- const merged = [];
1499
- for (const transcript of transcripts) {
1500
- const nextText = normalizeText(transcript.text);
1501
- if (!nextText) {
1502
- continue;
1503
- }
1504
- const previous = merged.at(-1);
1505
- if (!previous) {
1506
- merged.push(nextText);
1507
- continue;
1508
- }
1509
- if (nextText === previous || previous.includes(nextText)) {
1510
- continue;
1511
- }
1512
- if (nextText.includes(previous)) {
1513
- merged[merged.length - 1] = nextText;
1514
- continue;
1515
- }
1516
- merged.push(nextText);
1517
- }
1518
- return merged.join(" ").trim();
1519
- };
1520
- var buildTurnText = (transcripts, partialText, options = {}) => {
1521
- const finalText = mergeTranscriptTexts(transcripts);
1522
- const nextPartial = normalizeText(partialText);
1523
- const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
1524
- if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
1525
- return mergeSequentialTranscriptText(finalText, nextPartial);
1526
- }
1527
- return selectPreferredTranscriptText(finalText, nextPartial);
1528
- };
1529
-
1530
1412
  // src/core/turnProfiles.ts
1531
1413
  var TURN_PROFILE_DEFAULTS = {
1532
1414
  balanced: {
1533
1415
  qualityProfile: "general",
1534
- semanticVetoMaxMs: 0,
1535
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
1416
+ minSilenceMs: 400,
1536
1417
  silenceMs: 1400,
1537
1418
  speechThreshold: 0.012,
1538
1419
  transcriptStabilityMs: 1000
1539
1420
  },
1540
1421
  fast: {
1541
1422
  qualityProfile: "general",
1542
- semanticVetoMaxMs: 0,
1543
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
1423
+ minSilenceMs: 300,
1544
1424
  silenceMs: 700,
1545
1425
  speechThreshold: 0.015,
1546
1426
  transcriptStabilityMs: 450
1547
1427
  },
1548
1428
  "long-form": {
1549
1429
  qualityProfile: "general",
1550
- semanticVetoMaxMs: 0,
1551
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
1430
+ minSilenceMs: 600,
1552
1431
  silenceMs: 2200,
1553
1432
  speechThreshold: 0.01,
1554
1433
  transcriptStabilityMs: 1500
@@ -1579,12 +1458,12 @@ var resolveTurnDetectionConfig = (config) => {
1579
1458
  const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
1580
1459
  const preset = TURN_PROFILE_DEFAULTS[profile];
1581
1460
  const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
1461
+ const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
1582
1462
  return {
1583
1463
  profile,
1584
1464
  qualityProfile,
1585
- semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
1586
- semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
1587
- silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
1465
+ minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
1466
+ silenceMs,
1588
1467
  speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
1589
1468
  transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
1590
1469
  };
@@ -86,7 +86,7 @@ var __require = import.meta.require;
86
86
  // src/core/turnDetection.ts
87
87
  var DEFAULT_SILENCE_MS = 700;
88
88
  var DEFAULT_SPEECH_THRESHOLD = 0.015;
89
- var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
89
+ var DEFAULT_MIN_SILENCE_MS = 400;
90
90
  var toUint8Array = (audio) => {
91
91
  if (audio instanceof ArrayBuffer) {
92
92
  return new Uint8Array(audio);
@@ -3163,24 +3163,21 @@ var resolveAudioConditioningConfig = (config) => {
3163
3163
  var TURN_PROFILE_DEFAULTS = {
3164
3164
  balanced: {
3165
3165
  qualityProfile: "general",
3166
- semanticVetoMaxMs: 0,
3167
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
3166
+ minSilenceMs: 400,
3168
3167
  silenceMs: 1400,
3169
3168
  speechThreshold: 0.012,
3170
3169
  transcriptStabilityMs: 1000
3171
3170
  },
3172
3171
  fast: {
3173
3172
  qualityProfile: "general",
3174
- semanticVetoMaxMs: 0,
3175
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
3173
+ minSilenceMs: 300,
3176
3174
  silenceMs: 700,
3177
3175
  speechThreshold: 0.015,
3178
3176
  transcriptStabilityMs: 450
3179
3177
  },
3180
3178
  "long-form": {
3181
3179
  qualityProfile: "general",
3182
- semanticVetoMaxMs: 0,
3183
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
3180
+ minSilenceMs: 600,
3184
3181
  silenceMs: 2200,
3185
3182
  speechThreshold: 0.01,
3186
3183
  transcriptStabilityMs: 1500
@@ -3211,12 +3208,12 @@ var resolveTurnDetectionConfig = (config) => {
3211
3208
  const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
3212
3209
  const preset = TURN_PROFILE_DEFAULTS[profile];
3213
3210
  const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
3211
+ const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
3214
3212
  return {
3215
3213
  profile,
3216
3214
  qualityProfile,
3217
- semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
3218
- semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
3219
- silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
3215
+ minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
3216
+ silenceMs,
3220
3217
  speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
3221
3218
  transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
3222
3219
  };
@@ -6153,14 +6150,22 @@ var createVoiceSession = (options) => {
6153
6150
  strategy: options.reconnect.strategy ?? "resume-last-turn",
6154
6151
  timeout: options.reconnect.timeout ?? DEFAULT_RECONNECT_TIMEOUT
6155
6152
  };
6153
+ const resolvedSilenceMs = options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS;
6156
6154
  const turnDetection = {
6157
- silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
6155
+ silenceMs: resolvedSilenceMs,
6156
+ minSilenceMs: Math.min(resolvedSilenceMs, options.turnDetection.minSilenceMs ?? DEFAULT_MIN_SILENCE_MS),
6158
6157
  speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
6159
- transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS,
6160
- semanticVetoMaxMs: options.turnDetection.semanticVetoMaxMs ?? 0,
6161
- semanticVetoRecheckMs: options.turnDetection.semanticVetoRecheckMs ?? DEFAULT_SEMANTIC_VETO_RECHECK_MS
6158
+ transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
6159
+ };
6160
+ let lastTurnCompleteConfidence = null;
6161
+ const adaptiveSilenceMs = () => {
6162
+ const { minSilenceMs, silenceMs } = turnDetection;
6163
+ if (lastTurnCompleteConfidence === null || silenceMs <= minSilenceMs) {
6164
+ return silenceMs;
6165
+ }
6166
+ const complete = Math.max(0, Math.min(1, lastTurnCompleteConfidence));
6167
+ return Math.round(minSilenceMs + (silenceMs - minSilenceMs) * (1 - complete));
6162
6168
  };
6163
- let semanticVetoElapsedMs = 0;
6164
6169
  const sttFallback = options.sttFallback ? {
6165
6170
  adapter: options.sttFallback.adapter,
6166
6171
  completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
@@ -6364,6 +6369,17 @@ var createVoiceSession = (options) => {
6364
6369
  pruneTurnAudio();
6365
6370
  return currentTurnAudio.map((audio) => audio.chunk);
6366
6371
  };
6372
+ const turnAudioInputFormat = recordingConfig?.userInputFormat ?? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT;
6373
+ const getTurnAudioForDetector = () => {
6374
+ if (!options.semanticTurnDetector || currentTurnAudio.length === 0) {
6375
+ return { turnAudio: undefined, turnAudioFormat: undefined };
6376
+ }
6377
+ const turnAudio = currentTurnAudio.map((audio) => {
6378
+ const c = audio.chunk;
6379
+ return c instanceof ArrayBuffer ? new Uint8Array(c) : new Uint8Array(c.buffer, c.byteOffset, c.byteLength);
6380
+ });
6381
+ return { turnAudio, turnAudioFormat: turnAudioInputFormat };
6382
+ };
6367
6383
  const clearSilenceTimer = () => {
6368
6384
  if (!silenceTimer) {
6369
6385
  return;
@@ -6682,46 +6698,8 @@ var createVoiceSession = (options) => {
6682
6698
  runScheduledCommit(reason);
6683
6699
  }, delayMs);
6684
6700
  };
6685
- const scheduleSilenceCommit = (delayMs = turnDetection.silenceMs, reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
6686
- const shouldDeferSilenceCommit = async (reason) => {
6687
- if (reason !== "silence" || turnDetection.semanticVetoMaxMs <= 0 || !options.semanticTurnDetector || semanticVetoElapsedMs >= turnDetection.semanticVetoMaxMs) {
6688
- return false;
6689
- }
6690
- const session = await readSession();
6691
- const { partialText, transcripts } = session.currentTurn;
6692
- const userText = buildTurnText(transcripts, partialText, {
6693
- partialEndedAtMs: session.currentTurn.partialEndedAt,
6694
- partialStartedAtMs: session.currentTurn.partialStartedAt
6695
- });
6696
- if (!userText) {
6697
- return false;
6698
- }
6699
- const silenceMs = session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : turnDetection.silenceMs;
6700
- let endOfTurn = true;
6701
- try {
6702
- const verdict = await Promise.resolve(options.semanticTurnDetector.evaluate({
6703
- lastFinalTranscript: transcripts.at(-1),
6704
- partialText,
6705
- silenceMs,
6706
- transcripts
6707
- }));
6708
- endOfTurn = verdict.endOfTurn;
6709
- } catch {
6710
- return false;
6711
- }
6712
- if (endOfTurn !== false) {
6713
- return false;
6714
- }
6715
- const remaining = turnDetection.semanticVetoMaxMs - semanticVetoElapsedMs;
6716
- const extendMs = Math.max(1, Math.min(turnDetection.semanticVetoRecheckMs, remaining));
6717
- semanticVetoElapsedMs += extendMs;
6718
- scheduleTurnCommit(extendMs, reason);
6719
- return true;
6720
- };
6701
+ const scheduleSilenceCommit = (delayMs = adaptiveSilenceMs(), reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
6721
6702
  const runScheduledCommit = async (reason) => {
6722
- if (await shouldDeferSilenceCommit(reason)) {
6723
- return;
6724
- }
6725
6703
  await api.commitTurn(reason);
6726
6704
  };
6727
6705
  const requestTurnCommit = async (reason) => {
@@ -7461,7 +7439,7 @@ var createVoiceSession = (options) => {
7461
7439
  session2.lastActivityAt = Date.now();
7462
7440
  session2.status = "active";
7463
7441
  });
7464
- semanticVetoElapsedMs = 0;
7442
+ lastTurnCompleteConfidence = null;
7465
7443
  if (silenceTimer && pendingCommitReason === "vendor") {
7466
7444
  scheduleTurnCommit(getVendorCommitDelayMs(), "vendor");
7467
7445
  }
@@ -7488,8 +7466,15 @@ var createVoiceSession = (options) => {
7488
7466
  lastFinalTranscript: transcript,
7489
7467
  partialText: session.currentTurn.partialText,
7490
7468
  silenceMs: session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : 0,
7491
- transcripts: session.currentTurn.transcripts
7469
+ transcripts: session.currentTurn.transcripts,
7470
+ ...getTurnAudioForDetector()
7492
7471
  }));
7472
+ if (typeof verdict.confidence === "number") {
7473
+ lastTurnCompleteConfidence = verdict.confidence;
7474
+ if (silenceTimer && pendingCommitReason === "silence") {
7475
+ scheduleSilenceCommit();
7476
+ }
7477
+ }
7493
7478
  if (verdict.endOfTurn) {
7494
7479
  clearSilenceTimer();
7495
7480
  await requestTurnCommit("vendor");
@@ -8185,7 +8170,7 @@ var createVoiceSession = (options) => {
8185
8170
  };
8186
8171
  const commitTurnInternal = async (reason = "manual") => {
8187
8172
  clearSilenceTimer();
8188
- semanticVetoElapsedMs = 0;
8173
+ lastTurnCompleteConfidence = null;
8189
8174
  backchannelDriver?.reset();
8190
8175
  amdLastTurnCommitAt = Date.now();
8191
8176
  const session = await readSession();
package/dist/vue/index.js CHANGED
@@ -11689,146 +11689,25 @@ var resolveAudioConditioningConfig = (config) => {
11689
11689
  };
11690
11690
  };
11691
11691
 
11692
- // src/core/turnDetection.ts
11693
- var DEFAULT_SILENCE_MS = 700;
11694
- var DEFAULT_SPEECH_THRESHOLD = 0.015;
11695
- var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
11696
- var toUint8Array = (audio) => {
11697
- if (audio instanceof ArrayBuffer) {
11698
- return new Uint8Array(audio);
11699
- }
11700
- return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
11701
- };
11702
- var measureAudioLevel = (audio) => {
11703
- const bytes = toUint8Array(audio);
11704
- if (bytes.byteLength < 2) {
11705
- return 0;
11706
- }
11707
- const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
11708
- if (samples.length === 0) {
11709
- return 0;
11710
- }
11711
- let sumSquares = 0;
11712
- for (const sample of samples) {
11713
- const normalized = sample / 32768;
11714
- sumSquares += normalized * normalized;
11715
- }
11716
- return Math.sqrt(sumSquares / samples.length);
11717
- };
11718
- var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
11719
- var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
11720
- var selectPreferredTranscriptText = (currentText, nextText) => {
11721
- const current = normalizeText(currentText);
11722
- const next = normalizeText(nextText);
11723
- if (!current) {
11724
- return next;
11725
- }
11726
- if (!next) {
11727
- return current;
11728
- }
11729
- if (current === next || current.includes(next)) {
11730
- return current;
11731
- }
11732
- if (next.includes(current)) {
11733
- return next;
11734
- }
11735
- if (countWords(next) > countWords(current)) {
11736
- return next;
11737
- }
11738
- if (countWords(next) === countWords(current) && next.length > current.length) {
11739
- return next;
11740
- }
11741
- return current;
11742
- };
11743
- var mergeSequentialTranscriptText = (currentText, nextText) => {
11744
- const current = normalizeText(currentText);
11745
- const next = normalizeText(nextText);
11746
- if (!current) {
11747
- return next;
11748
- }
11749
- if (!next) {
11750
- return current;
11751
- }
11752
- const currentWords = current.split(" ");
11753
- const nextWords = next.split(" ");
11754
- const maxOverlap = Math.min(currentWords.length, nextWords.length);
11755
- for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
11756
- const currentSuffix = currentWords.slice(-overlap).join(" ");
11757
- const nextPrefix = nextWords.slice(0, overlap).join(" ");
11758
- if (currentSuffix === nextPrefix) {
11759
- return [...currentWords, ...nextWords.slice(overlap)].join(" ");
11760
- }
11761
- }
11762
- return `${current} ${next}`.trim();
11763
- };
11764
- var countCommonPrefixWords = (currentText, nextText) => {
11765
- const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
11766
- const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
11767
- const maxWords = Math.min(currentWords.length, nextWords.length);
11768
- let count = 0;
11769
- for (let index = 0;index < maxWords; index += 1) {
11770
- if (currentWords[index] !== nextWords[index]) {
11771
- break;
11772
- }
11773
- count += 1;
11774
- }
11775
- return count;
11776
- };
11777
- var mergeTranscriptTexts = (transcripts) => {
11778
- const merged = [];
11779
- for (const transcript of transcripts) {
11780
- const nextText = normalizeText(transcript.text);
11781
- if (!nextText) {
11782
- continue;
11783
- }
11784
- const previous = merged.at(-1);
11785
- if (!previous) {
11786
- merged.push(nextText);
11787
- continue;
11788
- }
11789
- if (nextText === previous || previous.includes(nextText)) {
11790
- continue;
11791
- }
11792
- if (nextText.includes(previous)) {
11793
- merged[merged.length - 1] = nextText;
11794
- continue;
11795
- }
11796
- merged.push(nextText);
11797
- }
11798
- return merged.join(" ").trim();
11799
- };
11800
- var buildTurnText = (transcripts, partialText, options = {}) => {
11801
- const finalText = mergeTranscriptTexts(transcripts);
11802
- const nextPartial = normalizeText(partialText);
11803
- const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
11804
- if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
11805
- return mergeSequentialTranscriptText(finalText, nextPartial);
11806
- }
11807
- return selectPreferredTranscriptText(finalText, nextPartial);
11808
- };
11809
-
11810
11692
  // src/core/turnProfiles.ts
11811
11693
  var TURN_PROFILE_DEFAULTS = {
11812
11694
  balanced: {
11813
11695
  qualityProfile: "general",
11814
- semanticVetoMaxMs: 0,
11815
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
11696
+ minSilenceMs: 400,
11816
11697
  silenceMs: 1400,
11817
11698
  speechThreshold: 0.012,
11818
11699
  transcriptStabilityMs: 1000
11819
11700
  },
11820
11701
  fast: {
11821
11702
  qualityProfile: "general",
11822
- semanticVetoMaxMs: 0,
11823
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
11703
+ minSilenceMs: 300,
11824
11704
  silenceMs: 700,
11825
11705
  speechThreshold: 0.015,
11826
11706
  transcriptStabilityMs: 450
11827
11707
  },
11828
11708
  "long-form": {
11829
11709
  qualityProfile: "general",
11830
- semanticVetoMaxMs: 0,
11831
- semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
11710
+ minSilenceMs: 600,
11832
11711
  silenceMs: 2200,
11833
11712
  speechThreshold: 0.01,
11834
11713
  transcriptStabilityMs: 1500
@@ -11859,12 +11738,12 @@ var resolveTurnDetectionConfig = (config) => {
11859
11738
  const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
11860
11739
  const preset = TURN_PROFILE_DEFAULTS[profile];
11861
11740
  const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
11741
+ const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
11862
11742
  return {
11863
11743
  profile,
11864
11744
  qualityProfile,
11865
- semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
11866
- semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
11867
- silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
11745
+ minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
11746
+ silenceMs,
11868
11747
  speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
11869
11748
  transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
11870
11749
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@absolutejs/voice",
3
- "version": "0.0.22-beta.597",
3
+ "version": "0.0.22-beta.599",
4
4
  "description": "Voice primitives and Elysia plugin for AbsoluteJS",
5
5
  "repository": {
6
6
  "type": "git",