npm - @absolutejs/voice - Versions diffs - 0.0.22-beta.598 → 0.0.22-beta.599 - Mend

@absolutejs/voice 0.0.22-beta.598 → 0.0.22-beta.599

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

package/dist/angular/index.js +6 -127
package/dist/client/htmxBootstrap.js +6 -12
package/dist/client/index.js +6 -127
package/dist/core/turnDetection.d.ts +1 -1
package/dist/core/types.d.ts +2 -4
package/dist/embed/index.js +6 -12
package/dist/embed/voice-widget.js +8 -8
package/dist/index.js +158 -184
package/dist/react/index.js +6 -127
package/dist/svelte/index.js +6 -127
package/dist/testing/index.js +29 -57
package/dist/vue/index.js +6 -127
package/package.json +1 -1

package/dist/svelte/index.js CHANGED

@@ -1409,146 +1409,25 @@ var resolveAudioConditioningConfig = (config) => {
   };
 };
-// src/core/turnDetection.ts
-var DEFAULT_SILENCE_MS = 700;
-var DEFAULT_SPEECH_THRESHOLD = 0.015;
-var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
-var toUint8Array = (audio) => {
-  if (audio instanceof ArrayBuffer) {
-    return new Uint8Array(audio);
-  }
-  return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
-};
-var measureAudioLevel = (audio) => {
-  const bytes = toUint8Array(audio);
-  if (bytes.byteLength < 2) {
-    return 0;
-  }
-  const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
-  if (samples.length === 0) {
-    return 0;
-  }
-  let sumSquares = 0;
-  for (const sample of samples) {
-    const normalized = sample / 32768;
-    sumSquares += normalized * normalized;
-  }
-  return Math.sqrt(sumSquares / samples.length);
-};
-var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
-var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
-var selectPreferredTranscriptText = (currentText, nextText) => {
-  const current = normalizeText(currentText);
-  const next = normalizeText(nextText);
-  if (!current) {
-    return next;
-  }
-  if (!next) {
-    return current;
-  }
-  if (current === next || current.includes(next)) {
-    return current;
-  }
-  if (next.includes(current)) {
-    return next;
-  }
-  if (countWords(next) > countWords(current)) {
-    return next;
-  }
-  if (countWords(next) === countWords(current) && next.length > current.length) {
-    return next;
-  }
-  return current;
-};
-var mergeSequentialTranscriptText = (currentText, nextText) => {
-  const current = normalizeText(currentText);
-  const next = normalizeText(nextText);
-  if (!current) {
-    return next;
-  }
-  if (!next) {
-    return current;
-  }
-  const currentWords = current.split(" ");
-  const nextWords = next.split(" ");
-  const maxOverlap = Math.min(currentWords.length, nextWords.length);
-  for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
-    const currentSuffix = currentWords.slice(-overlap).join(" ");
-    const nextPrefix = nextWords.slice(0, overlap).join(" ");
-    if (currentSuffix === nextPrefix) {
-      return [...currentWords, ...nextWords.slice(overlap)].join(" ");
-    }
-  }
-  return `${current} ${next}`.trim();
-};
-var countCommonPrefixWords = (currentText, nextText) => {
-  const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
-  const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
-  const maxWords = Math.min(currentWords.length, nextWords.length);
-  let count = 0;
-  for (let index = 0;index < maxWords; index += 1) {
-    if (currentWords[index] !== nextWords[index]) {
-      break;
-    }
-    count += 1;
-  }
-  return count;
-};
-var mergeTranscriptTexts = (transcripts) => {
-  const merged = [];
-  for (const transcript of transcripts) {
-    const nextText = normalizeText(transcript.text);
-    if (!nextText) {
-      continue;
-    }
-    const previous = merged.at(-1);
-    if (!previous) {
-      merged.push(nextText);
-      continue;
-    }
-    if (nextText === previous || previous.includes(nextText)) {
-      continue;
-    }
-    if (nextText.includes(previous)) {
-      merged[merged.length - 1] = nextText;
-      continue;
-    }
-    merged.push(nextText);
-  }
-  return merged.join(" ").trim();
-};
-var buildTurnText = (transcripts, partialText, options = {}) => {
-  const finalText = mergeTranscriptTexts(transcripts);
-  const nextPartial = normalizeText(partialText);
-  const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
-  if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
-    return mergeSequentialTranscriptText(finalText, nextPartial);
-  }
-  return selectPreferredTranscriptText(finalText, nextPartial);
-};
 // src/core/turnProfiles.ts
 var TURN_PROFILE_DEFAULTS = {
   balanced: {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 400,
     silenceMs: 1400,
     speechThreshold: 0.012,
     transcriptStabilityMs: 1000
   },
   fast: {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 300,
     silenceMs: 700,
     speechThreshold: 0.015,
     transcriptStabilityMs: 450
   },
   "long-form": {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 600,
     silenceMs: 2200,
     speechThreshold: 0.01,
     transcriptStabilityMs: 1500
@@ -1579,12 +1458,12 @@ var resolveTurnDetectionConfig = (config) => {
   const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
   const preset = TURN_PROFILE_DEFAULTS[profile];
   const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
+  const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
   return {
     profile,
     qualityProfile,
-    semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
-    semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
-    silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
+    minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
+    silenceMs,
     speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
     transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
   };

package/dist/testing/index.js CHANGED

@@ -86,7 +86,7 @@ var __require = import.meta.require;
 // src/core/turnDetection.ts
 var DEFAULT_SILENCE_MS = 700;
 var DEFAULT_SPEECH_THRESHOLD = 0.015;
-var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
+var DEFAULT_MIN_SILENCE_MS = 400;
 var toUint8Array = (audio) => {
   if (audio instanceof ArrayBuffer) {
     return new Uint8Array(audio);
@@ -3163,24 +3163,21 @@ var resolveAudioConditioningConfig = (config) => {
 var TURN_PROFILE_DEFAULTS = {
   balanced: {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 400,
     silenceMs: 1400,
     speechThreshold: 0.012,
     transcriptStabilityMs: 1000
   },
   fast: {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 300,
     silenceMs: 700,
     speechThreshold: 0.015,
     transcriptStabilityMs: 450
   },
   "long-form": {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 600,
     silenceMs: 2200,
     speechThreshold: 0.01,
     transcriptStabilityMs: 1500
@@ -3211,12 +3208,12 @@ var resolveTurnDetectionConfig = (config) => {
   const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
   const preset = TURN_PROFILE_DEFAULTS[profile];
   const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
+  const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
   return {
     profile,
     qualityProfile,
-    semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
-    semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
-    silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
+    minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
+    silenceMs,
     speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
     transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
   };
@@ -6153,14 +6150,22 @@ var createVoiceSession = (options) => {
     strategy: options.reconnect.strategy ?? "resume-last-turn",
     timeout: options.reconnect.timeout ?? DEFAULT_RECONNECT_TIMEOUT
   };
+  const resolvedSilenceMs = options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS;
   const turnDetection = {
-    silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
+    silenceMs: resolvedSilenceMs,
+    minSilenceMs: Math.min(resolvedSilenceMs, options.turnDetection.minSilenceMs ?? DEFAULT_MIN_SILENCE_MS),
     speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
-    transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS,
-    semanticVetoMaxMs: options.turnDetection.semanticVetoMaxMs ?? 0,
-    semanticVetoRecheckMs: options.turnDetection.semanticVetoRecheckMs ?? DEFAULT_SEMANTIC_VETO_RECHECK_MS
+    transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
+  };
+  let lastTurnCompleteConfidence = null;
+  const adaptiveSilenceMs = () => {
+    const { minSilenceMs, silenceMs } = turnDetection;
+    if (lastTurnCompleteConfidence === null || silenceMs <= minSilenceMs) {
+      return silenceMs;
+    }
+    const complete = Math.max(0, Math.min(1, lastTurnCompleteConfidence));
+    return Math.round(minSilenceMs + (silenceMs - minSilenceMs) * (1 - complete));
   };
-  let semanticVetoElapsedMs = 0;
   const sttFallback = options.sttFallback ? {
     adapter: options.sttFallback.adapter,
     completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
@@ -6693,47 +6698,8 @@ var createVoiceSession = (options) => {
       runScheduledCommit(reason);
     }, delayMs);
   };
-  const scheduleSilenceCommit = (delayMs = turnDetection.silenceMs, reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
-  const shouldDeferSilenceCommit = async (reason) => {
-    if (reason !== "silence" || turnDetection.semanticVetoMaxMs <= 0 || !options.semanticTurnDetector || semanticVetoElapsedMs >= turnDetection.semanticVetoMaxMs) {
-      return false;
-    }
-    const session = await readSession();
-    const { partialText, transcripts } = session.currentTurn;
-    const userText = buildTurnText(transcripts, partialText, {
-      partialEndedAtMs: session.currentTurn.partialEndedAt,
-      partialStartedAtMs: session.currentTurn.partialStartedAt
-    });
-    if (!userText) {
-      return false;
-    }
-    const silenceMs = session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : turnDetection.silenceMs;
-    let endOfTurn = true;
-    try {
-      const verdict = await Promise.resolve(options.semanticTurnDetector.evaluate({
-        lastFinalTranscript: transcripts.at(-1),
-        partialText,
-        silenceMs,
-        transcripts,
-        ...getTurnAudioForDetector()
-      }));
-      endOfTurn = verdict.endOfTurn;
-    } catch {
-      return false;
-    }
-    if (endOfTurn !== false) {
-      return false;
-    }
-    const remaining = turnDetection.semanticVetoMaxMs - semanticVetoElapsedMs;
-    const extendMs = Math.max(1, Math.min(turnDetection.semanticVetoRecheckMs, remaining));
-    semanticVetoElapsedMs += extendMs;
-    scheduleTurnCommit(extendMs, reason);
-    return true;
-  };
+  const scheduleSilenceCommit = (delayMs = adaptiveSilenceMs(), reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
   const runScheduledCommit = async (reason) => {
-    if (await shouldDeferSilenceCommit(reason)) {
-      return;
-    }
     await api.commitTurn(reason);
   };
   const requestTurnCommit = async (reason) => {
@@ -7473,7 +7439,7 @@ var createVoiceSession = (options) => {
       session2.lastActivityAt = Date.now();
       session2.status = "active";
     });
-    semanticVetoElapsedMs = 0;
+    lastTurnCompleteConfidence = null;
     if (silenceTimer && pendingCommitReason === "vendor") {
       scheduleTurnCommit(getVendorCommitDelayMs(), "vendor");
     }
@@ -7503,6 +7469,12 @@ var createVoiceSession = (options) => {
         transcripts: session.currentTurn.transcripts,
         ...getTurnAudioForDetector()
       }));
+      if (typeof verdict.confidence === "number") {
+        lastTurnCompleteConfidence = verdict.confidence;
+        if (silenceTimer && pendingCommitReason === "silence") {
+          scheduleSilenceCommit();
+        }
+      }
       if (verdict.endOfTurn) {
         clearSilenceTimer();
         await requestTurnCommit("vendor");
@@ -8198,7 +8170,7 @@ var createVoiceSession = (options) => {
   };
   const commitTurnInternal = async (reason = "manual") => {
     clearSilenceTimer();
-    semanticVetoElapsedMs = 0;
+    lastTurnCompleteConfidence = null;
     backchannelDriver?.reset();
     amdLastTurnCommitAt = Date.now();
     const session = await readSession();

package/dist/vue/index.js CHANGED

@@ -11689,146 +11689,25 @@ var resolveAudioConditioningConfig = (config) => {
   };
 };
-// src/core/turnDetection.ts
-var DEFAULT_SILENCE_MS = 700;
-var DEFAULT_SPEECH_THRESHOLD = 0.015;
-var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
-var toUint8Array = (audio) => {
-  if (audio instanceof ArrayBuffer) {
-    return new Uint8Array(audio);
-  }
-  return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
-};
-var measureAudioLevel = (audio) => {
-  const bytes = toUint8Array(audio);
-  if (bytes.byteLength < 2) {
-    return 0;
-  }
-  const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
-  if (samples.length === 0) {
-    return 0;
-  }
-  let sumSquares = 0;
-  for (const sample of samples) {
-    const normalized = sample / 32768;
-    sumSquares += normalized * normalized;
-  }
-  return Math.sqrt(sumSquares / samples.length);
-};
-var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
-var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
-var selectPreferredTranscriptText = (currentText, nextText) => {
-  const current = normalizeText(currentText);
-  const next = normalizeText(nextText);
-  if (!current) {
-    return next;
-  }
-  if (!next) {
-    return current;
-  }
-  if (current === next || current.includes(next)) {
-    return current;
-  }
-  if (next.includes(current)) {
-    return next;
-  }
-  if (countWords(next) > countWords(current)) {
-    return next;
-  }
-  if (countWords(next) === countWords(current) && next.length > current.length) {
-    return next;
-  }
-  return current;
-};
-var mergeSequentialTranscriptText = (currentText, nextText) => {
-  const current = normalizeText(currentText);
-  const next = normalizeText(nextText);
-  if (!current) {
-    return next;
-  }
-  if (!next) {
-    return current;
-  }
-  const currentWords = current.split(" ");
-  const nextWords = next.split(" ");
-  const maxOverlap = Math.min(currentWords.length, nextWords.length);
-  for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
-    const currentSuffix = currentWords.slice(-overlap).join(" ");
-    const nextPrefix = nextWords.slice(0, overlap).join(" ");
-    if (currentSuffix === nextPrefix) {
-      return [...currentWords, ...nextWords.slice(overlap)].join(" ");
-    }
-  }
-  return `${current} ${next}`.trim();
-};
-var countCommonPrefixWords = (currentText, nextText) => {
-  const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
-  const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
-  const maxWords = Math.min(currentWords.length, nextWords.length);
-  let count = 0;
-  for (let index = 0;index < maxWords; index += 1) {
-    if (currentWords[index] !== nextWords[index]) {
-      break;
-    }
-    count += 1;
-  }
-  return count;
-};
-var mergeTranscriptTexts = (transcripts) => {
-  const merged = [];
-  for (const transcript of transcripts) {
-    const nextText = normalizeText(transcript.text);
-    if (!nextText) {
-      continue;
-    }
-    const previous = merged.at(-1);
-    if (!previous) {
-      merged.push(nextText);
-      continue;
-    }
-    if (nextText === previous || previous.includes(nextText)) {
-      continue;
-    }
-    if (nextText.includes(previous)) {
-      merged[merged.length - 1] = nextText;
-      continue;
-    }
-    merged.push(nextText);
-  }
-  return merged.join(" ").trim();
-};
-var buildTurnText = (transcripts, partialText, options = {}) => {
-  const finalText = mergeTranscriptTexts(transcripts);
-  const nextPartial = normalizeText(partialText);
-  const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
-  if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
-    return mergeSequentialTranscriptText(finalText, nextPartial);
-  }
-  return selectPreferredTranscriptText(finalText, nextPartial);
-};
 // src/core/turnProfiles.ts
 var TURN_PROFILE_DEFAULTS = {
   balanced: {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 400,
     silenceMs: 1400,
     speechThreshold: 0.012,
     transcriptStabilityMs: 1000
   },
   fast: {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 300,
     silenceMs: 700,
     speechThreshold: 0.015,
     transcriptStabilityMs: 450
   },
   "long-form": {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 600,
     silenceMs: 2200,
     speechThreshold: 0.01,
     transcriptStabilityMs: 1500
@@ -11859,12 +11738,12 @@ var resolveTurnDetectionConfig = (config) => {
   const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
   const preset = TURN_PROFILE_DEFAULTS[profile];
   const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
+  const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
   return {
     profile,
     qualityProfile,
-    semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
-    semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
-    silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
+    minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
+    silenceMs,
     speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
     transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
   };

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@absolutejs/voice",
-  "version": "0.0.22-beta.598",
+  "version": "0.0.22-beta.599",
   "description": "Voice primitives and Elysia plugin for AbsoluteJS",
   "repository": {
     "type": "git",