npm - @absolutejs/voice - Versions diffs - 0.0.22-beta.584 → 0.0.22-beta.586 - Mend

@absolutejs/voice 0.0.22-beta.584 → 0.0.22-beta.586

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

package/dist/angular/index.js +126 -0
package/dist/client/htmxBootstrap.js +11 -0
package/dist/client/index.js +126 -0
package/dist/core/turnDetection.d.ts +1 -0
package/dist/core/types.d.ts +4 -0
package/dist/embed/index.js +11 -0
package/dist/embed/voice-widget.js +8 -8
package/dist/index.js +203 -119
package/dist/react/index.js +126 -0
package/dist/svelte/index.js +126 -0
package/dist/testing/index.js +83 -2
package/dist/vue/index.js +126 -0
package/package.json +1 -1

package/dist/svelte/index.js CHANGED Viewed

@@ -1380,22 +1380,146 @@ var resolveAudioConditioningConfig = (config) => {
   };
 };
+// src/core/turnDetection.ts
+var DEFAULT_SILENCE_MS = 700; // presumably a silence window in ms (per the name); not referenced in the visible part of this bundle
+var DEFAULT_SPEECH_THRESHOLD = 0.015; // presumably an audio-level threshold (per the name); not referenced in the visible part of this bundle
+var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200; // used as `semanticVetoRecheckMs` by the balanced, fast and long-form presets in TURN_PROFILE_DEFAULTS
+// Wraps `audio` in a Uint8Array over the same memory (no bytes are copied).
+// An ArrayBuffer is covered in full; any other input is treated as a typed-array-like
+// view and re-wrapped using its own buffer, byteOffset and byteLength.
+var toUint8Array = (audio) => audio instanceof ArrayBuffer
+  ? new Uint8Array(audio)
+  : new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
+// RMS level (0..1) of `audio` read as signed 16-bit samples in platform byte order; 0 if under 2 bytes; a trailing odd byte is ignored.
+var measureAudioLevel = (audio) => {
+  const bytes = toUint8Array(audio);
+  if (bytes.byteLength < 2) {
+    return 0;
+  }
+  // Int16Array throws a RangeError when the view starts on an odd byteOffset, so copy unaligned input into a fresh buffer first.
+  const aligned = bytes.byteOffset % 2 === 0 ? bytes : bytes.slice();
+  const samples = new Int16Array(aligned.buffer, aligned.byteOffset, Math.floor(aligned.byteLength / 2));
+  let sumSquares = 0;
+  for (const sample of samples) {
+    const normalized = sample / 32768;
+    sumSquares += normalized * normalized;
+  }
+  return Math.sqrt(sumSquares / samples.length);
+};
+var normalizeText = (value) => value.replace(/\s+/g, " ").trim(); // collapse every whitespace run to one space, then strip both ends
+var countWords = (value) => value === "" ? 0 : value.split(" ").length; // number of space-separated tokens; 0 for the empty string
+// Chooses which of two transcript candidates to keep; both are whitespace-normalized first.
+//  - if one side is empty, the other side is returned;
+//  - if one side contains the other as a substring, the containing side is returned
+//    (identical texts therefore return `current`);
+//  - otherwise `next` is returned only when it has more words, or the same number of
+//    words and more characters; in every remaining case `current` is kept.
+var selectPreferredTranscriptText = (currentText, nextText) => {
+  const current = normalizeText(currentText);
+  const next = normalizeText(nextText);
+  if (!current || !next) {
+    return current || next;
+  }
+  if (current.includes(next)) {
+    return current;
+  }
+  if (next.includes(current)) {
+    return next;
+  }
+  // No containment either way: fall back to comparing size.
+  const extraWords = countWords(next) - countWords(current);
+  const nextIsLonger = extraWords > 0 || (extraWords === 0 && next.length > current.length);
+  return nextIsLonger ? next : current;
+};
+// Concatenates two transcript fragments, dropping the longest run of whole words that both ends
+// `currentText` and begins `nextText`. If either side is blank, the other is returned normalized.
+var mergeSequentialTranscriptText = (currentText, nextText) => {
+  const current = normalizeText(currentText);
+  const next = normalizeText(nextText);
+  if (!current || !next) {
+    return current || next;
+  }
+  const head = current.split(" ");
+  const tail = next.split(" ");
+  // Try the widest possible overlap first so the longest shared run is the one removed.
+  let size = Math.min(head.length, tail.length);
+  while (size > 0) {
+    const sharedAtEnd = head.slice(-size).join(" ");
+    if (sharedAtEnd === tail.slice(0, size).join(" ")) {
+      return head.concat(tail.slice(size)).join(" ");
+    }
+    size -= 1;
+  }
+  return `${current} ${next}`.trim();
+};
+// Number of leading words two texts have in common. Both texts are whitespace-normalized
+// and split on single spaces; comparison is exact (case-sensitive) and stops at the first mismatch.
+var countCommonPrefixWords = (currentText, nextText) => {
+  const left = normalizeText(currentText).split(" ").filter(Boolean);
+  const right = normalizeText(nextText).split(" ").filter(Boolean);
+  const limit = Math.min(left.length, right.length);
+  let shared = 0;
+  // Advance while both sides still have a word at this position and the two words are equal.
+  while (shared < limit && left[shared] === right[shared]) {
+    shared += 1;
+  }
+  return shared;
+};
+// Builds one string from the `text` of each transcript, in order, joined by single spaces.
+// Each text is whitespace-normalized and compared only with the most recently kept entry:
+//  - blank texts, and texts equal to or contained in the last kept entry, are skipped;
+//  - a text that contains the last kept entry replaces it;
+//  - anything else is appended.
+var mergeTranscriptTexts = (transcripts) => {
+  const kept = [];
+  for (const { text } of transcripts) {
+    const candidate = normalizeText(text);
+    if (!candidate) {
+      continue;
+    }
+    const lastIndex = kept.length - 1;
+    if (lastIndex < 0) {
+      kept.push(candidate);
+    } else if (candidate.includes(kept[lastIndex])) {
+      kept[lastIndex] = candidate;
+    } else if (!kept[lastIndex].includes(candidate)) {
+      kept.push(candidate);
+    }
+  }
+  return kept.join(" ").trim();
+};
+// Text of the turn so far: the merged final transcripts combined with the current partial.
+// The partial is merged on sequentially when it began >= 250 ms after the last final that has a numeric `endedAtMs` and shares no leading word with the finals; otherwise the preferred of the two texts is returned.
+var buildTurnText = (transcripts, partialText, options = {}) => {
+  const finalText = mergeTranscriptTexts(transcripts);
+  const nextPartial = normalizeText(partialText);
+  const timedEnds = [...transcripts].map((transcript) => transcript.endedAtMs).filter((endedAtMs) => typeof endedAtMs === "number");
+  const isSequential = Boolean(finalText && nextPartial) && timedEnds.length > 0 && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - timedEnds.at(-1) >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0;
+  return isSequential ? mergeSequentialTranscriptText(finalText, nextPartial) : selectPreferredTranscriptText(finalText, nextPartial);
+};
 // src/core/turnProfiles.ts
 var TURN_PROFILE_DEFAULTS = {
   balanced: {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 1400,
     speechThreshold: 0.012,
     transcriptStabilityMs: 1000
   },
   fast: {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 700,
     speechThreshold: 0.015,
     transcriptStabilityMs: 450
   },
   "long-form": {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 2200,
     speechThreshold: 0.01,
     transcriptStabilityMs: 1500
@@ -1429,6 +1553,8 @@ var resolveTurnDetectionConfig = (config) => {
   return {
     profile,
     qualityProfile,
+    semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
+    semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
     silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
     speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
     transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs

package/dist/testing/index.js CHANGED Viewed

@@ -86,6 +86,7 @@ var __require = import.meta.require;
 // src/core/turnDetection.ts
 var DEFAULT_SILENCE_MS = 700; // fallback for `turnDetection.silenceMs` in createVoiceSession
 var DEFAULT_SPEECH_THRESHOLD = 0.015; // fallback for `turnDetection.speechThreshold` in createVoiceSession
+var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200; // fallback for `turnDetection.semanticVetoRecheckMs` in createVoiceSession; also the preset value in TURN_PROFILE_DEFAULTS
 var toUint8Array = (audio) => {
   if (audio instanceof ArrayBuffer) {
     return new Uint8Array(audio);
@@ -3133,18 +3134,24 @@ var resolveAudioConditioningConfig = (config) => {
 var TURN_PROFILE_DEFAULTS = {
   balanced: {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 1400,
     speechThreshold: 0.012,
     transcriptStabilityMs: 1000
   },
   fast: {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 700,
     speechThreshold: 0.015,
     transcriptStabilityMs: 450
   },
   "long-form": {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 2200,
     speechThreshold: 0.01,
     transcriptStabilityMs: 1500
@@ -3178,6 +3185,8 @@ var resolveTurnDetectionConfig = (config) => {
   return {
     profile,
     qualityProfile,
+    semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
+    semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
     silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
     speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
     transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
@@ -5910,6 +5919,8 @@ var FALLBACK_CONFIDENCE_SELECTION_DELTA = 0.05;
 var FALLBACK_WORD_COUNT_SELECTION_MARGIN_RATIO = 0.12;
 var EXTENDED_VENDOR_COMMIT_SILENCE_THRESHOLD_MS = 200;
 var MAX_VENDOR_COMMIT_GRACE_MS = 1200;
+var STT_RECONNECT_FLAP_WINDOW_MS = 4000; // handleClose continues the reconnect streak only when the previous counted close was less than this many ms ago; otherwise the streak restarts at 1
+var MAX_STT_RECONNECTS_IN_FLAP_WINDOW = 3; // handleClose takes the "stt-reconnect" path while the streak count is <= this; beyond it the close falls through to the remaining handling
 var DEFAULT_FORMAT = {
   channels: 1,
   container: "raw",
@@ -6105,8 +6116,11 @@ var createVoiceSession = (options) => {
   const turnDetection = {
     silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
     speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
-    transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
+    transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS,
+    semanticVetoMaxMs: options.turnDetection.semanticVetoMaxMs ?? 0,
+    semanticVetoRecheckMs: options.turnDetection.semanticVetoRecheckMs ?? DEFAULT_SEMANTIC_VETO_RECHECK_MS
   };
+  let semanticVetoElapsedMs = 0;
   const sttFallback = options.sttFallback ? {
     adapter: options.sttFallback.adapter,
     completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
@@ -6147,6 +6161,8 @@ var createVoiceSession = (options) => {
   let operationQueue = Promise.resolve();
   let adapterGenerationCounter = 0;
   let activeAdapterGeneration = 0;
+  let sttReconnectCount = 0;
+  let lastSttReconnectAt = 0;
   let activeTTSTurnId;
   let assistantSpeechEndsAt = 0;
   let lastAssistantAudioAt = 0;
@@ -6621,10 +6637,51 @@ var createVoiceSession = (options) => {
     silenceTimer = setTimeout(() => {
       silenceTimer = null;
       pendingCommitReason = null;
-      api.commitTurn(reason);
+      runScheduledCommit(reason);
     }, delayMs);
   };
   const scheduleSilenceCommit = (delayMs = turnDetection.silenceMs, reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
+  const shouldDeferSilenceCommit = async (reason) => { // resolves true when the silence commit was postponed and re-scheduled, false when the caller should commit now
+    if (reason !== "silence" || turnDetection.semanticVetoMaxMs <= 0 || !options.semanticTurnDetector || semanticVetoElapsedMs >= turnDetection.semanticVetoMaxMs) { // defer only silence commits, only with a detector configured, only while veto budget remains
+      return false;
+    }
+    const session = await readSession();
+    const { partialText, transcripts } = session.currentTurn;
+    const userText = buildTurnText(transcripts, partialText, {
+      partialEndedAtMs: session.currentTurn.partialEndedAt,
+      partialStartedAtMs: session.currentTurn.partialStartedAt
+    });
+    if (!userText) { // the current turn has no text: nothing to evaluate, so do not defer
+      return false;
+    }
+    const silenceMs = session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : turnDetection.silenceMs; // elapsed silence, or the configured window when no start time is recorded
+    let endOfTurn = true;
+    try {
+      const verdict = await Promise.resolve(options.semanticTurnDetector.evaluate({ // Promise.resolve lets evaluate() return either a plain value or a promise
+        lastFinalTranscript: transcripts.at(-1),
+        partialText,
+        silenceMs,
+        transcripts
+      }));
+      endOfTurn = verdict.endOfTurn;
+    } catch { // a throwing or rejecting detector never blocks the commit
+      return false;
+    }
+    if (endOfTurn !== false) { // only an explicit `false` vetoes; any other value lets the commit proceed
+      return false;
+    }
+    const remaining = turnDetection.semanticVetoMaxMs - semanticVetoElapsedMs;
+    const extendMs = Math.max(1, Math.min(turnDetection.semanticVetoRecheckMs, remaining)); // at least 1 ms, at most the recheck interval or what is left of the budget
+    semanticVetoElapsedMs += extendMs; // charge the extension against the budget before re-arming the timer
+    scheduleTurnCommit(extendMs, reason);
+    return true;
+  };
+  const runScheduledCommit = async (reason) => { // called from the commit timer: commits the turn unless the deferral check postponed it
+    const wasDeferred = await shouldDeferSilenceCommit(reason);
+    if (!wasDeferred) {
+      await api.commitTurn(reason);
+    }
+  };
   const requestTurnCommit = async (reason) => {
     const session = await readSession();
     const text = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
@@ -6992,6 +7049,27 @@ var createVoiceSession = (options) => {
     }
   };
   const handleClose = async (event) => {
+    const session = await readSession();
+    const callLive = session.status !== "completed" && session.status !== "failed";
+    if (callLive && (options.stt || options.realtime)) {
+      const now = Date.now();
+      sttReconnectCount = now - lastSttReconnectAt < STT_RECONNECT_FLAP_WINDOW_MS ? sttReconnectCount + 1 : 1;
+      lastSttReconnectAt = now;
+      if (sttReconnectCount <= MAX_STT_RECONNECTS_IN_FLAP_WINDOW) {
+        await appendTrace({
+          payload: {
+            action: "stt-reconnect",
+            attempt: sttReconnectCount,
+            reason: event.reason ?? "stt stream closed",
+            recoverable: event.recoverable
+          },
+          session,
+          type: "session.error"
+        });
+        await closeAdapter(event.reason ?? "stt stream closed; reconnecting");
+        return;
+      }
+    }
     if (event.recoverable === false) {
       await failInternal(new Error(event.reason ?? "Speech-to-text session closed"));
       return;
@@ -7316,6 +7394,7 @@ var createVoiceSession = (options) => {
     });
   };
   const handleFinal = async (transcript) => {
+    sttReconnectCount = 0;
     const session = await writeSession((session2) => {
       const alreadyPresent = session2.currentTurn.transcripts.some((existing) => existing.id === transcript.id);
       if (!alreadyPresent) {
@@ -7336,6 +7415,7 @@ var createVoiceSession = (options) => {
       session2.lastActivityAt = Date.now();
       session2.status = "active";
     });
+    semanticVetoElapsedMs = 0;
     if (silenceTimer && pendingCommitReason === "vendor") {
       scheduleTurnCommit(getVendorCommitDelayMs(), "vendor");
     }
@@ -8039,6 +8119,7 @@ var createVoiceSession = (options) => {
   };
   const commitTurnInternal = async (reason = "manual") => {
     clearSilenceTimer();
+    semanticVetoElapsedMs = 0;
     backchannelDriver?.reset();
     amdLastTurnCommitAt = Date.now();
     const session = await readSession();

package/dist/vue/index.js CHANGED Viewed

@@ -11660,22 +11660,146 @@ var resolveAudioConditioningConfig = (config) => {
   };
 };
+// src/core/turnDetection.ts
+var DEFAULT_SILENCE_MS = 700; // presumably a silence window in ms (per the name); not referenced in the visible part of this bundle
+var DEFAULT_SPEECH_THRESHOLD = 0.015; // presumably an audio-level threshold (per the name); not referenced in the visible part of this bundle
+var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200; // used as `semanticVetoRecheckMs` by the balanced, fast and long-form presets in TURN_PROFILE_DEFAULTS
+// Wraps `audio` in a Uint8Array over the same memory (no bytes are copied).
+// An ArrayBuffer is covered in full; any other input is treated as a typed-array-like
+// view and re-wrapped using its own buffer, byteOffset and byteLength.
+var toUint8Array = (audio) => audio instanceof ArrayBuffer
+  ? new Uint8Array(audio)
+  : new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
+// RMS level (0..1) of `audio` read as signed 16-bit samples in platform byte order; 0 if under 2 bytes; a trailing odd byte is ignored.
+var measureAudioLevel = (audio) => {
+  const bytes = toUint8Array(audio);
+  if (bytes.byteLength < 2) {
+    return 0;
+  }
+  // Int16Array throws a RangeError when the view starts on an odd byteOffset, so copy unaligned input into a fresh buffer first.
+  const aligned = bytes.byteOffset % 2 === 0 ? bytes : bytes.slice();
+  const samples = new Int16Array(aligned.buffer, aligned.byteOffset, Math.floor(aligned.byteLength / 2));
+  let sumSquares = 0;
+  for (const sample of samples) {
+    const normalized = sample / 32768;
+    sumSquares += normalized * normalized;
+  }
+  return Math.sqrt(sumSquares / samples.length);
+};
+var normalizeText = (value) => value.replace(/\s+/g, " ").trim(); // collapse every whitespace run to one space, then strip both ends
+var countWords = (value) => value === "" ? 0 : value.split(" ").length; // number of space-separated tokens; 0 for the empty string
+// Chooses which of two transcript candidates to keep; both are whitespace-normalized first.
+//  - if one side is empty, the other side is returned;
+//  - if one side contains the other as a substring, the containing side is returned
+//    (identical texts therefore return `current`);
+//  - otherwise `next` is returned only when it has more words, or the same number of
+//    words and more characters; in every remaining case `current` is kept.
+var selectPreferredTranscriptText = (currentText, nextText) => {
+  const current = normalizeText(currentText);
+  const next = normalizeText(nextText);
+  if (!current || !next) {
+    return current || next;
+  }
+  if (current.includes(next)) {
+    return current;
+  }
+  if (next.includes(current)) {
+    return next;
+  }
+  // No containment either way: fall back to comparing size.
+  const extraWords = countWords(next) - countWords(current);
+  const nextIsLonger = extraWords > 0 || (extraWords === 0 && next.length > current.length);
+  return nextIsLonger ? next : current;
+};
+// Concatenates two transcript fragments, dropping the longest run of whole words that both ends
+// `currentText` and begins `nextText`. If either side is blank, the other is returned normalized.
+var mergeSequentialTranscriptText = (currentText, nextText) => {
+  const current = normalizeText(currentText);
+  const next = normalizeText(nextText);
+  if (!current || !next) {
+    return current || next;
+  }
+  const head = current.split(" ");
+  const tail = next.split(" ");
+  // Try the widest possible overlap first so the longest shared run is the one removed.
+  let size = Math.min(head.length, tail.length);
+  while (size > 0) {
+    const sharedAtEnd = head.slice(-size).join(" ");
+    if (sharedAtEnd === tail.slice(0, size).join(" ")) {
+      return head.concat(tail.slice(size)).join(" ");
+    }
+    size -= 1;
+  }
+  return `${current} ${next}`.trim();
+};
+// Number of leading words two texts have in common. Both texts are whitespace-normalized
+// and split on single spaces; comparison is exact (case-sensitive) and stops at the first mismatch.
+var countCommonPrefixWords = (currentText, nextText) => {
+  const left = normalizeText(currentText).split(" ").filter(Boolean);
+  const right = normalizeText(nextText).split(" ").filter(Boolean);
+  const limit = Math.min(left.length, right.length);
+  let shared = 0;
+  // Advance while both sides still have a word at this position and the two words are equal.
+  while (shared < limit && left[shared] === right[shared]) {
+    shared += 1;
+  }
+  return shared;
+};
+// Builds one string from the `text` of each transcript, in order, joined by single spaces.
+// Each text is whitespace-normalized and compared only with the most recently kept entry:
+//  - blank texts, and texts equal to or contained in the last kept entry, are skipped;
+//  - a text that contains the last kept entry replaces it;
+//  - anything else is appended.
+var mergeTranscriptTexts = (transcripts) => {
+  const kept = [];
+  for (const { text } of transcripts) {
+    const candidate = normalizeText(text);
+    if (!candidate) {
+      continue;
+    }
+    const lastIndex = kept.length - 1;
+    if (lastIndex < 0) {
+      kept.push(candidate);
+    } else if (candidate.includes(kept[lastIndex])) {
+      kept[lastIndex] = candidate;
+    } else if (!kept[lastIndex].includes(candidate)) {
+      kept.push(candidate);
+    }
+  }
+  return kept.join(" ").trim();
+};
+// Text of the turn so far: the merged final transcripts combined with the current partial.
+// The partial is merged on sequentially when it began >= 250 ms after the last final that has a numeric `endedAtMs` and shares no leading word with the finals; otherwise the preferred of the two texts is returned.
+var buildTurnText = (transcripts, partialText, options = {}) => {
+  const finalText = mergeTranscriptTexts(transcripts);
+  const nextPartial = normalizeText(partialText);
+  const timedEnds = [...transcripts].map((transcript) => transcript.endedAtMs).filter((endedAtMs) => typeof endedAtMs === "number");
+  const isSequential = Boolean(finalText && nextPartial) && timedEnds.length > 0 && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - timedEnds.at(-1) >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0;
+  return isSequential ? mergeSequentialTranscriptText(finalText, nextPartial) : selectPreferredTranscriptText(finalText, nextPartial);
+};
 // src/core/turnProfiles.ts
 var TURN_PROFILE_DEFAULTS = {
   balanced: {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 1400,
     speechThreshold: 0.012,
     transcriptStabilityMs: 1000
   },
   fast: {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 700,
     speechThreshold: 0.015,
     transcriptStabilityMs: 450
   },
   "long-form": {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 2200,
     speechThreshold: 0.01,
     transcriptStabilityMs: 1500
@@ -11709,6 +11833,8 @@ var resolveTurnDetectionConfig = (config) => {
   return {
     profile,
     qualityProfile,
+    semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
+    semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
     silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
     speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
     transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@absolutejs/voice",
-  "version": "0.0.22-beta.584",
+  "version": "0.0.22-beta.586",
   "description": "Voice primitives and Elysia plugin for AbsoluteJS",
   "repository": {
     "type": "git",