npm - @absolutejs/voice - Versions diffs - 0.0.22-beta.583 → 0.0.22-beta.585 - Mend

@absolutejs/voice 0.0.22-beta.583 → 0.0.22-beta.585

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

package/dist/angular/index.js +126 -0
package/dist/client/htmxBootstrap.js +11 -0
package/dist/client/index.js +126 -0
package/dist/core/hardenedFetch.d.ts +3 -0
package/dist/core/turnDetection.d.ts +1 -0
package/dist/core/types.d.ts +4 -0
package/dist/embed/index.js +11 -0
package/dist/embed/voice-widget.js +8 -8
package/dist/index.d.ts +1 -0
package/dist/index.js +219 -122
package/dist/react/index.js +126 -0
package/dist/svelte/index.js +126 -0
package/dist/testing/index.js +99 -5
package/dist/vue/index.js +126 -0
package/package.json +1 -1

package/dist/svelte/index.js CHANGED

@@ -1380,22 +1380,146 @@ var resolveAudioConditioningConfig = (config) => {
   };
 };
+// src/core/turnDetection.ts
+var DEFAULT_SILENCE_MS = 700;
+var DEFAULT_SPEECH_THRESHOLD = 0.015;
+var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
+var toUint8Array = (audio) => {
+  if (audio instanceof ArrayBuffer) {
+    return new Uint8Array(audio);
+  }
+  return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
+};
+var measureAudioLevel = (audio) => {
+  const bytes = toUint8Array(audio);
+  if (bytes.byteLength < 2) {
+    return 0;
+  }
+  const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
+  if (samples.length === 0) {
+    return 0;
+  }
+  let sumSquares = 0;
+  for (const sample of samples) {
+    const normalized = sample / 32768;
+    sumSquares += normalized * normalized;
+  }
+  return Math.sqrt(sumSquares / samples.length);
+};
+var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
+var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
+var selectPreferredTranscriptText = (currentText, nextText) => {
+  const current = normalizeText(currentText);
+  const next = normalizeText(nextText);
+  if (!current) {
+    return next;
+  }
+  if (!next) {
+    return current;
+  }
+  if (current === next || current.includes(next)) {
+    return current;
+  }
+  if (next.includes(current)) {
+    return next;
+  }
+  if (countWords(next) > countWords(current)) {
+    return next;
+  }
+  if (countWords(next) === countWords(current) && next.length > current.length) {
+    return next;
+  }
+  return current;
+};
+var mergeSequentialTranscriptText = (currentText, nextText) => {
+  const current = normalizeText(currentText);
+  const next = normalizeText(nextText);
+  if (!current) {
+    return next;
+  }
+  if (!next) {
+    return current;
+  }
+  const currentWords = current.split(" ");
+  const nextWords = next.split(" ");
+  const maxOverlap = Math.min(currentWords.length, nextWords.length);
+  for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
+    const currentSuffix = currentWords.slice(-overlap).join(" ");
+    const nextPrefix = nextWords.slice(0, overlap).join(" ");
+    if (currentSuffix === nextPrefix) {
+      return [...currentWords, ...nextWords.slice(overlap)].join(" ");
+    }
+  }
+  return `${current} ${next}`.trim();
+};
+var countCommonPrefixWords = (currentText, nextText) => {
+  const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
+  const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
+  const maxWords = Math.min(currentWords.length, nextWords.length);
+  let count = 0;
+  for (let index = 0;index < maxWords; index += 1) {
+    if (currentWords[index] !== nextWords[index]) {
+      break;
+    }
+    count += 1;
+  }
+  return count;
+};
+var mergeTranscriptTexts = (transcripts) => {
+  const merged = [];
+  for (const transcript of transcripts) {
+    const nextText = normalizeText(transcript.text);
+    if (!nextText) {
+      continue;
+    }
+    const previous = merged.at(-1);
+    if (!previous) {
+      merged.push(nextText);
+      continue;
+    }
+    if (nextText === previous || previous.includes(nextText)) {
+      continue;
+    }
+    if (nextText.includes(previous)) {
+      merged[merged.length - 1] = nextText;
+      continue;
+    }
+    merged.push(nextText);
+  }
+  return merged.join(" ").trim();
+};
+var buildTurnText = (transcripts, partialText, options = {}) => {
+  const finalText = mergeTranscriptTexts(transcripts);
+  const nextPartial = normalizeText(partialText);
+  const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
+  if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
+    return mergeSequentialTranscriptText(finalText, nextPartial);
+  }
+  return selectPreferredTranscriptText(finalText, nextPartial);
+};
 // src/core/turnProfiles.ts
 var TURN_PROFILE_DEFAULTS = {
   balanced: {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 1400,
     speechThreshold: 0.012,
     transcriptStabilityMs: 1000
   },
   fast: {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 700,
     speechThreshold: 0.015,
     transcriptStabilityMs: 450
   },
   "long-form": {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 2200,
     speechThreshold: 0.01,
     transcriptStabilityMs: 1500
@@ -1429,6 +1553,8 @@ var resolveTurnDetectionConfig = (config) => {
   return {
     profile,
     qualityProfile,
+    semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
+    semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
     silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
     speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
     transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs

package/dist/testing/index.js CHANGED

@@ -86,6 +86,7 @@ var __require = import.meta.require;
 // src/core/turnDetection.ts
 var DEFAULT_SILENCE_MS = 700;
 var DEFAULT_SPEECH_THRESHOLD = 0.015;
+var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
 var toUint8Array = (audio) => {
   if (audio instanceof ArrayBuffer) {
     return new Uint8Array(audio);
@@ -3133,18 +3134,24 @@ var resolveAudioConditioningConfig = (config) => {
 var TURN_PROFILE_DEFAULTS = {
   balanced: {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 1400,
     speechThreshold: 0.012,
     transcriptStabilityMs: 1000
   },
   fast: {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 700,
     speechThreshold: 0.015,
     transcriptStabilityMs: 450
   },
   "long-form": {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 2200,
     speechThreshold: 0.01,
     transcriptStabilityMs: 1500
@@ -3178,6 +3185,8 @@ var resolveTurnDetectionConfig = (config) => {
   return {
     profile,
     qualityProfile,
+    semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
+    semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
     silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
     speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
     transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
@@ -4210,6 +4219,45 @@ var startVoiceTimer = (sessionId) => {
 };
 var voiceTimingEnabled = () => timingEnabled();
+// src/core/hardenedFetch.ts
+var ATTEMPT_TIMEOUT_MS = 6000;
+var isBun = "Bun" in globalThis;
+var oneAttempt = async (baseFetch, input, init) => {
+  const controller = new AbortController;
+  const callerSignal = init?.signal ?? undefined;
+  const onCallerAbort = () => controller.abort(callerSignal?.reason);
+  if (callerSignal?.aborted)
+    controller.abort(callerSignal.reason);
+  else
+    callerSignal?.addEventListener("abort", onCallerAbort, { once: true });
+  const timer = setTimeout(() => {
+    controller.abort(new Error(`fetch exceeded ${ATTEMPT_TIMEOUT_MS}ms before response headers (stale Bun keep-alive socket?)`));
+  }, ATTEMPT_TIMEOUT_MS);
+  const headers = new Headers(init?.headers);
+  if (isBun)
+    headers.set("Connection", "close");
+  try {
+    return await baseFetch(input, {
+      ...init,
+      headers,
+      signal: controller.signal
+    });
+  } finally {
+    clearTimeout(timer);
+    callerSignal?.removeEventListener("abort", onCallerAbort);
+  }
+};
+var hardenFetch = (baseFetch = globalThis.fetch) => Object.assign(async (input, init) => {
+  try {
+    return await oneAttempt(baseFetch, input, init);
+  } catch (error) {
+    if (init?.signal?.aborted)
+      throw error;
+    console.warn(`[voice] hardened fetch retrying on a fresh connection: ${error instanceof Error ? error.message : String(error)}`);
+    return oneAttempt(baseFetch, input, init);
+  }
+}, { preconnect: baseFetch.preconnect.bind(baseFetch) });
 // src/core/modelAdapters.ts
 var isVoiceProviderRoutingPolicyPreset = (value) => value === "balanced" || value === "cost-cap" || value === "cost-first" || value === "latency-first" || value === "quality-first";
 var resolveVoiceProviderRoutingPolicyPreset = (preset, options = {}) => {
@@ -4914,7 +4962,7 @@ var consumeOpenAIResponsesStream = async (response, onTextDelta, abortOptions) =
   return { assistantText, toolCalls: finalizeToolCalls(calls), usage };
 };
 var createOpenAIVoiceAssistantModel = (options) => {
-  const fetchImpl = options.fetch ?? globalThis.fetch;
+  const fetchImpl = hardenFetch(options.fetch);
   const baseUrl = options.baseUrl ?? "https://api.openai.com/v1";
   const model = options.model ?? "gpt-4.1-mini";
   const timeoutMs = options.timeoutMs ?? 60000;
@@ -5039,7 +5087,7 @@ var consumeAnthropicStream = async (response, onTextDelta) => {
   return { assistantText, toolCalls: finalizeToolCalls(calls), usage };
 };
 var createAnthropicVoiceAssistantModel = (options) => {
-  const fetchImpl = options.fetch ?? globalThis.fetch;
+  const fetchImpl = hardenFetch(options.fetch);
   const baseUrl = options.baseUrl ?? "https://api.anthropic.com/v1";
   const model = options.model ?? "claude-sonnet-4-5";
   return {
@@ -5125,7 +5173,7 @@ var consumeGeminiStream = async (response, onTextDelta) => {
   return { assistantText, toolCalls, usage };
 };
 var createGeminiVoiceAssistantModel = (options) => {
-  const fetchImpl = options.fetch ?? globalThis.fetch;
+  const fetchImpl = hardenFetch(options.fetch);
   const baseUrl = options.baseUrl ?? "https://generativelanguage.googleapis.com/v1beta";
   const model = options.model ?? "gemini-2.5-flash";
   const maxRetries = Math.max(0, options.maxRetries ?? 2);
@@ -6066,8 +6114,11 @@ var createVoiceSession = (options) => {
   const turnDetection = {
     silenceMs: options.turnDetection.silenceMs ?? DEFAULT_SILENCE_MS,
     speechThreshold: options.turnDetection.speechThreshold ?? DEFAULT_SPEECH_THRESHOLD,
-    transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS
+    transcriptStabilityMs: options.turnDetection.transcriptStabilityMs ?? DEFAULT_TRANSCRIPT_STABILITY_MS,
+    semanticVetoMaxMs: options.turnDetection.semanticVetoMaxMs ?? 0,
+    semanticVetoRecheckMs: options.turnDetection.semanticVetoRecheckMs ?? DEFAULT_SEMANTIC_VETO_RECHECK_MS
   };
+  let semanticVetoElapsedMs = 0;
   const sttFallback = options.sttFallback ? {
     adapter: options.sttFallback.adapter,
     completionTimeoutMs: options.sttFallback.completionTimeoutMs ?? DEFAULT_FALLBACK_COMPLETION_TIMEOUT_MS,
@@ -6582,10 +6633,51 @@ var createVoiceSession = (options) => {
     silenceTimer = setTimeout(() => {
       silenceTimer = null;
       pendingCommitReason = null;
-      api.commitTurn(reason);
+      runScheduledCommit(reason);
     }, delayMs);
   };
   const scheduleSilenceCommit = (delayMs = turnDetection.silenceMs, reset = true) => scheduleTurnCommit(delayMs, "silence", reset);
+  const shouldDeferSilenceCommit = async (reason) => {
+    if (reason !== "silence" || turnDetection.semanticVetoMaxMs <= 0 || !options.semanticTurnDetector || semanticVetoElapsedMs >= turnDetection.semanticVetoMaxMs) {
+      return false;
+    }
+    const session = await readSession();
+    const { partialText, transcripts } = session.currentTurn;
+    const userText = buildTurnText(transcripts, partialText, {
+      partialEndedAtMs: session.currentTurn.partialEndedAt,
+      partialStartedAtMs: session.currentTurn.partialStartedAt
+    });
+    if (!userText) {
+      return false;
+    }
+    const silenceMs = session.currentTurn.silenceStartedAt !== undefined ? Date.now() - session.currentTurn.silenceStartedAt : turnDetection.silenceMs;
+    let endOfTurn = true;
+    try {
+      const verdict = await Promise.resolve(options.semanticTurnDetector.evaluate({
+        lastFinalTranscript: transcripts.at(-1),
+        partialText,
+        silenceMs,
+        transcripts
+      }));
+      endOfTurn = verdict.endOfTurn;
+    } catch {
+      return false;
+    }
+    if (endOfTurn !== false) {
+      return false;
+    }
+    const remaining = turnDetection.semanticVetoMaxMs - semanticVetoElapsedMs;
+    const extendMs = Math.max(1, Math.min(turnDetection.semanticVetoRecheckMs, remaining));
+    semanticVetoElapsedMs += extendMs;
+    scheduleTurnCommit(extendMs, reason);
+    return true;
+  };
+  const runScheduledCommit = async (reason) => {
+    if (await shouldDeferSilenceCommit(reason)) {
+      return;
+    }
+    await api.commitTurn(reason);
+  };
   const requestTurnCommit = async (reason) => {
     const session = await readSession();
     const text = buildTurnText(session.currentTurn.transcripts, session.currentTurn.partialText, {
@@ -7297,6 +7389,7 @@ var createVoiceSession = (options) => {
       session2.lastActivityAt = Date.now();
       session2.status = "active";
     });
+    semanticVetoElapsedMs = 0;
     if (silenceTimer && pendingCommitReason === "vendor") {
       scheduleTurnCommit(getVendorCommitDelayMs(), "vendor");
     }
@@ -8000,6 +8093,7 @@ var createVoiceSession = (options) => {
   };
   const commitTurnInternal = async (reason = "manual") => {
     clearSilenceTimer();
+    semanticVetoElapsedMs = 0;
     backchannelDriver?.reset();
     amdLastTurnCommitAt = Date.now();
     const session = await readSession();

package/dist/vue/index.js CHANGED

@@ -11660,22 +11660,146 @@ var resolveAudioConditioningConfig = (config) => {
   };
 };
+// src/core/turnDetection.ts
+var DEFAULT_SILENCE_MS = 700;
+var DEFAULT_SPEECH_THRESHOLD = 0.015;
+var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
+var toUint8Array = (audio) => {
+  if (audio instanceof ArrayBuffer) {
+    return new Uint8Array(audio);
+  }
+  return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
+};
+var measureAudioLevel = (audio) => {
+  const bytes = toUint8Array(audio);
+  if (bytes.byteLength < 2) {
+    return 0;
+  }
+  const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
+  if (samples.length === 0) {
+    return 0;
+  }
+  let sumSquares = 0;
+  for (const sample of samples) {
+    const normalized = sample / 32768;
+    sumSquares += normalized * normalized;
+  }
+  return Math.sqrt(sumSquares / samples.length);
+};
+var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
+var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
+var selectPreferredTranscriptText = (currentText, nextText) => {
+  const current = normalizeText(currentText);
+  const next = normalizeText(nextText);
+  if (!current) {
+    return next;
+  }
+  if (!next) {
+    return current;
+  }
+  if (current === next || current.includes(next)) {
+    return current;
+  }
+  if (next.includes(current)) {
+    return next;
+  }
+  if (countWords(next) > countWords(current)) {
+    return next;
+  }
+  if (countWords(next) === countWords(current) && next.length > current.length) {
+    return next;
+  }
+  return current;
+};
+var mergeSequentialTranscriptText = (currentText, nextText) => {
+  const current = normalizeText(currentText);
+  const next = normalizeText(nextText);
+  if (!current) {
+    return next;
+  }
+  if (!next) {
+    return current;
+  }
+  const currentWords = current.split(" ");
+  const nextWords = next.split(" ");
+  const maxOverlap = Math.min(currentWords.length, nextWords.length);
+  for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
+    const currentSuffix = currentWords.slice(-overlap).join(" ");
+    const nextPrefix = nextWords.slice(0, overlap).join(" ");
+    if (currentSuffix === nextPrefix) {
+      return [...currentWords, ...nextWords.slice(overlap)].join(" ");
+    }
+  }
+  return `${current} ${next}`.trim();
+};
+var countCommonPrefixWords = (currentText, nextText) => {
+  const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
+  const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
+  const maxWords = Math.min(currentWords.length, nextWords.length);
+  let count = 0;
+  for (let index = 0;index < maxWords; index += 1) {
+    if (currentWords[index] !== nextWords[index]) {
+      break;
+    }
+    count += 1;
+  }
+  return count;
+};
+var mergeTranscriptTexts = (transcripts) => {
+  const merged = [];
+  for (const transcript of transcripts) {
+    const nextText = normalizeText(transcript.text);
+    if (!nextText) {
+      continue;
+    }
+    const previous = merged.at(-1);
+    if (!previous) {
+      merged.push(nextText);
+      continue;
+    }
+    if (nextText === previous || previous.includes(nextText)) {
+      continue;
+    }
+    if (nextText.includes(previous)) {
+      merged[merged.length - 1] = nextText;
+      continue;
+    }
+    merged.push(nextText);
+  }
+  return merged.join(" ").trim();
+};
+var buildTurnText = (transcripts, partialText, options = {}) => {
+  const finalText = mergeTranscriptTexts(transcripts);
+  const nextPartial = normalizeText(partialText);
+  const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
+  if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
+    return mergeSequentialTranscriptText(finalText, nextPartial);
+  }
+  return selectPreferredTranscriptText(finalText, nextPartial);
+};
 // src/core/turnProfiles.ts
 var TURN_PROFILE_DEFAULTS = {
   balanced: {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 1400,
     speechThreshold: 0.012,
     transcriptStabilityMs: 1000
   },
   fast: {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 700,
     speechThreshold: 0.015,
     transcriptStabilityMs: 450
   },
   "long-form": {
     qualityProfile: "general",
+    semanticVetoMaxMs: 0,
+    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
     silenceMs: 2200,
     speechThreshold: 0.01,
     transcriptStabilityMs: 1500
@@ -11709,6 +11833,8 @@ var resolveTurnDetectionConfig = (config) => {
   return {
     profile,
     qualityProfile,
+    semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
+    semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
     silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
     speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
     transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@absolutejs/voice",
-  "version": "0.0.22-beta.583",
+  "version": "0.0.22-beta.585",
   "description": "Voice primitives and Elysia plugin for AbsoluteJS",
   "repository": {
     "type": "git",