npm - @absolutejs/voice - Versions diffs - 0.0.22-beta.597 → 0.0.22-beta.599 - Mend

@absolutejs/voice 0.0.22-beta.597 → 0.0.22-beta.599

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (14)

package/dist/angular/index.js +6 -127
package/dist/client/htmxBootstrap.js +6 -12
package/dist/client/index.js +6 -127
package/dist/core/semanticTurn.d.ts +11 -1
package/dist/core/turnDetection.d.ts +1 -1
package/dist/core/types.d.ts +2 -4
package/dist/embed/index.js +6 -12
package/dist/embed/voice-widget.js +8 -8
package/dist/index.js +171 -184
package/dist/react/index.js +6 -127
package/dist/svelte/index.js +6 -127
package/dist/testing/index.js +42 -57
package/dist/vue/index.js +6 -127
package/package.json +1 -1

package/dist/angular/index.js CHANGED Viewed

@@ -1227,146 +1227,25 @@ var resolveAudioConditioningConfig = (config) => {
   };
 };
-// src/core/turnDetection.ts
-var DEFAULT_SILENCE_MS = 700;
-var DEFAULT_SPEECH_THRESHOLD = 0.015;
-var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
-var toUint8Array = (audio) => {
-  if (audio instanceof ArrayBuffer) {
-    return new Uint8Array(audio);
-  }
-  return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
-};
-var measureAudioLevel = (audio) => {
-  const bytes = toUint8Array(audio);
-  if (bytes.byteLength < 2) {
-    return 0;
-  }
-  const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
-  if (samples.length === 0) {
-    return 0;
-  }
-  let sumSquares = 0;
-  for (const sample of samples) {
-    const normalized = sample / 32768;
-    sumSquares += normalized * normalized;
-  }
-  return Math.sqrt(sumSquares / samples.length);
-};
-var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
-var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
-var selectPreferredTranscriptText = (currentText, nextText) => {
-  const current = normalizeText(currentText);
-  const next = normalizeText(nextText);
-  if (!current) {
-    return next;
-  }
-  if (!next) {
-    return current;
-  }
-  if (current === next || current.includes(next)) {
-    return current;
-  }
-  if (next.includes(current)) {
-    return next;
-  }
-  if (countWords(next) > countWords(current)) {
-    return next;
-  }
-  if (countWords(next) === countWords(current) && next.length > current.length) {
-    return next;
-  }
-  return current;
-};
-var mergeSequentialTranscriptText = (currentText, nextText) => {
-  const current = normalizeText(currentText);
-  const next = normalizeText(nextText);
-  if (!current) {
-    return next;
-  }
-  if (!next) {
-    return current;
-  }
-  const currentWords = current.split(" ");
-  const nextWords = next.split(" ");
-  const maxOverlap = Math.min(currentWords.length, nextWords.length);
-  for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
-    const currentSuffix = currentWords.slice(-overlap).join(" ");
-    const nextPrefix = nextWords.slice(0, overlap).join(" ");
-    if (currentSuffix === nextPrefix) {
-      return [...currentWords, ...nextWords.slice(overlap)].join(" ");
-    }
-  }
-  return `${current} ${next}`.trim();
-};
-var countCommonPrefixWords = (currentText, nextText) => {
-  const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
-  const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
-  const maxWords = Math.min(currentWords.length, nextWords.length);
-  let count = 0;
-  for (let index = 0;index < maxWords; index += 1) {
-    if (currentWords[index] !== nextWords[index]) {
-      break;
-    }
-    count += 1;
-  }
-  return count;
-};
-var mergeTranscriptTexts = (transcripts) => {
-  const merged = [];
-  for (const transcript of transcripts) {
-    const nextText = normalizeText(transcript.text);
-    if (!nextText) {
-      continue;
-    }
-    const previous = merged.at(-1);
-    if (!previous) {
-      merged.push(nextText);
-      continue;
-    }
-    if (nextText === previous || previous.includes(nextText)) {
-      continue;
-    }
-    if (nextText.includes(previous)) {
-      merged[merged.length - 1] = nextText;
-      continue;
-    }
-    merged.push(nextText);
-  }
-  return merged.join(" ").trim();
-};
-var buildTurnText = (transcripts, partialText, options = {}) => {
-  const finalText = mergeTranscriptTexts(transcripts);
-  const nextPartial = normalizeText(partialText);
-  const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
-  if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
-    return mergeSequentialTranscriptText(finalText, nextPartial);
-  }
-  return selectPreferredTranscriptText(finalText, nextPartial);
-};
 // src/core/turnProfiles.ts
 var TURN_PROFILE_DEFAULTS = {
   balanced: {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 400,
     silenceMs: 1400,
     speechThreshold: 0.012,
     transcriptStabilityMs: 1000
   },
   fast: {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 300,
     silenceMs: 700,
     speechThreshold: 0.015,
     transcriptStabilityMs: 450
   },
   "long-form": {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 600,
     silenceMs: 2200,
     speechThreshold: 0.01,
     transcriptStabilityMs: 1500
@@ -1397,12 +1276,12 @@ var resolveTurnDetectionConfig = (config) => {
   const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
   const preset = TURN_PROFILE_DEFAULTS[profile];
   const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
+  const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
   return {
     profile,
     qualityProfile,
-    semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
-    semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
-    silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
+    minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
+    silenceMs,
     speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
     transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
   };

package/dist/client/htmxBootstrap.js CHANGED Viewed

@@ -1107,31 +1107,25 @@ var resolveAudioConditioningConfig = (config) => {
   };
 };
-// src/core/turnDetection.ts
-var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
 // src/core/turnProfiles.ts
 var TURN_PROFILE_DEFAULTS = {
   balanced: {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 400,
     silenceMs: 1400,
     speechThreshold: 0.012,
     transcriptStabilityMs: 1000
   },
   fast: {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 300,
     silenceMs: 700,
     speechThreshold: 0.015,
     transcriptStabilityMs: 450
   },
   "long-form": {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 600,
     silenceMs: 2200,
     speechThreshold: 0.01,
     transcriptStabilityMs: 1500
@@ -1162,12 +1156,12 @@ var resolveTurnDetectionConfig = (config) => {
   const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
   const preset = TURN_PROFILE_DEFAULTS[profile];
   const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
+  const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
   return {
     profile,
     qualityProfile,
-    semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
-    semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
-    silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
+    minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
+    silenceMs,
     speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
     transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
   };

package/dist/client/index.js CHANGED Viewed

@@ -1678,146 +1678,25 @@ var resolveAudioConditioningConfig = (config) => {
   };
 };
-// src/core/turnDetection.ts
-var DEFAULT_SILENCE_MS = 700;
-var DEFAULT_SPEECH_THRESHOLD = 0.015;
-var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
-var toUint8Array = (audio) => {
-  if (audio instanceof ArrayBuffer) {
-    return new Uint8Array(audio);
-  }
-  return new Uint8Array(audio.buffer, audio.byteOffset, audio.byteLength);
-};
-var measureAudioLevel = (audio) => {
-  const bytes = toUint8Array(audio);
-  if (bytes.byteLength < 2) {
-    return 0;
-  }
-  const samples = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
-  if (samples.length === 0) {
-    return 0;
-  }
-  let sumSquares = 0;
-  for (const sample of samples) {
-    const normalized = sample / 32768;
-    sumSquares += normalized * normalized;
-  }
-  return Math.sqrt(sumSquares / samples.length);
-};
-var normalizeText = (value) => value.trim().replace(/\s+/g, " ");
-var countWords = (value) => value.length > 0 ? value.split(" ").length : 0;
-var selectPreferredTranscriptText = (currentText, nextText) => {
-  const current = normalizeText(currentText);
-  const next = normalizeText(nextText);
-  if (!current) {
-    return next;
-  }
-  if (!next) {
-    return current;
-  }
-  if (current === next || current.includes(next)) {
-    return current;
-  }
-  if (next.includes(current)) {
-    return next;
-  }
-  if (countWords(next) > countWords(current)) {
-    return next;
-  }
-  if (countWords(next) === countWords(current) && next.length > current.length) {
-    return next;
-  }
-  return current;
-};
-var mergeSequentialTranscriptText = (currentText, nextText) => {
-  const current = normalizeText(currentText);
-  const next = normalizeText(nextText);
-  if (!current) {
-    return next;
-  }
-  if (!next) {
-    return current;
-  }
-  const currentWords = current.split(" ");
-  const nextWords = next.split(" ");
-  const maxOverlap = Math.min(currentWords.length, nextWords.length);
-  for (let overlap = maxOverlap;overlap > 0; overlap -= 1) {
-    const currentSuffix = currentWords.slice(-overlap).join(" ");
-    const nextPrefix = nextWords.slice(0, overlap).join(" ");
-    if (currentSuffix === nextPrefix) {
-      return [...currentWords, ...nextWords.slice(overlap)].join(" ");
-    }
-  }
-  return `${current} ${next}`.trim();
-};
-var countCommonPrefixWords = (currentText, nextText) => {
-  const currentWords = normalizeText(currentText).split(" ").filter(Boolean);
-  const nextWords = normalizeText(nextText).split(" ").filter(Boolean);
-  const maxWords = Math.min(currentWords.length, nextWords.length);
-  let count = 0;
-  for (let index = 0;index < maxWords; index += 1) {
-    if (currentWords[index] !== nextWords[index]) {
-      break;
-    }
-    count += 1;
-  }
-  return count;
-};
-var mergeTranscriptTexts = (transcripts) => {
-  const merged = [];
-  for (const transcript of transcripts) {
-    const nextText = normalizeText(transcript.text);
-    if (!nextText) {
-      continue;
-    }
-    const previous = merged.at(-1);
-    if (!previous) {
-      merged.push(nextText);
-      continue;
-    }
-    if (nextText === previous || previous.includes(nextText)) {
-      continue;
-    }
-    if (nextText.includes(previous)) {
-      merged[merged.length - 1] = nextText;
-      continue;
-    }
-    merged.push(nextText);
-  }
-  return merged.join(" ").trim();
-};
-var buildTurnText = (transcripts, partialText, options = {}) => {
-  const finalText = mergeTranscriptTexts(transcripts);
-  const nextPartial = normalizeText(partialText);
-  const lastFinalEndedAtMs = [...transcripts].reverse().find((transcript) => typeof transcript.endedAtMs === "number")?.endedAtMs;
-  if (finalText && nextPartial && typeof lastFinalEndedAtMs === "number" && typeof options.partialStartedAtMs === "number" && options.partialStartedAtMs - lastFinalEndedAtMs >= 250 && countCommonPrefixWords(finalText, nextPartial) === 0) {
-    return mergeSequentialTranscriptText(finalText, nextPartial);
-  }
-  return selectPreferredTranscriptText(finalText, nextPartial);
-};
 // src/core/turnProfiles.ts
 var TURN_PROFILE_DEFAULTS = {
   balanced: {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 400,
     silenceMs: 1400,
     speechThreshold: 0.012,
     transcriptStabilityMs: 1000
   },
   fast: {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 300,
     silenceMs: 700,
     speechThreshold: 0.015,
     transcriptStabilityMs: 450
   },
   "long-form": {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 600,
     silenceMs: 2200,
     speechThreshold: 0.01,
     transcriptStabilityMs: 1500
@@ -1848,12 +1727,12 @@ var resolveTurnDetectionConfig = (config) => {
   const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
   const preset = TURN_PROFILE_DEFAULTS[profile];
   const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
+  const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
   return {
     profile,
     qualityProfile,
-    semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
-    semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
-    silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
+    minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
+    silenceMs,
     speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
     transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
   };

package/dist/core/semanticTurn.d.ts CHANGED Viewed

@@ -1,10 +1,20 @@
-import type { Transcript } from "./types";
+import type { AudioFormat, Transcript } from "./types";
 export type VoiceSemanticTurnInput = {
     audioLevel?: number;
     lastFinalTranscript?: Transcript;
     partialText: string;
     silenceMs: number;
     transcripts: Transcript[];
+    /**
+     * The current turn's buffered user audio (PCM chunks, oldest→newest) and its
+     * format. Lets an AUDIO-based end-of-turn detector (e.g. a smart-turn / Whisper
+     * EOT model) judge completion from prosody — pitch, pace, trailing intonation —
+     * which a transcript-only judge fundamentally cannot see. Undefined when no
+     * audio was buffered for the turn (the runtime only stores chunks above the
+     * speech threshold).
+     */
+    turnAudio?: ReadonlyArray<Uint8Array>;
+    turnAudioFormat?: AudioFormat;
 };
 export type VoiceSemanticTurnVerdict = {
     confidence?: number;

package/dist/core/turnDetection.d.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 import type { AudioChunk, Transcript } from "./types";
 export declare const DEFAULT_SILENCE_MS = 700;
 export declare const DEFAULT_SPEECH_THRESHOLD = 0.015;
-export declare const DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
+export declare const DEFAULT_MIN_SILENCE_MS = 400;
 export declare const measureAudioLevel: (audio: AudioChunk) => number;
 export declare const selectPreferredTranscriptText: (currentText: string, nextText: string) => string;
 export declare const buildTurnText: (transcripts: Transcript[], partialText: string, options?: {

package/dist/core/types.d.ts CHANGED Viewed

@@ -436,19 +436,17 @@ export type VoiceTurnDetectionConfig = {
     profile?: VoiceTurnProfile;
     qualityProfile?: VoiceTurnQualityProfile;
     silenceMs?: number;
+    minSilenceMs?: number;
     speechThreshold?: number;
     transcriptStabilityMs?: number;
-    semanticVetoMaxMs?: number;
-    semanticVetoRecheckMs?: number;
 };
 export type VoiceResolvedTurnDetectionConfig = {
     qualityProfile: VoiceTurnQualityProfile;
     profile: VoiceTurnProfile;
     silenceMs: number;
+    minSilenceMs: number;
     speechThreshold: number;
     transcriptStabilityMs: number;
-    semanticVetoMaxMs: number;
-    semanticVetoRecheckMs: number;
 };
 export type VoiceAudioConditioningConfig = {
     enabled?: boolean;

package/dist/embed/index.js CHANGED Viewed

@@ -1104,31 +1104,25 @@ var resolveAudioConditioningConfig = (config) => {
   };
 };
-// src/core/turnDetection.ts
-var DEFAULT_SEMANTIC_VETO_RECHECK_MS = 1200;
 // src/core/turnProfiles.ts
 var TURN_PROFILE_DEFAULTS = {
   balanced: {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 400,
     silenceMs: 1400,
     speechThreshold: 0.012,
     transcriptStabilityMs: 1000
   },
   fast: {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 300,
     silenceMs: 700,
     speechThreshold: 0.015,
     transcriptStabilityMs: 450
   },
   "long-form": {
     qualityProfile: "general",
-    semanticVetoMaxMs: 0,
-    semanticVetoRecheckMs: DEFAULT_SEMANTIC_VETO_RECHECK_MS,
+    minSilenceMs: 600,
     silenceMs: 2200,
     speechThreshold: 0.01,
     transcriptStabilityMs: 1500
@@ -1159,12 +1153,12 @@ var resolveTurnDetectionConfig = (config) => {
   const qualityProfile = config?.qualityProfile ?? DEFAULT_QUALITY_PROFILE;
   const preset = TURN_PROFILE_DEFAULTS[profile];
   const quality = QUALITY_PROFILE_DEFAULTS[qualityProfile];
+  const silenceMs = config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs;
   return {
     profile,
     qualityProfile,
-    semanticVetoMaxMs: config?.semanticVetoMaxMs ?? preset.semanticVetoMaxMs,
-    semanticVetoRecheckMs: config?.semanticVetoRecheckMs ?? preset.semanticVetoRecheckMs,
-    silenceMs: config?.silenceMs ?? quality.silenceMs ?? preset.silenceMs,
+    minSilenceMs: Math.min(silenceMs, config?.minSilenceMs ?? quality.minSilenceMs ?? preset.minSilenceMs),
+    silenceMs,
     speechThreshold: config?.speechThreshold ?? quality.speechThreshold ?? preset.speechThreshold,
     transcriptStabilityMs: config?.transcriptStabilityMs ?? quality.transcriptStabilityMs ?? preset.transcriptStabilityMs
   };