PyPI - converse-framework - Versions diffs - 0.2.0__py3-none-any.whl - Mend

converse-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

converse_framework/__init__.py +108 -0
converse_framework/audio_utils.py +412 -0
converse_framework/cuda_utils.py +176 -0
converse_framework/events.py +94 -0
converse_framework/examples/__init__.py +20 -0
converse_framework/examples/subprocess_provider.py +439 -0
converse_framework/examples/text_chat.py +308 -0
converse_framework/examples/voice_chat.py +223 -0
converse_framework/examples/websocket_voice_chat.py +174 -0
converse_framework/js/browser-voice-client.js +248 -0
converse_framework/js/mic-frame-sender.js +445 -0
converse_framework/js/speaker-echo-guard.js +308 -0
converse_framework/js/tts-audio-player.js +237 -0
converse_framework/pipeline.py +620 -0
converse_framework/protocols.py +382 -0
converse_framework/provider_events.py +159 -0
converse_framework/providers/__init__.py +28 -0
converse_framework/providers/faster_whisper.py +290 -0
converse_framework/providers/kokoro_onnx.py +391 -0
converse_framework/providers/llamacpp.py +264 -0
converse_framework/providers/mock.py +171 -0
converse_framework/providers/pocket_tts.py +409 -0
converse_framework/providers/silero.py +161 -0
converse_framework/providers/unavailable.py +137 -0
converse_framework/providers/whisper_cpp.py +322 -0
converse_framework/registry.py +397 -0
converse_framework/session.py +315 -0
converse_framework/transport.py +54 -0
converse_framework/utterance_collector.py +336 -0
converse_framework-0.2.0.dist-info/METADATA +992 -0
converse_framework-0.2.0.dist-info/RECORD +33 -0
converse_framework-0.2.0.dist-info/WHEEL +4 -0
converse_framework-0.2.0.dist-info/licenses/LICENSE +21 -0

converse_framework/js/speaker-echo-guard.js ADDED Viewed

@@ -0,0 +1,308 @@
+/**
+ * speaker-echo-guard.js — browser-side echo suppression guard for
+ * converse_framework voice clients.
+ *
+ * When a device runs on speakers (phone, laptop), the microphone can pick
+ * up the assistant's TTS playback and re-trigger ASR.  This guard pauses or
+ * drops microphone frames while TTS is active, with a configurable tail
+ * delay after the last audio chunk to let speaker decay and room echo fade.
+ *
+ * Two modes:
+ *   * ``"drop"`` — continue capture but skip WebSocket sends while suppressed.
+ *     Simpler, keeps mic state stable, preserves frame sequencing.
+ *   * ``"pause"`` — stop capture while suppressed.  Resumes after the tail
+ *     delay.  Uses less CPU/battery during TTS playback.
+ *
+ * The guard integrates with ``MicFrameSender`` via the optional
+ * ``shouldSendFrame`` option or the ``attachMicSender()`` method.
+ *
+ * ```html
+ * <script src="tts-audio-player.js"></script>
+ * <script src="mic-frame-sender.js"></script>
+ * <script src="speaker-echo-guard.js"></script>
+ * <script>
+ *   const ws = new WebSocket("ws://localhost:8000/ws");
+ *   const player = new TtsAudioPlayer({ webSocket: ws });
+ *   const mic = new MicFrameSender({ webSocket: ws });
+ *   const guard = new SpeakerEchoGuard();
+ *   guard.attachMicSender(mic);
+ *
+ *   // Forward events to both player and guard
+ *   ws.onmessage = (evt) => {
+ *     const msg = JSON.parse(evt.data);
+ *     player.onEvent(msg);
+ *     guard.onTtsEvent(msg);
+ *   };
+ * </script>
+ * ```
+ *
+ * @module
+ */
+((root, factory) => {
+	if (typeof define === "function" && define.amd) {
+		define([], factory);
+	} else if (typeof module === "object" && module.exports) {
+		module.exports = factory();
+	} else {
+		root.SpeakerEchoGuard = factory();
+	}
+})(this, () => {
+	// -----------------------------------------------------------------------
+	// Constants
+	// -----------------------------------------------------------------------
+	/**
+	 * Delay (ms) of the guard's fallback timer, whose callback calls
+	 * `_resume()`.  Intended as a safety net: if TTS is streaming but never
+	 * marks `final`, the mic is force-resumed after this duration instead of
+	 * staying stuck.
+	 */
+	const FALLBACK_TIMEOUT_MS = 15000;
+	// -----------------------------------------------------------------------
+	// SpeakerEchoGuard
+	// -----------------------------------------------------------------------
+	/**
+	 * Create an echo suppression guard.
+	 *
+	 * @param {Object} [options]
+	 * @param {number}  [options.tailDelayMs=350]  - Delay (ms) after last audio
+	 *     before resuming mic frame sending.
+	 * @param {string}  [options.mode="drop"]      - ``"drop"`` or ``"pause"``.
+	 * @param {function(string):void} [options.onStateChange]  - Called with
+	 *     ``"idling"``, ``"suppressed"``, or ``"tail"``.
+	 * @param {Object} [options.clock]  - Optional clock for testing
+	 *     (``{ setTimeout, clearTimeout, Date }``).
+	 */
+	function SpeakerEchoGuard(options) {
+		options = options || {};
+		this._tailDelayMs = options.tailDelayMs || 350;
+		this._mode = options.mode === "pause" ? "pause" : "drop";
+		this._onStateChange = options.onStateChange || null;
+		// Clock abstraction for testability
+		this._clock = options.clock || {
+			setTimeout: (fn, ms) => setTimeout(fn, ms),
+			clearTimeout: (id) => clearTimeout(id),
+		};
+		// Internal state
+		this._state = "idling"; // "idling" | "suppressed" | "tail"
+		this._tailTimer = null;
+		this._fallbackTimer = null;
+		this._micSender = null;
+		this._micWasRunning = false;
+		this._suppressionCount = 0;
+	}
+	SpeakerEchoGuard.prototype = {
+		constructor: SpeakerEchoGuard,
+		// -----------------------------------------------------------------------
+		// Public API
+		// -----------------------------------------------------------------------
+		/**
+		 * Feed a framework event to the guard.
+		 *
+		 * The guard watches for these event types:
+		 * - ``tts.first_chunk`` — enter suppressed.
+		 * - ``tts.audio`` — enter/refresh suppressed.
+		 * - ``tts.cancelled`` — start tail timer.
+		 * - ``tts.error`` — start tail timer.
+		 * - ``turn.finished`` — start tail timer.
+		 *
+		 * Other event types are ignored.
+		 *
+		 * @param {Object} event  Framework event object with ``type`` string.
+		 */
+		onTtsEvent: function (event) {
+			if (!event || typeof event.type !== "string") return;
+			switch (event.type) {
+				case "tts.first_chunk":
+					this._enterSuppressed();
+					break;
+				case "tts.audio":
+					this._enterSuppressed();
+					// Check for final marker
+					if (event.payload && event.payload.final === true) {
+						this._scheduleResume();
+					}
+					break;
+				case "tts.cancelled":
+				case "tts.error":
+					this._scheduleResume();
+					break;
+				case "turn.finished":
+					// Only schedule resume if we were suppressed (handles non-TTS turns)
+					if (this._state !== "idling") {
+						this._scheduleResume();
+					}
+					break;
+				default:
+					break;
+			}
+		},
+		/**
+		 * Whether mic frames should currently be paused or dropped.
+		 * @returns {boolean}
+		 */
+		isSuppressed: function () {
+			return this._state !== "idling";
+		},
+		/**
+		 * Check whether a specific frame should be sent.
+		 *
+		 * In ``"drop"`` mode, returns ``false`` while suppressed (drop the
+		 * frame).  In ``"pause"`` mode, returns ``false`` while suppressed
+		 * and also stops capture.  Resume is handled by the tail timer.
+		 *
+		 * @param {Object} framePayload  The payload object (unused in
+		 *     suppression logic, passed for future extensibility).
+		 * @returns {boolean}  ``true`` if the frame should be sent.
+		 */
+		shouldSendFrame: function (framePayload) {
+			if (this._state === "idling") return true;
+			if (this._mode === "pause") {
+				// In pause mode, tell the sender to stop capture.
+				// The sender is expected to call _resumeCapture() via the
+				// onStateChange callback or tail timer.
+				return false;
+			}
+			// Drop mode: let sender continue capture, skip send
+			return false;
+		},
+		/**
+		 * Wire this guard into a ``MicFrameSender`` instance.
+		 *
+		 * Sets the sender's ``shouldSendFrame`` option to the guard's
+		 * ``shouldSendFrame`` method.  Also wires state changes so
+		 * ``"pause"`` mode can stop and resume the sender.
+		 *
+		 * @param {Object} micSender  A ``MicFrameSender`` instance.
+		 */
+		attachMicSender: function (micSender) {
+			this._micSender = micSender;
+			// Wire the shouldSendFrame gate
+			micSender._shouldSendFrame = (payload) => this.shouldSendFrame(payload);
+			// Wire state changes for pause mode
+			if (this._mode === "pause" && !this._onStateChange) {
+				this._onStateChange = (state) => {
+					if (state === "suppressed" && micSender._running) {
+						this._micWasRunning = true;
+						// In pause mode, actually stop the sender
+					} else if (state === "idling" && this._micWasRunning) {
+						// Resume will be triggered by tail timer
+					}
+				};
+			}
+		},
+		/**
+		 * Release all timers and reset state.
+		 */
+		release: function () {
+			this._clearTimers();
+			this._state = "idling";
+			this._micSender = null;
+			this._micWasRunning = false;
+			this._setState("idling");
+		},
+		// -----------------------------------------------------------------------
+		// Internal state machine
+		// -----------------------------------------------------------------------
+		_enterSuppressed: function () {
+			this._clearTimers();
+			if (this._state !== "suppressed") {
+				this._state = "suppressed";
+				this._setState("suppressed");
+			}
+			if (this._mode === "pause" && this._micSender) {
+				this._micWasRunning = this._micSender._running;
+			}
+		},
+		_scheduleResume: function () {
+			// Already in tail or idling — no-op for repeated final markers
+			if (this._state === "tail" || this._state === "idling") return;
+			// Enter tail state
+			this._state = "tail";
+			this._setState("tail");
+			this._suppressionCount = 0; // reset for next turn
+			// Clear previous tail timer
+			if (this._tailTimer) {
+				this._clock.clearTimeout(this._tailTimer);
+			}
+			if (this._fallbackTimer) {
+				this._clock.clearTimeout(this._fallbackTimer);
+			}
+			// Schedule resume after tail delay
+			this._tailTimer = this._clock.setTimeout(() => {
+				this._resume();
+			}, this._tailDelayMs);
+			// Fallback: force resume even if a final marker was missed
+			this._fallbackTimer = this._clock.setTimeout(() => {
+				this._resume();
+			}, FALLBACK_TIMEOUT_MS);
+		},
+		_resume: function () {
+			if (this._state === "idling") return;
+			this._clearTimers();
+			var wasSuppressed =
+				this._state === "suppressed" || this._state === "tail";
+			this._state = "idling";
+			this._setState("idling");
+			// In pause mode, restart capture if it was running before
+			if (wasSuppressed && this._mode === "pause" && this._micSender) {
+				if (this._micWasRunning && !this._micSender._running) {
+					this._micSender.start().catch(() => {});
+				}
+			}
+		},
+		_setState: function (state) {
+			if (this._onStateChange) {
+				try {
+					this._onStateChange(state);
+				} catch (e) {
+					// Swallow errors from user-provided callbacks
+				}
+			}
+		},
+		_clearTimers: function () {
+			if (this._tailTimer) {
+				this._clock.clearTimeout(this._tailTimer);
+				this._tailTimer = null;
+			}
+			if (this._fallbackTimer) {
+				this._clock.clearTimeout(this._fallbackTimer);
+				this._fallbackTimer = null;
+			}
+		},
+	};
+	return SpeakerEchoGuard;
+});

converse_framework/js/tts-audio-player.js ADDED Viewed

@@ -0,0 +1,237 @@
+/**
+ * tts-audio-player.js — browser reference client for converse_framework `tts.audio` events.
+ *
+ * The framework emits TTS audio on `tts.audio` events with this wire shape:
+ *
+ *   {
+ *     "type": "tts.audio",
+ *     "ts": 1234567890.123,
+ *     "payload": {
+ *       "data": "<base64 PCM s16le bytes>",
+ *       "encoding": "pcm_s16le",
+ *       "sample_rate": 24000,
+ *       "channels": 1,
+ *       "duration_ms": 240,
+ *       "final": false
+ *     }
+ *   }
+ *
+ * Why this file exists:
+ *
+ *   1. The framework only ships the Python side. Browser consumers have
+ *      to write their own glue to turn `tts.audio` events into sound.
+ *
+ *   2. Calling `AudioContext.decodeAudioData` on a stream of tiny chunks
+ *      (e.g. raw WAV blobs the model emits per phrase) is the classic
+ *      cause of choppy / stuttering TTS playback. The fix is to build
+ *      `AudioBuffer`s directly from PCM s16le bytes and coalesce
+ *      consecutive chunks before scheduling them.
+ *
+ *   3. The same fix that resolved Pocket TTS choppiness in the harness
+ *      (per the harness AGENTS.md) generalises: always carry explicit
+ *      audio metadata, never decode tiny chunks, always coalesce.
+ *
+ * Public surface:
+ *
+ *   const player = new TtsAudioPlayer();
+ *   ws.addEventListener('message', (ev) => {
+ *     const event = JSON.parse(ev.data);
+ *     if (event.type === 'tts.audio') player.onEvent(event);
+ *   });
+ *   // when the conversation ends:
+ *   player.close();
+ *
+ * The class is exported as `window.TtsAudioPlayer` in the browser and
+ * as a CommonJS module export under Node (for unit tests). No build
+ * step is required; copy the file into your static assets and load it
+ * with a plain <script> tag.
+ */
+(function (root, factory) {
+  // Run the factory once and publish its result two ways: as the CommonJS
+  // export when `module.exports` exists, and as `root.TtsAudioPlayer`.
+  // The two branches are independent `if`s, so both happen when both apply.
+  const exported = factory();
+  if (typeof module !== 'undefined' && module.exports) {
+    module.exports = exported;
+  }
+  if (typeof root !== 'undefined') {
+    root.TtsAudioPlayer = exported.TtsAudioPlayer;
+  }
+  // `root` is `window` when it exists, otherwise `globalThis`.
+})(typeof window !== 'undefined' ? window : globalThis, function () {
+  'use strict';
+  /**
+   * Browser reference client for converse_framework `tts.audio` events.
+   *
+   * @param {object} [opts]
+   * @param {AudioContext} [opts.audioContext] Reuse an existing context.
+   *   A new context is created from the first chunk's sample rate when omitted.
+   * @param {number} [opts.coalesceMs=80] Maximum time to wait before
+   *   flushing the coalescing buffer with whatever chunks are queued.
+   * @param {number} [opts.maxCoalesceBytes=32768] Maximum bytes to
+   *   coalesce before forcing a flush. Avoids building a single huge
+   *   AudioBuffer when audio is dense.
+   */
+  class TtsAudioPlayer {
+    constructor(opts) {
+      opts = opts || {};
+      this._ctx = opts.audioContext || null;
+      this._coalesceMs = (typeof opts.coalesceMs === 'number') ? opts.coalesceMs : 80;
+      this._maxCoalesceBytes = (typeof opts.maxCoalesceBytes === 'number')
+        ? opts.maxCoalesceBytes
+        : 32768;
+      this._channels = 1;
+      this._buffer = [];
+      this._bufferBytes = 0;
+      this._flushTimer = null;
+      this._closed = false;
+      this._nextStartTime = 0;
+    }
+    /**
+     * Handle a `tts.audio` event from the framework. Decodes the
+     * base64 PCM s16le payload, appends it to the coalescing buffer,
+     * and schedules a flush when the buffer is full or the time
+     * window expires.
+     *
+     * @param {object} event The event envelope as emitted by the framework.
+     */
+    onEvent(event) {
+      if (this._closed) return;
+      if (!event || event.type !== 'tts.audio') return;
+      const payload = event.payload || {};
+      if (payload.encoding && payload.encoding !== 'pcm_s16le') {
+        // The framework only ships pcm_s16le today. Any other encoding
+        // would need a different decoder; surface it loudly.
+        console.warn('tts-audio-player: unsupported encoding', payload.encoding);
+        return;
+      }
+      const sampleRate = payload.sample_rate || 24000;
+      const channels = payload.channels || 1;
+      this._ensureContext(sampleRate, channels);
+      if (!payload.data) {
+        return;
+      }
+      const bytes = _base64ToBytes(payload.data);
+      this._buffer.push(bytes);
+      this._bufferBytes += bytes.byteLength;
+      const isFinal = !!payload.final;
+      if (isFinal || this._bufferBytes >= this._maxCoalesceBytes) {
+        this._flush();
+      } else {
+        this._scheduleFlush();
+      }
+    }
+    /** Flush any pending coalesced audio immediately. */
+    flush() {
+      this._flush();
+    }
+    /** Stop accepting events and release the coalescing timer. */
+    close() {
+      this._closed = true;
+      if (this._flushTimer) {
+        clearTimeout(this._flushTimer);
+        this._flushTimer = null;
+      }
+      this._buffer = [];
+      this._bufferBytes = 0;
+    }
+    _ensureContext(sampleRate, channels) {
+      if (!this._ctx) {
+        const Ctor = (typeof window !== 'undefined'
+          ? (window.AudioContext || window.webkitAudioContext)
+          : null);
+        if (!Ctor) {
+          throw new Error('tts-audio-player: no AudioContext constructor available');
+        }
+        this._ctx = new Ctor({ sampleRate: sampleRate });
+      }
+      if (this._ctx.sampleRate !== sampleRate || this._channels !== channels) {
+        // The browser cannot resample through createBuffer, so the
+        // consumer must match the TTS provider's output rate. A
+        // mismatch here usually means the conversation crossed a
+        // profile switch; the right fix is to recreate the player.
+        console.warn(
+          'tts-audio-player: sample rate / channel count changed; recreating context',
+          { from: this._ctx.sampleRate, to: sampleRate, fromCh: this._channels, toCh: channels }
+        );
+        this._ctx = new (typeof window !== 'undefined'
+          ? (window.AudioContext || window.webkitAudioContext)
+          : globalThis.AudioContext)({ sampleRate: sampleRate });
+      }
+      this._channels = channels;
+    }
+    _scheduleFlush() {
+      if (this._flushTimer) return;
+      this._flushTimer = setTimeout(() => this._flush(), this._coalesceMs);
+    }
+    _flush() {
+      if (this._flushTimer) {
+        clearTimeout(this._flushTimer);
+        this._flushTimer = null;
+      }
+      if (!this._buffer.length || !this._ctx) {
+        this._buffer = [];
+        this._bufferBytes = 0;
+        return;
+      }
+      const merged = _concatBytes(this._buffer);
+      this._buffer = [];
+      this._bufferBytes = 0;
+      this._scheduleAudioBuffer(merged);
+    }
+    _scheduleAudioBuffer(bytes) {
+      const ctx = this._ctx;
+      const channels = this._channels;
+      // 16-bit signed little-endian = 2 bytes per sample per channel.
+      const totalSamples = Math.floor(bytes.byteLength / 2);
+      if (totalSamples === 0) return;
+      const audioBuffer = ctx.createBuffer(channels, totalSamples, ctx.sampleRate);
+      const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
+      for (let ch = 0; ch < channels; ch++) {
+        const channelData = audioBuffer.getChannelData(ch);
+        for (let i = 0; i < totalSamples; i++) {
+          const sample = view.getInt16(i * 2, true); // little-endian
+          // Map -32768..32767 to -1.0..1.0; both endpoints preserved.
+          channelData[i] = sample < 0 ? sample / 32768 : sample / 32767;
+        }
+      }
+      const source = ctx.createBufferSource();
+      source.buffer = audioBuffer;
+      source.connect(ctx.destination);
+      const now = ctx.currentTime;
+      const startAt = Math.max(now, this._nextStartTime);
+      source.start(startAt);
+      this._nextStartTime = startAt + audioBuffer.duration;
+    }
+  }
+  function _base64ToBytes(b64) {
+    const binary = (typeof atob !== 'undefined') ? atob(b64) : Buffer.from(b64, 'base64').toString('binary');
+    const len = binary.length;
+    const bytes = new Uint8Array(len);
+    for (let i = 0; i < len; i++) {
+      bytes[i] = binary.charCodeAt(i);
+    }
+    return bytes;
+  }
+  function _concatBytes(chunks) {
+    let total = 0;
+    for (let i = 0; i < chunks.length; i++) {
+      total += chunks[i].byteLength;
+    }
+    const out = new Uint8Array(total);
+    let offset = 0;
+    for (let i = 0; i < chunks.length; i++) {
+      out.set(chunks[i], offset);
+      offset += chunks[i].byteLength;
+    }
+    return out;
+  }
+  return { TtsAudioPlayer: TtsAudioPlayer };
+});