converse-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. converse_framework/__init__.py +108 -0
  2. converse_framework/audio_utils.py +412 -0
  3. converse_framework/cuda_utils.py +176 -0
  4. converse_framework/events.py +94 -0
  5. converse_framework/examples/__init__.py +20 -0
  6. converse_framework/examples/subprocess_provider.py +439 -0
  7. converse_framework/examples/text_chat.py +308 -0
  8. converse_framework/examples/voice_chat.py +223 -0
  9. converse_framework/examples/websocket_voice_chat.py +174 -0
  10. converse_framework/js/browser-voice-client.js +248 -0
  11. converse_framework/js/mic-frame-sender.js +445 -0
  12. converse_framework/js/speaker-echo-guard.js +308 -0
  13. converse_framework/js/tts-audio-player.js +237 -0
  14. converse_framework/pipeline.py +620 -0
  15. converse_framework/protocols.py +382 -0
  16. converse_framework/provider_events.py +159 -0
  17. converse_framework/providers/__init__.py +28 -0
  18. converse_framework/providers/faster_whisper.py +290 -0
  19. converse_framework/providers/kokoro_onnx.py +391 -0
  20. converse_framework/providers/llamacpp.py +264 -0
  21. converse_framework/providers/mock.py +171 -0
  22. converse_framework/providers/pocket_tts.py +409 -0
  23. converse_framework/providers/silero.py +161 -0
  24. converse_framework/providers/unavailable.py +137 -0
  25. converse_framework/providers/whisper_cpp.py +322 -0
  26. converse_framework/registry.py +397 -0
  27. converse_framework/session.py +315 -0
  28. converse_framework/transport.py +54 -0
  29. converse_framework/utterance_collector.py +336 -0
  30. converse_framework-0.2.0.dist-info/METADATA +992 -0
  31. converse_framework-0.2.0.dist-info/RECORD +33 -0
  32. converse_framework-0.2.0.dist-info/WHEEL +4 -0
  33. converse_framework-0.2.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,308 @@
1
+ /**
2
+ * speaker-echo-guard.js — browser-side echo suppression guard for
3
+ * converse_framework voice clients.
4
+ *
5
+ * When a device runs on speakers (phone, laptop), the microphone can pick
6
+ * up the assistant's TTS playback and re-trigger ASR. This guard pauses or
7
+ * drops microphone frames while TTS is active, with a configurable tail
8
+ * delay after the last audio chunk to let speaker decay and room echo fade.
9
+ *
10
+ * Two modes:
11
+ * * ``"drop"`` — continue capture but skip WebSocket sends while suppressed.
12
+ * Simpler, keeps mic state stable, preserves frame sequencing.
13
+ * * ``"pause"`` — meant to stop capture while suppressed and resume after the
14
+ * tail delay, using less CPU/battery during TTS playback. NOTE: the guard never stops the sender itself; stop it from ``onStateChange``.
15
+ *
16
+ * The guard integrates with ``MicFrameSender`` via the optional
17
+ * ``shouldSendFrame`` option or the ``attachMicSender()`` method.
18
+ *
19
+ * ```html
20
+ * <script src="tts-audio-player.js"></script>
21
+ * <script src="mic-frame-sender.js"></script>
22
+ * <script src="speaker-echo-guard.js"></script>
23
+ * <script>
24
+ * const ws = new WebSocket("ws://localhost:8000/ws");
25
+ * const player = new TtsAudioPlayer();
26
+ * const mic = new MicFrameSender({ webSocket: ws });
27
+ * const guard = new SpeakerEchoGuard();
28
+ * guard.attachMicSender(mic);
29
+ *
30
+ * // Forward events to both player and guard
31
+ * ws.onmessage = (evt) => {
32
+ * const msg = JSON.parse(evt.data);
33
+ * player.onEvent(msg);
34
+ * guard.onTtsEvent(msg);
35
+ * };
36
+ * </script>
37
+ * ```
38
+ *
39
+ * @module
40
+ */
41
+
42
+ ((root, factory) => {
43
+ if (typeof define === "function" && define.amd) {
44
+ define([], factory);
45
+ } else if (typeof module === "object" && module.exports) {
46
+ module.exports = factory();
47
+ } else {
48
+ root.SpeakerEchoGuard = factory();
49
+ }
50
+ })(this, () => {
51
+ // -----------------------------------------------------------------------
52
+ // Constants
53
+ // -----------------------------------------------------------------------
54
+
/**
 * Fallback timeout (ms): the longest the guard waits for the next TTS event
 * before force-resuming. The fallback timer is re-armed on every suppressing
 * event (`tts.first_chunk` / `tts.audio`) and again when the tail starts, so
 * a stream that never marks `final` cannot leave the mic stuck.
 */
const FALLBACK_TIMEOUT_MS = 15000;

/** Tail delay (ms) used when `options.tailDelayMs` is absent or not numeric. */
const DEFAULT_TAIL_DELAY_MS = 350;

/**
 * Coerce a user-supplied delay to a non-negative number of milliseconds.
 *
 * Numbers and non-empty numeric strings are accepted; an explicit ``0`` is
 * honoured. Anything else (undefined, null, NaN, Infinity, ...) yields
 * ``fallback``.
 *
 * @param {*} value Raw option value.
 * @param {number} fallback Value returned when ``value`` is unusable.
 * @returns {number}
 */
function _normalizeDelayMs(value, fallback) {
  const looksNumeric =
    typeof value === "number" ||
    (typeof value === "string" && value.trim() !== "");
  if (!looksNumeric) return fallback;
  const ms = Number(value);
  return Number.isFinite(ms) ? Math.max(0, ms) : fallback;
}

// -----------------------------------------------------------------------
// SpeakerEchoGuard
// -----------------------------------------------------------------------

/**
 * Create an echo suppression guard.
 *
 * @param {Object} [options]
 * @param {number} [options.tailDelayMs=350] - Delay (ms) after the last
 *   audio event before mic frames are let through again. ``0`` is valid.
 * @param {string} [options.mode="drop"] - ``"drop"`` or ``"pause"``; any
 *   other value is treated as ``"drop"``.
 * @param {function(string):void} [options.onStateChange] - Called with
 *   ``"idling"``, ``"suppressed"``, or ``"tail"``. Exceptions it throws
 *   are swallowed.
 * @param {Object} [options.clock] - Optional clock for testing; only
 *   ``setTimeout(fn, ms)`` and ``clearTimeout(id)`` are used.
 */
function SpeakerEchoGuard(options) {
  options = options || {};
  // `||` would silently turn an explicit 0 into the default.
  this._tailDelayMs = _normalizeDelayMs(
    options.tailDelayMs,
    DEFAULT_TAIL_DELAY_MS
  );
  this._mode = options.mode === "pause" ? "pause" : "drop";
  this._onStateChange = options.onStateChange || null;

  // Clock abstraction for testability
  this._clock = options.clock || {
    setTimeout: (fn, ms) => setTimeout(fn, ms),
    clearTimeout: (id) => clearTimeout(id),
  };

  // Internal state
  this._state = "idling"; // "idling" | "suppressed" | "tail"
  this._tailTimer = null;
  this._fallbackTimer = null;
  this._micSender = null;
  this._micWasRunning = false;
}

SpeakerEchoGuard.prototype = {
  constructor: SpeakerEchoGuard,

  // -----------------------------------------------------------------------
  // Public API
  // -----------------------------------------------------------------------

  /**
   * Feed a framework event to the guard.
   *
   * Event types acted upon:
   *   - ``tts.first_chunk`` — enter suppressed.
   *   - ``tts.audio``       — enter/refresh suppressed; when
   *     ``payload.final === true`` the tail timer is started as well.
   *   - ``tts.cancelled`` / ``tts.error`` — start tail timer.
   *   - ``turn.finished``   — start tail timer (only if not idling).
   *
   * Anything else, including non-objects and events without a string
   * ``type``, is ignored.
   *
   * @param {Object} event Framework event object with ``type`` string.
   */
  onTtsEvent: function (event) {
    if (!event || typeof event.type !== "string") return;

    switch (event.type) {
      case "tts.first_chunk":
        this._enterSuppressed();
        break;

      case "tts.audio":
        this._enterSuppressed();
        if (event.payload && event.payload.final === true) {
          this._scheduleResume();
        }
        break;

      case "tts.cancelled":
      case "tts.error":
      case "turn.finished":
        // _scheduleResume is a no-op unless we are suppressed, which also
        // covers turns that produced no TTS at all.
        this._scheduleResume();
        break;

      default:
        break;
    }
  },

  /**
   * Whether mic frames should currently be paused or dropped.
   * @returns {boolean} ``true`` in the ``"suppressed"`` and ``"tail"`` states.
   */
  isSuppressed: function () {
    return this._state !== "idling";
  },

  /**
   * Check whether a specific frame should be sent.
   *
   * Returns ``false`` while the guard is suppressed or in its tail, in
   * both modes. NOTE(review): the guard does not stop capture itself in
   * ``"pause"`` mode; stopping the sender is left to the application
   * (e.g. from ``onStateChange``), and the guard restarts it on resume.
   *
   * @param {Object} framePayload The payload object (unused in
   *   suppression logic, passed for future extensibility).
   * @returns {boolean} ``true`` if the frame should be sent.
   */
  shouldSendFrame: function (framePayload) {
    return this._state === "idling";
  },

  /**
   * Wire this guard into a ``MicFrameSender`` instance.
   *
   * Installs the guard's ``shouldSendFrame`` as the sender's
   * ``_shouldSendFrame`` gate and remembers the sender so that ``"pause"``
   * mode can restart it after the tail delay.
   *
   * @param {Object} micSender A ``MicFrameSender`` instance.
   * @throws {TypeError} If ``micSender`` is missing.
   */
  attachMicSender: function (micSender) {
    if (!micSender) {
      throw new TypeError(
        "SpeakerEchoGuard.attachMicSender: micSender is required"
      );
    }
    this._micSender = micSender;
    micSender._shouldSendFrame = (payload) => this.shouldSendFrame(payload);
  },

  /**
   * Release all timers and reset state. ``onStateChange`` is notified with
   * ``"idling"`` and the attached sender reference is dropped.
   */
  release: function () {
    this._clearTimers();
    this._state = "idling";
    this._micSender = null;
    this._micWasRunning = false;
    this._setState("idling");
  },

  // -----------------------------------------------------------------------
  // Internal state machine
  // -----------------------------------------------------------------------

  /** Enter (or refresh) the suppressed state and re-arm the fallback. */
  _enterSuppressed: function () {
    this._clearTimers();

    // Sample the sender only when leaving idle, and before notifying:
    // an onStateChange handler may stop the sender, and later chunks must
    // not overwrite the "was running" answer with the stopped state.
    if (
      this._state === "idling" &&
      this._mode === "pause" &&
      this._micSender
    ) {
      this._micWasRunning = !!this._micSender._running;
    }

    if (this._state !== "suppressed") {
      this._state = "suppressed";
      this._setState("suppressed");
    }

    // The callback may have called release(); do not arm timers while idle.
    if (this._state !== "idling") {
      this._armFallback();
    }
  },

  /** Move from suppressed to tail and start the tail countdown. */
  _scheduleResume: function () {
    // Already in tail or idling — no-op for repeated final markers.
    if (this._state !== "suppressed") return;

    this._state = "tail";
    this._setState("tail");
    if (this._state !== "tail") return; // callback released the guard

    this._clearTimers();
    this._tailTimer = this._clock.setTimeout(() => {
      this._resume();
    }, this._tailDelayMs);
    // Kept armed so a tail delay longer than the fallback is still capped.
    this._armFallback();
  },

  /** Arm the fallback timer; callers clear any previous one first. */
  _armFallback: function () {
    this._fallbackTimer = this._clock.setTimeout(() => {
      this._resume();
    }, FALLBACK_TIMEOUT_MS);
  },

  /** Return to idling and, in pause mode, restart a stopped sender. */
  _resume: function () {
    if (this._state === "idling") return;
    this._clearTimers();

    this._state = "idling";
    this._setState("idling");

    const sender = this._micSender;
    if (this._mode !== "pause" || !sender) return;
    if (!this._micWasRunning || sender._running) return;

    // Best-effort restart: a failure must not break the timer callback,
    // but it should not vanish silently either.
    const reportFailure = (err) => {
      if (typeof console !== "undefined" && console.warn) {
        console.warn("speaker-echo-guard: failed to restart mic capture", err);
      }
    };
    try {
      const pending = sender.start();
      if (pending && typeof pending.catch === "function") {
        pending.catch(reportFailure);
      }
    } catch (err) {
      reportFailure(err);
    }
  },

  /** Notify the optional ``onStateChange`` callback. */
  _setState: function (state) {
    if (this._onStateChange) {
      try {
        this._onStateChange(state);
      } catch (e) {
        // Swallow errors from user-provided callbacks
      }
    }
  },

  /** Cancel both timers. Compares against null so a timer id of 0 works. */
  _clearTimers: function () {
    if (this._tailTimer !== null) {
      this._clock.clearTimeout(this._tailTimer);
      this._tailTimer = null;
    }
    if (this._fallbackTimer !== null) {
      this._clock.clearTimeout(this._fallbackTimer);
      this._fallbackTimer = null;
    }
  },
};
306
+
307
+ return SpeakerEchoGuard;
308
+ });
@@ -0,0 +1,237 @@
1
+ /**
2
+ * tts-audio-player.js — browser reference client for converse_framework `tts.audio` events.
3
+ *
4
+ * The framework emits TTS audio on `tts.audio` events with this wire shape:
5
+ *
6
+ * {
7
+ * "type": "tts.audio",
8
+ * "ts": 1234567890.123,
9
+ * "payload": {
10
+ * "data": "<base64 PCM s16le bytes>",
11
+ * "encoding": "pcm_s16le",
12
+ * "sample_rate": 24000,
13
+ * "channels": 1,
14
+ * "duration_ms": 240,
15
+ * "final": false
16
+ * }
17
+ * }
18
+ *
19
+ * Why this file exists:
20
+ *
21
+ * 1. The framework only ships the Python side. Browser consumers have
22
+ * to write their own glue to turn `tts.audio` events into sound.
23
+ *
24
+ * 2. Calling `AudioContext.decodeAudioData` on a stream of tiny chunks
25
+ * (e.g. raw WAV blobs the model emits per phrase) is the classic
26
+ * cause of choppy / stuttering TTS playback. The fix is to build
27
+ * `AudioBuffer`s directly from PCM s16le bytes and coalesce
28
+ * consecutive chunks before scheduling them.
29
+ *
30
+ * 3. The same fix that resolved Pocket TTS choppiness in the harness
31
+ * (per the harness AGENTS.md) generalises: always carry explicit
32
+ * audio metadata, never decode tiny chunks, always coalesce.
33
+ *
34
+ * Public surface:
35
+ *
36
+ * const player = new TtsAudioPlayer();
37
+ * ws.addEventListener('message', (ev) => {
38
+ * const event = JSON.parse(ev.data);
39
+ * if (event.type === 'tts.audio') player.onEvent(event);
40
+ * });
41
+ * // when the conversation ends:
42
+ * player.close();
43
+ *
44
+ * The class is exported as `window.TtsAudioPlayer` in the browser and
45
+ * as a CommonJS module export under Node (for unit tests). No build
46
+ * step is required; copy the file into your static assets and load it
47
+ * with a plain <script> tag.
48
+ */
49
+ (function (root, factory) {
50
+ const exported = factory();
51
+ if (typeof module !== 'undefined' && module.exports) {
52
+ module.exports = exported;
53
+ }
54
+ if (typeof root !== 'undefined') {
55
+ root.TtsAudioPlayer = exported.TtsAudioPlayer;
56
+ }
57
+ })(typeof window !== 'undefined' ? window : globalThis, function () {
58
+ 'use strict';
59
+
/**
 * Browser reference client for converse_framework `tts.audio` events.
 *
 * Incoming PCM s16le chunks are coalesced and scheduled back to back on a
 * single AudioContext. AudioBuffers are created at the stream's own sample
 * rate, so the context (including one supplied by the caller) is kept even
 * when its rate differs from the stream's.
 *
 * @param {object} [opts]
 * @param {AudioContext} [opts.audioContext] Reuse an existing context.
 *   When omitted, a context is created on the first chunk, requesting that
 *   chunk's sample rate.
 * @param {number} [opts.coalesceMs=80] Maximum time to wait before
 *   flushing the coalescing buffer with whatever chunks are queued.
 * @param {number} [opts.maxCoalesceBytes=32768] Maximum bytes to
 *   coalesce before forcing a flush. Avoids building a single huge
 *   AudioBuffer when audio is dense.
 */
class TtsAudioPlayer {
  constructor(opts) {
    opts = opts || {};
    this._ctx = opts.audioContext || null;
    this._coalesceMs = (typeof opts.coalesceMs === 'number') ? opts.coalesceMs : 80;
    this._maxCoalesceBytes = (typeof opts.maxCoalesceBytes === 'number')
      ? opts.maxCoalesceBytes
      : 32768;
    // Format of the bytes currently held in `_buffer`; null until the
    // first event has been seen.
    this._sampleRate = null;
    this._channels = 1;
    this._buffer = [];
    this._bufferBytes = 0;
    this._flushTimer = null;
    this._closed = false;
    this._nextStartTime = 0;
  }

  /**
   * Locate an AudioContext constructor, or null when none is available.
   * @returns {?function}
   */
  static _audioContextCtor() {
    if (typeof window !== 'undefined') {
      return window.AudioContext || window.webkitAudioContext || null;
    }
    return (typeof globalThis !== 'undefined' && globalThis.AudioContext) || null;
  }

  /**
   * Handle a `tts.audio` event from the framework. Decodes the
   * base64 PCM s16le payload, appends it to the coalescing buffer,
   * and flushes when the chunk is final, the buffer is full, or the
   * coalescing window expires.
   *
   * Events of another type, events after `close()`, and payloads with an
   * encoding other than `pcm_s16le` are ignored (the latter with a warning).
   *
   * @param {object} event The event envelope as emitted by the framework.
   * @throws {Error} If no context was supplied and no AudioContext
   *   constructor is available.
   */
  onEvent(event) {
    if (this._closed) return;
    if (!event || event.type !== 'tts.audio') return;
    const payload = event.payload || {};
    if (payload.encoding && payload.encoding !== 'pcm_s16le') {
      // The framework only ships pcm_s16le today. Any other encoding
      // would need a different decoder; surface it loudly.
      console.warn('tts-audio-player: unsupported encoding', payload.encoding);
      return;
    }
    const sampleRate = payload.sample_rate || 24000;
    const channels = payload.channels || 1;
    this._ensureContext(sampleRate, channels);

    const isFinal = !!payload.final;
    if (!payload.data) {
      // A bare final marker still ends the utterance: play what is queued
      // now instead of waiting for the coalescing timer.
      if (isFinal) this._flush();
      return;
    }

    const bytes = _base64ToBytes(payload.data);
    this._buffer.push(bytes);
    this._bufferBytes += bytes.byteLength;
    if (isFinal || this._bufferBytes >= this._maxCoalesceBytes) {
      this._flush();
    } else {
      this._scheduleFlush();
    }
  }

  /** Flush any pending coalesced audio immediately. */
  flush() {
    this._flush();
  }

  /**
   * Stop accepting events, cancel the coalescing timer and drop queued
   * bytes. The AudioContext is left open, so already scheduled audio
   * keeps playing.
   */
  close() {
    this._closed = true;
    if (this._flushTimer) {
      clearTimeout(this._flushTimer);
      this._flushTimer = null;
    }
    this._buffer = [];
    this._bufferBytes = 0;
  }

  /**
   * Make sure a context exists and record the stream format.
   *
   * The context is never replaced: a buffer whose sample rate differs
   * from the context's is resampled on playback, so neither a rate nor a
   * channel-count change needs a new context. When the format changes,
   * bytes queued in the old format are scheduled first so they are not
   * decoded with the new layout.
   *
   * @param {number} sampleRate Stream sample rate in Hz.
   * @param {number} channels Stream channel count.
   * @throws {Error} If a context must be created and no constructor exists.
   */
  _ensureContext(sampleRate, channels) {
    if (!this._ctx) {
      const Ctor = TtsAudioPlayer._audioContextCtor();
      if (!Ctor) {
        throw new Error('tts-audio-player: no AudioContext constructor available');
      }
      try {
        this._ctx = new Ctor({ sampleRate: sampleRate });
      } catch (err) {
        // The requested rate (or the options bag) was rejected; fall back
        // to the device default and rely on per-buffer resampling.
        this._ctx = new Ctor();
      }
    }
    const formatChanged = this._sampleRate !== null
      && (this._sampleRate !== sampleRate || this._channels !== channels);
    if (formatChanged) {
      this._flush();
    }
    this._sampleRate = sampleRate;
    this._channels = channels;
  }

  /** Start the coalescing timer unless one is already pending. */
  _scheduleFlush() {
    if (this._flushTimer) return;
    this._flushTimer = setTimeout(() => this._flush(), this._coalesceMs);
  }

  /** Cancel the timer and schedule everything queued as one buffer. */
  _flush() {
    if (this._flushTimer) {
      clearTimeout(this._flushTimer);
      this._flushTimer = null;
    }
    if (!this._buffer.length || !this._ctx) {
      this._buffer = [];
      this._bufferBytes = 0;
      return;
    }
    const merged = _concatBytes(this._buffer);
    this._buffer = [];
    this._bufferBytes = 0;
    this._scheduleAudioBuffer(merged);
  }

  /**
   * Convert PCM s16le bytes to an AudioBuffer and queue it right after the
   * previously scheduled buffer.
   *
   * Multi-channel data is assumed to be frame-interleaved (L R L R ...),
   * the conventional PCM layout — confirm against the TTS provider if a
   * multi-channel voice is ever added. A trailing partial frame is dropped.
   *
   * @param {Uint8Array} bytes Raw PCM bytes in the current stream format.
   */
  _scheduleAudioBuffer(bytes) {
    const ctx = this._ctx;
    const channels = this._channels;
    const sampleRate = this._sampleRate || ctx.sampleRate;
    // 16-bit signed little-endian = 2 bytes per sample per channel.
    const frameCount = Math.floor(bytes.byteLength / (2 * channels));
    if (frameCount === 0) return;
    const audioBuffer = ctx.createBuffer(channels, frameCount, sampleRate);
    const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
    for (let ch = 0; ch < channels; ch++) {
      const channelData = audioBuffer.getChannelData(ch);
      for (let i = 0; i < frameCount; i++) {
        const sample = view.getInt16((i * channels + ch) * 2, true); // little-endian
        // Map -32768..32767 to -1.0..1.0; both endpoints preserved.
        channelData[i] = sample < 0 ? sample / 32768 : sample / 32767;
      }
    }
    const source = ctx.createBufferSource();
    source.buffer = audioBuffer;
    source.connect(ctx.destination);
    const now = ctx.currentTime;
    const startAt = Math.max(now, this._nextStartTime);
    source.start(startAt);
    this._nextStartTime = startAt + audioBuffer.duration;
  }
}
211
+
/**
 * Decode a base64 string into its raw bytes.
 *
 * Uses `atob` where it exists and falls back to `Buffer` otherwise.
 *
 * @param {string} b64 Base64-encoded data.
 * @returns {Uint8Array} The decoded bytes.
 */
function _base64ToBytes(b64) {
  let decoded;
  if (typeof atob === 'undefined') {
    decoded = Buffer.from(b64, 'base64').toString('binary');
  } else {
    decoded = atob(b64);
  }
  // Every character of a binary string is a single code unit in 0..255.
  return Uint8Array.from(decoded, (ch) => ch.charCodeAt(0));
}
221
+
/**
 * Join byte chunks into one contiguous array, preserving order.
 *
 * @param {Uint8Array[]} chunks Chunks to concatenate.
 * @returns {Uint8Array} A new array holding all bytes.
 */
function _concatBytes(chunks) {
  const size = chunks.reduce((sum, chunk) => sum + chunk.byteLength, 0);
  const merged = new Uint8Array(size);
  let cursor = 0;
  for (const chunk of chunks) {
    merged.set(chunk, cursor);
    cursor += chunk.byteLength;
  }
  return merged;
}
235
+
236
+ return { TtsAudioPlayer: TtsAudioPlayer };
237
+ });