@runtypelabs/persona 1.47.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/README.md +140 -8
  2. package/dist/index.cjs +90 -39
  3. package/dist/index.cjs.map +1 -1
  4. package/dist/index.d.cts +1093 -25
  5. package/dist/index.d.ts +1093 -25
  6. package/dist/index.global.js +111 -60
  7. package/dist/index.global.js.map +1 -1
  8. package/dist/index.js +90 -39
  9. package/dist/index.js.map +1 -1
  10. package/dist/install.global.js +1 -1
  11. package/dist/install.global.js.map +1 -1
  12. package/dist/widget.css +852 -505
  13. package/package.json +1 -1
  14. package/src/artifacts-session.test.ts +80 -0
  15. package/src/client.test.ts +20 -21
  16. package/src/client.ts +153 -4
  17. package/src/components/approval-bubble.ts +45 -42
  18. package/src/components/artifact-card.ts +91 -0
  19. package/src/components/artifact-pane.ts +501 -0
  20. package/src/components/composer-builder.ts +32 -27
  21. package/src/components/event-stream-view.ts +40 -40
  22. package/src/components/feedback.ts +36 -36
  23. package/src/components/forms.ts +11 -11
  24. package/src/components/header-builder.test.ts +32 -0
  25. package/src/components/header-builder.ts +55 -36
  26. package/src/components/header-layouts.ts +58 -125
  27. package/src/components/launcher.ts +36 -21
  28. package/src/components/message-bubble.ts +92 -65
  29. package/src/components/messages.ts +2 -2
  30. package/src/components/panel.ts +42 -11
  31. package/src/components/reasoning-bubble.ts +23 -23
  32. package/src/components/registry.ts +4 -0
  33. package/src/components/suggestions.ts +1 -1
  34. package/src/components/tool-bubble.ts +32 -32
  35. package/src/defaults.ts +30 -4
  36. package/src/index.ts +80 -2
  37. package/src/install.ts +22 -0
  38. package/src/plugins/types.ts +23 -0
  39. package/src/postprocessors.ts +2 -2
  40. package/src/runtime/host-layout.ts +174 -0
  41. package/src/runtime/init.test.ts +236 -0
  42. package/src/runtime/init.ts +114 -55
  43. package/src/session.ts +173 -7
  44. package/src/styles/tailwind.css +1 -1
  45. package/src/styles/widget.css +852 -505
  46. package/src/types/theme.ts +354 -0
  47. package/src/types.ts +348 -16
  48. package/src/ui.docked.test.ts +104 -0
  49. package/src/ui.ts +1093 -244
  50. package/src/utils/artifact-gate.test.ts +255 -0
  51. package/src/utils/artifact-gate.ts +142 -0
  52. package/src/utils/artifact-resize.test.ts +64 -0
  53. package/src/utils/artifact-resize.ts +67 -0
  54. package/src/utils/attachment-manager.ts +10 -10
  55. package/src/utils/code-generators.test.ts +52 -0
  56. package/src/utils/code-generators.ts +40 -36
  57. package/src/utils/dock.ts +17 -0
  58. package/src/utils/dom-context.test.ts +504 -0
  59. package/src/utils/dom-context.ts +896 -0
  60. package/src/utils/dom.ts +12 -1
  61. package/src/utils/message-fingerprint.test.ts +187 -0
  62. package/src/utils/message-fingerprint.ts +105 -0
  63. package/src/utils/migration.ts +179 -0
  64. package/src/utils/morph.ts +1 -1
  65. package/src/utils/plugins.ts +175 -0
  66. package/src/utils/positioning.ts +4 -4
  67. package/src/utils/theme.test.ts +125 -0
  68. package/src/utils/theme.ts +216 -60
  69. package/src/utils/tokens.ts +682 -0
  70. package/src/voice/audio-playback-manager.ts +187 -0
  71. package/src/voice/runtype-voice-provider.ts +305 -69
  72. package/src/voice/voice-activity-detector.ts +90 -0
  73. package/src/voice/voice.test.ts +6 -5
@@ -0,0 +1,187 @@
1
+ /**
2
+ * AudioPlaybackManager
3
+ *
4
+ * Manages streaming playback of PCM audio chunks via the Web Audio API.
5
+ * Receives raw PCM data (24 kHz, 16-bit signed little-endian, mono),
6
+ * converts to Float32 AudioBuffers, and schedules gap-free sequential
7
+ * playback using AudioBufferSourceNode.
8
+ *
9
+ * Works on all browsers including iOS Safari (no MediaSource dependency).
10
+ */
11
+ export class AudioPlaybackManager {
12
+ private ctx: AudioContext | null = null;
13
+ private nextStartTime = 0;
14
+ private activeSources: AudioBufferSourceNode[] = [];
15
+ private finishedCallbacks: (() => void)[] = [];
16
+ private playing = false;
17
+ private streamEnded = false;
18
+ private pendingCount = 0;
19
+
20
+ // PCM format constants
21
+ private readonly sampleRate: number;
22
+
23
+ // Remainder byte from a previous chunk when the chunk had an odd byte count.
24
+ // Network chunks don't respect 2-byte sample boundaries, so we carry over
25
+ // the orphaned byte and prepend it to the next chunk.
26
+ private remainder: Uint8Array | null = null;
27
+
28
+ constructor(sampleRate = 24000) {
29
+ this.sampleRate = sampleRate;
30
+ }
31
+
32
+ /**
33
+ * Ensure AudioContext is created and running.
34
+ * Must be called after a user gesture on iOS Safari.
35
+ */
36
+ private ensureContext(): AudioContext {
37
+ if (!this.ctx) {
38
+ const w = typeof window !== "undefined" ? (window as any) : undefined;
39
+ if (!w) throw new Error("AudioPlaybackManager requires a browser environment");
40
+ const AudioCtx = w.AudioContext || w.webkitAudioContext;
41
+ this.ctx = new AudioCtx({ sampleRate: this.sampleRate }) as AudioContext;
42
+ }
43
+ const ctx = this.ctx!;
44
+ // Resume if suspended (autoplay policy)
45
+ if (ctx.state === "suspended") {
46
+ ctx.resume();
47
+ }
48
+ return ctx;
49
+ }
50
+
51
+ /**
52
+ * Enqueue a PCM chunk for playback.
53
+ * @param pcmData Raw PCM bytes (16-bit signed LE mono)
54
+ */
55
+ enqueue(pcmData: Uint8Array): void {
56
+ if (pcmData.length === 0) return;
57
+
58
+ // Prepend any remainder byte from the previous chunk
59
+ let data = pcmData;
60
+ if (this.remainder) {
61
+ const merged = new Uint8Array(this.remainder.length + pcmData.length);
62
+ merged.set(this.remainder);
63
+ merged.set(pcmData, this.remainder.length);
64
+ data = merged;
65
+ this.remainder = null;
66
+ }
67
+
68
+ // If odd byte count, save the trailing byte for next chunk
69
+ if (data.length % 2 !== 0) {
70
+ this.remainder = new Uint8Array([data[data.length - 1]]);
71
+ data = data.subarray(0, data.length - 1);
72
+ }
73
+
74
+ if (data.length === 0) return;
75
+
76
+ const ctx = this.ensureContext();
77
+ const float32 = this.pcmToFloat32(data);
78
+
79
+ const buffer = ctx.createBuffer(1, float32.length, this.sampleRate);
80
+ buffer.getChannelData(0).set(float32);
81
+
82
+ const source = ctx.createBufferSource();
83
+ source.buffer = buffer;
84
+ source.connect(ctx.destination);
85
+
86
+ // Schedule gap-free playback
87
+ const now = ctx.currentTime;
88
+ if (this.nextStartTime < now) {
89
+ this.nextStartTime = now;
90
+ }
91
+ source.start(this.nextStartTime);
92
+ this.nextStartTime += buffer.duration;
93
+
94
+ this.activeSources.push(source);
95
+ this.pendingCount++;
96
+ this.playing = true;
97
+
98
+ source.onended = () => {
99
+ const idx = this.activeSources.indexOf(source);
100
+ if (idx !== -1) this.activeSources.splice(idx, 1);
101
+ this.pendingCount--;
102
+ this.checkFinished();
103
+ };
104
+ }
105
+
106
+ /**
107
+ * Signal that no more chunks will arrive.
108
+ * The onFinished callback fires after all queued audio has played.
109
+ */
110
+ markStreamEnd(): void {
111
+ this.streamEnded = true;
112
+ this.checkFinished();
113
+ }
114
+
115
+ /**
116
+ * Immediately stop all playback and discard queued audio.
117
+ */
118
+ flush(): void {
119
+ for (const source of this.activeSources) {
120
+ try {
121
+ source.stop();
122
+ source.disconnect();
123
+ } catch {
124
+ // Ignore errors from already-stopped sources
125
+ }
126
+ }
127
+ this.activeSources = [];
128
+ this.pendingCount = 0;
129
+ this.nextStartTime = 0;
130
+ this.playing = false;
131
+ this.streamEnded = false;
132
+ this.finishedCallbacks = [];
133
+ this.remainder = null;
134
+ }
135
+
136
+ /**
137
+ * Whether audio is currently playing or queued.
138
+ */
139
+ isPlaying(): boolean {
140
+ return this.playing;
141
+ }
142
+
143
+ /**
144
+ * Register a callback for when all queued audio finishes playing.
145
+ */
146
+ onFinished(callback: () => void): void {
147
+ this.finishedCallbacks.push(callback);
148
+ }
149
+
150
+ /**
151
+ * Clean up AudioContext resources.
152
+ */
153
+ async destroy(): Promise<void> {
154
+ this.flush();
155
+ if (this.ctx) {
156
+ await this.ctx.close();
157
+ this.ctx = null;
158
+ }
159
+ }
160
+
161
+ private checkFinished(): void {
162
+ if (this.streamEnded && this.pendingCount <= 0 && this.playing) {
163
+ this.playing = false;
164
+ this.streamEnded = false;
165
+ const cbs = this.finishedCallbacks.slice();
166
+ this.finishedCallbacks = [];
167
+ for (const cb of cbs) cb();
168
+ }
169
+ }
170
+
171
+ /**
172
+ * Convert 16-bit signed LE PCM to Float32 samples in [-1, 1].
173
+ */
174
+ private pcmToFloat32(pcmData: Uint8Array): Float32Array {
175
+ // 2 bytes per sample (16-bit)
176
+ const numSamples = Math.floor(pcmData.length / 2);
177
+ const float32 = new Float32Array(numSamples);
178
+ const view = new DataView(pcmData.buffer, pcmData.byteOffset, pcmData.byteLength);
179
+
180
+ for (let i = 0; i < numSamples; i++) {
181
+ const int16 = view.getInt16(i * 2, true); // little-endian
182
+ float32[i] = int16 / 32768;
183
+ }
184
+
185
+ return float32;
186
+ }
187
+ }
@@ -7,6 +7,8 @@ import type {
7
7
  VoiceStatus,
8
8
  VoiceConfig,
9
9
  } from "../types";
10
+ import { AudioPlaybackManager } from "./audio-playback-manager";
11
+ import { VoiceActivityDetector } from "./voice-activity-detector";
10
12
 
11
13
  export class RuntypeVoiceProvider implements VoiceProvider {
12
14
  type: "runtype" = "runtype";
@@ -20,15 +22,46 @@ export class RuntypeVoiceProvider implements VoiceProvider {
20
22
  private processingStartCallbacks: (() => void)[] = [];
21
23
  private audioChunks: Blob[] = [];
22
24
  private isProcessing = false;
25
+ private isSpeaking = false;
23
26
 
24
- // Silence detection
25
- private analyserNode: AnalyserNode | null = null;
27
+ // Voice activity detection (silence auto-stop + barge-in speech detection)
28
+ private vad = new VoiceActivityDetector();
26
29
  private mediaStream: MediaStream | null = null;
27
- private silenceCheckInterval: ReturnType<typeof setInterval> | null = null;
28
- private silenceStart: number | null = null;
30
+
31
+ // Cancellation / interruption support
32
+ private currentAudio: HTMLAudioElement | null = null;
33
+ private currentAudioUrl: string | null = null;
34
+ private currentRequestId: string | null = null;
35
+ private interruptionMode: "none" | "cancel" | "barge-in" = "none";
36
+
37
+ // Streaming audio playback (PCM chunks)
38
+ private playbackManager: AudioPlaybackManager | null = null;
29
39
 
30
40
  constructor(private config: VoiceConfig["runtype"]) {}
31
41
 
42
+ /** Returns the current interruption mode received from the server */
43
+ getInterruptionMode(): "none" | "cancel" | "barge-in" {
44
+ return this.interruptionMode;
45
+ }
46
+
47
+ /** Returns true if the barge-in mic stream is alive (hot mic between turns) */
48
+ isBargeInActive(): boolean {
49
+ return this.interruptionMode === "barge-in" && this.mediaStream !== null;
50
+ }
51
+
52
+ /** Tear down the barge-in mic pipeline — "hang up" the always-on mic */
53
+ async deactivateBargeIn(): Promise<void> {
54
+ this.vad.stop();
55
+ if (this.mediaStream) {
56
+ this.mediaStream.getTracks().forEach((track) => track.stop());
57
+ this.mediaStream = null;
58
+ }
59
+ if (this.audioContext) {
60
+ await this.audioContext.close();
61
+ this.audioContext = null;
62
+ }
63
+ }
64
+
32
65
  async connect() {
33
66
  if (this.ws && this.ws.readyState === WebSocket.OPEN) {
34
67
  return; // Already connected
@@ -113,6 +146,11 @@ export class RuntypeVoiceProvider implements VoiceProvider {
113
146
  { once: true }
114
147
  );
115
148
  });
149
+
150
+ // Send a ping immediately so the server replies with session_config
151
+ // (which includes interruptionMode). This ensures the client knows
152
+ // about barge-in mode before the first recording starts.
153
+ this.sendHeartbeat();
116
154
  } catch (error) {
117
155
  this.ws = null;
118
156
  this.errorCallbacks.forEach((cb) => cb(error as Error));
@@ -137,7 +175,17 @@ export class RuntypeVoiceProvider implements VoiceProvider {
137
175
  this.statusCallbacks.forEach((cb) => cb("error"));
138
176
  };
139
177
 
178
+ // Receive binary frames for streaming audio (set binaryType to arraybuffer)
179
+ this.ws.binaryType = "arraybuffer";
180
+
140
181
  this.ws.onmessage = (event) => {
182
+ // Binary frame = raw PCM audio chunk for streaming playback
183
+ if (event.data instanceof ArrayBuffer) {
184
+ this.handleAudioChunk(new Uint8Array(event.data));
185
+ return;
186
+ }
187
+
188
+ // Text frame = JSON control message
141
189
  try {
142
190
  const message = JSON.parse(event.data);
143
191
  this.handleWebSocketMessage(message);
@@ -151,15 +199,16 @@ export class RuntypeVoiceProvider implements VoiceProvider {
151
199
 
152
200
  private handleWebSocketMessage(message: any) {
153
201
  switch (message.type) {
154
- case "voice_response":
155
- // Play TTS audio if present
156
- if (message.response.audio?.base64) {
157
- this.playAudio(message.response.audio).catch((err) =>
158
- this.errorCallbacks.forEach((cb) => cb(err instanceof Error ? err : new Error(String(err)))),
159
- );
202
+ case "session_config":
203
+ // Server sends voice settings on session init
204
+ if (message.interruptionMode) {
205
+ this.interruptionMode = message.interruptionMode;
160
206
  }
161
- // Use agentResponseText (the agent's reply) as the text result,
162
- // falling back to transcript (user's STT input) for backwards compat
207
+ break;
208
+
209
+ case "voice_response":
210
+ // Deliver text result immediately
211
+ this.isProcessing = false;
163
212
  this.resultCallbacks.forEach((cb) =>
164
213
  cb({
165
214
  text: message.response.agentResponseText || message.response.transcript,
@@ -169,8 +218,39 @@ export class RuntypeVoiceProvider implements VoiceProvider {
169
218
  provider: "runtype",
170
219
  }),
171
220
  );
221
+
222
+ // Batch path: play TTS audio if present in the response (backward compat)
223
+ if (message.response.audio?.base64) {
224
+ this.isSpeaking = true;
225
+ this.statusCallbacks.forEach((cb) => cb("speaking"));
226
+ this.playAudio(message.response.audio).catch((err) =>
227
+ this.errorCallbacks.forEach((cb) => cb(err instanceof Error ? err : new Error(String(err)))),
228
+ );
229
+ } else if (!message.response.audio?.base64) {
230
+ // Streaming path: text-only voice_response — audio will arrive as
231
+ // binary chunks followed by audio_end. Transition to speaking state
232
+ // once the first audio chunk arrives (see handleAudioChunk).
233
+ // Stay in processing state until then.
234
+ }
235
+ break;
236
+
237
+ case "audio_end":
238
+ // Guard: discard late audio_end from a cancelled request
239
+ if (message.requestId && message.requestId !== this.currentRequestId) break;
240
+ // All PCM chunks have been sent — signal the playback manager
241
+ if (this.playbackManager) {
242
+ this.playbackManager.markStreamEnd();
243
+ } else {
244
+ // No audio chunks arrived — go idle
245
+ this.isSpeaking = false;
246
+ this.isProcessing = false;
247
+ this.statusCallbacks.forEach((cb) => cb("idle"));
248
+ }
249
+ break;
250
+
251
+ case "cancelled":
252
+ // Server acknowledged cancellation — discard any late-arriving responses
172
253
  this.isProcessing = false;
173
- this.statusCallbacks.forEach((cb) => cb("idle"));
174
254
  break;
175
255
 
176
256
  case "error":
@@ -185,54 +265,119 @@ export class RuntypeVoiceProvider implements VoiceProvider {
185
265
  }
186
266
  }
187
267
 
268
+ /**
269
+ * Handle a binary audio chunk (raw PCM 24kHz 16-bit LE) for streaming playback.
270
+ */
271
+ private handleAudioChunk(pcmData: Uint8Array): void {
272
+ if (pcmData.length === 0) return;
273
+ if (!this.currentRequestId) return; // discard late chunks after cancel
274
+
275
+ // Lazily create playback manager on first chunk
276
+ if (!this.playbackManager) {
277
+ this.playbackManager = new AudioPlaybackManager(24000);
278
+ this.playbackManager.onFinished(() => {
279
+ this.isSpeaking = false;
280
+ this.playbackManager = null;
281
+ this.vad.stop(); // stop speech monitoring — audio ended naturally
282
+ this.statusCallbacks.forEach((cb) => cb("idle"));
283
+ });
284
+ }
285
+
286
+ // Transition to speaking on first chunk
287
+ if (!this.isSpeaking) {
288
+ this.isSpeaking = true;
289
+ this.statusCallbacks.forEach((cb) => cb("speaking"));
290
+ this.startBargeInMonitoring().catch(() => {}); // no-op if not barge-in mode
291
+ }
292
+
293
+ this.playbackManager.enqueue(pcmData);
294
+ }
295
+
296
+ /**
297
+ * Stop playback / cancel in-flight request and return to idle.
298
+ * This is the public "stop only" action — does NOT start recording.
299
+ */
300
+ stopPlayback(): void {
301
+ if (!this.isProcessing && !this.isSpeaking) return;
302
+ this.cancelCurrentPlayback();
303
+ this.statusCallbacks.forEach((cb) => cb("idle"));
304
+ }
305
+
306
+ /**
307
+ * Cancel the current playback and in-flight server request.
308
+ * Internal helper — does NOT fire status callbacks (caller decides next state).
309
+ */
310
+ private cancelCurrentPlayback(): void {
311
+ // Stop batch playback (Audio element)
312
+ if (this.currentAudio) {
313
+ this.currentAudio.pause();
314
+ this.currentAudio.src = "";
315
+ this.currentAudio = null;
316
+ }
317
+ if (this.currentAudioUrl) {
318
+ URL.revokeObjectURL(this.currentAudioUrl);
319
+ this.currentAudioUrl = null;
320
+ }
321
+
322
+ // Stop streaming playback (AudioPlaybackManager)
323
+ if (this.playbackManager) {
324
+ this.playbackManager.flush();
325
+ this.playbackManager = null;
326
+ }
327
+
328
+ // Tell server to abort the in-flight request
329
+ if (this.currentRequestId && this.ws && this.ws.readyState === WebSocket.OPEN) {
330
+ this.ws.send(
331
+ JSON.stringify({
332
+ type: "cancel",
333
+ requestId: this.currentRequestId,
334
+ }),
335
+ );
336
+ }
337
+
338
+ this.currentRequestId = null;
339
+ this.isProcessing = false;
340
+ this.isSpeaking = false;
341
+ }
342
+
188
343
  async startListening() {
189
344
  try {
190
- if (this.isProcessing) {
191
- throw new Error("Already processing audio");
345
+ if (this.isProcessing || this.isSpeaking) {
346
+ // If interruption is enabled, cancel current playback and proceed
347
+ if (this.interruptionMode !== "none") {
348
+ this.cancelCurrentPlayback();
349
+ } else {
350
+ // Mode is "none" — block mic while processing or speaking
351
+ return;
352
+ }
192
353
  }
193
354
 
194
- const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
195
- this.mediaStream = stream;
355
+ // Reuse existing mic stream in barge-in mode (mic stays hot)
356
+ if (!this.mediaStream) {
357
+ const constraints =
358
+ this.interruptionMode === "barge-in"
359
+ ? { audio: { echoCancellation: true, noiseSuppression: true } }
360
+ : { audio: true };
361
+ this.mediaStream = await navigator.mediaDevices.getUserMedia(constraints);
362
+ }
196
363
  const w = this.w!;
197
- this.audioContext = new (w.AudioContext || w.webkitAudioContext)();
198
-
199
- // Set up silence detection via AnalyserNode
200
- const audioCtx = this.audioContext!;
201
- const source = audioCtx.createMediaStreamSource(stream);
202
- this.analyserNode = audioCtx.createAnalyser();
203
- this.analyserNode.fftSize = 2048;
204
- source.connect(this.analyserNode);
364
+ if (!this.audioContext) {
365
+ this.audioContext = new (w.AudioContext || w.webkitAudioContext)();
366
+ }
367
+ const audioContext = this.audioContext!;
205
368
 
369
+ // VAD-based silence detection — fires once when user stops talking
206
370
  const pauseDuration = this.config?.pauseDuration ?? 2000;
207
371
  const silenceThreshold = this.config?.silenceThreshold ?? 0.01;
208
- this.silenceStart = null;
209
-
210
- const dataArray = new Float32Array(this.analyserNode.fftSize);
211
- this.silenceCheckInterval = setInterval(() => {
212
- if (!this.analyserNode) return;
213
- this.analyserNode.getFloatTimeDomainData(dataArray);
214
-
215
- // Compute RMS volume
216
- let sum = 0;
217
- for (let i = 0; i < dataArray.length; i++) {
218
- sum += dataArray[i] * dataArray[i];
219
- }
220
- const rms = Math.sqrt(sum / dataArray.length);
221
-
222
- if (rms < silenceThreshold) {
223
- if (this.silenceStart === null) {
224
- this.silenceStart = Date.now();
225
- } else if (Date.now() - this.silenceStart >= pauseDuration) {
226
- // Silence exceeded threshold — auto-stop
227
- this.stopListening();
228
- }
229
- } else {
230
- // Sound detected — reset silence timer
231
- this.silenceStart = null;
232
- }
233
- }, 100);
372
+ this.vad.start(
373
+ audioContext,
374
+ this.mediaStream,
375
+ "silence",
376
+ { threshold: silenceThreshold, duration: pauseDuration },
377
+ () => this.stopListening(),
378
+ );
234
379
 
235
- this.mediaRecorder = new MediaRecorder(stream);
380
+ this.mediaRecorder = new MediaRecorder(this.mediaStream);
236
381
  this.audioChunks = [];
237
382
 
238
383
  this.mediaRecorder.ondataavailable = (event) => {
@@ -265,32 +410,78 @@ export class RuntypeVoiceProvider implements VoiceProvider {
265
410
  }
266
411
 
267
412
  async stopListening() {
268
- // Clean up silence detection
269
- if (this.silenceCheckInterval) {
270
- clearInterval(this.silenceCheckInterval);
271
- this.silenceCheckInterval = null;
272
- }
273
- this.analyserNode = null;
274
- this.silenceStart = null;
413
+ this.vad.stop();
275
414
 
276
415
  if (this.mediaRecorder) {
416
+ if (this.interruptionMode !== "barge-in") {
417
+ this.mediaRecorder.stream.getTracks().forEach((track) => track.stop());
418
+ }
277
419
  this.mediaRecorder.stop();
278
- this.mediaRecorder.stream.getTracks().forEach((track) => track.stop());
279
420
  this.mediaRecorder = null;
280
421
  }
281
422
 
282
- // Clean up media stream reference
283
- if (this.mediaStream) {
284
- this.mediaStream.getTracks().forEach((track) => track.stop());
285
- this.mediaStream = null;
423
+ // Only tear down mic pipeline in non-barge-in modes
424
+ if (this.interruptionMode !== "barge-in") {
425
+ if (this.mediaStream) {
426
+ this.mediaStream.getTracks().forEach((track) => track.stop());
427
+ this.mediaStream = null;
428
+ }
429
+ if (this.audioContext) {
430
+ await this.audioContext.close();
431
+ this.audioContext = null;
432
+ }
286
433
  }
287
434
 
288
- if (this.audioContext) {
289
- await this.audioContext.close();
290
- this.audioContext = null;
435
+ this.statusCallbacks.forEach((cb) => cb("idle"));
436
+ }
437
+
438
+ /**
439
+ * Start VAD in speech mode during agent playback — detects when the user
440
+ * starts talking so we can interrupt (barge-in). No-op in other modes.
441
+ * Acquires mic if needed (e.g., first response where stopListening tore it down).
442
+ */
443
+ private async startBargeInMonitoring(): Promise<void> {
444
+ if (this.interruptionMode !== "barge-in") return;
445
+
446
+ // Acquire mic pipeline if not already available (first response scenario)
447
+ const w = this.w;
448
+ if (!this.mediaStream && w) {
449
+ this.mediaStream = await navigator.mediaDevices.getUserMedia({
450
+ audio: { echoCancellation: true, noiseSuppression: true },
451
+ });
452
+ }
453
+ if (!this.audioContext && w) {
454
+ this.audioContext = new (w.AudioContext || w.webkitAudioContext)();
291
455
  }
456
+ if (!this.audioContext || !this.mediaStream) return;
457
+
458
+ const audioContext = this.audioContext!;
459
+ const speechThreshold = this.config?.silenceThreshold ?? 0.01;
460
+ const speechDebounce = 200; // 200ms sustained sound = real speech, not echo blip
461
+
462
+ this.vad.start(
463
+ audioContext,
464
+ this.mediaStream,
465
+ "speech",
466
+ { threshold: speechThreshold, duration: speechDebounce },
467
+ () => this.handleBargeIn(),
468
+ );
469
+ }
292
470
 
293
- this.statusCallbacks.forEach((cb) => cb("idle"));
471
+ /**
472
+ * Handle a barge-in event: cancel playback and immediately start recording.
473
+ */
474
+ private handleBargeIn(): void {
475
+ this.cancelCurrentPlayback();
476
+ this.startListening().catch((err) => {
477
+ this.errorCallbacks.forEach((cb) =>
478
+ cb(err instanceof Error ? err : new Error(String(err))),
479
+ );
480
+ });
481
+ }
482
+
483
+ private generateRequestId(): string {
484
+ return "vreq_" + Math.random().toString(36).substring(2, 10) + Date.now().toString(36);
294
485
  }
295
486
 
296
487
  private async sendAudio(audioBlob: Blob) {
@@ -305,6 +496,8 @@ export class RuntypeVoiceProvider implements VoiceProvider {
305
496
  try {
306
497
  const base64Audio = await this.blobToBase64(audioBlob);
307
498
  const format = this.getFormatFromMimeType(audioBlob.type);
499
+ const requestId = this.generateRequestId();
500
+ this.currentRequestId = requestId;
308
501
 
309
502
  this.ws.send(
310
503
  JSON.stringify({
@@ -313,6 +506,7 @@ export class RuntypeVoiceProvider implements VoiceProvider {
313
506
  format,
314
507
  sampleRate: 16000,
315
508
  voiceId: this.config?.voiceId,
509
+ requestId,
316
510
  }),
317
511
  );
318
512
  } catch (error) {
@@ -357,7 +551,20 @@ export class RuntypeVoiceProvider implements VoiceProvider {
357
551
  const blob = new Blob([bytes], { type: mimeType });
358
552
  const url = URL.createObjectURL(blob);
359
553
  const audioEl = new Audio(url);
360
- audioEl.onended = () => URL.revokeObjectURL(url);
554
+
555
+ // Store references so playback can be cancelled
556
+ this.currentAudio = audioEl;
557
+ this.currentAudioUrl = url;
558
+
559
+ audioEl.onended = () => {
560
+ URL.revokeObjectURL(url);
561
+ if (this.currentAudio === audioEl) {
562
+ this.currentAudio = null;
563
+ this.currentAudioUrl = null;
564
+ this.isSpeaking = false;
565
+ this.statusCallbacks.forEach((cb) => cb("idle"));
566
+ }
567
+ };
361
568
  await audioEl.play();
362
569
  }
363
570
 
@@ -378,8 +585,37 @@ export class RuntypeVoiceProvider implements VoiceProvider {
378
585
  }
379
586
 
380
587
  async disconnect(): Promise<void> {
588
+ // Stop any playing audio (batch)
589
+ if (this.currentAudio) {
590
+ this.currentAudio.pause();
591
+ this.currentAudio.src = "";
592
+ this.currentAudio = null;
593
+ }
594
+ if (this.currentAudioUrl) {
595
+ URL.revokeObjectURL(this.currentAudioUrl);
596
+ this.currentAudioUrl = null;
597
+ }
598
+ // Stop streaming playback
599
+ if (this.playbackManager) {
600
+ await this.playbackManager.destroy();
601
+ this.playbackManager = null;
602
+ }
603
+ this.currentRequestId = null;
604
+ this.isSpeaking = false;
605
+
606
+ this.vad.stop();
381
607
  await this.stopListening();
382
608
 
609
+ // Force mic teardown (barge-in mode skips this in stopListening)
610
+ if (this.mediaStream) {
611
+ this.mediaStream.getTracks().forEach((track) => track.stop());
612
+ this.mediaStream = null;
613
+ }
614
+ if (this.audioContext) {
615
+ await this.audioContext.close();
616
+ this.audioContext = null;
617
+ }
618
+
383
619
  if (this.ws) {
384
620
  try {
385
621
  this.ws.close();