@runtypelabs/persona 1.47.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -8
- package/dist/index.cjs +90 -39
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1093 -25
- package/dist/index.d.ts +1093 -25
- package/dist/index.global.js +111 -60
- package/dist/index.global.js.map +1 -1
- package/dist/index.js +90 -39
- package/dist/index.js.map +1 -1
- package/dist/install.global.js +1 -1
- package/dist/install.global.js.map +1 -1
- package/dist/widget.css +852 -505
- package/package.json +1 -1
- package/src/artifacts-session.test.ts +80 -0
- package/src/client.test.ts +20 -21
- package/src/client.ts +153 -4
- package/src/components/approval-bubble.ts +45 -42
- package/src/components/artifact-card.ts +91 -0
- package/src/components/artifact-pane.ts +501 -0
- package/src/components/composer-builder.ts +32 -27
- package/src/components/event-stream-view.ts +40 -40
- package/src/components/feedback.ts +36 -36
- package/src/components/forms.ts +11 -11
- package/src/components/header-builder.test.ts +32 -0
- package/src/components/header-builder.ts +55 -36
- package/src/components/header-layouts.ts +58 -125
- package/src/components/launcher.ts +36 -21
- package/src/components/message-bubble.ts +92 -65
- package/src/components/messages.ts +2 -2
- package/src/components/panel.ts +42 -11
- package/src/components/reasoning-bubble.ts +23 -23
- package/src/components/registry.ts +4 -0
- package/src/components/suggestions.ts +1 -1
- package/src/components/tool-bubble.ts +32 -32
- package/src/defaults.ts +30 -4
- package/src/index.ts +80 -2
- package/src/install.ts +22 -0
- package/src/plugins/types.ts +23 -0
- package/src/postprocessors.ts +2 -2
- package/src/runtime/host-layout.ts +174 -0
- package/src/runtime/init.test.ts +236 -0
- package/src/runtime/init.ts +114 -55
- package/src/session.ts +173 -7
- package/src/styles/tailwind.css +1 -1
- package/src/styles/widget.css +852 -505
- package/src/types/theme.ts +354 -0
- package/src/types.ts +348 -16
- package/src/ui.docked.test.ts +104 -0
- package/src/ui.ts +1093 -244
- package/src/utils/artifact-gate.test.ts +255 -0
- package/src/utils/artifact-gate.ts +142 -0
- package/src/utils/artifact-resize.test.ts +64 -0
- package/src/utils/artifact-resize.ts +67 -0
- package/src/utils/attachment-manager.ts +10 -10
- package/src/utils/code-generators.test.ts +52 -0
- package/src/utils/code-generators.ts +40 -36
- package/src/utils/dock.ts +17 -0
- package/src/utils/dom-context.test.ts +504 -0
- package/src/utils/dom-context.ts +896 -0
- package/src/utils/dom.ts +12 -1
- package/src/utils/message-fingerprint.test.ts +187 -0
- package/src/utils/message-fingerprint.ts +105 -0
- package/src/utils/migration.ts +179 -0
- package/src/utils/morph.ts +1 -1
- package/src/utils/plugins.ts +175 -0
- package/src/utils/positioning.ts +4 -4
- package/src/utils/theme.test.ts +125 -0
- package/src/utils/theme.ts +216 -60
- package/src/utils/tokens.ts +682 -0
- package/src/voice/audio-playback-manager.ts +187 -0
- package/src/voice/runtype-voice-provider.ts +305 -69
- package/src/voice/voice-activity-detector.ts +90 -0
- package/src/voice/voice.test.ts +6 -5
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AudioPlaybackManager
|
|
3
|
+
*
|
|
4
|
+
* Manages streaming playback of PCM audio chunks via the Web Audio API.
|
|
5
|
+
* Receives raw PCM data (24 kHz, 16-bit signed little-endian, mono),
|
|
6
|
+
* converts to Float32 AudioBuffers, and schedules gap-free sequential
|
|
7
|
+
* playback using AudioBufferSourceNode.
|
|
8
|
+
*
|
|
9
|
+
* Works on all browsers including iOS Safari (no MediaSource dependency).
|
|
10
|
+
*/
|
|
11
|
+
export class AudioPlaybackManager {
|
|
12
|
+
private ctx: AudioContext | null = null;
|
|
13
|
+
private nextStartTime = 0;
|
|
14
|
+
private activeSources: AudioBufferSourceNode[] = [];
|
|
15
|
+
private finishedCallbacks: (() => void)[] = [];
|
|
16
|
+
private playing = false;
|
|
17
|
+
private streamEnded = false;
|
|
18
|
+
private pendingCount = 0;
|
|
19
|
+
|
|
20
|
+
// PCM format constants
|
|
21
|
+
private readonly sampleRate: number;
|
|
22
|
+
|
|
23
|
+
// Remainder byte from a previous chunk when the chunk had an odd byte count.
|
|
24
|
+
// Network chunks don't respect 2-byte sample boundaries, so we carry over
|
|
25
|
+
// the orphaned byte and prepend it to the next chunk.
|
|
26
|
+
private remainder: Uint8Array | null = null;
|
|
27
|
+
|
|
28
|
+
constructor(sampleRate = 24000) {
|
|
29
|
+
this.sampleRate = sampleRate;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
/**
|
|
33
|
+
* Ensure AudioContext is created and running.
|
|
34
|
+
* Must be called after a user gesture on iOS Safari.
|
|
35
|
+
*/
|
|
36
|
+
private ensureContext(): AudioContext {
|
|
37
|
+
if (!this.ctx) {
|
|
38
|
+
const w = typeof window !== "undefined" ? (window as any) : undefined;
|
|
39
|
+
if (!w) throw new Error("AudioPlaybackManager requires a browser environment");
|
|
40
|
+
const AudioCtx = w.AudioContext || w.webkitAudioContext;
|
|
41
|
+
this.ctx = new AudioCtx({ sampleRate: this.sampleRate }) as AudioContext;
|
|
42
|
+
}
|
|
43
|
+
const ctx = this.ctx!;
|
|
44
|
+
// Resume if suspended (autoplay policy)
|
|
45
|
+
if (ctx.state === "suspended") {
|
|
46
|
+
ctx.resume();
|
|
47
|
+
}
|
|
48
|
+
return ctx;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* Enqueue a PCM chunk for playback.
|
|
53
|
+
* @param pcmData Raw PCM bytes (16-bit signed LE mono)
|
|
54
|
+
*/
|
|
55
|
+
enqueue(pcmData: Uint8Array): void {
|
|
56
|
+
if (pcmData.length === 0) return;
|
|
57
|
+
|
|
58
|
+
// Prepend any remainder byte from the previous chunk
|
|
59
|
+
let data = pcmData;
|
|
60
|
+
if (this.remainder) {
|
|
61
|
+
const merged = new Uint8Array(this.remainder.length + pcmData.length);
|
|
62
|
+
merged.set(this.remainder);
|
|
63
|
+
merged.set(pcmData, this.remainder.length);
|
|
64
|
+
data = merged;
|
|
65
|
+
this.remainder = null;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
// If odd byte count, save the trailing byte for next chunk
|
|
69
|
+
if (data.length % 2 !== 0) {
|
|
70
|
+
this.remainder = new Uint8Array([data[data.length - 1]]);
|
|
71
|
+
data = data.subarray(0, data.length - 1);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
if (data.length === 0) return;
|
|
75
|
+
|
|
76
|
+
const ctx = this.ensureContext();
|
|
77
|
+
const float32 = this.pcmToFloat32(data);
|
|
78
|
+
|
|
79
|
+
const buffer = ctx.createBuffer(1, float32.length, this.sampleRate);
|
|
80
|
+
buffer.getChannelData(0).set(float32);
|
|
81
|
+
|
|
82
|
+
const source = ctx.createBufferSource();
|
|
83
|
+
source.buffer = buffer;
|
|
84
|
+
source.connect(ctx.destination);
|
|
85
|
+
|
|
86
|
+
// Schedule gap-free playback
|
|
87
|
+
const now = ctx.currentTime;
|
|
88
|
+
if (this.nextStartTime < now) {
|
|
89
|
+
this.nextStartTime = now;
|
|
90
|
+
}
|
|
91
|
+
source.start(this.nextStartTime);
|
|
92
|
+
this.nextStartTime += buffer.duration;
|
|
93
|
+
|
|
94
|
+
this.activeSources.push(source);
|
|
95
|
+
this.pendingCount++;
|
|
96
|
+
this.playing = true;
|
|
97
|
+
|
|
98
|
+
source.onended = () => {
|
|
99
|
+
const idx = this.activeSources.indexOf(source);
|
|
100
|
+
if (idx !== -1) this.activeSources.splice(idx, 1);
|
|
101
|
+
this.pendingCount--;
|
|
102
|
+
this.checkFinished();
|
|
103
|
+
};
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
/**
|
|
107
|
+
* Signal that no more chunks will arrive.
|
|
108
|
+
* The onFinished callback fires after all queued audio has played.
|
|
109
|
+
*/
|
|
110
|
+
markStreamEnd(): void {
|
|
111
|
+
this.streamEnded = true;
|
|
112
|
+
this.checkFinished();
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
/**
|
|
116
|
+
* Immediately stop all playback and discard queued audio.
|
|
117
|
+
*/
|
|
118
|
+
flush(): void {
|
|
119
|
+
for (const source of this.activeSources) {
|
|
120
|
+
try {
|
|
121
|
+
source.stop();
|
|
122
|
+
source.disconnect();
|
|
123
|
+
} catch {
|
|
124
|
+
// Ignore errors from already-stopped sources
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
this.activeSources = [];
|
|
128
|
+
this.pendingCount = 0;
|
|
129
|
+
this.nextStartTime = 0;
|
|
130
|
+
this.playing = false;
|
|
131
|
+
this.streamEnded = false;
|
|
132
|
+
this.finishedCallbacks = [];
|
|
133
|
+
this.remainder = null;
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
/**
|
|
137
|
+
* Whether audio is currently playing or queued.
|
|
138
|
+
*/
|
|
139
|
+
isPlaying(): boolean {
|
|
140
|
+
return this.playing;
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
/**
|
|
144
|
+
* Register a callback for when all queued audio finishes playing.
|
|
145
|
+
*/
|
|
146
|
+
onFinished(callback: () => void): void {
|
|
147
|
+
this.finishedCallbacks.push(callback);
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
/**
|
|
151
|
+
* Clean up AudioContext resources.
|
|
152
|
+
*/
|
|
153
|
+
async destroy(): Promise<void> {
|
|
154
|
+
this.flush();
|
|
155
|
+
if (this.ctx) {
|
|
156
|
+
await this.ctx.close();
|
|
157
|
+
this.ctx = null;
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
private checkFinished(): void {
|
|
162
|
+
if (this.streamEnded && this.pendingCount <= 0 && this.playing) {
|
|
163
|
+
this.playing = false;
|
|
164
|
+
this.streamEnded = false;
|
|
165
|
+
const cbs = this.finishedCallbacks.slice();
|
|
166
|
+
this.finishedCallbacks = [];
|
|
167
|
+
for (const cb of cbs) cb();
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
/**
|
|
172
|
+
* Convert 16-bit signed LE PCM to Float32 samples in [-1, 1].
|
|
173
|
+
*/
|
|
174
|
+
private pcmToFloat32(pcmData: Uint8Array): Float32Array {
|
|
175
|
+
// 2 bytes per sample (16-bit)
|
|
176
|
+
const numSamples = Math.floor(pcmData.length / 2);
|
|
177
|
+
const float32 = new Float32Array(numSamples);
|
|
178
|
+
const view = new DataView(pcmData.buffer, pcmData.byteOffset, pcmData.byteLength);
|
|
179
|
+
|
|
180
|
+
for (let i = 0; i < numSamples; i++) {
|
|
181
|
+
const int16 = view.getInt16(i * 2, true); // little-endian
|
|
182
|
+
float32[i] = int16 / 32768;
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
return float32;
|
|
186
|
+
}
|
|
187
|
+
}
|
|
@@ -7,6 +7,8 @@ import type {
|
|
|
7
7
|
VoiceStatus,
|
|
8
8
|
VoiceConfig,
|
|
9
9
|
} from "../types";
|
|
10
|
+
import { AudioPlaybackManager } from "./audio-playback-manager";
|
|
11
|
+
import { VoiceActivityDetector } from "./voice-activity-detector";
|
|
10
12
|
|
|
11
13
|
export class RuntypeVoiceProvider implements VoiceProvider {
|
|
12
14
|
type: "runtype" = "runtype";
|
|
@@ -20,15 +22,46 @@ export class RuntypeVoiceProvider implements VoiceProvider {
|
|
|
20
22
|
private processingStartCallbacks: (() => void)[] = [];
|
|
21
23
|
private audioChunks: Blob[] = [];
|
|
22
24
|
private isProcessing = false;
|
|
25
|
+
private isSpeaking = false;
|
|
23
26
|
|
|
24
|
-
//
|
|
25
|
-
private
|
|
27
|
+
// Voice activity detection (silence auto-stop + barge-in speech detection)
|
|
28
|
+
private vad = new VoiceActivityDetector();
|
|
26
29
|
private mediaStream: MediaStream | null = null;
|
|
27
|
-
|
|
28
|
-
|
|
30
|
+
|
|
31
|
+
// Cancellation / interruption support
|
|
32
|
+
private currentAudio: HTMLAudioElement | null = null;
|
|
33
|
+
private currentAudioUrl: string | null = null;
|
|
34
|
+
private currentRequestId: string | null = null;
|
|
35
|
+
private interruptionMode: "none" | "cancel" | "barge-in" = "none";
|
|
36
|
+
|
|
37
|
+
// Streaming audio playback (PCM chunks)
|
|
38
|
+
private playbackManager: AudioPlaybackManager | null = null;
|
|
29
39
|
|
|
30
40
|
constructor(private config: VoiceConfig["runtype"]) {}
|
|
31
41
|
|
|
42
|
+
/** Returns the current interruption mode received from the server */
|
|
43
|
+
getInterruptionMode(): "none" | "cancel" | "barge-in" {
|
|
44
|
+
return this.interruptionMode;
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
/** Returns true if the barge-in mic stream is alive (hot mic between turns) */
|
|
48
|
+
isBargeInActive(): boolean {
|
|
49
|
+
return this.interruptionMode === "barge-in" && this.mediaStream !== null;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/** Tear down the barge-in mic pipeline — "hang up" the always-on mic */
|
|
53
|
+
async deactivateBargeIn(): Promise<void> {
|
|
54
|
+
this.vad.stop();
|
|
55
|
+
if (this.mediaStream) {
|
|
56
|
+
this.mediaStream.getTracks().forEach((track) => track.stop());
|
|
57
|
+
this.mediaStream = null;
|
|
58
|
+
}
|
|
59
|
+
if (this.audioContext) {
|
|
60
|
+
await this.audioContext.close();
|
|
61
|
+
this.audioContext = null;
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
|
|
32
65
|
async connect() {
|
|
33
66
|
if (this.ws && this.ws.readyState === WebSocket.OPEN) {
|
|
34
67
|
return; // Already connected
|
|
@@ -113,6 +146,11 @@ export class RuntypeVoiceProvider implements VoiceProvider {
|
|
|
113
146
|
{ once: true }
|
|
114
147
|
);
|
|
115
148
|
});
|
|
149
|
+
|
|
150
|
+
// Send a ping immediately so the server replies with session_config
|
|
151
|
+
// (which includes interruptionMode). This ensures the client knows
|
|
152
|
+
// about barge-in mode before the first recording starts.
|
|
153
|
+
this.sendHeartbeat();
|
|
116
154
|
} catch (error) {
|
|
117
155
|
this.ws = null;
|
|
118
156
|
this.errorCallbacks.forEach((cb) => cb(error as Error));
|
|
@@ -137,7 +175,17 @@ export class RuntypeVoiceProvider implements VoiceProvider {
|
|
|
137
175
|
this.statusCallbacks.forEach((cb) => cb("error"));
|
|
138
176
|
};
|
|
139
177
|
|
|
178
|
+
// Receive binary frames for streaming audio (set binaryType to arraybuffer)
|
|
179
|
+
this.ws.binaryType = "arraybuffer";
|
|
180
|
+
|
|
140
181
|
this.ws.onmessage = (event) => {
|
|
182
|
+
// Binary frame = raw PCM audio chunk for streaming playback
|
|
183
|
+
if (event.data instanceof ArrayBuffer) {
|
|
184
|
+
this.handleAudioChunk(new Uint8Array(event.data));
|
|
185
|
+
return;
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
// Text frame = JSON control message
|
|
141
189
|
try {
|
|
142
190
|
const message = JSON.parse(event.data);
|
|
143
191
|
this.handleWebSocketMessage(message);
|
|
@@ -151,15 +199,16 @@ export class RuntypeVoiceProvider implements VoiceProvider {
|
|
|
151
199
|
|
|
152
200
|
private handleWebSocketMessage(message: any) {
|
|
153
201
|
switch (message.type) {
|
|
154
|
-
case "
|
|
155
|
-
//
|
|
156
|
-
if (message.
|
|
157
|
-
this.
|
|
158
|
-
this.errorCallbacks.forEach((cb) => cb(err instanceof Error ? err : new Error(String(err)))),
|
|
159
|
-
);
|
|
202
|
+
case "session_config":
|
|
203
|
+
// Server sends voice settings on session init
|
|
204
|
+
if (message.interruptionMode) {
|
|
205
|
+
this.interruptionMode = message.interruptionMode;
|
|
160
206
|
}
|
|
161
|
-
|
|
162
|
-
|
|
207
|
+
break;
|
|
208
|
+
|
|
209
|
+
case "voice_response":
|
|
210
|
+
// Deliver text result immediately
|
|
211
|
+
this.isProcessing = false;
|
|
163
212
|
this.resultCallbacks.forEach((cb) =>
|
|
164
213
|
cb({
|
|
165
214
|
text: message.response.agentResponseText || message.response.transcript,
|
|
@@ -169,8 +218,39 @@ export class RuntypeVoiceProvider implements VoiceProvider {
|
|
|
169
218
|
provider: "runtype",
|
|
170
219
|
}),
|
|
171
220
|
);
|
|
221
|
+
|
|
222
|
+
// Batch path: play TTS audio if present in the response (backward compat)
|
|
223
|
+
if (message.response.audio?.base64) {
|
|
224
|
+
this.isSpeaking = true;
|
|
225
|
+
this.statusCallbacks.forEach((cb) => cb("speaking"));
|
|
226
|
+
this.playAudio(message.response.audio).catch((err) =>
|
|
227
|
+
this.errorCallbacks.forEach((cb) => cb(err instanceof Error ? err : new Error(String(err)))),
|
|
228
|
+
);
|
|
229
|
+
} else if (!message.response.audio?.base64) {
|
|
230
|
+
// Streaming path: text-only voice_response — audio will arrive as
|
|
231
|
+
// binary chunks followed by audio_end. Transition to speaking state
|
|
232
|
+
// once the first audio chunk arrives (see handleAudioChunk).
|
|
233
|
+
// Stay in processing state until then.
|
|
234
|
+
}
|
|
235
|
+
break;
|
|
236
|
+
|
|
237
|
+
case "audio_end":
|
|
238
|
+
// Guard: discard late audio_end from a cancelled request
|
|
239
|
+
if (message.requestId && message.requestId !== this.currentRequestId) break;
|
|
240
|
+
// All PCM chunks have been sent — signal the playback manager
|
|
241
|
+
if (this.playbackManager) {
|
|
242
|
+
this.playbackManager.markStreamEnd();
|
|
243
|
+
} else {
|
|
244
|
+
// No audio chunks arrived — go idle
|
|
245
|
+
this.isSpeaking = false;
|
|
246
|
+
this.isProcessing = false;
|
|
247
|
+
this.statusCallbacks.forEach((cb) => cb("idle"));
|
|
248
|
+
}
|
|
249
|
+
break;
|
|
250
|
+
|
|
251
|
+
case "cancelled":
|
|
252
|
+
// Server acknowledged cancellation — discard any late-arriving responses
|
|
172
253
|
this.isProcessing = false;
|
|
173
|
-
this.statusCallbacks.forEach((cb) => cb("idle"));
|
|
174
254
|
break;
|
|
175
255
|
|
|
176
256
|
case "error":
|
|
@@ -185,54 +265,119 @@ export class RuntypeVoiceProvider implements VoiceProvider {
|
|
|
185
265
|
}
|
|
186
266
|
}
|
|
187
267
|
|
|
268
|
+
/**
|
|
269
|
+
* Handle a binary audio chunk (raw PCM 24kHz 16-bit LE) for streaming playback.
|
|
270
|
+
*/
|
|
271
|
+
private handleAudioChunk(pcmData: Uint8Array): void {
|
|
272
|
+
if (pcmData.length === 0) return;
|
|
273
|
+
if (!this.currentRequestId) return; // discard late chunks after cancel
|
|
274
|
+
|
|
275
|
+
// Lazily create playback manager on first chunk
|
|
276
|
+
if (!this.playbackManager) {
|
|
277
|
+
this.playbackManager = new AudioPlaybackManager(24000);
|
|
278
|
+
this.playbackManager.onFinished(() => {
|
|
279
|
+
this.isSpeaking = false;
|
|
280
|
+
this.playbackManager = null;
|
|
281
|
+
this.vad.stop(); // stop speech monitoring — audio ended naturally
|
|
282
|
+
this.statusCallbacks.forEach((cb) => cb("idle"));
|
|
283
|
+
});
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
// Transition to speaking on first chunk
|
|
287
|
+
if (!this.isSpeaking) {
|
|
288
|
+
this.isSpeaking = true;
|
|
289
|
+
this.statusCallbacks.forEach((cb) => cb("speaking"));
|
|
290
|
+
this.startBargeInMonitoring().catch(() => {}); // no-op if not barge-in mode
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
this.playbackManager.enqueue(pcmData);
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
/**
|
|
297
|
+
* Stop playback / cancel in-flight request and return to idle.
|
|
298
|
+
* This is the public "stop only" action — does NOT start recording.
|
|
299
|
+
*/
|
|
300
|
+
stopPlayback(): void {
|
|
301
|
+
if (!this.isProcessing && !this.isSpeaking) return;
|
|
302
|
+
this.cancelCurrentPlayback();
|
|
303
|
+
this.statusCallbacks.forEach((cb) => cb("idle"));
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
/**
|
|
307
|
+
* Cancel the current playback and in-flight server request.
|
|
308
|
+
* Internal helper — does NOT fire status callbacks (caller decides next state).
|
|
309
|
+
*/
|
|
310
|
+
private cancelCurrentPlayback(): void {
|
|
311
|
+
// Stop batch playback (Audio element)
|
|
312
|
+
if (this.currentAudio) {
|
|
313
|
+
this.currentAudio.pause();
|
|
314
|
+
this.currentAudio.src = "";
|
|
315
|
+
this.currentAudio = null;
|
|
316
|
+
}
|
|
317
|
+
if (this.currentAudioUrl) {
|
|
318
|
+
URL.revokeObjectURL(this.currentAudioUrl);
|
|
319
|
+
this.currentAudioUrl = null;
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
// Stop streaming playback (AudioPlaybackManager)
|
|
323
|
+
if (this.playbackManager) {
|
|
324
|
+
this.playbackManager.flush();
|
|
325
|
+
this.playbackManager = null;
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
// Tell server to abort the in-flight request
|
|
329
|
+
if (this.currentRequestId && this.ws && this.ws.readyState === WebSocket.OPEN) {
|
|
330
|
+
this.ws.send(
|
|
331
|
+
JSON.stringify({
|
|
332
|
+
type: "cancel",
|
|
333
|
+
requestId: this.currentRequestId,
|
|
334
|
+
}),
|
|
335
|
+
);
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
this.currentRequestId = null;
|
|
339
|
+
this.isProcessing = false;
|
|
340
|
+
this.isSpeaking = false;
|
|
341
|
+
}
|
|
342
|
+
|
|
188
343
|
async startListening() {
|
|
189
344
|
try {
|
|
190
|
-
if (this.isProcessing) {
|
|
191
|
-
|
|
345
|
+
if (this.isProcessing || this.isSpeaking) {
|
|
346
|
+
// If interruption is enabled, cancel current playback and proceed
|
|
347
|
+
if (this.interruptionMode !== "none") {
|
|
348
|
+
this.cancelCurrentPlayback();
|
|
349
|
+
} else {
|
|
350
|
+
// Mode is "none" — block mic while processing or speaking
|
|
351
|
+
return;
|
|
352
|
+
}
|
|
192
353
|
}
|
|
193
354
|
|
|
194
|
-
|
|
195
|
-
this.mediaStream
|
|
355
|
+
// Reuse existing mic stream in barge-in mode (mic stays hot)
|
|
356
|
+
if (!this.mediaStream) {
|
|
357
|
+
const constraints =
|
|
358
|
+
this.interruptionMode === "barge-in"
|
|
359
|
+
? { audio: { echoCancellation: true, noiseSuppression: true } }
|
|
360
|
+
: { audio: true };
|
|
361
|
+
this.mediaStream = await navigator.mediaDevices.getUserMedia(constraints);
|
|
362
|
+
}
|
|
196
363
|
const w = this.w!;
|
|
197
|
-
this.audioContext
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
const
|
|
201
|
-
const source = audioCtx.createMediaStreamSource(stream);
|
|
202
|
-
this.analyserNode = audioCtx.createAnalyser();
|
|
203
|
-
this.analyserNode.fftSize = 2048;
|
|
204
|
-
source.connect(this.analyserNode);
|
|
364
|
+
if (!this.audioContext) {
|
|
365
|
+
this.audioContext = new (w.AudioContext || w.webkitAudioContext)();
|
|
366
|
+
}
|
|
367
|
+
const audioContext = this.audioContext!;
|
|
205
368
|
|
|
369
|
+
// VAD-based silence detection — fires once when user stops talking
|
|
206
370
|
const pauseDuration = this.config?.pauseDuration ?? 2000;
|
|
207
371
|
const silenceThreshold = this.config?.silenceThreshold ?? 0.01;
|
|
208
|
-
this.
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
this.
|
|
214
|
-
|
|
215
|
-
// Compute RMS volume
|
|
216
|
-
let sum = 0;
|
|
217
|
-
for (let i = 0; i < dataArray.length; i++) {
|
|
218
|
-
sum += dataArray[i] * dataArray[i];
|
|
219
|
-
}
|
|
220
|
-
const rms = Math.sqrt(sum / dataArray.length);
|
|
221
|
-
|
|
222
|
-
if (rms < silenceThreshold) {
|
|
223
|
-
if (this.silenceStart === null) {
|
|
224
|
-
this.silenceStart = Date.now();
|
|
225
|
-
} else if (Date.now() - this.silenceStart >= pauseDuration) {
|
|
226
|
-
// Silence exceeded threshold — auto-stop
|
|
227
|
-
this.stopListening();
|
|
228
|
-
}
|
|
229
|
-
} else {
|
|
230
|
-
// Sound detected — reset silence timer
|
|
231
|
-
this.silenceStart = null;
|
|
232
|
-
}
|
|
233
|
-
}, 100);
|
|
372
|
+
this.vad.start(
|
|
373
|
+
audioContext,
|
|
374
|
+
this.mediaStream,
|
|
375
|
+
"silence",
|
|
376
|
+
{ threshold: silenceThreshold, duration: pauseDuration },
|
|
377
|
+
() => this.stopListening(),
|
|
378
|
+
);
|
|
234
379
|
|
|
235
|
-
this.mediaRecorder = new MediaRecorder(
|
|
380
|
+
this.mediaRecorder = new MediaRecorder(this.mediaStream);
|
|
236
381
|
this.audioChunks = [];
|
|
237
382
|
|
|
238
383
|
this.mediaRecorder.ondataavailable = (event) => {
|
|
@@ -265,32 +410,78 @@ export class RuntypeVoiceProvider implements VoiceProvider {
|
|
|
265
410
|
}
|
|
266
411
|
|
|
267
412
|
async stopListening() {
|
|
268
|
-
|
|
269
|
-
if (this.silenceCheckInterval) {
|
|
270
|
-
clearInterval(this.silenceCheckInterval);
|
|
271
|
-
this.silenceCheckInterval = null;
|
|
272
|
-
}
|
|
273
|
-
this.analyserNode = null;
|
|
274
|
-
this.silenceStart = null;
|
|
413
|
+
this.vad.stop();
|
|
275
414
|
|
|
276
415
|
if (this.mediaRecorder) {
|
|
416
|
+
if (this.interruptionMode !== "barge-in") {
|
|
417
|
+
this.mediaRecorder.stream.getTracks().forEach((track) => track.stop());
|
|
418
|
+
}
|
|
277
419
|
this.mediaRecorder.stop();
|
|
278
|
-
this.mediaRecorder.stream.getTracks().forEach((track) => track.stop());
|
|
279
420
|
this.mediaRecorder = null;
|
|
280
421
|
}
|
|
281
422
|
|
|
282
|
-
//
|
|
283
|
-
if (this.
|
|
284
|
-
this.mediaStream
|
|
285
|
-
|
|
423
|
+
// Only tear down mic pipeline in non-barge-in modes
|
|
424
|
+
if (this.interruptionMode !== "barge-in") {
|
|
425
|
+
if (this.mediaStream) {
|
|
426
|
+
this.mediaStream.getTracks().forEach((track) => track.stop());
|
|
427
|
+
this.mediaStream = null;
|
|
428
|
+
}
|
|
429
|
+
if (this.audioContext) {
|
|
430
|
+
await this.audioContext.close();
|
|
431
|
+
this.audioContext = null;
|
|
432
|
+
}
|
|
286
433
|
}
|
|
287
434
|
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
435
|
+
this.statusCallbacks.forEach((cb) => cb("idle"));
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
/**
|
|
439
|
+
* Start VAD in speech mode during agent playback — detects when the user
|
|
440
|
+
* starts talking so we can interrupt (barge-in). No-op in other modes.
|
|
441
|
+
* Acquires mic if needed (e.g., first response where stopListening tore it down).
|
|
442
|
+
*/
|
|
443
|
+
private async startBargeInMonitoring(): Promise<void> {
|
|
444
|
+
if (this.interruptionMode !== "barge-in") return;
|
|
445
|
+
|
|
446
|
+
// Acquire mic pipeline if not already available (first response scenario)
|
|
447
|
+
const w = this.w;
|
|
448
|
+
if (!this.mediaStream && w) {
|
|
449
|
+
this.mediaStream = await navigator.mediaDevices.getUserMedia({
|
|
450
|
+
audio: { echoCancellation: true, noiseSuppression: true },
|
|
451
|
+
});
|
|
452
|
+
}
|
|
453
|
+
if (!this.audioContext && w) {
|
|
454
|
+
this.audioContext = new (w.AudioContext || w.webkitAudioContext)();
|
|
291
455
|
}
|
|
456
|
+
if (!this.audioContext || !this.mediaStream) return;
|
|
457
|
+
|
|
458
|
+
const audioContext = this.audioContext!;
|
|
459
|
+
const speechThreshold = this.config?.silenceThreshold ?? 0.01;
|
|
460
|
+
const speechDebounce = 200; // 200ms sustained sound = real speech, not echo blip
|
|
461
|
+
|
|
462
|
+
this.vad.start(
|
|
463
|
+
audioContext,
|
|
464
|
+
this.mediaStream,
|
|
465
|
+
"speech",
|
|
466
|
+
{ threshold: speechThreshold, duration: speechDebounce },
|
|
467
|
+
() => this.handleBargeIn(),
|
|
468
|
+
);
|
|
469
|
+
}
|
|
292
470
|
|
|
293
|
-
|
|
471
|
+
/**
|
|
472
|
+
* Handle a barge-in event: cancel playback and immediately start recording.
|
|
473
|
+
*/
|
|
474
|
+
private handleBargeIn(): void {
|
|
475
|
+
this.cancelCurrentPlayback();
|
|
476
|
+
this.startListening().catch((err) => {
|
|
477
|
+
this.errorCallbacks.forEach((cb) =>
|
|
478
|
+
cb(err instanceof Error ? err : new Error(String(err))),
|
|
479
|
+
);
|
|
480
|
+
});
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
/** Build a unique id for correlating a voice request with its responses. */
private generateRequestId(): string {
  const randomPart = Math.random().toString(36).substring(2, 10);
  const timePart = Date.now().toString(36);
  return `vreq_${randomPart}${timePart}`;
}
|
|
295
486
|
|
|
296
487
|
private async sendAudio(audioBlob: Blob) {
|
|
@@ -305,6 +496,8 @@ export class RuntypeVoiceProvider implements VoiceProvider {
|
|
|
305
496
|
try {
|
|
306
497
|
const base64Audio = await this.blobToBase64(audioBlob);
|
|
307
498
|
const format = this.getFormatFromMimeType(audioBlob.type);
|
|
499
|
+
const requestId = this.generateRequestId();
|
|
500
|
+
this.currentRequestId = requestId;
|
|
308
501
|
|
|
309
502
|
this.ws.send(
|
|
310
503
|
JSON.stringify({
|
|
@@ -313,6 +506,7 @@ export class RuntypeVoiceProvider implements VoiceProvider {
|
|
|
313
506
|
format,
|
|
314
507
|
sampleRate: 16000,
|
|
315
508
|
voiceId: this.config?.voiceId,
|
|
509
|
+
requestId,
|
|
316
510
|
}),
|
|
317
511
|
);
|
|
318
512
|
} catch (error) {
|
|
@@ -357,7 +551,20 @@ export class RuntypeVoiceProvider implements VoiceProvider {
|
|
|
357
551
|
const blob = new Blob([bytes], { type: mimeType });
|
|
358
552
|
const url = URL.createObjectURL(blob);
|
|
359
553
|
const audioEl = new Audio(url);
|
|
360
|
-
|
|
554
|
+
|
|
555
|
+
// Store references so playback can be cancelled
|
|
556
|
+
this.currentAudio = audioEl;
|
|
557
|
+
this.currentAudioUrl = url;
|
|
558
|
+
|
|
559
|
+
audioEl.onended = () => {
|
|
560
|
+
URL.revokeObjectURL(url);
|
|
561
|
+
if (this.currentAudio === audioEl) {
|
|
562
|
+
this.currentAudio = null;
|
|
563
|
+
this.currentAudioUrl = null;
|
|
564
|
+
this.isSpeaking = false;
|
|
565
|
+
this.statusCallbacks.forEach((cb) => cb("idle"));
|
|
566
|
+
}
|
|
567
|
+
};
|
|
361
568
|
await audioEl.play();
|
|
362
569
|
}
|
|
363
570
|
|
|
@@ -378,8 +585,37 @@ export class RuntypeVoiceProvider implements VoiceProvider {
|
|
|
378
585
|
}
|
|
379
586
|
|
|
380
587
|
async disconnect(): Promise<void> {
|
|
588
|
+
// Stop any playing audio (batch)
|
|
589
|
+
if (this.currentAudio) {
|
|
590
|
+
this.currentAudio.pause();
|
|
591
|
+
this.currentAudio.src = "";
|
|
592
|
+
this.currentAudio = null;
|
|
593
|
+
}
|
|
594
|
+
if (this.currentAudioUrl) {
|
|
595
|
+
URL.revokeObjectURL(this.currentAudioUrl);
|
|
596
|
+
this.currentAudioUrl = null;
|
|
597
|
+
}
|
|
598
|
+
// Stop streaming playback
|
|
599
|
+
if (this.playbackManager) {
|
|
600
|
+
await this.playbackManager.destroy();
|
|
601
|
+
this.playbackManager = null;
|
|
602
|
+
}
|
|
603
|
+
this.currentRequestId = null;
|
|
604
|
+
this.isSpeaking = false;
|
|
605
|
+
|
|
606
|
+
this.vad.stop();
|
|
381
607
|
await this.stopListening();
|
|
382
608
|
|
|
609
|
+
// Force mic teardown (barge-in mode skips this in stopListening)
|
|
610
|
+
if (this.mediaStream) {
|
|
611
|
+
this.mediaStream.getTracks().forEach((track) => track.stop());
|
|
612
|
+
this.mediaStream = null;
|
|
613
|
+
}
|
|
614
|
+
if (this.audioContext) {
|
|
615
|
+
await this.audioContext.close();
|
|
616
|
+
this.audioContext = null;
|
|
617
|
+
}
|
|
618
|
+
|
|
383
619
|
if (this.ws) {
|
|
384
620
|
try {
|
|
385
621
|
this.ws.close();
|