@keyframelabs/elements 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,60 +1,60 @@
1
- import { createClient as f } from "@keyframelabs/sdk";
2
- const l = 24e3;
3
- function g(i) {
4
- const e = atob(i), t = new Uint8Array(e.length);
5
- for (let s = 0; s < e.length; s++)
6
- t[s] = e.charCodeAt(s);
1
+ import { createClient as _ } from "@keyframelabs/sdk";
2
+ const u = 24e3;
3
+ function f(s) {
4
+ const e = atob(s), t = new Uint8Array(e.length);
5
+ for (let n = 0; n < e.length; n++)
6
+ t[n] = e.charCodeAt(n);
7
7
  return t;
8
8
  }
9
- function m(i) {
9
+ function S(s) {
10
10
  let e = "";
11
- for (let t = 0; t < i.length; t++)
12
- e += String.fromCharCode(i[t]);
11
+ for (let t = 0; t < s.length; t++)
12
+ e += String.fromCharCode(s[t]);
13
13
  return btoa(e);
14
14
  }
15
- function h(i, e, t) {
15
+ function p(s, e, t) {
16
16
  if (e === t)
17
- return i;
18
- const s = new Int16Array(i.buffer, i.byteOffset, i.length / 2), n = e / t, a = Math.floor(s.length / n), d = new Int16Array(a);
19
- for (let o = 0; o < a; o++) {
20
- const S = o * n, p = Math.floor(S), b = Math.min(p + 1, s.length - 1), v = S - p;
21
- d[o] = Math.round(
22
- s[p] * (1 - v) + s[b] * v
17
+ return s;
18
+ const n = new Int16Array(s.buffer, s.byteOffset, s.length / 2), a = e / t, i = Math.floor(n.length / a), c = new Int16Array(i);
19
+ for (let l = 0; l < i; l++) {
20
+ const m = l * a, d = Math.floor(m), w = Math.min(d + 1, n.length - 1), g = m - d;
21
+ c[l] = Math.round(
22
+ n[d] * (1 - g) + n[w] * g
23
23
  );
24
24
  }
25
- return new Uint8Array(d.buffer);
25
+ return new Uint8Array(c.buffer);
26
26
  }
27
- function E() {
28
- const i = /* @__PURE__ */ new Map();
27
+ function C() {
28
+ const s = /* @__PURE__ */ new Map();
29
29
  return {
30
30
  on(e, t) {
31
- i.has(e) || i.set(e, /* @__PURE__ */ new Set()), i.get(e).add(t);
31
+ s.has(e) || s.set(e, /* @__PURE__ */ new Set()), s.get(e).add(t);
32
32
  },
33
33
  off(e, t) {
34
- i.get(e)?.delete(t);
34
+ s.get(e)?.delete(t);
35
35
  },
36
36
  emit(e, t) {
37
- i.get(e)?.forEach((s) => s(t));
37
+ s.get(e)?.forEach((n) => n(t));
38
38
  },
39
39
  removeAllListeners() {
40
- i.clear();
40
+ s.clear();
41
41
  }
42
42
  };
43
43
  }
44
- function w(i) {
45
- const e = new Int16Array(i.length);
46
- for (let t = 0; t < i.length; t++) {
47
- const s = Math.max(-1, Math.min(1, i[t]));
48
- e[t] = s < 0 ? s * 32768 : s * 32767;
44
+ function v(s) {
45
+ const e = new Int16Array(s.length);
46
+ for (let t = 0; t < s.length; t++) {
47
+ const n = Math.max(-1, Math.min(1, s[t]));
48
+ e[t] = n < 0 ? n * 32768 : n * 32767;
49
49
  }
50
50
  return new Uint8Array(e.buffer);
51
51
  }
52
- const C = 16e3;
53
- class u {
52
+ const E = 16e3;
53
+ class y {
54
54
  ws = null;
55
55
  _state = "idle";
56
- events = E();
57
- inputSampleRate = C;
56
+ events = C();
57
+ inputSampleRate = E;
58
58
  /** Current agent state */
59
59
  get state() {
60
60
  return this._state;
@@ -113,84 +113,8 @@ class u {
113
113
  this.events.emit("closed", { code: e, reason: t });
114
114
  }
115
115
  }
116
- const A = "gemini-2.5-flash-native-audio-preview-12-2025", I = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent", k = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained";
117
- class R extends u {
118
- agentName = "GeminiLive";
119
- async connect(e) {
120
- if (this.ws)
121
- throw new Error("Already connected");
122
- if (!e.apiKey)
123
- throw new Error("Gemini API key is required");
124
- e.inputSampleRate && (this.inputSampleRate = e.inputSampleRate);
125
- const t = e.model ?? A, n = (e.authType ?? "api_key") === "ephemeral_token" ? `${k}?access_token=${encodeURIComponent(e.apiKey)}` : `${I}?key=${encodeURIComponent(e.apiKey)}`;
126
- return new Promise((a, d) => {
127
- this.ws = new WebSocket(n), this.ws.onopen = () => {
128
- const o = {
129
- setup: {
130
- model: `models/${t}`,
131
- generationConfig: {
132
- responseModalities: ["AUDIO"]
133
- },
134
- systemInstruction: e.systemPrompt ? { parts: [{ text: e.systemPrompt }] } : void 0
135
- }
136
- };
137
- this.ws.send(JSON.stringify(o)), this.setState("listening"), a();
138
- }, this.ws.onerror = () => {
139
- d(new Error("Failed to connect to Gemini Live"));
140
- }, this.ws.onclose = (o) => {
141
- this.ws = null, this.setState("idle"), this.emitClosed(o.code, o.reason);
142
- }, this.ws.onmessage = (o) => {
143
- this.handleMessage(o.data);
144
- };
145
- });
146
- }
147
- handleParsedMessage(e) {
148
- const s = e.serverContent;
149
- if (s) {
150
- if (s.interrupted) {
151
- this.events.emit("interrupted", void 0), this.setState("listening");
152
- return;
153
- }
154
- if (s.turnComplete) {
155
- this.events.emit("turnEnd", void 0), this.setState("listening");
156
- return;
157
- }
158
- if (s.modelTurn?.parts) {
159
- this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking"));
160
- for (const n of s.modelTurn.parts) {
161
- if (n.inlineData?.data) {
162
- const a = g(n.inlineData.data);
163
- this.events.emit("audio", a);
164
- }
165
- n.text && this.events.emit("transcript", {
166
- role: "assistant",
167
- text: n.text,
168
- isFinal: !0
169
- });
170
- }
171
- }
172
- }
173
- }
174
- sendAudio(e) {
175
- if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
176
- console.warn("[GeminiLive] Cannot send audio: not connected");
177
- return;
178
- }
179
- const t = {
180
- realtimeInput: {
181
- mediaChunks: [
182
- {
183
- mimeType: `audio/pcm;rate=${this.inputSampleRate}`,
184
- data: m(e)
185
- }
186
- ]
187
- }
188
- };
189
- this.ws.send(JSON.stringify(t));
190
- }
191
- }
192
- const M = ["neutral", "angry", "sad", "happy"], T = "wss://api.elevenlabs.io/v1/convai/conversation";
193
- class _ extends u {
116
+ const A = ["neutral", "angry", "sad", "happy"], I = "wss://api.elevenlabs.io/v1/convai/conversation";
117
+ class R extends y {
194
118
  agentName = "ElevenLabs";
195
119
  outputSampleRate = 24e3;
196
120
  // Default, updated from metadata
@@ -202,6 +126,12 @@ class _ extends u {
202
126
  // True after conversation_initiation_metadata received
203
127
  lastInterruptId = 0;
204
128
  // Track interruptions to filter stale audio
129
+ // Virtual buffer turn-end detection: track audio duration and emit turnEnd
130
+ // when agent_response has arrived and all audio "would have" finished playing.
131
+ agentResponseReceived = !1;
132
+ turnStartTime = 0;
133
+ accumulatedDurationMs = 0;
134
+ turnEndTimer = null;
205
135
  async connect(e) {
206
136
  if (this.ws)
207
137
  throw new Error("Already connected");
@@ -209,15 +139,15 @@ class _ extends u {
209
139
  throw new Error("ElevenLabs agent ID or signed URL is required");
210
140
  e.inputSampleRate && (this.sourceInputSampleRate = e.inputSampleRate);
211
141
  let t;
212
- return e.signedUrl ? t = e.signedUrl : (t = `${T}?agent_id=${e.agentId}`, e.apiKey && (t += `&xi-api-key=${e.apiKey}`)), new Promise((s, n) => {
142
+ return e.signedUrl ? t = e.signedUrl : (t = `${I}?agent_id=${e.agentId}`, e.apiKey && (t += `&xi-api-key=${e.apiKey}`)), new Promise((n, a) => {
213
143
  this.ws = new WebSocket(t), this.ws.onopen = () => {
214
- this.setState("listening"), s();
144
+ this.setState("listening"), n();
215
145
  }, this.ws.onerror = () => {
216
- n(new Error("Failed to connect to ElevenLabs"));
217
- }, this.ws.onclose = (a) => {
218
- this.ws = null, this.setState("idle"), this.emitClosed(a.code, a.reason);
219
- }, this.ws.onmessage = (a) => {
220
- this.handleMessage(a.data);
146
+ a(new Error("Failed to connect to ElevenLabs"));
147
+ }, this.ws.onclose = (i) => {
148
+ this.ws = null, this.setState("idle"), this.emitClosed(i.code, i.reason);
149
+ }, this.ws.onmessage = (i) => {
150
+ this.handleMessage(i.data);
221
151
  };
222
152
  });
223
153
  }
@@ -254,12 +184,12 @@ class _ extends u {
254
184
  const t = e.conversation_initiation_metadata_event;
255
185
  if (t) {
256
186
  if (t.agent_output_audio_format) {
257
- const s = t.agent_output_audio_format.match(/pcm_(\d+)/);
258
- s && (this.outputSampleRate = parseInt(s[1], 10));
187
+ const n = t.agent_output_audio_format.match(/pcm_(\d+)/);
188
+ n && (this.outputSampleRate = parseInt(n[1], 10));
259
189
  }
260
190
  if (t.user_input_audio_format) {
261
- const s = t.user_input_audio_format.match(/pcm_(\d+)/);
262
- s && (this.expectedInputSampleRate = parseInt(s[1], 10));
191
+ const n = t.user_input_audio_format.match(/pcm_(\d+)/);
192
+ n && (this.expectedInputSampleRate = parseInt(n[1], 10));
263
193
  }
264
194
  this.initialized = !0;
265
195
  }
@@ -275,8 +205,12 @@ class _ extends u {
275
205
  if (!t?.audio_base_64 || (t.event_id ?? 0) <= this.lastInterruptId)
276
206
  return;
277
207
  this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking"));
278
- let n = g(t.audio_base_64);
279
- this.outputSampleRate !== l && (n = h(n, this.outputSampleRate, l)), this.events.emit("audio", n);
208
+ let a = f(t.audio_base_64);
209
+ this.outputSampleRate !== u && (a = p(a, this.outputSampleRate, u)), this.events.emit("audio", a);
210
+ const i = a.length / 2 / u * 1e3;
211
+ this.turnStartTime === 0 && (this.turnStartTime = Date.now()), this.accumulatedDurationMs += i, console.debug(
212
+ `[ElevenLabs] audio chunk: ${a.length} bytes, +${i.toFixed(0)}ms, totalDuration=${this.accumulatedDurationMs.toFixed(0)}ms, agentResponse=${this.agentResponseReceived}`
213
+ ), this.scheduleVirtualBufferCheck();
280
214
  }
281
215
  handleUserTranscript(e) {
282
216
  const t = e.user_transcription_event;
@@ -288,18 +222,38 @@ class _ extends u {
288
222
  }
289
223
  handleAgentResponse(e) {
290
224
  const t = e.agent_response_event;
291
- t?.agent_response && (this.events.emit("turnEnd", void 0), this.setState("listening"), this.events.emit("transcript", {
225
+ t?.agent_response && (this.events.emit("transcript", {
292
226
  role: "assistant",
293
227
  text: t.agent_response,
294
228
  isFinal: !0
295
- }));
229
+ }), this.agentResponseReceived = !0, console.debug(
230
+ `[ElevenLabs] agent_response received: totalDuration=${this.accumulatedDurationMs.toFixed(0)}ms, text="${t.agent_response.slice(0, 60)}${t.agent_response.length > 60 ? "..." : ""}"`
231
+ ), this.scheduleVirtualBufferCheck());
232
+ }
233
+ /**
234
+ * Schedule a timer to emit turnEnd when the virtual audio buffer
235
+ * "would have" finished playing. Replicates the ElevenLabs SDK's
236
+ * AudioWorklet buffer-empty detection without requiring local playback.
237
+ */
238
+ scheduleVirtualBufferCheck() {
239
+ if (!this.agentResponseReceived || this.turnStartTime === 0) return;
240
+ this.turnEndTimer !== null && clearTimeout(this.turnEndTimer);
241
+ const e = Date.now() - this.turnStartTime, t = Math.max(0, this.accumulatedDurationMs - e);
242
+ console.debug(
243
+ `[ElevenLabs] virtual buffer: elapsed=${e.toFixed(0)}ms, accumulated=${this.accumulatedDurationMs.toFixed(0)}ms, remaining=${t.toFixed(0)}ms`
244
+ ), this.turnEndTimer = setTimeout(() => {
245
+ this.turnEndTimer = null, console.debug("[ElevenLabs] virtual buffer drained, emitting turnEnd"), this.resetTurnState(), this.events.emit("turnEnd", void 0), this.setState("listening");
246
+ }, t);
247
+ }
248
+ resetTurnState() {
249
+ this.agentResponseReceived = !1, this.turnStartTime = 0, this.accumulatedDurationMs = 0, this.turnEndTimer !== null && (clearTimeout(this.turnEndTimer), this.turnEndTimer = null);
296
250
  }
297
251
  handleClientToolCall(e) {
298
252
  const t = e.client_tool_call;
299
253
  if (t) {
300
254
  if (t.tool_name === "set_emotion") {
301
- const s = t.parameters?.emotion?.toLowerCase();
302
- s && M.includes(s) && this.events.emit("emotion", s);
255
+ const n = t.parameters?.emotion?.toLowerCase();
256
+ n && A.includes(n) && this.events.emit("emotion", n);
303
257
  }
304
258
  this.ws && this.ws.readyState === WebSocket.OPEN && this.ws.send(JSON.stringify({
305
259
  type: "client_tool_result",
@@ -311,14 +265,16 @@ class _ extends u {
311
265
  }
312
266
  handleInterruption(e) {
313
267
  const t = e.interruption_event;
314
- t?.event_id && (this.lastInterruptId = t.event_id), this.events.emit("interrupted", void 0), this.setState("listening");
268
+ t?.event_id && (this.lastInterruptId = t.event_id), (this.agentResponseReceived || this.accumulatedDurationMs > 0) && console.debug(
269
+ `[ElevenLabs] interruption: discarding pending turn (duration=${this.accumulatedDurationMs.toFixed(0)}ms, agentResponse=${this.agentResponseReceived})`
270
+ ), this.resetTurnState(), this.events.emit("interrupted", void 0), this.setState("listening");
315
271
  }
316
272
  sendAudio(e) {
317
273
  if (!this.ws || this.ws.readyState !== WebSocket.OPEN || !this.initialized)
318
274
  return;
319
275
  let t = e;
320
- this.sourceInputSampleRate !== this.expectedInputSampleRate && (t = h(e, this.sourceInputSampleRate, this.expectedInputSampleRate)), this.ws.send(JSON.stringify({
321
- user_audio_chunk: m(t)
276
+ this.sourceInputSampleRate !== this.expectedInputSampleRate && (t = p(e, this.sourceInputSampleRate, this.expectedInputSampleRate)), this.ws.send(JSON.stringify({
277
+ user_audio_chunk: S(t)
322
278
  }));
323
279
  }
324
280
  /**
@@ -348,217 +304,256 @@ class _ extends u {
348
304
  }));
349
305
  }
350
306
  close() {
351
- this.initialized = !1, this.lastInterruptId = 0, super.close();
307
+ this.initialized = !1, this.lastInterruptId = 0, this.resetTurnState(), super.close();
352
308
  }
353
309
  }
354
- const P = "wss://api.cartesia.ai/agents/stream", x = "2025-04-16";
355
- class O extends u {
356
- agentName = "Cartesia";
357
- // Audio configuration
358
- cartesiaInputFormat = "pcm_16000";
359
- // Format we tell Cartesia we are sending
360
- cartesiaOutputRate = 16e3;
361
- // Cartesia defaults to 16kHz for web
362
- // Connection state
363
- streamId = null;
364
- isReady = !1;
365
- pingInterval = null;
310
+ const T = ["neutral", "angry", "sad", "happy"], k = "wss://api.openai.com/v1/realtime", M = "gpt-realtime", h = 24e3, O = {
311
+ type: "function",
312
+ name: "set_emotion",
313
+ description: "Set the emotional expression of the avatar. Call this on every turn to reflect the tone of your response.",
314
+ parameters: {
315
+ type: "object",
316
+ properties: {
317
+ emotion: {
318
+ type: "string",
319
+ enum: ["neutral", "angry", "sad", "happy"],
320
+ description: "The emotion to display"
321
+ }
322
+ },
323
+ required: ["emotion"]
324
+ }
325
+ };
326
+ class P extends y {
327
+ agentName = "OpenAIRealtime";
328
+ connectResolve = null;
329
+ connectReject = null;
330
+ connectTimeout = null;
331
+ initialSessionUpdate = null;
332
+ currentResponseHasAudio = !1;
333
+ currentTranscript = "";
334
+ handledFunctionCallIds = /* @__PURE__ */ new Set();
335
+ sourceInputSampleRate = 16e3;
336
+ pendingFunctionCallStartedAtMs = null;
337
+ pendingFunctionCallNames = [];
366
338
  async connect(e) {
367
339
  if (this.ws)
368
340
  throw new Error("Already connected");
369
- if (!e.agentId)
370
- throw new Error("Cartesia Agent ID is required");
371
341
  if (!e.apiKey)
372
- throw new Error("Cartesia API Key is required");
373
- e.inputSampleRate && (this.inputSampleRate = e.inputSampleRate), this.inputSampleRate === 16e3 ? this.cartesiaInputFormat = "pcm_16000" : this.inputSampleRate === 24e3 ? this.cartesiaInputFormat = "pcm_24000" : this.inputSampleRate === 44100 ? this.cartesiaInputFormat = "pcm_44100" : this.cartesiaInputFormat = "pcm_16000";
374
- const t = `${P}/${e.agentId}?api_key=${e.apiKey}&cartesia_version=${x}`;
375
- return new Promise((s, n) => {
376
- this.ws = new WebSocket(t), this.ws.onopen = () => {
377
- this.sendStartEvent(), this.startHeartbeat(), s();
342
+ throw new Error("OpenAI Realtime token is required");
343
+ e.inputSampleRate && (this.sourceInputSampleRate = e.inputSampleRate);
344
+ const t = e.model ?? M;
345
+ return this.initialSessionUpdate = this.buildSessionUpdate(e, t), new Promise((n, a) => {
346
+ this.connectResolve = n, this.connectReject = a, this.connectTimeout = setTimeout(() => {
347
+ this.rejectPendingConnect(new Error("Timed out waiting for OpenAI Realtime session setup")), this.close();
348
+ }, 1e4), this.ws = new WebSocket(
349
+ `${k}?model=${encodeURIComponent(t)}`,
350
+ ["realtime", `openai-insecure-api-key.${e.apiKey}`]
351
+ ), this.ws.onopen = () => {
378
352
  }, this.ws.onerror = () => {
379
- n(new Error("Failed to connect to Cartesia"));
380
- }, this.ws.onclose = (a) => {
381
- this.stopHeartbeat(), this.ws = null, this.isReady = !1, this.streamId = null, this.setState("idle"), this.emitClosed(a.code, a.reason);
382
- }, this.ws.onmessage = (a) => {
383
- this.handleMessage(a.data);
353
+ this.rejectPendingConnect(new Error("Failed to connect to OpenAI Realtime"));
354
+ }, this.ws.onclose = (i) => {
355
+ if (this.clearConnectTimeout(), this.connectReject) {
356
+ const c = i.reason ? `: ${i.reason}` : "";
357
+ this.rejectPendingConnect(new Error(`OpenAI Realtime closed before initialization (${i.code}${c})`));
358
+ }
359
+ this.resetTurnState(), this.initialSessionUpdate = null, this.ws = null, this.setState("idle"), this.emitClosed(i.code, i.reason);
360
+ }, this.ws.onmessage = (i) => {
361
+ this.handleMessage(i.data);
384
362
  };
385
363
  });
386
364
  }
387
- sendStartEvent() {
388
- if (!this.ws) return;
389
- const e = {
390
- event: "start",
391
- config: {
392
- input_format: this.cartesiaInputFormat
393
- }
394
- };
395
- this.ws.send(JSON.stringify(e));
396
- }
397
- /**
398
- * Keep connection alive with periodic custom events.
399
- * Cartesia requires activity every 30s.
400
- */
401
- startHeartbeat() {
402
- this.pingInterval = window.setInterval(() => {
403
- this.ws?.readyState === WebSocket.OPEN && this.streamId && this.ws.send(JSON.stringify({
404
- event: "custom",
405
- stream_id: this.streamId,
406
- metadata: { keepalive: !0 }
407
- }));
408
- }, 2e4);
409
- }
410
- stopHeartbeat() {
411
- this.pingInterval && (clearInterval(this.pingInterval), this.pingInterval = null);
412
- }
413
365
  handleParsedMessage(e) {
414
366
  const t = e;
415
- switch (t.event) {
416
- case "ack":
417
- this.handleAck(t);
367
+ switch (t.type) {
368
+ case "session.created":
369
+ this.sendInitialSessionUpdate();
418
370
  break;
419
- case "media_output":
420
- this.handleMediaOutput(t);
371
+ case "session.updated":
372
+ this.clearConnectTimeout(), this.setState("listening"), this.resolvePendingConnect();
421
373
  break;
422
- case "clear":
423
- this.handleClear();
374
+ case "response.output_audio.delta":
375
+ case "response.audio.delta":
376
+ if (!t.delta)
377
+ return;
378
+ if (!this.currentResponseHasAudio) {
379
+ if (this.pendingFunctionCallStartedAtMs !== null) {
380
+ const n = performance.now() - this.pendingFunctionCallStartedAtMs;
381
+ console.debug("[OpenAIRealtime] Function call latency", {
382
+ calls: this.pendingFunctionCallNames,
383
+ latencyMs: Math.round(n)
384
+ }), this.pendingFunctionCallStartedAtMs = null, this.pendingFunctionCallNames = [];
385
+ }
386
+ this.currentResponseHasAudio = !0, this.events.emit("turnStart", void 0), this.setState("speaking");
387
+ }
388
+ this.events.emit("audio", f(t.delta));
389
+ break;
390
+ case "response.output_audio_transcript.delta":
391
+ if (!t.delta)
392
+ return;
393
+ this.currentTranscript += t.delta, this.events.emit("transcript", {
394
+ role: "assistant",
395
+ text: this.currentTranscript,
396
+ isFinal: !1
397
+ });
398
+ break;
399
+ case "response.output_audio_transcript.done":
400
+ if (!t.transcript)
401
+ return;
402
+ this.currentTranscript = t.transcript, this.events.emit("transcript", {
403
+ role: "assistant",
404
+ text: t.transcript,
405
+ isFinal: !0
406
+ });
407
+ break;
408
+ case "input_audio_buffer.speech_started":
409
+ this.resetTurnState(), this.events.emit("interrupted", void 0), this.setState("listening");
424
410
  break;
425
- case "error":
426
- console.error("[Cartesia] Server error:", t);
411
+ case "response.done":
412
+ this.handleResponseDone(t.response);
427
413
  break;
414
+ case "error": {
415
+ const n = t.error?.message ?? t.message ?? "Unknown OpenAI Realtime error";
416
+ this.rejectPendingConnect(new Error(n)), console.error("[OpenAIRealtime] Server error:", t);
417
+ break;
418
+ }
428
419
  }
429
420
  }
430
- handleAck(e) {
431
- this.streamId = e.stream_id || null, this.isReady = !0, this.setState("listening");
432
- }
433
- handleMediaOutput(e) {
434
- if (!e.media?.payload) return;
435
- this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking"));
436
- let t = g(e.media.payload);
437
- this.cartesiaOutputRate !== l && (t = h(t, this.cartesiaOutputRate, l)), this.events.emit("audio", t);
438
- }
439
- handleClear() {
440
- this.events.emit("interrupted", void 0), this.setState("listening");
441
- }
442
421
  sendAudio(e) {
443
- if (!this.ws || this.ws.readyState !== WebSocket.OPEN || !this.isReady || !this.streamId)
422
+ if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
423
+ console.warn("[OpenAIRealtime] Cannot send audio: not connected");
444
424
  return;
425
+ }
445
426
  let t = e;
446
- const s = parseInt(this.cartesiaInputFormat.split("_")[1]);
447
- this.inputSampleRate !== s && (t = h(e, this.inputSampleRate, s)), this.ws.send(JSON.stringify({
448
- event: "media_input",
449
- stream_id: this.streamId,
450
- media: {
451
- payload: m(t)
452
- }
427
+ this.sourceInputSampleRate !== h && (t = p(e, this.sourceInputSampleRate, h)), this.ws.send(JSON.stringify({
428
+ type: "input_audio_buffer.append",
429
+ audio: S(t)
453
430
  }));
454
431
  }
455
432
  close() {
456
- this.stopHeartbeat(), this.isReady = !1, this.streamId = null, super.close();
433
+ this.rejectPendingConnect(new Error("Connection closed")), this.clearConnectTimeout(), this.resetTurnState(), this.initialSessionUpdate = null, this.handledFunctionCallIds.clear(), super.close();
434
+ }
435
+ buildSessionUpdate(e, t) {
436
+ const n = e.turnDetection ?? { type: "semantic_vad", eagerness: "high" };
437
+ return {
438
+ type: "session.update",
439
+ session: {
440
+ type: "realtime",
441
+ model: t,
442
+ output_modalities: ["audio"],
443
+ instructions: e.systemPrompt,
444
+ audio: {
445
+ input: {
446
+ format: {
447
+ type: "audio/pcm",
448
+ rate: h
449
+ },
450
+ turn_detection: n
451
+ },
452
+ output: {
453
+ format: {
454
+ type: "audio/pcm",
455
+ rate: u
456
+ },
457
+ ...e.voice ? { voice: e.voice } : {}
458
+ }
459
+ },
460
+ tools: [O],
461
+ tool_choice: "auto"
462
+ }
463
+ };
457
464
  }
458
- }
459
- class N extends u {
460
- agentName = "Vapi";
461
- // Audio configuration - Vapi uses 16kHz PCM by default
462
- vapiSampleRate = 16e3;
463
- async connect(e) {
464
- if (this.ws)
465
- throw new Error("Already connected");
466
- if (!e.signedUrl)
467
- throw new Error("Vapi signed URL is required");
468
- return e.inputSampleRate && (this.inputSampleRate = e.inputSampleRate), new Promise((t, s) => {
469
- this.ws = new WebSocket(e.signedUrl), this.ws.binaryType = "arraybuffer", this.ws.onopen = () => {
470
- this.setState("listening"), t();
471
- }, this.ws.onerror = () => {
472
- s(new Error("Failed to connect to Vapi"));
473
- }, this.ws.onclose = (n) => {
474
- this.ws = null, this.setState("idle"), this.emitClosed(n.code, n.reason);
475
- }, this.ws.onmessage = (n) => {
476
- n.data instanceof ArrayBuffer ? this.handleBinaryAudio(n.data) : this.handleMessage(n.data);
477
- };
478
- });
465
+ sendInitialSessionUpdate() {
466
+ this.initialSessionUpdate && this.sendEvent(this.initialSessionUpdate);
479
467
  }
480
- /**
481
- * Handle binary audio data from Vapi.
482
- * Vapi sends raw PCM 16-bit little-endian audio.
483
- */
484
- handleBinaryAudio(e) {
485
- this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking"));
486
- const t = new Uint8Array(e), s = this.vapiSampleRate !== l ? h(t, this.vapiSampleRate, l) : t;
487
- this.events.emit("audio", s);
468
+ handleResponseDone(e) {
469
+ if (!e?.output?.length) {
470
+ this.currentResponseHasAudio && this.finishAudioTurn();
471
+ return;
472
+ }
473
+ const t = e.output.filter(x);
474
+ if (t.length > 0) {
475
+ this.handleFunctionCalls(t);
476
+ return;
477
+ }
478
+ this.currentResponseHasAudio && this.finishAudioTurn();
479
+ }
480
+ handleFunctionCalls(e) {
481
+ let t = !1;
482
+ const n = [];
483
+ for (const a of e) {
484
+ if (!a.call_id || this.handledFunctionCallIds.has(a.call_id))
485
+ continue;
486
+ this.handledFunctionCallIds.add(a.call_id), n.push(a.name ?? "unknown");
487
+ const i = this.handleFunctionCall(a);
488
+ this.sendEvent({
489
+ type: "conversation.item.create",
490
+ item: {
491
+ type: "function_call_output",
492
+ call_id: a.call_id,
493
+ output: JSON.stringify(i)
494
+ }
495
+ }), t = !0;
496
+ }
497
+ t && (this.pendingFunctionCallStartedAtMs = performance.now(), this.pendingFunctionCallNames = n, console.debug("[OpenAIRealtime] Function call received", {
498
+ calls: n
499
+ }), this.sendEvent({ type: "response.create" }));
488
500
  }
489
- handleParsedMessage(e) {
490
- const t = e;
491
- switch (t.type) {
492
- case "conversation-update":
493
- t.role === "user" && t.transcript ? this.events.emit("transcript", {
494
- role: "user",
495
- text: t.transcript,
496
- isFinal: !0
497
- }) : t.role === "assistant" && t.transcript && this.events.emit("transcript", {
498
- role: "assistant",
499
- text: t.transcript,
500
- isFinal: !0
501
- });
502
- break;
503
- case "speech-update":
504
- t.status === "started" ? (this.events.emit("turnStart", void 0), this.setState("speaking")) : t.status === "stopped" && (this.events.emit("turnEnd", void 0), this.setState("listening"));
505
- break;
506
- case "transcript":
507
- this.events.emit("transcript", {
508
- role: t.role === "user" ? "user" : "assistant",
509
- text: t.transcript || "",
510
- isFinal: t.transcriptType === "final"
511
- });
512
- break;
513
- case "hang":
514
- case "end-of-call-report":
515
- this.events.emit("turnEnd", void 0), this.setState("idle");
516
- break;
517
- case "error":
518
- console.error("[Vapi] Server error:", t);
519
- break;
501
+ handleFunctionCall(e) {
502
+ if (e.name !== "set_emotion")
503
+ return { error: `Unsupported function: ${e.name}` };
504
+ try {
505
+ const n = (e.arguments ? JSON.parse(e.arguments) : {}).emotion?.toLowerCase();
506
+ return n && T.includes(n) ? (this.events.emit("emotion", n), { result: "ok" }) : { error: "Invalid emotion" };
507
+ } catch {
508
+ return { error: "Invalid function arguments" };
520
509
  }
521
510
  }
522
- sendAudio(e) {
523
- if (!this.ws || this.ws.readyState !== WebSocket.OPEN)
511
+ finishAudioTurn() {
512
+ this.resetTurnState(), this.events.emit("turnEnd", void 0), this.setState("listening");
513
+ }
514
+ resetTurnState() {
515
+ this.currentResponseHasAudio = !1, this.currentTranscript = "";
516
+ }
517
+ sendEvent(e) {
518
+ !this.ws || this.ws.readyState !== WebSocket.OPEN || this.ws.send(JSON.stringify(e));
519
+ }
520
+ resolvePendingConnect() {
521
+ if (!this.connectResolve)
524
522
  return;
525
- let t = e;
526
- this.inputSampleRate !== this.vapiSampleRate && (t = h(e, this.inputSampleRate, this.vapiSampleRate)), this.ws.send(t.buffer);
523
+ const e = this.connectResolve;
524
+ this.connectResolve = null, this.connectReject = null, e();
527
525
  }
528
- /**
529
- * Send a control message to end the call.
530
- */
531
- hangup() {
532
- this.ws && this.ws.readyState === WebSocket.OPEN && this.ws.send(JSON.stringify({ type: "end-call" }));
526
+ rejectPendingConnect(e) {
527
+ if (!this.connectReject)
528
+ return;
529
+ const t = this.connectReject;
530
+ this.connectResolve = null, this.connectReject = null, t(e);
533
531
  }
534
- close() {
535
- this.hangup(), super.close();
532
+ clearConnectTimeout() {
533
+ this.connectTimeout !== null && (clearTimeout(this.connectTimeout), this.connectTimeout = null);
536
534
  }
537
535
  }
538
- const U = [
539
- { id: "gemini", name: "Gemini Live", description: "Google Gemini Live API" },
536
+ function x(s) {
537
+ return s.type === "function_call";
538
+ }
539
+ const D = [
540
540
  { id: "elevenlabs", name: "ElevenLabs", description: "ElevenLabs Conversational AI" },
541
- { id: "cartesia", name: "Cartesia", description: "Cartesia Agents API" },
542
- { id: "vapi", name: "Vapi", description: "Vapi WebSocket Transport" }
541
+ { id: "openai", name: "OpenAI Realtime", description: "OpenAI Realtime API" }
543
542
  ];
544
- function y(i) {
545
- switch (i) {
546
- case "gemini":
547
- return new R();
543
+ function b(s) {
544
+ switch (s) {
548
545
  case "elevenlabs":
549
- return new _();
550
- case "cartesia":
551
- return new O();
552
- case "vapi":
553
- return new N();
546
+ return new R();
547
+ case "openai":
548
+ return new P();
554
549
  default:
555
- throw new Error(`Unknown agent type: ${i}`);
550
+ throw new Error(`Unknown agent type: ${s}`);
556
551
  }
557
552
  }
558
- function F(i) {
559
- return U.find((e) => e.id === i);
553
+ function N(s) {
554
+ return D.find((e) => e.id === s);
560
555
  }
561
- class D extends Error {
556
+ class F extends Error {
562
557
  status;
563
558
  payload;
564
559
  url;
@@ -566,8 +561,8 @@ class D extends Error {
566
561
  super(e.message), this.name = "ApiError", this.status = e.status, this.payload = e.payload, this.url = e.url;
567
562
  }
568
563
  }
569
- const r = /* @__PURE__ */ new Set();
570
- class K {
564
+ const o = /* @__PURE__ */ new Set();
565
+ class L {
571
566
  apiBaseUrl;
572
567
  publishableKey;
573
568
  callbacks;
@@ -611,31 +606,31 @@ class K {
611
606
  }
612
607
  /** Connect to the embed session */
613
608
  async connect() {
614
- if (r.has(this.publishableKey)) {
609
+ if (o.has(this.publishableKey)) {
615
610
  console.log("[PersonaEmbed] Connection already in progress, skipping");
616
611
  return;
617
612
  }
618
- r.add(this.publishableKey), this.mounted = !0, this.abortController = new AbortController(), this.setStatus("connecting");
613
+ o.add(this.publishableKey), this.mounted = !0, this.abortController = new AbortController(), this.setStatus("connecting");
619
614
  try {
620
615
  const e = await this.fetchSession(this.abortController.signal);
621
616
  if (!this.mounted) {
622
- r.delete(this.publishableKey);
617
+ o.delete(this.publishableKey);
623
618
  return;
624
619
  }
625
620
  if (await this.initSession(e), await this.initMicrophone(), await this.connectAgent(e.voice_agent_details), !this.mounted) {
626
- this.cleanup(), r.delete(this.publishableKey);
621
+ this.cleanup(), o.delete(this.publishableKey);
627
622
  return;
628
623
  }
629
624
  this.setStatus("connected");
630
625
  } catch (e) {
631
- if (r.delete(this.publishableKey), e instanceof Error && e.name === "AbortError")
626
+ if (o.delete(this.publishableKey), e instanceof Error && e.name === "AbortError")
632
627
  return;
633
628
  console.error("[PersonaEmbed]", e), this.mounted && (this.setStatus("error"), this.callbacks.onError?.(e));
634
629
  }
635
630
  }
636
631
  /** Disconnect and cleanup */
637
632
  disconnect() {
638
- this.mounted = !1, this.abortController?.abort(), this.abortController = null, r.delete(this.publishableKey), this.cleanup(), this.setStatus("disconnected");
633
+ this.mounted = !1, this.abortController?.abort(), this.abortController = null, o.delete(this.publishableKey), this.cleanup(), this.setStatus("disconnected");
639
634
  }
640
635
  /** Toggle microphone mute */
641
636
  toggleMute() {
@@ -655,31 +650,31 @@ class K {
655
650
  signal: e
656
651
  });
657
652
  if (!t.ok) {
658
- let s;
653
+ let n;
659
654
  try {
660
- s = await t.json();
655
+ n = await t.json();
661
656
  } catch {
662
657
  }
663
- throw new D({
664
- message: s?.message ?? "create_session failed",
658
+ throw new F({
659
+ message: n?.message ?? "create_session failed",
665
660
  status: t.status,
666
- payload: s,
661
+ payload: n,
667
662
  url: t.url
668
663
  });
669
664
  }
670
665
  if (!t.ok) {
671
- const s = await t.json().catch(() => null);
672
- throw new Error(`create_session failed: ${t.status} ${JSON.stringify(s)}`);
666
+ const n = await t.json().catch(() => null);
667
+ throw new Error(`create_session failed: ${t.status} ${JSON.stringify(n)}`);
673
668
  }
674
669
  return t.json();
675
670
  }
676
671
  async initSession(e) {
677
- this.session = f({
672
+ this.session = _({
678
673
  serverUrl: e.session_details.server_url,
679
674
  participantToken: e.session_details.participant_token,
680
675
  agentIdentity: e.session_details.agent_identity,
681
676
  onVideoTrack: (t) => {
682
- console.log("[PersonaEmbed] Setting video track", t.readyState, t.enabled), this._video.srcObject = new MediaStream([t]), this._video.play().catch((s) => console.warn("[PersonaEmbed] Video play failed:", s));
677
+ console.log("[PersonaEmbed] Setting video track", t.readyState, t.enabled), this._video.srcObject = new MediaStream([t]), this._video.play().catch((n) => console.warn("[PersonaEmbed] Video play failed:", n));
683
678
  },
684
679
  onAudioTrack: (t) => {
685
680
  this._audio.srcObject = new MediaStream([t]), this._audio.play().catch(() => {
@@ -697,11 +692,11 @@ class K {
697
692
  onClose: () => {
698
693
  this.mounted && this.callbacks.onDisconnect?.();
699
694
  }
700
- }), this.agent = y(e.voice_agent_details.type), this.agent.on("audio", (t) => this.session?.sendAudio(t)), this.agent.on("turnEnd", () => this.session?.endAudioTurn()), this.agent.on("interrupted", () => {
695
+ }), this.agent = b(e.voice_agent_details.type), this.agent.on("audio", (t) => this.session?.sendAudio(t)), this.agent.on("turnEnd", () => this.session?.endAudioTurn()), this.agent.on("interrupted", () => {
701
696
  this.session?.endAudioTurn(), this.session?.interrupt();
702
697
  }), this.agent.on("closed", () => {
703
698
  this.mounted && this.callbacks.onDisconnect?.();
704
- }), this.agent instanceof _ && this.agent.on("emotion", (t) => this.session?.setEmotion(t)), await this.session.connect();
699
+ }), this.agent.on("emotion", (t) => this.session?.setEmotion(t)), await this.session.connect();
705
700
  }
706
701
  async initMicrophone() {
707
702
  this.stream = await navigator.mediaDevices.getUserMedia({
@@ -710,37 +705,32 @@ class K {
710
705
  const e = this.audioContext.createMediaStreamSource(this.stream);
711
706
  this.processor = this.audioContext.createScriptProcessor(4096, 1, 1), this.processor.onaudioprocess = (t) => {
712
707
  if (!this._isMuted) {
713
- const s = w(t.inputBuffer.getChannelData(0));
714
- this.agent?.sendAudio(s);
708
+ const n = v(t.inputBuffer.getChannelData(0));
709
+ this.agent?.sendAudio(n);
715
710
  }
716
711
  }, e.connect(this.processor), this.processor.connect(this.audioContext.destination);
717
712
  }
718
713
  async connectAgent(e) {
719
714
  if (!this.agent) return;
720
715
  const t = { inputSampleRate: 16e3 };
721
- e.type === "gemini" ? await this.agent.connect({
722
- ...t,
723
- apiKey: e.token,
724
- authType: "ephemeral_token"
725
- }) : e.type === "elevenlabs" ? await this.agent.connect({
716
+ e.type === "elevenlabs" ? await this.agent.connect({
726
717
  ...t,
727
718
  agentId: e.agent_id,
728
719
  signedUrl: e.signed_url
729
- }) : e.type === "cartesia" ? await this.agent.connect({
730
- ...t,
731
- agentId: e.agent_id,
732
- apiKey: e.token
733
- }) : e.type === "vapi" && await this.agent.connect({
720
+ }) : e.type === "openai" && await this.agent.connect({
734
721
  ...t,
735
- signedUrl: e.signed_url
722
+ apiKey: e.token,
723
+ systemPrompt: e.system_prompt,
724
+ voice: e.voice,
725
+ turnDetection: e.turn_detection
736
726
  });
737
727
  }
738
728
  cleanup() {
739
729
  this.stream?.getTracks().forEach((e) => e.stop()), this.processor?.disconnect(), this.audioContext?.close(), this.agent?.close(), this.session?.close(), this.stream = null, this.processor = null, this.audioContext = null, this.agent = null, this.session = null;
740
730
  }
741
731
  }
742
- const c = /* @__PURE__ */ new Set();
743
- class B {
732
+ const r = /* @__PURE__ */ new Set();
733
+ class $ {
744
734
  voiceAgentDetails;
745
735
  sessionDetails;
746
736
  callbacks;
@@ -784,24 +774,24 @@ class B {
784
774
  }
785
775
  /** Connect to the session */
786
776
  async connect() {
787
- if (c.has(this.connectionId)) {
777
+ if (r.has(this.connectionId)) {
788
778
  console.log("[PersonaView] Connection already in progress, skipping");
789
779
  return;
790
780
  }
791
- c.add(this.connectionId), this.mounted = !0, this.setStatus("connecting");
781
+ r.add(this.connectionId), this.mounted = !0, this.setStatus("connecting");
792
782
  try {
793
783
  if (await this.initSession(), await this.initMicrophone(), await this.connectAgent(), !this.mounted) {
794
- this.cleanup(), c.delete(this.connectionId);
784
+ this.cleanup(), r.delete(this.connectionId);
795
785
  return;
796
786
  }
797
787
  this.setStatus("connected");
798
788
  } catch (e) {
799
- c.delete(this.connectionId), console.error("[PersonaView]", e), this.mounted && (this.setStatus("error"), this.callbacks.onError?.(e));
789
+ r.delete(this.connectionId), console.error("[PersonaView]", e), this.mounted && (this.setStatus("error"), this.callbacks.onError?.(e));
800
790
  }
801
791
  }
802
792
  /** Disconnect and cleanup */
803
793
  disconnect() {
804
- this.mounted = !1, c.delete(this.connectionId), this.cleanup(), this.setStatus("disconnected");
794
+ this.mounted = !1, r.delete(this.connectionId), this.cleanup(), this.setStatus("disconnected");
805
795
  }
806
796
  /** Toggle microphone mute */
807
797
  toggleMute() {
@@ -814,7 +804,7 @@ class B {
814
804
  this._agentState !== e && (this._agentState = e, this.callbacks.onAgentStateChange?.(e));
815
805
  }
816
806
  async initSession() {
817
- this.session = f({
807
+ this.session = _({
818
808
  serverUrl: this.sessionDetails.server_url,
819
809
  participantToken: this.sessionDetails.participant_token,
820
810
  agentIdentity: this.sessionDetails.agent_identity,
@@ -837,11 +827,11 @@ class B {
837
827
  onClose: () => {
838
828
  this.mounted && this.callbacks.onDisconnect?.();
839
829
  }
840
- }), this.agent = y(this.voiceAgentDetails.type), this.agent.on("audio", (e) => this.session?.sendAudio(e)), this.agent.on("turnEnd", () => this.session?.endAudioTurn()), this.agent.on("interrupted", () => {
830
+ }), this.agent = b(this.voiceAgentDetails.type), this.agent.on("audio", (e) => this.session?.sendAudio(e)), this.agent.on("turnEnd", () => this.session?.endAudioTurn()), this.agent.on("interrupted", () => {
841
831
  this.session?.endAudioTurn(), this.session?.interrupt();
842
832
  }), this.agent.on("closed", () => {
843
833
  this.mounted && this.callbacks.onDisconnect?.();
844
- }), this.agent instanceof _ && this.agent.on("emotion", (e) => this.session?.setEmotion(e)), await this.session.connect();
834
+ }), this.agent.on("emotion", (e) => this.session?.setEmotion(e)), await this.session.connect();
845
835
  }
846
836
  async initMicrophone() {
847
837
  this.stream = await navigator.mediaDevices.getUserMedia({
@@ -850,29 +840,24 @@ class B {
850
840
  const e = this.audioContext.createMediaStreamSource(this.stream);
851
841
  this.processor = this.audioContext.createScriptProcessor(4096, 1, 1), this.processor.onaudioprocess = (t) => {
852
842
  if (!this._isMuted) {
853
- const s = w(t.inputBuffer.getChannelData(0));
854
- this.agent?.sendAudio(s);
843
+ const n = v(t.inputBuffer.getChannelData(0));
844
+ this.agent?.sendAudio(n);
855
845
  }
856
846
  }, e.connect(this.processor), this.processor.connect(this.audioContext.destination);
857
847
  }
858
848
  async connectAgent() {
859
849
  if (!this.agent) return;
860
850
  const e = this.voiceAgentDetails, t = { inputSampleRate: 16e3 };
861
- e.type === "gemini" ? await this.agent.connect({
862
- ...t,
863
- apiKey: e.token,
864
- authType: "ephemeral_token"
865
- }) : e.type === "elevenlabs" ? await this.agent.connect({
851
+ e.type === "elevenlabs" ? await this.agent.connect({
866
852
  ...t,
867
853
  agentId: e.agent_id,
868
854
  signedUrl: e.signed_url
869
- }) : e.type === "cartesia" ? await this.agent.connect({
855
+ }) : e.type === "openai" && await this.agent.connect({
870
856
  ...t,
871
- agentId: e.agent_id,
872
- apiKey: e.token
873
- }) : e.type === "vapi" && await this.agent.connect({
874
- ...t,
875
- signedUrl: e.signed_url
857
+ apiKey: e.token,
858
+ systemPrompt: e.system_prompt,
859
+ voice: e.voice,
860
+ turnDetection: e.turn_detection
876
861
  });
877
862
  }
878
863
  cleanup() {
@@ -880,20 +865,19 @@ class B {
880
865
  }
881
866
  }
882
867
  export {
883
- U as AGENT_REGISTRY,
884
- u as BaseAgent,
885
- O as CartesiaAgent,
886
- _ as ElevenLabsAgent,
887
- R as GeminiLiveAgent,
888
- D as KeyframeApiError,
889
- K as PersonaEmbed,
890
- B as PersonaView,
891
- l as SAMPLE_RATE,
892
- g as base64ToBytes,
893
- m as bytesToBase64,
894
- y as createAgent,
895
- E as createEventEmitter,
896
- w as floatTo16BitPCM,
897
- F as getAgentInfo,
898
- h as resamplePcm
868
+ D as AGENT_REGISTRY,
869
+ y as BaseAgent,
870
+ R as ElevenLabsAgent,
871
+ F as KeyframeApiError,
872
+ P as OpenAIRealtimeAgent,
873
+ L as PersonaEmbed,
874
+ $ as PersonaView,
875
+ u as SAMPLE_RATE,
876
+ f as base64ToBytes,
877
+ S as bytesToBase64,
878
+ b as createAgent,
879
+ C as createEventEmitter,
880
+ v as floatTo16BitPCM,
881
+ N as getAgentInfo,
882
+ p as resamplePcm
899
883
  };