@keyframelabs/elements 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,60 +1,60 @@
1
- import { createClient as f } from "@keyframelabs/sdk";
2
- const o = 24e3;
3
- function m(i) {
4
- const e = atob(i), t = new Uint8Array(e.length);
5
- for (let s = 0; s < e.length; s++)
6
- t[s] = e.charCodeAt(s);
1
+ import { createClient as _ } from "@keyframelabs/sdk";
2
+ const u = 24e3;
3
+ function f(s) {
4
+ const e = atob(s), t = new Uint8Array(e.length);
5
+ for (let n = 0; n < e.length; n++)
6
+ t[n] = e.charCodeAt(n);
7
7
  return t;
8
8
  }
9
- function g(i) {
9
+ function S(s) {
10
10
  let e = "";
11
- for (let t = 0; t < i.length; t++)
12
- e += String.fromCharCode(i[t]);
11
+ for (let t = 0; t < s.length; t++)
12
+ e += String.fromCharCode(s[t]);
13
13
  return btoa(e);
14
14
  }
15
- function h(i, e, t) {
15
+ function p(s, e, t) {
16
16
  if (e === t)
17
- return i;
18
- const s = new Int16Array(i.buffer, i.byteOffset, i.length / 2), n = e / t, a = Math.floor(s.length / n), d = new Int16Array(a);
19
- for (let r = 0; r < a; r++) {
20
- const _ = r * n, p = Math.floor(_), b = Math.min(p + 1, s.length - 1), v = _ - p;
21
- d[r] = Math.round(
22
- s[p] * (1 - v) + s[b] * v
17
+ return s;
18
+ const n = new Int16Array(s.buffer, s.byteOffset, s.length / 2), a = e / t, i = Math.floor(n.length / a), c = new Int16Array(i);
19
+ for (let l = 0; l < i; l++) {
20
+ const m = l * a, d = Math.floor(m), w = Math.min(d + 1, n.length - 1), g = m - d;
21
+ c[l] = Math.round(
22
+ n[d] * (1 - g) + n[w] * g
23
23
  );
24
24
  }
25
- return new Uint8Array(d.buffer);
25
+ return new Uint8Array(c.buffer);
26
26
  }
27
- function E() {
28
- const i = /* @__PURE__ */ new Map();
27
+ function C() {
28
+ const s = /* @__PURE__ */ new Map();
29
29
  return {
30
30
  on(e, t) {
31
- i.has(e) || i.set(e, /* @__PURE__ */ new Set()), i.get(e).add(t);
31
+ s.has(e) || s.set(e, /* @__PURE__ */ new Set()), s.get(e).add(t);
32
32
  },
33
33
  off(e, t) {
34
- i.get(e)?.delete(t);
34
+ s.get(e)?.delete(t);
35
35
  },
36
36
  emit(e, t) {
37
- i.get(e)?.forEach((s) => s(t));
37
+ s.get(e)?.forEach((n) => n(t));
38
38
  },
39
39
  removeAllListeners() {
40
- i.clear();
40
+ s.clear();
41
41
  }
42
42
  };
43
43
  }
44
- function w(i) {
45
- const e = new Int16Array(i.length);
46
- for (let t = 0; t < i.length; t++) {
47
- const s = Math.max(-1, Math.min(1, i[t]));
48
- e[t] = s < 0 ? s * 32768 : s * 32767;
44
+ function v(s) {
45
+ const e = new Int16Array(s.length);
46
+ for (let t = 0; t < s.length; t++) {
47
+ const n = Math.max(-1, Math.min(1, s[t]));
48
+ e[t] = n < 0 ? n * 32768 : n * 32767;
49
49
  }
50
50
  return new Uint8Array(e.buffer);
51
51
  }
52
- const C = 16e3;
53
- class u {
52
+ const E = 16e3;
53
+ class y {
54
54
  ws = null;
55
55
  _state = "idle";
56
- events = E();
57
- inputSampleRate = C;
56
+ events = C();
57
+ inputSampleRate = E;
58
58
  /** Current agent state */
59
59
  get state() {
60
60
  return this._state;
@@ -113,84 +113,8 @@ class u {
113
113
  this.events.emit("closed", { code: e, reason: t });
114
114
  }
115
115
  }
116
- const k = "gemini-2.5-flash-native-audio-preview-12-2025", A = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent", I = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained";
117
- class R extends u {
118
- agentName = "GeminiLive";
119
- async connect(e) {
120
- if (this.ws)
121
- throw new Error("Already connected");
122
- if (!e.apiKey)
123
- throw new Error("Gemini API key is required");
124
- e.inputSampleRate && (this.inputSampleRate = e.inputSampleRate);
125
- const t = e.model ?? k, n = (e.authType ?? "api_key") === "ephemeral_token" ? `${I}?access_token=${encodeURIComponent(e.apiKey)}` : `${A}?key=${encodeURIComponent(e.apiKey)}`;
126
- return new Promise((a, d) => {
127
- this.ws = new WebSocket(n), this.ws.onopen = () => {
128
- const r = {
129
- setup: {
130
- model: `models/${t}`,
131
- generationConfig: {
132
- responseModalities: ["AUDIO"]
133
- },
134
- systemInstruction: e.systemPrompt ? { parts: [{ text: e.systemPrompt }] } : void 0
135
- }
136
- };
137
- this.ws.send(JSON.stringify(r)), this.setState("listening"), a();
138
- }, this.ws.onerror = () => {
139
- d(new Error("Failed to connect to Gemini Live"));
140
- }, this.ws.onclose = (r) => {
141
- this.ws = null, this.setState("idle"), this.emitClosed(r.code, r.reason);
142
- }, this.ws.onmessage = (r) => {
143
- this.handleMessage(r.data);
144
- };
145
- });
146
- }
147
- handleParsedMessage(e) {
148
- const s = e.serverContent;
149
- if (s) {
150
- if (s.interrupted) {
151
- this.events.emit("interrupted", void 0), this.setState("listening");
152
- return;
153
- }
154
- if (s.turnComplete) {
155
- this.events.emit("turnEnd", void 0), this.setState("listening");
156
- return;
157
- }
158
- if (s.modelTurn?.parts) {
159
- this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking"));
160
- for (const n of s.modelTurn.parts) {
161
- if (n.inlineData?.data) {
162
- const a = m(n.inlineData.data);
163
- this.events.emit("audio", a);
164
- }
165
- n.text && this.events.emit("transcript", {
166
- role: "assistant",
167
- text: n.text,
168
- isFinal: !0
169
- });
170
- }
171
- }
172
- }
173
- }
174
- sendAudio(e) {
175
- if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
176
- console.warn("[GeminiLive] Cannot send audio: not connected");
177
- return;
178
- }
179
- const t = {
180
- realtimeInput: {
181
- mediaChunks: [
182
- {
183
- mimeType: `audio/pcm;rate=${this.inputSampleRate}`,
184
- data: g(e)
185
- }
186
- ]
187
- }
188
- };
189
- this.ws.send(JSON.stringify(t));
190
- }
191
- }
192
- const M = ["neutral", "angry", "sad", "happy"], T = "wss://api.elevenlabs.io/v1/convai/conversation";
193
- class S extends u {
116
+ const A = ["neutral", "angry", "sad", "happy"], I = "wss://api.elevenlabs.io/v1/convai/conversation";
117
+ class R extends y {
194
118
  agentName = "ElevenLabs";
195
119
  outputSampleRate = 24e3;
196
120
  // Default, updated from metadata
@@ -215,15 +139,15 @@ class S extends u {
215
139
  throw new Error("ElevenLabs agent ID or signed URL is required");
216
140
  e.inputSampleRate && (this.sourceInputSampleRate = e.inputSampleRate);
217
141
  let t;
218
- return e.signedUrl ? t = e.signedUrl : (t = `${T}?agent_id=${e.agentId}`, e.apiKey && (t += `&xi-api-key=${e.apiKey}`)), new Promise((s, n) => {
142
+ return e.signedUrl ? t = e.signedUrl : (t = `${I}?agent_id=${e.agentId}`, e.apiKey && (t += `&xi-api-key=${e.apiKey}`)), new Promise((n, a) => {
219
143
  this.ws = new WebSocket(t), this.ws.onopen = () => {
220
- this.setState("listening"), s();
144
+ this.setState("listening"), n();
221
145
  }, this.ws.onerror = () => {
222
- n(new Error("Failed to connect to ElevenLabs"));
223
- }, this.ws.onclose = (a) => {
224
- this.ws = null, this.setState("idle"), this.emitClosed(a.code, a.reason);
225
- }, this.ws.onmessage = (a) => {
226
- this.handleMessage(a.data);
146
+ a(new Error("Failed to connect to ElevenLabs"));
147
+ }, this.ws.onclose = (i) => {
148
+ this.ws = null, this.setState("idle"), this.emitClosed(i.code, i.reason);
149
+ }, this.ws.onmessage = (i) => {
150
+ this.handleMessage(i.data);
227
151
  };
228
152
  });
229
153
  }
@@ -260,12 +184,12 @@ class S extends u {
260
184
  const t = e.conversation_initiation_metadata_event;
261
185
  if (t) {
262
186
  if (t.agent_output_audio_format) {
263
- const s = t.agent_output_audio_format.match(/pcm_(\d+)/);
264
- s && (this.outputSampleRate = parseInt(s[1], 10));
187
+ const n = t.agent_output_audio_format.match(/pcm_(\d+)/);
188
+ n && (this.outputSampleRate = parseInt(n[1], 10));
265
189
  }
266
190
  if (t.user_input_audio_format) {
267
- const s = t.user_input_audio_format.match(/pcm_(\d+)/);
268
- s && (this.expectedInputSampleRate = parseInt(s[1], 10));
191
+ const n = t.user_input_audio_format.match(/pcm_(\d+)/);
192
+ n && (this.expectedInputSampleRate = parseInt(n[1], 10));
269
193
  }
270
194
  this.initialized = !0;
271
195
  }
@@ -281,11 +205,11 @@ class S extends u {
281
205
  if (!t?.audio_base_64 || (t.event_id ?? 0) <= this.lastInterruptId)
282
206
  return;
283
207
  this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking"));
284
- let n = m(t.audio_base_64);
285
- this.outputSampleRate !== o && (n = h(n, this.outputSampleRate, o)), this.events.emit("audio", n);
286
- const a = n.length / 2 / o * 1e3;
287
- this.turnStartTime === 0 && (this.turnStartTime = Date.now()), this.accumulatedDurationMs += a, console.debug(
288
- `[ElevenLabs] audio chunk: ${n.length} bytes, +${a.toFixed(0)}ms, totalDuration=${this.accumulatedDurationMs.toFixed(0)}ms, agentResponse=${this.agentResponseReceived}`
208
+ let a = f(t.audio_base_64);
209
+ this.outputSampleRate !== u && (a = p(a, this.outputSampleRate, u)), this.events.emit("audio", a);
210
+ const i = a.length / 2 / u * 1e3;
211
+ this.turnStartTime === 0 && (this.turnStartTime = Date.now()), this.accumulatedDurationMs += i, console.debug(
212
+ `[ElevenLabs] audio chunk: ${a.length} bytes, +${i.toFixed(0)}ms, totalDuration=${this.accumulatedDurationMs.toFixed(0)}ms, agentResponse=${this.agentResponseReceived}`
289
213
  ), this.scheduleVirtualBufferCheck();
290
214
  }
291
215
  handleUserTranscript(e) {
@@ -328,8 +252,8 @@ class S extends u {
328
252
  const t = e.client_tool_call;
329
253
  if (t) {
330
254
  if (t.tool_name === "set_emotion") {
331
- const s = t.parameters?.emotion?.toLowerCase();
332
- s && M.includes(s) && this.events.emit("emotion", s);
255
+ const n = t.parameters?.emotion?.toLowerCase();
256
+ n && A.includes(n) && this.events.emit("emotion", n);
333
257
  }
334
258
  this.ws && this.ws.readyState === WebSocket.OPEN && this.ws.send(JSON.stringify({
335
259
  type: "client_tool_result",
@@ -349,8 +273,8 @@ class S extends u {
349
273
  if (!this.ws || this.ws.readyState !== WebSocket.OPEN || !this.initialized)
350
274
  return;
351
275
  let t = e;
352
- this.sourceInputSampleRate !== this.expectedInputSampleRate && (t = h(e, this.sourceInputSampleRate, this.expectedInputSampleRate)), this.ws.send(JSON.stringify({
353
- user_audio_chunk: g(t)
276
+ this.sourceInputSampleRate !== this.expectedInputSampleRate && (t = p(e, this.sourceInputSampleRate, this.expectedInputSampleRate)), this.ws.send(JSON.stringify({
277
+ user_audio_chunk: S(t)
354
278
  }));
355
279
  }
356
280
  /**
@@ -383,214 +307,253 @@ class S extends u {
383
307
  this.initialized = !1, this.lastInterruptId = 0, this.resetTurnState(), super.close();
384
308
  }
385
309
  }
386
- const x = "wss://api.cartesia.ai/agents/stream", P = "2025-04-16";
387
- class D extends u {
388
- agentName = "Cartesia";
389
- // Audio configuration
390
- cartesiaInputFormat = "pcm_16000";
391
- // Format we tell Cartesia we are sending
392
- cartesiaOutputRate = 16e3;
393
- // Cartesia defaults to 16kHz for web
394
- // Connection state
395
- streamId = null;
396
- isReady = !1;
397
- pingInterval = null;
310
+ const T = ["neutral", "angry", "sad", "happy"], k = "wss://api.openai.com/v1/realtime", M = "gpt-realtime", h = 24e3, O = {
311
+ type: "function",
312
+ name: "set_emotion",
313
+ description: "Set the emotional expression of the avatar. Call this on every turn to reflect the tone of your response.",
314
+ parameters: {
315
+ type: "object",
316
+ properties: {
317
+ emotion: {
318
+ type: "string",
319
+ enum: ["neutral", "angry", "sad", "happy"],
320
+ description: "The emotion to display"
321
+ }
322
+ },
323
+ required: ["emotion"]
324
+ }
325
+ };
326
+ class P extends y {
327
+ agentName = "OpenAIRealtime";
328
+ connectResolve = null;
329
+ connectReject = null;
330
+ connectTimeout = null;
331
+ initialSessionUpdate = null;
332
+ currentResponseHasAudio = !1;
333
+ currentTranscript = "";
334
+ handledFunctionCallIds = /* @__PURE__ */ new Set();
335
+ sourceInputSampleRate = 16e3;
336
+ pendingFunctionCallStartedAtMs = null;
337
+ pendingFunctionCallNames = [];
398
338
  async connect(e) {
399
339
  if (this.ws)
400
340
  throw new Error("Already connected");
401
- if (!e.agentId)
402
- throw new Error("Cartesia Agent ID is required");
403
341
  if (!e.apiKey)
404
- throw new Error("Cartesia API Key is required");
405
- e.inputSampleRate && (this.inputSampleRate = e.inputSampleRate), this.inputSampleRate === 16e3 ? this.cartesiaInputFormat = "pcm_16000" : this.inputSampleRate === 24e3 ? this.cartesiaInputFormat = "pcm_24000" : this.inputSampleRate === 44100 ? this.cartesiaInputFormat = "pcm_44100" : this.cartesiaInputFormat = "pcm_16000";
406
- const t = `${x}/${e.agentId}?api_key=${e.apiKey}&cartesia_version=${P}`;
407
- return new Promise((s, n) => {
408
- this.ws = new WebSocket(t), this.ws.onopen = () => {
409
- this.sendStartEvent(), this.startHeartbeat(), s();
342
+ throw new Error("OpenAI Realtime token is required");
343
+ e.inputSampleRate && (this.sourceInputSampleRate = e.inputSampleRate);
344
+ const t = e.model ?? M;
345
+ return this.initialSessionUpdate = this.buildSessionUpdate(e, t), new Promise((n, a) => {
346
+ this.connectResolve = n, this.connectReject = a, this.connectTimeout = setTimeout(() => {
347
+ this.rejectPendingConnect(new Error("Timed out waiting for OpenAI Realtime session setup")), this.close();
348
+ }, 1e4), this.ws = new WebSocket(
349
+ `${k}?model=${encodeURIComponent(t)}`,
350
+ ["realtime", `openai-insecure-api-key.${e.apiKey}`]
351
+ ), this.ws.onopen = () => {
410
352
  }, this.ws.onerror = () => {
411
- n(new Error("Failed to connect to Cartesia"));
412
- }, this.ws.onclose = (a) => {
413
- this.stopHeartbeat(), this.ws = null, this.isReady = !1, this.streamId = null, this.setState("idle"), this.emitClosed(a.code, a.reason);
414
- }, this.ws.onmessage = (a) => {
415
- this.handleMessage(a.data);
353
+ this.rejectPendingConnect(new Error("Failed to connect to OpenAI Realtime"));
354
+ }, this.ws.onclose = (i) => {
355
+ if (this.clearConnectTimeout(), this.connectReject) {
356
+ const c = i.reason ? `: ${i.reason}` : "";
357
+ this.rejectPendingConnect(new Error(`OpenAI Realtime closed before initialization (${i.code}${c})`));
358
+ }
359
+ this.resetTurnState(), this.initialSessionUpdate = null, this.ws = null, this.setState("idle"), this.emitClosed(i.code, i.reason);
360
+ }, this.ws.onmessage = (i) => {
361
+ this.handleMessage(i.data);
416
362
  };
417
363
  });
418
364
  }
419
- sendStartEvent() {
420
- if (!this.ws) return;
421
- const e = {
422
- event: "start",
423
- config: {
424
- input_format: this.cartesiaInputFormat
425
- }
426
- };
427
- this.ws.send(JSON.stringify(e));
428
- }
429
- /**
430
- * Keep connection alive with periodic custom events.
431
- * Cartesia requires activity every 30s.
432
- */
433
- startHeartbeat() {
434
- this.pingInterval = window.setInterval(() => {
435
- this.ws?.readyState === WebSocket.OPEN && this.streamId && this.ws.send(JSON.stringify({
436
- event: "custom",
437
- stream_id: this.streamId,
438
- metadata: { keepalive: !0 }
439
- }));
440
- }, 2e4);
441
- }
442
- stopHeartbeat() {
443
- this.pingInterval && (clearInterval(this.pingInterval), this.pingInterval = null);
444
- }
445
365
  handleParsedMessage(e) {
446
366
  const t = e;
447
- switch (t.event) {
448
- case "ack":
449
- this.handleAck(t);
367
+ switch (t.type) {
368
+ case "session.created":
369
+ this.sendInitialSessionUpdate();
450
370
  break;
451
- case "media_output":
452
- this.handleMediaOutput(t);
371
+ case "session.updated":
372
+ this.clearConnectTimeout(), this.setState("listening"), this.resolvePendingConnect();
453
373
  break;
454
- case "clear":
455
- this.handleClear();
374
+ case "response.output_audio.delta":
375
+ case "response.audio.delta":
376
+ if (!t.delta)
377
+ return;
378
+ if (!this.currentResponseHasAudio) {
379
+ if (this.pendingFunctionCallStartedAtMs !== null) {
380
+ const n = performance.now() - this.pendingFunctionCallStartedAtMs;
381
+ console.debug("[OpenAIRealtime] Function call latency", {
382
+ calls: this.pendingFunctionCallNames,
383
+ latencyMs: Math.round(n)
384
+ }), this.pendingFunctionCallStartedAtMs = null, this.pendingFunctionCallNames = [];
385
+ }
386
+ this.currentResponseHasAudio = !0, this.events.emit("turnStart", void 0), this.setState("speaking");
387
+ }
388
+ this.events.emit("audio", f(t.delta));
456
389
  break;
457
- case "error":
458
- console.error("[Cartesia] Server error:", t);
390
+ case "response.output_audio_transcript.delta":
391
+ if (!t.delta)
392
+ return;
393
+ this.currentTranscript += t.delta, this.events.emit("transcript", {
394
+ role: "assistant",
395
+ text: this.currentTranscript,
396
+ isFinal: !1
397
+ });
459
398
  break;
399
+ case "response.output_audio_transcript.done":
400
+ if (!t.transcript)
401
+ return;
402
+ this.currentTranscript = t.transcript, this.events.emit("transcript", {
403
+ role: "assistant",
404
+ text: t.transcript,
405
+ isFinal: !0
406
+ });
407
+ break;
408
+ case "input_audio_buffer.speech_started":
409
+ this.resetTurnState(), this.events.emit("interrupted", void 0), this.setState("listening");
410
+ break;
411
+ case "response.done":
412
+ this.handleResponseDone(t.response);
413
+ break;
414
+ case "error": {
415
+ const n = t.error?.message ?? t.message ?? "Unknown OpenAI Realtime error";
416
+ this.rejectPendingConnect(new Error(n)), console.error("[OpenAIRealtime] Server error:", t);
417
+ break;
418
+ }
460
419
  }
461
420
  }
462
- handleAck(e) {
463
- this.streamId = e.stream_id || null, this.isReady = !0, this.setState("listening");
464
- }
465
- handleMediaOutput(e) {
466
- if (!e.media?.payload) return;
467
- this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking"));
468
- let t = m(e.media.payload);
469
- this.cartesiaOutputRate !== o && (t = h(t, this.cartesiaOutputRate, o)), this.events.emit("audio", t);
470
- }
471
- handleClear() {
472
- this.events.emit("interrupted", void 0), this.setState("listening");
473
- }
474
421
  sendAudio(e) {
475
- if (!this.ws || this.ws.readyState !== WebSocket.OPEN || !this.isReady || !this.streamId)
422
+ if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
423
+ console.warn("[OpenAIRealtime] Cannot send audio: not connected");
476
424
  return;
425
+ }
477
426
  let t = e;
478
- const s = parseInt(this.cartesiaInputFormat.split("_")[1]);
479
- this.inputSampleRate !== s && (t = h(e, this.inputSampleRate, s)), this.ws.send(JSON.stringify({
480
- event: "media_input",
481
- stream_id: this.streamId,
482
- media: {
483
- payload: g(t)
484
- }
427
+ this.sourceInputSampleRate !== h && (t = p(e, this.sourceInputSampleRate, h)), this.ws.send(JSON.stringify({
428
+ type: "input_audio_buffer.append",
429
+ audio: S(t)
485
430
  }));
486
431
  }
487
432
  close() {
488
- this.stopHeartbeat(), this.isReady = !1, this.streamId = null, super.close();
433
+ this.rejectPendingConnect(new Error("Connection closed")), this.clearConnectTimeout(), this.resetTurnState(), this.initialSessionUpdate = null, this.handledFunctionCallIds.clear(), super.close();
434
+ }
435
+ buildSessionUpdate(e, t) {
436
+ const n = e.turnDetection ?? { type: "semantic_vad", eagerness: "high" };
437
+ return {
438
+ type: "session.update",
439
+ session: {
440
+ type: "realtime",
441
+ model: t,
442
+ output_modalities: ["audio"],
443
+ instructions: e.systemPrompt,
444
+ audio: {
445
+ input: {
446
+ format: {
447
+ type: "audio/pcm",
448
+ rate: h
449
+ },
450
+ turn_detection: n
451
+ },
452
+ output: {
453
+ format: {
454
+ type: "audio/pcm",
455
+ rate: u
456
+ },
457
+ ...e.voice ? { voice: e.voice } : {}
458
+ }
459
+ },
460
+ tools: [O],
461
+ tool_choice: "auto"
462
+ }
463
+ };
489
464
  }
490
- }
491
- class O extends u {
492
- agentName = "Vapi";
493
- // Audio configuration - Vapi uses 16kHz PCM by default
494
- vapiSampleRate = 16e3;
495
- async connect(e) {
496
- if (this.ws)
497
- throw new Error("Already connected");
498
- if (!e.signedUrl)
499
- throw new Error("Vapi signed URL is required");
500
- return e.inputSampleRate && (this.inputSampleRate = e.inputSampleRate), new Promise((t, s) => {
501
- this.ws = new WebSocket(e.signedUrl), this.ws.binaryType = "arraybuffer", this.ws.onopen = () => {
502
- this.setState("listening"), t();
503
- }, this.ws.onerror = () => {
504
- s(new Error("Failed to connect to Vapi"));
505
- }, this.ws.onclose = (n) => {
506
- this.ws = null, this.setState("idle"), this.emitClosed(n.code, n.reason);
507
- }, this.ws.onmessage = (n) => {
508
- n.data instanceof ArrayBuffer ? this.handleBinaryAudio(n.data) : this.handleMessage(n.data);
509
- };
510
- });
465
+ sendInitialSessionUpdate() {
466
+ this.initialSessionUpdate && this.sendEvent(this.initialSessionUpdate);
511
467
  }
512
- /**
513
- * Handle binary audio data from Vapi.
514
- * Vapi sends raw PCM 16-bit little-endian audio.
515
- */
516
- handleBinaryAudio(e) {
517
- this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking"));
518
- const t = new Uint8Array(e), s = this.vapiSampleRate !== o ? h(t, this.vapiSampleRate, o) : t;
519
- this.events.emit("audio", s);
468
+ handleResponseDone(e) {
469
+ if (!e?.output?.length) {
470
+ this.currentResponseHasAudio && this.finishAudioTurn();
471
+ return;
472
+ }
473
+ const t = e.output.filter(x);
474
+ if (t.length > 0) {
475
+ this.handleFunctionCalls(t);
476
+ return;
477
+ }
478
+ this.currentResponseHasAudio && this.finishAudioTurn();
479
+ }
480
+ handleFunctionCalls(e) {
481
+ let t = !1;
482
+ const n = [];
483
+ for (const a of e) {
484
+ if (!a.call_id || this.handledFunctionCallIds.has(a.call_id))
485
+ continue;
486
+ this.handledFunctionCallIds.add(a.call_id), n.push(a.name ?? "unknown");
487
+ const i = this.handleFunctionCall(a);
488
+ this.sendEvent({
489
+ type: "conversation.item.create",
490
+ item: {
491
+ type: "function_call_output",
492
+ call_id: a.call_id,
493
+ output: JSON.stringify(i)
494
+ }
495
+ }), t = !0;
496
+ }
497
+ t && (this.pendingFunctionCallStartedAtMs = performance.now(), this.pendingFunctionCallNames = n, console.debug("[OpenAIRealtime] Function call received", {
498
+ calls: n
499
+ }), this.sendEvent({ type: "response.create" }));
520
500
  }
521
- handleParsedMessage(e) {
522
- const t = e;
523
- switch (t.type) {
524
- case "conversation-update":
525
- t.role === "user" && t.transcript ? this.events.emit("transcript", {
526
- role: "user",
527
- text: t.transcript,
528
- isFinal: !0
529
- }) : t.role === "assistant" && t.transcript && this.events.emit("transcript", {
530
- role: "assistant",
531
- text: t.transcript,
532
- isFinal: !0
533
- });
534
- break;
535
- case "speech-update":
536
- t.status === "started" ? (this.events.emit("turnStart", void 0), this.setState("speaking")) : t.status === "stopped" && (this.events.emit("turnEnd", void 0), this.setState("listening"));
537
- break;
538
- case "transcript":
539
- this.events.emit("transcript", {
540
- role: t.role === "user" ? "user" : "assistant",
541
- text: t.transcript || "",
542
- isFinal: t.transcriptType === "final"
543
- });
544
- break;
545
- case "hang":
546
- case "end-of-call-report":
547
- this.events.emit("turnEnd", void 0), this.setState("idle");
548
- break;
549
- case "error":
550
- console.error("[Vapi] Server error:", t);
551
- break;
501
+ handleFunctionCall(e) {
502
+ if (e.name !== "set_emotion")
503
+ return { error: `Unsupported function: ${e.name}` };
504
+ try {
505
+ const n = (e.arguments ? JSON.parse(e.arguments) : {}).emotion?.toLowerCase();
506
+ return n && T.includes(n) ? (this.events.emit("emotion", n), { result: "ok" }) : { error: "Invalid emotion" };
507
+ } catch {
508
+ return { error: "Invalid function arguments" };
552
509
  }
553
510
  }
554
- sendAudio(e) {
555
- if (!this.ws || this.ws.readyState !== WebSocket.OPEN)
511
+ finishAudioTurn() {
512
+ this.resetTurnState(), this.events.emit("turnEnd", void 0), this.setState("listening");
513
+ }
514
+ resetTurnState() {
515
+ this.currentResponseHasAudio = !1, this.currentTranscript = "";
516
+ }
517
+ sendEvent(e) {
518
+ !this.ws || this.ws.readyState !== WebSocket.OPEN || this.ws.send(JSON.stringify(e));
519
+ }
520
+ resolvePendingConnect() {
521
+ if (!this.connectResolve)
556
522
  return;
557
- let t = e;
558
- this.inputSampleRate !== this.vapiSampleRate && (t = h(e, this.inputSampleRate, this.vapiSampleRate)), this.ws.send(t.buffer);
523
+ const e = this.connectResolve;
524
+ this.connectResolve = null, this.connectReject = null, e();
559
525
  }
560
- /**
561
- * Send a control message to end the call.
562
- */
563
- hangup() {
564
- this.ws && this.ws.readyState === WebSocket.OPEN && this.ws.send(JSON.stringify({ type: "end-call" }));
526
+ rejectPendingConnect(e) {
527
+ if (!this.connectReject)
528
+ return;
529
+ const t = this.connectReject;
530
+ this.connectResolve = null, this.connectReject = null, t(e);
565
531
  }
566
- close() {
567
- this.hangup(), super.close();
532
+ clearConnectTimeout() {
533
+ this.connectTimeout !== null && (clearTimeout(this.connectTimeout), this.connectTimeout = null);
568
534
  }
569
535
  }
570
- const N = [
571
- { id: "gemini", name: "Gemini Live", description: "Google Gemini Live API" },
536
+ function x(s) {
537
+ return s.type === "function_call";
538
+ }
539
+ const D = [
572
540
  { id: "elevenlabs", name: "ElevenLabs", description: "ElevenLabs Conversational AI" },
573
- { id: "cartesia", name: "Cartesia", description: "Cartesia Agents API" },
574
- { id: "vapi", name: "Vapi", description: "Vapi WebSocket Transport" }
541
+ { id: "openai", name: "OpenAI Realtime", description: "OpenAI Realtime API" }
575
542
  ];
576
- function y(i) {
577
- switch (i) {
578
- case "gemini":
579
- return new R();
543
+ function b(s) {
544
+ switch (s) {
580
545
  case "elevenlabs":
581
- return new S();
582
- case "cartesia":
583
- return new D();
584
- case "vapi":
585
- return new O();
546
+ return new R();
547
+ case "openai":
548
+ return new P();
586
549
  default:
587
- throw new Error(`Unknown agent type: ${i}`);
550
+ throw new Error(`Unknown agent type: ${s}`);
588
551
  }
589
552
  }
590
- function F(i) {
591
- return N.find((e) => e.id === i);
553
+ function N(s) {
554
+ return D.find((e) => e.id === s);
592
555
  }
593
- class L extends Error {
556
+ class F extends Error {
594
557
  status;
595
558
  payload;
596
559
  url;
@@ -598,8 +561,8 @@ class L extends Error {
598
561
  super(e.message), this.name = "ApiError", this.status = e.status, this.payload = e.payload, this.url = e.url;
599
562
  }
600
563
  }
601
- const l = /* @__PURE__ */ new Set();
602
- class $ {
564
+ const o = /* @__PURE__ */ new Set();
565
+ class L {
603
566
  apiBaseUrl;
604
567
  publishableKey;
605
568
  callbacks;
@@ -643,31 +606,31 @@ class $ {
643
606
  }
644
607
  /** Connect to the embed session */
645
608
  async connect() {
646
- if (l.has(this.publishableKey)) {
609
+ if (o.has(this.publishableKey)) {
647
610
  console.log("[PersonaEmbed] Connection already in progress, skipping");
648
611
  return;
649
612
  }
650
- l.add(this.publishableKey), this.mounted = !0, this.abortController = new AbortController(), this.setStatus("connecting");
613
+ o.add(this.publishableKey), this.mounted = !0, this.abortController = new AbortController(), this.setStatus("connecting");
651
614
  try {
652
615
  const e = await this.fetchSession(this.abortController.signal);
653
616
  if (!this.mounted) {
654
- l.delete(this.publishableKey);
617
+ o.delete(this.publishableKey);
655
618
  return;
656
619
  }
657
620
  if (await this.initSession(e), await this.initMicrophone(), await this.connectAgent(e.voice_agent_details), !this.mounted) {
658
- this.cleanup(), l.delete(this.publishableKey);
621
+ this.cleanup(), o.delete(this.publishableKey);
659
622
  return;
660
623
  }
661
624
  this.setStatus("connected");
662
625
  } catch (e) {
663
- if (l.delete(this.publishableKey), e instanceof Error && e.name === "AbortError")
626
+ if (o.delete(this.publishableKey), e instanceof Error && e.name === "AbortError")
664
627
  return;
665
628
  console.error("[PersonaEmbed]", e), this.mounted && (this.setStatus("error"), this.callbacks.onError?.(e));
666
629
  }
667
630
  }
668
631
  /** Disconnect and cleanup */
669
632
  disconnect() {
670
- this.mounted = !1, this.abortController?.abort(), this.abortController = null, l.delete(this.publishableKey), this.cleanup(), this.setStatus("disconnected");
633
+ this.mounted = !1, this.abortController?.abort(), this.abortController = null, o.delete(this.publishableKey), this.cleanup(), this.setStatus("disconnected");
671
634
  }
672
635
  /** Toggle microphone mute */
673
636
  toggleMute() {
@@ -687,31 +650,31 @@ class $ {
687
650
  signal: e
688
651
  });
689
652
  if (!t.ok) {
690
- let s;
653
+ let n;
691
654
  try {
692
- s = await t.json();
655
+ n = await t.json();
693
656
  } catch {
694
657
  }
695
- throw new L({
696
- message: s?.message ?? "create_session failed",
658
+ throw new F({
659
+ message: n?.message ?? "create_session failed",
697
660
  status: t.status,
698
- payload: s,
661
+ payload: n,
699
662
  url: t.url
700
663
  });
701
664
  }
702
665
  if (!t.ok) {
703
- const s = await t.json().catch(() => null);
704
- throw new Error(`create_session failed: ${t.status} ${JSON.stringify(s)}`);
666
+ const n = await t.json().catch(() => null);
667
+ throw new Error(`create_session failed: ${t.status} ${JSON.stringify(n)}`);
705
668
  }
706
669
  return t.json();
707
670
  }
708
671
  async initSession(e) {
709
- this.session = f({
672
+ this.session = _({
710
673
  serverUrl: e.session_details.server_url,
711
674
  participantToken: e.session_details.participant_token,
712
675
  agentIdentity: e.session_details.agent_identity,
713
676
  onVideoTrack: (t) => {
714
- console.log("[PersonaEmbed] Setting video track", t.readyState, t.enabled), this._video.srcObject = new MediaStream([t]), this._video.play().catch((s) => console.warn("[PersonaEmbed] Video play failed:", s));
677
+ console.log("[PersonaEmbed] Setting video track", t.readyState, t.enabled), this._video.srcObject = new MediaStream([t]), this._video.play().catch((n) => console.warn("[PersonaEmbed] Video play failed:", n));
715
678
  },
716
679
  onAudioTrack: (t) => {
717
680
  this._audio.srcObject = new MediaStream([t]), this._audio.play().catch(() => {
@@ -729,11 +692,11 @@ class $ {
729
692
  onClose: () => {
730
693
  this.mounted && this.callbacks.onDisconnect?.();
731
694
  }
732
- }), this.agent = y(e.voice_agent_details.type), this.agent.on("audio", (t) => this.session?.sendAudio(t)), this.agent.on("turnEnd", () => this.session?.endAudioTurn()), this.agent.on("interrupted", () => {
695
+ }), this.agent = b(e.voice_agent_details.type), this.agent.on("audio", (t) => this.session?.sendAudio(t)), this.agent.on("turnEnd", () => this.session?.endAudioTurn()), this.agent.on("interrupted", () => {
733
696
  this.session?.endAudioTurn(), this.session?.interrupt();
734
697
  }), this.agent.on("closed", () => {
735
698
  this.mounted && this.callbacks.onDisconnect?.();
736
- }), this.agent instanceof S && this.agent.on("emotion", (t) => this.session?.setEmotion(t)), await this.session.connect();
699
+ }), this.agent.on("emotion", (t) => this.session?.setEmotion(t)), await this.session.connect();
737
700
  }
738
701
  async initMicrophone() {
739
702
  this.stream = await navigator.mediaDevices.getUserMedia({
@@ -742,37 +705,32 @@ class $ {
742
705
  const e = this.audioContext.createMediaStreamSource(this.stream);
743
706
  this.processor = this.audioContext.createScriptProcessor(4096, 1, 1), this.processor.onaudioprocess = (t) => {
744
707
  if (!this._isMuted) {
745
- const s = w(t.inputBuffer.getChannelData(0));
746
- this.agent?.sendAudio(s);
708
+ const n = v(t.inputBuffer.getChannelData(0));
709
+ this.agent?.sendAudio(n);
747
710
  }
748
711
  }, e.connect(this.processor), this.processor.connect(this.audioContext.destination);
749
712
  }
750
713
  async connectAgent(e) {
751
714
  if (!this.agent) return;
752
715
  const t = { inputSampleRate: 16e3 };
753
- e.type === "gemini" ? await this.agent.connect({
754
- ...t,
755
- apiKey: e.token,
756
- authType: "ephemeral_token"
757
- }) : e.type === "elevenlabs" ? await this.agent.connect({
716
+ e.type === "elevenlabs" ? await this.agent.connect({
758
717
  ...t,
759
718
  agentId: e.agent_id,
760
719
  signedUrl: e.signed_url
761
- }) : e.type === "cartesia" ? await this.agent.connect({
762
- ...t,
763
- agentId: e.agent_id,
764
- apiKey: e.token
765
- }) : e.type === "vapi" && await this.agent.connect({
720
+ }) : e.type === "openai" && await this.agent.connect({
766
721
  ...t,
767
- signedUrl: e.signed_url
722
+ apiKey: e.token,
723
+ systemPrompt: e.system_prompt,
724
+ voice: e.voice,
725
+ turnDetection: e.turn_detection
768
726
  });
769
727
  }
770
728
  cleanup() {
771
729
  this.stream?.getTracks().forEach((e) => e.stop()), this.processor?.disconnect(), this.audioContext?.close(), this.agent?.close(), this.session?.close(), this.stream = null, this.processor = null, this.audioContext = null, this.agent = null, this.session = null;
772
730
  }
773
731
  }
774
- const c = /* @__PURE__ */ new Set();
775
- class B {
732
+ const r = /* @__PURE__ */ new Set();
733
+ class $ {
776
734
  voiceAgentDetails;
777
735
  sessionDetails;
778
736
  callbacks;
@@ -816,24 +774,24 @@ class B {
816
774
  }
817
775
  /** Connect to the session */
818
776
  async connect() {
819
- if (c.has(this.connectionId)) {
777
+ if (r.has(this.connectionId)) {
820
778
  console.log("[PersonaView] Connection already in progress, skipping");
821
779
  return;
822
780
  }
823
- c.add(this.connectionId), this.mounted = !0, this.setStatus("connecting");
781
+ r.add(this.connectionId), this.mounted = !0, this.setStatus("connecting");
824
782
  try {
825
783
  if (await this.initSession(), await this.initMicrophone(), await this.connectAgent(), !this.mounted) {
826
- this.cleanup(), c.delete(this.connectionId);
784
+ this.cleanup(), r.delete(this.connectionId);
827
785
  return;
828
786
  }
829
787
  this.setStatus("connected");
830
788
  } catch (e) {
831
- c.delete(this.connectionId), console.error("[PersonaView]", e), this.mounted && (this.setStatus("error"), this.callbacks.onError?.(e));
789
+ r.delete(this.connectionId), console.error("[PersonaView]", e), this.mounted && (this.setStatus("error"), this.callbacks.onError?.(e));
832
790
  }
833
791
  }
834
792
  /** Disconnect and cleanup */
835
793
  disconnect() {
836
- this.mounted = !1, c.delete(this.connectionId), this.cleanup(), this.setStatus("disconnected");
794
+ this.mounted = !1, r.delete(this.connectionId), this.cleanup(), this.setStatus("disconnected");
837
795
  }
838
796
  /** Toggle microphone mute */
839
797
  toggleMute() {
@@ -846,7 +804,7 @@ class B {
846
804
  this._agentState !== e && (this._agentState = e, this.callbacks.onAgentStateChange?.(e));
847
805
  }
848
806
  async initSession() {
849
- this.session = f({
807
+ this.session = _({
850
808
  serverUrl: this.sessionDetails.server_url,
851
809
  participantToken: this.sessionDetails.participant_token,
852
810
  agentIdentity: this.sessionDetails.agent_identity,
@@ -869,11 +827,11 @@ class B {
869
827
  onClose: () => {
870
828
  this.mounted && this.callbacks.onDisconnect?.();
871
829
  }
872
- }), this.agent = y(this.voiceAgentDetails.type), this.agent.on("audio", (e) => this.session?.sendAudio(e)), this.agent.on("turnEnd", () => this.session?.endAudioTurn()), this.agent.on("interrupted", () => {
830
+ }), this.agent = b(this.voiceAgentDetails.type), this.agent.on("audio", (e) => this.session?.sendAudio(e)), this.agent.on("turnEnd", () => this.session?.endAudioTurn()), this.agent.on("interrupted", () => {
873
831
  this.session?.endAudioTurn(), this.session?.interrupt();
874
832
  }), this.agent.on("closed", () => {
875
833
  this.mounted && this.callbacks.onDisconnect?.();
876
- }), this.agent instanceof S && this.agent.on("emotion", (e) => this.session?.setEmotion(e)), await this.session.connect();
834
+ }), this.agent.on("emotion", (e) => this.session?.setEmotion(e)), await this.session.connect();
877
835
  }
878
836
  async initMicrophone() {
879
837
  this.stream = await navigator.mediaDevices.getUserMedia({
@@ -882,29 +840,24 @@ class B {
882
840
  const e = this.audioContext.createMediaStreamSource(this.stream);
883
841
  this.processor = this.audioContext.createScriptProcessor(4096, 1, 1), this.processor.onaudioprocess = (t) => {
884
842
  if (!this._isMuted) {
885
- const s = w(t.inputBuffer.getChannelData(0));
886
- this.agent?.sendAudio(s);
843
+ const n = v(t.inputBuffer.getChannelData(0));
844
+ this.agent?.sendAudio(n);
887
845
  }
888
846
  }, e.connect(this.processor), this.processor.connect(this.audioContext.destination);
889
847
  }
890
848
  async connectAgent() {
891
849
  if (!this.agent) return;
892
850
  const e = this.voiceAgentDetails, t = { inputSampleRate: 16e3 };
893
- e.type === "gemini" ? await this.agent.connect({
894
- ...t,
895
- apiKey: e.token,
896
- authType: "ephemeral_token"
897
- }) : e.type === "elevenlabs" ? await this.agent.connect({
851
+ e.type === "elevenlabs" ? await this.agent.connect({
898
852
  ...t,
899
853
  agentId: e.agent_id,
900
854
  signedUrl: e.signed_url
901
- }) : e.type === "cartesia" ? await this.agent.connect({
855
+ }) : e.type === "openai" && await this.agent.connect({
902
856
  ...t,
903
- agentId: e.agent_id,
904
- apiKey: e.token
905
- }) : e.type === "vapi" && await this.agent.connect({
906
- ...t,
907
- signedUrl: e.signed_url
857
+ apiKey: e.token,
858
+ systemPrompt: e.system_prompt,
859
+ voice: e.voice,
860
+ turnDetection: e.turn_detection
908
861
  });
909
862
  }
910
863
  cleanup() {
@@ -912,20 +865,19 @@ class B {
912
865
  }
913
866
  }
914
867
  export {
915
- N as AGENT_REGISTRY,
916
- u as BaseAgent,
917
- D as CartesiaAgent,
918
- S as ElevenLabsAgent,
919
- R as GeminiLiveAgent,
920
- L as KeyframeApiError,
921
- $ as PersonaEmbed,
922
- B as PersonaView,
923
- o as SAMPLE_RATE,
924
- m as base64ToBytes,
925
- g as bytesToBase64,
926
- y as createAgent,
927
- E as createEventEmitter,
928
- w as floatTo16BitPCM,
929
- F as getAgentInfo,
930
- h as resamplePcm
868
+ D as AGENT_REGISTRY,
869
+ y as BaseAgent,
870
+ R as ElevenLabsAgent,
871
+ F as KeyframeApiError,
872
+ P as OpenAIRealtimeAgent,
873
+ L as PersonaEmbed,
874
+ $ as PersonaView,
875
+ u as SAMPLE_RATE,
876
+ f as base64ToBytes,
877
+ S as bytesToBase64,
878
+ b as createAgent,
879
+ C as createEventEmitter,
880
+ v as floatTo16BitPCM,
881
+ N as getAgentInfo,
882
+ p as resamplePcm
931
883
  };