@keyframelabs/elements 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,6 +20,10 @@ export declare class ElevenLabsAgent extends BaseAgent {
20
20
  private sourceInputSampleRate;
21
21
  private initialized;
22
22
  private lastInterruptId;
23
+ private agentResponseReceived;
24
+ private turnStartTime;
25
+ private accumulatedDurationMs;
26
+ private turnEndTimer;
23
27
  connect(config: ElevenLabsConfig): Promise<void>;
24
28
  protected handleParsedMessage(message: unknown): void;
25
29
  private handleInitMetadata;
@@ -27,6 +31,13 @@ export declare class ElevenLabsAgent extends BaseAgent {
27
31
  private handleAudio;
28
32
  private handleUserTranscript;
29
33
  private handleAgentResponse;
34
+ /**
35
+ * Schedule a timer to emit turnEnd when the virtual audio buffer
36
+ * "would have" finished playing. Replicates the ElevenLabs SDK's
37
+ * AudioWorklet buffer-empty detection without requiring local playback.
38
+ */
39
+ private scheduleVirtualBufferCheck;
40
+ private resetTurnState;
30
41
  private handleClientToolCall;
31
42
  private handleInterruption;
32
43
  sendAudio(pcmData: Uint8Array): void;
package/dist/index.js CHANGED
@@ -1,12 +1,12 @@
1
1
  import { createClient as f } from "@keyframelabs/sdk";
2
- const l = 24e3;
3
- function g(i) {
2
+ const o = 24e3;
3
+ function m(i) {
4
4
  const e = atob(i), t = new Uint8Array(e.length);
5
5
  for (let s = 0; s < e.length; s++)
6
6
  t[s] = e.charCodeAt(s);
7
7
  return t;
8
8
  }
9
- function m(i) {
9
+ function g(i) {
10
10
  let e = "";
11
11
  for (let t = 0; t < i.length; t++)
12
12
  e += String.fromCharCode(i[t]);
@@ -16,9 +16,9 @@ function h(i, e, t) {
16
16
  if (e === t)
17
17
  return i;
18
18
  const s = new Int16Array(i.buffer, i.byteOffset, i.length / 2), n = e / t, a = Math.floor(s.length / n), d = new Int16Array(a);
19
- for (let o = 0; o < a; o++) {
20
- const S = o * n, p = Math.floor(S), b = Math.min(p + 1, s.length - 1), v = S - p;
21
- d[o] = Math.round(
19
+ for (let r = 0; r < a; r++) {
20
+ const _ = r * n, p = Math.floor(_), b = Math.min(p + 1, s.length - 1), v = _ - p;
21
+ d[r] = Math.round(
22
22
  s[p] * (1 - v) + s[b] * v
23
23
  );
24
24
  }
@@ -113,7 +113,7 @@ class u {
113
113
  this.events.emit("closed", { code: e, reason: t });
114
114
  }
115
115
  }
116
- const A = "gemini-2.5-flash-native-audio-preview-12-2025", I = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent", k = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained";
116
+ const k = "gemini-2.5-flash-native-audio-preview-12-2025", A = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent", I = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained";
117
117
  class R extends u {
118
118
  agentName = "GeminiLive";
119
119
  async connect(e) {
@@ -122,10 +122,10 @@ class R extends u {
122
122
  if (!e.apiKey)
123
123
  throw new Error("Gemini API key is required");
124
124
  e.inputSampleRate && (this.inputSampleRate = e.inputSampleRate);
125
- const t = e.model ?? A, n = (e.authType ?? "api_key") === "ephemeral_token" ? `${k}?access_token=${encodeURIComponent(e.apiKey)}` : `${I}?key=${encodeURIComponent(e.apiKey)}`;
125
+ const t = e.model ?? k, n = (e.authType ?? "api_key") === "ephemeral_token" ? `${I}?access_token=${encodeURIComponent(e.apiKey)}` : `${A}?key=${encodeURIComponent(e.apiKey)}`;
126
126
  return new Promise((a, d) => {
127
127
  this.ws = new WebSocket(n), this.ws.onopen = () => {
128
- const o = {
128
+ const r = {
129
129
  setup: {
130
130
  model: `models/${t}`,
131
131
  generationConfig: {
@@ -134,13 +134,13 @@ class R extends u {
134
134
  systemInstruction: e.systemPrompt ? { parts: [{ text: e.systemPrompt }] } : void 0
135
135
  }
136
136
  };
137
- this.ws.send(JSON.stringify(o)), this.setState("listening"), a();
137
+ this.ws.send(JSON.stringify(r)), this.setState("listening"), a();
138
138
  }, this.ws.onerror = () => {
139
139
  d(new Error("Failed to connect to Gemini Live"));
140
- }, this.ws.onclose = (o) => {
141
- this.ws = null, this.setState("idle"), this.emitClosed(o.code, o.reason);
142
- }, this.ws.onmessage = (o) => {
143
- this.handleMessage(o.data);
140
+ }, this.ws.onclose = (r) => {
141
+ this.ws = null, this.setState("idle"), this.emitClosed(r.code, r.reason);
142
+ }, this.ws.onmessage = (r) => {
143
+ this.handleMessage(r.data);
144
144
  };
145
145
  });
146
146
  }
@@ -159,7 +159,7 @@ class R extends u {
159
159
  this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking"));
160
160
  for (const n of s.modelTurn.parts) {
161
161
  if (n.inlineData?.data) {
162
- const a = g(n.inlineData.data);
162
+ const a = m(n.inlineData.data);
163
163
  this.events.emit("audio", a);
164
164
  }
165
165
  n.text && this.events.emit("transcript", {
@@ -181,7 +181,7 @@ class R extends u {
181
181
  mediaChunks: [
182
182
  {
183
183
  mimeType: `audio/pcm;rate=${this.inputSampleRate}`,
184
- data: m(e)
184
+ data: g(e)
185
185
  }
186
186
  ]
187
187
  }
@@ -190,7 +190,7 @@ class R extends u {
190
190
  }
191
191
  }
192
192
  const M = ["neutral", "angry", "sad", "happy"], T = "wss://api.elevenlabs.io/v1/convai/conversation";
193
- class _ extends u {
193
+ class S extends u {
194
194
  agentName = "ElevenLabs";
195
195
  outputSampleRate = 24e3;
196
196
  // Default, updated from metadata
@@ -202,6 +202,12 @@ class _ extends u {
202
202
  // True after conversation_initiation_metadata received
203
203
  lastInterruptId = 0;
204
204
  // Track interruptions to filter stale audio
205
+ // Virtual buffer turn-end detection: track audio duration and emit turnEnd
206
+ // when agent_response has arrived and all audio "would have" finished playing.
207
+ agentResponseReceived = !1;
208
+ turnStartTime = 0;
209
+ accumulatedDurationMs = 0;
210
+ turnEndTimer = null;
205
211
  async connect(e) {
206
212
  if (this.ws)
207
213
  throw new Error("Already connected");
@@ -275,8 +281,12 @@ class _ extends u {
275
281
  if (!t?.audio_base_64 || (t.event_id ?? 0) <= this.lastInterruptId)
276
282
  return;
277
283
  this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking"));
278
- let n = g(t.audio_base_64);
279
- this.outputSampleRate !== l && (n = h(n, this.outputSampleRate, l)), this.events.emit("audio", n);
284
+ let n = m(t.audio_base_64);
285
+ this.outputSampleRate !== o && (n = h(n, this.outputSampleRate, o)), this.events.emit("audio", n);
286
+ const a = n.length / 2 / o * 1e3;
287
+ this.turnStartTime === 0 && (this.turnStartTime = Date.now()), this.accumulatedDurationMs += a, console.debug(
288
+ `[ElevenLabs] audio chunk: ${n.length} bytes, +${a.toFixed(0)}ms, totalDuration=${this.accumulatedDurationMs.toFixed(0)}ms, agentResponse=${this.agentResponseReceived}`
289
+ ), this.scheduleVirtualBufferCheck();
280
290
  }
281
291
  handleUserTranscript(e) {
282
292
  const t = e.user_transcription_event;
@@ -288,11 +298,31 @@ class _ extends u {
288
298
  }
289
299
  handleAgentResponse(e) {
290
300
  const t = e.agent_response_event;
291
- t?.agent_response && (this.events.emit("turnEnd", void 0), this.setState("listening"), this.events.emit("transcript", {
301
+ t?.agent_response && (this.events.emit("transcript", {
292
302
  role: "assistant",
293
303
  text: t.agent_response,
294
304
  isFinal: !0
295
- }));
305
+ }), this.agentResponseReceived = !0, console.debug(
306
+ `[ElevenLabs] agent_response received: totalDuration=${this.accumulatedDurationMs.toFixed(0)}ms, text="${t.agent_response.slice(0, 60)}${t.agent_response.length > 60 ? "..." : ""}"`
307
+ ), this.scheduleVirtualBufferCheck());
308
+ }
309
+ /**
310
+ * Schedule a timer to emit turnEnd when the virtual audio buffer
311
+ * "would have" finished playing. Replicates the ElevenLabs SDK's
312
+ * AudioWorklet buffer-empty detection without requiring local playback.
313
+ */
314
+ scheduleVirtualBufferCheck() {
315
+ if (!this.agentResponseReceived || this.turnStartTime === 0) return;
316
+ this.turnEndTimer !== null && clearTimeout(this.turnEndTimer);
317
+ const e = Date.now() - this.turnStartTime, t = Math.max(0, this.accumulatedDurationMs - e);
318
+ console.debug(
319
+ `[ElevenLabs] virtual buffer: elapsed=${e.toFixed(0)}ms, accumulated=${this.accumulatedDurationMs.toFixed(0)}ms, remaining=${t.toFixed(0)}ms`
320
+ ), this.turnEndTimer = setTimeout(() => {
321
+ this.turnEndTimer = null, console.debug("[ElevenLabs] virtual buffer drained, emitting turnEnd"), this.resetTurnState(), this.events.emit("turnEnd", void 0), this.setState("listening");
322
+ }, t);
323
+ }
324
+ resetTurnState() {
325
+ this.agentResponseReceived = !1, this.turnStartTime = 0, this.accumulatedDurationMs = 0, this.turnEndTimer !== null && (clearTimeout(this.turnEndTimer), this.turnEndTimer = null);
296
326
  }
297
327
  handleClientToolCall(e) {
298
328
  const t = e.client_tool_call;
@@ -311,14 +341,16 @@ class _ extends u {
311
341
  }
312
342
  handleInterruption(e) {
313
343
  const t = e.interruption_event;
314
- t?.event_id && (this.lastInterruptId = t.event_id), this.events.emit("interrupted", void 0), this.setState("listening");
344
+ t?.event_id && (this.lastInterruptId = t.event_id), (this.agentResponseReceived || this.accumulatedDurationMs > 0) && console.debug(
345
+ `[ElevenLabs] interruption: discarding pending turn (duration=${this.accumulatedDurationMs.toFixed(0)}ms, agentResponse=${this.agentResponseReceived})`
346
+ ), this.resetTurnState(), this.events.emit("interrupted", void 0), this.setState("listening");
315
347
  }
316
348
  sendAudio(e) {
317
349
  if (!this.ws || this.ws.readyState !== WebSocket.OPEN || !this.initialized)
318
350
  return;
319
351
  let t = e;
320
352
  this.sourceInputSampleRate !== this.expectedInputSampleRate && (t = h(e, this.sourceInputSampleRate, this.expectedInputSampleRate)), this.ws.send(JSON.stringify({
321
- user_audio_chunk: m(t)
353
+ user_audio_chunk: g(t)
322
354
  }));
323
355
  }
324
356
  /**
@@ -348,11 +380,11 @@ class _ extends u {
348
380
  }));
349
381
  }
350
382
  close() {
351
- this.initialized = !1, this.lastInterruptId = 0, super.close();
383
+ this.initialized = !1, this.lastInterruptId = 0, this.resetTurnState(), super.close();
352
384
  }
353
385
  }
354
- const P = "wss://api.cartesia.ai/agents/stream", x = "2025-04-16";
355
- class O extends u {
386
+ const x = "wss://api.cartesia.ai/agents/stream", P = "2025-04-16";
387
+ class D extends u {
356
388
  agentName = "Cartesia";
357
389
  // Audio configuration
358
390
  cartesiaInputFormat = "pcm_16000";
@@ -371,7 +403,7 @@ class O extends u {
371
403
  if (!e.apiKey)
372
404
  throw new Error("Cartesia API Key is required");
373
405
  e.inputSampleRate && (this.inputSampleRate = e.inputSampleRate), this.inputSampleRate === 16e3 ? this.cartesiaInputFormat = "pcm_16000" : this.inputSampleRate === 24e3 ? this.cartesiaInputFormat = "pcm_24000" : this.inputSampleRate === 44100 ? this.cartesiaInputFormat = "pcm_44100" : this.cartesiaInputFormat = "pcm_16000";
374
- const t = `${P}/${e.agentId}?api_key=${e.apiKey}&cartesia_version=${x}`;
406
+ const t = `${x}/${e.agentId}?api_key=${e.apiKey}&cartesia_version=${P}`;
375
407
  return new Promise((s, n) => {
376
408
  this.ws = new WebSocket(t), this.ws.onopen = () => {
377
409
  this.sendStartEvent(), this.startHeartbeat(), s();
@@ -433,8 +465,8 @@ class O extends u {
433
465
  handleMediaOutput(e) {
434
466
  if (!e.media?.payload) return;
435
467
  this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking"));
436
- let t = g(e.media.payload);
437
- this.cartesiaOutputRate !== l && (t = h(t, this.cartesiaOutputRate, l)), this.events.emit("audio", t);
468
+ let t = m(e.media.payload);
469
+ this.cartesiaOutputRate !== o && (t = h(t, this.cartesiaOutputRate, o)), this.events.emit("audio", t);
438
470
  }
439
471
  handleClear() {
440
472
  this.events.emit("interrupted", void 0), this.setState("listening");
@@ -448,7 +480,7 @@ class O extends u {
448
480
  event: "media_input",
449
481
  stream_id: this.streamId,
450
482
  media: {
451
- payload: m(t)
483
+ payload: g(t)
452
484
  }
453
485
  }));
454
486
  }
@@ -456,7 +488,7 @@ class O extends u {
456
488
  this.stopHeartbeat(), this.isReady = !1, this.streamId = null, super.close();
457
489
  }
458
490
  }
459
- class N extends u {
491
+ class O extends u {
460
492
  agentName = "Vapi";
461
493
  // Audio configuration - Vapi uses 16kHz PCM by default
462
494
  vapiSampleRate = 16e3;
@@ -483,7 +515,7 @@ class N extends u {
483
515
  */
484
516
  handleBinaryAudio(e) {
485
517
  this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking"));
486
- const t = new Uint8Array(e), s = this.vapiSampleRate !== l ? h(t, this.vapiSampleRate, l) : t;
518
+ const t = new Uint8Array(e), s = this.vapiSampleRate !== o ? h(t, this.vapiSampleRate, o) : t;
487
519
  this.events.emit("audio", s);
488
520
  }
489
521
  handleParsedMessage(e) {
@@ -535,7 +567,7 @@ class N extends u {
535
567
  this.hangup(), super.close();
536
568
  }
537
569
  }
538
- const U = [
570
+ const N = [
539
571
  { id: "gemini", name: "Gemini Live", description: "Google Gemini Live API" },
540
572
  { id: "elevenlabs", name: "ElevenLabs", description: "ElevenLabs Conversational AI" },
541
573
  { id: "cartesia", name: "Cartesia", description: "Cartesia Agents API" },
@@ -546,19 +578,19 @@ function y(i) {
546
578
  case "gemini":
547
579
  return new R();
548
580
  case "elevenlabs":
549
- return new _();
581
+ return new S();
550
582
  case "cartesia":
551
- return new O();
583
+ return new D();
552
584
  case "vapi":
553
- return new N();
585
+ return new O();
554
586
  default:
555
587
  throw new Error(`Unknown agent type: ${i}`);
556
588
  }
557
589
  }
558
590
  function F(i) {
559
- return U.find((e) => e.id === i);
591
+ return N.find((e) => e.id === i);
560
592
  }
561
- class D extends Error {
593
+ class L extends Error {
562
594
  status;
563
595
  payload;
564
596
  url;
@@ -566,8 +598,8 @@ class D extends Error {
566
598
  super(e.message), this.name = "ApiError", this.status = e.status, this.payload = e.payload, this.url = e.url;
567
599
  }
568
600
  }
569
- const r = /* @__PURE__ */ new Set();
570
- class K {
601
+ const l = /* @__PURE__ */ new Set();
602
+ class $ {
571
603
  apiBaseUrl;
572
604
  publishableKey;
573
605
  callbacks;
@@ -611,31 +643,31 @@ class K {
611
643
  }
612
644
  /** Connect to the embed session */
613
645
  async connect() {
614
- if (r.has(this.publishableKey)) {
646
+ if (l.has(this.publishableKey)) {
615
647
  console.log("[PersonaEmbed] Connection already in progress, skipping");
616
648
  return;
617
649
  }
618
- r.add(this.publishableKey), this.mounted = !0, this.abortController = new AbortController(), this.setStatus("connecting");
650
+ l.add(this.publishableKey), this.mounted = !0, this.abortController = new AbortController(), this.setStatus("connecting");
619
651
  try {
620
652
  const e = await this.fetchSession(this.abortController.signal);
621
653
  if (!this.mounted) {
622
- r.delete(this.publishableKey);
654
+ l.delete(this.publishableKey);
623
655
  return;
624
656
  }
625
657
  if (await this.initSession(e), await this.initMicrophone(), await this.connectAgent(e.voice_agent_details), !this.mounted) {
626
- this.cleanup(), r.delete(this.publishableKey);
658
+ this.cleanup(), l.delete(this.publishableKey);
627
659
  return;
628
660
  }
629
661
  this.setStatus("connected");
630
662
  } catch (e) {
631
- if (r.delete(this.publishableKey), e instanceof Error && e.name === "AbortError")
663
+ if (l.delete(this.publishableKey), e instanceof Error && e.name === "AbortError")
632
664
  return;
633
665
  console.error("[PersonaEmbed]", e), this.mounted && (this.setStatus("error"), this.callbacks.onError?.(e));
634
666
  }
635
667
  }
636
668
  /** Disconnect and cleanup */
637
669
  disconnect() {
638
- this.mounted = !1, this.abortController?.abort(), this.abortController = null, r.delete(this.publishableKey), this.cleanup(), this.setStatus("disconnected");
670
+ this.mounted = !1, this.abortController?.abort(), this.abortController = null, l.delete(this.publishableKey), this.cleanup(), this.setStatus("disconnected");
639
671
  }
640
672
  /** Toggle microphone mute */
641
673
  toggleMute() {
@@ -660,7 +692,7 @@ class K {
660
692
  s = await t.json();
661
693
  } catch {
662
694
  }
663
- throw new D({
695
+ throw new L({
664
696
  message: s?.message ?? "create_session failed",
665
697
  status: t.status,
666
698
  payload: s,
@@ -701,7 +733,7 @@ class K {
701
733
  this.session?.endAudioTurn(), this.session?.interrupt();
702
734
  }), this.agent.on("closed", () => {
703
735
  this.mounted && this.callbacks.onDisconnect?.();
704
- }), this.agent instanceof _ && this.agent.on("emotion", (t) => this.session?.setEmotion(t)), await this.session.connect();
736
+ }), this.agent instanceof S && this.agent.on("emotion", (t) => this.session?.setEmotion(t)), await this.session.connect();
705
737
  }
706
738
  async initMicrophone() {
707
739
  this.stream = await navigator.mediaDevices.getUserMedia({
@@ -841,7 +873,7 @@ class B {
841
873
  this.session?.endAudioTurn(), this.session?.interrupt();
842
874
  }), this.agent.on("closed", () => {
843
875
  this.mounted && this.callbacks.onDisconnect?.();
844
- }), this.agent instanceof _ && this.agent.on("emotion", (e) => this.session?.setEmotion(e)), await this.session.connect();
876
+ }), this.agent instanceof S && this.agent.on("emotion", (e) => this.session?.setEmotion(e)), await this.session.connect();
845
877
  }
846
878
  async initMicrophone() {
847
879
  this.stream = await navigator.mediaDevices.getUserMedia({
@@ -880,17 +912,17 @@ class B {
880
912
  }
881
913
  }
882
914
  export {
883
- U as AGENT_REGISTRY,
915
+ N as AGENT_REGISTRY,
884
916
  u as BaseAgent,
885
- O as CartesiaAgent,
886
- _ as ElevenLabsAgent,
917
+ D as CartesiaAgent,
918
+ S as ElevenLabsAgent,
887
919
  R as GeminiLiveAgent,
888
- D as KeyframeApiError,
889
- K as PersonaEmbed,
920
+ L as KeyframeApiError,
921
+ $ as PersonaEmbed,
890
922
  B as PersonaView,
891
- l as SAMPLE_RATE,
892
- g as base64ToBytes,
893
- m as bytesToBase64,
923
+ o as SAMPLE_RATE,
924
+ m as base64ToBytes,
925
+ g as bytesToBase64,
894
926
  y as createAgent,
895
927
  E as createEventEmitter,
896
928
  w as floatTo16BitPCM,
package/package.json CHANGED
@@ -4,7 +4,7 @@
4
4
  "publishConfig": {
5
5
  "access": "public"
6
6
  },
7
- "version": "0.2.0",
7
+ "version": "0.2.1",
8
8
  "type": "module",
9
9
  "main": "./dist/index.js",
10
10
  "types": "./dist/index.d.ts",