@keyframelabs/elements 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -81,19 +81,27 @@ For `PersonaView`, this is determined by `voiceAgentDetails`.
81
81
 
82
82
  The avatar can display emotional expressions (`neutral`, `angry`, `sad`, `happy`) that affect its facial expression and demeanor.
83
83
 
84
- ### Automatic Emotion Detection (ElevenLabs)
84
+ ### ElevenLabs: `set_emotion` Tool Call
85
85
 
86
- When using ElevenLabs as the voice agent, emotions are automatically detected from the agent's speech. The ElevenLabs agent parses emotion tags from audio alignment data (e.g., `[angry]`, `[happy]`) and the avatar expression updates in real-time.
86
+ When using ElevenLabs as the voice agent, emotions are driven by a **client tool call** named `set_emotion`. The ElevenLabs agent parses incoming `client_tool_call` WebSocket messages and, when the tool name is `set_emotion`, updates the avatar's expression accordingly.
87
87
 
88
- This requires no additional configuration—just configure your ElevenLabs agent to include emotion tags in its responses.
88
+ > **Important:** Transcripts from the ElevenLabs agent are **not** automatically consumed. The `transcript` event is emitted, but it is up to you to subscribe to it if you need transcript data.
89
+
90
+ #### Setup
91
+
92
+ You must create a `set_emotion` client tool for your agent via the [ElevenLabs API](https://elevenlabs.io/docs). The tool should accept a single parameter:
93
+
94
+ | Parameter | Type | Description |
95
+ | --------- | -------- | -------------------------------------------------------- |
96
+ | `emotion` | `enum` | One of `neutral`, `angry`, `sad`, `happy`. |
97
+
98
+ Then instruct your agent (via its system prompt) to call `set_emotion` on each turn with the appropriate emotion. The client library handles the rest — it validates the emotion, emits an `emotion` event, and sends a `client_tool_result` back to ElevenLabs.
89
99
 
90
100
  ### Manual Emotion Control
91
101
 
92
102
  For other agents or custom emotion logic, you can access the underlying session to set emotions manually:
93
103
 
94
104
  ```typescript
95
- // Access the underlying SDK session for manual control
96
- // (Available when using @keyframelabs/sdk directly)
97
105
  import { createClient } from '@keyframelabs/sdk';
98
106
 
99
107
  const session = createClient({ ... });
@@ -102,15 +110,15 @@ await session.setEmotion('happy');
102
110
 
103
111
  ### Agent Events
104
112
 
105
- The `emotion` event is emitted when an agent detects an emotion change:
113
+ The `emotion` event is emitted when the agent triggers a `set_emotion` tool call:
106
114
 
107
115
  ```typescript
108
116
  agent.on('emotion', (emotion) => {
109
- console.log('Emotion detected:', emotion); // 'neutral' | 'angry' | 'sad' | 'happy'
117
+ console.log('Emotion changed:', emotion); // 'neutral' | 'angry' | 'sad' | 'happy'
110
118
  });
111
119
  ```
112
120
 
113
- Currently, only the ElevenLabs agent emits emotion events.
121
+ Currently, only the ElevenLabs agent emits emotion events via tool calls.
114
122
 
115
123
  ## API
116
124
 
@@ -20,7 +20,6 @@ export declare class ElevenLabsAgent extends BaseAgent {
20
20
  private sourceInputSampleRate;
21
21
  private initialized;
22
22
  private lastInterruptId;
23
- private emotionEmittedForEventId;
24
23
  connect(config: ElevenLabsConfig): Promise<void>;
25
24
  protected handleParsedMessage(message: unknown): void;
26
25
  private handleInitMetadata;
@@ -28,6 +27,7 @@ export declare class ElevenLabsAgent extends BaseAgent {
28
27
  private handleAudio;
29
28
  private handleUserTranscript;
30
29
  private handleAgentResponse;
30
+ private handleClientToolCall;
31
31
  private handleInterruption;
32
32
  sendAudio(pcmData: Uint8Array): void;
33
33
  /**
package/dist/index.js CHANGED
@@ -12,17 +12,17 @@ function m(i) {
12
12
  e += String.fromCharCode(i[t]);
13
13
  return btoa(e);
14
14
  }
15
- function c(i, e, t) {
15
+ function h(i, e, t) {
16
16
  if (e === t)
17
17
  return i;
18
- const s = new Int16Array(i.buffer, i.byteOffset, i.length / 2), n = e / t, a = Math.floor(s.length / n), r = new Int16Array(a);
18
+ const s = new Int16Array(i.buffer, i.byteOffset, i.length / 2), n = e / t, a = Math.floor(s.length / n), d = new Int16Array(a);
19
19
  for (let o = 0; o < a; o++) {
20
- const _ = o * n, p = Math.floor(_), b = Math.min(p + 1, s.length - 1), v = _ - p;
21
- r[o] = Math.round(
20
+ const S = o * n, p = Math.floor(S), b = Math.min(p + 1, s.length - 1), v = S - p;
21
+ d[o] = Math.round(
22
22
  s[p] * (1 - v) + s[b] * v
23
23
  );
24
24
  }
25
- return new Uint8Array(r.buffer);
25
+ return new Uint8Array(d.buffer);
26
26
  }
27
27
  function E() {
28
28
  const i = /* @__PURE__ */ new Map();
@@ -49,12 +49,12 @@ function w(i) {
49
49
  }
50
50
  return new Uint8Array(e.buffer);
51
51
  }
52
- const I = 16e3;
52
+ const C = 16e3;
53
53
  class u {
54
54
  ws = null;
55
55
  _state = "idle";
56
56
  events = E();
57
- inputSampleRate = I;
57
+ inputSampleRate = C;
58
58
  /** Current agent state */
59
59
  get state() {
60
60
  return this._state;
@@ -113,7 +113,7 @@ class u {
113
113
  this.events.emit("closed", { code: e, reason: t });
114
114
  }
115
115
  }
116
- const A = "gemini-2.5-flash-native-audio-preview-12-2025", C = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent", k = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained";
116
+ const A = "gemini-2.5-flash-native-audio-preview-12-2025", I = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent", k = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained";
117
117
  class R extends u {
118
118
  agentName = "GeminiLive";
119
119
  async connect(e) {
@@ -122,8 +122,8 @@ class R extends u {
122
122
  if (!e.apiKey)
123
123
  throw new Error("Gemini API key is required");
124
124
  e.inputSampleRate && (this.inputSampleRate = e.inputSampleRate);
125
- const t = e.model ?? A, n = (e.authType ?? "api_key") === "ephemeral_token" ? `${k}?access_token=${encodeURIComponent(e.apiKey)}` : `${C}?key=${encodeURIComponent(e.apiKey)}`;
126
- return new Promise((a, r) => {
125
+ const t = e.model ?? A, n = (e.authType ?? "api_key") === "ephemeral_token" ? `${k}?access_token=${encodeURIComponent(e.apiKey)}` : `${I}?key=${encodeURIComponent(e.apiKey)}`;
126
+ return new Promise((a, d) => {
127
127
  this.ws = new WebSocket(n), this.ws.onopen = () => {
128
128
  const o = {
129
129
  setup: {
@@ -136,7 +136,7 @@ class R extends u {
136
136
  };
137
137
  this.ws.send(JSON.stringify(o)), this.setState("listening"), a();
138
138
  }, this.ws.onerror = () => {
139
- r(new Error("Failed to connect to Gemini Live"));
139
+ d(new Error("Failed to connect to Gemini Live"));
140
140
  }, this.ws.onclose = (o) => {
141
141
  this.ws = null, this.setState("idle"), this.emitClosed(o.code, o.reason);
142
142
  }, this.ws.onmessage = (o) => {
@@ -189,8 +189,8 @@ class R extends u {
189
189
  this.ws.send(JSON.stringify(t));
190
190
  }
191
191
  }
192
- const M = ["neutral", "angry", "sad", "happy"], x = "wss://api.elevenlabs.io/v1/convai/conversation";
193
- class S extends u {
192
+ const M = ["neutral", "angry", "sad", "happy"], T = "wss://api.elevenlabs.io/v1/convai/conversation";
193
+ class _ extends u {
194
194
  agentName = "ElevenLabs";
195
195
  outputSampleRate = 24e3;
196
196
  // Default, updated from metadata
@@ -202,8 +202,6 @@ class S extends u {
202
202
  // True after conversation_initiation_metadata received
203
203
  lastInterruptId = 0;
204
204
  // Track interruptions to filter stale audio
205
- emotionEmittedForEventId = -1;
206
- // Track which turn's emotion we've already emitted
207
205
  async connect(e) {
208
206
  if (this.ws)
209
207
  throw new Error("Already connected");
@@ -211,7 +209,7 @@ class S extends u {
211
209
  throw new Error("ElevenLabs agent ID or signed URL is required");
212
210
  e.inputSampleRate && (this.sourceInputSampleRate = e.inputSampleRate);
213
211
  let t;
214
- return e.signedUrl ? t = e.signedUrl : (t = `${x}?agent_id=${e.agentId}`, e.apiKey && (t += `&xi-api-key=${e.apiKey}`)), new Promise((s, n) => {
212
+ return e.signedUrl ? t = e.signedUrl : (t = `${T}?agent_id=${e.agentId}`, e.apiKey && (t += `&xi-api-key=${e.apiKey}`)), new Promise((s, n) => {
215
213
  this.ws = new WebSocket(t), this.ws.onopen = () => {
216
214
  this.setState("listening"), s();
217
215
  }, this.ws.onerror = () => {
@@ -244,6 +242,9 @@ class S extends u {
244
242
  case "interruption":
245
243
  this.handleInterruption(t);
246
244
  break;
245
+ case "client_tool_call":
246
+ this.handleClientToolCall(t);
247
+ break;
247
248
  case "agent_response_correction":
248
249
  this.setState("listening");
249
250
  break;
@@ -271,19 +272,11 @@ class S extends u {
271
272
  }
272
273
  handleAudio(e) {
273
274
  const t = e.audio_event;
274
- if (!t?.audio_base_64) return;
275
- const s = t.event_id ?? 0;
276
- if (s <= this.lastInterruptId)
275
+ if (!t?.audio_base_64 || (t.event_id ?? 0) <= this.lastInterruptId)
277
276
  return;
278
- if (this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking")), this.emotionEmittedForEventId !== s && t.alignment?.chars) {
279
- const r = t.alignment.chars.join("").match(/\[(\w+)\]/);
280
- if (r) {
281
- const o = r[1].toLowerCase();
282
- M.includes(o) && (this.events.emit("emotion", o), this.emotionEmittedForEventId = s);
283
- }
284
- }
277
+ this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking"));
285
278
  let n = g(t.audio_base_64);
286
- this.outputSampleRate !== l && (n = c(n, this.outputSampleRate, l)), this.events.emit("audio", n);
279
+ this.outputSampleRate !== l && (n = h(n, this.outputSampleRate, l)), this.events.emit("audio", n);
287
280
  }
288
281
  handleUserTranscript(e) {
289
282
  const t = e.user_transcription_event;
@@ -301,6 +294,21 @@ class S extends u {
301
294
  isFinal: !0
302
295
  }));
303
296
  }
297
+ handleClientToolCall(e) {
298
+ const t = e.client_tool_call;
299
+ if (t) {
300
+ if (t.tool_name === "set_emotion") {
301
+ const s = t.parameters?.emotion?.toLowerCase();
302
+ s && M.includes(s) && this.events.emit("emotion", s);
303
+ }
304
+ this.ws && this.ws.readyState === WebSocket.OPEN && this.ws.send(JSON.stringify({
305
+ type: "client_tool_result",
306
+ tool_call_id: t.tool_call_id,
307
+ result: "ok",
308
+ is_error: !1
309
+ }));
310
+ }
311
+ }
304
312
  handleInterruption(e) {
305
313
  const t = e.interruption_event;
306
314
  t?.event_id && (this.lastInterruptId = t.event_id), this.events.emit("interrupted", void 0), this.setState("listening");
@@ -309,7 +317,7 @@ class S extends u {
309
317
  if (!this.ws || this.ws.readyState !== WebSocket.OPEN || !this.initialized)
310
318
  return;
311
319
  let t = e;
312
- this.sourceInputSampleRate !== this.expectedInputSampleRate && (t = c(e, this.sourceInputSampleRate, this.expectedInputSampleRate)), this.ws.send(JSON.stringify({
320
+ this.sourceInputSampleRate !== this.expectedInputSampleRate && (t = h(e, this.sourceInputSampleRate, this.expectedInputSampleRate)), this.ws.send(JSON.stringify({
313
321
  user_audio_chunk: m(t)
314
322
  }));
315
323
  }
@@ -343,7 +351,7 @@ class S extends u {
343
351
  this.initialized = !1, this.lastInterruptId = 0, super.close();
344
352
  }
345
353
  }
346
- const P = "wss://api.cartesia.ai/agents/stream", T = "2025-04-16";
354
+ const P = "wss://api.cartesia.ai/agents/stream", x = "2025-04-16";
347
355
  class O extends u {
348
356
  agentName = "Cartesia";
349
357
  // Audio configuration
@@ -363,7 +371,7 @@ class O extends u {
363
371
  if (!e.apiKey)
364
372
  throw new Error("Cartesia API Key is required");
365
373
  e.inputSampleRate && (this.inputSampleRate = e.inputSampleRate), this.inputSampleRate === 16e3 ? this.cartesiaInputFormat = "pcm_16000" : this.inputSampleRate === 24e3 ? this.cartesiaInputFormat = "pcm_24000" : this.inputSampleRate === 44100 ? this.cartesiaInputFormat = "pcm_44100" : this.cartesiaInputFormat = "pcm_16000";
366
- const t = `${P}/${e.agentId}?api_key=${e.apiKey}&cartesia_version=${T}`;
374
+ const t = `${P}/${e.agentId}?api_key=${e.apiKey}&cartesia_version=${x}`;
367
375
  return new Promise((s, n) => {
368
376
  this.ws = new WebSocket(t), this.ws.onopen = () => {
369
377
  this.sendStartEvent(), this.startHeartbeat(), s();
@@ -426,7 +434,7 @@ class O extends u {
426
434
  if (!e.media?.payload) return;
427
435
  this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking"));
428
436
  let t = g(e.media.payload);
429
- this.cartesiaOutputRate !== l && (t = c(t, this.cartesiaOutputRate, l)), this.events.emit("audio", t);
437
+ this.cartesiaOutputRate !== l && (t = h(t, this.cartesiaOutputRate, l)), this.events.emit("audio", t);
430
438
  }
431
439
  handleClear() {
432
440
  this.events.emit("interrupted", void 0), this.setState("listening");
@@ -436,7 +444,7 @@ class O extends u {
436
444
  return;
437
445
  let t = e;
438
446
  const s = parseInt(this.cartesiaInputFormat.split("_")[1]);
439
- this.inputSampleRate !== s && (t = c(e, this.inputSampleRate, s)), this.ws.send(JSON.stringify({
447
+ this.inputSampleRate !== s && (t = h(e, this.inputSampleRate, s)), this.ws.send(JSON.stringify({
440
448
  event: "media_input",
441
449
  stream_id: this.streamId,
442
450
  media: {
@@ -475,7 +483,7 @@ class N extends u {
475
483
  */
476
484
  handleBinaryAudio(e) {
477
485
  this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking"));
478
- const t = new Uint8Array(e), s = this.vapiSampleRate !== l ? c(t, this.vapiSampleRate, l) : t;
486
+ const t = new Uint8Array(e), s = this.vapiSampleRate !== l ? h(t, this.vapiSampleRate, l) : t;
479
487
  this.events.emit("audio", s);
480
488
  }
481
489
  handleParsedMessage(e) {
@@ -515,7 +523,7 @@ class N extends u {
515
523
  if (!this.ws || this.ws.readyState !== WebSocket.OPEN)
516
524
  return;
517
525
  let t = e;
518
- this.inputSampleRate !== this.vapiSampleRate && (t = c(e, this.inputSampleRate, this.vapiSampleRate)), this.ws.send(t.buffer);
526
+ this.inputSampleRate !== this.vapiSampleRate && (t = h(e, this.inputSampleRate, this.vapiSampleRate)), this.ws.send(t.buffer);
519
527
  }
520
528
  /**
521
529
  * Send a control message to end the call.
@@ -538,7 +546,7 @@ function y(i) {
538
546
  case "gemini":
539
547
  return new R();
540
548
  case "elevenlabs":
541
- return new S();
549
+ return new _();
542
550
  case "cartesia":
543
551
  return new O();
544
552
  case "vapi":
@@ -558,7 +566,7 @@ class D extends Error {
558
566
  super(e.message), this.name = "ApiError", this.status = e.status, this.payload = e.payload, this.url = e.url;
559
567
  }
560
568
  }
561
- const h = /* @__PURE__ */ new Set();
569
+ const r = /* @__PURE__ */ new Set();
562
570
  class K {
563
571
  apiBaseUrl;
564
572
  publishableKey;
@@ -603,31 +611,31 @@ class K {
603
611
  }
604
612
  /** Connect to the embed session */
605
613
  async connect() {
606
- if (h.has(this.publishableKey)) {
614
+ if (r.has(this.publishableKey)) {
607
615
  console.log("[PersonaEmbed] Connection already in progress, skipping");
608
616
  return;
609
617
  }
610
- h.add(this.publishableKey), this.mounted = !0, this.abortController = new AbortController(), this.setStatus("connecting");
618
+ r.add(this.publishableKey), this.mounted = !0, this.abortController = new AbortController(), this.setStatus("connecting");
611
619
  try {
612
620
  const e = await this.fetchSession(this.abortController.signal);
613
621
  if (!this.mounted) {
614
- h.delete(this.publishableKey);
622
+ r.delete(this.publishableKey);
615
623
  return;
616
624
  }
617
625
  if (await this.initSession(e), await this.initMicrophone(), await this.connectAgent(e.voice_agent_details), !this.mounted) {
618
- this.cleanup(), h.delete(this.publishableKey);
626
+ this.cleanup(), r.delete(this.publishableKey);
619
627
  return;
620
628
  }
621
629
  this.setStatus("connected");
622
630
  } catch (e) {
623
- if (h.delete(this.publishableKey), e instanceof Error && e.name === "AbortError")
631
+ if (r.delete(this.publishableKey), e instanceof Error && e.name === "AbortError")
624
632
  return;
625
633
  console.error("[PersonaEmbed]", e), this.mounted && (this.setStatus("error"), this.callbacks.onError?.(e));
626
634
  }
627
635
  }
628
636
  /** Disconnect and cleanup */
629
637
  disconnect() {
630
- this.mounted = !1, this.abortController?.abort(), this.abortController = null, h.delete(this.publishableKey), this.cleanup(), this.setStatus("disconnected");
638
+ this.mounted = !1, this.abortController?.abort(), this.abortController = null, r.delete(this.publishableKey), this.cleanup(), this.setStatus("disconnected");
631
639
  }
632
640
  /** Toggle microphone mute */
633
641
  toggleMute() {
@@ -693,7 +701,7 @@ class K {
693
701
  this.session?.endAudioTurn(), this.session?.interrupt();
694
702
  }), this.agent.on("closed", () => {
695
703
  this.mounted && this.callbacks.onDisconnect?.();
696
- }), this.agent instanceof S && this.agent.on("emotion", (t) => this.session?.setEmotion(t)), await this.session.connect();
704
+ }), this.agent instanceof _ && this.agent.on("emotion", (t) => this.session?.setEmotion(t)), await this.session.connect();
697
705
  }
698
706
  async initMicrophone() {
699
707
  this.stream = await navigator.mediaDevices.getUserMedia({
@@ -731,7 +739,7 @@ class K {
731
739
  this.stream?.getTracks().forEach((e) => e.stop()), this.processor?.disconnect(), this.audioContext?.close(), this.agent?.close(), this.session?.close(), this.stream = null, this.processor = null, this.audioContext = null, this.agent = null, this.session = null;
732
740
  }
733
741
  }
734
- const d = /* @__PURE__ */ new Set();
742
+ const c = /* @__PURE__ */ new Set();
735
743
  class B {
736
744
  voiceAgentDetails;
737
745
  sessionDetails;
@@ -776,24 +784,24 @@ class B {
776
784
  }
777
785
  /** Connect to the session */
778
786
  async connect() {
779
- if (d.has(this.connectionId)) {
787
+ if (c.has(this.connectionId)) {
780
788
  console.log("[PersonaView] Connection already in progress, skipping");
781
789
  return;
782
790
  }
783
- d.add(this.connectionId), this.mounted = !0, this.setStatus("connecting");
791
+ c.add(this.connectionId), this.mounted = !0, this.setStatus("connecting");
784
792
  try {
785
793
  if (await this.initSession(), await this.initMicrophone(), await this.connectAgent(), !this.mounted) {
786
- this.cleanup(), d.delete(this.connectionId);
794
+ this.cleanup(), c.delete(this.connectionId);
787
795
  return;
788
796
  }
789
797
  this.setStatus("connected");
790
798
  } catch (e) {
791
- d.delete(this.connectionId), console.error("[PersonaView]", e), this.mounted && (this.setStatus("error"), this.callbacks.onError?.(e));
799
+ c.delete(this.connectionId), console.error("[PersonaView]", e), this.mounted && (this.setStatus("error"), this.callbacks.onError?.(e));
792
800
  }
793
801
  }
794
802
  /** Disconnect and cleanup */
795
803
  disconnect() {
796
- this.mounted = !1, d.delete(this.connectionId), this.cleanup(), this.setStatus("disconnected");
804
+ this.mounted = !1, c.delete(this.connectionId), this.cleanup(), this.setStatus("disconnected");
797
805
  }
798
806
  /** Toggle microphone mute */
799
807
  toggleMute() {
@@ -833,7 +841,7 @@ class B {
833
841
  this.session?.endAudioTurn(), this.session?.interrupt();
834
842
  }), this.agent.on("closed", () => {
835
843
  this.mounted && this.callbacks.onDisconnect?.();
836
- }), this.agent instanceof S && this.agent.on("emotion", (e) => this.session?.setEmotion(e)), await this.session.connect();
844
+ }), this.agent instanceof _ && this.agent.on("emotion", (e) => this.session?.setEmotion(e)), await this.session.connect();
837
845
  }
838
846
  async initMicrophone() {
839
847
  this.stream = await navigator.mediaDevices.getUserMedia({
@@ -875,7 +883,7 @@ export {
875
883
  U as AGENT_REGISTRY,
876
884
  u as BaseAgent,
877
885
  O as CartesiaAgent,
878
- S as ElevenLabsAgent,
886
+ _ as ElevenLabsAgent,
879
887
  R as GeminiLiveAgent,
880
888
  D as KeyframeApiError,
881
889
  K as PersonaEmbed,
@@ -887,5 +895,5 @@ export {
887
895
  E as createEventEmitter,
888
896
  w as floatTo16BitPCM,
889
897
  F as getAgentInfo,
890
- c as resamplePcm
898
+ h as resamplePcm
891
899
  };
package/package.json CHANGED
@@ -4,7 +4,7 @@
4
4
  "publishConfig": {
5
5
  "access": "public"
6
6
  },
7
- "version": "0.1.0",
7
+ "version": "0.2.0",
8
8
  "type": "module",
9
9
  "main": "./dist/index.js",
10
10
  "types": "./dist/index.d.ts",