@keyframelabs/elements 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,73 @@
1
+ import { AgentState } from './agents';
2
+ export type EmbedStatus = 'connecting' | 'connected' | 'error' | 'disconnected';
3
+ export type VideoFit = 'cover' | 'contain';
4
+ export interface PersonaEmbedOptions {
5
+ /** Target container element */
6
+ container: HTMLElement;
7
+ /** Publishable key from your embed config */
8
+ publishableKey: string;
9
+ /** Base API URL to generate embed sessions. Defaults to https://api.keyframelabs.com. */
10
+ apiBaseUrl?: string;
11
+ /** Video fit mode. 'cover' fills container (may crop), 'contain' shows full video (may have black bars). Default: 'cover' */
12
+ videoFit?: VideoFit;
13
+ /** Called when session disconnects */
14
+ onDisconnect?: () => void;
15
+ /** Called on error */
16
+ onError?: (err: Error) => void;
17
+ /** Called when status changes */
18
+ onStateChange?: (status: EmbedStatus) => void;
19
+ /** Called when agent state changes */
20
+ onAgentStateChange?: (state: AgentState) => void;
21
+ }
22
+ /**
23
+ * Headless Persona avatar with voice agent integration.
24
+ *
25
+ * Creates video/audio elements and handles all wiring.
26
+ * UI (overlays, controls, status) is the consumer's responsibility.
27
+ *
28
+ * @example
29
+ * ```ts
30
+ * const embed = new PersonaEmbed({
31
+ * container: document.getElementById('avatar'),
32
+ * publishableKey: 'kfl_pk_live_...',
33
+ * onStateChange: (status) => updateUI(status),
34
+ * });
35
+ * await embed.connect();
36
+ * ```
37
+ */
38
+ export declare class PersonaEmbed {
39
+ private readonly apiBaseUrl;
40
+ private readonly publishableKey;
41
+ private readonly callbacks;
42
+ private readonly _video;
43
+ private readonly _audio;
44
+ private session;
45
+ private agent;
46
+ private audioContext;
47
+ private processor;
48
+ private stream;
49
+ private abortController;
50
+ private _status;
51
+ private _agentState;
52
+ private _isMuted;
53
+ private mounted;
54
+ constructor(options: PersonaEmbedOptions);
55
+ get status(): EmbedStatus;
56
+ get agentState(): AgentState;
57
+ get isMuted(): boolean;
58
+ get videoElement(): HTMLVideoElement;
59
+ get audioElement(): HTMLAudioElement;
60
+ /** Connect to the embed session */
61
+ connect(): Promise<void>;
62
+ /** Disconnect and cleanup */
63
+ disconnect(): void;
64
+ /** Toggle microphone mute */
65
+ toggleMute(): void;
66
+ private setStatus;
67
+ private setAgentState;
68
+ private fetchSession;
69
+ private initSession;
70
+ private initMicrophone;
71
+ private connectAgent;
72
+ private cleanup;
73
+ }
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Audio utilities for agent implementations.
3
+ *
4
+ * These utilities help with PCM audio processing for voice AI integrations.
5
+ */
6
+ /** Sample rate for audio sent to Persona (matches Gemini output) */
7
+ export declare const SAMPLE_RATE = 24000;
8
+ /**
9
+ * Convert base64-encoded audio to Uint8Array.
10
+ */
11
+ export declare function base64ToBytes(base64: string): Uint8Array;
12
+ /**
13
+ * Convert Uint8Array to base64 string.
14
+ */
15
+ export declare function bytesToBase64(bytes: Uint8Array): string;
16
+ /**
17
+ * Resample PCM audio from one sample rate to another.
18
+ * Simple linear interpolation - not high quality but sufficient for real-time.
19
+ */
20
+ export declare function resamplePcm(input: Uint8Array, fromRate: number, toRate: number): Uint8Array;
21
+ /**
22
+ * Create a simple typed event emitter.
23
+ */
24
+ export declare function createEventEmitter<T extends Record<string, any>>(): {
25
+ on<K extends keyof T>(event: K, handler: (data: T[K]) => void): void;
26
+ off<K extends keyof T>(event: K, handler: (data: T[K]) => void): void;
27
+ emit<K extends keyof T>(event: K, data: T[K]): void;
28
+ removeAllListeners(): void;
29
+ };
30
+ export declare function floatTo16BitPCM(float32: Float32Array): Uint8Array;
@@ -0,0 +1,72 @@
1
+ import { Agent, AgentConfig, AgentEventMap, AgentState } from './types';
2
+ /** Default input sample rate (16kHz mono PCM) */
3
+ export declare const DEFAULT_INPUT_SAMPLE_RATE = 16000;
4
+ /**
5
+ * Abstract base class for voice AI agents.
6
+ *
7
+ * Provides:
8
+ * - Event emitter with typed events
9
+ * - State management with automatic stateChange emission
10
+ * - WebSocket message handling with JSON parsing
11
+ * - Standard lifecycle methods (on, off, close)
12
+ */
13
+ export declare abstract class BaseAgent implements Agent {
14
+ protected ws: WebSocket | null;
15
+ protected _state: AgentState;
16
+ protected events: {
17
+ on<K extends keyof AgentEventMap>(event: K, handler: (data: AgentEventMap[K]) => void): void;
18
+ off<K extends keyof AgentEventMap>(event: K, handler: (data: AgentEventMap[K]) => void): void;
19
+ emit<K extends keyof AgentEventMap>(event: K, data: AgentEventMap[K]): void;
20
+ removeAllListeners(): void;
21
+ };
22
+ protected inputSampleRate: number;
23
+ /** Agent name for logging (override in subclass) */
24
+ protected abstract readonly agentName: string;
25
+ /** Current agent state */
26
+ get state(): AgentState;
27
+ /**
28
+ * Update state and emit stateChange event.
29
+ */
30
+ protected setState(state: AgentState): void;
31
+ /**
32
+ * Connect to the agent service.
33
+ * Must be implemented by subclasses.
34
+ */
35
+ abstract connect(config: AgentConfig): Promise<void>;
36
+ /**
37
+ * Send audio input to the agent.
38
+ * Must be implemented by subclasses.
39
+ */
40
+ abstract sendAudio(pcmData: Uint8Array): void;
41
+ /**
42
+ * Handle parsed JSON message from WebSocket.
43
+ * Must be implemented by subclasses.
44
+ */
45
+ protected abstract handleParsedMessage(message: unknown): void;
46
+ /**
47
+ * Handle WebSocket message (string or Blob).
48
+ * Converts to string and parses JSON before calling handleParsedMessage.
49
+ */
50
+ protected handleMessage(data: string | Blob): void;
51
+ /**
52
+ * Parse JSON and call handleParsedMessage if valid.
53
+ */
54
+ private parseAndHandle;
55
+ /**
56
+ * Close the WebSocket connection and clean up resources.
57
+ * Subclasses can override to add custom cleanup, but should call super.close().
58
+ */
59
+ close(): void;
60
+ /**
61
+ * Register an event handler.
62
+ */
63
+ on<K extends keyof AgentEventMap>(event: K, handler: (data: AgentEventMap[K]) => void): void;
64
+ /**
65
+ * Remove an event handler.
66
+ */
67
+ off<K extends keyof AgentEventMap>(event: K, handler: (data: AgentEventMap[K]) => void): void;
68
+ /**
69
+ * Helper to emit the closed event with code and reason.
70
+ */
71
+ protected emitClosed(code?: number, reason?: string): void;
72
+ }
@@ -0,0 +1,32 @@
1
+ import { AgentConfig } from './types';
2
+ import { BaseAgent } from './base';
3
+ /** Cartesia specific configuration */
4
+ export interface CartesiaConfig extends AgentConfig {
5
+ /** Agent ID from Cartesia dashboard */
6
+ agentId: string;
7
+ }
8
+ /**
9
+ * Cartesia Agent implementation.
10
+ */
11
+ export declare class CartesiaAgent extends BaseAgent {
12
+ protected readonly agentName = "Cartesia";
13
+ private cartesiaInputFormat;
14
+ private cartesiaOutputRate;
15
+ private streamId;
16
+ private isReady;
17
+ private pingInterval;
18
+ connect(config: CartesiaConfig): Promise<void>;
19
+ private sendStartEvent;
20
+ /**
21
+ * Keep connection alive with periodic custom events.
22
+ * Cartesia requires activity every 30s.
23
+ */
24
+ private startHeartbeat;
25
+ private stopHeartbeat;
26
+ protected handleParsedMessage(message: unknown): void;
27
+ private handleAck;
28
+ private handleMediaOutput;
29
+ private handleClear;
30
+ sendAudio(pcmData: Uint8Array): void;
31
+ close(): void;
32
+ }
@@ -0,0 +1,41 @@
1
+ import { AgentConfig } from './types';
2
+ import { BaseAgent } from './base';
3
+ /** ElevenLabs specific configuration */
4
+ export interface ElevenLabsConfig extends AgentConfig {
5
+ /** Agent ID from ElevenLabs dashboard */
6
+ agentId: string;
7
+ /** Signed URL for authentication (alternative to API key) */
8
+ signedUrl?: string;
9
+ }
10
+ /**
11
+ * ElevenLabs Conversational AI agent implementation.
12
+ *
13
+ * Handles WebSocket connection to ElevenLabs and converts
14
+ * audio responses to events that Persona SDK can consume.
15
+ */
16
+ export declare class ElevenLabsAgent extends BaseAgent {
17
+ protected readonly agentName = "ElevenLabs";
18
+ private outputSampleRate;
19
+ private expectedInputSampleRate;
20
+ private sourceInputSampleRate;
21
+ private initialized;
22
+ private lastInterruptId;
23
+ connect(config: ElevenLabsConfig): Promise<void>;
24
+ protected handleParsedMessage(message: unknown): void;
25
+ private handleInitMetadata;
26
+ private handlePing;
27
+ private handleAudio;
28
+ private handleUserTranscript;
29
+ private handleAgentResponse;
30
+ private handleInterruption;
31
+ sendAudio(pcmData: Uint8Array): void;
32
+ /**
33
+ * Send a text message as if the user spoke it.
34
+ */
35
+ sendText(text: string): void;
36
+ /**
37
+ * Send contextual information to the agent without interrupting.
38
+ */
39
+ sendContext(text: string): void;
40
+ close(): void;
41
+ }
@@ -0,0 +1,25 @@
1
+ import { AgentConfig } from './types';
2
+ import { BaseAgent } from './base';
3
+ /** Gemini Live specific configuration */
4
+ export interface GeminiLiveConfig extends AgentConfig {
5
+ /** Model to use (defaults to gemini-2.5-flash-native-audio-preview) */
6
+ model?: string;
7
+ /**
8
+ * Auth mechanism for websocket connection.
9
+ * - `api_key`: use `?key=<apiKey>`
10
+ * - `ephemeral_token`: use `?access_token=<apiKey>` where apiKey is the ephemeral token value
11
+ */
12
+ authType?: 'api_key' | 'ephemeral_token';
13
+ }
14
+ /**
15
+ * Gemini Live agent implementation.
16
+ *
17
+ * Handles WebSocket connection to Gemini Live API and converts
18
+ * audio responses to events that Persona SDK can consume.
19
+ */
20
+ export declare class GeminiLiveAgent extends BaseAgent {
21
+ protected readonly agentName = "GeminiLive";
22
+ connect(config: GeminiLiveConfig): Promise<void>;
23
+ protected handleParsedMessage(message: unknown): void;
24
+ sendAudio(pcmData: Uint8Array): void;
25
+ }
@@ -0,0 +1,50 @@
1
+ import { GeminiLiveAgent, GeminiLiveConfig } from './gemini-live';
2
+ import { ElevenLabsAgent, ElevenLabsConfig } from './elevenlabs';
3
+ import { CartesiaAgent, CartesiaConfig } from './cartesia';
4
+ /**
5
+ * Agent implementations for voice AI platforms.
6
+ *
7
+ * These are reference implementations demonstrating how to connect
8
+ * various voice AI backends to the Persona SDK.
9
+ */
10
+ export { BaseAgent, DEFAULT_INPUT_SAMPLE_RATE } from './base';
11
+ export type { Agent, AgentConfig, AgentEventMap, AgentState } from './types';
12
+ export { GeminiLiveAgent, type GeminiLiveConfig };
13
+ export { ElevenLabsAgent, type ElevenLabsConfig };
14
+ export { CartesiaAgent, type CartesiaConfig };
15
+ export { SAMPLE_RATE, base64ToBytes, bytesToBase64, resamplePcm, createEventEmitter, floatTo16BitPCM } from './audio-utils';
16
+ /** Supported agent types */
17
+ export type AgentType = 'gemini' | 'elevenlabs' | 'cartesia';
18
+ /** Agent type metadata */
19
+ export interface AgentTypeInfo {
20
+ id: AgentType;
21
+ name: string;
22
+ description: string;
23
+ }
24
+ /** Registry of available agents */
25
+ export declare const AGENT_REGISTRY: AgentTypeInfo[];
26
+ /** Configuration types by agent type */
27
+ export interface AgentConfigMap {
28
+ gemini: GeminiLiveConfig;
29
+ elevenlabs: ElevenLabsConfig;
30
+ cartesia: CartesiaConfig;
31
+ }
32
+ /** Union type of all agent instances */
33
+ export type AnyAgent = GeminiLiveAgent | ElevenLabsAgent | CartesiaAgent;
34
+ /**
35
+ * Create an agent instance by type.
36
+ *
37
+ * @example
38
+ * ```ts
39
+ * const agent = createAgent('gemini');
40
+ * await agent.connect({ apiKey: 'YOUR_KEY' });
41
+ * ```
42
+ */
43
+ export declare function createAgent(type: 'gemini'): GeminiLiveAgent;
44
+ export declare function createAgent(type: 'elevenlabs'): ElevenLabsAgent;
45
+ export declare function createAgent(type: 'cartesia'): CartesiaAgent;
46
+ export declare function createAgent(type: AgentType): AnyAgent;
47
+ /**
48
+ * Get agent type metadata by ID.
49
+ */
50
+ export declare function getAgentInfo(type: AgentType): AgentTypeInfo | undefined;
@@ -0,0 +1,76 @@
1
+ /**
2
+ * Agent interface for voice AI backends.
3
+ *
4
+ * Agents handle the conversation logic (STT, LLM, TTS) and emit audio
5
+ * that gets sent to the Persona avatar for video synthesis.
6
+ *
7
+ * This is a reference implementation - copy and adapt for your own use.
8
+ */
9
+ /** Agent state for UI */
10
+ export type AgentState = 'idle' | 'listening' | 'thinking' | 'speaking';
11
+ /** Agent configuration */
12
+ export interface AgentConfig {
13
+ /** System prompt for the agent */
14
+ systemPrompt?: string;
15
+ /** Voice ID (agent-specific) */
16
+ voice?: string;
17
+ /** API key for the agent service */
18
+ apiKey?: string;
19
+ /** Sample rate of audio being sent via sendAudio (from microphone). Default: 16000 */
20
+ inputSampleRate?: number;
21
+ }
22
+ /** Events emitted by agents */
23
+ export interface AgentEventMap {
24
+ /** Audio data from agent (PCM bytes) */
25
+ audio: Uint8Array;
26
+ /** Agent started speaking */
27
+ turnStart: void;
28
+ /** Agent finished speaking */
29
+ turnEnd: void;
30
+ /** Agent was interrupted */
31
+ interrupted: void;
32
+ /** Agent state changed */
33
+ stateChange: AgentState;
34
+ /** Transcript update */
35
+ transcript: {
36
+ role: 'user' | 'assistant';
37
+ text: string;
38
+ isFinal: boolean;
39
+ };
40
+ /** Agent connection closed (unexpected disconnect) */
41
+ closed: {
42
+ code?: number;
43
+ reason?: string;
44
+ };
45
+ }
46
+ /**
47
+ * Abstract agent interface.
48
+ *
49
+ * Implement this for each voice AI platform (Gemini, ElevenLabs, Cartesia, etc.)
50
+ */
51
+ export interface Agent {
52
+ /** Current agent state */
53
+ readonly state: AgentState;
54
+ /**
55
+ * Connect to the agent service.
56
+ * @param config - Agent configuration
57
+ */
58
+ connect(config: AgentConfig): Promise<void>;
59
+ /**
60
+ * Send audio input to the agent (from microphone).
61
+ * @param pcmData - 16-bit PCM audio at 16kHz
62
+ */
63
+ sendAudio(pcmData: Uint8Array): void;
64
+ /**
65
+ * Disconnect and clean up.
66
+ */
67
+ close(): void;
68
+ /**
69
+ * Register an event handler.
70
+ */
71
+ on<K extends keyof AgentEventMap>(event: K, handler: (data: AgentEventMap[K]) => void): void;
72
+ /**
73
+ * Remove an event handler.
74
+ */
75
+ off<K extends keyof AgentEventMap>(event: K, handler: (data: AgentEventMap[K]) => void): void;
76
+ }
@@ -0,0 +1,5 @@
1
+ export { PersonaEmbed } from './PersonaEmbed';
2
+ export type { PersonaEmbedOptions, EmbedStatus, VideoFit } from './PersonaEmbed';
3
+ export { createAgent, GeminiLiveAgent, ElevenLabsAgent, CartesiaAgent, BaseAgent, AGENT_REGISTRY, getAgentInfo, } from './agents';
4
+ export type { AgentType, AgentState, AgentConfig, AgentEventMap, Agent, AnyAgent, AgentTypeInfo, GeminiLiveConfig, ElevenLabsConfig, CartesiaConfig, } from './agents';
5
+ export { floatTo16BitPCM, resamplePcm, base64ToBytes, bytesToBase64, SAMPLE_RATE, createEventEmitter, } from './agents';
package/dist/index.js ADDED
@@ -0,0 +1 @@
1
+ "use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const A=require("@keyframelabs/sdk"),h=24e3;function c(i){const e=atob(i),t=new Uint8Array(e.length);for(let s=0;s<e.length;s++)t[s]=e.charCodeAt(s);return t}function u(i){let e="";for(let t=0;t<i.length;t++)e+=String.fromCharCode(i[t]);return btoa(e)}function d(i,e,t){if(e===t)return i;const s=new Int16Array(i.buffer,i.byteOffset,i.length/2),n=e/t,a=Math.floor(s.length/n),o=new Int16Array(a);for(let r=0;r<a;r++){const g=r*n,m=Math.floor(g),E=Math.min(m+1,s.length-1),_=g-m;o[r]=Math.round(s[m]*(1-_)+s[E]*_)}return new Uint8Array(o.buffer)}function S(){const i=new Map;return{on(e,t){i.has(e)||i.set(e,new Set),i.get(e).add(t)},off(e,t){i.get(e)?.delete(t)},emit(e,t){i.get(e)?.forEach(s=>s(t))},removeAllListeners(){i.clear()}}}function v(i){const e=new Int16Array(i.length);for(let t=0;t<i.length;t++){const s=Math.max(-1,Math.min(1,i[t]));e[t]=s<0?s*32768:s*32767}return new Uint8Array(e.buffer)}const C=16e3;class p{ws=null;_state="idle";events=S();inputSampleRate=C;get state(){return this._state}setState(e){this._state!==e&&(this._state=e,this.events.emit("stateChange",e))}handleMessage(e){if(e instanceof Blob){e.text().then(t=>this.parseAndHandle(t));return}this.parseAndHandle(e)}parseAndHandle(e){try{const t=JSON.parse(e);this.handleParsedMessage(t)}catch{console.warn(`[${this.agentName}] Failed to parse message:`,e.slice(0,200))}}close(){this.ws&&(this.ws.close(),this.ws=null),this.events.removeAllListeners(),this.setState("idle")}on(e,t){this.events.on(e,t)}off(e,t){this.events.off(e,t)}emitClosed(e,t){this.events.emit("closed",{code:e,reason:t})}}const k="gemini-2.5-flash-native-audio-preview-12-2025",R="wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent",M="wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained";class f extends 
p{agentName="GeminiLive";async connect(e){if(this.ws)throw new Error("Already connected");if(!e.apiKey)throw new Error("Gemini API key is required");e.inputSampleRate&&(this.inputSampleRate=e.inputSampleRate);const t=e.model??k,n=(e.authType??"api_key")==="ephemeral_token"?`${M}?access_token=${encodeURIComponent(e.apiKey)}`:`${R}?key=${encodeURIComponent(e.apiKey)}`;return new Promise((a,o)=>{this.ws=new WebSocket(n),this.ws.onopen=()=>{const r={setup:{model:`models/${t}`,generationConfig:{responseModalities:["AUDIO"]},systemInstruction:e.systemPrompt?{parts:[{text:e.systemPrompt}]}:void 0}};this.ws.send(JSON.stringify(r)),this.setState("listening"),a()},this.ws.onerror=()=>{o(new Error("Failed to connect to Gemini Live"))},this.ws.onclose=r=>{this.ws=null,this.setState("idle"),this.emitClosed(r.code,r.reason)},this.ws.onmessage=r=>{this.handleMessage(r.data)}})}handleParsedMessage(e){const s=e.serverContent;if(s){if(s.interrupted){this.events.emit("interrupted",void 0),this.setState("listening");return}if(s.turnComplete){this.events.emit("turnEnd",void 0),this.setState("listening");return}if(s.modelTurn?.parts){this._state!=="speaking"&&(this.events.emit("turnStart",void 0),this.setState("speaking"));for(const n of s.modelTurn.parts){if(n.inlineData?.data){const a=c(n.inlineData.data);this.events.emit("audio",a)}n.text&&this.events.emit("transcript",{role:"assistant",text:n.text,isFinal:!0})}}}}sendAudio(e){if(!this.ws||this.ws.readyState!==WebSocket.OPEN){console.warn("[GeminiLive] Cannot send audio: not connected");return}const t={realtimeInput:{mediaChunks:[{mimeType:`audio/pcm;rate=${this.inputSampleRate}`,data:u(e)}]}};this.ws.send(JSON.stringify(t))}}const P="wss://api.elevenlabs.io/v1/convai/conversation";class w extends p{agentName="ElevenLabs";outputSampleRate=24e3;expectedInputSampleRate=16e3;sourceInputSampleRate=16e3;initialized=!1;lastInterruptId=0;async connect(e){if(this.ws)throw new Error("Already connected");if(!e.agentId&&!e.signedUrl)throw new 
Error("ElevenLabs agent ID or signed URL is required");e.inputSampleRate&&(this.sourceInputSampleRate=e.inputSampleRate);let t;return e.signedUrl?t=e.signedUrl:(t=`${P}?agent_id=${e.agentId}`,e.apiKey&&(t+=`&xi-api-key=${e.apiKey}`)),new Promise((s,n)=>{this.ws=new WebSocket(t),this.ws.onopen=()=>{this.setState("listening"),s()},this.ws.onerror=()=>{n(new Error("Failed to connect to ElevenLabs"))},this.ws.onclose=a=>{this.ws=null,this.setState("idle"),this.emitClosed(a.code,a.reason)},this.ws.onmessage=a=>{this.handleMessage(a.data)}})}handleParsedMessage(e){const t=e;switch(t.type){case"conversation_initiation_metadata":this.handleInitMetadata(t);break;case"ping":this.handlePing(t);break;case"audio":this.handleAudio(t);break;case"user_transcript":this.handleUserTranscript(t);break;case"agent_response":this.handleAgentResponse(t);break;case"interruption":this.handleInterruption(t);break;case"agent_response_correction":this.setState("listening");break}}handleInitMetadata(e){const t=e.conversation_initiation_metadata_event;if(t){if(t.agent_output_audio_format){const s=t.agent_output_audio_format.match(/pcm_(\d+)/);s&&(this.outputSampleRate=parseInt(s[1],10))}if(t.user_input_audio_format){const s=t.user_input_audio_format.match(/pcm_(\d+)/);s&&(this.expectedInputSampleRate=parseInt(s[1],10))}this.initialized=!0}}handlePing(e){if(this.ws&&this.ws.readyState===WebSocket.OPEN){const t=e.ping_event?.event_id;this.ws.send(JSON.stringify({type:"pong",event_id:t}))}}handleAudio(e){const t=e.audio_event;if(!t?.audio_base_64||(t.event_id??0)<=this.lastInterruptId)return;this._state!=="speaking"&&(this.events.emit("turnStart",void 0),this.setState("speaking"));let n=c(t.audio_base_64);this.outputSampleRate!==h&&(n=d(n,this.outputSampleRate,h));const a=4800;if(n.length<=a)this.events.emit("audio",n);else for(let o=0;o<n.length;o+=a){const r=n.slice(o,Math.min(o+a,n.length));this.events.emit("audio",r)}}handleUserTranscript(e){const 
t=e.user_transcription_event;t?.user_transcript&&this.events.emit("transcript",{role:"user",text:t.user_transcript,isFinal:!0})}handleAgentResponse(e){const t=e.agent_response_event;t?.agent_response&&this.events.emit("transcript",{role:"assistant",text:t.agent_response,isFinal:!0})}handleInterruption(e){const t=e.interruption_event;t?.event_id&&(this.lastInterruptId=t.event_id),this.events.emit("interrupted",void 0),this.setState("listening")}sendAudio(e){if(!this.ws||this.ws.readyState!==WebSocket.OPEN||!this.initialized)return;let t=e;this.sourceInputSampleRate!==this.expectedInputSampleRate&&(t=d(e,this.sourceInputSampleRate,this.expectedInputSampleRate)),this.ws.send(JSON.stringify({user_audio_chunk:u(t)}))}sendText(e){if(!this.ws||this.ws.readyState!==WebSocket.OPEN){console.warn("[ElevenLabs] Cannot send text: not connected");return}this.ws.send(JSON.stringify({type:"user_message",text:e}))}sendContext(e){if(!this.ws||this.ws.readyState!==WebSocket.OPEN){console.warn("[ElevenLabs] Cannot send context: not connected");return}this.ws.send(JSON.stringify({type:"contextual_update",text:e}))}close(){this.initialized=!1,this.lastInterruptId=0,super.close()}}const T="wss://api.cartesia.ai/agents/stream",N="2025-04-16";class y extends p{agentName="Cartesia";cartesiaInputFormat="pcm_16000";cartesiaOutputRate=16e3;streamId=null;isReady=!1;pingInterval=null;async connect(e){if(this.ws)throw new Error("Already connected");if(!e.agentId)throw new Error("Cartesia Agent ID is required");if(!e.apiKey)throw new Error("Cartesia API Key is required");e.inputSampleRate&&(this.inputSampleRate=e.inputSampleRate),this.inputSampleRate===16e3?this.cartesiaInputFormat="pcm_16000":this.inputSampleRate===24e3?this.cartesiaInputFormat="pcm_24000":this.inputSampleRate===44100?this.cartesiaInputFormat="pcm_44100":this.cartesiaInputFormat="pcm_16000";const t=`${T}/${e.agentId}?api_key=${e.apiKey}&cartesia_version=${N}`;return new Promise((s,n)=>{this.ws=new 
WebSocket(t),this.ws.onopen=()=>{this.sendStartEvent(),this.startHeartbeat(),s()},this.ws.onerror=()=>{n(new Error("Failed to connect to Cartesia"))},this.ws.onclose=a=>{this.stopHeartbeat(),this.ws=null,this.isReady=!1,this.streamId=null,this.setState("idle"),this.emitClosed(a.code,a.reason)},this.ws.onmessage=a=>{this.handleMessage(a.data)}})}sendStartEvent(){if(!this.ws)return;const e={event:"start",config:{input_format:this.cartesiaInputFormat}};this.ws.send(JSON.stringify(e))}startHeartbeat(){this.pingInterval=window.setInterval(()=>{this.ws?.readyState===WebSocket.OPEN&&this.streamId&&this.ws.send(JSON.stringify({event:"custom",stream_id:this.streamId,metadata:{keepalive:!0}}))},2e4)}stopHeartbeat(){this.pingInterval&&(clearInterval(this.pingInterval),this.pingInterval=null)}handleParsedMessage(e){const t=e;switch(t.event){case"ack":this.handleAck(t);break;case"media_output":this.handleMediaOutput(t);break;case"clear":this.handleClear();break;case"error":console.error("[Cartesia] Server error:",t);break}}handleAck(e){this.streamId=e.stream_id||null,this.isReady=!0,this.setState("listening")}handleMediaOutput(e){if(!e.media?.payload)return;this._state!=="speaking"&&(this.events.emit("turnStart",void 0),this.setState("speaking"));let t=c(e.media.payload);this.cartesiaOutputRate!==h&&(t=d(t,this.cartesiaOutputRate,h)),this.events.emit("audio",t)}handleClear(){this.events.emit("interrupted",void 0),this.setState("listening")}sendAudio(e){if(!this.ws||this.ws.readyState!==WebSocket.OPEN||!this.isReady||!this.streamId)return;let t=e;const s=parseInt(this.cartesiaInputFormat.split("_")[1]);this.inputSampleRate!==s&&(t=d(e,this.inputSampleRate,s)),this.ws.send(JSON.stringify({event:"media_input",stream_id:this.streamId,media:{payload:u(t)}}))}close(){this.stopHeartbeat(),this.isReady=!1,this.streamId=null,super.close()}}const b=[{id:"gemini",name:"Gemini Live",description:"Google Gemini Live API"},{id:"elevenlabs",name:"ElevenLabs",description:"ElevenLabs 
Conversational AI"},{id:"cartesia",name:"Cartesia",description:"Cartesia Agents API"}];function I(i){switch(i){case"gemini":return new f;case"elevenlabs":return new w;case"cartesia":return new y;default:throw new Error(`Unknown agent type: ${i}`)}}function O(i){return b.find(e=>e.id===i)}const l=new Set;class L{apiBaseUrl;publishableKey;callbacks;_video;_audio;session=null;agent=null;audioContext=null;processor=null;stream=null;abortController=null;_status="disconnected";_agentState="idle";_isMuted=!1;mounted=!0;constructor(e){this.apiBaseUrl=e.apiBaseUrl??"https://api.keyframelabs.com",this.publishableKey=e.publishableKey,this.callbacks={onDisconnect:e.onDisconnect,onError:e.onError,onStateChange:e.onStateChange,onAgentStateChange:e.onAgentStateChange},this._video=document.createElement("video"),this._video.style.position="absolute",this._video.style.inset="0",this._video.style.width="100%",this._video.style.height="100%",this._video.style.objectFit=e.videoFit??"cover",this._video.autoplay=!0,this._video.playsInline=!0,this._video.muted=!0,e.container.style.position="relative",e.container.style.backgroundColor="#000",this._audio=document.createElement("audio"),this._audio.autoplay=!0,e.container.appendChild(this._video),e.container.appendChild(this._audio)}get status(){return this._status}get agentState(){return this._agentState}get isMuted(){return this._isMuted}get videoElement(){return this._video}get audioElement(){return this._audio}async connect(){if(l.has(this.publishableKey)){console.log("[PersonaEmbed] Connection already in progress, skipping");return}l.add(this.publishableKey),this.mounted=!0,this.abortController=new AbortController,this.setStatus("connecting");try{const e=await this.fetchSession(this.abortController.signal);if(!this.mounted){l.delete(this.publishableKey);return}if(await this.initSession(e),await this.initMicrophone(),await 
this.connectAgent(e.voice_agent_details),!this.mounted){this.cleanup(),l.delete(this.publishableKey);return}this.setStatus("connected")}catch(e){if(l.delete(this.publishableKey),e instanceof Error&&e.name==="AbortError")return;console.error("[PersonaEmbed]",e),this.mounted&&(this.setStatus("error"),this.callbacks.onError?.(e))}}disconnect(){this.mounted=!1,this.abortController?.abort(),this.abortController=null,l.delete(this.publishableKey),this.cleanup(),this.setStatus("disconnected")}toggleMute(){this._isMuted=!this._isMuted}setStatus(e){this._status!==e&&(this._status=e,this.callbacks.onStateChange?.(e))}setAgentState(e){this._agentState!==e&&(this._agentState=e,this.callbacks.onAgentStateChange?.(e))}async fetchSession(e){const t=await fetch(`${this.apiBaseUrl}/v1/embed/create_session`,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({publishable_key:this.publishableKey}),signal:e});if(!t.ok){const s=await t.json().catch(()=>null);throw new Error(`create_session failed: ${t.status} ${JSON.stringify(s)}`)}return t.json()}async initSession(e){this.session=A.createClient({serverUrl:e.session_details.server_url,participantToken:e.session_details.participant_token,agentIdentity:e.session_details.agent_identity,onVideoTrack:t=>{console.log("[PersonaEmbed] Setting video track",t.readyState,t.enabled),this._video.srcObject=new MediaStream([t]),this._video.play().catch(s=>console.warn("[PersonaEmbed] Video play failed:",s))},onAudioTrack:t=>{this._audio.srcObject=new 
MediaStream([t]),this._audio.play().catch(()=>{})},onStateChange:t=>{this.mounted&&t==="disconnected"&&(this.setStatus("disconnected"),this.callbacks.onDisconnect?.())},onError:t=>{this.mounted&&this.callbacks.onError?.(t)},onClose:()=>{this.mounted&&this.callbacks.onDisconnect?.()}}),this.agent=I(e.voice_agent_details.type),this.agent.on("audio",t=>this.session?.sendAudio(t)),this.agent.on("interrupted",()=>this.session?.interrupt()),this.agent.on("stateChange",t=>this.setAgentState(t)),this.agent.on("closed",()=>{this.mounted&&this.callbacks.onDisconnect?.()}),await this.session.connect()}async initMicrophone(){this.stream=await navigator.mediaDevices.getUserMedia({audio:{sampleRate:16e3,echoCancellation:!0,noiseSuppression:!0}}),this.audioContext=new AudioContext({sampleRate:16e3});const e=this.audioContext.createMediaStreamSource(this.stream);this.processor=this.audioContext.createScriptProcessor(4096,1,1),this.processor.onaudioprocess=t=>{if(!this._isMuted){const s=v(t.inputBuffer.getChannelData(0));this.agent?.sendAudio(s)}},e.connect(this.processor),this.processor.connect(this.audioContext.destination)}async connectAgent(e){if(!this.agent)return;const t={inputSampleRate:16e3};e.type==="gemini"?await this.agent.connect({...t,apiKey:e.token,authType:"ephemeral_token"}):e.type==="elevenlabs"?await this.agent.connect({...t,agentId:e.agent_id,signedUrl:e.signed_url}):e.type==="cartesia"&&await 
this.agent.connect({...t,agentId:e.agent_id,apiKey:e.token})}cleanup(){this.stream?.getTracks().forEach(e=>e.stop()),this.processor?.disconnect(),this.audioContext?.close(),this.agent?.close(),this.session?.close(),this.stream=null,this.processor=null,this.audioContext=null,this.agent=null,this.session=null}}exports.AGENT_REGISTRY=b;exports.BaseAgent=p;exports.CartesiaAgent=y;exports.ElevenLabsAgent=w;exports.GeminiLiveAgent=f;exports.PersonaEmbed=L;exports.SAMPLE_RATE=h;exports.base64ToBytes=c;exports.bytesToBase64=u;exports.createAgent=I;exports.createEventEmitter=S;exports.floatTo16BitPCM=v;exports.getAgentInfo=O;exports.resamplePcm=d;
package/dist/index.mjs ADDED
@@ -0,0 +1,636 @@
1
+ import { createClient as v } from "@keyframelabs/sdk";
2
// Canonical output PCM sample rate (24 kHz): agent audio is resampled to this
// before being emitted. Exported publicly as SAMPLE_RATE.
const h = 24e3;
3
/**
 * Decode a base64 string into its raw bytes.
 * @param {string} i base64-encoded payload
 * @returns {Uint8Array} decoded bytes
 */
function u(i) {
  const decoded = atob(i);
  return Uint8Array.from(decoded, (ch) => ch.charCodeAt(0));
}
9
/**
 * Encode raw bytes as a base64 string.
 * Builds the binary string via an array join rather than repeated string
 * concatenation; works for arbitrarily large inputs (no spread-arg limit).
 * @param {Uint8Array} i bytes to encode
 * @returns {string} base64 representation
 */
function p(i) {
  const chars = [];
  for (const byte of i) {
    chars.push(String.fromCharCode(byte));
  }
  return btoa(chars.join(""));
}
15
/**
 * Resample 16-bit little-endian PCM between sample rates using linear
 * interpolation. Returns the input untouched when the rates already match.
 * @param {Uint8Array} i raw PCM16 bytes (even length)
 * @param {number} e source sample rate in Hz
 * @param {number} t target sample rate in Hz
 * @returns {Uint8Array} resampled PCM16 bytes
 */
function d(i, e, t) {
  if (e === t) {
    return i;
  }
  const samples = new Int16Array(i.buffer, i.byteOffset, i.length / 2);
  const step = e / t;
  const outLength = Math.floor(samples.length / step);
  const out = new Int16Array(outLength);
  for (let idx = 0; idx < outLength; idx++) {
    const pos = idx * step;
    const lo = Math.floor(pos);
    // Clamp the right-hand neighbor so the final sample never reads past the end.
    const hi = Math.min(lo + 1, samples.length - 1);
    const frac = pos - lo;
    out[idx] = Math.round(samples[lo] * (1 - frac) + samples[hi] * frac);
  }
  return new Uint8Array(out.buffer);
}
27
/**
 * Minimal event emitter backed by a Map of listener Sets.
 * @returns an object exposing on/off/emit/removeAllListeners
 */
function f() {
  const listeners = new Map();
  const on = (event, handler) => {
    let set = listeners.get(event);
    if (!set) {
      set = new Set();
      listeners.set(event, set);
    }
    set.add(handler);
  };
  const off = (event, handler) => {
    listeners.get(event)?.delete(handler);
  };
  const emit = (event, payload) => {
    const set = listeners.get(event);
    if (set) {
      for (const handler of set) {
        handler(payload);
      }
    }
  };
  const removeAllListeners = () => {
    listeners.clear();
  };
  return { on, off, emit, removeAllListeners };
}
44
/**
 * Convert float audio samples in [-1, 1] to 16-bit PCM bytes.
 * Out-of-range values are clamped; the two halves are scaled asymmetrically
 * so -1 maps to -32768 and +1 maps to 32767.
 * @param {Float32Array|number[]} i float samples
 * @returns {Uint8Array} PCM16 bytes
 */
function w(i) {
  const pcm = new Int16Array(i.length);
  for (let idx = 0; idx < i.length; idx++) {
    let sample = i[idx];
    if (sample > 1) {
      sample = 1;
    } else if (sample < -1) {
      sample = -1;
    }
    pcm[idx] = sample < 0 ? sample * 32768 : sample * 32767;
  }
  return new Uint8Array(pcm.buffer);
}
52
// Default agent input (microphone) sample rate in Hz; initial value for
// BaseAgent.inputSampleRate, overridable via connect({ inputSampleRate }).
const y = 16e3;
53
/**
 * Shared base class for voice-agent WebSocket clients.
 *
 * Owns the socket handle, the agent state machine, and an internal event
 * emitter. Subclasses set `agentName` (used as a log prefix) and implement
 * connect()/sendAudio()/handleParsedMessage().
 */
class m {
  ws = null;
  _state = "idle";
  events = f();
  inputSampleRate = y;
  /** Current agent state */
  get state() {
    return this._state;
  }
  /**
   * Transition the state machine; notifies "stateChange" listeners only when
   * the state actually changes.
   */
  setState(e) {
    if (this._state === e) {
      return;
    }
    this._state = e;
    this.events.emit("stateChange", e);
  }
  /**
   * Handle a raw WebSocket payload (string or Blob): normalize to text and
   * hand off to parseAndHandle.
   */
  handleMessage(e) {
    if (e instanceof Blob) {
      e.text().then((t) => this.parseAndHandle(t));
    } else {
      this.parseAndHandle(e);
    }
  }
  /**
   * Parse JSON and dispatch to handleParsedMessage. Failures (malformed JSON
   * or a handler throw) are logged with a truncated payload and swallowed.
   */
  parseAndHandle(e) {
    try {
      this.handleParsedMessage(JSON.parse(e));
    } catch {
      console.warn(`[${this.agentName}] Failed to parse message:`, e.slice(0, 200));
    }
  }
  /**
   * Close the WebSocket connection and clean up resources.
   * Subclasses can override to add custom cleanup, but should call super.close().
   */
  close() {
    if (this.ws) {
      this.ws.close();
      this.ws = null;
    }
    this.events.removeAllListeners();
    this.setState("idle");
  }
  /** Register an event handler. */
  on(e, t) {
    this.events.on(e, t);
  }
  /** Remove an event handler. */
  off(e, t) {
    this.events.off(e, t);
  }
  /** Emit the "closed" event carrying the socket close code and reason. */
  emitClosed(e, t) {
    this.events.emit("closed", { code: e, reason: t });
  }
}
116
// Default Gemini Live model id, plus the standard (API-key) and constrained
// (ephemeral-token) BidiGenerateContent WebSocket endpoints.
const b = "gemini-2.5-flash-native-audio-preview-12-2025", I = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent", E = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained";
117
/**
 * Voice agent backed by the Gemini Live BidiGenerateContent WebSocket API.
 * Streams PCM microphone audio upward and emits "audio", "transcript",
 * "turnStart"/"turnEnd" and "interrupted" events from server content.
 */
class C extends m {
  agentName = "GeminiLive";
  /**
   * Open the Live WebSocket and send the initial setup message.
   * Resolves once the socket is open and setup has been sent.
   * @param e apiKey (required); authType "api_key" | "ephemeral_token";
   *   optional model, systemPrompt, inputSampleRate
   * @throws if already connected or apiKey is missing
   */
  async connect(e) {
    if (this.ws) {
      throw new Error("Already connected");
    }
    if (!e.apiKey) {
      throw new Error("Gemini API key is required");
    }
    if (e.inputSampleRate) {
      this.inputSampleRate = e.inputSampleRate;
    }
    const model = e.model ?? b;
    // Ephemeral tokens use the constrained endpoint; plain API keys use the
    // standard one.
    const url = (e.authType ?? "api_key") === "ephemeral_token"
      ? `${E}?access_token=${encodeURIComponent(e.apiKey)}`
      : `${I}?key=${encodeURIComponent(e.apiKey)}`;
    return new Promise((resolve, reject) => {
      this.ws = new WebSocket(url);
      this.ws.onopen = () => {
        const setup = {
          setup: {
            model: `models/${model}`,
            generationConfig: {
              responseModalities: ["AUDIO"]
            },
            systemInstruction: e.systemPrompt ? { parts: [{ text: e.systemPrompt }] } : void 0
          }
        };
        this.ws.send(JSON.stringify(setup));
        this.setState("listening");
        resolve();
      };
      this.ws.onerror = () => {
        reject(new Error("Failed to connect to Gemini Live"));
      };
      this.ws.onclose = (ev) => {
        this.ws = null;
        this.setState("idle");
        this.emitClosed(ev.code, ev.reason);
      };
      this.ws.onmessage = (ev) => {
        this.handleMessage(ev.data);
      };
    });
  }
  /**
   * Route a parsed server message: interruptions and turn completions reset
   * to "listening"; model parts yield decoded audio and final transcripts.
   */
  handleParsedMessage(e) {
    const content = e.serverContent;
    if (!content) {
      return;
    }
    if (content.interrupted) {
      this.events.emit("interrupted", void 0);
      this.setState("listening");
      return;
    }
    if (content.turnComplete) {
      this.events.emit("turnEnd", void 0);
      this.setState("listening");
      return;
    }
    if (content.modelTurn?.parts) {
      if (this._state !== "speaking") {
        this.events.emit("turnStart", void 0);
        this.setState("speaking");
      }
      for (const part of content.modelTurn.parts) {
        if (part.inlineData?.data) {
          this.events.emit("audio", u(part.inlineData.data));
        }
        if (part.text) {
          this.events.emit("transcript", {
            role: "assistant",
            text: part.text,
            isFinal: true
          });
        }
      }
    }
  }
  /**
   * Forward a PCM chunk as a realtimeInput message, tagged with the caller's
   * input sample rate. Dropped with a warning when the socket is not open.
   */
  sendAudio(e) {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
      console.warn("[GeminiLive] Cannot send audio: not connected");
      return;
    }
    const message = {
      realtimeInput: {
        mediaChunks: [
          {
            mimeType: `audio/pcm;rate=${this.inputSampleRate}`,
            data: p(e)
          }
        ]
      }
    };
    this.ws.send(JSON.stringify(message));
  }
}
192
// ElevenLabs Conversational AI WebSocket endpoint (used when no signed URL
// is supplied).
const A = "wss://api.elevenlabs.io/v1/convai/conversation";
193
/**
 * Voice agent backed by ElevenLabs Conversational AI over WebSocket.
 *
 * Holds outgoing audio until conversation_initiation_metadata arrives (which
 * pins the negotiated input/output PCM rates), answers server pings, filters
 * audio that predates the last interruption, and resamples both directions
 * as needed (output is normalized to the shared 24kHz rate `h`).
 */
class k extends m {
  agentName = "ElevenLabs";
  outputSampleRate = 24e3;
  // Default, updated from metadata
  expectedInputSampleRate = 16e3;
  // What ElevenLabs expects, updated from metadata
  sourceInputSampleRate = 16e3;
  // What caller sends via sendAudio, from config
  initialized = !1;
  // True after conversation_initiation_metadata received
  lastInterruptId = 0;
  // Track interruptions to filter stale audio
  /**
   * Connect to the conversation WebSocket. A signed URL takes precedence over
   * agentId + apiKey; when building the URL ourselves, query values are
   * percent-encoded (fix: previously interpolated raw — unlike the Gemini
   * agent — so reserved characters could corrupt the URL).
   * @param e agentId or signedUrl (one required); optional apiKey and
   *   inputSampleRate (rate of PCM the caller passes to sendAudio)
   * @throws if already connected, or neither agentId nor signedUrl is given
   */
  async connect(e) {
    if (this.ws)
      throw new Error("Already connected");
    if (!e.agentId && !e.signedUrl)
      throw new Error("ElevenLabs agent ID or signed URL is required");
    e.inputSampleRate && (this.sourceInputSampleRate = e.inputSampleRate);
    let t;
    if (e.signedUrl) {
      t = e.signedUrl;
    } else {
      t = `${A}?agent_id=${encodeURIComponent(e.agentId)}`;
      if (e.apiKey) t += `&xi-api-key=${encodeURIComponent(e.apiKey)}`;
    }
    return new Promise((s, n) => {
      this.ws = new WebSocket(t), this.ws.onopen = () => {
        this.setState("listening"), s();
      }, this.ws.onerror = () => {
        n(new Error("Failed to connect to ElevenLabs"));
      }, this.ws.onclose = (a) => {
        this.ws = null, this.setState("idle"), this.emitClosed(a.code, a.reason);
      }, this.ws.onmessage = (a) => {
        this.handleMessage(a.data);
      };
    });
  }
  /** Dispatch a parsed server event by its `type` tag. */
  handleParsedMessage(e) {
    const t = e;
    switch (t.type) {
      case "conversation_initiation_metadata":
        this.handleInitMetadata(t);
        break;
      case "ping":
        this.handlePing(t);
        break;
      case "audio":
        this.handleAudio(t);
        break;
      case "user_transcript":
        this.handleUserTranscript(t);
        break;
      case "agent_response":
        this.handleAgentResponse(t);
        break;
      case "interruption":
        this.handleInterruption(t);
        break;
      case "agent_response_correction":
        this.setState("listening");
        break;
    }
  }
  // Parse the negotiated "pcm_<rate>" audio formats from the initiation
  // metadata and mark the connection ready to send audio.
  handleInitMetadata(e) {
    const t = e.conversation_initiation_metadata_event;
    if (t) {
      if (t.agent_output_audio_format) {
        const s = t.agent_output_audio_format.match(/pcm_(\d+)/);
        s && (this.outputSampleRate = parseInt(s[1], 10));
      }
      if (t.user_input_audio_format) {
        const s = t.user_input_audio_format.match(/pcm_(\d+)/);
        s && (this.expectedInputSampleRate = parseInt(s[1], 10));
      }
      this.initialized = !0;
    }
  }
  // Reply to a server ping with a pong echoing its event_id.
  handlePing(e) {
    if (this.ws && this.ws.readyState === WebSocket.OPEN) {
      const t = e.ping_event?.event_id;
      this.ws.send(JSON.stringify({ type: "pong", event_id: t }));
    }
  }
  // Decode agent audio, drop chunks older than the last interruption,
  // resample to the shared 24kHz output rate, then emit in <=4800-byte slices.
  handleAudio(e) {
    const t = e.audio_event;
    if (!t?.audio_base_64 || (t.event_id ?? 0) <= this.lastInterruptId)
      return;
    this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking"));
    let n = u(t.audio_base_64);
    this.outputSampleRate !== h && (n = d(n, this.outputSampleRate, h));
    const a = 4800;
    if (n.length <= a)
      this.events.emit("audio", n);
    else
      for (let o = 0; o < n.length; o += a) {
        const r = n.slice(o, Math.min(o + a, n.length));
        this.events.emit("audio", r);
      }
  }
  // Emit the user's final transcript, when present.
  handleUserTranscript(e) {
    const t = e.user_transcription_event;
    t?.user_transcript && this.events.emit("transcript", {
      role: "user",
      text: t.user_transcript,
      isFinal: !0
    });
  }
  // Emit the agent's final transcript, when present.
  handleAgentResponse(e) {
    const t = e.agent_response_event;
    t?.agent_response && this.events.emit("transcript", {
      role: "assistant",
      text: t.agent_response,
      isFinal: !0
    });
  }
  // Record the interruption id (used by handleAudio to filter stale chunks)
  // and return to listening.
  handleInterruption(e) {
    const t = e.interruption_event;
    t?.event_id && (this.lastInterruptId = t.event_id), this.events.emit("interrupted", void 0), this.setState("listening");
  }
  /**
   * Send caller PCM audio. Silently dropped until the socket is open AND the
   * initiation metadata has arrived; resampled when the caller's rate differs
   * from the negotiated input rate.
   */
  sendAudio(e) {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN || !this.initialized)
      return;
    let t = e;
    this.sourceInputSampleRate !== this.expectedInputSampleRate && (t = d(e, this.sourceInputSampleRate, this.expectedInputSampleRate)), this.ws.send(JSON.stringify({
      user_audio_chunk: p(t)
    }));
  }
  /**
   * Send a text message as if the user spoke it.
   */
  sendText(e) {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
      console.warn("[ElevenLabs] Cannot send text: not connected");
      return;
    }
    this.ws.send(JSON.stringify({
      type: "user_message",
      text: e
    }));
  }
  /**
   * Send contextual information to the agent without interrupting.
   */
  sendContext(e) {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
      console.warn("[ElevenLabs] Cannot send context: not connected");
      return;
    }
    this.ws.send(JSON.stringify({
      type: "contextual_update",
      text: e
    }));
  }
  /** Reset handshake/interruption state, then close the socket via super. */
  close() {
    this.initialized = !1, this.lastInterruptId = 0, super.close();
  }
}
344
// Cartesia Agents streaming WebSocket base URL and the API version sent as
// the cartesia_version query parameter.
const R = "wss://api.cartesia.ai/agents/stream", M = "2025-04-16";
345
/**
 * Voice agent backed by the Cartesia Agents streaming WebSocket API.
 *
 * Sends a "start" event on open, keeps the connection alive with periodic
 * keepalive events (Cartesia requires activity every 30s), buffers nothing
 * until the server "ack" supplies a stream id, and normalizes agent audio to
 * the shared 24kHz output rate `h`.
 */
class P extends m {
  agentName = "Cartesia";
  // Audio configuration
  cartesiaInputFormat = "pcm_16000";
  // Format we tell Cartesia we are sending
  cartesiaOutputRate = 16e3;
  // Cartesia defaults to 16kHz for web
  // Connection state
  streamId = null;
  isReady = !1;
  pingInterval = null;
  /**
   * Connect to the agent stream. Credentials are percent-encoded into the URL
   * (fix: previously interpolated raw — unlike the Gemini agent — so reserved
   * characters in the key could corrupt the query string).
   * @param e agentId and apiKey (both required); optional inputSampleRate
   *   (16000/24000/44100 map to pcm_* formats, anything else falls back to
   *   pcm_16000)
   * @throws if already connected or a credential is missing
   */
  async connect(e) {
    if (this.ws)
      throw new Error("Already connected");
    if (!e.agentId)
      throw new Error("Cartesia Agent ID is required");
    if (!e.apiKey)
      throw new Error("Cartesia API Key is required");
    e.inputSampleRate && (this.inputSampleRate = e.inputSampleRate), this.inputSampleRate === 16e3 ? this.cartesiaInputFormat = "pcm_16000" : this.inputSampleRate === 24e3 ? this.cartesiaInputFormat = "pcm_24000" : this.inputSampleRate === 44100 ? this.cartesiaInputFormat = "pcm_44100" : this.cartesiaInputFormat = "pcm_16000";
    const t = `${R}/${encodeURIComponent(e.agentId)}?api_key=${encodeURIComponent(e.apiKey)}&cartesia_version=${M}`;
    return new Promise((s, n) => {
      this.ws = new WebSocket(t), this.ws.onopen = () => {
        this.sendStartEvent(), this.startHeartbeat(), s();
      }, this.ws.onerror = () => {
        n(new Error("Failed to connect to Cartesia"));
      }, this.ws.onclose = (a) => {
        this.stopHeartbeat(), this.ws = null, this.isReady = !1, this.streamId = null, this.setState("idle"), this.emitClosed(a.code, a.reason);
      }, this.ws.onmessage = (a) => {
        this.handleMessage(a.data);
      };
    });
  }
  // Announce the input audio format; sent immediately on socket open.
  sendStartEvent() {
    if (!this.ws) return;
    const e = {
      event: "start",
      config: {
        input_format: this.cartesiaInputFormat
      }
    };
    this.ws.send(JSON.stringify(e));
  }
  /**
   * Keep connection alive with periodic custom events.
   * Cartesia requires activity every 30s.
   */
  startHeartbeat() {
    this.pingInterval = window.setInterval(() => {
      this.ws?.readyState === WebSocket.OPEN && this.streamId && this.ws.send(JSON.stringify({
        event: "custom",
        stream_id: this.streamId,
        metadata: { keepalive: !0 }
      }));
    }, 2e4);
  }
  // Stop the keepalive timer if one is running.
  stopHeartbeat() {
    this.pingInterval && (clearInterval(this.pingInterval), this.pingInterval = null);
  }
  /** Dispatch a parsed server message by its `event` tag. */
  handleParsedMessage(e) {
    const t = e;
    switch (t.event) {
      case "ack":
        this.handleAck(t);
        break;
      case "media_output":
        this.handleMediaOutput(t);
        break;
      case "clear":
        this.handleClear();
        break;
      case "error":
        console.error("[Cartesia] Server error:", t);
        break;
    }
  }
  // Server acknowledged "start": record the stream id and start accepting audio.
  handleAck(e) {
    this.streamId = e.stream_id || null, this.isReady = !0, this.setState("listening");
  }
  // Decode agent audio, resample to the shared 24kHz rate, and emit it.
  handleMediaOutput(e) {
    if (!e.media?.payload) return;
    this._state !== "speaking" && (this.events.emit("turnStart", void 0), this.setState("speaking"));
    let t = u(e.media.payload);
    this.cartesiaOutputRate !== h && (t = d(t, this.cartesiaOutputRate, h)), this.events.emit("audio", t);
  }
  // Server cleared pending audio (barge-in): surface as an interruption.
  handleClear() {
    this.events.emit("interrupted", void 0), this.setState("listening");
  }
  /**
   * Send caller PCM audio; dropped until the ack arrives. Resampled when the
   * caller's rate differs from the advertised pcm_* input format.
   */
  sendAudio(e) {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN || !this.isReady || !this.streamId)
      return;
    let t = e;
    // Explicit radix (fix: previously relied on parseInt's default).
    const s = parseInt(this.cartesiaInputFormat.split("_")[1], 10);
    this.inputSampleRate !== s && (t = d(e, this.inputSampleRate, s)), this.ws.send(JSON.stringify({
      event: "media_input",
      stream_id: this.streamId,
      media: {
        payload: p(t)
      }
    }));
  }
  /** Stop the heartbeat and reset stream state, then close via super. */
  close() {
    this.stopHeartbeat(), this.isReady = !1, this.streamId = null, super.close();
  }
}
449
// Metadata for the supported voice-agent backends; exported as AGENT_REGISTRY
// and consulted by getAgentInfo.
const x = [
  { id: "gemini", name: "Gemini Live", description: "Google Gemini Live API" },
  { id: "elevenlabs", name: "ElevenLabs", description: "ElevenLabs Conversational AI" },
  { id: "cartesia", name: "Cartesia", description: "Cartesia Agents API" }
];
454
/**
 * Instantiate the agent implementation for a backend id.
 * @param i backend id: "gemini" | "elevenlabs" | "cartesia"
 * @returns a fresh agent instance
 * @throws Error on an unrecognized id
 */
function N(i) {
  switch (i) {
    case "gemini":
      return new C();
    case "elevenlabs":
      return new k();
    case "cartesia":
      return new P();
  }
  throw new Error(`Unknown agent type: ${i}`);
}
466
/**
 * Look up registry metadata for a backend id.
 * @param i backend id to search for
 * @returns the registry entry, or undefined when unknown
 */
function T(i) {
  for (const entry of x) {
    if (entry.id === i) {
      return entry;
    }
  }
  return void 0;
}
469
// Publishable keys with a PersonaEmbed connection in flight (or established);
// PersonaEmbed.connect() uses this to skip duplicate concurrent connects.
const l = /* @__PURE__ */ new Set();
470
/**
 * Headless Persona avatar with voice agent integration.
 *
 * Creates the <video>/<audio> elements inside the supplied container and
 * wires: embed-session media tracks into those elements, microphone PCM out
 * to a voice agent, and agent audio/interruptions back into the session.
 * Overlays, controls, and status UI are the consumer's responsibility.
 */
class L {
  apiBaseUrl;
  publishableKey;
  callbacks;
  // DOM
  _video;
  _audio;
  // Session
  session = null;
  agent = null;
  audioContext = null;
  processor = null;
  stream = null;
  abortController = null;
  _status = "disconnected";
  _agentState = "idle";
  _isMuted = !1;
  mounted = !0;
  /**
   * Build the media elements and attach them to options.container.
   * The container is forced to position:relative with a black background so
   * the absolutely-positioned, muted, autoplaying video fills it; a separate
   * autoplaying <audio> element carries the session's audio track.
   */
  constructor(e) {
    this.apiBaseUrl = e.apiBaseUrl ?? "https://api.keyframelabs.com", this.publishableKey = e.publishableKey, this.callbacks = {
      onDisconnect: e.onDisconnect,
      onError: e.onError,
      onStateChange: e.onStateChange,
      onAgentStateChange: e.onAgentStateChange
    }, this._video = document.createElement("video"), this._video.style.position = "absolute", this._video.style.inset = "0", this._video.style.width = "100%", this._video.style.height = "100%", this._video.style.objectFit = e.videoFit ?? "cover", this._video.autoplay = !0, this._video.playsInline = !0, this._video.muted = !0, e.container.style.position = "relative", e.container.style.backgroundColor = "#000", this._audio = document.createElement("audio"), this._audio.autoplay = !0, e.container.appendChild(this._video), e.container.appendChild(this._audio);
  }
  // Read-only state
  /** Current embed status: "connecting" | "connected" | "error" | "disconnected". */
  get status() {
    return this._status;
  }
  /** Current voice-agent state. */
  get agentState() {
    return this._agentState;
  }
  /** Whether microphone audio is currently suppressed. */
  get isMuted() {
    return this._isMuted;
  }
  /** The managed <video> element (already appended to the container). */
  get videoElement() {
    return this._video;
  }
  /** The managed <audio> element (already appended to the container). */
  get audioElement() {
    return this._audio;
  }
  /** Connect to the embed session */
  // Sequence: create server session -> wire media session -> open microphone
  // -> connect the voice agent. The module-level Set `l` (keyed by
  // publishableKey) dedupes concurrent connects; `mounted` is re-checked
  // after awaits so a disconnect() during setup tears down cleanly.
  async connect() {
    if (l.has(this.publishableKey)) {
      console.log("[PersonaEmbed] Connection already in progress, skipping");
      return;
    }
    l.add(this.publishableKey), this.mounted = !0, this.abortController = new AbortController(), this.setStatus("connecting");
    try {
      const e = await this.fetchSession(this.abortController.signal);
      if (!this.mounted) {
        l.delete(this.publishableKey);
        return;
      }
      if (await this.initSession(e), await this.initMicrophone(), await this.connectAgent(e.voice_agent_details), !this.mounted) {
        this.cleanup(), l.delete(this.publishableKey);
        return;
      }
      this.setStatus("connected");
      // NOTE(review): on the success path the key stays in `l` until
      // disconnect() removes it — confirm this is the intended guard against
      // a second connect() while connected.
    } catch (e) {
      if (l.delete(this.publishableKey), e instanceof Error && e.name === "AbortError")
        return;
      console.error("[PersonaEmbed]", e), this.mounted && (this.setStatus("error"), this.callbacks.onError?.(e));
    }
  }
  /** Disconnect and cleanup */
  // Marks unmounted first so in-flight connect() steps bail, aborts the
  // pending session fetch, releases the dedupe key, and tears down media.
  disconnect() {
    this.mounted = !1, this.abortController?.abort(), this.abortController = null, l.delete(this.publishableKey), this.cleanup(), this.setStatus("disconnected");
  }
  /** Toggle microphone mute */
  // Only flips a flag checked per audio buffer in initMicrophone's processor;
  // no tracks are stopped, so unmuting is instant.
  toggleMute() {
    this._isMuted = !this._isMuted;
  }
  // Update status and fire onStateChange only on an actual change.
  setStatus(e) {
    this._status !== e && (this._status = e, this.callbacks.onStateChange?.(e));
  }
  // Update agent state and fire onAgentStateChange only on an actual change.
  setAgentState(e) {
    this._agentState !== e && (this._agentState = e, this.callbacks.onAgentStateChange?.(e));
  }
  // POST the publishable key to the embed API and return the session payload
  // (session_details + voice_agent_details). Throws with status and response
  // body context on a non-2xx response.
  async fetchSession(e) {
    const t = await fetch(`${this.apiBaseUrl}/v1/embed/create_session`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ publishable_key: this.publishableKey }),
      signal: e
    });
    if (!t.ok) {
      const s = await t.json().catch(() => null);
      throw new Error(`create_session failed: ${t.status} ${JSON.stringify(s)}`);
    }
    return t.json();
  }
  // Create the media session (its video/audio tracks feed our elements),
  // create the voice agent for the configured backend, and bridge them:
  // agent audio -> session, agent interruptions -> session.interrupt(),
  // agent state -> onAgentStateChange, closes -> onDisconnect.
  async initSession(e) {
    this.session = v({
      serverUrl: e.session_details.server_url,
      participantToken: e.session_details.participant_token,
      agentIdentity: e.session_details.agent_identity,
      onVideoTrack: (t) => {
        console.log("[PersonaEmbed] Setting video track", t.readyState, t.enabled), this._video.srcObject = new MediaStream([t]), this._video.play().catch((s) => console.warn("[PersonaEmbed] Video play failed:", s));
      },
      onAudioTrack: (t) => {
        this._audio.srcObject = new MediaStream([t]), this._audio.play().catch(() => {
        });
      },
      onStateChange: (t) => {
        this.mounted && t === "disconnected" && (this.setStatus("disconnected"), this.callbacks.onDisconnect?.());
      },
      onError: (t) => {
        this.mounted && this.callbacks.onError?.(t);
      },
      onClose: () => {
        this.mounted && this.callbacks.onDisconnect?.();
      }
    }), this.agent = N(e.voice_agent_details.type), this.agent.on("audio", (t) => this.session?.sendAudio(t)), this.agent.on("interrupted", () => this.session?.interrupt()), this.agent.on("stateChange", (t) => this.setAgentState(t)), this.agent.on("closed", () => {
      this.mounted && this.callbacks.onDisconnect?.();
    }), await this.session.connect();
  }
  // Open the microphone at 16kHz and pump Float32 buffers -> PCM16 -> agent
  // via a ScriptProcessorNode. (NOTE(review): ScriptProcessorNode is
  // deprecated; an AudioWorklet migration may be warranted.)
  async initMicrophone() {
    this.stream = await navigator.mediaDevices.getUserMedia({
      audio: { sampleRate: 16e3, echoCancellation: !0, noiseSuppression: !0 }
    }), this.audioContext = new AudioContext({ sampleRate: 16e3 });
    const e = this.audioContext.createMediaStreamSource(this.stream);
    this.processor = this.audioContext.createScriptProcessor(4096, 1, 1), this.processor.onaudioprocess = (t) => {
      if (!this._isMuted) {
        const s = w(t.inputBuffer.getChannelData(0));
        this.agent?.sendAudio(s);
      }
    }, e.connect(this.processor), this.processor.connect(this.audioContext.destination);
  }
  // Connect the agent with backend-specific credentials from the session
  // payload; all backends are told audio arrives at 16kHz.
  async connectAgent(e) {
    if (!this.agent) return;
    const t = { inputSampleRate: 16e3 };
    e.type === "gemini" ? await this.agent.connect({
      ...t,
      apiKey: e.token,
      authType: "ephemeral_token"
    }) : e.type === "elevenlabs" ? await this.agent.connect({
      ...t,
      agentId: e.agent_id,
      signedUrl: e.signed_url
    }) : e.type === "cartesia" && await this.agent.connect({
      ...t,
      agentId: e.agent_id,
      apiKey: e.token
    });
  }
  // Stop mic tracks, tear down the audio graph, and close agent and session;
  // optional chaining makes this safe to call repeatedly or mid-setup.
  cleanup() {
    this.stream?.getTracks().forEach((e) => e.stop()), this.processor?.disconnect(), this.audioContext?.close(), this.agent?.close(), this.session?.close(), this.stream = null, this.processor = null, this.audioContext = null, this.agent = null, this.session = null;
  }
}
621
// Public API surface: stable export names for the minified internal
// identifiers.
export {
  x as AGENT_REGISTRY,
  m as BaseAgent,
  P as CartesiaAgent,
  k as ElevenLabsAgent,
  C as GeminiLiveAgent,
  L as PersonaEmbed,
  h as SAMPLE_RATE,
  u as base64ToBytes,
  p as bytesToBase64,
  N as createAgent,
  f as createEventEmitter,
  w as floatTo16BitPCM,
  T as getAgentInfo,
  d as resamplePcm
};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@keyframelabs/elements",
3
- "version": "0.0.1",
3
+ "version": "0.0.2",
4
4
  "main": "dist/index.js",
5
5
  "module": "dist/index.mjs",
6
6
  "types": "dist/index.d.ts",