@glydeunity/voice-sdk 1.0.0 → 1.1.0

package/dist/index.d.ts CHANGED
@@ -1,32 +1,243 @@
+/**
+ * GLYDE Voice SDK
+ *
+ * Voice agent client for GLYDE Unity with support for multiple authentication methods
+ * and voice context types.
+ *
+ * @packageDocumentation
+ */
+
+/**
+ * Deepgram agent configuration for LLM and voice settings
+ */
+export declare interface DeepgramAgentConfig {
+    think?: {
+        provider?: {
+            type: string;
+            model?: string;
+        };
+        functions?: Array<{
+            name: string;
+            description: string;
+            parameters: unknown;
+        }>;
+    };
+    speak?: {
+        provider?: {
+            type: string;
+            model?: string;
+        };
+    };
+    listen?: {
+        provider?: {
+            type: string;
+            model?: string;
+            version?: string;
+        };
+    };
+}
+
+/**
+ * GlydeVoice - Voice Agent Client
+ *
+ * Connects to the Deepgram Voice Agent API for bidirectional voice conversation.
+ * Uses the wss://agent.deepgram.com/v1/agent/converse WebSocket endpoint, which:
+ * - Receives user audio (microphone)
+ * - Transcribes speech to text (STT)
+ * - Sends the text to an LLM for a response
+ * - Converts the response to speech (TTS)
+ * - Streams the audio back to the user
+ *
+ * Audio Architecture:
+ * - Microphone capture: AudioWorklet (audio-capture-processor.js) at 48kHz
+ * - Playback: AudioWorklet (audio-playback-processor.js) with a ring buffer
+ * - The ring buffer enables instant interruption (cleared when the user speaks)
+ */
 export declare class GlydeVoice {
     private config;
-    private _deepgram;
     private unityUrl;
     private active;
+    private serverConfig;
+    private ws;
+    private audioContext;
+    private mediaStream;
+    private captureWorkletNode;
+    private playbackWorkletNode;
+    private isMuted;
+    private readonly outputSampleRate;
+    private readonly inputSampleRate;
+    private isAgentSpeaking;
+    private agentAudioDoneReceived;
+    /**
+     * Create a new GlydeVoice instance
+     * @param config - Configuration options
+     */
     constructor(config: GlydeVoiceConfig);
+    /**
+     * Get authentication headers based on the configured auth method
+     * Supports publishableKey, apiKey, and JWT token (authToken)
+     * @returns Headers object with appropriate authentication
+     */
+    private getAuthHeaders;
+    /**
+     * Fetch voice configuration from the Unity API
+     * @returns Voice configuration including system prompt, tools, and Deepgram settings
+     */
+    private fetchConfig;
     /**
      * Initialize and start the voice session
      */
     start(): Promise<void>;
+    /**
+     * Initialize the audio system with both capture and playback worklets
+     */
+    private initializeAudio;
+    /**
+     * Handle text messages from the Voice Agent
+     */
+    private handleTextMessage;
+    /**
+     * Handle binary audio data (Blob) from agent TTS
+     */
+    private handleAudioData;
+    /**
+     * Handle binary audio buffer from agent TTS
+     * Deepgram sends linear16 PCM at 24kHz; we resample to 48kHz for playback
+     */
+    private handleAudioBuffer;
+    /**
+     * Resample audio from 24kHz to 48kHz using linear interpolation
+     */
+    private resample24kTo48k;
+    /**
+     * Clear the playback buffer (for interruption handling)
+     */
+    private clearPlaybackBuffer;
+    /**
+     * Start capturing microphone audio using AudioWorklet
+     */
+    private startMicrophone;
+    /**
+     * Save transcript to the Unity backend
+     */
+    private saveTranscript;
+    /**
+     * Toggle mute state
+     * @param muted - Whether to mute the microphone
+     */
+    setMuted(muted: boolean): void;
+    /**
+     * Get current mute state
+     */
+    getMuted(): boolean;
+    /**
+     * Check if the voice agent is currently active
+     */
+    isActive(): boolean;
+    /**
+     * Get the current server configuration
+     */
+    getServerConfig(): VoiceConfig | null;
     /**
      * Stop the voice session
      */
     stop(): void;
+    /**
+     * Cleanup resources
+     */
+    private cleanup;
+    /**
+     * Emit an event to the callback
+     */
     private emit;
+    /**
+     * Render a simple UI widget (optional)
+     */
     private renderUI;
 }
 
+/**
+ * Configuration options for GlydeVoice
+ */
 export declare interface GlydeVoiceConfig {
-    publishableKey: string;
+    /** Publishable key for external apps (Screen) */
+    publishableKey?: string;
+    /** API key for programmatic access */
+    apiKey?: string;
+    /** JWT token for GLYDEBuddy passthrough (Teams app) */
+    authToken?: string;
+    /** Voice context type - determines which prompt and tools to use */
+    contextType: VoiceContextType;
+    /** Context identifier (e.g., application_uuid) - required for screening */
     contextId?: string;
+    /** Unity API base URL - defaults to https://api.glydeunity.com */
     unityBaseUrl?: string;
+    /** DOM element to render the widget UI (optional) */
     container?: HTMLElement | string;
+    /** Event callback for voice agent events */
     onEvent?: (event: VoiceEvent) => void;
+    /** Transcript callback for conversation text */
+    onTranscript?: (text: string, role: 'user' | 'agent') => void;
+    /** Override the system prompt (skips the config fetch) */
+    systemPrompt?: string;
+    /** Override the Deepgram configuration */
+    deepgramConfig?: DeepgramAgentConfig;
 }
 
+/**
+ * MCP Tool definition for the voice agent
+ */
+export declare interface MCPTool {
+    name: string;
+    description: string;
+    inputSchema?: unknown;
+}
+
+/**
+ * Voice configuration response from the Unity API
+ */
+export declare interface VoiceConfig {
+    system_prompt: string;
+    available_tools: MCPTool[];
+    deepgram_config: DeepgramAgentConfig;
+    context: {
+        type: VoiceContextType;
+        id: string | null;
+    };
+}
+
+/**
+ * GlydeVoice SDK - Voice Agent Client for GLYDE Unity
+ *
+ * Provides voice interaction capabilities with GLYDE AI agents through the Deepgram Voice API.
+ * Supports multiple authentication methods: publishableKey, apiKey, and JWT token.
+ *
+ * @example
+ * // Using a publishable key (external apps)
+ * const voice = new GlydeVoice({
+ *     publishableKey: 'pk_...',
+ *     contextType: 'screening',
+ *     contextId: 'application-uuid'
+ * });
+ *
+ * @example
+ * // Using a JWT token (GLYDEBuddy Teams app)
+ * const voice = new GlydeVoice({
+ *     authToken: userSession.accessToken,
+ *     contextType: 'recruiter'
+ * });
+ */
+/**
+ * Voice context types supported by the voice agent
+ */
+export declare type VoiceContextType = 'screening' | 'recruiter' | 'custom' | 'phone';
+
+/**
+ * Voice events emitted by the agent
+ */
 export declare interface VoiceEvent {
-    type: 'open' | 'close' | 'error' | 'transcript' | 'agent_audio';
-    payload?: any;
+    type: 'open' | 'close' | 'error' | 'ready' | 'user_speaking' | 'agent_speaking' | 'microphone_ready' | 'transcript' | 'agent_audio';
+    payload?: unknown;
 }
 
 export { }
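
The declarations above describe the consumer-facing flow. A minimal usage sketch, not part of the diff, based only on these types: the key, context id, and container selector are placeholder values, and the top-level await assumes an ES module context.

import { GlydeVoice } from '@glydeunity/voice-sdk';

const voice = new GlydeVoice({
    publishableKey: 'pk_...',      // any one of publishableKey, apiKey, or authToken
    contextType: 'screening',
    contextId: 'application-uuid', // required for the screening context
    container: '#voice-widget',    // optional widget mount point
    onEvent: (event) => {
        if (event.type === 'agent_speaking') console.log('agent speaking:', event.payload);
    },
    onTranscript: (text, role) => console.log(`[${role}] ${text}`),
});

await voice.start();  // fetch config, authenticate, open the agent WebSocket
voice.setMuted(true); // microphone frames are dropped while muted
voice.stop();         // tear down worklets, media stream, and socket

Judging from the implementation below, 'open' fires when the WebSocket connects, 'ready' on the agent's Welcome message, and 'microphone_ready' once capture starts after SettingsApplied; transcripts are delivered both through onTranscript and as 'transcript' events.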
@@ -1,4 +1,349 @@
-import { G as r } from "./index-BbD4w_Sz.js";
+class l {
+  config;
+  unityUrl;
+  active = !1;
+  serverConfig = null;
+  // WebSocket and Audio
+  ws = null;
+  audioContext = null;
+  mediaStream = null;
+  captureWorkletNode = null;
+  playbackWorkletNode = null;
+  isMuted = !1;
+  // Audio settings
+  outputSampleRate = 24e3;
+  // Deepgram TTS output rate
+  inputSampleRate = 48e3;
+  // Microphone input rate
+  // Agent state
+  isAgentSpeaking = !1;
+  agentAudioDoneReceived = !1;
+  /**
+   * Create a new GlydeVoice instance
+   * @param config - Configuration options
+   */
+  constructor(e) {
+    this.config = e, this.unityUrl = e.unityBaseUrl || "https://api.glydeunity.com", !e.publishableKey && !e.apiKey && !e.authToken && console.warn("[GlydeVoice] No authentication method provided. One of publishableKey, apiKey, or authToken is required.");
+  }
+  /**
+   * Get authentication headers based on the configured auth method
+   * Supports publishableKey, apiKey, and JWT token (authToken)
+   * @returns Headers object with appropriate authentication
+   */
+  getAuthHeaders() {
+    const e = {
+      "Content-Type": "application/json"
+    };
+    return this.config.publishableKey && (e["x-publishable-key"] = this.config.publishableKey), this.config.apiKey && (e["x-api-key"] = this.config.apiKey), this.config.authToken && (e.Authorization = `Bearer ${this.config.authToken}`), e;
+  }
+  /**
+   * Fetch voice configuration from the Unity API
+   * @returns Voice configuration including system prompt, tools, and Deepgram settings
+   */
+  async fetchConfig() {
+    const e = `${this.unityUrl}/api/unity/voice/config/${this.config.contextType}`, t = this.config.contextId ? `${e}/${this.config.contextId}` : e, o = await fetch(t, {
+      method: "GET",
+      headers: this.getAuthHeaders()
+    });
+    if (!o.ok) {
+      const a = await o.json();
+      throw new Error(a.error?.message || a.message || "Failed to fetch voice config");
+    }
+    const { data: s } = await o.json();
+    return s;
+  }
+  /**
+   * Initialize and start the voice session
+   */
+  async start() {
+    if (!this.active) {
+      this.active = !0;
+      try {
+        this.config.systemPrompt || (this.serverConfig = await this.fetchConfig(), console.log("[GlydeVoice] Fetched config:", this.serverConfig));
+        const e = await fetch(`${this.unityUrl}/api/unity/voice/auth`, {
+          method: "POST",
+          headers: this.getAuthHeaders(),
+          body: JSON.stringify({
+            context_id: this.config.contextId,
+            domain: typeof window < "u" ? window.location.hostname : "localhost"
+          })
+        });
+        if (!e.ok) {
+          const i = await e.json();
+          throw new Error(i.error?.message || i.message || "Failed to authenticate voice session");
+        }
+        const { data: t } = await e.json(), { token: o, agent_config: s } = t, a = this.config.systemPrompt || this.serverConfig?.system_prompt || s.instructions || "You are a helpful AI assistant.";
+        await this.initializeAudio();
+        const n = "wss://agent.deepgram.com/v1/agent/converse";
+        this.ws = new WebSocket(n, ["bearer", o]), this.ws.onopen = () => {
+          const i = this.config.deepgramConfig || this.serverConfig?.deepgram_config || {
+            think: { provider: { type: "open_ai", model: "gpt-4o-mini" } },
+            speak: { provider: { type: "deepgram", model: "aura-2-thalia-en" } },
+            listen: { provider: { type: "deepgram", model: "nova-2", version: "latest" } }
+          }, r = {
+            type: "Settings",
+            audio: {
+              input: {
+                encoding: "linear16",
+                sample_rate: this.inputSampleRate
+              },
+              output: {
+                encoding: "linear16",
+                sample_rate: this.outputSampleRate,
+                container: "none"
+              }
+            },
+            agent: {
+              language: "en",
+              speak: i.speak || {
+                provider: { type: "deepgram", model: "aura-2-thalia-en" }
+              },
+              listen: i.listen || {
+                provider: { type: "deepgram", version: "v2", model: "flux-general-en" }
+              },
+              think: {
+                provider: i.think?.provider || { type: "open_ai", model: "gpt-4o-mini" },
+                functions: i.think?.functions || [
+                  {
+                    name: "end_conversation",
+                    description: "End the conversation when stop phrases are detected.",
+                    parameters: {
+                      type: "object",
+                      properties: {
+                        item: { type: "string", description: "The phrase that triggered end of conversation" }
+                      },
+                      required: ["item"]
+                    }
+                  }
+                ]
+              },
+              greeting: "Hi! I'm ready to speak with you. How can I help you today?"
+            }
+          };
+          this.ws.send(JSON.stringify(r)), this.emit({ type: "open", payload: { config: s, serverConfig: this.serverConfig } });
+        };
+        const c = a;
+        this.ws.onmessage = (i) => {
+          if (typeof i.data == "string") {
+            try {
+              if (JSON.parse(i.data).type === "SettingsApplied") {
+                const d = {
+                  type: "UpdatePrompt",
+                  prompt: c
+                };
+                this.ws.send(JSON.stringify(d)), this.startMicrophone();
+              }
+            } catch {
+            }
+            this.handleTextMessage(i.data);
+          } else i.data instanceof Blob ? this.handleAudioData(i.data) : i.data instanceof ArrayBuffer && this.handleAudioBuffer(i.data);
+        }, this.ws.onerror = (i) => {
+          console.error("[GlydeVoice] WebSocket error:", i), this.emit({ type: "error", payload: i });
+        }, this.ws.onclose = () => {
+          this.cleanup(), this.emit({ type: "close" });
+        }, this.renderUI();
+      } catch (e) {
+        throw console.error("[GlydeVoice] Error starting session:", e), this.active = !1, this.emit({ type: "error", payload: e }), e;
+      }
+    }
+  }
+  /**
+   * Initialize the audio system with both capture and playback worklets
+   */
+  async initializeAudio() {
+    this.audioContext = new AudioContext({ sampleRate: this.inputSampleRate }), await Promise.all([
+      this.audioContext.audioWorklet.addModule("/audio-processor.js"),
+      this.audioContext.audioWorklet.addModule("/audio-playback-processor.js")
+    ]), this.playbackWorkletNode = new AudioWorkletNode(this.audioContext, "audio-playback-processor"), this.playbackWorkletNode.connect(this.audioContext.destination), this.playbackWorkletNode.port.onmessage = (e) => {
+      const { type: t } = e.data;
+      (t === "cleared" || t === "bufferEmpty") && (this.isAgentSpeaking = !1, this.agentAudioDoneReceived = !1, this.emit({ type: "agent_speaking", payload: !1 }));
+    };
+  }
+  /**
+   * Handle text messages from the Voice Agent
+   */
+  handleTextMessage(e) {
+    try {
+      const t = JSON.parse(e);
+      switch (t.type) {
+        case "Welcome":
+          this.emit({ type: "ready" });
+          break;
+        case "SettingsApplied":
+          break;
+        case "UserStartedSpeaking":
+          this.emit({ type: "user_speaking", payload: !0 }), this.clearPlaybackBuffer(), this.isAgentSpeaking = !1, this.agentAudioDoneReceived = !1;
+          break;
+        case "UserStoppedSpeaking":
+          this.emit({ type: "user_speaking", payload: !1 });
+          break;
+        case "ConversationText":
+          if (t.content && t.content.trim()) {
+            const o = t.role === "assistant" ? "agent" : "user";
+            this.config.onTranscript && this.config.onTranscript(t.content, o), this.emit({ type: "transcript", payload: { text: t.content, role: o } }), this.saveTranscript(t.content, t.role);
+          }
+          break;
+        case "AgentStartedSpeaking":
+          this.isAgentSpeaking = !0, this.agentAudioDoneReceived = !1, this.emit({ type: "agent_speaking", payload: !0 });
+          break;
+        case "AgentAudioDone":
+          this.agentAudioDoneReceived = !0;
+          break;
+        case "Error":
+          console.error("[GlydeVoice] Agent error:", t), this.emit({ type: "error", payload: t });
+          break;
+      }
+    } catch (t) {
+      console.error("[GlydeVoice] Failed to parse message:", t);
+    }
+  }
+  /**
+   * Handle binary audio data (Blob) from agent TTS
+   */
+  async handleAudioData(e) {
+    const t = await e.arrayBuffer();
+    this.handleAudioBuffer(t);
+  }
+  /**
+   * Handle binary audio buffer from agent TTS
+   * Deepgram sends linear16 PCM at 24kHz; we resample to 48kHz for playback
+   */
+  handleAudioBuffer(e) {
+    if (!this.playbackWorkletNode || !this.audioContext) return;
+    this.audioContext.state === "suspended" && this.audioContext.resume();
+    const t = e.byteLength;
+    if (t === 0) return;
+    const o = t - t % 2;
+    if (o === 0) return;
+    const s = o === t ? e : e.slice(0, o), a = new Int16Array(s), n = new Float32Array(a.length);
+    for (let r = 0; r < a.length; r++)
+      n[r] = a[r] / 32768;
+    const c = this.resample24kTo48k(n);
+    !this.isAgentSpeaking && !this.agentAudioDoneReceived && (this.isAgentSpeaking = !0, this.emit({ type: "agent_speaking", payload: !0 }));
+    const i = new Float32Array(c);
+    this.playbackWorkletNode.port.postMessage({
+      type: "audio",
+      data: i
+    }, [i.buffer]);
+  }
+  /**
+   * Resample audio from 24kHz to 48kHz using linear interpolation
+   */
+  resample24kTo48k(e) {
+    const t = e.length * 2, o = new Float32Array(t);
+    for (let a = 0; a < e.length - 1; a++) {
+      const n = e[a], c = e[a + 1];
+      o[a * 2] = n, o[a * 2 + 1] = (n + c) / 2;
+    }
+    const s = e.length - 1;
+    return o[s * 2] = e[s], o[s * 2 + 1] = e[s], o;
+  }
+  /**
+   * Clear the playback buffer (for interruption handling)
+   */
+  clearPlaybackBuffer() {
+    this.playbackWorkletNode && this.playbackWorkletNode.port.postMessage({ type: "clear" });
+  }
+  /**
+   * Start capturing microphone audio using AudioWorklet
+   */
+  async startMicrophone() {
+    if (!this.audioContext)
+      throw new Error("Audio context not initialized");
+    try {
+      this.mediaStream = await navigator.mediaDevices.getUserMedia({
+        audio: {
+          channelCount: 1,
+          sampleRate: this.inputSampleRate,
+          echoCancellation: !0,
+          noiseSuppression: !0
+        }
+      });
+      const e = this.audioContext.createMediaStreamSource(this.mediaStream);
+      this.captureWorkletNode = new AudioWorkletNode(this.audioContext, "audio-capture-processor"), this.captureWorkletNode.port.onmessage = (t) => {
+        !this.active || !this.ws || this.ws.readyState !== WebSocket.OPEN || this.isMuted || this.ws.send(t.data);
+      }, e.connect(this.captureWorkletNode), this.emit({ type: "microphone_ready" });
+    } catch (e) {
+      throw console.error("[GlydeVoice] Microphone error:", e), e;
+    }
+  }
+  /**
+   * Save transcript to the Unity backend
+   */
+  async saveTranscript(e, t) {
+    if (!(!this.config.contextId || !e))
+      try {
+        await fetch(`${this.unityUrl}/api/unity/voice/transcript`, {
+          method: "POST",
+          headers: this.getAuthHeaders(),
+          body: JSON.stringify({
+            context_id: this.config.contextId,
+            content: e,
+            role: t === "assistant" ? "assistant" : "user"
+          })
+        });
+      } catch {
+      }
+  }
+  /**
+   * Toggle mute state
+   * @param muted - Whether to mute the microphone
+   */
+  setMuted(e) {
+    this.isMuted = e;
+  }
+  /**
+   * Get current mute state
+   */
+  getMuted() {
+    return this.isMuted;
+  }
+  /**
+   * Check if the voice agent is currently active
+   */
+  isActive() {
+    return this.active;
+  }
+  /**
+   * Get the current server configuration
+   */
+  getServerConfig() {
+    return this.serverConfig;
+  }
+  /**
+   * Stop the voice session
+   */
+  stop() {
+    this.active = !1, this.cleanup();
+  }
+  /**
+   * Cleanup resources
+   */
+  cleanup() {
+    this.captureWorkletNode && (this.captureWorkletNode.disconnect(), this.captureWorkletNode.port.close(), this.captureWorkletNode = null), this.playbackWorkletNode && (this.playbackWorkletNode.disconnect(), this.playbackWorkletNode.port.close(), this.playbackWorkletNode = null), this.mediaStream && (this.mediaStream.getTracks().forEach((e) => e.stop()), this.mediaStream = null), this.audioContext && (this.audioContext.close(), this.audioContext = null), this.ws && (this.ws.readyState === WebSocket.OPEN && this.ws.close(), this.ws = null);
+  }
+  /**
+   * Emit an event to the callback
+   */
+  emit(e) {
+    this.config.onEvent && this.config.onEvent(e);
+  }
+  /**
+   * Render a simple UI widget (optional)
+   */
+  renderUI() {
+    if (!this.config.container) return;
+    const e = typeof this.config.container == "string" ? document.querySelector(this.config.container) : this.config.container;
+    e && (e.innerHTML = `
+      <div style="padding: 20px; border: 1px solid #ccc; border-radius: 8px; background: #fff;">
+        <h3>Glyde Voice Agent</h3>
+        <p>Status: Active</p>
+        <p>Context: ${this.config.contextType}</p>
+        <button onclick="this.closest('div').remove()">Close</button>
+      </div>
+    `);
+  }
+}
 export {
-  r as GlydeVoice
+  l as GlydeVoice
 };
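
The decode-and-upsample path in handleAudioBuffer and resample24kTo48k is easier to follow un-minified. A readable TypeScript sketch of the same logic, with function names of our own choosing; behavior mirrors the minified code above:

// linear16 bytes at 24kHz in -> Float32 samples in [-1, 1) out.
function pcm16ToFloat32(buffer: ArrayBuffer): Float32Array {
    const even = buffer.byteLength - (buffer.byteLength % 2); // Int16Array needs an even byte count
    const pcm = new Int16Array(buffer.slice(0, even));
    const out = new Float32Array(pcm.length);
    for (let i = 0; i < pcm.length; i++) out[i] = pcm[i] / 32768;
    return out;
}

// Double the sample rate by linear interpolation: each output pair is the
// original sample followed by the midpoint with its right neighbor.
function resample24kTo48k(input: Float32Array): Float32Array {
    const out = new Float32Array(input.length * 2);
    for (let i = 0; i < input.length - 1; i++) {
        out[i * 2] = input[i];
        out[i * 2 + 1] = (input[i] + input[i + 1]) / 2;
    }
    const last = input.length - 1;
    out[last * 2] = input[last];     // no right neighbor: duplicate the final sample
    out[last * 2 + 1] = input[last];
    return out;
}

Because the ratio is exactly 2x, the interpolation never needs fractional indexing; the upsampled frame is then transferred to the playback worklet's ring buffer, which is what enables the instant interruption described in the class docs.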