@glydeunity/voice-sdk 1.0.0 → 1.1.0

package/dist/index.d.ts CHANGED
@@ -1,32 +1,243 @@
+/**
+ * GLYDE Voice SDK
+ *
+ * Voice agent client for GLYDE Unity with support for multiple authentication methods
+ * and voice context types.
+ *
+ * @packageDocumentation
+ */
+
+/**
+ * Deepgram agent configuration for LLM and voice settings
+ */
+export declare interface DeepgramAgentConfig {
+    think?: {
+        provider?: {
+            type: string;
+            model?: string;
+        };
+        functions?: Array<{
+            name: string;
+            description: string;
+            parameters: unknown;
+        }>;
+    };
+    speak?: {
+        provider?: {
+            type: string;
+            model?: string;
+        };
+    };
+    listen?: {
+        provider?: {
+            type: string;
+            model?: string;
+            version?: string;
+        };
+    };
+}
+
+/**
+ * GlydeVoice - Voice Agent Client
+ *
+ * Connects to the Deepgram Voice Agent API for bidirectional voice conversation.
+ * Uses the wss://agent.deepgram.com/v1/agent/converse WebSocket endpoint, which:
+ * - Receives user audio (microphone)
+ * - Transcribes speech to text (STT)
+ * - Sends the text to an LLM for a response
+ * - Converts the response to speech (TTS)
+ * - Streams the audio back to the user
+ *
+ * Audio Architecture:
+ * - Microphone capture: AudioWorklet (audio-capture-processor.js) at 48kHz
+ * - Playback: AudioWorklet (audio-playback-processor.js) with a ring buffer
+ * - The ring buffer enables instant interruption (cleared when the user speaks)
+ */
 export declare class GlydeVoice {
     private config;
-    private _deepgram;
     private unityUrl;
     private active;
+    private serverConfig;
+    private ws;
+    private audioContext;
+    private mediaStream;
+    private captureWorkletNode;
+    private playbackWorkletNode;
+    private isMuted;
+    private readonly outputSampleRate;
+    private readonly inputSampleRate;
+    private isAgentSpeaking;
+    private agentAudioDoneReceived;
+    /**
+     * Create a new GlydeVoice instance
+     * @param config - Configuration options
+     */
     constructor(config: GlydeVoiceConfig);
+    /**
+     * Get authentication headers based on the configured auth method
+     * Supports publishableKey, apiKey, and JWT token (authToken)
+     * @returns Headers object with appropriate authentication
+     */
+    private getAuthHeaders;
+    /**
+     * Fetch voice configuration from the Unity API
+     * @returns Voice configuration including system prompt, tools, and Deepgram settings
+     */
+    private fetchConfig;
     /**
      * Initialize and start the voice session
      */
     start(): Promise<void>;
+    /**
+     * Initialize the audio system with both capture and playback worklets
+     */
+    private initializeAudio;
+    /**
+     * Handle text messages from the Voice Agent
+     */
+    private handleTextMessage;
+    /**
+     * Handle binary audio data (Blob) from agent TTS
+     */
+    private handleAudioData;
+    /**
+     * Handle binary audio buffer from agent TTS
+     * Deepgram sends linear16 PCM at 24kHz; we resample to 48kHz for playback
+     */
+    private handleAudioBuffer;
+    /**
+     * Resample audio from 24kHz to 48kHz using linear interpolation
+     */
+    private resample24kTo48k;
+    /**
+     * Clear the playback buffer (for interruption handling)
+     */
+    private clearPlaybackBuffer;
+    /**
+     * Start capturing microphone audio using AudioWorklet
+     */
+    private startMicrophone;
+    /**
+     * Save transcript to the Unity backend
+     */
+    private saveTranscript;
+    /**
+     * Toggle mute state
+     * @param muted - Whether to mute the microphone
+     */
+    setMuted(muted: boolean): void;
+    /**
+     * Get current mute state
+     */
+    getMuted(): boolean;
+    /**
+     * Check if the voice agent is currently active
+     */
+    isActive(): boolean;
+    /**
+     * Get the current server configuration
+     */
+    getServerConfig(): VoiceConfig | null;
     /**
      * Stop the voice session
      */
     stop(): void;
+    /**
+     * Cleanup resources
+     */
+    private cleanup;
+    /**
+     * Emit an event to the callback
+     */
     private emit;
+    /**
+     * Render a simple UI widget (optional)
+     */
     private renderUI;
 }
 
+/**
+ * Configuration options for GlydeVoice
+ */
 export declare interface GlydeVoiceConfig {
-    publishableKey: string;
+    /** Publishable key for external apps (Screen) */
+    publishableKey?: string;
+    /** API key for programmatic access */
+    apiKey?: string;
+    /** JWT token for GLYDEBuddy passthrough (Teams app) */
+    authToken?: string;
+    /** Voice context type - determines which prompt and tools to use */
+    contextType: VoiceContextType;
+    /** Context identifier (e.g., application_uuid) - required for screening */
     contextId?: string;
+    /** Unity API base URL - defaults to https://api.glydeunity.com */
     unityBaseUrl?: string;
+    /** DOM element to render the widget UI (optional) */
     container?: HTMLElement | string;
+    /** Event callback for voice agent events */
     onEvent?: (event: VoiceEvent) => void;
+    /** Transcript callback for conversation text */
+    onTranscript?: (text: string, role: 'user' | 'agent') => void;
+    /** Override the system prompt (skips the config fetch) */
+    systemPrompt?: string;
+    /** Override the Deepgram configuration */
+    deepgramConfig?: DeepgramAgentConfig;
 }
 
+/**
+ * MCP Tool definition for the voice agent
+ */
+export declare interface MCPTool {
+    name: string;
+    description: string;
+    inputSchema?: unknown;
+}
+
+/**
+ * Voice configuration response from the Unity API
+ */
+export declare interface VoiceConfig {
+    system_prompt: string;
+    available_tools: MCPTool[];
+    deepgram_config: DeepgramAgentConfig;
+    context: {
+        type: VoiceContextType;
+        id: string | null;
+    };
+}
+
+/**
+ * GlydeVoice SDK - Voice Agent Client for GLYDE Unity
+ *
+ * Provides voice interaction capabilities with GLYDE AI agents through the Deepgram Voice API.
+ * Supports multiple authentication methods: publishableKey, apiKey, and JWT token.
+ *
+ * @example
+ * // Using a publishable key (external apps)
+ * const voice = new GlydeVoice({
+ *     publishableKey: 'pk_...',
+ *     contextType: 'screening',
+ *     contextId: 'application-uuid'
+ * });
+ *
+ * @example
+ * // Using a JWT token (GLYDEBuddy Teams app)
+ * const voice = new GlydeVoice({
+ *     authToken: userSession.accessToken,
+ *     contextType: 'recruiter'
+ * });
+ */
+/**
+ * Voice context types supported by the voice agent
+ */
+export declare type VoiceContextType = 'screening' | 'recruiter' | 'custom' | 'phone';
+
+/**
+ * Voice events emitted by the agent
+ */
 export declare interface VoiceEvent {
-    type: 'open' | 'close' | 'error' | 'transcript' | 'agent_audio';
-    payload?: any;
+    type: 'open' | 'close' | 'error' | 'ready' | 'user_speaking' | 'agent_speaking' | 'microphone_ready' | 'transcript' | 'agent_audio';
+    payload?: unknown;
 }
 
 export { }
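
The declarations above describe the consumer-facing flow. A minimal usage sketch, not part of the diff, based only on these types: the key, context id, and container selector are placeholder values, and the top-level await assumes an ES module context.

import { GlydeVoice } from '@glydeunity/voice-sdk';

const voice = new GlydeVoice({
    publishableKey: 'pk_...',      // any one of publishableKey, apiKey, or authToken
    contextType: 'screening',
    contextId: 'application-uuid', // required for the screening context
    container: '#voice-widget',    // optional widget mount point
    onEvent: (event) => {
        if (event.type === 'agent_speaking') console.log('agent speaking:', event.payload);
    },
    onTranscript: (text, role) => console.log(`[${role}] ${text}`),
});

await voice.start();  // fetch config, authenticate, open the agent WebSocket
voice.setMuted(true); // microphone frames are dropped while muted
voice.stop();         // tear down worklets, media stream, and socket

Judging from the implementation below, 'open' fires when the WebSocket connects, 'ready' on the agent's Welcome message, and 'microphone_ready' once capture starts after SettingsApplied; transcripts are delivered both through onTranscript and as 'transcript' events.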
@@ -1,4 +1,349 @@
-import { G as r } from "./index-BbD4w_Sz.js";
+class l {
+  config;
+  unityUrl;
+  active = !1;
+  serverConfig = null;
+  // WebSocket and Audio
+  ws = null;
+  audioContext = null;
+  mediaStream = null;
+  captureWorkletNode = null;
+  playbackWorkletNode = null;
+  isMuted = !1;
+  // Audio settings
+  outputSampleRate = 24e3;
+  // Deepgram TTS output rate
+  inputSampleRate = 48e3;
+  // Microphone input rate
+  // Agent state
+  isAgentSpeaking = !1;
+  agentAudioDoneReceived = !1;
+  /**
+   * Create a new GlydeVoice instance
+   * @param config - Configuration options
+   */
+  constructor(e) {
+    this.config = e, this.unityUrl = e.unityBaseUrl || "https://api.glydeunity.com", !e.publishableKey && !e.apiKey && !e.authToken && console.warn("[GlydeVoice] No authentication method provided. One of publishableKey, apiKey, or authToken is required.");
+  }
+  /**
+   * Get authentication headers based on the configured auth method
+   * Supports publishableKey, apiKey, and JWT token (authToken)
+   * @returns Headers object with appropriate authentication
+   */
+  getAuthHeaders() {
+    const e = {
+      "Content-Type": "application/json"
+    };
+    return this.config.publishableKey && (e["x-publishable-key"] = this.config.publishableKey), this.config.apiKey && (e["x-api-key"] = this.config.apiKey), this.config.authToken && (e.Authorization = `Bearer ${this.config.authToken}`), e;
+  }
+  /**
+   * Fetch voice configuration from the Unity API
+   * @returns Voice configuration including system prompt, tools, and Deepgram settings
+   */
+  async fetchConfig() {
+    const e = `${this.unityUrl}/api/unity/voice/config/${this.config.contextType}`, t = this.config.contextId ? `${e}/${this.config.contextId}` : e, o = await fetch(t, {
+      method: "GET",
+      headers: this.getAuthHeaders()
+    });
+    if (!o.ok) {
+      const a = await o.json();
+      throw new Error(a.error?.message || a.message || "Failed to fetch voice config");
+    }
+    const { data: s } = await o.json();
+    return s;
+  }
+  /**
+   * Initialize and start the voice session
+   */
+  async start() {
+    if (!this.active) {
+      this.active = !0;
+      try {
+        this.config.systemPrompt || (this.serverConfig = await this.fetchConfig(), console.log("[GlydeVoice] Fetched config:", this.serverConfig));
+        const e = await fetch(`${this.unityUrl}/api/unity/voice/auth`, {
+          method: "POST",
+          headers: this.getAuthHeaders(),
+          body: JSON.stringify({
+            context_id: this.config.contextId,
+            domain: typeof window < "u" ? window.location.hostname : "localhost"
+          })
+        });
+        if (!e.ok) {
+          const i = await e.json();
+          throw new Error(i.error?.message || i.message || "Failed to authenticate voice session");
+        }
+        const { data: t } = await e.json(), { token: o, agent_config: s } = t, a = this.config.systemPrompt || this.serverConfig?.system_prompt || s.instructions || "You are a helpful AI assistant.";
+        await this.initializeAudio();
+        const n = "wss://agent.deepgram.com/v1/agent/converse";
+        this.ws = new WebSocket(n, ["bearer", o]), this.ws.onopen = () => {
+          const i = this.config.deepgramConfig || this.serverConfig?.deepgram_config || {
+            think: { provider: { type: "open_ai", model: "gpt-4o-mini" } },
+            speak: { provider: { type: "deepgram", model: "aura-2-thalia-en" } },
+            listen: { provider: { type: "deepgram", model: "nova-2", version: "latest" } }
+          }, r = {
+            type: "Settings",
+            audio: {
+              input: {
+                encoding: "linear16",
+                sample_rate: this.inputSampleRate
+              },
+              output: {
+                encoding: "linear16",
+                sample_rate: this.outputSampleRate,
+                container: "none"
+              }
+            },
+            agent: {
+              language: "en",
+              speak: i.speak || {
+                provider: { type: "deepgram", model: "aura-2-thalia-en" }
+              },
+              listen: i.listen || {
+                provider: { type: "deepgram", version: "v2", model: "flux-general-en" }
+              },
+              think: {
+                provider: i.think?.provider || { type: "open_ai", model: "gpt-4o-mini" },
+                functions: i.think?.functions || [
+                  {
+                    name: "end_conversation",
+                    description: "End the conversation when stop phrases are detected.",
+                    parameters: {
+                      type: "object",
+                      properties: {
+                        item: { type: "string", description: "The phrase that triggered end of conversation" }
+                      },
+                      required: ["item"]
+                    }
+                  }
+                ]
+              },
+              greeting: "Hi! I'm ready to speak with you. How can I help you today?"
+            }
+          };
+          this.ws.send(JSON.stringify(r)), this.emit({ type: "open", payload: { config: s, serverConfig: this.serverConfig } });
+        };
+        const c = a;
+        this.ws.onmessage = (i) => {
+          if (typeof i.data == "string") {
+            try {
+              if (JSON.parse(i.data).type === "SettingsApplied") {
+                const d = {
+                  type: "UpdatePrompt",
+                  prompt: c
+                };
+                this.ws.send(JSON.stringify(d)), this.startMicrophone();
+              }
+            } catch {
+            }
+            this.handleTextMessage(i.data);
+          } else i.data instanceof Blob ? this.handleAudioData(i.data) : i.data instanceof ArrayBuffer && this.handleAudioBuffer(i.data);
+        }, this.ws.onerror = (i) => {
+          console.error("[GlydeVoice] WebSocket error:", i), this.emit({ type: "error", payload: i });
+        }, this.ws.onclose = () => {
+          this.cleanup(), this.emit({ type: "close" });
+        }, this.renderUI();
+      } catch (e) {
+        throw console.error("[GlydeVoice] Error starting session:", e), this.active = !1, this.emit({ type: "error", payload: e }), e;
+      }
+    }
+  }
+  /**
+   * Initialize the audio system with both capture and playback worklets
+   */
+  async initializeAudio() {
+    this.audioContext = new AudioContext({ sampleRate: this.inputSampleRate }), await Promise.all([
+      this.audioContext.audioWorklet.addModule("/audio-processor.js"),
+      this.audioContext.audioWorklet.addModule("/audio-playback-processor.js")
+    ]), this.playbackWorkletNode = new AudioWorkletNode(this.audioContext, "audio-playback-processor"), this.playbackWorkletNode.connect(this.audioContext.destination), this.playbackWorkletNode.port.onmessage = (e) => {
+      const { type: t } = e.data;
+      (t === "cleared" || t === "bufferEmpty") && (this.isAgentSpeaking = !1, this.agentAudioDoneReceived = !1, this.emit({ type: "agent_speaking", payload: !1 }));
+    };
+  }
+  /**
+   * Handle text messages from the Voice Agent
+   */
+  handleTextMessage(e) {
+    try {
+      const t = JSON.parse(e);
+      switch (t.type) {
+        case "Welcome":
+          this.emit({ type: "ready" });
+          break;
+        case "SettingsApplied":
+          break;
+        case "UserStartedSpeaking":
+          this.emit({ type: "user_speaking", payload: !0 }), this.clearPlaybackBuffer(), this.isAgentSpeaking = !1, this.agentAudioDoneReceived = !1;
+          break;
+        case "UserStoppedSpeaking":
+          this.emit({ type: "user_speaking", payload: !1 });
+          break;
+        case "ConversationText":
+          if (t.content && t.content.trim()) {
+            const o = t.role === "assistant" ? "agent" : "user";
+            this.config.onTranscript && this.config.onTranscript(t.content, o), this.emit({ type: "transcript", payload: { text: t.content, role: o } }), this.saveTranscript(t.content, t.role);
+          }
+          break;
+        case "AgentStartedSpeaking":
+          this.isAgentSpeaking = !0, this.agentAudioDoneReceived = !1, this.emit({ type: "agent_speaking", payload: !0 });
+          break;
+        case "AgentAudioDone":
+          this.agentAudioDoneReceived = !0;
+          break;
+        case "Error":
+          console.error("[GlydeVoice] Agent error:", t), this.emit({ type: "error", payload: t });
+          break;
+      }
+    } catch (t) {
+      console.error("[GlydeVoice] Failed to parse message:", t);
+    }
+  }
+  /**
+   * Handle binary audio data (Blob) from agent TTS
+   */
+  async handleAudioData(e) {
+    const t = await e.arrayBuffer();
+    this.handleAudioBuffer(t);
+  }
+  /**
+   * Handle binary audio buffer from agent TTS
+   * Deepgram sends linear16 PCM at 24kHz; we resample to 48kHz for playback
+   */
+  handleAudioBuffer(e) {
+    if (!this.playbackWorkletNode || !this.audioContext) return;
+    this.audioContext.state === "suspended" && this.audioContext.resume();
+    const t = e.byteLength;
+    if (t === 0) return;
+    const o = t - t % 2;
+    if (o === 0) return;
+    const s = o === t ? e : e.slice(0, o), a = new Int16Array(s), n = new Float32Array(a.length);
+    for (let r = 0; r < a.length; r++)
+      n[r] = a[r] / 32768;
+    const c = this.resample24kTo48k(n);
+    !this.isAgentSpeaking && !this.agentAudioDoneReceived && (this.isAgentSpeaking = !0, this.emit({ type: "agent_speaking", payload: !0 }));
+    const i = new Float32Array(c);
+    this.playbackWorkletNode.port.postMessage({
+      type: "audio",
+      data: i
+    }, [i.buffer]);
+  }
+  /**
+   * Resample audio from 24kHz to 48kHz using linear interpolation
+   */
+  resample24kTo48k(e) {
+    const t = e.length * 2, o = new Float32Array(t);
+    for (let a = 0; a < e.length - 1; a++) {
+      const n = e[a], c = e[a + 1];
+      o[a * 2] = n, o[a * 2 + 1] = (n + c) / 2;
+    }
+    const s = e.length - 1;
+    return o[s * 2] = e[s], o[s * 2 + 1] = e[s], o;
+  }
+  /**
+   * Clear the playback buffer (for interruption handling)
+   */
+  clearPlaybackBuffer() {
+    this.playbackWorkletNode && this.playbackWorkletNode.port.postMessage({ type: "clear" });
+  }
+  /**
+   * Start capturing microphone audio using AudioWorklet
+   */
+  async startMicrophone() {
+    if (!this.audioContext)
+      throw new Error("Audio context not initialized");
+    try {
+      this.mediaStream = await navigator.mediaDevices.getUserMedia({
+        audio: {
+          channelCount: 1,
+          sampleRate: this.inputSampleRate,
+          echoCancellation: !0,
+          noiseSuppression: !0
+        }
+      });
+      const e = this.audioContext.createMediaStreamSource(this.mediaStream);
+      this.captureWorkletNode = new AudioWorkletNode(this.audioContext, "audio-capture-processor"), this.captureWorkletNode.port.onmessage = (t) => {
+        !this.active || !this.ws || this.ws.readyState !== WebSocket.OPEN || this.isMuted || this.ws.send(t.data);
+      }, e.connect(this.captureWorkletNode), this.emit({ type: "microphone_ready" });
+    } catch (e) {
+      throw console.error("[GlydeVoice] Microphone error:", e), e;
+    }
+  }
+  /**
+   * Save transcript to the Unity backend
+   */
+  async saveTranscript(e, t) {
+    if (!(!this.config.contextId || !e))
+      try {
+        await fetch(`${this.unityUrl}/api/unity/voice/transcript`, {
+          method: "POST",
+          headers: this.getAuthHeaders(),
+          body: JSON.stringify({
+            context_id: this.config.contextId,
+            content: e,
+            role: t === "assistant" ? "assistant" : "user"
+          })
+        });
+      } catch {
+      }
+  }
+  /**
+   * Toggle mute state
+   * @param muted - Whether to mute the microphone
+   */
+  setMuted(e) {
+    this.isMuted = e;
+  }
+  /**
+   * Get current mute state
+   */
+  getMuted() {
+    return this.isMuted;
+  }
+  /**
+   * Check if the voice agent is currently active
+   */
+  isActive() {
+    return this.active;
+  }
+  /**
+   * Get the current server configuration
+   */
+  getServerConfig() {
+    return this.serverConfig;
+  }
+  /**
+   * Stop the voice session
+   */
+  stop() {
+    this.active = !1, this.cleanup();
+  }
+  /**
+   * Cleanup resources
+   */
+  cleanup() {
+    this.captureWorkletNode && (this.captureWorkletNode.disconnect(), this.captureWorkletNode.port.close(), this.captureWorkletNode = null), this.playbackWorkletNode && (this.playbackWorkletNode.disconnect(), this.playbackWorkletNode.port.close(), this.playbackWorkletNode = null), this.mediaStream && (this.mediaStream.getTracks().forEach((e) => e.stop()), this.mediaStream = null), this.audioContext && (this.audioContext.close(), this.audioContext = null), this.ws && (this.ws.readyState === WebSocket.OPEN && this.ws.close(), this.ws = null);
+  }
+  /**
+   * Emit an event to the callback
+   */
+  emit(e) {
+    this.config.onEvent && this.config.onEvent(e);
+  }
+  /**
+   * Render a simple UI widget (optional)
+   */
+  renderUI() {
+    if (!this.config.container) return;
+    const e = typeof this.config.container == "string" ? document.querySelector(this.config.container) : this.config.container;
+    e && (e.innerHTML = `
+      <div style="padding: 20px; border: 1px solid #ccc; border-radius: 8px; background: #fff;">
+        <h3>Glyde Voice Agent</h3>
+        <p>Status: Active</p>
+        <p>Context: ${this.config.contextType}</p>
+        <button onclick="this.closest('div').remove()">Close</button>
+      </div>
+    `);
+  }
+}
 export {
-  r as GlydeVoice
+  l as GlydeVoice
 };
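
The decode-and-upsample path in handleAudioBuffer and resample24kTo48k is easier to follow un-minified. A readable TypeScript sketch of the same logic, with function names of our own choosing; behavior mirrors the minified code above:

// linear16 bytes at 24kHz in -> Float32 samples in [-1, 1) out.
function pcm16ToFloat32(buffer: ArrayBuffer): Float32Array {
    const even = buffer.byteLength - (buffer.byteLength % 2); // Int16Array needs an even byte count
    const pcm = new Int16Array(buffer.slice(0, even));
    const out = new Float32Array(pcm.length);
    for (let i = 0; i < pcm.length; i++) out[i] = pcm[i] / 32768;
    return out;
}

// Double the sample rate by linear interpolation: each output pair is the
// original sample followed by the midpoint with its right neighbor.
function resample24kTo48k(input: Float32Array): Float32Array {
    const out = new Float32Array(input.length * 2);
    for (let i = 0; i < input.length - 1; i++) {
        out[i * 2] = input[i];
        out[i * 2 + 1] = (input[i] + input[i + 1]) / 2;
    }
    const last = input.length - 1;
    out[last * 2] = input[last];     // no right neighbor: duplicate the final sample
    out[last * 2 + 1] = input[last];
    return out;
}

Because the ratio is exactly 2x, the interpolation never needs fractional indexing; the upsampled frame is then transferred to the playback worklet's ring buffer, which is what enables the instant interruption described in the class docs.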