modular-voice-agent-sdk 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125)
  1. package/README.md +102 -0
  2. package/USAGE.md +567 -0
  3. package/dist/backends/cloud/index.d.ts +7 -0
  4. package/dist/backends/cloud/index.d.ts.map +1 -0
  5. package/dist/backends/cloud/index.js +6 -0
  6. package/dist/backends/cloud/index.js.map +1 -0
  7. package/dist/backends/cloud/llm.d.ts +22 -0
  8. package/dist/backends/cloud/llm.d.ts.map +1 -0
  9. package/dist/backends/cloud/llm.js +234 -0
  10. package/dist/backends/cloud/llm.js.map +1 -0
  11. package/dist/backends/index.d.ts +2 -0
  12. package/dist/backends/index.d.ts.map +1 -0
  13. package/dist/backends/index.js +6 -0
  14. package/dist/backends/index.js.map +1 -0
  15. package/dist/backends/native/index.d.ts +5 -0
  16. package/dist/backends/native/index.d.ts.map +1 -0
  17. package/dist/backends/native/index.js +6 -0
  18. package/dist/backends/native/index.js.map +1 -0
  19. package/dist/backends/native/llm.d.ts +71 -0
  20. package/dist/backends/native/llm.d.ts.map +1 -0
  21. package/dist/backends/native/llm.js +435 -0
  22. package/dist/backends/native/llm.js.map +1 -0
  23. package/dist/backends/native/stt.d.ts +15 -0
  24. package/dist/backends/native/stt.d.ts.map +1 -0
  25. package/dist/backends/native/stt.js +94 -0
  26. package/dist/backends/native/stt.js.map +1 -0
  27. package/dist/backends/native/tts.d.ts +21 -0
  28. package/dist/backends/native/tts.d.ts.map +1 -0
  29. package/dist/backends/native/tts.js +105 -0
  30. package/dist/backends/native/tts.js.map +1 -0
  31. package/dist/backends/transformers/index.d.ts +4 -0
  32. package/dist/backends/transformers/index.d.ts.map +1 -0
  33. package/dist/backends/transformers/index.js +4 -0
  34. package/dist/backends/transformers/index.js.map +1 -0
  35. package/dist/backends/transformers/llm.d.ts +29 -0
  36. package/dist/backends/transformers/llm.d.ts.map +1 -0
  37. package/dist/backends/transformers/llm.js +117 -0
  38. package/dist/backends/transformers/llm.js.map +1 -0
  39. package/dist/backends/transformers/stt.d.ts +17 -0
  40. package/dist/backends/transformers/stt.d.ts.map +1 -0
  41. package/dist/backends/transformers/stt.js +43 -0
  42. package/dist/backends/transformers/stt.js.map +1 -0
  43. package/dist/backends/transformers/tts.d.ts +17 -0
  44. package/dist/backends/transformers/tts.d.ts.map +1 -0
  45. package/dist/backends/transformers/tts.js +40 -0
  46. package/dist/backends/transformers/tts.js.map +1 -0
  47. package/dist/cache.d.ts +37 -0
  48. package/dist/cache.d.ts.map +1 -0
  49. package/dist/cache.js +49 -0
  50. package/dist/cache.js.map +1 -0
  51. package/dist/cli.d.ts +11 -0
  52. package/dist/cli.d.ts.map +1 -0
  53. package/dist/cli.js +392 -0
  54. package/dist/cli.js.map +1 -0
  55. package/dist/client/audio-player.d.ts +45 -0
  56. package/dist/client/audio-player.d.ts.map +1 -0
  57. package/dist/client/audio-player.js +90 -0
  58. package/dist/client/audio-player.js.map +1 -0
  59. package/dist/client/audio-recorder.d.ts +42 -0
  60. package/dist/client/audio-recorder.d.ts.map +1 -0
  61. package/dist/client/audio-recorder.js +128 -0
  62. package/dist/client/audio-recorder.js.map +1 -0
  63. package/dist/client/index.d.ts +34 -0
  64. package/dist/client/index.d.ts.map +1 -0
  65. package/dist/client/index.js +33 -0
  66. package/dist/client/index.js.map +1 -0
  67. package/dist/client/protocol.d.ts +80 -0
  68. package/dist/client/protocol.d.ts.map +1 -0
  69. package/dist/client/protocol.js +29 -0
  70. package/dist/client/protocol.js.map +1 -0
  71. package/dist/client/voice-client.d.ts +249 -0
  72. package/dist/client/voice-client.d.ts.map +1 -0
  73. package/dist/client/voice-client.js +826 -0
  74. package/dist/client/voice-client.js.map +1 -0
  75. package/dist/client/web-speech-stt.d.ts +65 -0
  76. package/dist/client/web-speech-stt.d.ts.map +1 -0
  77. package/dist/client/web-speech-stt.js +122 -0
  78. package/dist/client/web-speech-stt.js.map +1 -0
  79. package/dist/client/web-speech-tts.d.ts +59 -0
  80. package/dist/client/web-speech-tts.d.ts.map +1 -0
  81. package/dist/client/web-speech-tts.js +145 -0
  82. package/dist/client/web-speech-tts.js.map +1 -0
  83. package/dist/index.d.ts +10 -0
  84. package/dist/index.d.ts.map +1 -0
  85. package/dist/index.js +13 -0
  86. package/dist/index.js.map +1 -0
  87. package/dist/server/encoding.d.ts +18 -0
  88. package/dist/server/encoding.d.ts.map +1 -0
  89. package/dist/server/encoding.js +41 -0
  90. package/dist/server/encoding.js.map +1 -0
  91. package/dist/server/handler.d.ts +86 -0
  92. package/dist/server/handler.d.ts.map +1 -0
  93. package/dist/server/handler.js +224 -0
  94. package/dist/server/handler.js.map +1 -0
  95. package/dist/server/index.d.ts +31 -0
  96. package/dist/server/index.d.ts.map +1 -0
  97. package/dist/server/index.js +32 -0
  98. package/dist/server/index.js.map +1 -0
  99. package/dist/services/function-service.d.ts +17 -0
  100. package/dist/services/function-service.d.ts.map +1 -0
  101. package/dist/services/function-service.js +82 -0
  102. package/dist/services/function-service.js.map +1 -0
  103. package/dist/services/index.d.ts +4 -0
  104. package/dist/services/index.d.ts.map +1 -0
  105. package/dist/services/index.js +3 -0
  106. package/dist/services/index.js.map +1 -0
  107. package/dist/services/llm-logger.d.ts +136 -0
  108. package/dist/services/llm-logger.d.ts.map +1 -0
  109. package/dist/services/llm-logger.js +275 -0
  110. package/dist/services/llm-logger.js.map +1 -0
  111. package/dist/services/text-normalizer.d.ts +17 -0
  112. package/dist/services/text-normalizer.d.ts.map +1 -0
  113. package/dist/services/text-normalizer.js +100 -0
  114. package/dist/services/text-normalizer.js.map +1 -0
  115. package/dist/types.d.ts +195 -0
  116. package/dist/types.d.ts.map +1 -0
  117. package/dist/types.js +48 -0
  118. package/dist/types.js.map +1 -0
  119. package/dist/voice-pipeline.d.ts +125 -0
  120. package/dist/voice-pipeline.d.ts.map +1 -0
  121. package/dist/voice-pipeline.js +390 -0
  122. package/dist/voice-pipeline.js.map +1 -0
  123. package/package.json +96 -0
  124. package/scripts/setup-binaries.sh +159 -0
  125. package/scripts/setup.sh +201 -0
@@ -0,0 +1,826 @@
+ /**
+  * Voice Client
+  *
+  * Unified browser SDK for voice assistants.
+  * Handles three modes:
+  * 1. Fully local - all components run in browser, no server needed
+  * 2. Fully remote - all processing on server via WebSocket
+  * 3. Hybrid - mix of local and server components
+  *
+  * Component logic:
+  * - Component provided → runs locally
+  * - Component is null + serverUrl → server handles it
+  * - All components local → no WebSocket needed
+  */
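A minimal sketch of that component logic (this assumes the package's root re-exports these classes, as the error text later in this file suggests; the server URL is a placeholder):

    import { VoiceClient, WebSpeechSTT, WebSpeechTTS } from 'modular-voice-agent-sdk';

    // stt/tts provided → run in the browser; llm: null → the server handles it.
    // Because one component is null, a serverUrl is required (see the constructor).
    const client = new VoiceClient({
        stt: new WebSpeechSTT(),
        llm: null,
        tts: new WebSpeechTTS(),
        serverUrl: 'ws://localhost:3000',
    });
    console.log(client.getMode()); // 'hybrid'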
+ import { VoicePipeline } from '../voice-pipeline';
+ import { AudioRecorder } from './audio-recorder';
+ import { AudioPlayer } from './audio-player';
+ import { WebSpeechSTT } from './web-speech-stt';
+ import { WebSpeechTTS } from './web-speech-tts';
+ import { float32ToBase64, base64ToFloat32, } from './protocol';
+ // ============ Helpers ============
+ function isWebSpeechSTT(obj) {
+     return obj instanceof WebSpeechSTT;
+ }
+ function isWebSpeechTTS(obj) {
+     return obj instanceof WebSpeechTTS;
+ }
+ // ============ Voice Client ============
+ export class VoiceClient {
+     // ============ Static Methods ============
+     /**
+      * Check browser support for voice features.
+      * Call this before creating a VoiceClient to determine what's available.
+      *
+      * @example
+      * const support = VoiceClient.getBrowserSupport();
+      * if (!support.webSpeechSTT) {
+      *     showMessage("Voice input requires Chrome, Edge, or Safari");
+      * }
+      */
+     static getBrowserSupport() {
+         const hasWindow = typeof window !== 'undefined';
+         return {
+             webSpeechSTT: hasWindow && !!(window.SpeechRecognition ||
+                 window.webkitSpeechRecognition),
+             webSpeechTTS: hasWindow && 'speechSynthesis' in window,
+             webGPU: hasWindow && 'gpu' in navigator,
+             mediaDevices: hasWindow && !!(navigator.mediaDevices?.getUserMedia),
+             webSocket: hasWindow && 'WebSocket' in window,
+             audioContext: hasWindow && !!(window.AudioContext ||
+                 window.webkitAudioContext),
+         };
+     }
+     /**
+      * Get a human-readable description of what's not supported.
+      * Returns an empty array if everything needed for basic operation is supported.
+      */
+     static getUnsupportedFeatures() {
+         const support = VoiceClient.getBrowserSupport();
+         const issues = [];
+         if (!support.webSpeechSTT) {
+             issues.push('Speech recognition (WebSpeech STT) - use Chrome, Edge, or Safari, or use TransformersSTT for local transcription');
+         }
+         if (!support.mediaDevices) {
+             issues.push('Microphone access (MediaDevices API)');
+         }
+         if (!support.audioContext) {
+             issues.push('Audio processing (AudioContext)');
+         }
+         if (!support.webSocket) {
+             issues.push('WebSocket connections');
+         }
+         return issues;
+     }
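A usage sketch (showMessage stands in for application-specific UI):

    const issues = VoiceClient.getUnsupportedFeatures();
    if (issues.length > 0) {
        showMessage('Missing browser features:\n- ' + issues.join('\n- '));
    }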
+     // ============ Instance Properties ============
+     config;
+     // Mode detection
+     mode;
+     needsServer;
+     // Local components
+     localSTT = null;
+     localLLM = null;
+     localTTS = null;
+     localPipeline = null;
+     // Remote/hybrid components
+     ws = null;
+     recorder = null;
+     player = null;
+     // State
+     status = 'disconnected';
+     listeners = new Map();
+     currentResponse = '';
+     reconnectTimer = null;
+     pendingTTSText = '';
+     ttsQueue = [];
+     isSpeaking = false;
+     // Conversation state (for local mode)
+     conversationId = this.generateConversationId();
+     history = [];
+     // Recording state for local pipeline
+     audioContext = null;
+     mediaRecorder = null;
+     audioChunks = [];
+     mediaRecording = false;
+     generateConversationId() {
+         return `conv-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`;
+     }
+     constructor(config) {
+         // Check browser support first
+         this.validateBrowserSupport(config);
+         // Determine what's local vs remote
+         const hasLocalSTT = config.stt !== undefined && config.stt !== null;
+         const hasLocalLLM = config.llm !== undefined && config.llm !== null;
+         const hasLocalTTS = config.tts !== undefined && config.tts !== null;
+         this.needsServer = !hasLocalSTT || !hasLocalLLM || !hasLocalTTS;
+         if (!hasLocalSTT && !hasLocalLLM && !hasLocalTTS) {
+             this.mode = 'remote';
+         }
+         else if (hasLocalSTT && hasLocalLLM && hasLocalTTS) {
+             this.mode = 'local';
+         }
+         else {
+             this.mode = 'hybrid';
+         }
+         // Validate config
+         if (this.needsServer && !config.serverUrl) {
+             throw new Error('serverUrl is required when any component (stt, llm, tts) is null. ' +
+                 'Either provide all components for fully-local mode, or specify a serverUrl.');
+         }
+         if (hasLocalLLM && !config.systemPrompt) {
+             throw new Error('systemPrompt is required when using a local LLM');
+         }
+         this.config = {
+             sampleRate: config.sampleRate ?? 16000,
+             autoReconnect: config.autoReconnect ?? true,
+             reconnectDelay: config.reconnectDelay ?? 2000,
+             serverUrl: config.serverUrl,
+             systemPrompt: config.systemPrompt ?? '',
+         };
+         // Store local components
+         if (hasLocalSTT)
+             this.localSTT = config.stt;
+         if (hasLocalLLM)
+             this.localLLM = config.llm;
+         if (hasLocalTTS)
+             this.localTTS = config.tts;
+         // Set up based on mode
+         this.setupComponents();
+     }
+     setupComponents() {
+         // Set up STT (local or recorder for server)
+         if (this.localSTT) {
+             if (isWebSpeechSTT(this.localSTT)) {
+                 this.setupWebSpeechSTT();
+             }
+             // For STTPipeline, we'll use MediaRecorder to capture audio, then process locally
+         }
+         else if (this.needsServer) {
+             // Use AudioRecorder for server-side STT
+             this.recorder = new AudioRecorder({ sampleRate: this.config.sampleRate });
+             this.recorder.onChunk((chunk) => {
+                 this.send({
+                     type: 'audio',
+                     data: float32ToBase64(chunk),
+                     sampleRate: this.config.sampleRate,
+                 });
+             });
+         }
+         // Set up TTS (local or player for server)
+         if (this.localTTS) {
+             // WebSpeechTTS or TTSPipeline - handled in the handleLocalTTS* methods
+         }
+         else if (this.needsServer) {
+             // Use AudioPlayer for server audio
+             this.player = new AudioPlayer({
+                 onStart: () => this.setStatus('speaking'),
+                 onEnd: () => {
+                     if (this.status === 'speaking') {
+                         this.setStatus('ready');
+                     }
+                 },
+             });
+         }
+         // Create local pipeline if fully local
+         if (this.mode === 'local') {
+             // For local mode with WebSpeech components, we need to handle them separately
+             const sttForPipeline = isWebSpeechSTT(this.localSTT) ? null : this.localSTT;
+             const ttsForPipeline = isWebSpeechTTS(this.localTTS) ? null : this.localTTS;
+             this.localPipeline = new VoicePipeline({
+                 stt: sttForPipeline,
+                 llm: this.localLLM,
+                 tts: ttsForPipeline,
+                 systemPrompt: this.config.systemPrompt,
+             });
+         }
+     }
+     setupWebSpeechSTT() {
+         const webSpeechSTT = this.localSTT;
+         webSpeechSTT.onResult((result) => {
+             if (result.isFinal && result.transcript.trim()) {
+                 const text = result.transcript.trim();
+                 this.emit('transcript', text);
+                 if (this.mode === 'local' || (this.mode === 'hybrid' && this.localLLM)) {
+                     // Process locally
+                     this.processTextLocally(text);
+                 }
+                 else {
+                     // Send to server
+                     this.send({ type: 'text', text });
+                     this.setStatus('processing');
+                 }
+             }
+         });
+         webSpeechSTT.onEnd(() => {
+             if (this.status === 'listening') {
+                 this.setStatus('ready');
+             }
+         });
+         webSpeechSTT.onError((error) => {
+             this.emit('error', error);
+             this.setStatus('ready');
+         });
+     }
+     // ============ Public API ============
+     /**
+      * Initialize and connect (if using server)
+      */
+     async connect() {
+         // Initialize local components
+         if (this.localSTT || this.localLLM || this.localTTS) {
+             this.setStatus('initializing');
+             await this.initializeLocalComponents();
+         }
+         // Connect to server if needed
+         if (this.needsServer) {
+             await this.connectWebSocket();
+         }
+         else {
+             this.setStatus('ready');
+         }
+     }
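A startup sketch using the events emitted in this file; note that in server modes connect() resolves before the socket's open event fires (connectWebSocket() does not await it), so the 'ready' status emitted from the onopen handler is the reliable signal:

    client.on('progress', (p) => {
        // forwarded from component initialization: { status, file, progress }
        console.log('loading', p.file ?? '', p.status, p.progress ?? '');
    });
    client.on('status', (s) => {
        if (s === 'ready') console.log('ready to record');
    });
    await client.connect();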
+     async initializeLocalComponents() {
+         const progressCallback = (progress) => {
+             this.emit('progress', {
+                 status: progress.status,
+                 file: progress.file,
+                 progress: progress.progress,
+             });
+         };
+         const promises = [];
+         // Initialize STT (if not WebSpeechSTT)
+         if (this.localSTT && !isWebSpeechSTT(this.localSTT)) {
+             promises.push(this.localSTT.initialize(progressCallback));
+         }
+         // Initialize LLM
+         if (this.localLLM) {
+             promises.push(this.localLLM.initialize(progressCallback));
+         }
+         // Initialize TTS (WebSpeechTTS needs initialize too)
+         if (this.localTTS) {
+             if (isWebSpeechTTS(this.localTTS)) {
+                 promises.push(this.localTTS.initialize());
+             }
+             else {
+                 promises.push(this.localTTS.initialize(progressCallback));
+             }
+         }
+         // Initialize local pipeline if exists
+         if (this.localPipeline) {
+             promises.push(this.localPipeline.initialize(progressCallback));
+         }
+         await Promise.all(promises);
+     }
+     async connectWebSocket() {
+         if (!this.config.serverUrl)
+             return;
+         if (this.ws?.readyState === WebSocket.OPEN)
+             return;
+         this.setStatus('connecting');
+         this.ws = new WebSocket(this.config.serverUrl);
+         this.ws.onopen = () => {
+             // Send capabilities
+             this.send({
+                 type: 'capabilities',
+                 hasSTT: this.localSTT !== null,
+                 hasTTS: this.localTTS !== null,
+             });
+             this.setStatus('ready');
+         };
+         this.ws.onclose = () => {
+             this.setStatus('disconnected');
+             if (this.config.autoReconnect && this.needsServer) {
+                 this.scheduleReconnect();
+             }
+         };
+         this.ws.onerror = () => {
+             this.emit('error', new Error('WebSocket error'));
+         };
+         this.ws.onmessage = (event) => {
+             try {
+                 const msg = JSON.parse(event.data);
+                 this.handleServerMessage(msg);
+             }
+             catch {
+                 this.emit('error', new Error('Failed to parse server message'));
+             }
+         };
+     }
+     /**
+      * Disconnect and clean up
+      */
+     disconnect() {
+         this.config.autoReconnect = false;
+         if (this.reconnectTimer) {
+             clearTimeout(this.reconnectTimer);
+             this.reconnectTimer = null;
+         }
+         this.ws?.close();
+         this.ws = null;
+         // Stop any TTS
+         if (isWebSpeechTTS(this.localTTS)) {
+             this.localTTS.stop();
+         }
+         this.setStatus('disconnected');
+     }
+     /**
+      * Start recording/listening
+      */
+     async startRecording() {
+         if (this.status !== 'ready' && this.status !== 'speaking')
+             return;
+         // Stop any current playback
+         this.player?.clear();
+         if (isWebSpeechTTS(this.localTTS)) {
+             this.localTTS.stop();
+         }
+         this.ttsQueue = [];
+         this.isSpeaking = false;
+         if (isWebSpeechSTT(this.localSTT)) {
+             // Use browser speech recognition
+             this.localSTT.start();
+         }
+         else if (this.localSTT) {
+             // Use MediaRecorder for local STT pipeline
+             await this.startMediaRecorder();
+         }
+         else if (this.recorder) {
+             // Use audio recorder for server STT
+             await this.recorder.start();
+         }
+         this.setStatus('listening');
+     }
+     /**
+      * Stop recording/listening and process
+      */
+     async stopRecording() {
+         if (this.status !== 'listening')
+             return;
+         if (isWebSpeechSTT(this.localSTT)) {
+             // Stop browser speech recognition - fires onResult with final transcript
+             this.localSTT.stop();
+         }
+         else if (this.localSTT) {
+             // Stop MediaRecorder and process locally
+             await this.stopMediaRecorder();
+         }
+         else if (this.recorder?.recording) {
+             // Stop audio recorder and send to server
+             await this.recorder.stop();
+             this.setStatus('processing');
+             this.send({ type: 'end_audio' });
+         }
+     }
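A push-to-talk sketch wiring these two methods to a hypothetical button element:

    talkButton.addEventListener('pointerdown', () => client.startRecording());
    talkButton.addEventListener('pointerup', () => client.stopRecording());
    // startRecording is a no-op unless status is 'ready' or 'speaking';
    // stopRecording is a no-op unless status is 'listening'.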
+     async startMediaRecorder() {
+         this.mediaRecording = true;
+         this.audioChunks = [];
+         const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+         this.mediaRecorder = new MediaRecorder(stream);
+         this.mediaRecorder.ondataavailable = (e) => this.audioChunks.push(e.data);
+         this.mediaRecorder.onstop = async () => {
+             stream.getTracks().forEach((t) => t.stop());
+             await this.processLocalAudio();
+         };
+         this.mediaRecorder.start();
+     }
+     async stopMediaRecorder() {
+         if (!this.mediaRecording || !this.mediaRecorder)
+             return;
+         this.mediaRecording = false;
+         this.mediaRecorder.stop();
+     }
+     async processLocalAudio() {
+         this.setStatus('processing');
+         // Convert blob to Float32Array
+         const blob = new Blob(this.audioChunks, { type: 'audio/webm' });
+         const arrayBuffer = await blob.arrayBuffer();
+         this.audioContext = this.audioContext || new AudioContext({ sampleRate: 16000 });
+         const audioBuffer = await this.audioContext.decodeAudioData(arrayBuffer);
+         const audio = audioBuffer.getChannelData(0);
+         // Transcribe locally
+         if (!this.localSTT || isWebSpeechSTT(this.localSTT)) {
+             this.emit('error', new Error('Local STT pipeline not available'));
+             this.setStatus('ready');
+             return;
+         }
+         try {
+             const transcript = await this.localSTT.transcribe(audio);
+             if (!transcript.trim()) {
+                 this.setStatus('ready');
+                 return;
+             }
+             this.emit('transcript', transcript);
+             if (this.mode === 'local' || (this.mode === 'hybrid' && this.localLLM)) {
+                 // Process locally
+                 await this.processTextLocally(transcript);
+             }
+             else {
+                 // Send to server
+                 this.send({ type: 'text', text: transcript });
+             }
+         }
+         catch (error) {
+             this.emit('error', error instanceof Error ? error : new Error(String(error)));
+             this.setStatus('ready');
+         }
+     }
+     async processTextLocally(text) {
+         this.currentResponse = '';
+         if (this.localPipeline) {
+             // Fully local with pipeline
+             await this.runLocalPipeline(text);
+         }
+         else if (this.localLLM) {
+             // Hybrid: local LLM, possibly local TTS
+             await this.runLocalLLM(text);
+         }
+     }
+     async runLocalPipeline(text) {
+         if (!this.localPipeline)
+             return;
+         // Initialize history if empty
+         if (this.history.length === 0) {
+             this.history = this.localPipeline.createInitialHistory();
+         }
+         const context = {
+             conversationId: this.conversationId,
+             history: this.history,
+         };
+         const callbacks = {
+             onTranscript: (t) => this.emit('transcript', t),
+             onResponseChunk: (chunk) => {
+                 this.currentResponse += chunk;
+                 this.emit('responseChunk', chunk);
+                 // If using WebSpeechTTS separately, queue text
+                 if (isWebSpeechTTS(this.localTTS)) {
+                     this.handleLocalTTSChunk(chunk);
+                 }
+             },
+             onAudio: async (playable) => {
+                 // If not using WebSpeechTTS, play the audio from pipeline
+                 if (!isWebSpeechTTS(this.localTTS)) {
+                     this.setStatus('speaking');
+                     await playable.play();
+                 }
+             },
+             onComplete: () => {
+                 this.emit('responseComplete', this.currentResponse);
+                 if (isWebSpeechTTS(this.localTTS)) {
+                     this.flushLocalTTS();
+                 }
+                 else {
+                     this.setStatus('ready');
+                 }
+             },
+             onError: (err) => {
+                 this.emit('error', err);
+                 this.setStatus('ready');
+             },
+         };
+         // processText mutates context.history with new messages
+         await this.localPipeline.processText(text, context, callbacks);
+     }
+     async runLocalLLM(text) {
+         if (!this.localLLM)
+             return;
+         // Initialize history if empty
+         if (this.history.length === 0) {
+             this.history = [{ role: 'system', content: this.config.systemPrompt }];
+         }
+         // Add user message to history
+         this.history.push({ role: 'user', content: text });
+         try {
+             const result = await this.localLLM.generate(this.history, {
+                 conversationId: this.conversationId,
+                 onToken: (token) => {
+                     this.currentResponse += token;
+                     this.emit('responseChunk', token);
+                     if (isWebSpeechTTS(this.localTTS)) {
+                         this.handleLocalTTSChunk(token);
+                     }
+                     else if (this.localTTS) {
+                         // For TTSPipeline, we'd need sentence-level TTS
+                         // This is simplified - in practice you'd want sentence buffering
+                     }
+                     else {
+                         // Server TTS - send text
+                         // Server should handle this based on capabilities
+                     }
+                 },
+             });
+             // Add assistant response to history
+             this.history.push({ role: 'assistant', content: result.content });
+             this.emit('responseComplete', result.content);
+             if (isWebSpeechTTS(this.localTTS)) {
+                 await this.flushLocalTTS();
+             }
+             else if (!this.localTTS && this.needsServer) {
+                 // Wait for server TTS
+             }
+             else {
+                 this.setStatus('ready');
+             }
+         }
+         catch (error) {
+             this.emit('error', error instanceof Error ? error : new Error(String(error)));
+             this.setStatus('ready');
+         }
+     }
+     /**
+      * Clear conversation history
+      */
+     clearHistory() {
+         // Reset local history
+         this.conversationId = this.generateConversationId();
+         if (this.localPipeline) {
+             this.history = this.localPipeline.createInitialHistory();
+         }
+         else {
+             this.history = [];
+         }
+         // Tell server to clear too
+         if (this.needsServer) {
+             this.send({ type: 'clear_history' });
+         }
+     }
+     /**
+      * Get current status
+      */
+     getStatus() {
+         return this.status;
+     }
+     /**
+      * Check if ready for interaction
+      */
+     isReady() {
+         if (this.mode === 'local') {
+             return this.localPipeline?.isReady() ?? false;
+         }
+         return this.ws?.readyState === WebSocket.OPEN;
+     }
+     /**
+      * Check if currently recording
+      */
+     isRecording() {
+         if (isWebSpeechSTT(this.localSTT)) {
+             return this.localSTT.listening;
+         }
+         if (this.mediaRecording)
+             return true;
+         return this.recorder?.recording ?? false;
+     }
+     /**
+      * Get current mode
+      */
+     getMode() {
+         return this.mode;
+     }
+     /**
+      * Check which components are local
+      */
+     getLocalComponents() {
+         return {
+             stt: this.localSTT !== null,
+             llm: this.localLLM !== null,
+             tts: this.localTTS !== null,
+         };
+     }
+     /**
+      * Subscribe to events
+      */
+     on(event, callback) {
+         if (!this.listeners.has(event)) {
+             this.listeners.set(event, new Set());
+         }
+         this.listeners.get(event).add(callback);
+     }
+     /**
+      * Unsubscribe from events
+      */
+     off(event, callback) {
+         this.listeners.get(event)?.delete(callback);
+     }
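A subscription sketch covering the events emitted in this file (the ui object is a placeholder for application code):

    client.on('transcript', (text) => ui.showUserTurn(text));
    client.on('responseChunk', (chunk) => ui.appendAssistantText(chunk));
    client.on('responseComplete', (full) => ui.endAssistantTurn(full));
    client.on('toolCall', ({ id, name, arguments: args }) => console.log('tool:', name, args));
    client.on('toolResult', (toolCallId, result) => console.log('result:', toolCallId, result));
    client.on('error', (err) => console.error(err));
    // 'status' and 'progress' are also emitted; see setStatus() and initializeLocalComponents().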
603
+ /**
604
+ * Clean up all resources
605
+ */
606
+ async dispose() {
607
+ this.disconnect();
608
+ await this.recorder?.dispose();
609
+ this.player?.dispose();
610
+ if (isWebSpeechSTT(this.localSTT)) {
611
+ this.localSTT.dispose();
612
+ }
613
+ this.listeners.clear();
614
+ }
615
+ // ============ Private Methods ============
616
+ send(msg) {
617
+ if (this.ws?.readyState === WebSocket.OPEN) {
618
+ this.ws.send(JSON.stringify(msg));
619
+ }
620
+ }
621
+ handleServerMessage(msg) {
622
+ switch (msg.type) {
623
+ case 'transcript':
624
+ // Server did STT - only relevant if we're not using local STT
625
+ if (!this.localSTT) {
626
+ this.currentResponse = '';
627
+ this.emit('transcript', msg.text);
628
+ }
629
+ break;
630
+ case 'response_chunk':
631
+ this.currentResponse += msg.text;
632
+ this.emit('responseChunk', msg.text);
633
+ // If using local TTS, queue text for speech
634
+ if (this.localTTS) {
635
+ this.handleLocalTTSChunk(msg.text);
636
+ }
637
+ break;
638
+ case 'audio':
639
+ // Only process server audio if not using local TTS
640
+ if (!this.localTTS && this.player) {
641
+ const audio = base64ToFloat32(msg.data);
642
+ this.player.enqueue(audio, msg.sampleRate);
643
+ }
644
+ break;
645
+ case 'tool_call':
646
+ this.emit('toolCall', {
647
+ id: msg.toolCallId,
648
+ name: msg.name,
649
+ arguments: msg.arguments,
650
+ });
651
+ break;
652
+ case 'tool_result':
653
+ this.emit('toolResult', msg.toolCallId, msg.result);
654
+ break;
655
+ case 'complete':
656
+ this.emit('responseComplete', this.currentResponse);
657
+ if (this.localTTS) {
658
+ // Flush any remaining TTS text
659
+ this.flushLocalTTS();
660
+ }
661
+ else if (this.player) {
662
+ // Status will change to 'ready' when audio finishes
663
+ if (!this.player.playing && this.player.queueLength === 0) {
664
+ this.setStatus('ready');
665
+ }
666
+ }
667
+ else {
668
+ this.setStatus('ready');
669
+ }
670
+ break;
671
+ case 'error':
672
+ this.emit('error', new Error(msg.message));
673
+ this.setStatus('ready');
674
+ break;
675
+ }
676
+ }
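The server-to-client wire format, reconstructed from the switch above (JSON text frames over the WebSocket; placeholders marked with "..."):

    { "type": "transcript", "text": "..." }
    { "type": "response_chunk", "text": "..." }
    { "type": "audio", "data": "<base64 Float32 PCM>", "sampleRate": 16000 }
    { "type": "tool_call", "toolCallId": "...", "name": "...", "arguments": { } }
    { "type": "tool_result", "toolCallId": "...", "result": { } }
    { "type": "complete" }
    { "type": "error", "message": "..." }

Client-to-server message types sent elsewhere in this file: 'capabilities', 'audio', 'end_audio', 'text', and 'clear_history'.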
+     handleLocalTTSChunk(text) {
+         // Accumulate text and speak sentence by sentence
+         this.pendingTTSText += text;
+         // Check for sentence endings
+         const sentenceEnders = /[.!?]/;
+         const match = this.pendingTTSText.match(sentenceEnders);
+         if (match && match.index !== undefined) {
+             const sentence = this.pendingTTSText.slice(0, match.index + 1).trim();
+             this.pendingTTSText = this.pendingTTSText.slice(match.index + 1);
+             if (sentence) {
+                 this.ttsQueue.push(sentence);
+                 this.processLocalTTSQueue();
+             }
+         }
+     }
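Note that this splitter queues at most one sentence per call: the regex match only finds the first sentence ender, so any later sentences in the same chunk stay buffered until the next chunk arrives or flushLocalTTS() runs. A short trace:

    handleLocalTTSChunk('Hello th');  // pending 'Hello th', nothing queued
    handleLocalTTSChunk('ere. How');  // queues 'Hello there.', pending ' How'
    flushLocalTTS();                  // queues 'How' and drains the queue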
+     flushLocalTTS() {
+         // Speak any remaining text
+         if (this.pendingTTSText.trim()) {
+             this.ttsQueue.push(this.pendingTTSText.trim());
+             this.pendingTTSText = '';
+         }
+         this.processLocalTTSQueue();
+     }
+     async processLocalTTSQueue() {
+         if (this.isSpeaking || this.ttsQueue.length === 0 || !this.localTTS)
+             return;
+         this.isSpeaking = true;
+         this.setStatus('speaking');
+         while (this.ttsQueue.length > 0) {
+             const text = this.ttsQueue.shift();
+             try {
+                 if (isWebSpeechTTS(this.localTTS)) {
+                     await this.localTTS.speak(text);
+                 }
+                 else {
+                     // TTSPipeline - synthesize and play
+                     const playable = await this.localTTS.synthesize(text);
+                     await playable.play();
+                 }
+             }
+             catch {
+                 // Ignore TTS errors (e.g., if speech was cancelled)
+             }
+         }
+         this.isSpeaking = false;
+         if (this.status === 'speaking') {
+             this.setStatus('ready');
+         }
+     }
+     setStatus(newStatus) {
+         if (this.status !== newStatus) {
+             this.status = newStatus;
+             this.emit('status', newStatus);
+         }
+     }
+     emit(event, ...args) {
+         const callbacks = this.listeners.get(event);
+         if (callbacks) {
+             for (const callback of callbacks) {
+                 try {
+                     callback(...args);
+                 }
+                 catch (err) {
+                     console.error(`Error in ${event} listener:`, err);
+                 }
+             }
+         }
+     }
+     scheduleReconnect() {
+         if (this.reconnectTimer)
+             return;
+         this.reconnectTimer = setTimeout(() => {
+             this.reconnectTimer = null;
+             this.connect();
+         }, this.config.reconnectDelay);
+     }
+     validateBrowserSupport(config) {
+         const support = VoiceClient.getBrowserSupport();
+         // Check WebSpeech STT if trying to use it
+         if (config.stt instanceof WebSpeechSTT) {
+             if (!support.webSpeechSTT) {
+                 throw new Error('WebSpeech STT is not supported in this browser.\n\n' +
+                     'Options:\n' +
+                     ' 1. Use Chrome, Edge, or Safari (they support Web Speech API)\n' +
+                     ' 2. Use TransformersSTT for local transcription (works in all browsers with WebGPU)\n' +
+                     ' 3. Use server-side STT by setting stt: null with a serverUrl\n\n' +
+                     'Example with TransformersSTT:\n' +
+                     ' import { TransformersSTT } from "modular-voice-agent-sdk";\n' +
+                     ' const client = new VoiceClient({ stt: new TransformersSTT({ model: "Xenova/whisper-tiny" }), ... })');
+             }
+         }
+         // Check WebSpeech TTS if trying to use it
+         if (config.tts instanceof WebSpeechTTS) {
+             if (!support.webSpeechTTS) {
+                 throw new Error('WebSpeech TTS is not supported in this browser.\n\n' +
+                     'Options:\n' +
+                     ' 1. Use a different browser (most modern browsers support speech synthesis)\n' +
+                     ' 2. Use server-side TTS by setting tts: null with a serverUrl');
+             }
+         }
+         // Check MediaDevices for any STT (local or server)
+         const needsMicrophone = config.stt !== undefined;
+         if (needsMicrophone && !support.mediaDevices) {
+             throw new Error('Microphone access (MediaDevices API) is not available.\n' +
+                 'This may be because:\n' +
+                 ' 1. The page is not served over HTTPS\n' +
+                 ' 2. The browser does not support getUserMedia\n' +
+                 ' 3. Microphone permissions were denied');
+         }
+         // Check WebSocket if using server
+         if (config.serverUrl && !support.webSocket) {
+             throw new Error('WebSocket is not supported in this browser.');
+         }
+         // Check AudioContext for audio processing
+         if (!support.audioContext) {
+             throw new Error('AudioContext is not supported in this browser.\n' +
+                 'Audio processing requires a modern browser with Web Audio API support.');
+         }
+     }
+ }
+ // ============ Factory Function ============
+ /**
+  * Create a VoiceClient instance
+  * @example
+  * // Fully local
+  * const client = createVoiceClient({
+  *     stt: new TransformersSTT({ model: '...' }),
+  *     llm: new TransformersLLM({ model: '...' }),
+  *     tts: new WebSpeechTTS(),
+  *     systemPrompt: 'You are a helpful assistant.',
+  * });
+  *
+  * @example
+  * // Fully remote
+  * const client = createVoiceClient({
+  *     serverUrl: 'ws://localhost:3000',
+  * });
+  *
+  * @example
+  * // Hybrid: local STT/TTS, server LLM
+  * const client = createVoiceClient({
+  *     stt: new WebSpeechSTT(),
+  *     tts: new WebSpeechTTS(),
+  *     serverUrl: 'ws://localhost:3000',
+  * });
+  */
+ export function createVoiceClient(config) {
+     return new VoiceClient(config);
+ }
+ //# sourceMappingURL=voice-client.js.map
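Putting it together, a minimal end-to-end sketch for the fully remote case (the import path assumes the package's root re-export, as used in the validateBrowserSupport error text; the URL is a placeholder):

    import { createVoiceClient } from 'modular-voice-agent-sdk';

    const client = createVoiceClient({ serverUrl: 'ws://localhost:3000' });
    client.on('transcript', (t) => console.log('you said:', t));
    client.on('responseComplete', (r) => console.log('assistant:', r));

    await client.connect();
    // connect() resolves before the socket opens, so wait for the 'ready' status
    await new Promise((resolve) => client.on('status', (s) => s === 'ready' && resolve()));

    await client.startRecording();
    // ...user speaks...
    await client.stopRecording(); // server transcribes, responds, and streams audio back
    await client.dispose();       // closes the socket and releases audio resources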