@micdrop/server 2.1.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -4,7 +4,7 @@ import { z } from 'zod';
4
4
  import WebSocket, { WebSocket as WebSocket$1 } from 'ws';
5
5
 
6
6
  declare class Logger {
7
- private readonly name;
7
+ name: string;
8
8
  constructor(name: string);
9
9
  log(...message: any[]): void;
10
10
  }
@@ -158,6 +158,7 @@ declare function handleError(socket: WebSocket, error: unknown): void;
158
158
 
159
159
  interface STTEvents {
160
160
  Transcript: [string];
161
+ Failed: [Buffer[]];
161
162
  }
162
163
  declare abstract class STT extends EventEmitter<STTEvents> {
163
164
  logger?: Logger;
@@ -171,9 +172,28 @@ declare class MockSTT extends STT {
171
172
  transcribe(): Promise<void>;
172
173
  }
173
174
 
174
- declare abstract class TTS {
175
+ interface FallbackSTTOptions {
176
+ factories: Array<() => STT>;
177
+ }
178
+ declare class FallbackSTT extends STT {
179
+ private readonly options;
180
+ private stt;
181
+ private sttIndex;
182
+ constructor(options: FallbackSTTOptions);
183
+ transcribe(audioStream: Readable): void;
184
+ destroy(): void;
185
+ private startNextSTT;
186
+ private onTranscript;
187
+ private onFailed;
188
+ }
189
+
190
+ interface TTSEvents {
191
+ Audio: [Buffer];
192
+ Failed: [string[]];
193
+ }
194
+ declare abstract class TTS extends EventEmitter<TTSEvents> {
175
195
  logger?: Logger;
176
- abstract speak(textStream: Readable): Readable;
196
+ abstract speak(textStream: Readable): void;
177
197
  abstract cancel(): void;
178
198
  protected log(...message: any[]): void;
179
199
  destroy(): void;
@@ -186,15 +206,35 @@ declare class MockTTS extends TTS {
186
206
  cancel(): void;
187
207
  }
188
208
 
209
+ interface FallbackTTSOptions {
210
+ factories: Array<() => TTS>;
211
+ }
212
+ declare class FallbackTTS extends TTS {
213
+ private readonly options;
214
+ private tts;
215
+ private ttsIndex;
216
+ constructor(options: FallbackTTSOptions);
217
+ speak(textStream: Readable): void;
218
+ cancel(): void;
219
+ destroy(): void;
220
+ private startNextTTS;
221
+ private onAudio;
222
+ private onFailed;
223
+ }
224
+
225
+ interface MicdropServerEvents {
226
+ End: [MicdropCallSummary];
227
+ UserAudio: [Buffer];
228
+ AssistantAudio: [Buffer];
229
+ }
189
230
  interface MicdropConfig {
190
231
  firstMessage?: string;
191
232
  generateFirstMessage?: boolean;
192
233
  agent: Agent;
193
234
  stt: STT;
194
235
  tts: TTS;
195
- onEnd?(call: MicdropCallSummary): void;
196
236
  }
197
- declare class MicdropServer {
237
+ declare class MicdropServer extends EventEmitter<MicdropServerEvents> {
198
238
  socket: WebSocket$1 | null;
199
239
  config: MicdropConfig | null;
200
240
  logger?: Logger;
@@ -211,20 +251,50 @@ declare class MicdropServer {
211
251
  cancel(): void;
212
252
  private onClose;
213
253
  private onMessage;
214
- private onAudioChunk;
254
+ private onUserAudio;
215
255
  private onMute;
216
256
  private onStartSpeaking;
217
257
  private onStopSpeaking;
218
- private onTranscript;
258
+ private onTranscriptSTT;
259
+ private onAudioTTS;
219
260
  private sendFirstMessage;
220
261
  answer(): void;
221
262
  private _answer;
222
263
  speak(message: string | Readable): void;
223
264
  private _speak;
224
- sendAudio(audio: Readable): void;
225
- private _sendAudio;
265
+ }
266
+
267
+ interface AudioMessage {
268
+ buffer: Buffer;
269
+ messageIndex: number;
270
+ message: string;
271
+ role: 'user' | 'assistant';
272
+ }
273
+ interface MicdropRecorderEvents {
274
+ AudioMessage: [AudioMessage];
275
+ Complete: [AudioMessage[]];
276
+ }
277
+ declare class MicdropRecorder extends EventEmitter<MicdropRecorderEvents> {
278
+ private server;
279
+ logger?: Logger;
280
+ private audioMessages;
281
+ private currentUserChunks;
282
+ private currentAssistantChunks;
283
+ private lastUserMessageIndex;
284
+ private lastAssistantMessageIndex;
285
+ constructor(server: MicdropServer);
286
+ private setupListeners;
287
+ private onUserAudio;
288
+ private onAssistantAudio;
289
+ private onMessage;
290
+ private finalizeUserAudio;
291
+ private finalizeAssistantAudio;
292
+ private onEnd;
293
+ getAudioMessages(): AudioMessage[];
294
+ destroy(): void;
295
+ protected log(...message: any[]): void;
226
296
  }
227
297
 
228
298
  declare function waitForParams<CallParams>(socket: WebSocket$1, validate: (params: any) => CallParams): Promise<CallParams>;
229
299
 
230
- export { AUTO_END_CALL_PROMPT, AUTO_END_CALL_TOOL_NAME, AUTO_IGNORE_USER_NOISE_PROMPT, AUTO_IGNORE_USER_NOISE_TOOL_NAME, AUTO_SEMANTIC_TURN_PROMPT, AUTO_SEMANTIC_TURN_TOOL_NAME, Agent, type AgentEvents, type AgentOptions, type DeepPartial, type ExtractJsonOptions, type ExtractOptions, type ExtractTagOptions, Logger, type MicdropAnswerMetadata, type MicdropCallSummary, MicdropClientCommands, type MicdropConfig, type MicdropConversation, type MicdropConversationItem, type MicdropConversationMessage, type MicdropConversationToolCall, type MicdropConversationToolResult, MicdropError, MicdropErrorCode, MicdropServer, MicdropServerCommands, type MicdropToolCall, MockAgent, MockSTT, MockTTS, STT, type STTEvents, TTS, type Tool, handleError, waitForParams };
300
+ export { AUTO_END_CALL_PROMPT, AUTO_END_CALL_TOOL_NAME, AUTO_IGNORE_USER_NOISE_PROMPT, AUTO_IGNORE_USER_NOISE_TOOL_NAME, AUTO_SEMANTIC_TURN_PROMPT, AUTO_SEMANTIC_TURN_TOOL_NAME, Agent, type AgentEvents, type AgentOptions, type AudioMessage, type DeepPartial, type ExtractJsonOptions, type ExtractOptions, type ExtractTagOptions, FallbackSTT, type FallbackSTTOptions, FallbackTTS, type FallbackTTSOptions, Logger, type MicdropAnswerMetadata, type MicdropCallSummary, MicdropClientCommands, type MicdropConfig, type MicdropConversation, type MicdropConversationItem, type MicdropConversationMessage, type MicdropConversationToolCall, type MicdropConversationToolResult, MicdropError, MicdropErrorCode, MicdropRecorder, type MicdropRecorderEvents, MicdropServer, MicdropServerCommands, type MicdropServerEvents, type MicdropToolCall, MockAgent, MockSTT, MockTTS, STT, type STTEvents, TTS, type TTSEvents, type Tool, handleError, waitForParams };
package/dist/index.d.ts CHANGED
@@ -4,7 +4,7 @@ import { z } from 'zod';
4
4
  import WebSocket, { WebSocket as WebSocket$1 } from 'ws';
5
5
 
6
6
  declare class Logger {
7
- private readonly name;
7
+ name: string;
8
8
  constructor(name: string);
9
9
  log(...message: any[]): void;
10
10
  }
@@ -158,6 +158,7 @@ declare function handleError(socket: WebSocket, error: unknown): void;
158
158
 
159
159
  interface STTEvents {
160
160
  Transcript: [string];
161
+ Failed: [Buffer[]];
161
162
  }
162
163
  declare abstract class STT extends EventEmitter<STTEvents> {
163
164
  logger?: Logger;
@@ -171,9 +172,28 @@ declare class MockSTT extends STT {
171
172
  transcribe(): Promise<void>;
172
173
  }
173
174
 
174
- declare abstract class TTS {
175
+ interface FallbackSTTOptions {
176
+ factories: Array<() => STT>;
177
+ }
178
+ declare class FallbackSTT extends STT {
179
+ private readonly options;
180
+ private stt;
181
+ private sttIndex;
182
+ constructor(options: FallbackSTTOptions);
183
+ transcribe(audioStream: Readable): void;
184
+ destroy(): void;
185
+ private startNextSTT;
186
+ private onTranscript;
187
+ private onFailed;
188
+ }
189
+
190
+ interface TTSEvents {
191
+ Audio: [Buffer];
192
+ Failed: [string[]];
193
+ }
194
+ declare abstract class TTS extends EventEmitter<TTSEvents> {
175
195
  logger?: Logger;
176
- abstract speak(textStream: Readable): Readable;
196
+ abstract speak(textStream: Readable): void;
177
197
  abstract cancel(): void;
178
198
  protected log(...message: any[]): void;
179
199
  destroy(): void;
@@ -186,15 +206,35 @@ declare class MockTTS extends TTS {
186
206
  cancel(): void;
187
207
  }
188
208
 
209
+ interface FallbackTTSOptions {
210
+ factories: Array<() => TTS>;
211
+ }
212
+ declare class FallbackTTS extends TTS {
213
+ private readonly options;
214
+ private tts;
215
+ private ttsIndex;
216
+ constructor(options: FallbackTTSOptions);
217
+ speak(textStream: Readable): void;
218
+ cancel(): void;
219
+ destroy(): void;
220
+ private startNextTTS;
221
+ private onAudio;
222
+ private onFailed;
223
+ }
224
+
225
+ interface MicdropServerEvents {
226
+ End: [MicdropCallSummary];
227
+ UserAudio: [Buffer];
228
+ AssistantAudio: [Buffer];
229
+ }
189
230
  interface MicdropConfig {
190
231
  firstMessage?: string;
191
232
  generateFirstMessage?: boolean;
192
233
  agent: Agent;
193
234
  stt: STT;
194
235
  tts: TTS;
195
- onEnd?(call: MicdropCallSummary): void;
196
236
  }
197
- declare class MicdropServer {
237
+ declare class MicdropServer extends EventEmitter<MicdropServerEvents> {
198
238
  socket: WebSocket$1 | null;
199
239
  config: MicdropConfig | null;
200
240
  logger?: Logger;
@@ -211,20 +251,50 @@ declare class MicdropServer {
211
251
  cancel(): void;
212
252
  private onClose;
213
253
  private onMessage;
214
- private onAudioChunk;
254
+ private onUserAudio;
215
255
  private onMute;
216
256
  private onStartSpeaking;
217
257
  private onStopSpeaking;
218
- private onTranscript;
258
+ private onTranscriptSTT;
259
+ private onAudioTTS;
219
260
  private sendFirstMessage;
220
261
  answer(): void;
221
262
  private _answer;
222
263
  speak(message: string | Readable): void;
223
264
  private _speak;
224
- sendAudio(audio: Readable): void;
225
- private _sendAudio;
265
+ }
266
+
267
+ interface AudioMessage {
268
+ buffer: Buffer;
269
+ messageIndex: number;
270
+ message: string;
271
+ role: 'user' | 'assistant';
272
+ }
273
+ interface MicdropRecorderEvents {
274
+ AudioMessage: [AudioMessage];
275
+ Complete: [AudioMessage[]];
276
+ }
277
+ declare class MicdropRecorder extends EventEmitter<MicdropRecorderEvents> {
278
+ private server;
279
+ logger?: Logger;
280
+ private audioMessages;
281
+ private currentUserChunks;
282
+ private currentAssistantChunks;
283
+ private lastUserMessageIndex;
284
+ private lastAssistantMessageIndex;
285
+ constructor(server: MicdropServer);
286
+ private setupListeners;
287
+ private onUserAudio;
288
+ private onAssistantAudio;
289
+ private onMessage;
290
+ private finalizeUserAudio;
291
+ private finalizeAssistantAudio;
292
+ private onEnd;
293
+ getAudioMessages(): AudioMessage[];
294
+ destroy(): void;
295
+ protected log(...message: any[]): void;
226
296
  }
227
297
 
228
298
  declare function waitForParams<CallParams>(socket: WebSocket$1, validate: (params: any) => CallParams): Promise<CallParams>;
229
299
 
230
- export { AUTO_END_CALL_PROMPT, AUTO_END_CALL_TOOL_NAME, AUTO_IGNORE_USER_NOISE_PROMPT, AUTO_IGNORE_USER_NOISE_TOOL_NAME, AUTO_SEMANTIC_TURN_PROMPT, AUTO_SEMANTIC_TURN_TOOL_NAME, Agent, type AgentEvents, type AgentOptions, type DeepPartial, type ExtractJsonOptions, type ExtractOptions, type ExtractTagOptions, Logger, type MicdropAnswerMetadata, type MicdropCallSummary, MicdropClientCommands, type MicdropConfig, type MicdropConversation, type MicdropConversationItem, type MicdropConversationMessage, type MicdropConversationToolCall, type MicdropConversationToolResult, MicdropError, MicdropErrorCode, MicdropServer, MicdropServerCommands, type MicdropToolCall, MockAgent, MockSTT, MockTTS, STT, type STTEvents, TTS, type Tool, handleError, waitForParams };
300
+ export { AUTO_END_CALL_PROMPT, AUTO_END_CALL_TOOL_NAME, AUTO_IGNORE_USER_NOISE_PROMPT, AUTO_IGNORE_USER_NOISE_TOOL_NAME, AUTO_SEMANTIC_TURN_PROMPT, AUTO_SEMANTIC_TURN_TOOL_NAME, Agent, type AgentEvents, type AgentOptions, type AudioMessage, type DeepPartial, type ExtractJsonOptions, type ExtractOptions, type ExtractTagOptions, FallbackSTT, type FallbackSTTOptions, FallbackTTS, type FallbackTTSOptions, Logger, type MicdropAnswerMetadata, type MicdropCallSummary, MicdropClientCommands, type MicdropConfig, type MicdropConversation, type MicdropConversationItem, type MicdropConversationMessage, type MicdropConversationToolCall, type MicdropConversationToolResult, MicdropError, MicdropErrorCode, MicdropRecorder, type MicdropRecorderEvents, MicdropServer, MicdropServerCommands, type MicdropServerEvents, type MicdropToolCall, MockAgent, MockSTT, MockTTS, STT, type STTEvents, TTS, type TTSEvents, type Tool, handleError, waitForParams };
package/dist/index.js CHANGED
@@ -37,10 +37,13 @@ __export(index_exports, {
37
37
  AUTO_SEMANTIC_TURN_PROMPT: () => AUTO_SEMANTIC_TURN_PROMPT,
38
38
  AUTO_SEMANTIC_TURN_TOOL_NAME: () => AUTO_SEMANTIC_TURN_TOOL_NAME,
39
39
  Agent: () => Agent,
40
+ FallbackSTT: () => FallbackSTT,
41
+ FallbackTTS: () => FallbackTTS,
40
42
  Logger: () => Logger,
41
43
  MicdropClientCommands: () => MicdropClientCommands,
42
44
  MicdropError: () => MicdropError,
43
45
  MicdropErrorCode: () => MicdropErrorCode,
46
+ MicdropRecorder: () => MicdropRecorder,
44
47
  MicdropServer: () => MicdropServer,
45
48
  MicdropServerCommands: () => MicdropServerCommands,
46
49
  MockAgent: () => MockAgent,
@@ -307,6 +310,7 @@ var Logger = class {
307
310
  };
308
311
 
309
312
  // src/MicdropServer.ts
313
+ var import_eventemitter32 = require("eventemitter3");
310
314
  var import_stream2 = require("stream");
311
315
 
312
316
  // src/types.ts
@@ -326,8 +330,9 @@ var MicdropServerCommands = /* @__PURE__ */ ((MicdropServerCommands2) => {
326
330
  })(MicdropServerCommands || {});
327
331
 
328
332
  // src/MicdropServer.ts
329
- var MicdropServer = class {
333
+ var MicdropServer = class extends import_eventemitter32.EventEmitter {
330
334
  constructor(socket, config) {
335
+ super();
331
336
  this.socket = null;
332
337
  this.config = null;
333
338
  this.startTime = Date.now();
@@ -342,9 +347,8 @@ var MicdropServer = class {
342
347
  this.config.agent.destroy();
343
348
  this.config.stt.destroy();
344
349
  this.config.tts.destroy();
345
- this.config.onEnd?.({
346
- conversation: this.config.agent.conversation.slice(1),
347
- // Remove system message
350
+ this.emit("End", {
351
+ conversation: this.config.agent.conversation,
348
352
  duration
349
353
  });
350
354
  this.socket = null;
@@ -367,10 +371,10 @@ var MicdropServer = class {
367
371
  this.onStopSpeaking();
368
372
  }
369
373
  } else if (this.currentUserStream) {
370
- this.onAudioChunk(message);
374
+ this.onUserAudio(message);
371
375
  }
372
376
  };
373
- this.onTranscript = async (transcript) => {
377
+ this.onTranscriptSTT = async (transcript) => {
374
378
  if (!this.config) return;
375
379
  if (transcript === "") {
376
380
  this.socket?.send("SkipAnswer" /* SkipAnswer */);
@@ -384,10 +388,17 @@ var MicdropServer = class {
384
388
  this.answer();
385
389
  }
386
390
  };
391
+ this.onAudioTTS = (audio) => {
392
+ if (!this.socket) return;
393
+ this.log(`Send audio chunk (${audio.byteLength} bytes)`);
394
+ this.socket.send(audio);
395
+ this.emit("AssistantAudio", audio);
396
+ };
387
397
  this.socket = socket;
388
398
  this.config = config;
389
399
  this.log(`Call started`);
390
- this.config.stt.on("Transcript", this.onTranscript);
400
+ this.config.stt.on("Transcript", this.onTranscriptSTT);
401
+ this.config.tts.on("Audio", this.onAudioTTS);
391
402
  this.config.agent.on(
392
403
  "Message",
393
404
  (message) => this.socket?.send(
@@ -443,10 +454,11 @@ var MicdropServer = class {
443
454
  this.config?.agent.cancel();
444
455
  this.operationQueue = [];
445
456
  }
446
- onAudioChunk(chunk) {
457
+ onUserAudio(chunk) {
447
458
  this.log(`Received chunk (${chunk.byteLength} bytes)`);
448
459
  this.currentUserStream?.write(chunk);
449
460
  this.userSpeechChunks++;
461
+ this.emit("UserAudio", chunk);
450
462
  }
451
463
  onMute() {
452
464
  this.userSpeechChunks = 0;
@@ -530,40 +542,143 @@ var MicdropServer = class {
530
542
  } else {
531
543
  textStream = message;
532
544
  }
533
- const audio = this.config.tts.speak(textStream);
534
- await this._sendAudio(audio);
545
+ this.config.tts.speak(textStream);
535
546
  }
536
- sendAudio(audio) {
537
- this.queueOperation(async () => {
538
- await this._sendAudio(audio);
539
- });
547
+ };
548
+
549
+ // src/recorder/MicdropRecorder.ts
550
+ var import_eventemitter33 = require("eventemitter3");
551
+ var MicdropRecorder = class extends import_eventemitter33.EventEmitter {
552
+ constructor(server) {
553
+ super();
554
+ this.server = server;
555
+ this.audioMessages = [];
556
+ this.currentUserChunks = [];
557
+ this.currentAssistantChunks = [];
558
+ this.lastUserMessageIndex = -1;
559
+ this.lastAssistantMessageIndex = -1;
560
+ this.onUserAudio = (chunk) => {
561
+ if (this.currentAssistantChunks.length > 0) {
562
+ if (this.lastAssistantMessageIndex >= 0) {
563
+ this.finalizeAssistantAudio();
564
+ } else {
565
+ this.log("Discarding orphaned assistant audio chunks");
566
+ this.currentAssistantChunks = [];
567
+ }
568
+ }
569
+ this.log("Recording user audio chunk");
570
+ this.currentUserChunks.push(chunk);
571
+ };
572
+ this.onAssistantAudio = (chunk) => {
573
+ if (this.currentUserChunks.length > 0) {
574
+ if (this.lastUserMessageIndex >= 0) {
575
+ this.finalizeUserAudio();
576
+ } else {
577
+ this.log("Discarding orphaned user audio chunks");
578
+ this.currentUserChunks = [];
579
+ }
580
+ }
581
+ this.log("Recording assistant audio chunk");
582
+ this.currentAssistantChunks.push(chunk);
583
+ };
584
+ this.onMessage = (message) => {
585
+ const conversation = this.server.config?.agent.conversation;
586
+ if (!conversation) return;
587
+ const messageIndex = conversation.length - 1;
588
+ if (message.role === "user") {
589
+ this.lastUserMessageIndex = messageIndex;
590
+ if (this.currentUserChunks.length > 0) {
591
+ this.finalizeUserAudio();
592
+ }
593
+ } else if (message.role === "assistant") {
594
+ this.lastAssistantMessageIndex = messageIndex;
595
+ }
596
+ };
597
+ this.onEnd = () => {
598
+ if (this.currentUserChunks.length > 0) {
599
+ this.finalizeUserAudio();
600
+ }
601
+ if (this.currentAssistantChunks.length > 0) {
602
+ this.finalizeAssistantAudio();
603
+ }
604
+ this.log(`Recording complete: ${this.audioMessages.length} audio messages`);
605
+ this.emit("Complete", this.audioMessages);
606
+ };
607
+ this.setupListeners();
608
+ }
609
+ setupListeners() {
610
+ this.server.on("UserAudio", this.onUserAudio);
611
+ this.server.on("AssistantAudio", this.onAssistantAudio);
612
+ this.server.on("End", this.onEnd);
613
+ const agent = this.server.config?.agent;
614
+ if (agent) {
615
+ agent.on("Message", this.onMessage);
616
+ }
540
617
  }
541
- async _sendAudio(audio) {
542
- if (!this.socket) return;
543
- if (!audio.readable) {
544
- this.log("Non readable audio, skipping", audio);
545
- return;
618
+ finalizeUserAudio() {
619
+ if (this.currentUserChunks.length === 0) return;
620
+ if (this.lastUserMessageIndex < 0) return;
621
+ const conversation = this.server.config?.agent.conversation;
622
+ if (!conversation) return;
623
+ const message = conversation[this.lastUserMessageIndex];
624
+ const buffer = Buffer.concat(this.currentUserChunks);
625
+ const audioMessage = {
626
+ buffer,
627
+ messageIndex: this.lastUserMessageIndex,
628
+ message: "content" in message ? message.content : "",
629
+ role: "user"
630
+ };
631
+ this.log(
632
+ `Finalized user audio: ${buffer.length} bytes, message index ${this.lastUserMessageIndex}`
633
+ );
634
+ this.audioMessages.push(audioMessage);
635
+ this.emit("AudioMessage", audioMessage);
636
+ this.currentUserChunks = [];
637
+ this.lastUserMessageIndex = -1;
638
+ }
639
+ finalizeAssistantAudio() {
640
+ if (this.currentAssistantChunks.length === 0) return;
641
+ if (this.lastAssistantMessageIndex < 0) return;
642
+ const conversation = this.server.config?.agent.conversation;
643
+ if (!conversation) return;
644
+ const message = conversation[this.lastAssistantMessageIndex];
645
+ const buffer = Buffer.concat(this.currentAssistantChunks);
646
+ const audioMessage = {
647
+ buffer,
648
+ messageIndex: this.lastAssistantMessageIndex,
649
+ message: "content" in message ? message.content : "",
650
+ role: "assistant"
651
+ };
652
+ this.log(
653
+ `Finalized assistant audio: ${buffer.length} bytes, message index ${this.lastAssistantMessageIndex}`
654
+ );
655
+ this.audioMessages.push(audioMessage);
656
+ this.emit("AudioMessage", audioMessage);
657
+ this.currentAssistantChunks = [];
658
+ this.lastAssistantMessageIndex = -1;
659
+ }
660
+ getAudioMessages() {
661
+ return [...this.audioMessages];
662
+ }
663
+ destroy() {
664
+ this.log("Destroyed");
665
+ this.server.off("UserAudio", this.onUserAudio);
666
+ this.server.off("AssistantAudio", this.onAssistantAudio);
667
+ this.server.off("End", this.onEnd);
668
+ const agent = this.server.config?.agent;
669
+ if (agent) {
670
+ agent.off("Message", this.onMessage);
546
671
  }
547
- await new Promise((resolve, reject) => {
548
- audio.on("data", (chunk) => {
549
- this.log(`Send audio chunk (${chunk.byteLength} bytes)`);
550
- this.socket?.send(chunk);
551
- });
552
- audio.on("error", (error) => {
553
- this.log("Error in audio stream", error);
554
- reject(error);
555
- });
556
- audio.on("end", () => {
557
- this.log("Audio stream ended");
558
- resolve();
559
- });
560
- });
672
+ this.removeAllListeners();
673
+ }
674
+ log(...message) {
675
+ this.logger?.log(...message);
561
676
  }
562
677
  };
563
678
 
564
679
  // src/stt/STT.ts
565
- var import_eventemitter32 = require("eventemitter3");
566
- var STT = class extends import_eventemitter32.EventEmitter {
680
+ var import_eventemitter34 = require("eventemitter3");
681
+ var STT = class extends import_eventemitter34.EventEmitter {
567
682
  log(...message) {
568
683
  this.logger?.log(...message);
569
684
  }
@@ -586,12 +701,67 @@ var MockSTT = class extends STT {
586
701
  }
587
702
  };
588
703
 
704
+ // src/stt/FallbackSTT.ts
705
+ var import_stream3 = require("stream");
706
+ var FallbackSTT = class extends STT {
707
+ // Start at -1 because we need to increment it before using it
708
+ constructor(options) {
709
+ super();
710
+ this.options = options;
711
+ this.stt = null;
712
+ this.sttIndex = -1;
713
+ this.onTranscript = (transcript) => {
714
+ this.emit("Transcript", transcript);
715
+ };
716
+ this.onFailed = (chunks) => {
717
+ this.log("STT failed, trying next STT");
718
+ this.startNextSTT();
719
+ if (chunks.length > 0) {
720
+ this.log("Sending audio chunks again");
721
+ const stream = new import_stream3.PassThrough();
722
+ this.stt?.transcribe(stream);
723
+ chunks.forEach((chunk) => stream.write(chunk));
724
+ stream.end();
725
+ }
726
+ };
727
+ if (this.options.factories.length === 0) {
728
+ throw new Error("FallbackSTT: No factories provided");
729
+ }
730
+ this.startNextSTT();
731
+ }
732
+ transcribe(audioStream) {
733
+ this.stt?.transcribe(audioStream);
734
+ }
735
+ destroy() {
736
+ super.destroy();
737
+ this.stt?.destroy();
738
+ this.stt = null;
739
+ this.sttIndex = -1;
740
+ }
741
+ startNextSTT() {
742
+ this.sttIndex++;
743
+ if (this.sttIndex >= this.options.factories.length) {
744
+ this.sttIndex = 0;
745
+ }
746
+ this.stt?.destroy();
747
+ this.stt = this.options.factories[this.sttIndex]();
748
+ this.stt.on("Transcript", this.onTranscript);
749
+ this.stt.on("Failed", this.onFailed);
750
+ setTimeout(() => {
751
+ if (this.stt && this.logger) {
752
+ this.stt.logger = new Logger(this.stt.constructor.name);
753
+ }
754
+ }, 0);
755
+ }
756
+ };
757
+
589
758
  // src/tts/MockTTS.ts
590
759
  var fs = __toESM(require("fs"));
591
- var import_stream3 = require("stream");
760
+ var import_stream4 = require("stream");
592
761
 
593
762
  // src/tts/TTS.ts
594
- var TTS = class {
763
+ var import_eventemitter35 = require("eventemitter3");
764
+ var TTS = class extends import_eventemitter35.EventEmitter {
595
765
  log(...message) {
596
766
  this.logger?.log(...message);
597
767
  }
@@ -608,7 +778,7 @@ var MockTTS = class extends TTS {
608
778
  this.audioFilePaths = audioFilePaths;
609
779
  }
610
780
  speak(textStream) {
611
- const audioStream = new import_stream3.PassThrough();
781
+ const audioStream = new import_stream4.PassThrough();
612
782
  textStream.once("data", async () => {
613
783
  for (const filePath of this.audioFilePaths) {
614
784
  await new Promise((resolve) => setTimeout(resolve, 200));
@@ -624,6 +794,63 @@ var MockTTS = class extends TTS {
624
794
  }
625
795
  };
626
796
 
797
+ // src/tts/FallbackTTS.ts
798
+ var import_stream5 = require("stream");
799
+ var FallbackTTS = class extends TTS {
800
+ // Start at -1 because we need to increment it before using it
801
+ constructor(options) {
802
+ super();
803
+ this.options = options;
804
+ this.tts = null;
805
+ this.ttsIndex = -1;
806
+ this.onAudio = (audio) => {
807
+ this.emit("Audio", audio);
808
+ };
809
+ this.onFailed = (chunks) => {
810
+ this.log("TTS failed, trying next TTS");
811
+ this.startNextTTS();
812
+ if (chunks.length > 0) {
813
+ this.log("Sending text chunks again");
814
+ const stream = new import_stream5.PassThrough();
815
+ this.tts?.speak(stream);
816
+ chunks.forEach((chunk) => stream.write(chunk));
817
+ stream.end();
818
+ }
819
+ };
820
+ if (this.options.factories.length === 0) {
821
+ throw new Error("FallbackTTS: No factories provided");
822
+ }
823
+ this.startNextTTS();
824
+ }
825
+ speak(textStream) {
826
+ this.tts?.speak(textStream);
827
+ }
828
+ cancel() {
829
+ this.tts?.cancel();
830
+ }
831
+ destroy() {
832
+ super.destroy();
833
+ this.tts?.destroy();
834
+ this.tts = null;
835
+ this.ttsIndex = -1;
836
+ }
837
+ startNextTTS() {
838
+ this.ttsIndex++;
839
+ if (this.ttsIndex >= this.options.factories.length) {
840
+ this.ttsIndex = 0;
841
+ }
842
+ this.tts?.destroy();
843
+ this.tts = this.options.factories[this.ttsIndex]();
844
+ this.tts.on("Audio", this.onAudio);
845
+ this.tts.on("Failed", this.onFailed);
846
+ setTimeout(() => {
847
+ if (this.tts && this.logger) {
848
+ this.tts.logger = new Logger(this.tts.constructor.name);
849
+ }
850
+ }, 0);
851
+ }
852
+ };
853
+
627
854
  // src/waitForParams.ts
628
855
  async function waitForParams(socket, validate) {
629
856
  return new Promise((resolve, reject) => {
@@ -652,10 +879,13 @@ async function waitForParams(socket, validate) {
652
879
  AUTO_SEMANTIC_TURN_PROMPT,
653
880
  AUTO_SEMANTIC_TURN_TOOL_NAME,
654
881
  Agent,
882
+ FallbackSTT,
883
+ FallbackTTS,
655
884
  Logger,
656
885
  MicdropClientCommands,
657
886
  MicdropError,
658
887
  MicdropErrorCode,
888
+ MicdropRecorder,
659
889
  MicdropServer,
660
890
  MicdropServerCommands,
661
891
  MockAgent,