@micdrop/server 2.1.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +80 -10
- package/dist/index.d.ts +80 -10
- package/dist/index.js +268 -38
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +265 -38
- package/dist/index.mjs.map +1 -1
- package/package.json +2 -2
package/dist/index.d.mts
CHANGED
@@ -4,7 +4,7 @@ import { z } from 'zod';
 import WebSocket, { WebSocket as WebSocket$1 } from 'ws';

 declare class Logger {
-
+    name: string;
     constructor(name: string);
     log(...message: any[]): void;
 }
@@ -158,6 +158,7 @@ declare function handleError(socket: WebSocket, error: unknown): void;

 interface STTEvents {
     Transcript: [string];
+    Failed: [Buffer[]];
 }
 declare abstract class STT extends EventEmitter<STTEvents> {
     logger?: Logger;
@@ -171,9 +172,28 @@ declare class MockSTT extends STT {
     transcribe(): Promise<void>;
 }

-
+interface FallbackSTTOptions {
+    factories: Array<() => STT>;
+}
+declare class FallbackSTT extends STT {
+    private readonly options;
+    private stt;
+    private sttIndex;
+    constructor(options: FallbackSTTOptions);
+    transcribe(audioStream: Readable): void;
+    destroy(): void;
+    private startNextSTT;
+    private onTranscript;
+    private onFailed;
+}
+
+interface TTSEvents {
+    Audio: [Buffer];
+    Failed: [string[]];
+}
+declare abstract class TTS extends EventEmitter<TTSEvents> {
     logger?: Logger;
-    abstract speak(textStream: Readable):
+    abstract speak(textStream: Readable): void;
     abstract cancel(): void;
     protected log(...message: any[]): void;
     destroy(): void;
@@ -186,15 +206,35 @@ declare class MockTTS extends TTS {
     cancel(): void;
 }

+interface FallbackTTSOptions {
+    factories: Array<() => TTS>;
+}
+declare class FallbackTTS extends TTS {
+    private readonly options;
+    private tts;
+    private ttsIndex;
+    constructor(options: FallbackTTSOptions);
+    speak(textStream: Readable): void;
+    cancel(): void;
+    destroy(): void;
+    private startNextTTS;
+    private onAudio;
+    private onFailed;
+}
+
+interface MicdropServerEvents {
+    End: [MicdropCallSummary];
+    UserAudio: [Buffer];
+    AssistantAudio: [Buffer];
+}
 interface MicdropConfig {
     firstMessage?: string;
     generateFirstMessage?: boolean;
     agent: Agent;
     stt: STT;
     tts: TTS;
-    onEnd?(call: MicdropCallSummary): void;
 }
-declare class MicdropServer {
+declare class MicdropServer extends EventEmitter<MicdropServerEvents> {
     socket: WebSocket$1 | null;
     config: MicdropConfig | null;
     logger?: Logger;
@@ -211,20 +251,50 @@ declare class MicdropServer {
     cancel(): void;
     private onClose;
     private onMessage;
-    private
+    private onUserAudio;
     private onMute;
     private onStartSpeaking;
     private onStopSpeaking;
-    private
+    private onTranscriptSTT;
+    private onAudioTTS;
     private sendFirstMessage;
     answer(): void;
     private _answer;
     speak(message: string | Readable): void;
     private _speak;
-
-
+}
+
+interface AudioMessage {
+    buffer: Buffer;
+    messageIndex: number;
+    message: string;
+    role: 'user' | 'assistant';
+}
+interface MicdropRecorderEvents {
+    AudioMessage: [AudioMessage];
+    Complete: [AudioMessage[]];
+}
+declare class MicdropRecorder extends EventEmitter<MicdropRecorderEvents> {
+    private server;
+    logger?: Logger;
+    private audioMessages;
+    private currentUserChunks;
+    private currentAssistantChunks;
+    private lastUserMessageIndex;
+    private lastAssistantMessageIndex;
+    constructor(server: MicdropServer);
+    private setupListeners;
+    private onUserAudio;
+    private onAssistantAudio;
+    private onMessage;
+    private finalizeUserAudio;
+    private finalizeAssistantAudio;
+    private onEnd;
+    getAudioMessages(): AudioMessage[];
+    destroy(): void;
+    protected log(...message: any[]): void;
 }

 declare function waitForParams<CallParams>(socket: WebSocket$1, validate: (params: any) => CallParams): Promise<CallParams>;

-export { AUTO_END_CALL_PROMPT, AUTO_END_CALL_TOOL_NAME, AUTO_IGNORE_USER_NOISE_PROMPT, AUTO_IGNORE_USER_NOISE_TOOL_NAME, AUTO_SEMANTIC_TURN_PROMPT, AUTO_SEMANTIC_TURN_TOOL_NAME, Agent, type AgentEvents, type AgentOptions, type DeepPartial, type ExtractJsonOptions, type ExtractOptions, type ExtractTagOptions, Logger, type MicdropAnswerMetadata, type MicdropCallSummary, MicdropClientCommands, type MicdropConfig, type MicdropConversation, type MicdropConversationItem, type MicdropConversationMessage, type MicdropConversationToolCall, type MicdropConversationToolResult, MicdropError, MicdropErrorCode, MicdropServer, MicdropServerCommands, type MicdropToolCall, MockAgent, MockSTT, MockTTS, STT, type STTEvents, TTS, type Tool, handleError, waitForParams };
+export { AUTO_END_CALL_PROMPT, AUTO_END_CALL_TOOL_NAME, AUTO_IGNORE_USER_NOISE_PROMPT, AUTO_IGNORE_USER_NOISE_TOOL_NAME, AUTO_SEMANTIC_TURN_PROMPT, AUTO_SEMANTIC_TURN_TOOL_NAME, Agent, type AgentEvents, type AgentOptions, type AudioMessage, type DeepPartial, type ExtractJsonOptions, type ExtractOptions, type ExtractTagOptions, FallbackSTT, type FallbackSTTOptions, FallbackTTS, type FallbackTTSOptions, Logger, type MicdropAnswerMetadata, type MicdropCallSummary, MicdropClientCommands, type MicdropConfig, type MicdropConversation, type MicdropConversationItem, type MicdropConversationMessage, type MicdropConversationToolCall, type MicdropConversationToolResult, MicdropError, MicdropErrorCode, MicdropRecorder, type MicdropRecorderEvents, MicdropServer, MicdropServerCommands, type MicdropServerEvents, type MicdropToolCall, MockAgent, MockSTT, MockTTS, STT, type STTEvents, TTS, type TTSEvents, type Tool, handleError, waitForParams };
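
The new FallbackSTT and FallbackTTS classes take an ordered list of provider factories and switch to the next factory whenever the active provider emits a Failed event (wrapping around to the first one). A minimal usage sketch in TypeScript, based only on the declarations above; createPrimarySTT, createBackupSTT, createPrimaryTTS, createBackupTTS, agent and socket are placeholders for the providers, Agent and ws connection you already have:

  import { FallbackSTT, FallbackTTS, MicdropServer } from '@micdrop/server'

  // Factories are tried in order; the constructor throws if the array is empty
  const stt = new FallbackSTT({
    factories: [() => createPrimarySTT(), () => createBackupSTT()],
  })
  const tts = new FallbackTTS({
    factories: [() => createPrimaryTTS(), () => createBackupTTS()],
  })

  const server = new MicdropServer(socket, { agent, stt, tts })
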
package/dist/index.d.ts
CHANGED
(Same diff as package/dist/index.d.mts above; the .d.ts and .d.mts declaration files receive identical changes.)
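
Note that MicdropConfig no longer accepts an onEnd callback: MicdropServer now extends EventEmitter and exposes End, UserAudio and AssistantAudio events instead. A migration sketch, assuming a server built as in the previous example:

  // 2.1.0: new MicdropServer(socket, { agent, stt, tts, onEnd: (call) => { ... } })
  // 2.2.1: subscribe to the End event, which carries the MicdropCallSummary
  server.on('End', (call) => {
    console.log('Call ended', call.duration, call.conversation)
  })

  // Raw audio chunks, also consumed internally by the new MicdropRecorder
  server.on('UserAudio', (chunk) => {
    // Buffer received from the caller
  })
  server.on('AssistantAudio', (chunk) => {
    // Buffer produced by the TTS and sent to the caller
  })
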
package/dist/index.js
CHANGED
@@ -37,10 +37,13 @@ __export(index_exports, {
   AUTO_SEMANTIC_TURN_PROMPT: () => AUTO_SEMANTIC_TURN_PROMPT,
   AUTO_SEMANTIC_TURN_TOOL_NAME: () => AUTO_SEMANTIC_TURN_TOOL_NAME,
   Agent: () => Agent,
+  FallbackSTT: () => FallbackSTT,
+  FallbackTTS: () => FallbackTTS,
   Logger: () => Logger,
   MicdropClientCommands: () => MicdropClientCommands,
   MicdropError: () => MicdropError,
   MicdropErrorCode: () => MicdropErrorCode,
+  MicdropRecorder: () => MicdropRecorder,
   MicdropServer: () => MicdropServer,
   MicdropServerCommands: () => MicdropServerCommands,
   MockAgent: () => MockAgent,
@@ -307,6 +310,7 @@ var Logger = class {
 };

 // src/MicdropServer.ts
+var import_eventemitter32 = require("eventemitter3");
 var import_stream2 = require("stream");

 // src/types.ts
@@ -326,8 +330,9 @@ var MicdropServerCommands = /* @__PURE__ */ ((MicdropServerCommands2) => {
 })(MicdropServerCommands || {});

 // src/MicdropServer.ts
-var MicdropServer = class {
+var MicdropServer = class extends import_eventemitter32.EventEmitter {
   constructor(socket, config) {
+    super();
     this.socket = null;
     this.config = null;
     this.startTime = Date.now();
@@ -342,9 +347,8 @@ var MicdropServer = class {
       this.config.agent.destroy();
       this.config.stt.destroy();
       this.config.tts.destroy();
-      this.
-        conversation: this.config.agent.conversation
-        // Remove system message
+      this.emit("End", {
+        conversation: this.config.agent.conversation,
         duration
       });
       this.socket = null;
@@ -367,10 +371,10 @@
           this.onStopSpeaking();
         }
       } else if (this.currentUserStream) {
-        this.
+        this.onUserAudio(message);
       }
     };
-    this.
+    this.onTranscriptSTT = async (transcript) => {
       if (!this.config) return;
       if (transcript === "") {
         this.socket?.send("SkipAnswer" /* SkipAnswer */);
@@ -384,10 +388,17 @@
         this.answer();
       }
     };
+    this.onAudioTTS = (audio) => {
+      if (!this.socket) return;
+      this.log(`Send audio chunk (${audio.byteLength} bytes)`);
+      this.socket.send(audio);
+      this.emit("AssistantAudio", audio);
+    };
     this.socket = socket;
     this.config = config;
     this.log(`Call started`);
-    this.config.stt.on("Transcript", this.
+    this.config.stt.on("Transcript", this.onTranscriptSTT);
+    this.config.tts.on("Audio", this.onAudioTTS);
     this.config.agent.on(
       "Message",
       (message) => this.socket?.send(
@@ -443,10 +454,11 @@
     this.config?.agent.cancel();
     this.operationQueue = [];
   }
-
+  onUserAudio(chunk) {
     this.log(`Received chunk (${chunk.byteLength} bytes)`);
     this.currentUserStream?.write(chunk);
     this.userSpeechChunks++;
+    this.emit("UserAudio", chunk);
   }
   onMute() {
     this.userSpeechChunks = 0;
@@ -530,40 +542,143 @@ var MicdropServer = class {
     } else {
       textStream = message;
     }
-
-    await this._sendAudio(audio);
+    this.config.tts.speak(textStream);
   }
-
-
-
-
+};
+
+// src/recorder/MicdropRecorder.ts
+var import_eventemitter33 = require("eventemitter3");
+var MicdropRecorder = class extends import_eventemitter33.EventEmitter {
+  constructor(server) {
+    super();
+    this.server = server;
+    this.audioMessages = [];
+    this.currentUserChunks = [];
+    this.currentAssistantChunks = [];
+    this.lastUserMessageIndex = -1;
+    this.lastAssistantMessageIndex = -1;
+    this.onUserAudio = (chunk) => {
+      if (this.currentAssistantChunks.length > 0) {
+        if (this.lastAssistantMessageIndex >= 0) {
+          this.finalizeAssistantAudio();
+        } else {
+          this.log("Discarding orphaned assistant audio chunks");
+          this.currentAssistantChunks = [];
+        }
+      }
+      this.log("Recording user audio chunk");
+      this.currentUserChunks.push(chunk);
+    };
+    this.onAssistantAudio = (chunk) => {
+      if (this.currentUserChunks.length > 0) {
+        if (this.lastUserMessageIndex >= 0) {
+          this.finalizeUserAudio();
+        } else {
+          this.log("Discarding orphaned user audio chunks");
+          this.currentUserChunks = [];
+        }
+      }
+      this.log("Recording assistant audio chunk");
+      this.currentAssistantChunks.push(chunk);
+    };
+    this.onMessage = (message) => {
+      const conversation = this.server.config?.agent.conversation;
+      if (!conversation) return;
+      const messageIndex = conversation.length - 1;
+      if (message.role === "user") {
+        this.lastUserMessageIndex = messageIndex;
+        if (this.currentUserChunks.length > 0) {
+          this.finalizeUserAudio();
+        }
+      } else if (message.role === "assistant") {
+        this.lastAssistantMessageIndex = messageIndex;
+      }
+    };
+    this.onEnd = () => {
+      if (this.currentUserChunks.length > 0) {
+        this.finalizeUserAudio();
+      }
+      if (this.currentAssistantChunks.length > 0) {
+        this.finalizeAssistantAudio();
+      }
+      this.log(`Recording complete: ${this.audioMessages.length} audio messages`);
+      this.emit("Complete", this.audioMessages);
+    };
+    this.setupListeners();
+  }
+  setupListeners() {
+    this.server.on("UserAudio", this.onUserAudio);
+    this.server.on("AssistantAudio", this.onAssistantAudio);
+    this.server.on("End", this.onEnd);
+    const agent = this.server.config?.agent;
+    if (agent) {
+      agent.on("Message", this.onMessage);
+    }
   }
-
-  if (
-  if (
-
-
+  finalizeUserAudio() {
+    if (this.currentUserChunks.length === 0) return;
+    if (this.lastUserMessageIndex < 0) return;
+    const conversation = this.server.config?.agent.conversation;
+    if (!conversation) return;
+    const message = conversation[this.lastUserMessageIndex];
+    const buffer = Buffer.concat(this.currentUserChunks);
+    const audioMessage = {
+      buffer,
+      messageIndex: this.lastUserMessageIndex,
+      message: "content" in message ? message.content : "",
+      role: "user"
+    };
+    this.log(
+      `Finalized user audio: ${buffer.length} bytes, message index ${this.lastUserMessageIndex}`
+    );
+    this.audioMessages.push(audioMessage);
+    this.emit("AudioMessage", audioMessage);
+    this.currentUserChunks = [];
+    this.lastUserMessageIndex = -1;
+  }
+  finalizeAssistantAudio() {
+    if (this.currentAssistantChunks.length === 0) return;
+    if (this.lastAssistantMessageIndex < 0) return;
+    const conversation = this.server.config?.agent.conversation;
+    if (!conversation) return;
+    const message = conversation[this.lastAssistantMessageIndex];
+    const buffer = Buffer.concat(this.currentAssistantChunks);
+    const audioMessage = {
+      buffer,
+      messageIndex: this.lastAssistantMessageIndex,
+      message: "content" in message ? message.content : "",
+      role: "assistant"
+    };
+    this.log(
+      `Finalized assistant audio: ${buffer.length} bytes, message index ${this.lastAssistantMessageIndex}`
+    );
+    this.audioMessages.push(audioMessage);
+    this.emit("AudioMessage", audioMessage);
+    this.currentAssistantChunks = [];
+    this.lastAssistantMessageIndex = -1;
+  }
+  getAudioMessages() {
+    return [...this.audioMessages];
+  }
+  destroy() {
+    this.log("Destroyed");
+    this.server.off("UserAudio", this.onUserAudio);
+    this.server.off("AssistantAudio", this.onAssistantAudio);
+    this.server.off("End", this.onEnd);
+    const agent = this.server.config?.agent;
+    if (agent) {
+      agent.off("Message", this.onMessage);
     }
-
-
-
-
-    });
-    audio.on("error", (error) => {
-      this.log("Error in audio stream", error);
-      reject(error);
-    });
-    audio.on("end", () => {
-      this.log("Audio stream ended");
-      resolve();
-    });
-  });
+    this.removeAllListeners();
+  }
+  log(...message) {
+    this.logger?.log(...message);
   }
 };

 // src/stt/STT.ts
-var
-var STT = class extends
+var import_eventemitter34 = require("eventemitter3");
+var STT = class extends import_eventemitter34.EventEmitter {
   log(...message) {
     this.logger?.log(...message);
   }
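
The MicdropRecorder implementation above groups UserAudio and AssistantAudio chunks by the conversation message they belong to and finalizes each group into an AudioMessage ({ buffer, messageIndex, message, role }). A usage sketch, assuming an existing MicdropServer instance named server and a ./recordings directory that already exists; the byte format of each buffer is whatever your client and TTS produce, so the .bin extension is only a placeholder:

  import fs from 'fs'
  import { MicdropRecorder } from '@micdrop/server'

  const recorder = new MicdropRecorder(server)

  // One finalized audio segment per conversation message
  recorder.on('AudioMessage', ({ buffer, messageIndex, role }) => {
    fs.writeFileSync(`./recordings/${messageIndex}-${role}.bin`, buffer)
  })

  // Emitted when the server emits End
  recorder.on('Complete', (messages) => {
    console.log(`Recorded ${messages.length} audio segments`)
    recorder.destroy()
  })
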
@@ -586,12 +701,67 @@ var MockSTT = class extends STT {
   }
 };

+// src/stt/FallbackSTT.ts
+var import_stream3 = require("stream");
+var FallbackSTT = class extends STT {
+  // Start at -1 because we need to increment it before using it
+  constructor(options) {
+    super();
+    this.options = options;
+    this.stt = null;
+    this.sttIndex = -1;
+    this.onTranscript = (transcript) => {
+      this.emit("Transcript", transcript);
+    };
+    this.onFailed = (chunks) => {
+      this.log("STT failed, trying next STT");
+      this.startNextSTT();
+      if (chunks.length > 0) {
+        this.log("Sending audio chunks again");
+        const stream = new import_stream3.PassThrough();
+        this.stt?.transcribe(stream);
+        chunks.forEach((chunk) => stream.write(chunk));
+        stream.end();
+      }
+    };
+    if (this.options.factories.length === 0) {
+      throw new Error("FallbackSTT: No factories provided");
+    }
+    this.startNextSTT();
+  }
+  transcribe(audioStream) {
+    this.stt?.transcribe(audioStream);
+  }
+  destroy() {
+    super.destroy();
+    this.stt?.destroy();
+    this.stt = null;
+    this.sttIndex = -1;
+  }
+  startNextSTT() {
+    this.sttIndex++;
+    if (this.sttIndex >= this.options.factories.length) {
+      this.sttIndex = 0;
+    }
+    this.stt?.destroy();
+    this.stt = this.options.factories[this.sttIndex]();
+    this.stt.on("Transcript", this.onTranscript);
+    this.stt.on("Failed", this.onFailed);
+    setTimeout(() => {
+      if (this.stt && this.logger) {
+        this.stt.logger = new Logger(this.stt.constructor.name);
+      }
+    }, 0);
+  }
+};
+
 // src/tts/MockTTS.ts
 var fs = __toESM(require("fs"));
-var
+var import_stream4 = require("stream");

 // src/tts/TTS.ts
-var
+var import_eventemitter35 = require("eventemitter3");
+var TTS = class extends import_eventemitter35.EventEmitter {
   log(...message) {
     this.logger?.log(...message);
   }
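
FallbackSTT's onFailed handler above replays the Buffer[] payload of the Failed event into the next provider through a PassThrough stream, so a provider that wants its audio retried should buffer the chunks it has consumed and hand them back when it gives up. A sketch of that contract for a hypothetical custom provider (the real providers live in other @micdrop packages, and the abstract transcribe signature is assumed to match FallbackSTT's):

  import { Readable } from 'stream'
  import { STT } from '@micdrop/server'

  class MySTT extends STT {
    private chunks: Buffer[] = []

    transcribe(audioStream: Readable) {
      audioStream.on('data', (chunk: Buffer) => {
        this.chunks.push(chunk)
        // ...forward the chunk to the actual speech-to-text API here...
        // On a successful result: this.emit('Transcript', text)
      })
      audioStream.on('error', () => {
        // Hand the buffered audio back so FallbackSTT can replay it elsewhere
        this.emit('Failed', this.chunks)
        this.chunks = []
      })
    }
  }
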
@@ -608,7 +778,7 @@ var MockTTS = class extends TTS {
     this.audioFilePaths = audioFilePaths;
   }
   speak(textStream) {
-    const audioStream = new
+    const audioStream = new import_stream4.PassThrough();
     textStream.once("data", async () => {
       for (const filePath of this.audioFilePaths) {
         await new Promise((resolve) => setTimeout(resolve, 200));
@@ -624,6 +794,63 @@ var MockTTS = class extends TTS {
   }
 };

+// src/tts/FallbackTTS.ts
+var import_stream5 = require("stream");
+var FallbackTTS = class extends TTS {
+  // Start at -1 because we need to increment it before using it
+  constructor(options) {
+    super();
+    this.options = options;
+    this.tts = null;
+    this.ttsIndex = -1;
+    this.onAudio = (audio) => {
+      this.emit("Audio", audio);
+    };
+    this.onFailed = (chunks) => {
+      this.log("TTS failed, trying next TTS");
+      this.startNextTTS();
+      if (chunks.length > 0) {
+        this.log("Sending text chunks again");
+        const stream = new import_stream5.PassThrough();
+        this.tts?.speak(stream);
+        chunks.forEach((chunk) => stream.write(chunk));
+        stream.end();
+      }
+    };
+    if (this.options.factories.length === 0) {
+      throw new Error("FallbackTTS: No factories provided");
+    }
+    this.startNextTTS();
+  }
+  speak(textStream) {
+    this.tts?.speak(textStream);
+  }
+  cancel() {
+    this.tts?.cancel();
+  }
+  destroy() {
+    super.destroy();
+    this.tts?.destroy();
+    this.tts = null;
+    this.ttsIndex = -1;
+  }
+  startNextTTS() {
+    this.ttsIndex++;
+    if (this.ttsIndex >= this.options.factories.length) {
+      this.ttsIndex = 0;
+    }
+    this.tts?.destroy();
+    this.tts = this.options.factories[this.ttsIndex]();
+    this.tts.on("Audio", this.onAudio);
+    this.tts.on("Failed", this.onFailed);
+    setTimeout(() => {
+      if (this.tts && this.logger) {
+        this.tts.logger = new Logger(this.tts.constructor.name);
+      }
+    }, 0);
+  }
+};
+
 // src/waitForParams.ts
 async function waitForParams(socket, validate) {
   return new Promise((resolve, reject) => {
@@ -652,10 +879,13 @@ async function waitForParams(socket, validate) {
   AUTO_SEMANTIC_TURN_PROMPT,
   AUTO_SEMANTIC_TURN_TOOL_NAME,
   Agent,
+  FallbackSTT,
+  FallbackTTS,
   Logger,
   MicdropClientCommands,
   MicdropError,
   MicdropErrorCode,
+  MicdropRecorder,
   MicdropServer,
   MicdropServerCommands,
   MockAgent,