@micdrop/server 2.0.13 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +9 -0
- package/dist/index.d.mts +44 -32
- package/dist/index.d.ts +44 -32
- package/dist/index.js +138 -145
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +136 -141
- package/dist/index.mjs.map +1 -1
- package/package.json +8 -12
package/LICENSE
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2022 Rolebase
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
6
|
+
|
|
7
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
8
|
+
|
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
package/dist/index.d.mts
CHANGED
|
@@ -4,7 +4,7 @@ import { z } from 'zod';
|
|
|
4
4
|
import WebSocket, { WebSocket as WebSocket$1 } from 'ws';
|
|
5
5
|
|
|
6
6
|
declare class Logger {
|
|
7
|
-
|
|
7
|
+
name: string;
|
|
8
8
|
constructor(name: string);
|
|
9
9
|
log(...message: any[]): void;
|
|
10
10
|
}
|
|
@@ -145,10 +145,6 @@ declare class MockAgent extends Agent {
|
|
|
145
145
|
cancel(): void;
|
|
146
146
|
}
|
|
147
147
|
|
|
148
|
-
declare function convertToPCM(audioStream: Readable, sampleRate?: number, bitDepth?: number): PassThrough;
|
|
149
|
-
declare function convertToOpus(audioStream: Readable, sampleRate?: number): PassThrough;
|
|
150
|
-
declare function convertPCMToOpus(audioStream: Readable, sampleRate?: number): PassThrough;
|
|
151
|
-
|
|
152
148
|
declare enum MicdropErrorCode {
|
|
153
149
|
BadRequest = 4400,
|
|
154
150
|
Unauthorized = 4401,
|
|
@@ -160,43 +156,44 @@ declare class MicdropError extends Error {
|
|
|
160
156
|
}
|
|
161
157
|
declare function handleError(socket: WebSocket, error: unknown): void;
|
|
162
158
|
|
|
163
|
-
declare const MIME_TYPE_TO_EXTENSION: {
|
|
164
|
-
readonly 'audio/wav': "wav";
|
|
165
|
-
readonly 'audio/ogg': "ogg";
|
|
166
|
-
readonly 'audio/mpeg': "mp3";
|
|
167
|
-
readonly 'audio/webm': "webm";
|
|
168
|
-
readonly 'audio/mp4': "mp4";
|
|
169
|
-
readonly 'audio/flac': "flac";
|
|
170
|
-
};
|
|
171
159
|
interface STTEvents {
|
|
172
160
|
Transcript: [string];
|
|
161
|
+
Failed: [Buffer[]];
|
|
173
162
|
}
|
|
174
163
|
declare abstract class STT extends EventEmitter<STTEvents> {
|
|
175
|
-
protected mimeType?: keyof typeof MIME_TYPE_TO_EXTENSION;
|
|
176
164
|
logger?: Logger;
|
|
177
|
-
transcribe(audioStream: Readable): void;
|
|
165
|
+
abstract transcribe(audioStream: Readable): void;
|
|
178
166
|
protected log(...message: any[]): void;
|
|
179
167
|
destroy(): void;
|
|
180
|
-
protected get extension(): string;
|
|
181
|
-
private detectMimeType;
|
|
182
168
|
}
|
|
183
169
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
declare abstract class FileSTT extends STT {
|
|
188
|
-
abstract transcribeFile(file: File): Promise<string>;
|
|
189
|
-
transcribe(audioStream: Readable): void;
|
|
170
|
+
declare class MockSTT extends STT {
|
|
171
|
+
private i;
|
|
172
|
+
transcribe(): Promise<void>;
|
|
190
173
|
}
|
|
191
174
|
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
175
|
+
interface FallbackSTTOptions {
|
|
176
|
+
factories: Array<() => STT>;
|
|
177
|
+
}
|
|
178
|
+
declare class FallbackSTT extends STT {
|
|
179
|
+
private readonly options;
|
|
180
|
+
private stt;
|
|
181
|
+
private sttIndex;
|
|
182
|
+
constructor(options: FallbackSTTOptions);
|
|
183
|
+
transcribe(audioStream: Readable): void;
|
|
184
|
+
destroy(): void;
|
|
185
|
+
private startNextSTT;
|
|
186
|
+
private onTranscript;
|
|
187
|
+
private onFailed;
|
|
195
188
|
}
|
|
196
189
|
|
|
197
|
-
|
|
190
|
+
interface TTSEvents {
|
|
191
|
+
Audio: [Buffer];
|
|
192
|
+
Failed: [string[]];
|
|
193
|
+
}
|
|
194
|
+
declare abstract class TTS extends EventEmitter<TTSEvents> {
|
|
198
195
|
logger?: Logger;
|
|
199
|
-
abstract speak(textStream: Readable):
|
|
196
|
+
abstract speak(textStream: Readable): void;
|
|
200
197
|
abstract cancel(): void;
|
|
201
198
|
protected log(...message: any[]): void;
|
|
202
199
|
destroy(): void;
|
|
@@ -209,6 +206,22 @@ declare class MockTTS extends TTS {
|
|
|
209
206
|
cancel(): void;
|
|
210
207
|
}
|
|
211
208
|
|
|
209
|
+
interface FallbackTTSOptions {
|
|
210
|
+
factories: Array<() => TTS>;
|
|
211
|
+
}
|
|
212
|
+
declare class FallbackTTS extends TTS {
|
|
213
|
+
private readonly options;
|
|
214
|
+
private tts;
|
|
215
|
+
private ttsIndex;
|
|
216
|
+
constructor(options: FallbackTTSOptions);
|
|
217
|
+
speak(textStream: Readable): void;
|
|
218
|
+
cancel(): void;
|
|
219
|
+
destroy(): void;
|
|
220
|
+
private startNextTTS;
|
|
221
|
+
private onAudio;
|
|
222
|
+
private onFailed;
|
|
223
|
+
}
|
|
224
|
+
|
|
212
225
|
interface MicdropConfig {
|
|
213
226
|
firstMessage?: string;
|
|
214
227
|
generateFirstMessage?: boolean;
|
|
@@ -238,16 +251,15 @@ declare class MicdropServer {
|
|
|
238
251
|
private onMute;
|
|
239
252
|
private onStartSpeaking;
|
|
240
253
|
private onStopSpeaking;
|
|
241
|
-
private
|
|
254
|
+
private onTranscriptSTT;
|
|
255
|
+
private onAudioTTS;
|
|
242
256
|
private sendFirstMessage;
|
|
243
257
|
answer(): void;
|
|
244
258
|
private _answer;
|
|
245
259
|
speak(message: string | Readable): void;
|
|
246
260
|
private _speak;
|
|
247
|
-
sendAudio(audio: Readable): void;
|
|
248
|
-
private _sendAudio;
|
|
249
261
|
}
|
|
250
262
|
|
|
251
263
|
declare function waitForParams<CallParams>(socket: WebSocket$1, validate: (params: any) => CallParams): Promise<CallParams>;
|
|
252
264
|
|
|
253
|
-
export { AUTO_END_CALL_PROMPT, AUTO_END_CALL_TOOL_NAME, AUTO_IGNORE_USER_NOISE_PROMPT, AUTO_IGNORE_USER_NOISE_TOOL_NAME, AUTO_SEMANTIC_TURN_PROMPT, AUTO_SEMANTIC_TURN_TOOL_NAME, Agent, type AgentEvents, type AgentOptions, type DeepPartial, type ExtractJsonOptions, type ExtractOptions, type ExtractTagOptions,
|
|
265
|
+
export { AUTO_END_CALL_PROMPT, AUTO_END_CALL_TOOL_NAME, AUTO_IGNORE_USER_NOISE_PROMPT, AUTO_IGNORE_USER_NOISE_TOOL_NAME, AUTO_SEMANTIC_TURN_PROMPT, AUTO_SEMANTIC_TURN_TOOL_NAME, Agent, type AgentEvents, type AgentOptions, type DeepPartial, type ExtractJsonOptions, type ExtractOptions, type ExtractTagOptions, FallbackSTT, type FallbackSTTOptions, FallbackTTS, type FallbackTTSOptions, Logger, type MicdropAnswerMetadata, type MicdropCallSummary, MicdropClientCommands, type MicdropConfig, type MicdropConversation, type MicdropConversationItem, type MicdropConversationMessage, type MicdropConversationToolCall, type MicdropConversationToolResult, MicdropError, MicdropErrorCode, MicdropServer, MicdropServerCommands, type MicdropToolCall, MockAgent, MockSTT, MockTTS, STT, type STTEvents, TTS, type TTSEvents, type Tool, handleError, waitForParams };
|
package/dist/index.d.ts
CHANGED
|
@@ -4,7 +4,7 @@ import { z } from 'zod';
|
|
|
4
4
|
import WebSocket, { WebSocket as WebSocket$1 } from 'ws';
|
|
5
5
|
|
|
6
6
|
declare class Logger {
|
|
7
|
-
|
|
7
|
+
name: string;
|
|
8
8
|
constructor(name: string);
|
|
9
9
|
log(...message: any[]): void;
|
|
10
10
|
}
|
|
@@ -145,10 +145,6 @@ declare class MockAgent extends Agent {
|
|
|
145
145
|
cancel(): void;
|
|
146
146
|
}
|
|
147
147
|
|
|
148
|
-
declare function convertToPCM(audioStream: Readable, sampleRate?: number, bitDepth?: number): PassThrough;
|
|
149
|
-
declare function convertToOpus(audioStream: Readable, sampleRate?: number): PassThrough;
|
|
150
|
-
declare function convertPCMToOpus(audioStream: Readable, sampleRate?: number): PassThrough;
|
|
151
|
-
|
|
152
148
|
declare enum MicdropErrorCode {
|
|
153
149
|
BadRequest = 4400,
|
|
154
150
|
Unauthorized = 4401,
|
|
@@ -160,43 +156,44 @@ declare class MicdropError extends Error {
|
|
|
160
156
|
}
|
|
161
157
|
declare function handleError(socket: WebSocket, error: unknown): void;
|
|
162
158
|
|
|
163
|
-
declare const MIME_TYPE_TO_EXTENSION: {
|
|
164
|
-
readonly 'audio/wav': "wav";
|
|
165
|
-
readonly 'audio/ogg': "ogg";
|
|
166
|
-
readonly 'audio/mpeg': "mp3";
|
|
167
|
-
readonly 'audio/webm': "webm";
|
|
168
|
-
readonly 'audio/mp4': "mp4";
|
|
169
|
-
readonly 'audio/flac': "flac";
|
|
170
|
-
};
|
|
171
159
|
interface STTEvents {
|
|
172
160
|
Transcript: [string];
|
|
161
|
+
Failed: [Buffer[]];
|
|
173
162
|
}
|
|
174
163
|
declare abstract class STT extends EventEmitter<STTEvents> {
|
|
175
|
-
protected mimeType?: keyof typeof MIME_TYPE_TO_EXTENSION;
|
|
176
164
|
logger?: Logger;
|
|
177
|
-
transcribe(audioStream: Readable): void;
|
|
165
|
+
abstract transcribe(audioStream: Readable): void;
|
|
178
166
|
protected log(...message: any[]): void;
|
|
179
167
|
destroy(): void;
|
|
180
|
-
protected get extension(): string;
|
|
181
|
-
private detectMimeType;
|
|
182
168
|
}
|
|
183
169
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
declare abstract class FileSTT extends STT {
|
|
188
|
-
abstract transcribeFile(file: File): Promise<string>;
|
|
189
|
-
transcribe(audioStream: Readable): void;
|
|
170
|
+
declare class MockSTT extends STT {
|
|
171
|
+
private i;
|
|
172
|
+
transcribe(): Promise<void>;
|
|
190
173
|
}
|
|
191
174
|
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
175
|
+
interface FallbackSTTOptions {
|
|
176
|
+
factories: Array<() => STT>;
|
|
177
|
+
}
|
|
178
|
+
declare class FallbackSTT extends STT {
|
|
179
|
+
private readonly options;
|
|
180
|
+
private stt;
|
|
181
|
+
private sttIndex;
|
|
182
|
+
constructor(options: FallbackSTTOptions);
|
|
183
|
+
transcribe(audioStream: Readable): void;
|
|
184
|
+
destroy(): void;
|
|
185
|
+
private startNextSTT;
|
|
186
|
+
private onTranscript;
|
|
187
|
+
private onFailed;
|
|
195
188
|
}
|
|
196
189
|
|
|
197
|
-
|
|
190
|
+
interface TTSEvents {
|
|
191
|
+
Audio: [Buffer];
|
|
192
|
+
Failed: [string[]];
|
|
193
|
+
}
|
|
194
|
+
declare abstract class TTS extends EventEmitter<TTSEvents> {
|
|
198
195
|
logger?: Logger;
|
|
199
|
-
abstract speak(textStream: Readable):
|
|
196
|
+
abstract speak(textStream: Readable): void;
|
|
200
197
|
abstract cancel(): void;
|
|
201
198
|
protected log(...message: any[]): void;
|
|
202
199
|
destroy(): void;
|
|
@@ -209,6 +206,22 @@ declare class MockTTS extends TTS {
|
|
|
209
206
|
cancel(): void;
|
|
210
207
|
}
|
|
211
208
|
|
|
209
|
+
interface FallbackTTSOptions {
|
|
210
|
+
factories: Array<() => TTS>;
|
|
211
|
+
}
|
|
212
|
+
declare class FallbackTTS extends TTS {
|
|
213
|
+
private readonly options;
|
|
214
|
+
private tts;
|
|
215
|
+
private ttsIndex;
|
|
216
|
+
constructor(options: FallbackTTSOptions);
|
|
217
|
+
speak(textStream: Readable): void;
|
|
218
|
+
cancel(): void;
|
|
219
|
+
destroy(): void;
|
|
220
|
+
private startNextTTS;
|
|
221
|
+
private onAudio;
|
|
222
|
+
private onFailed;
|
|
223
|
+
}
|
|
224
|
+
|
|
212
225
|
interface MicdropConfig {
|
|
213
226
|
firstMessage?: string;
|
|
214
227
|
generateFirstMessage?: boolean;
|
|
@@ -238,16 +251,15 @@ declare class MicdropServer {
|
|
|
238
251
|
private onMute;
|
|
239
252
|
private onStartSpeaking;
|
|
240
253
|
private onStopSpeaking;
|
|
241
|
-
private
|
|
254
|
+
private onTranscriptSTT;
|
|
255
|
+
private onAudioTTS;
|
|
242
256
|
private sendFirstMessage;
|
|
243
257
|
answer(): void;
|
|
244
258
|
private _answer;
|
|
245
259
|
speak(message: string | Readable): void;
|
|
246
260
|
private _speak;
|
|
247
|
-
sendAudio(audio: Readable): void;
|
|
248
|
-
private _sendAudio;
|
|
249
261
|
}
|
|
250
262
|
|
|
251
263
|
declare function waitForParams<CallParams>(socket: WebSocket$1, validate: (params: any) => CallParams): Promise<CallParams>;
|
|
252
264
|
|
|
253
|
-
export { AUTO_END_CALL_PROMPT, AUTO_END_CALL_TOOL_NAME, AUTO_IGNORE_USER_NOISE_PROMPT, AUTO_IGNORE_USER_NOISE_TOOL_NAME, AUTO_SEMANTIC_TURN_PROMPT, AUTO_SEMANTIC_TURN_TOOL_NAME, Agent, type AgentEvents, type AgentOptions, type DeepPartial, type ExtractJsonOptions, type ExtractOptions, type ExtractTagOptions,
|
|
265
|
+
export { AUTO_END_CALL_PROMPT, AUTO_END_CALL_TOOL_NAME, AUTO_IGNORE_USER_NOISE_PROMPT, AUTO_IGNORE_USER_NOISE_TOOL_NAME, AUTO_SEMANTIC_TURN_PROMPT, AUTO_SEMANTIC_TURN_TOOL_NAME, Agent, type AgentEvents, type AgentOptions, type DeepPartial, type ExtractJsonOptions, type ExtractOptions, type ExtractTagOptions, FallbackSTT, type FallbackSTTOptions, FallbackTTS, type FallbackTTSOptions, Logger, type MicdropAnswerMetadata, type MicdropCallSummary, MicdropClientCommands, type MicdropConfig, type MicdropConversation, type MicdropConversationItem, type MicdropConversationMessage, type MicdropConversationToolCall, type MicdropConversationToolResult, MicdropError, MicdropErrorCode, MicdropServer, MicdropServerCommands, type MicdropToolCall, MockAgent, MockSTT, MockTTS, STT, type STTEvents, TTS, type TTSEvents, type Tool, handleError, waitForParams };
|
package/dist/index.js
CHANGED
|
@@ -37,7 +37,8 @@ __export(index_exports, {
|
|
|
37
37
|
AUTO_SEMANTIC_TURN_PROMPT: () => AUTO_SEMANTIC_TURN_PROMPT,
|
|
38
38
|
AUTO_SEMANTIC_TURN_TOOL_NAME: () => AUTO_SEMANTIC_TURN_TOOL_NAME,
|
|
39
39
|
Agent: () => Agent,
|
|
40
|
-
|
|
40
|
+
FallbackSTT: () => FallbackSTT,
|
|
41
|
+
FallbackTTS: () => FallbackTTS,
|
|
41
42
|
Logger: () => Logger,
|
|
42
43
|
MicdropClientCommands: () => MicdropClientCommands,
|
|
43
44
|
MicdropError: () => MicdropError,
|
|
@@ -49,9 +50,6 @@ __export(index_exports, {
|
|
|
49
50
|
MockTTS: () => MockTTS,
|
|
50
51
|
STT: () => STT,
|
|
51
52
|
TTS: () => TTS,
|
|
52
|
-
convertPCMToOpus: () => convertPCMToOpus,
|
|
53
|
-
convertToOpus: () => convertToOpus,
|
|
54
|
-
convertToPCM: () => convertToPCM,
|
|
55
53
|
handleError: () => handleError,
|
|
56
54
|
waitForParams: () => waitForParams
|
|
57
55
|
});
|
|
@@ -276,41 +274,6 @@ var MockAgent = class extends Agent {
|
|
|
276
274
|
}
|
|
277
275
|
};
|
|
278
276
|
|
|
279
|
-
// src/audio-convert.ts
|
|
280
|
-
var import_ffmpeg = __toESM(require("@ffmpeg-installer/ffmpeg"));
|
|
281
|
-
var import_fluent_ffmpeg = __toESM(require("fluent-ffmpeg"));
|
|
282
|
-
var import_stream2 = require("stream");
|
|
283
|
-
import_fluent_ffmpeg.default.setFfmpegPath(import_ffmpeg.default.path);
|
|
284
|
-
function convertToPCM(audioStream, sampleRate = 16e3, bitDepth = 16) {
|
|
285
|
-
const pcmStream = new import_stream2.PassThrough();
|
|
286
|
-
(0, import_fluent_ffmpeg.default)(audioStream).audioChannels(1).audioFrequency(sampleRate).audioCodec(`pcm_s${bitDepth}le`).format(`s${bitDepth}le`).on("error", (error) => {
|
|
287
|
-
console.error("Error converting audio stream:", error.message);
|
|
288
|
-
}).pipe(pcmStream);
|
|
289
|
-
return pcmStream;
|
|
290
|
-
}
|
|
291
|
-
function convertToOpus(audioStream, sampleRate = 16e3) {
|
|
292
|
-
const webmStream = new import_stream2.PassThrough();
|
|
293
|
-
ffmpegToOpus((0, import_fluent_ffmpeg.default)(audioStream), sampleRate).pipe(webmStream);
|
|
294
|
-
return webmStream;
|
|
295
|
-
}
|
|
296
|
-
function convertPCMToOpus(audioStream, sampleRate = 16e3) {
|
|
297
|
-
const webmStream = new import_stream2.PassThrough();
|
|
298
|
-
ffmpegToOpus((0, import_fluent_ffmpeg.default)(audioStream), sampleRate).inputFormat("s16le").inputOptions(["-f s16le", "-ar 16000", "-ac 1"]).pipe(webmStream);
|
|
299
|
-
return webmStream;
|
|
300
|
-
}
|
|
301
|
-
function ffmpegToOpus(ffmpegCommand, sampleRate = 16e3) {
|
|
302
|
-
return ffmpegCommand.audioChannels(1).audioFrequency(sampleRate).audioCodec("libopus").format("webm").outputOptions([
|
|
303
|
-
"-application audio",
|
|
304
|
-
`-ac 1`,
|
|
305
|
-
`-ar ${sampleRate}`,
|
|
306
|
-
`-b:a 64k`,
|
|
307
|
-
`-f webm`,
|
|
308
|
-
`-map_metadata -1`
|
|
309
|
-
]).on("error", (error) => {
|
|
310
|
-
console.error("Error converting to Opus: ", error.message);
|
|
311
|
-
});
|
|
312
|
-
}
|
|
313
|
-
|
|
314
277
|
// src/errors.ts
|
|
315
278
|
var MicdropErrorCode = /* @__PURE__ */ ((MicdropErrorCode2) => {
|
|
316
279
|
MicdropErrorCode2[MicdropErrorCode2["BadRequest"] = 4400] = "BadRequest";
|
|
@@ -346,7 +309,7 @@ var Logger = class {
|
|
|
346
309
|
};
|
|
347
310
|
|
|
348
311
|
// src/MicdropServer.ts
|
|
349
|
-
var
|
|
312
|
+
var import_stream2 = require("stream");
|
|
350
313
|
|
|
351
314
|
// src/types.ts
|
|
352
315
|
var MicdropClientCommands = /* @__PURE__ */ ((MicdropClientCommands2) => {
|
|
@@ -409,8 +372,12 @@ var MicdropServer = class {
|
|
|
409
372
|
this.onAudioChunk(message);
|
|
410
373
|
}
|
|
411
374
|
};
|
|
412
|
-
this.
|
|
375
|
+
this.onTranscriptSTT = async (transcript) => {
|
|
413
376
|
if (!this.config) return;
|
|
377
|
+
if (transcript === "") {
|
|
378
|
+
this.socket?.send("SkipAnswer" /* SkipAnswer */);
|
|
379
|
+
return;
|
|
380
|
+
}
|
|
414
381
|
this.log(`User transcript: "${transcript}"`);
|
|
415
382
|
this.config.agent.addUserMessage(transcript);
|
|
416
383
|
if (!this.currentUserStream) {
|
|
@@ -419,10 +386,16 @@ var MicdropServer = class {
|
|
|
419
386
|
this.answer();
|
|
420
387
|
}
|
|
421
388
|
};
|
|
389
|
+
this.onAudioTTS = (audio) => {
|
|
390
|
+
if (!this.socket) return;
|
|
391
|
+
this.log(`Send audio chunk (${audio.byteLength} bytes)`);
|
|
392
|
+
this.socket.send(audio);
|
|
393
|
+
};
|
|
422
394
|
this.socket = socket;
|
|
423
395
|
this.config = config;
|
|
424
396
|
this.log(`Call started`);
|
|
425
|
-
this.config.stt.on("Transcript", this.
|
|
397
|
+
this.config.stt.on("Transcript", this.onTranscriptSTT);
|
|
398
|
+
this.config.tts.on("Audio", this.onAudioTTS);
|
|
426
399
|
this.config.agent.on(
|
|
427
400
|
"Message",
|
|
428
401
|
(message) => this.socket?.send(
|
|
@@ -493,7 +466,7 @@ var MicdropServer = class {
|
|
|
493
466
|
if (!this.config) return;
|
|
494
467
|
this.userSpeechChunks = 0;
|
|
495
468
|
this.currentUserStream?.end();
|
|
496
|
-
this.currentUserStream = new
|
|
469
|
+
this.currentUserStream = new import_stream2.PassThrough();
|
|
497
470
|
this.config.stt.transcribe(this.currentUserStream);
|
|
498
471
|
this.cancel();
|
|
499
472
|
}
|
|
@@ -558,61 +531,20 @@ var MicdropServer = class {
|
|
|
558
531
|
if (!this.socket || !this.config) return;
|
|
559
532
|
let textStream;
|
|
560
533
|
if (typeof message === "string") {
|
|
561
|
-
const stream = new
|
|
534
|
+
const stream = new import_stream2.PassThrough();
|
|
562
535
|
stream.write(message);
|
|
563
536
|
stream.end();
|
|
564
537
|
textStream = stream;
|
|
565
538
|
} else {
|
|
566
539
|
textStream = message;
|
|
567
540
|
}
|
|
568
|
-
|
|
569
|
-
await this._sendAudio(audio);
|
|
570
|
-
}
|
|
571
|
-
sendAudio(audio) {
|
|
572
|
-
this.queueOperation(async () => {
|
|
573
|
-
await this._sendAudio(audio);
|
|
574
|
-
});
|
|
575
|
-
}
|
|
576
|
-
async _sendAudio(audio) {
|
|
577
|
-
if (!this.socket) return;
|
|
578
|
-
if (!audio.readable) {
|
|
579
|
-
this.log("Non readable audio, skipping", audio);
|
|
580
|
-
return;
|
|
581
|
-
}
|
|
582
|
-
await new Promise((resolve, reject) => {
|
|
583
|
-
audio.on("data", (chunk) => {
|
|
584
|
-
this.log(`Send audio chunk (${chunk.byteLength} bytes)`);
|
|
585
|
-
this.socket?.send(chunk);
|
|
586
|
-
});
|
|
587
|
-
audio.on("error", (error) => {
|
|
588
|
-
this.log("Error in audio stream", error);
|
|
589
|
-
reject(error);
|
|
590
|
-
});
|
|
591
|
-
audio.on("end", () => {
|
|
592
|
-
this.log("Audio stream ended");
|
|
593
|
-
resolve();
|
|
594
|
-
});
|
|
595
|
-
});
|
|
541
|
+
this.config.tts.speak(textStream);
|
|
596
542
|
}
|
|
597
543
|
};
|
|
598
544
|
|
|
599
545
|
// src/stt/STT.ts
|
|
600
546
|
var import_eventemitter32 = require("eventemitter3");
|
|
601
|
-
var MIME_TYPE_TO_EXTENSION = {
|
|
602
|
-
"audio/wav": "wav",
|
|
603
|
-
"audio/ogg": "ogg",
|
|
604
|
-
"audio/mpeg": "mp3",
|
|
605
|
-
"audio/webm": "webm",
|
|
606
|
-
"audio/mp4": "mp4",
|
|
607
|
-
"audio/flac": "flac"
|
|
608
|
-
};
|
|
609
547
|
var STT = class extends import_eventemitter32.EventEmitter {
|
|
610
|
-
// Set stream of audio to transcribe
|
|
611
|
-
transcribe(audioStream) {
|
|
612
|
-
audioStream.once("data", (chunk) => {
|
|
613
|
-
this.mimeType = this.detectMimeType(chunk);
|
|
614
|
-
});
|
|
615
|
-
}
|
|
616
548
|
log(...message) {
|
|
617
549
|
this.logger?.log(...message);
|
|
618
550
|
}
|
|
@@ -620,67 +552,72 @@ var STT = class extends import_eventemitter32.EventEmitter {
|
|
|
620
552
|
this.log("Destroyed");
|
|
621
553
|
this.removeAllListeners();
|
|
622
554
|
}
|
|
623
|
-
get extension() {
|
|
624
|
-
return this.mimeType && MIME_TYPE_TO_EXTENSION[this.mimeType] || "bin";
|
|
625
|
-
}
|
|
626
|
-
detectMimeType(chunk) {
|
|
627
|
-
if (!chunk || chunk.byteLength === 0) {
|
|
628
|
-
throw new Error("Unable to detect mime type (empty chunk)");
|
|
629
|
-
}
|
|
630
|
-
const arr = new Uint8Array(chunk);
|
|
631
|
-
if (arr[0] === 26 && arr[1] === 69 && arr[2] === 223 && arr[3] === 163) {
|
|
632
|
-
return "audio/webm";
|
|
633
|
-
}
|
|
634
|
-
if (arr[0] === 79 && arr[1] === 103 && arr[2] === 103 && arr[3] === 83) {
|
|
635
|
-
return "audio/ogg";
|
|
636
|
-
}
|
|
637
|
-
if (arr[0] === 82 && arr[1] === 73 && arr[2] === 70 && arr[3] === 70 && arr[8] === 87 && arr[9] === 65 && arr[10] === 86 && arr[11] === 69) {
|
|
638
|
-
return "audio/wav";
|
|
639
|
-
}
|
|
640
|
-
if (arr[0] === 73 && arr[1] === 68 && arr[2] === 51) {
|
|
641
|
-
return "audio/mpeg";
|
|
642
|
-
}
|
|
643
|
-
if (arr[4] === 102 && arr[5] === 116 && arr[6] === 121 && arr[7] === 112) {
|
|
644
|
-
return "audio/mp4";
|
|
645
|
-
}
|
|
646
|
-
if (arr[0] === 102 && arr[1] === 76 && arr[2] === 97 && arr[3] === 67) {
|
|
647
|
-
return "audio/flac";
|
|
648
|
-
}
|
|
649
|
-
this.log("Unable to detect mime type, using default", chunk);
|
|
650
|
-
return "audio/wav";
|
|
651
|
-
}
|
|
652
|
-
};
|
|
653
|
-
|
|
654
|
-
// src/stt/FileSTT.ts
|
|
655
|
-
var FileSTT = class extends STT {
|
|
656
|
-
transcribe(audioStream) {
|
|
657
|
-
super.transcribe(audioStream);
|
|
658
|
-
this.log("Converting stream to file...");
|
|
659
|
-
const chunks = [];
|
|
660
|
-
audioStream.on("data", (chunk) => {
|
|
661
|
-
chunks.push(chunk);
|
|
662
|
-
});
|
|
663
|
-
audioStream.on("end", async () => {
|
|
664
|
-
if (chunks.length === 0) return;
|
|
665
|
-
const arrayBuffer = Buffer.concat(chunks);
|
|
666
|
-
const file = new File([arrayBuffer], `audio.${this.extension}`, {
|
|
667
|
-
type: this.mimeType
|
|
668
|
-
});
|
|
669
|
-
this.log("Transcribing file...");
|
|
670
|
-
const transcript = await this.transcribeFile(file);
|
|
671
|
-
this.emit("Transcript", transcript);
|
|
672
|
-
});
|
|
673
|
-
}
|
|
674
555
|
};
|
|
675
556
|
|
|
676
557
|
// src/stt/MockSTT.ts
|
|
677
|
-
var MockSTT = class extends
|
|
558
|
+
var MockSTT = class extends STT {
|
|
678
559
|
constructor() {
|
|
679
560
|
super(...arguments);
|
|
680
561
|
this.i = 0;
|
|
681
562
|
}
|
|
682
|
-
async
|
|
683
|
-
|
|
563
|
+
async transcribe() {
|
|
564
|
+
setTimeout(() => {
|
|
565
|
+
this.emit("Transcript", `User Message ${this.i++}`);
|
|
566
|
+
}, 300);
|
|
567
|
+
}
|
|
568
|
+
};
|
|
569
|
+
|
|
570
|
+
// src/stt/FallbackSTT.ts
|
|
571
|
+
var import_stream3 = require("stream");
|
|
572
|
+
var FallbackSTT = class extends STT {
|
|
573
|
+
// Start at -1 because we need to increment it before using it
|
|
574
|
+
constructor(options) {
|
|
575
|
+
super();
|
|
576
|
+
this.options = options;
|
|
577
|
+
this.stt = null;
|
|
578
|
+
this.sttIndex = -1;
|
|
579
|
+
this.onTranscript = (transcript) => {
|
|
580
|
+
this.emit("Transcript", transcript);
|
|
581
|
+
};
|
|
582
|
+
this.onFailed = (chunks) => {
|
|
583
|
+
this.log("STT failed, trying next STT");
|
|
584
|
+
this.startNextSTT();
|
|
585
|
+
if (chunks.length > 0) {
|
|
586
|
+
this.log("Sending audio chunks again");
|
|
587
|
+
const stream = new import_stream3.PassThrough();
|
|
588
|
+
this.stt?.transcribe(stream);
|
|
589
|
+
chunks.forEach((chunk) => stream.write(chunk));
|
|
590
|
+
stream.end();
|
|
591
|
+
}
|
|
592
|
+
};
|
|
593
|
+
if (this.options.factories.length === 0) {
|
|
594
|
+
throw new Error("FallbackSTT: No factories provided");
|
|
595
|
+
}
|
|
596
|
+
this.startNextSTT();
|
|
597
|
+
}
|
|
598
|
+
transcribe(audioStream) {
|
|
599
|
+
this.stt?.transcribe(audioStream);
|
|
600
|
+
}
|
|
601
|
+
destroy() {
|
|
602
|
+
super.destroy();
|
|
603
|
+
this.stt?.destroy();
|
|
604
|
+
this.stt = null;
|
|
605
|
+
this.sttIndex = -1;
|
|
606
|
+
}
|
|
607
|
+
startNextSTT() {
|
|
608
|
+
this.sttIndex++;
|
|
609
|
+
if (this.sttIndex >= this.options.factories.length) {
|
|
610
|
+
this.sttIndex = 0;
|
|
611
|
+
}
|
|
612
|
+
this.stt?.destroy();
|
|
613
|
+
this.stt = this.options.factories[this.sttIndex]();
|
|
614
|
+
this.stt.on("Transcript", this.onTranscript);
|
|
615
|
+
this.stt.on("Failed", this.onFailed);
|
|
616
|
+
setTimeout(() => {
|
|
617
|
+
if (this.stt && this.logger) {
|
|
618
|
+
this.stt.logger = new Logger(this.stt.constructor.name);
|
|
619
|
+
}
|
|
620
|
+
}, 0);
|
|
684
621
|
}
|
|
685
622
|
};
|
|
686
623
|
|
|
@@ -689,7 +626,8 @@ var fs = __toESM(require("fs"));
|
|
|
689
626
|
var import_stream4 = require("stream");
|
|
690
627
|
|
|
691
628
|
// src/tts/TTS.ts
|
|
692
|
-
var
|
|
629
|
+
var import_eventemitter33 = require("eventemitter3");
|
|
630
|
+
var TTS = class extends import_eventemitter33.EventEmitter {
|
|
693
631
|
log(...message) {
|
|
694
632
|
this.logger?.log(...message);
|
|
695
633
|
}
|
|
@@ -722,6 +660,63 @@ var MockTTS = class extends TTS {
|
|
|
722
660
|
}
|
|
723
661
|
};
|
|
724
662
|
|
|
663
|
+
// src/tts/FallbackTTS.ts
|
|
664
|
+
var import_stream5 = require("stream");
|
|
665
|
+
var FallbackTTS = class extends TTS {
|
|
666
|
+
// Start at -1 because we need to increment it before using it
|
|
667
|
+
constructor(options) {
|
|
668
|
+
super();
|
|
669
|
+
this.options = options;
|
|
670
|
+
this.tts = null;
|
|
671
|
+
this.ttsIndex = -1;
|
|
672
|
+
this.onAudio = (audio) => {
|
|
673
|
+
this.emit("Audio", audio);
|
|
674
|
+
};
|
|
675
|
+
this.onFailed = (chunks) => {
|
|
676
|
+
this.log("TTS failed, trying next TTS");
|
|
677
|
+
this.startNextTTS();
|
|
678
|
+
if (chunks.length > 0) {
|
|
679
|
+
this.log("Sending text chunks again");
|
|
680
|
+
const stream = new import_stream5.PassThrough();
|
|
681
|
+
this.tts?.speak(stream);
|
|
682
|
+
chunks.forEach((chunk) => stream.write(chunk));
|
|
683
|
+
stream.end();
|
|
684
|
+
}
|
|
685
|
+
};
|
|
686
|
+
if (this.options.factories.length === 0) {
|
|
687
|
+
throw new Error("FallbackTTS: No factories provided");
|
|
688
|
+
}
|
|
689
|
+
this.startNextTTS();
|
|
690
|
+
}
|
|
691
|
+
speak(textStream) {
|
|
692
|
+
this.tts?.speak(textStream);
|
|
693
|
+
}
|
|
694
|
+
cancel() {
|
|
695
|
+
this.tts?.cancel();
|
|
696
|
+
}
|
|
697
|
+
destroy() {
|
|
698
|
+
super.destroy();
|
|
699
|
+
this.tts?.destroy();
|
|
700
|
+
this.tts = null;
|
|
701
|
+
this.ttsIndex = -1;
|
|
702
|
+
}
|
|
703
|
+
startNextTTS() {
|
|
704
|
+
this.ttsIndex++;
|
|
705
|
+
if (this.ttsIndex >= this.options.factories.length) {
|
|
706
|
+
this.ttsIndex = 0;
|
|
707
|
+
}
|
|
708
|
+
this.tts?.destroy();
|
|
709
|
+
this.tts = this.options.factories[this.ttsIndex]();
|
|
710
|
+
this.tts.on("Audio", this.onAudio);
|
|
711
|
+
this.tts.on("Failed", this.onFailed);
|
|
712
|
+
setTimeout(() => {
|
|
713
|
+
if (this.tts && this.logger) {
|
|
714
|
+
this.tts.logger = new Logger(this.tts.constructor.name);
|
|
715
|
+
}
|
|
716
|
+
}, 0);
|
|
717
|
+
}
|
|
718
|
+
};
|
|
719
|
+
|
|
725
720
|
// src/waitForParams.ts
|
|
726
721
|
async function waitForParams(socket, validate) {
|
|
727
722
|
return new Promise((resolve, reject) => {
|
|
@@ -750,7 +745,8 @@ async function waitForParams(socket, validate) {
|
|
|
750
745
|
AUTO_SEMANTIC_TURN_PROMPT,
|
|
751
746
|
AUTO_SEMANTIC_TURN_TOOL_NAME,
|
|
752
747
|
Agent,
|
|
753
|
-
|
|
748
|
+
FallbackSTT,
|
|
749
|
+
FallbackTTS,
|
|
754
750
|
Logger,
|
|
755
751
|
MicdropClientCommands,
|
|
756
752
|
MicdropError,
|
|
@@ -762,9 +758,6 @@ async function waitForParams(socket, validate) {
|
|
|
762
758
|
MockTTS,
|
|
763
759
|
STT,
|
|
764
760
|
TTS,
|
|
765
|
-
convertPCMToOpus,
|
|
766
|
-
convertToOpus,
|
|
767
|
-
convertToPCM,
|
|
768
761
|
handleError,
|
|
769
762
|
waitForParams
|
|
770
763
|
});
|