@pompeii-labs/audio 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/voice.d.mts +45 -1
- package/dist/voice.d.ts +45 -1
- package/dist/voice.js +186 -17
- package/dist/voice.mjs +183 -16
- package/package.json +2 -2
package/dist/voice.d.mts
CHANGED
|
@@ -43,6 +43,7 @@ type MagmaFlowArgs = {
|
|
|
43
43
|
onSpeechDetected: () => void;
|
|
44
44
|
onTranscription: (transcription: MagmaFlowSTTOutput) => void;
|
|
45
45
|
onAudioOutput: (audio: Buffer) => void;
|
|
46
|
+
onNormalizedAudio?: (audio: Buffer) => void;
|
|
46
47
|
config?: MagmaFlowConfig;
|
|
47
48
|
};
|
|
48
49
|
declare class MagmaFlow {
|
|
@@ -51,6 +52,7 @@ declare class MagmaFlow {
|
|
|
51
52
|
private inputFormat;
|
|
52
53
|
private outputFormat;
|
|
53
54
|
private onAudioOutput;
|
|
55
|
+
private onNormalizedAudio?;
|
|
54
56
|
private textBuffer;
|
|
55
57
|
private textQueue;
|
|
56
58
|
private generatingAudio;
|
|
@@ -107,6 +109,36 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
|
|
|
107
109
|
private computeTurns;
|
|
108
110
|
}
|
|
109
111
|
|
|
112
|
+
type DeepgramFluxConfig = {
|
|
113
|
+
apiKey?: string;
|
|
114
|
+
eotThreshold?: number;
|
|
115
|
+
eagerEotThreshold?: number;
|
|
116
|
+
eotTimeoutMs?: number;
|
|
117
|
+
};
|
|
118
|
+
declare class DeepgramFluxSTT extends MagmaFlowSpeechToText {
|
|
119
|
+
private ws;
|
|
120
|
+
private apiKey;
|
|
121
|
+
private eotThreshold;
|
|
122
|
+
private eagerEotThreshold?;
|
|
123
|
+
private eotTimeoutMs;
|
|
124
|
+
private audioQueue;
|
|
125
|
+
private connecting;
|
|
126
|
+
private killed;
|
|
127
|
+
private reconnectAttempts;
|
|
128
|
+
private reconnectTimer?;
|
|
129
|
+
constructor(config?: DeepgramFluxConfig);
|
|
130
|
+
private connect;
|
|
131
|
+
private scheduleReconnect;
|
|
132
|
+
private clearConnection;
|
|
133
|
+
private handleMessage;
|
|
134
|
+
private handleTurnInfo;
|
|
135
|
+
input(audio: Buffer): void;
|
|
136
|
+
flush(): void;
|
|
137
|
+
kill(): void;
|
|
138
|
+
onEagerEndOfTurn(transcript: string): void;
|
|
139
|
+
onTurnResumed(): void;
|
|
140
|
+
}
|
|
141
|
+
|
|
110
142
|
declare enum GladiaModel {
|
|
111
143
|
SOLARIA_1 = "solaria-1"
|
|
112
144
|
}
|
|
@@ -278,9 +310,21 @@ declare class ElevenLabsTTS extends MagmaFlowTextToSpeech {
|
|
|
278
310
|
|
|
279
311
|
type HumeTTSArgs = {
|
|
280
312
|
client?: HumeClient;
|
|
313
|
+
voice?: {
|
|
314
|
+
name?: string;
|
|
315
|
+
id?: string;
|
|
316
|
+
provider?: string;
|
|
317
|
+
};
|
|
318
|
+
description?: string;
|
|
319
|
+
speed?: number;
|
|
320
|
+
version?: string;
|
|
281
321
|
};
|
|
282
322
|
declare class HumeTTS extends MagmaFlowTextToSpeech {
|
|
283
323
|
private client;
|
|
324
|
+
private voice?;
|
|
325
|
+
private description?;
|
|
326
|
+
private speed?;
|
|
327
|
+
private version?;
|
|
284
328
|
constructor(args: HumeTTSArgs);
|
|
285
329
|
setup(): Promise<void>;
|
|
286
330
|
input(text: string | null, requestId: string): void;
|
|
@@ -302,4 +346,4 @@ declare class WhisperTTS extends MagmaFlowTextToSpeech {
|
|
|
302
346
|
|
|
303
347
|
declare function splitTextIntoChunks(text: string, targetLength?: number): string[];
|
|
304
348
|
|
|
305
|
-
export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
|
|
349
|
+
export { type DeepgramFluxConfig, DeepgramFluxSTT, DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
|
package/dist/voice.d.ts
CHANGED
|
@@ -43,6 +43,7 @@ type MagmaFlowArgs = {
|
|
|
43
43
|
onSpeechDetected: () => void;
|
|
44
44
|
onTranscription: (transcription: MagmaFlowSTTOutput) => void;
|
|
45
45
|
onAudioOutput: (audio: Buffer) => void;
|
|
46
|
+
onNormalizedAudio?: (audio: Buffer) => void;
|
|
46
47
|
config?: MagmaFlowConfig;
|
|
47
48
|
};
|
|
48
49
|
declare class MagmaFlow {
|
|
@@ -51,6 +52,7 @@ declare class MagmaFlow {
|
|
|
51
52
|
private inputFormat;
|
|
52
53
|
private outputFormat;
|
|
53
54
|
private onAudioOutput;
|
|
55
|
+
private onNormalizedAudio?;
|
|
54
56
|
private textBuffer;
|
|
55
57
|
private textQueue;
|
|
56
58
|
private generatingAudio;
|
|
@@ -107,6 +109,36 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
|
|
|
107
109
|
private computeTurns;
|
|
108
110
|
}
|
|
109
111
|
|
|
112
|
+
type DeepgramFluxConfig = {
|
|
113
|
+
apiKey?: string;
|
|
114
|
+
eotThreshold?: number;
|
|
115
|
+
eagerEotThreshold?: number;
|
|
116
|
+
eotTimeoutMs?: number;
|
|
117
|
+
};
|
|
118
|
+
declare class DeepgramFluxSTT extends MagmaFlowSpeechToText {
|
|
119
|
+
private ws;
|
|
120
|
+
private apiKey;
|
|
121
|
+
private eotThreshold;
|
|
122
|
+
private eagerEotThreshold?;
|
|
123
|
+
private eotTimeoutMs;
|
|
124
|
+
private audioQueue;
|
|
125
|
+
private connecting;
|
|
126
|
+
private killed;
|
|
127
|
+
private reconnectAttempts;
|
|
128
|
+
private reconnectTimer?;
|
|
129
|
+
constructor(config?: DeepgramFluxConfig);
|
|
130
|
+
private connect;
|
|
131
|
+
private scheduleReconnect;
|
|
132
|
+
private clearConnection;
|
|
133
|
+
private handleMessage;
|
|
134
|
+
private handleTurnInfo;
|
|
135
|
+
input(audio: Buffer): void;
|
|
136
|
+
flush(): void;
|
|
137
|
+
kill(): void;
|
|
138
|
+
onEagerEndOfTurn(transcript: string): void;
|
|
139
|
+
onTurnResumed(): void;
|
|
140
|
+
}
|
|
141
|
+
|
|
110
142
|
declare enum GladiaModel {
|
|
111
143
|
SOLARIA_1 = "solaria-1"
|
|
112
144
|
}
|
|
@@ -278,9 +310,21 @@ declare class ElevenLabsTTS extends MagmaFlowTextToSpeech {
|
|
|
278
310
|
|
|
279
311
|
type HumeTTSArgs = {
|
|
280
312
|
client?: HumeClient;
|
|
313
|
+
voice?: {
|
|
314
|
+
name?: string;
|
|
315
|
+
id?: string;
|
|
316
|
+
provider?: string;
|
|
317
|
+
};
|
|
318
|
+
description?: string;
|
|
319
|
+
speed?: number;
|
|
320
|
+
version?: string;
|
|
281
321
|
};
|
|
282
322
|
declare class HumeTTS extends MagmaFlowTextToSpeech {
|
|
283
323
|
private client;
|
|
324
|
+
private voice?;
|
|
325
|
+
private description?;
|
|
326
|
+
private speed?;
|
|
327
|
+
private version?;
|
|
284
328
|
constructor(args: HumeTTSArgs);
|
|
285
329
|
setup(): Promise<void>;
|
|
286
330
|
input(text: string | null, requestId: string): void;
|
|
@@ -302,4 +346,4 @@ declare class WhisperTTS extends MagmaFlowTextToSpeech {
|
|
|
302
346
|
|
|
303
347
|
declare function splitTextIntoChunks(text: string, targetLength?: number): string[];
|
|
304
348
|
|
|
305
|
-
export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
|
|
349
|
+
export { type DeepgramFluxConfig, DeepgramFluxSTT, DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
|
package/dist/voice.js
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
var sdk = require('@deepgram/sdk');
|
|
4
|
-
var ws = require('ws');
|
|
4
|
+
var WebSocket2 = require('ws');
|
|
5
5
|
var hume = require('hume');
|
|
6
6
|
var OpenAI = require('openai');
|
|
7
7
|
|
|
8
8
|
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
9
9
|
|
|
10
|
+
var WebSocket2__default = /*#__PURE__*/_interopDefault(WebSocket2);
|
|
10
11
|
var OpenAI__default = /*#__PURE__*/_interopDefault(OpenAI);
|
|
11
12
|
|
|
12
13
|
// src/helpers/bufferToInt16Array.ts
|
|
@@ -424,6 +425,7 @@ var MagmaFlow = class {
|
|
|
424
425
|
inputFormat;
|
|
425
426
|
outputFormat;
|
|
426
427
|
onAudioOutput;
|
|
428
|
+
onNormalizedAudio;
|
|
427
429
|
textBuffer = "";
|
|
428
430
|
textQueue = [];
|
|
429
431
|
generatingAudio = false;
|
|
@@ -440,6 +442,7 @@ var MagmaFlow = class {
|
|
|
440
442
|
this.inputFormat = args.inputFormat;
|
|
441
443
|
this.outputFormat = args.outputFormat;
|
|
442
444
|
this.onAudioOutput = args.onAudioOutput;
|
|
445
|
+
this.onNormalizedAudio = args.onNormalizedAudio;
|
|
443
446
|
this.config = { ...this.config, ...args.config };
|
|
444
447
|
this.tts.onOutput = (audio, requestId) => {
|
|
445
448
|
if (this.currentRequestId !== requestId) {
|
|
@@ -478,7 +481,9 @@ var MagmaFlow = class {
|
|
|
478
481
|
inputAudio(audio) {
|
|
479
482
|
const decodedAudio = decodeToPcm(audio, this.inputFormat.encoding);
|
|
480
483
|
const resampledPCM = resamplePcm(decodedAudio, this.inputFormat.sampleRate, 48e3);
|
|
481
|
-
|
|
484
|
+
const pcmBuffer = int16ArrayToBuffer(resampledPCM);
|
|
485
|
+
this.onNormalizedAudio?.(pcmBuffer);
|
|
486
|
+
this.stt.input(pcmBuffer);
|
|
482
487
|
}
|
|
483
488
|
inputText(text) {
|
|
484
489
|
if (text === void 0 || text === null) {
|
|
@@ -717,6 +722,156 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
|
717
722
|
}
|
|
718
723
|
}
|
|
719
724
|
};
|
|
725
|
+
var kReconnectBaseMs = 500;
|
|
726
|
+
var kReconnectMaxMs = 1e4;
|
|
727
|
+
var DeepgramFluxSTT = class extends MagmaFlowSpeechToText {
|
|
728
|
+
ws = null;
|
|
729
|
+
apiKey;
|
|
730
|
+
eotThreshold;
|
|
731
|
+
eagerEotThreshold;
|
|
732
|
+
eotTimeoutMs;
|
|
733
|
+
audioQueue = [];
|
|
734
|
+
connecting = false;
|
|
735
|
+
killed = false;
|
|
736
|
+
reconnectAttempts = 0;
|
|
737
|
+
reconnectTimer;
|
|
738
|
+
constructor(config) {
|
|
739
|
+
super();
|
|
740
|
+
this.apiKey = config?.apiKey ?? process.env.DEEPGRAM_API_KEY;
|
|
741
|
+
this.eotThreshold = config?.eotThreshold ?? 0.7;
|
|
742
|
+
this.eagerEotThreshold = config?.eagerEotThreshold;
|
|
743
|
+
this.eotTimeoutMs = config?.eotTimeoutMs ?? 5e3;
|
|
744
|
+
}
|
|
745
|
+
connect() {
|
|
746
|
+
if (this.connecting || this.killed) return;
|
|
747
|
+
this.connecting = true;
|
|
748
|
+
const params = new URLSearchParams({
|
|
749
|
+
model: "flux-general-en",
|
|
750
|
+
encoding: "linear16",
|
|
751
|
+
sample_rate: "48000",
|
|
752
|
+
eot_threshold: this.eotThreshold.toString(),
|
|
753
|
+
eot_timeout_ms: this.eotTimeoutMs.toString()
|
|
754
|
+
});
|
|
755
|
+
if (this.eagerEotThreshold !== void 0) {
|
|
756
|
+
params.set("eager_eot_threshold", this.eagerEotThreshold.toString());
|
|
757
|
+
}
|
|
758
|
+
const url = `wss://api.deepgram.com/v2/listen?${params.toString()}`;
|
|
759
|
+
this.ws = new WebSocket2__default.default(url, {
|
|
760
|
+
headers: { Authorization: `Token ${this.apiKey}` }
|
|
761
|
+
});
|
|
762
|
+
this.ws.on("open", () => {
|
|
763
|
+
console.log("[DeepgramFlux] Connected");
|
|
764
|
+
this.connecting = false;
|
|
765
|
+
this.reconnectAttempts = 0;
|
|
766
|
+
for (const queued of this.audioQueue) {
|
|
767
|
+
this.ws.send(queued);
|
|
768
|
+
}
|
|
769
|
+
this.audioQueue = [];
|
|
770
|
+
});
|
|
771
|
+
this.ws.on("message", (data) => {
|
|
772
|
+
try {
|
|
773
|
+
const msg = JSON.parse(data.toString());
|
|
774
|
+
this.handleMessage(msg);
|
|
775
|
+
} catch (err) {
|
|
776
|
+
console.error(`[DeepgramFlux] Parse error: ${err.message}`);
|
|
777
|
+
}
|
|
778
|
+
});
|
|
779
|
+
this.ws.on("error", (err) => {
|
|
780
|
+
console.error(`[DeepgramFlux] Error: ${err.message}`);
|
|
781
|
+
this.connecting = false;
|
|
782
|
+
});
|
|
783
|
+
this.ws.on("close", (code, reason) => {
|
|
784
|
+
console.log(`[DeepgramFlux] Closed: ${code} ${reason.toString()}`);
|
|
785
|
+
this.clearConnection();
|
|
786
|
+
if (!this.killed) {
|
|
787
|
+
this.scheduleReconnect();
|
|
788
|
+
}
|
|
789
|
+
});
|
|
790
|
+
}
|
|
791
|
+
scheduleReconnect() {
|
|
792
|
+
const delay = Math.min(
|
|
793
|
+
kReconnectBaseMs * Math.pow(2, this.reconnectAttempts),
|
|
794
|
+
kReconnectMaxMs
|
|
795
|
+
);
|
|
796
|
+
this.reconnectAttempts++;
|
|
797
|
+
console.log(
|
|
798
|
+
`[DeepgramFlux] Reconnecting in ${delay}ms (attempt ${this.reconnectAttempts})`
|
|
799
|
+
);
|
|
800
|
+
this.reconnectTimer = setTimeout(() => {
|
|
801
|
+
this.reconnectTimer = void 0;
|
|
802
|
+
if (!this.killed && !this.ws) {
|
|
803
|
+
this.connect();
|
|
804
|
+
}
|
|
805
|
+
}, delay);
|
|
806
|
+
}
|
|
807
|
+
clearConnection() {
|
|
808
|
+
this.ws = null;
|
|
809
|
+
this.connecting = false;
|
|
810
|
+
}
|
|
811
|
+
handleMessage(msg) {
|
|
812
|
+
switch (msg.type) {
|
|
813
|
+
case "TurnInfo":
|
|
814
|
+
this.handleTurnInfo(msg);
|
|
815
|
+
break;
|
|
816
|
+
case "Connected":
|
|
817
|
+
break;
|
|
818
|
+
case "Error":
|
|
819
|
+
console.error(`[DeepgramFlux] Server error: ${JSON.stringify(msg)}`);
|
|
820
|
+
break;
|
|
821
|
+
default:
|
|
822
|
+
console.log(`[DeepgramFlux] Unknown message: ${JSON.stringify(msg)}`);
|
|
823
|
+
break;
|
|
824
|
+
}
|
|
825
|
+
}
|
|
826
|
+
handleTurnInfo(msg) {
|
|
827
|
+
switch (msg.event) {
|
|
828
|
+
case "StartOfTurn":
|
|
829
|
+
this.onSpeechDetected();
|
|
830
|
+
break;
|
|
831
|
+
case "EndOfTurn":
|
|
832
|
+
this.onOutput({ text: msg.transcript });
|
|
833
|
+
break;
|
|
834
|
+
case "EagerEndOfTurn":
|
|
835
|
+
this.onEagerEndOfTurn(msg.transcript);
|
|
836
|
+
break;
|
|
837
|
+
case "TurnResumed":
|
|
838
|
+
this.onTurnResumed();
|
|
839
|
+
break;
|
|
840
|
+
}
|
|
841
|
+
}
|
|
842
|
+
input(audio) {
|
|
843
|
+
if (!this.ws && !this.connecting && !this.reconnectTimer) {
|
|
844
|
+
this.connect();
|
|
845
|
+
}
|
|
846
|
+
if (this.ws?.readyState === WebSocket2__default.default.OPEN) {
|
|
847
|
+
this.ws.send(audio);
|
|
848
|
+
} else {
|
|
849
|
+
this.audioQueue.push(audio);
|
|
850
|
+
}
|
|
851
|
+
}
|
|
852
|
+
flush() {
|
|
853
|
+
if (this.ws?.readyState === WebSocket2__default.default.OPEN) {
|
|
854
|
+
this.ws.send(JSON.stringify({ type: "Finalize" }));
|
|
855
|
+
}
|
|
856
|
+
}
|
|
857
|
+
kill() {
|
|
858
|
+
this.killed = true;
|
|
859
|
+
if (this.reconnectTimer) {
|
|
860
|
+
clearTimeout(this.reconnectTimer);
|
|
861
|
+
this.reconnectTimer = void 0;
|
|
862
|
+
}
|
|
863
|
+
if (this.ws?.readyState === WebSocket2__default.default.OPEN) {
|
|
864
|
+
this.ws.send(JSON.stringify({ type: "CloseStream" }));
|
|
865
|
+
this.ws.close();
|
|
866
|
+
}
|
|
867
|
+
this.clearConnection();
|
|
868
|
+
this.audioQueue = [];
|
|
869
|
+
}
|
|
870
|
+
onEagerEndOfTurn(transcript) {
|
|
871
|
+
}
|
|
872
|
+
onTurnResumed() {
|
|
873
|
+
}
|
|
874
|
+
};
|
|
720
875
|
var NATIVE_WEBSOCKET_AVAILABLE = typeof WebSocket !== "undefined";
|
|
721
876
|
var DummyWebSocket = class {
|
|
722
877
|
url;
|
|
@@ -745,7 +900,7 @@ var QueueWebSocket = class {
|
|
|
745
900
|
if (NATIVE_WEBSOCKET_AVAILABLE) {
|
|
746
901
|
this.ws = new WebSocket(this.url);
|
|
747
902
|
} else {
|
|
748
|
-
this.ws = new ws.WebSocket(this.url);
|
|
903
|
+
this.ws = new WebSocket2.WebSocket(this.url);
|
|
749
904
|
}
|
|
750
905
|
} else {
|
|
751
906
|
this.ws = new DummyWebSocket(null);
|
|
@@ -759,7 +914,7 @@ var QueueWebSocket = class {
|
|
|
759
914
|
if (NATIVE_WEBSOCKET_AVAILABLE) {
|
|
760
915
|
this.ws = new WebSocket(this.url);
|
|
761
916
|
} else {
|
|
762
|
-
this.ws = new ws.WebSocket(this.url);
|
|
917
|
+
this.ws = new WebSocket2.WebSocket(this.url);
|
|
763
918
|
}
|
|
764
919
|
} else {
|
|
765
920
|
this.ws = new DummyWebSocket(null);
|
|
@@ -1084,7 +1239,6 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
|
|
|
1084
1239
|
).then(async (response) => {
|
|
1085
1240
|
const reader = response.body?.getReader();
|
|
1086
1241
|
if (!reader) return;
|
|
1087
|
-
new TextDecoder();
|
|
1088
1242
|
while (true) {
|
|
1089
1243
|
const { done, value } = await reader.read();
|
|
1090
1244
|
if (done) break;
|
|
@@ -1101,9 +1255,17 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
|
|
|
1101
1255
|
};
|
|
1102
1256
|
var HumeTTS = class extends MagmaFlowTextToSpeech {
|
|
1103
1257
|
client;
|
|
1258
|
+
voice;
|
|
1259
|
+
description;
|
|
1260
|
+
speed;
|
|
1261
|
+
version;
|
|
1104
1262
|
constructor(args) {
|
|
1105
1263
|
super();
|
|
1106
1264
|
this.client = args.client ?? new hume.HumeClient({ apiKey: process.env.HUME_API_KEY });
|
|
1265
|
+
this.voice = args.voice;
|
|
1266
|
+
this.description = args.description;
|
|
1267
|
+
this.speed = args.speed;
|
|
1268
|
+
this.version = args.version;
|
|
1107
1269
|
}
|
|
1108
1270
|
async setup() {
|
|
1109
1271
|
}
|
|
@@ -1111,22 +1273,28 @@ var HumeTTS = class extends MagmaFlowTextToSpeech {
|
|
|
1111
1273
|
if (!text) {
|
|
1112
1274
|
return;
|
|
1113
1275
|
}
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
],
|
|
1120
|
-
format: {
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1276
|
+
const utterance = { text };
|
|
1277
|
+
if (this.voice) utterance.voice = this.voice;
|
|
1278
|
+
if (this.description) utterance.description = this.description;
|
|
1279
|
+
if (this.speed !== void 0) utterance.speed = this.speed;
|
|
1280
|
+
const params = {
|
|
1281
|
+
utterances: [utterance],
|
|
1282
|
+
format: { type: "pcm" },
|
|
1283
|
+
instantMode: true,
|
|
1284
|
+
stripHeaders: true
|
|
1285
|
+
};
|
|
1286
|
+
if (this.version) params.version = this.version;
|
|
1287
|
+
this.client.tts.synthesizeJsonStreaming(params).then(async (stream) => {
|
|
1125
1288
|
for await (const chunk of stream) {
|
|
1126
|
-
|
|
1289
|
+
if (chunk.type === "audio") {
|
|
1290
|
+
this.onOutput(Buffer.from(chunk.audio, "base64"), requestId);
|
|
1291
|
+
}
|
|
1127
1292
|
}
|
|
1128
1293
|
this.onOutput(null, requestId);
|
|
1129
1294
|
console.log("[Hume] Finished:", text);
|
|
1295
|
+
}).catch((err) => {
|
|
1296
|
+
console.error(`[Hume] Error: ${err.message}`);
|
|
1297
|
+
this.onOutput(null, requestId);
|
|
1130
1298
|
});
|
|
1131
1299
|
}
|
|
1132
1300
|
kill() {
|
|
@@ -1169,6 +1337,7 @@ var WhisperTTS = class extends MagmaFlowTextToSpeech {
|
|
|
1169
1337
|
}
|
|
1170
1338
|
};
|
|
1171
1339
|
|
|
1340
|
+
exports.DeepgramFluxSTT = DeepgramFluxSTT;
|
|
1172
1341
|
exports.DeepgramLanguage = DeepgramLanguage;
|
|
1173
1342
|
exports.DeepgramModel = DeepgramModel;
|
|
1174
1343
|
exports.DeepgramSTT = DeepgramSTT;
|
package/dist/voice.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { DeepgramClient, LiveTranscriptionEvents } from '@deepgram/sdk';
|
|
2
|
-
import { WebSocket as WebSocket$1 } from 'ws';
|
|
2
|
+
import WebSocket2, { WebSocket as WebSocket$1 } from 'ws';
|
|
3
3
|
import { HumeClient } from 'hume';
|
|
4
4
|
import OpenAI from 'openai';
|
|
5
5
|
|
|
@@ -418,6 +418,7 @@ var MagmaFlow = class {
|
|
|
418
418
|
inputFormat;
|
|
419
419
|
outputFormat;
|
|
420
420
|
onAudioOutput;
|
|
421
|
+
onNormalizedAudio;
|
|
421
422
|
textBuffer = "";
|
|
422
423
|
textQueue = [];
|
|
423
424
|
generatingAudio = false;
|
|
@@ -434,6 +435,7 @@ var MagmaFlow = class {
|
|
|
434
435
|
this.inputFormat = args.inputFormat;
|
|
435
436
|
this.outputFormat = args.outputFormat;
|
|
436
437
|
this.onAudioOutput = args.onAudioOutput;
|
|
438
|
+
this.onNormalizedAudio = args.onNormalizedAudio;
|
|
437
439
|
this.config = { ...this.config, ...args.config };
|
|
438
440
|
this.tts.onOutput = (audio, requestId) => {
|
|
439
441
|
if (this.currentRequestId !== requestId) {
|
|
@@ -472,7 +474,9 @@ var MagmaFlow = class {
|
|
|
472
474
|
inputAudio(audio) {
|
|
473
475
|
const decodedAudio = decodeToPcm(audio, this.inputFormat.encoding);
|
|
474
476
|
const resampledPCM = resamplePcm(decodedAudio, this.inputFormat.sampleRate, 48e3);
|
|
475
|
-
|
|
477
|
+
const pcmBuffer = int16ArrayToBuffer(resampledPCM);
|
|
478
|
+
this.onNormalizedAudio?.(pcmBuffer);
|
|
479
|
+
this.stt.input(pcmBuffer);
|
|
476
480
|
}
|
|
477
481
|
inputText(text) {
|
|
478
482
|
if (text === void 0 || text === null) {
|
|
@@ -711,6 +715,156 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
|
711
715
|
}
|
|
712
716
|
}
|
|
713
717
|
};
|
|
718
|
+
var kReconnectBaseMs = 500;
|
|
719
|
+
var kReconnectMaxMs = 1e4;
|
|
720
|
+
var DeepgramFluxSTT = class extends MagmaFlowSpeechToText {
|
|
721
|
+
ws = null;
|
|
722
|
+
apiKey;
|
|
723
|
+
eotThreshold;
|
|
724
|
+
eagerEotThreshold;
|
|
725
|
+
eotTimeoutMs;
|
|
726
|
+
audioQueue = [];
|
|
727
|
+
connecting = false;
|
|
728
|
+
killed = false;
|
|
729
|
+
reconnectAttempts = 0;
|
|
730
|
+
reconnectTimer;
|
|
731
|
+
constructor(config) {
|
|
732
|
+
super();
|
|
733
|
+
this.apiKey = config?.apiKey ?? process.env.DEEPGRAM_API_KEY;
|
|
734
|
+
this.eotThreshold = config?.eotThreshold ?? 0.7;
|
|
735
|
+
this.eagerEotThreshold = config?.eagerEotThreshold;
|
|
736
|
+
this.eotTimeoutMs = config?.eotTimeoutMs ?? 5e3;
|
|
737
|
+
}
|
|
738
|
+
connect() {
|
|
739
|
+
if (this.connecting || this.killed) return;
|
|
740
|
+
this.connecting = true;
|
|
741
|
+
const params = new URLSearchParams({
|
|
742
|
+
model: "flux-general-en",
|
|
743
|
+
encoding: "linear16",
|
|
744
|
+
sample_rate: "48000",
|
|
745
|
+
eot_threshold: this.eotThreshold.toString(),
|
|
746
|
+
eot_timeout_ms: this.eotTimeoutMs.toString()
|
|
747
|
+
});
|
|
748
|
+
if (this.eagerEotThreshold !== void 0) {
|
|
749
|
+
params.set("eager_eot_threshold", this.eagerEotThreshold.toString());
|
|
750
|
+
}
|
|
751
|
+
const url = `wss://api.deepgram.com/v2/listen?${params.toString()}`;
|
|
752
|
+
this.ws = new WebSocket2(url, {
|
|
753
|
+
headers: { Authorization: `Token ${this.apiKey}` }
|
|
754
|
+
});
|
|
755
|
+
this.ws.on("open", () => {
|
|
756
|
+
console.log("[DeepgramFlux] Connected");
|
|
757
|
+
this.connecting = false;
|
|
758
|
+
this.reconnectAttempts = 0;
|
|
759
|
+
for (const queued of this.audioQueue) {
|
|
760
|
+
this.ws.send(queued);
|
|
761
|
+
}
|
|
762
|
+
this.audioQueue = [];
|
|
763
|
+
});
|
|
764
|
+
this.ws.on("message", (data) => {
|
|
765
|
+
try {
|
|
766
|
+
const msg = JSON.parse(data.toString());
|
|
767
|
+
this.handleMessage(msg);
|
|
768
|
+
} catch (err) {
|
|
769
|
+
console.error(`[DeepgramFlux] Parse error: ${err.message}`);
|
|
770
|
+
}
|
|
771
|
+
});
|
|
772
|
+
this.ws.on("error", (err) => {
|
|
773
|
+
console.error(`[DeepgramFlux] Error: ${err.message}`);
|
|
774
|
+
this.connecting = false;
|
|
775
|
+
});
|
|
776
|
+
this.ws.on("close", (code, reason) => {
|
|
777
|
+
console.log(`[DeepgramFlux] Closed: ${code} ${reason.toString()}`);
|
|
778
|
+
this.clearConnection();
|
|
779
|
+
if (!this.killed) {
|
|
780
|
+
this.scheduleReconnect();
|
|
781
|
+
}
|
|
782
|
+
});
|
|
783
|
+
}
|
|
784
|
+
scheduleReconnect() {
|
|
785
|
+
const delay = Math.min(
|
|
786
|
+
kReconnectBaseMs * Math.pow(2, this.reconnectAttempts),
|
|
787
|
+
kReconnectMaxMs
|
|
788
|
+
);
|
|
789
|
+
this.reconnectAttempts++;
|
|
790
|
+
console.log(
|
|
791
|
+
`[DeepgramFlux] Reconnecting in ${delay}ms (attempt ${this.reconnectAttempts})`
|
|
792
|
+
);
|
|
793
|
+
this.reconnectTimer = setTimeout(() => {
|
|
794
|
+
this.reconnectTimer = void 0;
|
|
795
|
+
if (!this.killed && !this.ws) {
|
|
796
|
+
this.connect();
|
|
797
|
+
}
|
|
798
|
+
}, delay);
|
|
799
|
+
}
|
|
800
|
+
clearConnection() {
|
|
801
|
+
this.ws = null;
|
|
802
|
+
this.connecting = false;
|
|
803
|
+
}
|
|
804
|
+
handleMessage(msg) {
|
|
805
|
+
switch (msg.type) {
|
|
806
|
+
case "TurnInfo":
|
|
807
|
+
this.handleTurnInfo(msg);
|
|
808
|
+
break;
|
|
809
|
+
case "Connected":
|
|
810
|
+
break;
|
|
811
|
+
case "Error":
|
|
812
|
+
console.error(`[DeepgramFlux] Server error: ${JSON.stringify(msg)}`);
|
|
813
|
+
break;
|
|
814
|
+
default:
|
|
815
|
+
console.log(`[DeepgramFlux] Unknown message: ${JSON.stringify(msg)}`);
|
|
816
|
+
break;
|
|
817
|
+
}
|
|
818
|
+
}
|
|
819
|
+
handleTurnInfo(msg) {
|
|
820
|
+
switch (msg.event) {
|
|
821
|
+
case "StartOfTurn":
|
|
822
|
+
this.onSpeechDetected();
|
|
823
|
+
break;
|
|
824
|
+
case "EndOfTurn":
|
|
825
|
+
this.onOutput({ text: msg.transcript });
|
|
826
|
+
break;
|
|
827
|
+
case "EagerEndOfTurn":
|
|
828
|
+
this.onEagerEndOfTurn(msg.transcript);
|
|
829
|
+
break;
|
|
830
|
+
case "TurnResumed":
|
|
831
|
+
this.onTurnResumed();
|
|
832
|
+
break;
|
|
833
|
+
}
|
|
834
|
+
}
|
|
835
|
+
input(audio) {
|
|
836
|
+
if (!this.ws && !this.connecting && !this.reconnectTimer) {
|
|
837
|
+
this.connect();
|
|
838
|
+
}
|
|
839
|
+
if (this.ws?.readyState === WebSocket2.OPEN) {
|
|
840
|
+
this.ws.send(audio);
|
|
841
|
+
} else {
|
|
842
|
+
this.audioQueue.push(audio);
|
|
843
|
+
}
|
|
844
|
+
}
|
|
845
|
+
flush() {
|
|
846
|
+
if (this.ws?.readyState === WebSocket2.OPEN) {
|
|
847
|
+
this.ws.send(JSON.stringify({ type: "Finalize" }));
|
|
848
|
+
}
|
|
849
|
+
}
|
|
850
|
+
kill() {
|
|
851
|
+
this.killed = true;
|
|
852
|
+
if (this.reconnectTimer) {
|
|
853
|
+
clearTimeout(this.reconnectTimer);
|
|
854
|
+
this.reconnectTimer = void 0;
|
|
855
|
+
}
|
|
856
|
+
if (this.ws?.readyState === WebSocket2.OPEN) {
|
|
857
|
+
this.ws.send(JSON.stringify({ type: "CloseStream" }));
|
|
858
|
+
this.ws.close();
|
|
859
|
+
}
|
|
860
|
+
this.clearConnection();
|
|
861
|
+
this.audioQueue = [];
|
|
862
|
+
}
|
|
863
|
+
onEagerEndOfTurn(transcript) {
|
|
864
|
+
}
|
|
865
|
+
onTurnResumed() {
|
|
866
|
+
}
|
|
867
|
+
};
|
|
714
868
|
var NATIVE_WEBSOCKET_AVAILABLE = typeof WebSocket !== "undefined";
|
|
715
869
|
var DummyWebSocket = class {
|
|
716
870
|
url;
|
|
@@ -1078,7 +1232,6 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
|
|
|
1078
1232
|
).then(async (response) => {
|
|
1079
1233
|
const reader = response.body?.getReader();
|
|
1080
1234
|
if (!reader) return;
|
|
1081
|
-
new TextDecoder();
|
|
1082
1235
|
while (true) {
|
|
1083
1236
|
const { done, value } = await reader.read();
|
|
1084
1237
|
if (done) break;
|
|
@@ -1095,9 +1248,17 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
|
|
|
1095
1248
|
};
|
|
1096
1249
|
var HumeTTS = class extends MagmaFlowTextToSpeech {
|
|
1097
1250
|
client;
|
|
1251
|
+
voice;
|
|
1252
|
+
description;
|
|
1253
|
+
speed;
|
|
1254
|
+
version;
|
|
1098
1255
|
constructor(args) {
|
|
1099
1256
|
super();
|
|
1100
1257
|
this.client = args.client ?? new HumeClient({ apiKey: process.env.HUME_API_KEY });
|
|
1258
|
+
this.voice = args.voice;
|
|
1259
|
+
this.description = args.description;
|
|
1260
|
+
this.speed = args.speed;
|
|
1261
|
+
this.version = args.version;
|
|
1101
1262
|
}
|
|
1102
1263
|
async setup() {
|
|
1103
1264
|
}
|
|
@@ -1105,22 +1266,28 @@ var HumeTTS = class extends MagmaFlowTextToSpeech {
|
|
|
1105
1266
|
if (!text) {
|
|
1106
1267
|
return;
|
|
1107
1268
|
}
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
],
|
|
1114
|
-
format: {
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1269
|
+
const utterance = { text };
|
|
1270
|
+
if (this.voice) utterance.voice = this.voice;
|
|
1271
|
+
if (this.description) utterance.description = this.description;
|
|
1272
|
+
if (this.speed !== void 0) utterance.speed = this.speed;
|
|
1273
|
+
const params = {
|
|
1274
|
+
utterances: [utterance],
|
|
1275
|
+
format: { type: "pcm" },
|
|
1276
|
+
instantMode: true,
|
|
1277
|
+
stripHeaders: true
|
|
1278
|
+
};
|
|
1279
|
+
if (this.version) params.version = this.version;
|
|
1280
|
+
this.client.tts.synthesizeJsonStreaming(params).then(async (stream) => {
|
|
1119
1281
|
for await (const chunk of stream) {
|
|
1120
|
-
|
|
1282
|
+
if (chunk.type === "audio") {
|
|
1283
|
+
this.onOutput(Buffer.from(chunk.audio, "base64"), requestId);
|
|
1284
|
+
}
|
|
1121
1285
|
}
|
|
1122
1286
|
this.onOutput(null, requestId);
|
|
1123
1287
|
console.log("[Hume] Finished:", text);
|
|
1288
|
+
}).catch((err) => {
|
|
1289
|
+
console.error(`[Hume] Error: ${err.message}`);
|
|
1290
|
+
this.onOutput(null, requestId);
|
|
1124
1291
|
});
|
|
1125
1292
|
}
|
|
1126
1293
|
kill() {
|
|
@@ -1163,4 +1330,4 @@ var WhisperTTS = class extends MagmaFlowTextToSpeech {
|
|
|
1163
1330
|
}
|
|
1164
1331
|
};
|
|
1165
1332
|
|
|
1166
|
-
export { DeepgramLanguage, DeepgramModel, DeepgramSTT, DeepgramTTS, ElevenLabsTTS, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, HumeTTS, MagmaFlow, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, WhisperTTS, splitTextIntoChunks };
|
|
1333
|
+
export { DeepgramFluxSTT, DeepgramLanguage, DeepgramModel, DeepgramSTT, DeepgramTTS, ElevenLabsTTS, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, HumeTTS, MagmaFlow, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, WhisperTTS, splitTextIntoChunks };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pompeii-labs/audio",
|
|
3
|
-
"version": "0.2.1",
|
|
3
|
+
"version": "0.3.1",
|
|
4
4
|
"description": "The Audio SDK from Pompeii Labs",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"Pompeii",
|
|
@@ -42,7 +42,7 @@
|
|
|
42
42
|
},
|
|
43
43
|
"dependencies": {
|
|
44
44
|
"@deepgram/sdk": "4.2.0",
|
|
45
|
-
"hume": "0.
|
|
45
|
+
"hume": "0.15.13",
|
|
46
46
|
"openai": "4.86.2"
|
|
47
47
|
},
|
|
48
48
|
"devDependencies": {
|