@pompeii-labs/audio 0.2.2 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/voice.d.mts +45 -1
- package/dist/voice.d.ts +45 -1
- package/dist/voice.js +191 -17
- package/dist/voice.mjs +188 -16
- package/package.json +2 -2
package/dist/voice.d.mts
CHANGED
|
@@ -43,6 +43,7 @@ type MagmaFlowArgs = {
|
|
|
43
43
|
onSpeechDetected: () => void;
|
|
44
44
|
onTranscription: (transcription: MagmaFlowSTTOutput) => void;
|
|
45
45
|
onAudioOutput: (audio: Buffer) => void;
|
|
46
|
+
onNormalizedAudio?: (audio: Buffer) => void;
|
|
46
47
|
config?: MagmaFlowConfig;
|
|
47
48
|
};
|
|
48
49
|
declare class MagmaFlow {
|
|
@@ -51,6 +52,7 @@ declare class MagmaFlow {
|
|
|
51
52
|
private inputFormat;
|
|
52
53
|
private outputFormat;
|
|
53
54
|
private onAudioOutput;
|
|
55
|
+
private onNormalizedAudio?;
|
|
54
56
|
private textBuffer;
|
|
55
57
|
private textQueue;
|
|
56
58
|
private generatingAudio;
|
|
@@ -107,6 +109,36 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
|
|
|
107
109
|
private computeTurns;
|
|
108
110
|
}
|
|
109
111
|
|
|
112
|
+
type DeepgramFluxConfig = {
|
|
113
|
+
apiKey?: string;
|
|
114
|
+
eotThreshold?: number;
|
|
115
|
+
eagerEotThreshold?: number;
|
|
116
|
+
eotTimeoutMs?: number;
|
|
117
|
+
};
|
|
118
|
+
declare class DeepgramFluxSTT extends MagmaFlowSpeechToText {
|
|
119
|
+
private ws;
|
|
120
|
+
private apiKey;
|
|
121
|
+
private eotThreshold;
|
|
122
|
+
private eagerEotThreshold?;
|
|
123
|
+
private eotTimeoutMs;
|
|
124
|
+
private audioQueue;
|
|
125
|
+
private connecting;
|
|
126
|
+
private killed;
|
|
127
|
+
private reconnectAttempts;
|
|
128
|
+
private reconnectTimer?;
|
|
129
|
+
constructor(config?: DeepgramFluxConfig);
|
|
130
|
+
private connect;
|
|
131
|
+
private scheduleReconnect;
|
|
132
|
+
private clearConnection;
|
|
133
|
+
private handleMessage;
|
|
134
|
+
private handleTurnInfo;
|
|
135
|
+
input(audio: Buffer): void;
|
|
136
|
+
flush(): void;
|
|
137
|
+
kill(): void;
|
|
138
|
+
onEagerEndOfTurn(transcript: string): void;
|
|
139
|
+
onTurnResumed(): void;
|
|
140
|
+
}
|
|
141
|
+
|
|
110
142
|
declare enum GladiaModel {
|
|
111
143
|
SOLARIA_1 = "solaria-1"
|
|
112
144
|
}
|
|
@@ -278,9 +310,21 @@ declare class ElevenLabsTTS extends MagmaFlowTextToSpeech {
|
|
|
278
310
|
|
|
279
311
|
type HumeTTSArgs = {
|
|
280
312
|
client?: HumeClient;
|
|
313
|
+
voice?: {
|
|
314
|
+
name?: string;
|
|
315
|
+
id?: string;
|
|
316
|
+
provider?: string;
|
|
317
|
+
};
|
|
318
|
+
description?: string;
|
|
319
|
+
speed?: number;
|
|
320
|
+
version?: string;
|
|
281
321
|
};
|
|
282
322
|
declare class HumeTTS extends MagmaFlowTextToSpeech {
|
|
283
323
|
private client;
|
|
324
|
+
private voice?;
|
|
325
|
+
private description?;
|
|
326
|
+
private speed?;
|
|
327
|
+
private version?;
|
|
284
328
|
constructor(args: HumeTTSArgs);
|
|
285
329
|
setup(): Promise<void>;
|
|
286
330
|
input(text: string | null, requestId: string): void;
|
|
@@ -302,4 +346,4 @@ declare class WhisperTTS extends MagmaFlowTextToSpeech {
|
|
|
302
346
|
|
|
303
347
|
declare function splitTextIntoChunks(text: string, targetLength?: number): string[];
|
|
304
348
|
|
|
305
|
-
export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
|
|
349
|
+
export { type DeepgramFluxConfig, DeepgramFluxSTT, DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
|
package/dist/voice.d.ts
CHANGED
|
@@ -43,6 +43,7 @@ type MagmaFlowArgs = {
|
|
|
43
43
|
onSpeechDetected: () => void;
|
|
44
44
|
onTranscription: (transcription: MagmaFlowSTTOutput) => void;
|
|
45
45
|
onAudioOutput: (audio: Buffer) => void;
|
|
46
|
+
onNormalizedAudio?: (audio: Buffer) => void;
|
|
46
47
|
config?: MagmaFlowConfig;
|
|
47
48
|
};
|
|
48
49
|
declare class MagmaFlow {
|
|
@@ -51,6 +52,7 @@ declare class MagmaFlow {
|
|
|
51
52
|
private inputFormat;
|
|
52
53
|
private outputFormat;
|
|
53
54
|
private onAudioOutput;
|
|
55
|
+
private onNormalizedAudio?;
|
|
54
56
|
private textBuffer;
|
|
55
57
|
private textQueue;
|
|
56
58
|
private generatingAudio;
|
|
@@ -107,6 +109,36 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
|
|
|
107
109
|
private computeTurns;
|
|
108
110
|
}
|
|
109
111
|
|
|
112
|
+
type DeepgramFluxConfig = {
|
|
113
|
+
apiKey?: string;
|
|
114
|
+
eotThreshold?: number;
|
|
115
|
+
eagerEotThreshold?: number;
|
|
116
|
+
eotTimeoutMs?: number;
|
|
117
|
+
};
|
|
118
|
+
declare class DeepgramFluxSTT extends MagmaFlowSpeechToText {
|
|
119
|
+
private ws;
|
|
120
|
+
private apiKey;
|
|
121
|
+
private eotThreshold;
|
|
122
|
+
private eagerEotThreshold?;
|
|
123
|
+
private eotTimeoutMs;
|
|
124
|
+
private audioQueue;
|
|
125
|
+
private connecting;
|
|
126
|
+
private killed;
|
|
127
|
+
private reconnectAttempts;
|
|
128
|
+
private reconnectTimer?;
|
|
129
|
+
constructor(config?: DeepgramFluxConfig);
|
|
130
|
+
private connect;
|
|
131
|
+
private scheduleReconnect;
|
|
132
|
+
private clearConnection;
|
|
133
|
+
private handleMessage;
|
|
134
|
+
private handleTurnInfo;
|
|
135
|
+
input(audio: Buffer): void;
|
|
136
|
+
flush(): void;
|
|
137
|
+
kill(): void;
|
|
138
|
+
onEagerEndOfTurn(transcript: string): void;
|
|
139
|
+
onTurnResumed(): void;
|
|
140
|
+
}
|
|
141
|
+
|
|
110
142
|
declare enum GladiaModel {
|
|
111
143
|
SOLARIA_1 = "solaria-1"
|
|
112
144
|
}
|
|
@@ -278,9 +310,21 @@ declare class ElevenLabsTTS extends MagmaFlowTextToSpeech {
|
|
|
278
310
|
|
|
279
311
|
type HumeTTSArgs = {
|
|
280
312
|
client?: HumeClient;
|
|
313
|
+
voice?: {
|
|
314
|
+
name?: string;
|
|
315
|
+
id?: string;
|
|
316
|
+
provider?: string;
|
|
317
|
+
};
|
|
318
|
+
description?: string;
|
|
319
|
+
speed?: number;
|
|
320
|
+
version?: string;
|
|
281
321
|
};
|
|
282
322
|
declare class HumeTTS extends MagmaFlowTextToSpeech {
|
|
283
323
|
private client;
|
|
324
|
+
private voice?;
|
|
325
|
+
private description?;
|
|
326
|
+
private speed?;
|
|
327
|
+
private version?;
|
|
284
328
|
constructor(args: HumeTTSArgs);
|
|
285
329
|
setup(): Promise<void>;
|
|
286
330
|
input(text: string | null, requestId: string): void;
|
|
@@ -302,4 +346,4 @@ declare class WhisperTTS extends MagmaFlowTextToSpeech {
|
|
|
302
346
|
|
|
303
347
|
declare function splitTextIntoChunks(text: string, targetLength?: number): string[];
|
|
304
348
|
|
|
305
|
-
export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
|
|
349
|
+
export { type DeepgramFluxConfig, DeepgramFluxSTT, DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
|
package/dist/voice.js
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
var sdk = require('@deepgram/sdk');
|
|
4
|
-
var ws = require('ws');
|
|
4
|
+
var WebSocket2 = require('ws');
|
|
5
5
|
var hume = require('hume');
|
|
6
6
|
var OpenAI = require('openai');
|
|
7
7
|
|
|
8
8
|
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
9
9
|
|
|
10
|
+
var WebSocket2__default = /*#__PURE__*/_interopDefault(WebSocket2);
|
|
10
11
|
var OpenAI__default = /*#__PURE__*/_interopDefault(OpenAI);
|
|
11
12
|
|
|
12
13
|
// src/helpers/bufferToInt16Array.ts
|
|
@@ -424,6 +425,7 @@ var MagmaFlow = class {
|
|
|
424
425
|
inputFormat;
|
|
425
426
|
outputFormat;
|
|
426
427
|
onAudioOutput;
|
|
428
|
+
onNormalizedAudio;
|
|
427
429
|
textBuffer = "";
|
|
428
430
|
textQueue = [];
|
|
429
431
|
generatingAudio = false;
|
|
@@ -440,6 +442,7 @@ var MagmaFlow = class {
|
|
|
440
442
|
this.inputFormat = args.inputFormat;
|
|
441
443
|
this.outputFormat = args.outputFormat;
|
|
442
444
|
this.onAudioOutput = args.onAudioOutput;
|
|
445
|
+
this.onNormalizedAudio = args.onNormalizedAudio;
|
|
443
446
|
this.config = { ...this.config, ...args.config };
|
|
444
447
|
this.tts.onOutput = (audio, requestId) => {
|
|
445
448
|
if (this.currentRequestId !== requestId) {
|
|
@@ -478,7 +481,9 @@ var MagmaFlow = class {
|
|
|
478
481
|
inputAudio(audio) {
|
|
479
482
|
const decodedAudio = decodeToPcm(audio, this.inputFormat.encoding);
|
|
480
483
|
const resampledPCM = resamplePcm(decodedAudio, this.inputFormat.sampleRate, 48e3);
|
|
481
|
-
|
|
484
|
+
const pcmBuffer = int16ArrayToBuffer(resampledPCM);
|
|
485
|
+
this.onNormalizedAudio?.(pcmBuffer);
|
|
486
|
+
this.stt.input(pcmBuffer);
|
|
482
487
|
}
|
|
483
488
|
inputText(text) {
|
|
484
489
|
if (text === void 0 || text === null) {
|
|
@@ -694,6 +699,11 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
|
694
699
|
} else {
|
|
695
700
|
if (currentTurn) {
|
|
696
701
|
currentTurn.confidence = currentTurnConfidence / currentTurnWordCount;
|
|
702
|
+
if (currentTurn.confidence < 0.5) {
|
|
703
|
+
currentTurn.text = "[inaudible]";
|
|
704
|
+
} else if (currentTurn.confidence < 0.75) {
|
|
705
|
+
currentTurn.text = `[unclear, confidence=${currentTurn.confidence.toFixed(2)}] ${currentTurn.text}`;
|
|
706
|
+
}
|
|
697
707
|
turns.push(currentTurn);
|
|
698
708
|
}
|
|
699
709
|
currentTurn = { speaker, text: utterance, confidence: 0 };
|
|
@@ -712,6 +722,156 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
|
712
722
|
}
|
|
713
723
|
}
|
|
714
724
|
};
|
|
725
|
+
var kReconnectBaseMs = 500;
|
|
726
|
+
var kReconnectMaxMs = 1e4;
|
|
727
|
+
var DeepgramFluxSTT = class extends MagmaFlowSpeechToText {
|
|
728
|
+
ws = null;
|
|
729
|
+
apiKey;
|
|
730
|
+
eotThreshold;
|
|
731
|
+
eagerEotThreshold;
|
|
732
|
+
eotTimeoutMs;
|
|
733
|
+
audioQueue = [];
|
|
734
|
+
connecting = false;
|
|
735
|
+
killed = false;
|
|
736
|
+
reconnectAttempts = 0;
|
|
737
|
+
reconnectTimer;
|
|
738
|
+
constructor(config) {
|
|
739
|
+
super();
|
|
740
|
+
this.apiKey = config?.apiKey ?? process.env.DEEPGRAM_API_KEY;
|
|
741
|
+
this.eotThreshold = config?.eotThreshold ?? 0.7;
|
|
742
|
+
this.eagerEotThreshold = config?.eagerEotThreshold;
|
|
743
|
+
this.eotTimeoutMs = config?.eotTimeoutMs ?? 5e3;
|
|
744
|
+
}
|
|
745
|
+
connect() {
|
|
746
|
+
if (this.connecting || this.killed) return;
|
|
747
|
+
this.connecting = true;
|
|
748
|
+
const params = new URLSearchParams({
|
|
749
|
+
model: "flux-general-en",
|
|
750
|
+
encoding: "linear16",
|
|
751
|
+
sample_rate: "48000",
|
|
752
|
+
eot_threshold: this.eotThreshold.toString(),
|
|
753
|
+
eot_timeout_ms: this.eotTimeoutMs.toString()
|
|
754
|
+
});
|
|
755
|
+
if (this.eagerEotThreshold !== void 0) {
|
|
756
|
+
params.set("eager_eot_threshold", this.eagerEotThreshold.toString());
|
|
757
|
+
}
|
|
758
|
+
const url = `wss://api.deepgram.com/v2/listen?${params.toString()}`;
|
|
759
|
+
this.ws = new WebSocket2__default.default(url, {
|
|
760
|
+
headers: { Authorization: `Token ${this.apiKey}` }
|
|
761
|
+
});
|
|
762
|
+
this.ws.on("open", () => {
|
|
763
|
+
console.log("[DeepgramFlux] Connected");
|
|
764
|
+
this.connecting = false;
|
|
765
|
+
this.reconnectAttempts = 0;
|
|
766
|
+
for (const queued of this.audioQueue) {
|
|
767
|
+
this.ws.send(queued);
|
|
768
|
+
}
|
|
769
|
+
this.audioQueue = [];
|
|
770
|
+
});
|
|
771
|
+
this.ws.on("message", (data) => {
|
|
772
|
+
try {
|
|
773
|
+
const msg = JSON.parse(data.toString());
|
|
774
|
+
this.handleMessage(msg);
|
|
775
|
+
} catch (err) {
|
|
776
|
+
console.error(`[DeepgramFlux] Parse error: ${err.message}`);
|
|
777
|
+
}
|
|
778
|
+
});
|
|
779
|
+
this.ws.on("error", (err) => {
|
|
780
|
+
console.error(`[DeepgramFlux] Error: ${err.message}`);
|
|
781
|
+
this.connecting = false;
|
|
782
|
+
});
|
|
783
|
+
this.ws.on("close", (code, reason) => {
|
|
784
|
+
console.log(`[DeepgramFlux] Closed: ${code} ${reason.toString()}`);
|
|
785
|
+
this.clearConnection();
|
|
786
|
+
if (!this.killed) {
|
|
787
|
+
this.scheduleReconnect();
|
|
788
|
+
}
|
|
789
|
+
});
|
|
790
|
+
}
|
|
791
|
+
scheduleReconnect() {
|
|
792
|
+
const delay = Math.min(
|
|
793
|
+
kReconnectBaseMs * Math.pow(2, this.reconnectAttempts),
|
|
794
|
+
kReconnectMaxMs
|
|
795
|
+
);
|
|
796
|
+
this.reconnectAttempts++;
|
|
797
|
+
console.log(
|
|
798
|
+
`[DeepgramFlux] Reconnecting in ${delay}ms (attempt ${this.reconnectAttempts})`
|
|
799
|
+
);
|
|
800
|
+
this.reconnectTimer = setTimeout(() => {
|
|
801
|
+
this.reconnectTimer = void 0;
|
|
802
|
+
if (!this.killed && !this.ws) {
|
|
803
|
+
this.connect();
|
|
804
|
+
}
|
|
805
|
+
}, delay);
|
|
806
|
+
}
|
|
807
|
+
clearConnection() {
|
|
808
|
+
this.ws = null;
|
|
809
|
+
this.connecting = false;
|
|
810
|
+
}
|
|
811
|
+
handleMessage(msg) {
|
|
812
|
+
switch (msg.type) {
|
|
813
|
+
case "TurnInfo":
|
|
814
|
+
this.handleTurnInfo(msg);
|
|
815
|
+
break;
|
|
816
|
+
case "Connected":
|
|
817
|
+
break;
|
|
818
|
+
case "Error":
|
|
819
|
+
console.error(`[DeepgramFlux] Server error: ${JSON.stringify(msg)}`);
|
|
820
|
+
break;
|
|
821
|
+
default:
|
|
822
|
+
console.log(`[DeepgramFlux] Unknown message: ${JSON.stringify(msg)}`);
|
|
823
|
+
break;
|
|
824
|
+
}
|
|
825
|
+
}
|
|
826
|
+
handleTurnInfo(msg) {
|
|
827
|
+
switch (msg.event) {
|
|
828
|
+
case "StartOfTurn":
|
|
829
|
+
this.onSpeechDetected();
|
|
830
|
+
break;
|
|
831
|
+
case "EndOfTurn":
|
|
832
|
+
this.onOutput({ text: msg.transcript });
|
|
833
|
+
break;
|
|
834
|
+
case "EagerEndOfTurn":
|
|
835
|
+
this.onEagerEndOfTurn(msg.transcript);
|
|
836
|
+
break;
|
|
837
|
+
case "TurnResumed":
|
|
838
|
+
this.onTurnResumed();
|
|
839
|
+
break;
|
|
840
|
+
}
|
|
841
|
+
}
|
|
842
|
+
input(audio) {
|
|
843
|
+
if (!this.ws && !this.connecting && !this.reconnectTimer) {
|
|
844
|
+
this.connect();
|
|
845
|
+
}
|
|
846
|
+
if (this.ws?.readyState === WebSocket2__default.default.OPEN) {
|
|
847
|
+
this.ws.send(audio);
|
|
848
|
+
} else {
|
|
849
|
+
this.audioQueue.push(audio);
|
|
850
|
+
}
|
|
851
|
+
}
|
|
852
|
+
flush() {
|
|
853
|
+
if (this.ws?.readyState === WebSocket2__default.default.OPEN) {
|
|
854
|
+
this.ws.send(JSON.stringify({ type: "Finalize" }));
|
|
855
|
+
}
|
|
856
|
+
}
|
|
857
|
+
kill() {
|
|
858
|
+
this.killed = true;
|
|
859
|
+
if (this.reconnectTimer) {
|
|
860
|
+
clearTimeout(this.reconnectTimer);
|
|
861
|
+
this.reconnectTimer = void 0;
|
|
862
|
+
}
|
|
863
|
+
if (this.ws?.readyState === WebSocket2__default.default.OPEN) {
|
|
864
|
+
this.ws.send(JSON.stringify({ type: "CloseStream" }));
|
|
865
|
+
this.ws.close();
|
|
866
|
+
}
|
|
867
|
+
this.clearConnection();
|
|
868
|
+
this.audioQueue = [];
|
|
869
|
+
}
|
|
870
|
+
onEagerEndOfTurn(transcript) {
|
|
871
|
+
}
|
|
872
|
+
onTurnResumed() {
|
|
873
|
+
}
|
|
874
|
+
};
|
|
715
875
|
var NATIVE_WEBSOCKET_AVAILABLE = typeof WebSocket !== "undefined";
|
|
716
876
|
var DummyWebSocket = class {
|
|
717
877
|
url;
|
|
@@ -740,7 +900,7 @@ var QueueWebSocket = class {
|
|
|
740
900
|
if (NATIVE_WEBSOCKET_AVAILABLE) {
|
|
741
901
|
this.ws = new WebSocket(this.url);
|
|
742
902
|
} else {
|
|
743
|
-
this.ws = new ws.WebSocket(this.url);
|
|
903
|
+
this.ws = new WebSocket2.WebSocket(this.url);
|
|
744
904
|
}
|
|
745
905
|
} else {
|
|
746
906
|
this.ws = new DummyWebSocket(null);
|
|
@@ -754,7 +914,7 @@ var QueueWebSocket = class {
|
|
|
754
914
|
if (NATIVE_WEBSOCKET_AVAILABLE) {
|
|
755
915
|
this.ws = new WebSocket(this.url);
|
|
756
916
|
} else {
|
|
757
|
-
this.ws = new ws.WebSocket(this.url);
|
|
917
|
+
this.ws = new WebSocket2.WebSocket(this.url);
|
|
758
918
|
}
|
|
759
919
|
} else {
|
|
760
920
|
this.ws = new DummyWebSocket(null);
|
|
@@ -1079,7 +1239,6 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
|
|
|
1079
1239
|
).then(async (response) => {
|
|
1080
1240
|
const reader = response.body?.getReader();
|
|
1081
1241
|
if (!reader) return;
|
|
1082
|
-
new TextDecoder();
|
|
1083
1242
|
while (true) {
|
|
1084
1243
|
const { done, value } = await reader.read();
|
|
1085
1244
|
if (done) break;
|
|
@@ -1096,9 +1255,17 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
|
|
|
1096
1255
|
};
|
|
1097
1256
|
var HumeTTS = class extends MagmaFlowTextToSpeech {
|
|
1098
1257
|
client;
|
|
1258
|
+
voice;
|
|
1259
|
+
description;
|
|
1260
|
+
speed;
|
|
1261
|
+
version;
|
|
1099
1262
|
constructor(args) {
|
|
1100
1263
|
super();
|
|
1101
1264
|
this.client = args.client ?? new hume.HumeClient({ apiKey: process.env.HUME_API_KEY });
|
|
1265
|
+
this.voice = args.voice;
|
|
1266
|
+
this.description = args.description;
|
|
1267
|
+
this.speed = args.speed;
|
|
1268
|
+
this.version = args.version;
|
|
1102
1269
|
}
|
|
1103
1270
|
async setup() {
|
|
1104
1271
|
}
|
|
@@ -1106,22 +1273,28 @@ var HumeTTS = class extends MagmaFlowTextToSpeech {
|
|
|
1106
1273
|
if (!text) {
|
|
1107
1274
|
return;
|
|
1108
1275
|
}
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
],
|
|
1115
|
-
format: {
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1276
|
+
const utterance = { text };
|
|
1277
|
+
if (this.voice) utterance.voice = this.voice;
|
|
1278
|
+
if (this.description) utterance.description = this.description;
|
|
1279
|
+
if (this.speed !== void 0) utterance.speed = this.speed;
|
|
1280
|
+
const params = {
|
|
1281
|
+
utterances: [utterance],
|
|
1282
|
+
format: { type: "pcm" },
|
|
1283
|
+
instantMode: true,
|
|
1284
|
+
stripHeaders: true
|
|
1285
|
+
};
|
|
1286
|
+
if (this.version) params.version = this.version;
|
|
1287
|
+
this.client.tts.synthesizeJsonStreaming(params).then(async (stream) => {
|
|
1120
1288
|
for await (const chunk of stream) {
|
|
1121
|
-
|
|
1289
|
+
if (chunk.type === "audio") {
|
|
1290
|
+
this.onOutput(Buffer.from(chunk.audio, "base64"), requestId);
|
|
1291
|
+
}
|
|
1122
1292
|
}
|
|
1123
1293
|
this.onOutput(null, requestId);
|
|
1124
1294
|
console.log("[Hume] Finished:", text);
|
|
1295
|
+
}).catch((err) => {
|
|
1296
|
+
console.error(`[Hume] Error: ${err.message}`);
|
|
1297
|
+
this.onOutput(null, requestId);
|
|
1125
1298
|
});
|
|
1126
1299
|
}
|
|
1127
1300
|
kill() {
|
|
@@ -1164,6 +1337,7 @@ var WhisperTTS = class extends MagmaFlowTextToSpeech {
|
|
|
1164
1337
|
}
|
|
1165
1338
|
};
|
|
1166
1339
|
|
|
1340
|
+
exports.DeepgramFluxSTT = DeepgramFluxSTT;
|
|
1167
1341
|
exports.DeepgramLanguage = DeepgramLanguage;
|
|
1168
1342
|
exports.DeepgramModel = DeepgramModel;
|
|
1169
1343
|
exports.DeepgramSTT = DeepgramSTT;
|
package/dist/voice.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { DeepgramClient, LiveTranscriptionEvents } from '@deepgram/sdk';
|
|
2
|
-
import { WebSocket as WebSocket$1 } from 'ws';
|
|
2
|
+
import WebSocket2, { WebSocket as WebSocket$1 } from 'ws';
|
|
3
3
|
import { HumeClient } from 'hume';
|
|
4
4
|
import OpenAI from 'openai';
|
|
5
5
|
|
|
@@ -418,6 +418,7 @@ var MagmaFlow = class {
|
|
|
418
418
|
inputFormat;
|
|
419
419
|
outputFormat;
|
|
420
420
|
onAudioOutput;
|
|
421
|
+
onNormalizedAudio;
|
|
421
422
|
textBuffer = "";
|
|
422
423
|
textQueue = [];
|
|
423
424
|
generatingAudio = false;
|
|
@@ -434,6 +435,7 @@ var MagmaFlow = class {
|
|
|
434
435
|
this.inputFormat = args.inputFormat;
|
|
435
436
|
this.outputFormat = args.outputFormat;
|
|
436
437
|
this.onAudioOutput = args.onAudioOutput;
|
|
438
|
+
this.onNormalizedAudio = args.onNormalizedAudio;
|
|
437
439
|
this.config = { ...this.config, ...args.config };
|
|
438
440
|
this.tts.onOutput = (audio, requestId) => {
|
|
439
441
|
if (this.currentRequestId !== requestId) {
|
|
@@ -472,7 +474,9 @@ var MagmaFlow = class {
|
|
|
472
474
|
inputAudio(audio) {
|
|
473
475
|
const decodedAudio = decodeToPcm(audio, this.inputFormat.encoding);
|
|
474
476
|
const resampledPCM = resamplePcm(decodedAudio, this.inputFormat.sampleRate, 48e3);
|
|
475
|
-
|
|
477
|
+
const pcmBuffer = int16ArrayToBuffer(resampledPCM);
|
|
478
|
+
this.onNormalizedAudio?.(pcmBuffer);
|
|
479
|
+
this.stt.input(pcmBuffer);
|
|
476
480
|
}
|
|
477
481
|
inputText(text) {
|
|
478
482
|
if (text === void 0 || text === null) {
|
|
@@ -688,6 +692,11 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
|
688
692
|
} else {
|
|
689
693
|
if (currentTurn) {
|
|
690
694
|
currentTurn.confidence = currentTurnConfidence / currentTurnWordCount;
|
|
695
|
+
if (currentTurn.confidence < 0.5) {
|
|
696
|
+
currentTurn.text = "[inaudible]";
|
|
697
|
+
} else if (currentTurn.confidence < 0.75) {
|
|
698
|
+
currentTurn.text = `[unclear, confidence=${currentTurn.confidence.toFixed(2)}] ${currentTurn.text}`;
|
|
699
|
+
}
|
|
691
700
|
turns.push(currentTurn);
|
|
692
701
|
}
|
|
693
702
|
currentTurn = { speaker, text: utterance, confidence: 0 };
|
|
@@ -706,6 +715,156 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
|
706
715
|
}
|
|
707
716
|
}
|
|
708
717
|
};
|
|
718
|
+
var kReconnectBaseMs = 500;
|
|
719
|
+
var kReconnectMaxMs = 1e4;
|
|
720
|
+
var DeepgramFluxSTT = class extends MagmaFlowSpeechToText {
|
|
721
|
+
ws = null;
|
|
722
|
+
apiKey;
|
|
723
|
+
eotThreshold;
|
|
724
|
+
eagerEotThreshold;
|
|
725
|
+
eotTimeoutMs;
|
|
726
|
+
audioQueue = [];
|
|
727
|
+
connecting = false;
|
|
728
|
+
killed = false;
|
|
729
|
+
reconnectAttempts = 0;
|
|
730
|
+
reconnectTimer;
|
|
731
|
+
constructor(config) {
|
|
732
|
+
super();
|
|
733
|
+
this.apiKey = config?.apiKey ?? process.env.DEEPGRAM_API_KEY;
|
|
734
|
+
this.eotThreshold = config?.eotThreshold ?? 0.7;
|
|
735
|
+
this.eagerEotThreshold = config?.eagerEotThreshold;
|
|
736
|
+
this.eotTimeoutMs = config?.eotTimeoutMs ?? 5e3;
|
|
737
|
+
}
|
|
738
|
+
connect() {
|
|
739
|
+
if (this.connecting || this.killed) return;
|
|
740
|
+
this.connecting = true;
|
|
741
|
+
const params = new URLSearchParams({
|
|
742
|
+
model: "flux-general-en",
|
|
743
|
+
encoding: "linear16",
|
|
744
|
+
sample_rate: "48000",
|
|
745
|
+
eot_threshold: this.eotThreshold.toString(),
|
|
746
|
+
eot_timeout_ms: this.eotTimeoutMs.toString()
|
|
747
|
+
});
|
|
748
|
+
if (this.eagerEotThreshold !== void 0) {
|
|
749
|
+
params.set("eager_eot_threshold", this.eagerEotThreshold.toString());
|
|
750
|
+
}
|
|
751
|
+
const url = `wss://api.deepgram.com/v2/listen?${params.toString()}`;
|
|
752
|
+
this.ws = new WebSocket2(url, {
|
|
753
|
+
headers: { Authorization: `Token ${this.apiKey}` }
|
|
754
|
+
});
|
|
755
|
+
this.ws.on("open", () => {
|
|
756
|
+
console.log("[DeepgramFlux] Connected");
|
|
757
|
+
this.connecting = false;
|
|
758
|
+
this.reconnectAttempts = 0;
|
|
759
|
+
for (const queued of this.audioQueue) {
|
|
760
|
+
this.ws.send(queued);
|
|
761
|
+
}
|
|
762
|
+
this.audioQueue = [];
|
|
763
|
+
});
|
|
764
|
+
this.ws.on("message", (data) => {
|
|
765
|
+
try {
|
|
766
|
+
const msg = JSON.parse(data.toString());
|
|
767
|
+
this.handleMessage(msg);
|
|
768
|
+
} catch (err) {
|
|
769
|
+
console.error(`[DeepgramFlux] Parse error: ${err.message}`);
|
|
770
|
+
}
|
|
771
|
+
});
|
|
772
|
+
this.ws.on("error", (err) => {
|
|
773
|
+
console.error(`[DeepgramFlux] Error: ${err.message}`);
|
|
774
|
+
this.connecting = false;
|
|
775
|
+
});
|
|
776
|
+
this.ws.on("close", (code, reason) => {
|
|
777
|
+
console.log(`[DeepgramFlux] Closed: ${code} ${reason.toString()}`);
|
|
778
|
+
this.clearConnection();
|
|
779
|
+
if (!this.killed) {
|
|
780
|
+
this.scheduleReconnect();
|
|
781
|
+
}
|
|
782
|
+
});
|
|
783
|
+
}
|
|
784
|
+
scheduleReconnect() {
|
|
785
|
+
const delay = Math.min(
|
|
786
|
+
kReconnectBaseMs * Math.pow(2, this.reconnectAttempts),
|
|
787
|
+
kReconnectMaxMs
|
|
788
|
+
);
|
|
789
|
+
this.reconnectAttempts++;
|
|
790
|
+
console.log(
|
|
791
|
+
`[DeepgramFlux] Reconnecting in ${delay}ms (attempt ${this.reconnectAttempts})`
|
|
792
|
+
);
|
|
793
|
+
this.reconnectTimer = setTimeout(() => {
|
|
794
|
+
this.reconnectTimer = void 0;
|
|
795
|
+
if (!this.killed && !this.ws) {
|
|
796
|
+
this.connect();
|
|
797
|
+
}
|
|
798
|
+
}, delay);
|
|
799
|
+
}
|
|
800
|
+
clearConnection() {
|
|
801
|
+
this.ws = null;
|
|
802
|
+
this.connecting = false;
|
|
803
|
+
}
|
|
804
|
+
handleMessage(msg) {
|
|
805
|
+
switch (msg.type) {
|
|
806
|
+
case "TurnInfo":
|
|
807
|
+
this.handleTurnInfo(msg);
|
|
808
|
+
break;
|
|
809
|
+
case "Connected":
|
|
810
|
+
break;
|
|
811
|
+
case "Error":
|
|
812
|
+
console.error(`[DeepgramFlux] Server error: ${JSON.stringify(msg)}`);
|
|
813
|
+
break;
|
|
814
|
+
default:
|
|
815
|
+
console.log(`[DeepgramFlux] Unknown message: ${JSON.stringify(msg)}`);
|
|
816
|
+
break;
|
|
817
|
+
}
|
|
818
|
+
}
|
|
819
|
+
handleTurnInfo(msg) {
|
|
820
|
+
switch (msg.event) {
|
|
821
|
+
case "StartOfTurn":
|
|
822
|
+
this.onSpeechDetected();
|
|
823
|
+
break;
|
|
824
|
+
case "EndOfTurn":
|
|
825
|
+
this.onOutput({ text: msg.transcript });
|
|
826
|
+
break;
|
|
827
|
+
case "EagerEndOfTurn":
|
|
828
|
+
this.onEagerEndOfTurn(msg.transcript);
|
|
829
|
+
break;
|
|
830
|
+
case "TurnResumed":
|
|
831
|
+
this.onTurnResumed();
|
|
832
|
+
break;
|
|
833
|
+
}
|
|
834
|
+
}
|
|
835
|
+
input(audio) {
|
|
836
|
+
if (!this.ws && !this.connecting && !this.reconnectTimer) {
|
|
837
|
+
this.connect();
|
|
838
|
+
}
|
|
839
|
+
if (this.ws?.readyState === WebSocket2.OPEN) {
|
|
840
|
+
this.ws.send(audio);
|
|
841
|
+
} else {
|
|
842
|
+
this.audioQueue.push(audio);
|
|
843
|
+
}
|
|
844
|
+
}
|
|
845
|
+
flush() {
|
|
846
|
+
if (this.ws?.readyState === WebSocket2.OPEN) {
|
|
847
|
+
this.ws.send(JSON.stringify({ type: "Finalize" }));
|
|
848
|
+
}
|
|
849
|
+
}
|
|
850
|
+
kill() {
|
|
851
|
+
this.killed = true;
|
|
852
|
+
if (this.reconnectTimer) {
|
|
853
|
+
clearTimeout(this.reconnectTimer);
|
|
854
|
+
this.reconnectTimer = void 0;
|
|
855
|
+
}
|
|
856
|
+
if (this.ws?.readyState === WebSocket2.OPEN) {
|
|
857
|
+
this.ws.send(JSON.stringify({ type: "CloseStream" }));
|
|
858
|
+
this.ws.close();
|
|
859
|
+
}
|
|
860
|
+
this.clearConnection();
|
|
861
|
+
this.audioQueue = [];
|
|
862
|
+
}
|
|
863
|
+
onEagerEndOfTurn(transcript) {
|
|
864
|
+
}
|
|
865
|
+
onTurnResumed() {
|
|
866
|
+
}
|
|
867
|
+
};
|
|
709
868
|
var NATIVE_WEBSOCKET_AVAILABLE = typeof WebSocket !== "undefined";
|
|
710
869
|
var DummyWebSocket = class {
|
|
711
870
|
url;
|
|
@@ -1073,7 +1232,6 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
|
|
|
1073
1232
|
).then(async (response) => {
|
|
1074
1233
|
const reader = response.body?.getReader();
|
|
1075
1234
|
if (!reader) return;
|
|
1076
|
-
new TextDecoder();
|
|
1077
1235
|
while (true) {
|
|
1078
1236
|
const { done, value } = await reader.read();
|
|
1079
1237
|
if (done) break;
|
|
@@ -1090,9 +1248,17 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
|
|
|
1090
1248
|
};
|
|
1091
1249
|
var HumeTTS = class extends MagmaFlowTextToSpeech {
|
|
1092
1250
|
client;
|
|
1251
|
+
voice;
|
|
1252
|
+
description;
|
|
1253
|
+
speed;
|
|
1254
|
+
version;
|
|
1093
1255
|
constructor(args) {
|
|
1094
1256
|
super();
|
|
1095
1257
|
this.client = args.client ?? new HumeClient({ apiKey: process.env.HUME_API_KEY });
|
|
1258
|
+
this.voice = args.voice;
|
|
1259
|
+
this.description = args.description;
|
|
1260
|
+
this.speed = args.speed;
|
|
1261
|
+
this.version = args.version;
|
|
1096
1262
|
}
|
|
1097
1263
|
async setup() {
|
|
1098
1264
|
}
|
|
@@ -1100,22 +1266,28 @@ var HumeTTS = class extends MagmaFlowTextToSpeech {
|
|
|
1100
1266
|
if (!text) {
|
|
1101
1267
|
return;
|
|
1102
1268
|
}
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
],
|
|
1109
|
-
format: {
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1269
|
+
const utterance = { text };
|
|
1270
|
+
if (this.voice) utterance.voice = this.voice;
|
|
1271
|
+
if (this.description) utterance.description = this.description;
|
|
1272
|
+
if (this.speed !== void 0) utterance.speed = this.speed;
|
|
1273
|
+
const params = {
|
|
1274
|
+
utterances: [utterance],
|
|
1275
|
+
format: { type: "pcm" },
|
|
1276
|
+
instantMode: true,
|
|
1277
|
+
stripHeaders: true
|
|
1278
|
+
};
|
|
1279
|
+
if (this.version) params.version = this.version;
|
|
1280
|
+
this.client.tts.synthesizeJsonStreaming(params).then(async (stream) => {
|
|
1114
1281
|
for await (const chunk of stream) {
|
|
1115
|
-
|
|
1282
|
+
if (chunk.type === "audio") {
|
|
1283
|
+
this.onOutput(Buffer.from(chunk.audio, "base64"), requestId);
|
|
1284
|
+
}
|
|
1116
1285
|
}
|
|
1117
1286
|
this.onOutput(null, requestId);
|
|
1118
1287
|
console.log("[Hume] Finished:", text);
|
|
1288
|
+
}).catch((err) => {
|
|
1289
|
+
console.error(`[Hume] Error: ${err.message}`);
|
|
1290
|
+
this.onOutput(null, requestId);
|
|
1119
1291
|
});
|
|
1120
1292
|
}
|
|
1121
1293
|
kill() {
|
|
@@ -1158,4 +1330,4 @@ var WhisperTTS = class extends MagmaFlowTextToSpeech {
|
|
|
1158
1330
|
}
|
|
1159
1331
|
};
|
|
1160
1332
|
|
|
1161
|
-
export { DeepgramLanguage, DeepgramModel, DeepgramSTT, DeepgramTTS, ElevenLabsTTS, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, HumeTTS, MagmaFlow, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, WhisperTTS, splitTextIntoChunks };
|
|
1333
|
+
export { DeepgramFluxSTT, DeepgramLanguage, DeepgramModel, DeepgramSTT, DeepgramTTS, ElevenLabsTTS, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, HumeTTS, MagmaFlow, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, WhisperTTS, splitTextIntoChunks };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pompeii-labs/audio",
|
|
3
|
-
"version": "0.2.2",
|
|
3
|
+
"version": "0.3.1",
|
|
4
4
|
"description": "The Audio SDK from Pompeii Labs",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"Pompeii",
|
|
@@ -42,7 +42,7 @@
|
|
|
42
42
|
},
|
|
43
43
|
"dependencies": {
|
|
44
44
|
"@deepgram/sdk": "4.2.0",
|
|
45
|
-
"hume": "0.
|
|
45
|
+
"hume": "0.15.13",
|
|
46
46
|
"openai": "4.86.2"
|
|
47
47
|
},
|
|
48
48
|
"devDependencies": {
|