@pompeii-labs/audio 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/voice.d.mts +110 -1
- package/dist/voice.d.ts +110 -1
- package/dist/voice.js +277 -5
- package/dist/voice.mjs +275 -6
- package/package.json +5 -1
package/dist/voice.d.mts
CHANGED
|
@@ -107,6 +107,115 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
|
|
|
107
107
|
private computeTurns;
|
|
108
108
|
}
|
|
109
109
|
|
|
110
|
+
/** Gladia transcription models that can be passed as `GladiaSTTArgs.model`. */
declare enum GladiaModel {
|
|
111
|
+
/** Serialized as the string "solaria-1"; the only model declared in this version. */
SOLARIA_1 = "solaria-1"
|
|
112
|
+
}
|
|
113
|
+
/** Language codes accepted by `language_config.languages` and `translation_config.target_languages`. */
declare enum GladiaLanguage {
|
|
114
|
+
/** Serialized as the string "en"; the only language declared in this version. */
EN = "en"
|
|
115
|
+
}
|
|
116
|
+
/** A single term in `custom_vocabulary_config.vocabulary`. */
type GladiaVocabularyEntry = {
    value: string;
    intensity?: number;
    pronunciations?: Array<string>;
    language?: string;
};
/**
 * Per-message-kind switches. The same set of flags is accepted by both
 * `messages_config` and `callback_config`.
 */
type GladiaMessageToggles = {
    receive_partial_transcripts?: boolean;
    receive_final_transcripts?: boolean;
    receive_speech_events?: boolean;
    receive_pre_processing_events?: boolean;
    receive_realtime_processing_events?: boolean;
    receive_post_processing_events?: boolean;
    receive_acknowledgements?: boolean;
    receive_errors?: boolean;
    receive_lifecycle_events?: boolean;
};
/**
 * Options object describing a Gladia live transcription session.
 * Only `model` is required; every other field is optional.
 */
type GladiaConfig = {
    model: GladiaModel;
    // Audio format of the stream.
    encoding?: "wav/pcm" | "wav/alaw" | "wav/ulaw";
    sample_rate?: number;
    bit_depth?: number;
    channels?: number;
    custom_metadata?: Record<string, unknown>;
    // Utterance segmentation.
    endpointing?: number;
    maximum_duration_without_endpointing?: number;
    language_config?: {
        languages?: Array<GladiaLanguage>;
    };
    pre_processing?: {
        audio_enhancer?: boolean;
        speech_threshold?: number;
    };
    // Each boolean switches a feature on; the matching `*_config` tunes it.
    realtime_processing?: {
        custom_vocabulary?: boolean;
        custom_vocabulary_config?: {
            vocabulary: Array<GladiaVocabularyEntry>;
            default_intensity?: number;
        };
        custom_spelling?: boolean;
        custom_spelling_config?: {
            spelling_dictionary: Record<string, Array<string>>;
        };
        translation?: boolean;
        translation_config?: {
            target_languages: Array<GladiaLanguage>;
            model?: "base" | "enhanced";
            match_original_utterances?: boolean;
            lipsync?: boolean;
            context_adaptation?: boolean;
            context?: string;
            informal?: boolean;
        };
        named_entity_recognition?: boolean;
        sentiment_analysis?: boolean;
    };
    post_processing?: {
        summarization?: boolean;
        summarization_config?: {
            type?: "general" | "bullet_points" | "concise";
        };
        chapterization?: boolean;
    };
    messages_config?: GladiaMessageToggles;
    callback?: boolean;
    // Same toggles as `messages_config`, plus the mandatory callback URL.
    callback_config?: {
        url: string;
    } & GladiaMessageToggles;
};
|
|
192
|
+
/** Constructor arguments for `GladiaSTT`. */
type GladiaSTTArgs = {
|
|
193
|
+
/** Transcription model; copied into the instance config by the constructor. */
model: GladiaModel;
|
|
194
|
+
/** Gladia API key. When omitted, the constructor reads `process.env.GLADIA_API_KEY` and throws if neither is set. */
apiKey?: string;
|
|
195
|
+
/** Extra session options, spread over the constructor's defaults. The audio-format keys are excluded from the type. */
config?: Omit<GladiaConfig, 'model' | 'encoding' | 'sample_rate' | 'channels'>;
|
|
196
|
+
};
|
|
197
|
+
/**
 * Speech-to-text adapter backed by Gladia's live API.
 * Only the constructor, `input`, `flush` and `kill` are public; everything
 * else is private implementation detail of the bundled JavaScript.
 */
declare class GladiaSTT extends MagmaFlowSpeechToText {
|
|
198
|
+
private connection;
|
|
199
|
+
private connectionUrl;
|
|
200
|
+
private settingUp;
|
|
201
|
+
private apiKey;
|
|
202
|
+
private config;
|
|
203
|
+
private turnBuffer;
|
|
204
|
+
private utteranceEnded;
|
|
205
|
+
/** Throws an Error when no API key is supplied and `GLADIA_API_KEY` is not set in the environment. */
constructor(args: GladiaSTTArgs);
|
|
206
|
+
private setup;
|
|
207
|
+
private connectWS;
|
|
208
|
+
/** Queues an audio chunk on the connection and starts session setup when the socket is not open. */
input(audio: Buffer): void;
|
|
209
|
+
/** Sends a `{"type":"stop_recording"}` message through the connection. */
flush(): void;
|
|
210
|
+
/** Closes the connection with code 1000 and replaces it with a URL-less placeholder. */
kill(): void;
|
|
211
|
+
private handleTranscriptionEvent;
|
|
212
|
+
private handleUtteranceEnd;
|
|
213
|
+
private sendOutput;
|
|
214
|
+
private onOpen;
|
|
215
|
+
private computeTurns;
|
|
216
|
+
private handleMessage;
|
|
217
|
+
}
|
|
218
|
+
|
|
110
219
|
type DeepgramTTSArgs = {
|
|
111
220
|
client?: DeepgramClient;
|
|
112
221
|
};
|
|
@@ -193,4 +302,4 @@ declare class WhisperTTS extends MagmaFlowTextToSpeech {
|
|
|
193
302
|
|
|
194
303
|
declare function splitTextIntoChunks(text: string, targetLength?: number): string[];
|
|
195
304
|
|
|
196
|
-
export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
|
|
305
|
+
export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
|
package/dist/voice.d.ts
CHANGED
|
@@ -107,6 +107,115 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
|
|
|
107
107
|
private computeTurns;
|
|
108
108
|
}
|
|
109
109
|
|
|
110
|
+
/** Gladia transcription models that can be passed as `GladiaSTTArgs.model`. */
declare enum GladiaModel {
|
|
111
|
+
/** Serialized as the string "solaria-1"; the only model declared in this version. */
SOLARIA_1 = "solaria-1"
|
|
112
|
+
}
|
|
113
|
+
/** Language codes accepted by `language_config.languages` and `translation_config.target_languages`. */
declare enum GladiaLanguage {
|
|
114
|
+
/** Serialized as the string "en"; the only language declared in this version. */
EN = "en"
|
|
115
|
+
}
|
|
116
|
+
/** A single term in `custom_vocabulary_config.vocabulary`. */
type GladiaVocabularyEntry = {
    value: string;
    intensity?: number;
    pronunciations?: Array<string>;
    language?: string;
};
/**
 * Per-message-kind switches. The same set of flags is accepted by both
 * `messages_config` and `callback_config`.
 */
type GladiaMessageToggles = {
    receive_partial_transcripts?: boolean;
    receive_final_transcripts?: boolean;
    receive_speech_events?: boolean;
    receive_pre_processing_events?: boolean;
    receive_realtime_processing_events?: boolean;
    receive_post_processing_events?: boolean;
    receive_acknowledgements?: boolean;
    receive_errors?: boolean;
    receive_lifecycle_events?: boolean;
};
/**
 * Options object describing a Gladia live transcription session.
 * Only `model` is required; every other field is optional.
 */
type GladiaConfig = {
    model: GladiaModel;
    // Audio format of the stream.
    encoding?: "wav/pcm" | "wav/alaw" | "wav/ulaw";
    sample_rate?: number;
    bit_depth?: number;
    channels?: number;
    custom_metadata?: Record<string, unknown>;
    // Utterance segmentation.
    endpointing?: number;
    maximum_duration_without_endpointing?: number;
    language_config?: {
        languages?: Array<GladiaLanguage>;
    };
    pre_processing?: {
        audio_enhancer?: boolean;
        speech_threshold?: number;
    };
    // Each boolean switches a feature on; the matching `*_config` tunes it.
    realtime_processing?: {
        custom_vocabulary?: boolean;
        custom_vocabulary_config?: {
            vocabulary: Array<GladiaVocabularyEntry>;
            default_intensity?: number;
        };
        custom_spelling?: boolean;
        custom_spelling_config?: {
            spelling_dictionary: Record<string, Array<string>>;
        };
        translation?: boolean;
        translation_config?: {
            target_languages: Array<GladiaLanguage>;
            model?: "base" | "enhanced";
            match_original_utterances?: boolean;
            lipsync?: boolean;
            context_adaptation?: boolean;
            context?: string;
            informal?: boolean;
        };
        named_entity_recognition?: boolean;
        sentiment_analysis?: boolean;
    };
    post_processing?: {
        summarization?: boolean;
        summarization_config?: {
            type?: "general" | "bullet_points" | "concise";
        };
        chapterization?: boolean;
    };
    messages_config?: GladiaMessageToggles;
    callback?: boolean;
    // Same toggles as `messages_config`, plus the mandatory callback URL.
    callback_config?: {
        url: string;
    } & GladiaMessageToggles;
};
|
|
192
|
+
/** Constructor arguments for `GladiaSTT`. */
type GladiaSTTArgs = {
|
|
193
|
+
/** Transcription model; copied into the instance config by the constructor. */
model: GladiaModel;
|
|
194
|
+
/** Gladia API key. When omitted, the constructor reads `process.env.GLADIA_API_KEY` and throws if neither is set. */
apiKey?: string;
|
|
195
|
+
/** Extra session options, spread over the constructor's defaults. The audio-format keys are excluded from the type. */
config?: Omit<GladiaConfig, 'model' | 'encoding' | 'sample_rate' | 'channels'>;
|
|
196
|
+
};
|
|
197
|
+
/**
 * Speech-to-text adapter backed by Gladia's live API.
 * Only the constructor, `input`, `flush` and `kill` are public; everything
 * else is private implementation detail of the bundled JavaScript.
 */
declare class GladiaSTT extends MagmaFlowSpeechToText {
|
|
198
|
+
private connection;
|
|
199
|
+
private connectionUrl;
|
|
200
|
+
private settingUp;
|
|
201
|
+
private apiKey;
|
|
202
|
+
private config;
|
|
203
|
+
private turnBuffer;
|
|
204
|
+
private utteranceEnded;
|
|
205
|
+
/** Throws an Error when no API key is supplied and `GLADIA_API_KEY` is not set in the environment. */
constructor(args: GladiaSTTArgs);
|
|
206
|
+
private setup;
|
|
207
|
+
private connectWS;
|
|
208
|
+
/** Queues an audio chunk on the connection and starts session setup when the socket is not open. */
input(audio: Buffer): void;
|
|
209
|
+
/** Sends a `{"type":"stop_recording"}` message through the connection. */
flush(): void;
|
|
210
|
+
/** Closes the connection with code 1000 and replaces it with a URL-less placeholder. */
kill(): void;
|
|
211
|
+
private handleTranscriptionEvent;
|
|
212
|
+
private handleUtteranceEnd;
|
|
213
|
+
private sendOutput;
|
|
214
|
+
private onOpen;
|
|
215
|
+
private computeTurns;
|
|
216
|
+
private handleMessage;
|
|
217
|
+
}
|
|
218
|
+
|
|
110
219
|
type DeepgramTTSArgs = {
|
|
111
220
|
client?: DeepgramClient;
|
|
112
221
|
};
|
|
@@ -193,4 +302,4 @@ declare class WhisperTTS extends MagmaFlowTextToSpeech {
|
|
|
193
302
|
|
|
194
303
|
declare function splitTextIntoChunks(text: string, targetLength?: number): string[];
|
|
195
304
|
|
|
196
|
-
export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
|
|
305
|
+
export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
|
package/dist/voice.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
var sdk = require('@deepgram/sdk');
|
|
4
|
+
var ws = require('ws');
|
|
4
5
|
var hume = require('hume');
|
|
5
6
|
var OpenAI = require('openai');
|
|
6
7
|
|
|
@@ -693,11 +694,6 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
|
693
694
|
} else {
|
|
694
695
|
if (currentTurn) {
|
|
695
696
|
currentTurn.confidence = currentTurnConfidence / currentTurnWordCount;
|
|
696
|
-
if (currentTurn.confidence < 0.5) {
|
|
697
|
-
currentTurn.text = "[inaudible]";
|
|
698
|
-
} else if (currentTurn.confidence < 0.75) {
|
|
699
|
-
currentTurn.text = `[unclear, confidence=${currentTurn.confidence.toFixed(2)}] ${currentTurn.text}`;
|
|
700
|
-
}
|
|
701
697
|
turns.push(currentTurn);
|
|
702
698
|
}
|
|
703
699
|
currentTurn = { speaker, text: utterance, confidence: 0 };
|
|
@@ -716,6 +712,279 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
|
716
712
|
}
|
|
717
713
|
}
|
|
718
714
|
};
|
|
715
|
+
// True when the runtime exposes a global WebSocket constructor; QueueWebSocket
// uses it to choose between the native class and the `ws` package.
var NATIVE_WEBSOCKET_AVAILABLE = !(typeof WebSocket === "undefined");
|
|
716
|
+
/**
 * Inert stand-in used when no URL is known yet. It exposes the small part of
 * the WebSocket surface that QueueWebSocket touches and always reports itself
 * as CLOSED, so nothing is ever sent through it.
 */
var DummyWebSocket = class {
  // A dummy socket has no endpoint, whatever the caller passes in.
  url = null;
  constructor(url) {
  }
  get readyState() {
    return 3 /* CLOSED */;
  }
  // Listener registration and close() are accepted and ignored.
  addEventListener() {
  }
  close() {
  }
  // Sending is a programming error on a dummy socket; report it and drop the data.
  send(data) {
    console.error("Data send attempted through dummy websocket");
  }
};
|
|
732
|
+
/**
 * WebSocket wrapper that buffers outgoing payloads until the underlying
 * socket is OPEN and then sends them in FIFO order.
 *
 * The underlying socket is the native `WebSocket` when the runtime has one,
 * the `ws` package's `WebSocket` otherwise, or a `DummyWebSocket` when no URL
 * was given.
 */
var QueueWebSocket = class {
  /** Payloads waiting to be sent, oldest first. */
  wsQueue = [];
  /** Re-entrancy guard: true only while handleQueue() is draining. */
  handlingQueue = false;
  /** Endpoint passed to the constructor; a falsy value selects the dummy socket. */
  url;
  /** The underlying socket instance. */
  ws;
  /**
   * @param url WebSocket endpoint, or null/empty for an inert placeholder.
   */
  constructor(url) {
    this.url = url;
    this.ws = this.createSocket();
  }
  /**
   * Builds the underlying socket for `this.url` and wires the "open" listener
   * that drains the queue. Shared by the constructor and connect() so a
   * reconnected socket flushes the queue too (connect() used to skip the
   * listener, leaving queued payloads stuck).
   */
  createSocket() {
    let socket;
    if (!this.url) {
      socket = new DummyWebSocket(null);
    } else if (NATIVE_WEBSOCKET_AVAILABLE) {
      socket = new WebSocket(this.url);
    } else {
      socket = new ws.WebSocket(this.url);
    }
    socket.addEventListener("open", () => {
      this.handleQueue();
    });
    return socket;
  }
  /** Replaces the underlying socket with a fresh one for the same URL. */
  connect() {
    this.ws = this.createSocket();
  }
  /** Closes the underlying socket, forwarding the optional code and reason. */
  close(code, reason) {
    this.ws.close(code, reason);
  }
  /**
   * Sends queued payloads for as long as the socket is OPEN.
   *
   * Drains iteratively (the previous version recursed once per payload) and
   * always clears the re-entrancy flag, even if the socket's send() throws.
   * Falsy payloads such as "" are sent like any other instead of being
   * mistaken for an empty queue.
   *
   * @param cb Accepted for backward compatibility; not used.
   */
  handleQueue(cb) {
    if (this.handlingQueue) {
      console.log(`[Gladia] handleQueue: Already running`);
      return;
    }
    this.handlingQueue = true;
    try {
      while (this.wsQueue.length > 0 && this.readyState === 1 /* OPEN */) {
        // Remove before sending so a payload that makes send() throw is not
        // retried forever; the exception still reaches the caller.
        const data = this.wsQueue.shift();
        this.ws.send(data);
      }
    } finally {
      this.handlingQueue = false;
    }
  }
  /** Queues a payload and sends it immediately when the socket is OPEN. */
  send(data) {
    this.wsQueue.push(data);
    this.handleQueue();
  }
  /** readyState of the underlying socket (0 CONNECTING, 1 OPEN, 2 CLOSING, 3 CLOSED). */
  get readyState() {
    return this.ws.readyState;
  }
};
|
|
794
|
+
|
|
795
|
+
// src/voice/speechToText/gladia.ts
|
|
796
|
+
// Runtime object behind the `GladiaModel` string enum: member name -> wire value.
var GladiaModel = /* @__PURE__ */ (() => {
  const members = GladiaModel || {};
  members["SOLARIA_1"] = "solaria-1";
  return members;
})();
|
|
800
|
+
// Runtime object behind the `GladiaLanguage` string enum: member name -> wire value.
var GladiaLanguage = /* @__PURE__ */ (() => {
  const members = GladiaLanguage || {};
  members["EN"] = "en";
  return members;
})();
|
|
804
|
+
/**
 * Speech-to-text adapter for Gladia's live transcription API.
 *
 * Audio passed to input() is queued on a QueueWebSocket. The first chunk
 * triggers setup(), which POSTs the session configuration to
 * https://api.gladia.io/v2/live and then opens the WebSocket URL returned in
 * the response. Final transcripts are buffered as turns and forwarded through
 * onOutput() once a "speech_end" message has been received.
 */
var GladiaSTT = class extends MagmaFlowSpeechToText {
  /** QueueWebSocket for the session; a URL-less placeholder until setup succeeds. */
  connection;
  /** WebSocket URL returned by the session request, or null before the first success. */
  connectionUrl = null;
  /** True while a setup attempt is in flight so repeated input() calls do not start another. */
  settingUp = false;
  /** Key sent in the X-Gladia-Key header. */
  apiKey;
  /** Session configuration posted to Gladia. */
  config;
  /** Final turns collected since the last onOutput() call. */
  turnBuffer = [];
  /** Set by "speech_end", cleared by "speech_start" and after each output. */
  utteranceEnded = false;
  /**
   * @param args.model Transcription model.
   * @param args.apiKey Gladia API key; falls back to process.env.GLADIA_API_KEY.
   * @param args.config Extra session options, spread over the defaults below.
   * @throws Error when no API key is available.
   */
  constructor(args) {
    super();
    this.config = {
      model: args.model,
      encoding: "wav/pcm",
      sample_rate: 48e3,
      bit_depth: 16,
      channels: 1,
      // Was 1. The config is now actually sent, and Gladia's reference gives
      // 5-60 s as the accepted range for this field.
      // NOTE(review): confirm the range against the current API reference.
      maximum_duration_without_endpointing: 5,
      ...args.config
    };
    // `||` rather than `??` so an empty-string argument does not shadow the env key.
    const apiKey = args.apiKey || process.env.GLADIA_API_KEY;
    if (!apiKey) {
      throw new Error("GLADIA_API_KEY not supplied and not found in env");
    }
    this.apiKey = apiKey;
    this.connection = new QueueWebSocket(null);
  }
  /**
   * Starts a session: reuses the known WebSocket URL when there is one,
   * otherwise requests a new one. Failures are logged and clear `settingUp`
   * so a later input() can retry; they never surface as unhandled rejections.
   */
  setup() {
    if (this.settingUp) {
      return;
    }
    this.settingUp = true;
    if (this.connectionUrl) {
      this.connectWS();
      return;
    }
    console.log("[Gladia] Setup: Getting connection url");
    fetch("https://api.gladia.io/v2/live", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "X-Gladia-Key": this.apiKey
      },
      // Send the merged config. A hard-coded body used to be sent here, which
      // silently ignored `model` and every option passed in `args.config`.
      body: JSON.stringify(this.config)
    }).then(async (response) => {
      if (!response.ok) {
        console.error("[Gladia] Could not get WS url");
        const errorMessage = `${response.status}: ${await response.text() || response.statusText}`;
        throw new Error(errorMessage);
      }
      const json = await response.json();
      if (typeof json?.url !== "string" || json.url === "") {
        throw new Error("response did not contain a WebSocket url");
      }
      this.connectionUrl = json.url;
      this.connectWS();
    }).catch((error) => {
      // Covers network errors, HTTP errors and malformed responses alike.
      console.error(error instanceof Error ? error.message : error);
      this.settingUp = false;
    });
  }
  /**
   * Opens the WebSocket for `connectionUrl` and attaches the event handlers.
   * Every exit path clears `settingUp`; the early returns used to leave it
   * set, which blocked all reconnects after the socket closed.
   */
  connectWS() {
    // Clear first so an exception from the socket constructor cannot leave the flag stuck.
    this.settingUp = false;
    if (!this.connectionUrl) {
      console.log(`[Gladia] ConnectWS: No connection url`);
      return;
    }
    if (this.connection.readyState === 0 /* CONNECTING */) {
      return;
    }
    console.log("[Gladia] ConnectWS: Connecting to", this.connectionUrl);
    // Audio queued while the URL was being fetched sits on the previous
    // connection. Carry it over so the start of the utterance is not lost.
    // String payloads are control messages (e.g. stop_recording) and must not
    // be replayed on a fresh socket.
    const pendingAudio = (this.connection.wsQueue ?? []).filter(
      (payload) => typeof payload !== "string"
    );
    this.connection = new QueueWebSocket(this.connectionUrl);
    this.connection.wsQueue = pendingAudio.concat(this.connection.wsQueue);
    this.connection.ws.addEventListener("error", (event) => {
      // Event objects stringify to little or nothing; prefer the message when present.
      console.log(`[Gladia] Error: ${event?.message ?? JSON.stringify(event)}`);
    });
    this.connection.ws.addEventListener("close", (event) => {
      // NOTE(review): Gladia appears to treat close code 1000 as "session
      // ended"; if so, connectionUrl should be cleared here - confirm.
      console.log(`[Gladia] Close: code=${event?.code} reason=${event?.reason}`);
    });
    this.connection.ws.addEventListener("open", this.onOpen.bind(this));
    this.connection.ws.addEventListener("message", this.handleMessage.bind(this));
  }
  /**
   * Queues one audio chunk and makes sure a session is being set up.
   * @param audio Raw audio bytes.
   */
  input(audio) {
    // A Buffer can be a view onto a larger, shared ArrayBuffer. Copy exactly
    // this chunk's bytes; sending `audio.buffer` would transmit the whole
    // backing store, including unrelated data.
    const chunk = audio.buffer.slice(audio.byteOffset, audio.byteOffset + audio.byteLength);
    this.connection.send(chunk);
    if (this.connection.readyState !== 1 /* OPEN */) {
      this.setup();
    }
  }
  /** Sends a stop_recording message through the connection. */
  flush() {
    console.log("[Gladia] Flush: sending stop_recording messsage");
    this.connection.send(
      JSON.stringify({
        type: "stop_recording"
      })
    );
  }
  /** Closes the socket with code 1000 and swaps in an empty placeholder connection. */
  kill() {
    console.log("[Gladia] Kill: Closing connection");
    this.connection?.close(1e3);
    this.connection = new QueueWebSocket(null);
  }
  /**
   * Handles a "transcript" payload: reports speech for any non-empty text and
   * buffers the turn when the transcript is final.
   */
  handleTranscriptionEvent(transcriptionEvent) {
    const text = transcriptionEvent?.utterance?.text;
    if (typeof text !== "string" || text.trim() === "") {
      return;
    }
    this.onSpeechDetected();
    if (transcriptionEvent.is_final) {
      const turns = this.computeTurns(transcriptionEvent);
      this.turnBuffer = this.turnBuffer.concat(turns);
      this.sendOutput();
    }
  }
  /** Marks the utterance as finished and emits whatever is buffered. */
  handleUtteranceEnd() {
    this.utteranceEnded = true;
    this.sendOutput();
  }
  /**
   * Emits the buffered turns through onOutput() once the utterance has ended
   * and at least one turn is buffered, then resets the buffer. `turns` is
   * only included when every buffered turn carries a speaker.
   */
  sendOutput() {
    if (!this.utteranceEnded) {
      return;
    }
    if (this.turnBuffer.length === 0) {
      return;
    }
    const text = this.turnBuffer.map((turn) => turn.text).join(" ");
    let turns = void 0;
    if (this.turnBuffer.every((turn) => turn.speaker !== void 0 && turn.speaker !== null)) {
      turns = this.turnBuffer.reduce((acc, turn) => {
        const previous = acc.at(-1);
        if (previous && previous.speaker === turn.speaker) {
          // Separate merged texts with a space, matching the joined `text` above.
          previous.text += ` ${turn.text}`;
        } else {
          acc.push(turn);
        }
        return acc;
      }, []);
    }
    this.onOutput({
      text,
      turns
    });
    this.turnBuffer = [];
    this.utteranceEnded = false;
  }
  /** Logs that the WebSocket opened. */
  onOpen() {
    console.log(`[Gladia] Connected`);
  }
  /**
   * Converts a transcript payload into a one-element list of turns. The end
   * time is "now"; the start is derived from the utterance's start/end
   * offsets, which are multiplied by 1000 to get milliseconds.
   * @returns The turn list, or [] when the payload cannot be read.
   */
  computeTurns(transcript) {
    try {
      const nowMs = (/* @__PURE__ */ new Date()).getTime();
      const durationMs = (transcript.utterance.end - transcript.utterance.start) * 1e3;
      const startMs = nowMs - durationMs;
      const turns = [
        {
          text: transcript.utterance.text,
          confidence: transcript.utterance.confidence,
          durationMs,
          start: new Date(startMs),
          end: new Date(nowMs),
          speaker: transcript.utterance.speaker
        }
      ];
      return turns;
    } catch (error) {
      console.error(error);
      return [];
    }
  }
  /** Parses one WebSocket message and dispatches on its `type`. */
  handleMessage(event) {
    let message;
    try {
      message = JSON.parse(event.data.toString());
    } catch (error) {
      // A malformed frame must not throw out of the socket's event listener.
      console.error(`[Gladia] Could not parse message: ${error}`);
      return;
    }
    switch (message?.type) {
      case "audio_chunk":
        break;
      case "speech_start":
        this.utteranceEnded = false;
        this.onSpeechDetected();
        break;
      case "speech_end":
        this.handleUtteranceEnd();
        break;
      case "transcript":
        this.handleTranscriptionEvent(message.data);
        break;
      default:
        // Was labelled "[Deepgram]" and stringified the event object, not the message.
        console.log(`[Gladia] Unhandled event: ${JSON.stringify(message)}`);
        break;
    }
  }
};
|
|
719
988
|
|
|
720
989
|
// src/voice/textToSpeech/base.ts
|
|
721
990
|
var MagmaFlowTextToSpeech = class {
|
|
@@ -901,6 +1170,9 @@ exports.DeepgramSTT = DeepgramSTT;
|
|
|
901
1170
|
exports.DeepgramTTS = DeepgramTTS;
|
|
902
1171
|
exports.ElevenLabsTTS = ElevenLabsTTS;
|
|
903
1172
|
exports.ElevenLabsVoice = ElevenLabsVoice;
|
|
1173
|
+
exports.GladiaLanguage = GladiaLanguage;
|
|
1174
|
+
exports.GladiaModel = GladiaModel;
|
|
1175
|
+
exports.GladiaSTT = GladiaSTT;
|
|
904
1176
|
exports.HumeTTS = HumeTTS;
|
|
905
1177
|
exports.MagmaFlow = MagmaFlow;
|
|
906
1178
|
exports.MagmaFlowSpeechToText = MagmaFlowSpeechToText;
|
package/dist/voice.mjs
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { DeepgramClient, LiveTranscriptionEvents } from '@deepgram/sdk';
|
|
2
|
+
import { WebSocket as WebSocket$1 } from 'ws';
|
|
2
3
|
import { HumeClient } from 'hume';
|
|
3
4
|
import OpenAI from 'openai';
|
|
4
5
|
|
|
@@ -687,11 +688,6 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
|
687
688
|
} else {
|
|
688
689
|
if (currentTurn) {
|
|
689
690
|
currentTurn.confidence = currentTurnConfidence / currentTurnWordCount;
|
|
690
|
-
if (currentTurn.confidence < 0.5) {
|
|
691
|
-
currentTurn.text = "[inaudible]";
|
|
692
|
-
} else if (currentTurn.confidence < 0.75) {
|
|
693
|
-
currentTurn.text = `[unclear, confidence=${currentTurn.confidence.toFixed(2)}] ${currentTurn.text}`;
|
|
694
|
-
}
|
|
695
691
|
turns.push(currentTurn);
|
|
696
692
|
}
|
|
697
693
|
currentTurn = { speaker, text: utterance, confidence: 0 };
|
|
@@ -710,6 +706,279 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
|
|
|
710
706
|
}
|
|
711
707
|
}
|
|
712
708
|
};
|
|
709
|
+
// True when the runtime exposes a global WebSocket constructor; QueueWebSocket
// uses it to choose between the native class and the `ws` package.
var NATIVE_WEBSOCKET_AVAILABLE = !(typeof WebSocket === "undefined");
|
|
710
|
+
/**
 * Inert stand-in used when no URL is known yet. It exposes the small part of
 * the WebSocket surface that QueueWebSocket touches and always reports itself
 * as CLOSED, so nothing is ever sent through it.
 */
var DummyWebSocket = class {
  // A dummy socket has no endpoint, whatever the caller passes in.
  url = null;
  constructor(url) {
  }
  get readyState() {
    return 3 /* CLOSED */;
  }
  // Listener registration and close() are accepted and ignored.
  addEventListener() {
  }
  close() {
  }
  // Sending is a programming error on a dummy socket; report it and drop the data.
  send(data) {
    console.error("Data send attempted through dummy websocket");
  }
};
|
|
726
|
+
/**
 * WebSocket wrapper that buffers outgoing payloads until the underlying
 * socket is OPEN and then sends them in FIFO order.
 *
 * The underlying socket is the native `WebSocket` when the runtime has one,
 * the `ws` package's `WebSocket` (imported as `WebSocket$1`) otherwise, or a
 * `DummyWebSocket` when no URL was given.
 */
var QueueWebSocket = class {
  /** Payloads waiting to be sent, oldest first. */
  wsQueue = [];
  /** Re-entrancy guard: true only while handleQueue() is draining. */
  handlingQueue = false;
  /** Endpoint passed to the constructor; a falsy value selects the dummy socket. */
  url;
  /** The underlying socket instance. */
  ws;
  /**
   * @param url WebSocket endpoint, or null/empty for an inert placeholder.
   */
  constructor(url) {
    this.url = url;
    this.ws = this.createSocket();
  }
  /**
   * Builds the underlying socket for `this.url` and wires the "open" listener
   * that drains the queue. Shared by the constructor and connect() so a
   * reconnected socket flushes the queue too (connect() used to skip the
   * listener, leaving queued payloads stuck).
   */
  createSocket() {
    let socket;
    if (!this.url) {
      socket = new DummyWebSocket(null);
    } else if (NATIVE_WEBSOCKET_AVAILABLE) {
      socket = new WebSocket(this.url);
    } else {
      socket = new WebSocket$1(this.url);
    }
    socket.addEventListener("open", () => {
      this.handleQueue();
    });
    return socket;
  }
  /** Replaces the underlying socket with a fresh one for the same URL. */
  connect() {
    this.ws = this.createSocket();
  }
  /** Closes the underlying socket, forwarding the optional code and reason. */
  close(code, reason) {
    this.ws.close(code, reason);
  }
  /**
   * Sends queued payloads for as long as the socket is OPEN.
   *
   * Drains iteratively (the previous version recursed once per payload) and
   * always clears the re-entrancy flag, even if the socket's send() throws.
   * Falsy payloads such as "" are sent like any other instead of being
   * mistaken for an empty queue.
   *
   * @param cb Accepted for backward compatibility; not used.
   */
  handleQueue(cb) {
    if (this.handlingQueue) {
      console.log(`[Gladia] handleQueue: Already running`);
      return;
    }
    this.handlingQueue = true;
    try {
      while (this.wsQueue.length > 0 && this.readyState === 1 /* OPEN */) {
        // Remove before sending so a payload that makes send() throw is not
        // retried forever; the exception still reaches the caller.
        const data = this.wsQueue.shift();
        this.ws.send(data);
      }
    } finally {
      this.handlingQueue = false;
    }
  }
  /** Queues a payload and sends it immediately when the socket is OPEN. */
  send(data) {
    this.wsQueue.push(data);
    this.handleQueue();
  }
  /** readyState of the underlying socket (0 CONNECTING, 1 OPEN, 2 CLOSING, 3 CLOSED). */
  get readyState() {
    return this.ws.readyState;
  }
};
|
|
788
|
+
|
|
789
|
+
// src/voice/speechToText/gladia.ts
|
|
790
|
+
// Runtime object behind the `GladiaModel` string enum: member name -> wire value.
var GladiaModel = /* @__PURE__ */ (() => {
  const members = GladiaModel || {};
  members["SOLARIA_1"] = "solaria-1";
  return members;
})();
|
|
794
|
+
// Runtime object behind the `GladiaLanguage` string enum: member name -> wire value.
var GladiaLanguage = /* @__PURE__ */ (() => {
  const members = GladiaLanguage || {};
  members["EN"] = "en";
  return members;
})();
|
|
798
|
+
/**
 * Speech-to-text adapter for Gladia's live transcription API.
 *
 * Audio passed to input() is queued on a QueueWebSocket. The first chunk
 * triggers setup(), which POSTs the session configuration to
 * https://api.gladia.io/v2/live and then opens the WebSocket URL returned in
 * the response. Final transcripts are buffered as turns and forwarded through
 * onOutput() once a "speech_end" message has been received.
 */
var GladiaSTT = class extends MagmaFlowSpeechToText {
  /** QueueWebSocket for the session; a URL-less placeholder until setup succeeds. */
  connection;
  /** WebSocket URL returned by the session request, or null before the first success. */
  connectionUrl = null;
  /** True while a setup attempt is in flight so repeated input() calls do not start another. */
  settingUp = false;
  /** Key sent in the X-Gladia-Key header. */
  apiKey;
  /** Session configuration posted to Gladia. */
  config;
  /** Final turns collected since the last onOutput() call. */
  turnBuffer = [];
  /** Set by "speech_end", cleared by "speech_start" and after each output. */
  utteranceEnded = false;
  /**
   * @param args.model Transcription model.
   * @param args.apiKey Gladia API key; falls back to process.env.GLADIA_API_KEY.
   * @param args.config Extra session options, spread over the defaults below.
   * @throws Error when no API key is available.
   */
  constructor(args) {
    super();
    this.config = {
      model: args.model,
      encoding: "wav/pcm",
      sample_rate: 48e3,
      bit_depth: 16,
      channels: 1,
      // Was 1. The config is now actually sent, and Gladia's reference gives
      // 5-60 s as the accepted range for this field.
      // NOTE(review): confirm the range against the current API reference.
      maximum_duration_without_endpointing: 5,
      ...args.config
    };
    // `||` rather than `??` so an empty-string argument does not shadow the env key.
    const apiKey = args.apiKey || process.env.GLADIA_API_KEY;
    if (!apiKey) {
      throw new Error("GLADIA_API_KEY not supplied and not found in env");
    }
    this.apiKey = apiKey;
    this.connection = new QueueWebSocket(null);
  }
  /**
   * Starts a session: reuses the known WebSocket URL when there is one,
   * otherwise requests a new one. Failures are logged and clear `settingUp`
   * so a later input() can retry; they never surface as unhandled rejections.
   */
  setup() {
    if (this.settingUp) {
      return;
    }
    this.settingUp = true;
    if (this.connectionUrl) {
      this.connectWS();
      return;
    }
    console.log("[Gladia] Setup: Getting connection url");
    fetch("https://api.gladia.io/v2/live", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "X-Gladia-Key": this.apiKey
      },
      // Send the merged config. A hard-coded body used to be sent here, which
      // silently ignored `model` and every option passed in `args.config`.
      body: JSON.stringify(this.config)
    }).then(async (response) => {
      if (!response.ok) {
        console.error("[Gladia] Could not get WS url");
        const errorMessage = `${response.status}: ${await response.text() || response.statusText}`;
        throw new Error(errorMessage);
      }
      const json = await response.json();
      if (typeof json?.url !== "string" || json.url === "") {
        throw new Error("response did not contain a WebSocket url");
      }
      this.connectionUrl = json.url;
      this.connectWS();
    }).catch((error) => {
      // Covers network errors, HTTP errors and malformed responses alike.
      console.error(error instanceof Error ? error.message : error);
      this.settingUp = false;
    });
  }
  /**
   * Opens the WebSocket for `connectionUrl` and attaches the event handlers.
   * Every exit path clears `settingUp`; the early returns used to leave it
   * set, which blocked all reconnects after the socket closed.
   */
  connectWS() {
    // Clear first so an exception from the socket constructor cannot leave the flag stuck.
    this.settingUp = false;
    if (!this.connectionUrl) {
      console.log(`[Gladia] ConnectWS: No connection url`);
      return;
    }
    if (this.connection.readyState === 0 /* CONNECTING */) {
      return;
    }
    console.log("[Gladia] ConnectWS: Connecting to", this.connectionUrl);
    // Audio queued while the URL was being fetched sits on the previous
    // connection. Carry it over so the start of the utterance is not lost.
    // String payloads are control messages (e.g. stop_recording) and must not
    // be replayed on a fresh socket.
    const pendingAudio = (this.connection.wsQueue ?? []).filter(
      (payload) => typeof payload !== "string"
    );
    this.connection = new QueueWebSocket(this.connectionUrl);
    this.connection.wsQueue = pendingAudio.concat(this.connection.wsQueue);
    this.connection.ws.addEventListener("error", (event) => {
      // Event objects stringify to little or nothing; prefer the message when present.
      console.log(`[Gladia] Error: ${event?.message ?? JSON.stringify(event)}`);
    });
    this.connection.ws.addEventListener("close", (event) => {
      // NOTE(review): Gladia appears to treat close code 1000 as "session
      // ended"; if so, connectionUrl should be cleared here - confirm.
      console.log(`[Gladia] Close: code=${event?.code} reason=${event?.reason}`);
    });
    this.connection.ws.addEventListener("open", this.onOpen.bind(this));
    this.connection.ws.addEventListener("message", this.handleMessage.bind(this));
  }
  /**
   * Queues one audio chunk and makes sure a session is being set up.
   * @param audio Raw audio bytes.
   */
  input(audio) {
    // A Buffer can be a view onto a larger, shared ArrayBuffer. Copy exactly
    // this chunk's bytes; sending `audio.buffer` would transmit the whole
    // backing store, including unrelated data.
    const chunk = audio.buffer.slice(audio.byteOffset, audio.byteOffset + audio.byteLength);
    this.connection.send(chunk);
    if (this.connection.readyState !== 1 /* OPEN */) {
      this.setup();
    }
  }
  /** Sends a stop_recording message through the connection. */
  flush() {
    console.log("[Gladia] Flush: sending stop_recording messsage");
    this.connection.send(
      JSON.stringify({
        type: "stop_recording"
      })
    );
  }
  /** Closes the socket with code 1000 and swaps in an empty placeholder connection. */
  kill() {
    console.log("[Gladia] Kill: Closing connection");
    this.connection?.close(1e3);
    this.connection = new QueueWebSocket(null);
  }
  /**
   * Handles a "transcript" payload: reports speech for any non-empty text and
   * buffers the turn when the transcript is final.
   */
  handleTranscriptionEvent(transcriptionEvent) {
    const text = transcriptionEvent?.utterance?.text;
    if (typeof text !== "string" || text.trim() === "") {
      return;
    }
    this.onSpeechDetected();
    if (transcriptionEvent.is_final) {
      const turns = this.computeTurns(transcriptionEvent);
      this.turnBuffer = this.turnBuffer.concat(turns);
      this.sendOutput();
    }
  }
  /** Marks the utterance as finished and emits whatever is buffered. */
  handleUtteranceEnd() {
    this.utteranceEnded = true;
    this.sendOutput();
  }
  /**
   * Emits the buffered turns through onOutput() once the utterance has ended
   * and at least one turn is buffered, then resets the buffer. `turns` is
   * only included when every buffered turn carries a speaker.
   */
  sendOutput() {
    if (!this.utteranceEnded) {
      return;
    }
    if (this.turnBuffer.length === 0) {
      return;
    }
    const text = this.turnBuffer.map((turn) => turn.text).join(" ");
    let turns = void 0;
    if (this.turnBuffer.every((turn) => turn.speaker !== void 0 && turn.speaker !== null)) {
      turns = this.turnBuffer.reduce((acc, turn) => {
        const previous = acc.at(-1);
        if (previous && previous.speaker === turn.speaker) {
          // Separate merged texts with a space, matching the joined `text` above.
          previous.text += ` ${turn.text}`;
        } else {
          acc.push(turn);
        }
        return acc;
      }, []);
    }
    this.onOutput({
      text,
      turns
    });
    this.turnBuffer = [];
    this.utteranceEnded = false;
  }
  /** Logs that the WebSocket opened. */
  onOpen() {
    console.log(`[Gladia] Connected`);
  }
  /**
   * Converts a transcript payload into a one-element list of turns. The end
   * time is "now"; the start is derived from the utterance's start/end
   * offsets, which are multiplied by 1000 to get milliseconds.
   * @returns The turn list, or [] when the payload cannot be read.
   */
  computeTurns(transcript) {
    try {
      const nowMs = (/* @__PURE__ */ new Date()).getTime();
      const durationMs = (transcript.utterance.end - transcript.utterance.start) * 1e3;
      const startMs = nowMs - durationMs;
      const turns = [
        {
          text: transcript.utterance.text,
          confidence: transcript.utterance.confidence,
          durationMs,
          start: new Date(startMs),
          end: new Date(nowMs),
          speaker: transcript.utterance.speaker
        }
      ];
      return turns;
    } catch (error) {
      console.error(error);
      return [];
    }
  }
  /** Parses one WebSocket message and dispatches on its `type`. */
  handleMessage(event) {
    let message;
    try {
      message = JSON.parse(event.data.toString());
    } catch (error) {
      // A malformed frame must not throw out of the socket's event listener.
      console.error(`[Gladia] Could not parse message: ${error}`);
      return;
    }
    switch (message?.type) {
      case "audio_chunk":
        break;
      case "speech_start":
        this.utteranceEnded = false;
        this.onSpeechDetected();
        break;
      case "speech_end":
        this.handleUtteranceEnd();
        break;
      case "transcript":
        this.handleTranscriptionEvent(message.data);
        break;
      default:
        // Was labelled "[Deepgram]" and stringified the event object, not the message.
        console.log(`[Gladia] Unhandled event: ${JSON.stringify(message)}`);
        break;
    }
  }
};
|
|
713
982
|
|
|
714
983
|
// src/voice/textToSpeech/base.ts
|
|
715
984
|
var MagmaFlowTextToSpeech = class {
|
|
@@ -889,4 +1158,4 @@ var WhisperTTS = class extends MagmaFlowTextToSpeech {
|
|
|
889
1158
|
}
|
|
890
1159
|
};
|
|
891
1160
|
|
|
892
|
-
export { DeepgramLanguage, DeepgramModel, DeepgramSTT, DeepgramTTS, ElevenLabsTTS, ElevenLabsVoice, HumeTTS, MagmaFlow, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, WhisperTTS, splitTextIntoChunks };
|
|
1161
|
+
export { DeepgramLanguage, DeepgramModel, DeepgramSTT, DeepgramTTS, ElevenLabsTTS, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, HumeTTS, MagmaFlow, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, WhisperTTS, splitTextIntoChunks };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pompeii-labs/audio",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.2.2",
|
|
4
4
|
"description": "The Audio SDK from Pompeii Labs",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"Pompeii",
|
|
@@ -47,6 +47,7 @@
|
|
|
47
47
|
},
|
|
48
48
|
"devDependencies": {
|
|
49
49
|
"@types/node": "^22.16.0",
|
|
50
|
+
"@types/ws": "^8.18.1",
|
|
50
51
|
"@typescript-eslint/eslint-plugin": "^6.21.0",
|
|
51
52
|
"@typescript-eslint/parser": "^6.21.0",
|
|
52
53
|
"eslint": "^8.57.1",
|
|
@@ -55,5 +56,8 @@
|
|
|
55
56
|
"prettier": "^3.6.2",
|
|
56
57
|
"tsup": "^8.5.0",
|
|
57
58
|
"typescript": "^5.8.3"
|
|
59
|
+
},
|
|
60
|
+
"peerDependencies": {
|
|
61
|
+
"ws": "^8.0.0"
|
|
58
62
|
}
|
|
59
63
|
}
|