@pompeii-labs/audio 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/voice.d.mts CHANGED
@@ -107,6 +107,115 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
107
107
  private computeTurns;
108
108
  }
109
109
 
110
+ declare enum GladiaModel {
111
+ SOLARIA_1 = "solaria-1"
112
+ }
113
+ declare enum GladiaLanguage {
114
+ EN = "en"
115
+ }
116
+ type GladiaConfig = {
117
+ model: GladiaModel;
118
+ encoding?: 'wav/pcm' | 'wav/alaw' | 'wav/ulaw';
119
+ sample_rate?: number;
120
+ bit_depth?: number;
121
+ channels?: number;
122
+ custom_metadata?: Record<string, unknown>;
123
+ endpointing?: number;
124
+ maximum_duration_without_endpointing?: number;
125
+ language_config?: {
126
+ languages?: GladiaLanguage[];
127
+ };
128
+ pre_processing?: {
129
+ audio_enhancer?: boolean;
130
+ speech_threshold?: number;
131
+ };
132
+ realtime_processing?: {
133
+ custom_vocabulary?: boolean;
134
+ custom_vocabulary_config?: {
135
+ vocabulary: {
136
+ value: string;
137
+ intensity?: number;
138
+ pronunciations?: string[];
139
+ language?: string;
140
+ }[];
141
+ default_intensity?: number;
142
+ };
143
+ custom_spelling?: boolean;
144
+ custom_spelling_config?: {
145
+ spelling_dictionary: Record<string, string[]>;
146
+ };
147
+ translation?: boolean;
148
+ translation_config?: {
149
+ target_languages: GladiaLanguage[];
150
+ model?: 'base' | 'enhanced';
151
+ match_original_utterances?: boolean;
152
+ lipsync?: boolean;
153
+ context_adaptation?: boolean;
154
+ context?: string;
155
+ informal?: boolean;
156
+ };
157
+ named_entity_recognition?: boolean;
158
+ sentiment_analysis?: boolean;
159
+ };
160
+ post_processing?: {
161
+ summarization?: boolean;
162
+ summarization_config?: {
163
+ type?: 'general' | 'bullet_points' | 'concise';
164
+ };
165
+ chapterization?: boolean;
166
+ };
167
+ messages_config?: {
168
+ receive_partial_transcripts?: boolean;
169
+ receive_final_transcripts?: boolean;
170
+ receive_speech_events?: boolean;
171
+ receive_pre_processing_events?: boolean;
172
+ receive_realtime_processing_events?: boolean;
173
+ receive_post_processing_events?: boolean;
174
+ receive_acknowledgements?: boolean;
175
+ receive_errors?: boolean;
176
+ receive_lifecycle_events?: boolean;
177
+ };
178
+ callback?: boolean;
179
+ callback_config?: {
180
+ url: string;
181
+ receive_partial_transcripts?: boolean;
182
+ receive_final_transcripts?: boolean;
183
+ receive_speech_events?: boolean;
184
+ receive_pre_processing_events?: boolean;
185
+ receive_realtime_processing_events?: boolean;
186
+ receive_post_processing_events?: boolean;
187
+ receive_acknowledgements?: boolean;
188
+ receive_errors?: boolean;
189
+ receive_lifecycle_events?: boolean;
190
+ };
191
+ };
192
+ type GladiaSTTArgs = {
193
+ model: GladiaModel;
194
+ apiKey?: string;
195
+ config?: Omit<GladiaConfig, 'model' | 'encoding' | 'sample_rate' | 'channels'>;
196
+ };
197
+ declare class GladiaSTT extends MagmaFlowSpeechToText {
198
+ private connection;
199
+ private connectionUrl;
200
+ private settingUp;
201
+ private apiKey;
202
+ private config;
203
+ private turnBuffer;
204
+ private utteranceEnded;
205
+ constructor(args: GladiaSTTArgs);
206
+ private setup;
207
+ private connectWS;
208
+ input(audio: Buffer): void;
209
+ flush(): void;
210
+ kill(): void;
211
+ private handleTranscriptionEvent;
212
+ private handleUtteranceEnd;
213
+ private sendOutput;
214
+ private onOpen;
215
+ private computeTurns;
216
+ private handleMessage;
217
+ }
218
+
110
219
  type DeepgramTTSArgs = {
111
220
  client?: DeepgramClient;
112
221
  };
@@ -193,4 +302,4 @@ declare class WhisperTTS extends MagmaFlowTextToSpeech {
193
302
 
194
303
  declare function splitTextIntoChunks(text: string, targetLength?: number): string[];
195
304
 
196
- export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
305
+ export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
package/dist/voice.d.ts CHANGED
@@ -107,6 +107,115 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
107
107
  private computeTurns;
108
108
  }
109
109
 
110
+ declare enum GladiaModel {
111
+ SOLARIA_1 = "solaria-1"
112
+ }
113
+ declare enum GladiaLanguage {
114
+ EN = "en"
115
+ }
116
+ type GladiaConfig = {
117
+ model: GladiaModel;
118
+ encoding?: 'wav/pcm' | 'wav/alaw' | 'wav/ulaw';
119
+ sample_rate?: number;
120
+ bit_depth?: number;
121
+ channels?: number;
122
+ custom_metadata?: Record<string, unknown>;
123
+ endpointing?: number;
124
+ maximum_duration_without_endpointing?: number;
125
+ language_config?: {
126
+ languages?: GladiaLanguage[];
127
+ };
128
+ pre_processing?: {
129
+ audio_enhancer?: boolean;
130
+ speech_threshold?: number;
131
+ };
132
+ realtime_processing?: {
133
+ custom_vocabulary?: boolean;
134
+ custom_vocabulary_config?: {
135
+ vocabulary: {
136
+ value: string;
137
+ intensity?: number;
138
+ pronunciations?: string[];
139
+ language?: string;
140
+ }[];
141
+ default_intensity?: number;
142
+ };
143
+ custom_spelling?: boolean;
144
+ custom_spelling_config?: {
145
+ spelling_dictionary: Record<string, string[]>;
146
+ };
147
+ translation?: boolean;
148
+ translation_config?: {
149
+ target_languages: GladiaLanguage[];
150
+ model?: 'base' | 'enhanced';
151
+ match_original_utterances?: boolean;
152
+ lipsync?: boolean;
153
+ context_adaptation?: boolean;
154
+ context?: string;
155
+ informal?: boolean;
156
+ };
157
+ named_entity_recognition?: boolean;
158
+ sentiment_analysis?: boolean;
159
+ };
160
+ post_processing?: {
161
+ summarization?: boolean;
162
+ summarization_config?: {
163
+ type?: 'general' | 'bullet_points' | 'concise';
164
+ };
165
+ chapterization?: boolean;
166
+ };
167
+ messages_config?: {
168
+ receive_partial_transcripts?: boolean;
169
+ receive_final_transcripts?: boolean;
170
+ receive_speech_events?: boolean;
171
+ receive_pre_processing_events?: boolean;
172
+ receive_realtime_processing_events?: boolean;
173
+ receive_post_processing_events?: boolean;
174
+ receive_acknowledgements?: boolean;
175
+ receive_errors?: boolean;
176
+ receive_lifecycle_events?: boolean;
177
+ };
178
+ callback?: boolean;
179
+ callback_config?: {
180
+ url: string;
181
+ receive_partial_transcripts?: boolean;
182
+ receive_final_transcripts?: boolean;
183
+ receive_speech_events?: boolean;
184
+ receive_pre_processing_events?: boolean;
185
+ receive_realtime_processing_events?: boolean;
186
+ receive_post_processing_events?: boolean;
187
+ receive_acknowledgements?: boolean;
188
+ receive_errors?: boolean;
189
+ receive_lifecycle_events?: boolean;
190
+ };
191
+ };
192
+ type GladiaSTTArgs = {
193
+ model: GladiaModel;
194
+ apiKey?: string;
195
+ config?: Omit<GladiaConfig, 'model' | 'encoding' | 'sample_rate' | 'channels'>;
196
+ };
197
+ declare class GladiaSTT extends MagmaFlowSpeechToText {
198
+ private connection;
199
+ private connectionUrl;
200
+ private settingUp;
201
+ private apiKey;
202
+ private config;
203
+ private turnBuffer;
204
+ private utteranceEnded;
205
+ constructor(args: GladiaSTTArgs);
206
+ private setup;
207
+ private connectWS;
208
+ input(audio: Buffer): void;
209
+ flush(): void;
210
+ kill(): void;
211
+ private handleTranscriptionEvent;
212
+ private handleUtteranceEnd;
213
+ private sendOutput;
214
+ private onOpen;
215
+ private computeTurns;
216
+ private handleMessage;
217
+ }
218
+
110
219
  type DeepgramTTSArgs = {
111
220
  client?: DeepgramClient;
112
221
  };
@@ -193,4 +302,4 @@ declare class WhisperTTS extends MagmaFlowTextToSpeech {
193
302
 
194
303
  declare function splitTextIntoChunks(text: string, targetLength?: number): string[];
195
304
 
196
- export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
305
+ export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
package/dist/voice.js CHANGED
@@ -1,6 +1,7 @@
1
1
  'use strict';
2
2
 
3
3
  var sdk = require('@deepgram/sdk');
4
+ var ws = require('ws');
4
5
  var hume = require('hume');
5
6
  var OpenAI = require('openai');
6
7
 
@@ -693,11 +694,6 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
693
694
  } else {
694
695
  if (currentTurn) {
695
696
  currentTurn.confidence = currentTurnConfidence / currentTurnWordCount;
696
- if (currentTurn.confidence < 0.5) {
697
- currentTurn.text = "[inaudible]";
698
- } else if (currentTurn.confidence < 0.75) {
699
- currentTurn.text = `[unclear, confidence=${currentTurn.confidence.toFixed(2)}] ${currentTurn.text}`;
700
- }
701
697
  turns.push(currentTurn);
702
698
  }
703
699
  currentTurn = { speaker, text: utterance, confidence: 0 };
@@ -716,6 +712,279 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
716
712
  }
717
713
  }
718
714
  };
715
+ var NATIVE_WEBSOCKET_AVAILABLE = typeof WebSocket !== "undefined";
716
+ var DummyWebSocket = class {
717
+ url;
718
+ constructor(url) {
719
+ this.url = null;
720
+ }
721
+ send(data) {
722
+ console.error("Data send attempted through dummy websocket");
723
+ }
724
+ close() {
725
+ }
726
+ addEventListener() {
727
+ }
728
+ get readyState() {
729
+ return 3 /* CLOSED */;
730
+ }
731
+ };
732
+ var QueueWebSocket = class {
733
+ wsQueue = [];
734
+ handlingQueue = false;
735
+ url;
736
+ ws;
737
+ constructor(url) {
738
+ this.url = url;
739
+ if (this.url) {
740
+ if (NATIVE_WEBSOCKET_AVAILABLE) {
741
+ this.ws = new WebSocket(this.url);
742
+ } else {
743
+ this.ws = new ws.WebSocket(this.url);
744
+ }
745
+ } else {
746
+ this.ws = new DummyWebSocket(null);
747
+ }
748
+ this.ws.addEventListener("open", () => {
749
+ this.handleQueue();
750
+ });
751
+ }
752
+ connect() {
753
+ if (this.url) {
754
+ if (NATIVE_WEBSOCKET_AVAILABLE) {
755
+ this.ws = new WebSocket(this.url);
756
+ } else {
757
+ this.ws = new ws.WebSocket(this.url);
758
+ }
759
+ } else {
760
+ this.ws = new DummyWebSocket(null);
761
+ }
762
+ }
763
+ close(code, reason) {
764
+ this.ws.close(code, reason);
765
+ }
766
+ handleQueue(cb = this.handleQueue.bind(this)) {
767
+ if (this.handlingQueue) {
768
+ console.log(`[Gladia] handleQueue: Already running`);
769
+ return;
770
+ }
771
+ this.handlingQueue = true;
772
+ const data = this.wsQueue.shift();
773
+ if (!data) {
774
+ this.handlingQueue = false;
775
+ return;
776
+ }
777
+ if (this.readyState === 1 /* OPEN */) {
778
+ this.ws.send(data);
779
+ this.handlingQueue = false;
780
+ this.handleQueue(cb);
781
+ } else {
782
+ this.wsQueue.unshift(data);
783
+ }
784
+ this.handlingQueue = false;
785
+ }
786
+ send(data) {
787
+ this.wsQueue.push(data);
788
+ this.handleQueue();
789
+ }
790
+ get readyState() {
791
+ return this.ws.readyState;
792
+ }
793
+ };
794
+
795
+ // src/voice/speechToText/gladia.ts
796
+ var GladiaModel = /* @__PURE__ */ ((GladiaModel2) => {
797
+ GladiaModel2["SOLARIA_1"] = "solaria-1";
798
+ return GladiaModel2;
799
+ })(GladiaModel || {});
800
+ var GladiaLanguage = /* @__PURE__ */ ((GladiaLanguage2) => {
801
+ GladiaLanguage2["EN"] = "en";
802
+ return GladiaLanguage2;
803
+ })(GladiaLanguage || {});
804
+ var GladiaSTT = class extends MagmaFlowSpeechToText {
805
+ connection;
806
+ connectionUrl = null;
807
+ settingUp = false;
808
+ apiKey;
809
+ config;
810
+ turnBuffer = [];
811
+ utteranceEnded = false;
812
+ constructor(args) {
813
+ super();
814
+ this.config = {
815
+ model: args.model,
816
+ encoding: "wav/pcm",
817
+ sample_rate: 48e3,
818
+ channels: 1,
819
+ maximum_duration_without_endpointing: 1,
820
+ ...args.config
821
+ };
822
+ if (!args.apiKey && !process.env.GLADIA_API_KEY) {
823
+ throw new Error("GLADIA_API_KEY not supplied and not found in env");
824
+ }
825
+ this.apiKey = args.apiKey ?? process.env.GLADIA_API_KEY;
826
+ this.connection = new QueueWebSocket(null);
827
+ }
828
+ setup() {
829
+ if (this.settingUp) {
830
+ return;
831
+ }
832
+ this.settingUp = true;
833
+ if (this.connectionUrl) {
834
+ this.connectWS();
835
+ return;
836
+ }
837
+ console.log("[Gladia] Setup: Getting connection url");
838
+ fetch("https://api.gladia.io/v2/live", {
839
+ method: "POST",
840
+ headers: {
841
+ "Content-Type": "application/json",
842
+ "X-Gladia-Key": this.apiKey
843
+ },
844
+ body: JSON.stringify({
845
+ encoding: "wav/pcm",
846
+ sample_rate: 48e3,
847
+ bit_depth: 16,
848
+ channels: 1
849
+ })
850
+ }).then(async (response) => {
851
+ if (!response.ok) {
852
+ console.error("[Gladia] Could not get WS url");
853
+ const errorMessage = `${response.status}: ${await response.text() || response.statusText}`;
854
+ console.error(errorMessage);
855
+ this.settingUp = false;
856
+ throw new Error(errorMessage);
857
+ }
858
+ const json = await response.json();
859
+ this.connectionUrl = json.url;
860
+ this.connectWS();
861
+ });
862
+ }
863
+ connectWS() {
864
+ if (!this.connectionUrl) {
865
+ console.log(`[Gladia] ConnectWS: No connection url`);
866
+ return;
867
+ }
868
+ if (this.connection.readyState === 0 /* CONNECTING */) {
869
+ return;
870
+ }
871
+ console.log("[Gladia] ConnectWS: Connecting to", this.connectionUrl);
872
+ this.connection = new QueueWebSocket(this.connectionUrl);
873
+ this.settingUp = false;
874
+ this.connection.ws.addEventListener("error", (event) => {
875
+ console.log(`[Gladia] Error: ${JSON.stringify(event)}`);
876
+ });
877
+ this.connection.ws.addEventListener("close", (event) => {
878
+ console.log(`[Gladia] Close: ${JSON.stringify(event)}`);
879
+ });
880
+ this.connection.ws.addEventListener("open", this.onOpen.bind(this));
881
+ this.connection.ws.addEventListener("message", this.handleMessage.bind(this));
882
+ }
883
+ input(audio) {
884
+ this.connection.send(audio.buffer);
885
+ if (this.connection.readyState !== 1 /* OPEN */) {
886
+ this.setup();
887
+ }
888
+ }
889
+ flush() {
890
+ console.log("[Gladia] Flush: sending stop_recording messsage");
891
+ this.connection.send(
892
+ JSON.stringify({
893
+ type: "stop_recording"
894
+ })
895
+ );
896
+ }
897
+ kill() {
898
+ console.log("[Gladia] Kill: Closing connection");
899
+ this.connection?.close(1e3);
900
+ this.connection = new QueueWebSocket(null);
901
+ }
902
+ handleTranscriptionEvent(transcriptionEvent) {
903
+ if (transcriptionEvent.utterance.text.trim() === "") {
904
+ return;
905
+ }
906
+ this.onSpeechDetected();
907
+ if (transcriptionEvent.is_final) {
908
+ const turns = this.computeTurns(transcriptionEvent);
909
+ this.turnBuffer = this.turnBuffer.concat(turns);
910
+ this.sendOutput();
911
+ }
912
+ }
913
+ handleUtteranceEnd() {
914
+ this.utteranceEnded = true;
915
+ this.sendOutput();
916
+ }
917
+ sendOutput() {
918
+ if (!this.utteranceEnded) {
919
+ return;
920
+ }
921
+ if (this.turnBuffer.length === 0) {
922
+ return;
923
+ }
924
+ const text = this.turnBuffer.map((turn) => turn.text).join(" ");
925
+ let turns = void 0;
926
+ if (this.turnBuffer.every((turn) => turn.speaker !== void 0 && turn.speaker !== null)) {
927
+ turns = this.turnBuffer.reduce((acc, turn) => {
928
+ if (acc.at(-1)?.speaker === turn.speaker) {
929
+ acc.at(-1).text += turn.text;
930
+ } else {
931
+ acc.push(turn);
932
+ }
933
+ return acc;
934
+ }, []);
935
+ }
936
+ this.onOutput({
937
+ text,
938
+ turns
939
+ });
940
+ this.turnBuffer = [];
941
+ this.utteranceEnded = false;
942
+ }
943
+ onOpen() {
944
+ console.log(`[Gladia] Connected`);
945
+ }
946
+ computeTurns(transcript) {
947
+ try {
948
+ const nowMs = (/* @__PURE__ */ new Date()).getTime();
949
+ const durationMs = (transcript.utterance.end - transcript.utterance.start) * 1e3;
950
+ const startMs = nowMs - durationMs;
951
+ const turns = [
952
+ {
953
+ text: transcript.utterance.text,
954
+ confidence: transcript.utterance.confidence,
955
+ durationMs,
956
+ start: new Date(startMs),
957
+ end: new Date(nowMs),
958
+ speaker: transcript.utterance.speaker
959
+ }
960
+ ];
961
+ return turns;
962
+ } catch (error) {
963
+ console.error(error);
964
+ return [];
965
+ }
966
+ }
967
+ handleMessage(event) {
968
+ const message = JSON.parse(event.data.toString());
969
+ switch (message.type) {
970
+ case "audio_chunk":
971
+ break;
972
+ case "speech_start":
973
+ this.utteranceEnded = false;
974
+ this.onSpeechDetected();
975
+ break;
976
+ case "speech_end":
977
+ this.handleUtteranceEnd();
978
+ break;
979
+ case "transcript":
980
+ this.handleTranscriptionEvent(message.data);
981
+ break;
982
+ default:
983
+ console.log(`[Deepgram] Unhandled event: ${JSON.stringify(event)}`);
984
+ break;
985
+ }
986
+ }
987
+ };
719
988
 
720
989
  // src/voice/textToSpeech/base.ts
721
990
  var MagmaFlowTextToSpeech = class {
@@ -901,6 +1170,9 @@ exports.DeepgramSTT = DeepgramSTT;
901
1170
  exports.DeepgramTTS = DeepgramTTS;
902
1171
  exports.ElevenLabsTTS = ElevenLabsTTS;
903
1172
  exports.ElevenLabsVoice = ElevenLabsVoice;
1173
+ exports.GladiaLanguage = GladiaLanguage;
1174
+ exports.GladiaModel = GladiaModel;
1175
+ exports.GladiaSTT = GladiaSTT;
904
1176
  exports.HumeTTS = HumeTTS;
905
1177
  exports.MagmaFlow = MagmaFlow;
906
1178
  exports.MagmaFlowSpeechToText = MagmaFlowSpeechToText;
package/dist/voice.mjs CHANGED
@@ -1,4 +1,5 @@
1
1
  import { DeepgramClient, LiveTranscriptionEvents } from '@deepgram/sdk';
2
+ import { WebSocket as WebSocket$1 } from 'ws';
2
3
  import { HumeClient } from 'hume';
3
4
  import OpenAI from 'openai';
4
5
 
@@ -687,11 +688,6 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
687
688
  } else {
688
689
  if (currentTurn) {
689
690
  currentTurn.confidence = currentTurnConfidence / currentTurnWordCount;
690
- if (currentTurn.confidence < 0.5) {
691
- currentTurn.text = "[inaudible]";
692
- } else if (currentTurn.confidence < 0.75) {
693
- currentTurn.text = `[unclear, confidence=${currentTurn.confidence.toFixed(2)}] ${currentTurn.text}`;
694
- }
695
691
  turns.push(currentTurn);
696
692
  }
697
693
  currentTurn = { speaker, text: utterance, confidence: 0 };
@@ -710,6 +706,279 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
710
706
  }
711
707
  }
712
708
  };
709
+ var NATIVE_WEBSOCKET_AVAILABLE = typeof WebSocket !== "undefined";
710
+ var DummyWebSocket = class {
711
+ url;
712
+ constructor(url) {
713
+ this.url = null;
714
+ }
715
+ send(data) {
716
+ console.error("Data send attempted through dummy websocket");
717
+ }
718
+ close() {
719
+ }
720
+ addEventListener() {
721
+ }
722
+ get readyState() {
723
+ return 3 /* CLOSED */;
724
+ }
725
+ };
726
+ var QueueWebSocket = class {
727
+ wsQueue = [];
728
+ handlingQueue = false;
729
+ url;
730
+ ws;
731
+ constructor(url) {
732
+ this.url = url;
733
+ if (this.url) {
734
+ if (NATIVE_WEBSOCKET_AVAILABLE) {
735
+ this.ws = new WebSocket(this.url);
736
+ } else {
737
+ this.ws = new WebSocket$1(this.url);
738
+ }
739
+ } else {
740
+ this.ws = new DummyWebSocket(null);
741
+ }
742
+ this.ws.addEventListener("open", () => {
743
+ this.handleQueue();
744
+ });
745
+ }
746
+ connect() {
747
+ if (this.url) {
748
+ if (NATIVE_WEBSOCKET_AVAILABLE) {
749
+ this.ws = new WebSocket(this.url);
750
+ } else {
751
+ this.ws = new WebSocket$1(this.url);
752
+ }
753
+ } else {
754
+ this.ws = new DummyWebSocket(null);
755
+ }
756
+ }
757
+ close(code, reason) {
758
+ this.ws.close(code, reason);
759
+ }
760
+ handleQueue(cb = this.handleQueue.bind(this)) {
761
+ if (this.handlingQueue) {
762
+ console.log(`[Gladia] handleQueue: Already running`);
763
+ return;
764
+ }
765
+ this.handlingQueue = true;
766
+ const data = this.wsQueue.shift();
767
+ if (!data) {
768
+ this.handlingQueue = false;
769
+ return;
770
+ }
771
+ if (this.readyState === 1 /* OPEN */) {
772
+ this.ws.send(data);
773
+ this.handlingQueue = false;
774
+ this.handleQueue(cb);
775
+ } else {
776
+ this.wsQueue.unshift(data);
777
+ }
778
+ this.handlingQueue = false;
779
+ }
780
+ send(data) {
781
+ this.wsQueue.push(data);
782
+ this.handleQueue();
783
+ }
784
+ get readyState() {
785
+ return this.ws.readyState;
786
+ }
787
+ };
788
+
789
+ // src/voice/speechToText/gladia.ts
790
+ var GladiaModel = /* @__PURE__ */ ((GladiaModel2) => {
791
+ GladiaModel2["SOLARIA_1"] = "solaria-1";
792
+ return GladiaModel2;
793
+ })(GladiaModel || {});
794
+ var GladiaLanguage = /* @__PURE__ */ ((GladiaLanguage2) => {
795
+ GladiaLanguage2["EN"] = "en";
796
+ return GladiaLanguage2;
797
+ })(GladiaLanguage || {});
798
+ var GladiaSTT = class extends MagmaFlowSpeechToText {
799
+ connection;
800
+ connectionUrl = null;
801
+ settingUp = false;
802
+ apiKey;
803
+ config;
804
+ turnBuffer = [];
805
+ utteranceEnded = false;
806
+ constructor(args) {
807
+ super();
808
+ this.config = {
809
+ model: args.model,
810
+ encoding: "wav/pcm",
811
+ sample_rate: 48e3,
812
+ channels: 1,
813
+ maximum_duration_without_endpointing: 1,
814
+ ...args.config
815
+ };
816
+ if (!args.apiKey && !process.env.GLADIA_API_KEY) {
817
+ throw new Error("GLADIA_API_KEY not supplied and not found in env");
818
+ }
819
+ this.apiKey = args.apiKey ?? process.env.GLADIA_API_KEY;
820
+ this.connection = new QueueWebSocket(null);
821
+ }
822
+ setup() {
823
+ if (this.settingUp) {
824
+ return;
825
+ }
826
+ this.settingUp = true;
827
+ if (this.connectionUrl) {
828
+ this.connectWS();
829
+ return;
830
+ }
831
+ console.log("[Gladia] Setup: Getting connection url");
832
+ fetch("https://api.gladia.io/v2/live", {
833
+ method: "POST",
834
+ headers: {
835
+ "Content-Type": "application/json",
836
+ "X-Gladia-Key": this.apiKey
837
+ },
838
+ body: JSON.stringify({
839
+ encoding: "wav/pcm",
840
+ sample_rate: 48e3,
841
+ bit_depth: 16,
842
+ channels: 1
843
+ })
844
+ }).then(async (response) => {
845
+ if (!response.ok) {
846
+ console.error("[Gladia] Could not get WS url");
847
+ const errorMessage = `${response.status}: ${await response.text() || response.statusText}`;
848
+ console.error(errorMessage);
849
+ this.settingUp = false;
850
+ throw new Error(errorMessage);
851
+ }
852
+ const json = await response.json();
853
+ this.connectionUrl = json.url;
854
+ this.connectWS();
855
+ });
856
+ }
857
+ connectWS() {
858
+ if (!this.connectionUrl) {
859
+ console.log(`[Gladia] ConnectWS: No connection url`);
860
+ return;
861
+ }
862
+ if (this.connection.readyState === 0 /* CONNECTING */) {
863
+ return;
864
+ }
865
+ console.log("[Gladia] ConnectWS: Connecting to", this.connectionUrl);
866
+ this.connection = new QueueWebSocket(this.connectionUrl);
867
+ this.settingUp = false;
868
+ this.connection.ws.addEventListener("error", (event) => {
869
+ console.log(`[Gladia] Error: ${JSON.stringify(event)}`);
870
+ });
871
+ this.connection.ws.addEventListener("close", (event) => {
872
+ console.log(`[Gladia] Close: ${JSON.stringify(event)}`);
873
+ });
874
+ this.connection.ws.addEventListener("open", this.onOpen.bind(this));
875
+ this.connection.ws.addEventListener("message", this.handleMessage.bind(this));
876
+ }
877
+ input(audio) {
878
+ this.connection.send(audio.buffer);
879
+ if (this.connection.readyState !== 1 /* OPEN */) {
880
+ this.setup();
881
+ }
882
+ }
883
+ flush() {
884
+ console.log("[Gladia] Flush: sending stop_recording messsage");
885
+ this.connection.send(
886
+ JSON.stringify({
887
+ type: "stop_recording"
888
+ })
889
+ );
890
+ }
891
+ kill() {
892
+ console.log("[Gladia] Kill: Closing connection");
893
+ this.connection?.close(1e3);
894
+ this.connection = new QueueWebSocket(null);
895
+ }
896
+ handleTranscriptionEvent(transcriptionEvent) {
897
+ if (transcriptionEvent.utterance.text.trim() === "") {
898
+ return;
899
+ }
900
+ this.onSpeechDetected();
901
+ if (transcriptionEvent.is_final) {
902
+ const turns = this.computeTurns(transcriptionEvent);
903
+ this.turnBuffer = this.turnBuffer.concat(turns);
904
+ this.sendOutput();
905
+ }
906
+ }
907
+ handleUtteranceEnd() {
908
+ this.utteranceEnded = true;
909
+ this.sendOutput();
910
+ }
911
+ sendOutput() {
912
+ if (!this.utteranceEnded) {
913
+ return;
914
+ }
915
+ if (this.turnBuffer.length === 0) {
916
+ return;
917
+ }
918
+ const text = this.turnBuffer.map((turn) => turn.text).join(" ");
919
+ let turns = void 0;
920
+ if (this.turnBuffer.every((turn) => turn.speaker !== void 0 && turn.speaker !== null)) {
921
+ turns = this.turnBuffer.reduce((acc, turn) => {
922
+ if (acc.at(-1)?.speaker === turn.speaker) {
923
+ acc.at(-1).text += turn.text;
924
+ } else {
925
+ acc.push(turn);
926
+ }
927
+ return acc;
928
+ }, []);
929
+ }
930
+ this.onOutput({
931
+ text,
932
+ turns
933
+ });
934
+ this.turnBuffer = [];
935
+ this.utteranceEnded = false;
936
+ }
937
+ onOpen() {
938
+ console.log(`[Gladia] Connected`);
939
+ }
940
+ computeTurns(transcript) {
941
+ try {
942
+ const nowMs = (/* @__PURE__ */ new Date()).getTime();
943
+ const durationMs = (transcript.utterance.end - transcript.utterance.start) * 1e3;
944
+ const startMs = nowMs - durationMs;
945
+ const turns = [
946
+ {
947
+ text: transcript.utterance.text,
948
+ confidence: transcript.utterance.confidence,
949
+ durationMs,
950
+ start: new Date(startMs),
951
+ end: new Date(nowMs),
952
+ speaker: transcript.utterance.speaker
953
+ }
954
+ ];
955
+ return turns;
956
+ } catch (error) {
957
+ console.error(error);
958
+ return [];
959
+ }
960
+ }
961
+ handleMessage(event) {
962
+ const message = JSON.parse(event.data.toString());
963
+ switch (message.type) {
964
+ case "audio_chunk":
965
+ break;
966
+ case "speech_start":
967
+ this.utteranceEnded = false;
968
+ this.onSpeechDetected();
969
+ break;
970
+ case "speech_end":
971
+ this.handleUtteranceEnd();
972
+ break;
973
+ case "transcript":
974
+ this.handleTranscriptionEvent(message.data);
975
+ break;
976
+ default:
977
+ console.log(`[Deepgram] Unhandled event: ${JSON.stringify(event)}`);
978
+ break;
979
+ }
980
+ }
981
+ };
713
982
 
714
983
  // src/voice/textToSpeech/base.ts
715
984
  var MagmaFlowTextToSpeech = class {
@@ -889,4 +1158,4 @@ var WhisperTTS = class extends MagmaFlowTextToSpeech {
889
1158
  }
890
1159
  };
891
1160
 
892
- export { DeepgramLanguage, DeepgramModel, DeepgramSTT, DeepgramTTS, ElevenLabsTTS, ElevenLabsVoice, HumeTTS, MagmaFlow, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, WhisperTTS, splitTextIntoChunks };
1161
+ export { DeepgramLanguage, DeepgramModel, DeepgramSTT, DeepgramTTS, ElevenLabsTTS, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, HumeTTS, MagmaFlow, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, WhisperTTS, splitTextIntoChunks };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pompeii-labs/audio",
3
- "version": "0.2.0",
3
+ "version": "0.2.2",
4
4
  "description": "The Audio SDK from Pompeii Labs",
5
5
  "keywords": [
6
6
  "Pompeii",
@@ -47,6 +47,7 @@
47
47
  },
48
48
  "devDependencies": {
49
49
  "@types/node": "^22.16.0",
50
+ "@types/ws": "^8.18.1",
50
51
  "@typescript-eslint/eslint-plugin": "^6.21.0",
51
52
  "@typescript-eslint/parser": "^6.21.0",
52
53
  "eslint": "^8.57.1",
@@ -55,5 +56,8 @@
55
56
  "prettier": "^3.6.2",
56
57
  "tsup": "^8.5.0",
57
58
  "typescript": "^5.8.3"
59
+ },
60
+ "peerDependencies": {
61
+ "ws": "^8.0.0"
58
62
  }
59
63
  }