@pompeii-labs/audio 0.2.2 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/voice.d.mts CHANGED
@@ -43,6 +43,7 @@ type MagmaFlowArgs = {
43
43
  onSpeechDetected: () => void;
44
44
  onTranscription: (transcription: MagmaFlowSTTOutput) => void;
45
45
  onAudioOutput: (audio: Buffer) => void;
46
+ onNormalizedAudio?: (audio: Buffer) => void;
46
47
  config?: MagmaFlowConfig;
47
48
  };
48
49
  declare class MagmaFlow {
@@ -51,6 +52,7 @@ declare class MagmaFlow {
51
52
  private inputFormat;
52
53
  private outputFormat;
53
54
  private onAudioOutput;
55
+ private onNormalizedAudio?;
54
56
  private textBuffer;
55
57
  private textQueue;
56
58
  private generatingAudio;
@@ -107,6 +109,36 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
107
109
  private computeTurns;
108
110
  }
109
111
 
112
+ type DeepgramFluxConfig = {
113
+ apiKey?: string;
114
+ eotThreshold?: number;
115
+ eagerEotThreshold?: number;
116
+ eotTimeoutMs?: number;
117
+ };
118
+ declare class DeepgramFluxSTT extends MagmaFlowSpeechToText {
119
+ private ws;
120
+ private apiKey;
121
+ private eotThreshold;
122
+ private eagerEotThreshold?;
123
+ private eotTimeoutMs;
124
+ private audioQueue;
125
+ private connecting;
126
+ private killed;
127
+ private reconnectAttempts;
128
+ private reconnectTimer?;
129
+ constructor(config?: DeepgramFluxConfig);
130
+ private connect;
131
+ private scheduleReconnect;
132
+ private clearConnection;
133
+ private handleMessage;
134
+ private handleTurnInfo;
135
+ input(audio: Buffer): void;
136
+ flush(): void;
137
+ kill(): void;
138
+ onEagerEndOfTurn(transcript: string): void;
139
+ onTurnResumed(): void;
140
+ }
141
+
110
142
  declare enum GladiaModel {
111
143
  SOLARIA_1 = "solaria-1"
112
144
  }
@@ -278,9 +310,21 @@ declare class ElevenLabsTTS extends MagmaFlowTextToSpeech {
278
310
 
279
311
  type HumeTTSArgs = {
280
312
  client?: HumeClient;
313
+ voice?: {
314
+ name?: string;
315
+ id?: string;
316
+ provider?: string;
317
+ };
318
+ description?: string;
319
+ speed?: number;
320
+ version?: string;
281
321
  };
282
322
  declare class HumeTTS extends MagmaFlowTextToSpeech {
283
323
  private client;
324
+ private voice?;
325
+ private description?;
326
+ private speed?;
327
+ private version?;
284
328
  constructor(args: HumeTTSArgs);
285
329
  setup(): Promise<void>;
286
330
  input(text: string | null, requestId: string): void;
@@ -302,4 +346,4 @@ declare class WhisperTTS extends MagmaFlowTextToSpeech {
302
346
 
303
347
  declare function splitTextIntoChunks(text: string, targetLength?: number): string[];
304
348
 
305
- export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
349
+ export { type DeepgramFluxConfig, DeepgramFluxSTT, DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
package/dist/voice.d.ts CHANGED
@@ -43,6 +43,7 @@ type MagmaFlowArgs = {
43
43
  onSpeechDetected: () => void;
44
44
  onTranscription: (transcription: MagmaFlowSTTOutput) => void;
45
45
  onAudioOutput: (audio: Buffer) => void;
46
+ onNormalizedAudio?: (audio: Buffer) => void;
46
47
  config?: MagmaFlowConfig;
47
48
  };
48
49
  declare class MagmaFlow {
@@ -51,6 +52,7 @@ declare class MagmaFlow {
51
52
  private inputFormat;
52
53
  private outputFormat;
53
54
  private onAudioOutput;
55
+ private onNormalizedAudio?;
54
56
  private textBuffer;
55
57
  private textQueue;
56
58
  private generatingAudio;
@@ -107,6 +109,36 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
107
109
  private computeTurns;
108
110
  }
109
111
 
112
+ type DeepgramFluxConfig = {
113
+ apiKey?: string;
114
+ eotThreshold?: number;
115
+ eagerEotThreshold?: number;
116
+ eotTimeoutMs?: number;
117
+ };
118
+ declare class DeepgramFluxSTT extends MagmaFlowSpeechToText {
119
+ private ws;
120
+ private apiKey;
121
+ private eotThreshold;
122
+ private eagerEotThreshold?;
123
+ private eotTimeoutMs;
124
+ private audioQueue;
125
+ private connecting;
126
+ private killed;
127
+ private reconnectAttempts;
128
+ private reconnectTimer?;
129
+ constructor(config?: DeepgramFluxConfig);
130
+ private connect;
131
+ private scheduleReconnect;
132
+ private clearConnection;
133
+ private handleMessage;
134
+ private handleTurnInfo;
135
+ input(audio: Buffer): void;
136
+ flush(): void;
137
+ kill(): void;
138
+ onEagerEndOfTurn(transcript: string): void;
139
+ onTurnResumed(): void;
140
+ }
141
+
110
142
  declare enum GladiaModel {
111
143
  SOLARIA_1 = "solaria-1"
112
144
  }
@@ -278,9 +310,21 @@ declare class ElevenLabsTTS extends MagmaFlowTextToSpeech {
278
310
 
279
311
  type HumeTTSArgs = {
280
312
  client?: HumeClient;
313
+ voice?: {
314
+ name?: string;
315
+ id?: string;
316
+ provider?: string;
317
+ };
318
+ description?: string;
319
+ speed?: number;
320
+ version?: string;
281
321
  };
282
322
  declare class HumeTTS extends MagmaFlowTextToSpeech {
283
323
  private client;
324
+ private voice?;
325
+ private description?;
326
+ private speed?;
327
+ private version?;
284
328
  constructor(args: HumeTTSArgs);
285
329
  setup(): Promise<void>;
286
330
  input(text: string | null, requestId: string): void;
@@ -302,4 +346,4 @@ declare class WhisperTTS extends MagmaFlowTextToSpeech {
302
346
 
303
347
  declare function splitTextIntoChunks(text: string, targetLength?: number): string[];
304
348
 
305
- export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
349
+ export { type DeepgramFluxConfig, DeepgramFluxSTT, DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
package/dist/voice.js CHANGED
@@ -1,12 +1,13 @@
1
1
  'use strict';
2
2
 
3
3
  var sdk = require('@deepgram/sdk');
4
- var ws = require('ws');
4
+ var WebSocket2 = require('ws');
5
5
  var hume = require('hume');
6
6
  var OpenAI = require('openai');
7
7
 
8
8
  function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
9
9
 
10
+ var WebSocket2__default = /*#__PURE__*/_interopDefault(WebSocket2);
10
11
  var OpenAI__default = /*#__PURE__*/_interopDefault(OpenAI);
11
12
 
12
13
  // src/helpers/bufferToInt16Array.ts
@@ -424,6 +425,7 @@ var MagmaFlow = class {
424
425
  inputFormat;
425
426
  outputFormat;
426
427
  onAudioOutput;
428
+ onNormalizedAudio;
427
429
  textBuffer = "";
428
430
  textQueue = [];
429
431
  generatingAudio = false;
@@ -440,6 +442,7 @@ var MagmaFlow = class {
440
442
  this.inputFormat = args.inputFormat;
441
443
  this.outputFormat = args.outputFormat;
442
444
  this.onAudioOutput = args.onAudioOutput;
445
+ this.onNormalizedAudio = args.onNormalizedAudio;
443
446
  this.config = { ...this.config, ...args.config };
444
447
  this.tts.onOutput = (audio, requestId) => {
445
448
  if (this.currentRequestId !== requestId) {
@@ -478,7 +481,9 @@ var MagmaFlow = class {
478
481
  inputAudio(audio) {
479
482
  const decodedAudio = decodeToPcm(audio, this.inputFormat.encoding);
480
483
  const resampledPCM = resamplePcm(decodedAudio, this.inputFormat.sampleRate, 48e3);
481
- this.stt.input(int16ArrayToBuffer(resampledPCM));
484
+ const pcmBuffer = int16ArrayToBuffer(resampledPCM);
485
+ this.onNormalizedAudio?.(pcmBuffer);
486
+ this.stt.input(pcmBuffer);
482
487
  }
483
488
  inputText(text) {
484
489
  if (text === void 0 || text === null) {
@@ -694,6 +699,11 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
694
699
  } else {
695
700
  if (currentTurn) {
696
701
  currentTurn.confidence = currentTurnConfidence / currentTurnWordCount;
702
+ if (currentTurn.confidence < 0.5) {
703
+ currentTurn.text = "[inaudible]";
704
+ } else if (currentTurn.confidence < 0.75) {
705
+ currentTurn.text = `[unclear, confidence=${currentTurn.confidence.toFixed(2)}] ${currentTurn.text}`;
706
+ }
697
707
  turns.push(currentTurn);
698
708
  }
699
709
  currentTurn = { speaker, text: utterance, confidence: 0 };
@@ -712,6 +722,156 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
712
722
  }
713
723
  }
714
724
  };
725
+ var kReconnectBaseMs = 500;
726
+ var kReconnectMaxMs = 1e4;
727
+ var DeepgramFluxSTT = class extends MagmaFlowSpeechToText {
728
+ ws = null;
729
+ apiKey;
730
+ eotThreshold;
731
+ eagerEotThreshold;
732
+ eotTimeoutMs;
733
+ audioQueue = [];
734
+ connecting = false;
735
+ killed = false;
736
+ reconnectAttempts = 0;
737
+ reconnectTimer;
738
+ constructor(config) {
739
+ super();
740
+ this.apiKey = config?.apiKey ?? process.env.DEEPGRAM_API_KEY;
741
+ this.eotThreshold = config?.eotThreshold ?? 0.7;
742
+ this.eagerEotThreshold = config?.eagerEotThreshold;
743
+ this.eotTimeoutMs = config?.eotTimeoutMs ?? 5e3;
744
+ }
745
+ connect() {
746
+ if (this.connecting || this.killed) return;
747
+ this.connecting = true;
748
+ const params = new URLSearchParams({
749
+ model: "flux-general-en",
750
+ encoding: "linear16",
751
+ sample_rate: "48000",
752
+ eot_threshold: this.eotThreshold.toString(),
753
+ eot_timeout_ms: this.eotTimeoutMs.toString()
754
+ });
755
+ if (this.eagerEotThreshold !== void 0) {
756
+ params.set("eager_eot_threshold", this.eagerEotThreshold.toString());
757
+ }
758
+ const url = `wss://api.deepgram.com/v2/listen?${params.toString()}`;
759
+ this.ws = new WebSocket2__default.default(url, {
760
+ headers: { Authorization: `Token ${this.apiKey}` }
761
+ });
762
+ this.ws.on("open", () => {
763
+ console.log("[DeepgramFlux] Connected");
764
+ this.connecting = false;
765
+ this.reconnectAttempts = 0;
766
+ for (const queued of this.audioQueue) {
767
+ this.ws.send(queued);
768
+ }
769
+ this.audioQueue = [];
770
+ });
771
+ this.ws.on("message", (data) => {
772
+ try {
773
+ const msg = JSON.parse(data.toString());
774
+ this.handleMessage(msg);
775
+ } catch (err) {
776
+ console.error(`[DeepgramFlux] Parse error: ${err.message}`);
777
+ }
778
+ });
779
+ this.ws.on("error", (err) => {
780
+ console.error(`[DeepgramFlux] Error: ${err.message}`);
781
+ this.connecting = false;
782
+ });
783
+ this.ws.on("close", (code, reason) => {
784
+ console.log(`[DeepgramFlux] Closed: ${code} ${reason.toString()}`);
785
+ this.clearConnection();
786
+ if (!this.killed) {
787
+ this.scheduleReconnect();
788
+ }
789
+ });
790
+ }
791
+ scheduleReconnect() {
792
+ const delay = Math.min(
793
+ kReconnectBaseMs * Math.pow(2, this.reconnectAttempts),
794
+ kReconnectMaxMs
795
+ );
796
+ this.reconnectAttempts++;
797
+ console.log(
798
+ `[DeepgramFlux] Reconnecting in ${delay}ms (attempt ${this.reconnectAttempts})`
799
+ );
800
+ this.reconnectTimer = setTimeout(() => {
801
+ this.reconnectTimer = void 0;
802
+ if (!this.killed && !this.ws) {
803
+ this.connect();
804
+ }
805
+ }, delay);
806
+ }
807
+ clearConnection() {
808
+ this.ws = null;
809
+ this.connecting = false;
810
+ }
811
+ handleMessage(msg) {
812
+ switch (msg.type) {
813
+ case "TurnInfo":
814
+ this.handleTurnInfo(msg);
815
+ break;
816
+ case "Connected":
817
+ break;
818
+ case "Error":
819
+ console.error(`[DeepgramFlux] Server error: ${JSON.stringify(msg)}`);
820
+ break;
821
+ default:
822
+ console.log(`[DeepgramFlux] Unknown message: ${JSON.stringify(msg)}`);
823
+ break;
824
+ }
825
+ }
826
+ handleTurnInfo(msg) {
827
+ switch (msg.event) {
828
+ case "StartOfTurn":
829
+ this.onSpeechDetected();
830
+ break;
831
+ case "EndOfTurn":
832
+ this.onOutput({ text: msg.transcript });
833
+ break;
834
+ case "EagerEndOfTurn":
835
+ this.onEagerEndOfTurn(msg.transcript);
836
+ break;
837
+ case "TurnResumed":
838
+ this.onTurnResumed();
839
+ break;
840
+ }
841
+ }
842
+ input(audio) {
843
+ if (!this.ws && !this.connecting && !this.reconnectTimer) {
844
+ this.connect();
845
+ }
846
+ if (this.ws?.readyState === WebSocket2__default.default.OPEN) {
847
+ this.ws.send(audio);
848
+ } else {
849
+ this.audioQueue.push(audio);
850
+ }
851
+ }
852
+ flush() {
853
+ if (this.ws?.readyState === WebSocket2__default.default.OPEN) {
854
+ this.ws.send(JSON.stringify({ type: "Finalize" }));
855
+ }
856
+ }
857
+ kill() {
858
+ this.killed = true;
859
+ if (this.reconnectTimer) {
860
+ clearTimeout(this.reconnectTimer);
861
+ this.reconnectTimer = void 0;
862
+ }
863
+ if (this.ws?.readyState === WebSocket2__default.default.OPEN) {
864
+ this.ws.send(JSON.stringify({ type: "CloseStream" }));
865
+ this.ws.close();
866
+ }
867
+ this.clearConnection();
868
+ this.audioQueue = [];
869
+ }
870
+ onEagerEndOfTurn(transcript) {
871
+ }
872
+ onTurnResumed() {
873
+ }
874
+ };
715
875
  var NATIVE_WEBSOCKET_AVAILABLE = typeof WebSocket !== "undefined";
716
876
  var DummyWebSocket = class {
717
877
  url;
@@ -740,7 +900,7 @@ var QueueWebSocket = class {
740
900
  if (NATIVE_WEBSOCKET_AVAILABLE) {
741
901
  this.ws = new WebSocket(this.url);
742
902
  } else {
743
- this.ws = new ws.WebSocket(this.url);
903
+ this.ws = new WebSocket2.WebSocket(this.url);
744
904
  }
745
905
  } else {
746
906
  this.ws = new DummyWebSocket(null);
@@ -754,7 +914,7 @@ var QueueWebSocket = class {
754
914
  if (NATIVE_WEBSOCKET_AVAILABLE) {
755
915
  this.ws = new WebSocket(this.url);
756
916
  } else {
757
- this.ws = new ws.WebSocket(this.url);
917
+ this.ws = new WebSocket2.WebSocket(this.url);
758
918
  }
759
919
  } else {
760
920
  this.ws = new DummyWebSocket(null);
@@ -1079,7 +1239,6 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
1079
1239
  ).then(async (response) => {
1080
1240
  const reader = response.body?.getReader();
1081
1241
  if (!reader) return;
1082
- new TextDecoder();
1083
1242
  while (true) {
1084
1243
  const { done, value } = await reader.read();
1085
1244
  if (done) break;
@@ -1096,9 +1255,17 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
1096
1255
  };
1097
1256
  var HumeTTS = class extends MagmaFlowTextToSpeech {
1098
1257
  client;
1258
+ voice;
1259
+ description;
1260
+ speed;
1261
+ version;
1099
1262
  constructor(args) {
1100
1263
  super();
1101
1264
  this.client = args.client ?? new hume.HumeClient({ apiKey: process.env.HUME_API_KEY });
1265
+ this.voice = args.voice;
1266
+ this.description = args.description;
1267
+ this.speed = args.speed;
1268
+ this.version = args.version;
1102
1269
  }
1103
1270
  async setup() {
1104
1271
  }
@@ -1106,22 +1273,28 @@ var HumeTTS = class extends MagmaFlowTextToSpeech {
1106
1273
  if (!text) {
1107
1274
  return;
1108
1275
  }
1109
- this.client.tts.synthesizeJsonStreaming({
1110
- utterances: [
1111
- {
1112
- text
1113
- }
1114
- ],
1115
- format: {
1116
- type: "pcm"
1117
- },
1118
- instantMode: true
1119
- }).then(async (stream) => {
1276
+ const utterance = { text };
1277
+ if (this.voice) utterance.voice = this.voice;
1278
+ if (this.description) utterance.description = this.description;
1279
+ if (this.speed !== void 0) utterance.speed = this.speed;
1280
+ const params = {
1281
+ utterances: [utterance],
1282
+ format: { type: "pcm" },
1283
+ instantMode: true,
1284
+ stripHeaders: true
1285
+ };
1286
+ if (this.version) params.version = this.version;
1287
+ this.client.tts.synthesizeJsonStreaming(params).then(async (stream) => {
1120
1288
  for await (const chunk of stream) {
1121
- this.onOutput(Buffer.from(chunk.audio, "base64"), requestId);
1289
+ if (chunk.type === "audio") {
1290
+ this.onOutput(Buffer.from(chunk.audio, "base64"), requestId);
1291
+ }
1122
1292
  }
1123
1293
  this.onOutput(null, requestId);
1124
1294
  console.log("[Hume] Finished:", text);
1295
+ }).catch((err) => {
1296
+ console.error(`[Hume] Error: ${err.message}`);
1297
+ this.onOutput(null, requestId);
1125
1298
  });
1126
1299
  }
1127
1300
  kill() {
@@ -1164,6 +1337,7 @@ var WhisperTTS = class extends MagmaFlowTextToSpeech {
1164
1337
  }
1165
1338
  };
1166
1339
 
1340
+ exports.DeepgramFluxSTT = DeepgramFluxSTT;
1167
1341
  exports.DeepgramLanguage = DeepgramLanguage;
1168
1342
  exports.DeepgramModel = DeepgramModel;
1169
1343
  exports.DeepgramSTT = DeepgramSTT;
package/dist/voice.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  import { DeepgramClient, LiveTranscriptionEvents } from '@deepgram/sdk';
2
- import { WebSocket as WebSocket$1 } from 'ws';
2
+ import WebSocket2, { WebSocket as WebSocket$1 } from 'ws';
3
3
  import { HumeClient } from 'hume';
4
4
  import OpenAI from 'openai';
5
5
 
@@ -418,6 +418,7 @@ var MagmaFlow = class {
418
418
  inputFormat;
419
419
  outputFormat;
420
420
  onAudioOutput;
421
+ onNormalizedAudio;
421
422
  textBuffer = "";
422
423
  textQueue = [];
423
424
  generatingAudio = false;
@@ -434,6 +435,7 @@ var MagmaFlow = class {
434
435
  this.inputFormat = args.inputFormat;
435
436
  this.outputFormat = args.outputFormat;
436
437
  this.onAudioOutput = args.onAudioOutput;
438
+ this.onNormalizedAudio = args.onNormalizedAudio;
437
439
  this.config = { ...this.config, ...args.config };
438
440
  this.tts.onOutput = (audio, requestId) => {
439
441
  if (this.currentRequestId !== requestId) {
@@ -472,7 +474,9 @@ var MagmaFlow = class {
472
474
  inputAudio(audio) {
473
475
  const decodedAudio = decodeToPcm(audio, this.inputFormat.encoding);
474
476
  const resampledPCM = resamplePcm(decodedAudio, this.inputFormat.sampleRate, 48e3);
475
- this.stt.input(int16ArrayToBuffer(resampledPCM));
477
+ const pcmBuffer = int16ArrayToBuffer(resampledPCM);
478
+ this.onNormalizedAudio?.(pcmBuffer);
479
+ this.stt.input(pcmBuffer);
476
480
  }
477
481
  inputText(text) {
478
482
  if (text === void 0 || text === null) {
@@ -688,6 +692,11 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
688
692
  } else {
689
693
  if (currentTurn) {
690
694
  currentTurn.confidence = currentTurnConfidence / currentTurnWordCount;
695
+ if (currentTurn.confidence < 0.5) {
696
+ currentTurn.text = "[inaudible]";
697
+ } else if (currentTurn.confidence < 0.75) {
698
+ currentTurn.text = `[unclear, confidence=${currentTurn.confidence.toFixed(2)}] ${currentTurn.text}`;
699
+ }
691
700
  turns.push(currentTurn);
692
701
  }
693
702
  currentTurn = { speaker, text: utterance, confidence: 0 };
@@ -706,6 +715,156 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
706
715
  }
707
716
  }
708
717
  };
718
+ var kReconnectBaseMs = 500;
719
+ var kReconnectMaxMs = 1e4;
720
+ var DeepgramFluxSTT = class extends MagmaFlowSpeechToText {
721
+ ws = null;
722
+ apiKey;
723
+ eotThreshold;
724
+ eagerEotThreshold;
725
+ eotTimeoutMs;
726
+ audioQueue = [];
727
+ connecting = false;
728
+ killed = false;
729
+ reconnectAttempts = 0;
730
+ reconnectTimer;
731
+ constructor(config) {
732
+ super();
733
+ this.apiKey = config?.apiKey ?? process.env.DEEPGRAM_API_KEY;
734
+ this.eotThreshold = config?.eotThreshold ?? 0.7;
735
+ this.eagerEotThreshold = config?.eagerEotThreshold;
736
+ this.eotTimeoutMs = config?.eotTimeoutMs ?? 5e3;
737
+ }
738
+ connect() {
739
+ if (this.connecting || this.killed) return;
740
+ this.connecting = true;
741
+ const params = new URLSearchParams({
742
+ model: "flux-general-en",
743
+ encoding: "linear16",
744
+ sample_rate: "48000",
745
+ eot_threshold: this.eotThreshold.toString(),
746
+ eot_timeout_ms: this.eotTimeoutMs.toString()
747
+ });
748
+ if (this.eagerEotThreshold !== void 0) {
749
+ params.set("eager_eot_threshold", this.eagerEotThreshold.toString());
750
+ }
751
+ const url = `wss://api.deepgram.com/v2/listen?${params.toString()}`;
752
+ this.ws = new WebSocket2(url, {
753
+ headers: { Authorization: `Token ${this.apiKey}` }
754
+ });
755
+ this.ws.on("open", () => {
756
+ console.log("[DeepgramFlux] Connected");
757
+ this.connecting = false;
758
+ this.reconnectAttempts = 0;
759
+ for (const queued of this.audioQueue) {
760
+ this.ws.send(queued);
761
+ }
762
+ this.audioQueue = [];
763
+ });
764
+ this.ws.on("message", (data) => {
765
+ try {
766
+ const msg = JSON.parse(data.toString());
767
+ this.handleMessage(msg);
768
+ } catch (err) {
769
+ console.error(`[DeepgramFlux] Parse error: ${err.message}`);
770
+ }
771
+ });
772
+ this.ws.on("error", (err) => {
773
+ console.error(`[DeepgramFlux] Error: ${err.message}`);
774
+ this.connecting = false;
775
+ });
776
+ this.ws.on("close", (code, reason) => {
777
+ console.log(`[DeepgramFlux] Closed: ${code} ${reason.toString()}`);
778
+ this.clearConnection();
779
+ if (!this.killed) {
780
+ this.scheduleReconnect();
781
+ }
782
+ });
783
+ }
784
+ scheduleReconnect() {
785
+ const delay = Math.min(
786
+ kReconnectBaseMs * Math.pow(2, this.reconnectAttempts),
787
+ kReconnectMaxMs
788
+ );
789
+ this.reconnectAttempts++;
790
+ console.log(
791
+ `[DeepgramFlux] Reconnecting in ${delay}ms (attempt ${this.reconnectAttempts})`
792
+ );
793
+ this.reconnectTimer = setTimeout(() => {
794
+ this.reconnectTimer = void 0;
795
+ if (!this.killed && !this.ws) {
796
+ this.connect();
797
+ }
798
+ }, delay);
799
+ }
800
+ clearConnection() {
801
+ this.ws = null;
802
+ this.connecting = false;
803
+ }
804
+ handleMessage(msg) {
805
+ switch (msg.type) {
806
+ case "TurnInfo":
807
+ this.handleTurnInfo(msg);
808
+ break;
809
+ case "Connected":
810
+ break;
811
+ case "Error":
812
+ console.error(`[DeepgramFlux] Server error: ${JSON.stringify(msg)}`);
813
+ break;
814
+ default:
815
+ console.log(`[DeepgramFlux] Unknown message: ${JSON.stringify(msg)}`);
816
+ break;
817
+ }
818
+ }
819
+ handleTurnInfo(msg) {
820
+ switch (msg.event) {
821
+ case "StartOfTurn":
822
+ this.onSpeechDetected();
823
+ break;
824
+ case "EndOfTurn":
825
+ this.onOutput({ text: msg.transcript });
826
+ break;
827
+ case "EagerEndOfTurn":
828
+ this.onEagerEndOfTurn(msg.transcript);
829
+ break;
830
+ case "TurnResumed":
831
+ this.onTurnResumed();
832
+ break;
833
+ }
834
+ }
835
+ input(audio) {
836
+ if (!this.ws && !this.connecting && !this.reconnectTimer) {
837
+ this.connect();
838
+ }
839
+ if (this.ws?.readyState === WebSocket2.OPEN) {
840
+ this.ws.send(audio);
841
+ } else {
842
+ this.audioQueue.push(audio);
843
+ }
844
+ }
845
+ flush() {
846
+ if (this.ws?.readyState === WebSocket2.OPEN) {
847
+ this.ws.send(JSON.stringify({ type: "Finalize" }));
848
+ }
849
+ }
850
+ kill() {
851
+ this.killed = true;
852
+ if (this.reconnectTimer) {
853
+ clearTimeout(this.reconnectTimer);
854
+ this.reconnectTimer = void 0;
855
+ }
856
+ if (this.ws?.readyState === WebSocket2.OPEN) {
857
+ this.ws.send(JSON.stringify({ type: "CloseStream" }));
858
+ this.ws.close();
859
+ }
860
+ this.clearConnection();
861
+ this.audioQueue = [];
862
+ }
863
+ onEagerEndOfTurn(transcript) {
864
+ }
865
+ onTurnResumed() {
866
+ }
867
+ };
709
868
  var NATIVE_WEBSOCKET_AVAILABLE = typeof WebSocket !== "undefined";
710
869
  var DummyWebSocket = class {
711
870
  url;
@@ -1073,7 +1232,6 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
1073
1232
  ).then(async (response) => {
1074
1233
  const reader = response.body?.getReader();
1075
1234
  if (!reader) return;
1076
- new TextDecoder();
1077
1235
  while (true) {
1078
1236
  const { done, value } = await reader.read();
1079
1237
  if (done) break;
@@ -1090,9 +1248,17 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
1090
1248
  };
1091
1249
  var HumeTTS = class extends MagmaFlowTextToSpeech {
1092
1250
  client;
1251
+ voice;
1252
+ description;
1253
+ speed;
1254
+ version;
1093
1255
  constructor(args) {
1094
1256
  super();
1095
1257
  this.client = args.client ?? new HumeClient({ apiKey: process.env.HUME_API_KEY });
1258
+ this.voice = args.voice;
1259
+ this.description = args.description;
1260
+ this.speed = args.speed;
1261
+ this.version = args.version;
1096
1262
  }
1097
1263
  async setup() {
1098
1264
  }
@@ -1100,22 +1266,28 @@ var HumeTTS = class extends MagmaFlowTextToSpeech {
1100
1266
  if (!text) {
1101
1267
  return;
1102
1268
  }
1103
- this.client.tts.synthesizeJsonStreaming({
1104
- utterances: [
1105
- {
1106
- text
1107
- }
1108
- ],
1109
- format: {
1110
- type: "pcm"
1111
- },
1112
- instantMode: true
1113
- }).then(async (stream) => {
1269
+ const utterance = { text };
1270
+ if (this.voice) utterance.voice = this.voice;
1271
+ if (this.description) utterance.description = this.description;
1272
+ if (this.speed !== void 0) utterance.speed = this.speed;
1273
+ const params = {
1274
+ utterances: [utterance],
1275
+ format: { type: "pcm" },
1276
+ instantMode: true,
1277
+ stripHeaders: true
1278
+ };
1279
+ if (this.version) params.version = this.version;
1280
+ this.client.tts.synthesizeJsonStreaming(params).then(async (stream) => {
1114
1281
  for await (const chunk of stream) {
1115
- this.onOutput(Buffer.from(chunk.audio, "base64"), requestId);
1282
+ if (chunk.type === "audio") {
1283
+ this.onOutput(Buffer.from(chunk.audio, "base64"), requestId);
1284
+ }
1116
1285
  }
1117
1286
  this.onOutput(null, requestId);
1118
1287
  console.log("[Hume] Finished:", text);
1288
+ }).catch((err) => {
1289
+ console.error(`[Hume] Error: ${err.message}`);
1290
+ this.onOutput(null, requestId);
1119
1291
  });
1120
1292
  }
1121
1293
  kill() {
@@ -1158,4 +1330,4 @@ var WhisperTTS = class extends MagmaFlowTextToSpeech {
1158
1330
  }
1159
1331
  };
1160
1332
 
1161
- export { DeepgramLanguage, DeepgramModel, DeepgramSTT, DeepgramTTS, ElevenLabsTTS, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, HumeTTS, MagmaFlow, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, WhisperTTS, splitTextIntoChunks };
1333
+ export { DeepgramFluxSTT, DeepgramLanguage, DeepgramModel, DeepgramSTT, DeepgramTTS, ElevenLabsTTS, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, HumeTTS, MagmaFlow, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, WhisperTTS, splitTextIntoChunks };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pompeii-labs/audio",
3
- "version": "0.2.2",
3
+ "version": "0.3.1",
4
4
  "description": "The Audio SDK from Pompeii Labs",
5
5
  "keywords": [
6
6
  "Pompeii",
@@ -42,7 +42,7 @@
42
42
  },
43
43
  "dependencies": {
44
44
  "@deepgram/sdk": "4.2.0",
45
- "hume": "0.11.1",
45
+ "hume": "0.15.13",
46
46
  "openai": "4.86.2"
47
47
  },
48
48
  "devDependencies": {