@pompeii-labs/audio 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/voice.d.mts CHANGED
@@ -43,6 +43,7 @@ type MagmaFlowArgs = {
43
43
  onSpeechDetected: () => void;
44
44
  onTranscription: (transcription: MagmaFlowSTTOutput) => void;
45
45
  onAudioOutput: (audio: Buffer) => void;
46
+ onNormalizedAudio?: (audio: Buffer) => void;
46
47
  config?: MagmaFlowConfig;
47
48
  };
48
49
  declare class MagmaFlow {
@@ -51,6 +52,7 @@ declare class MagmaFlow {
51
52
  private inputFormat;
52
53
  private outputFormat;
53
54
  private onAudioOutput;
55
+ private onNormalizedAudio?;
54
56
  private textBuffer;
55
57
  private textQueue;
56
58
  private generatingAudio;
@@ -107,6 +109,36 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
107
109
  private computeTurns;
108
110
  }
109
111
 
112
+ type DeepgramFluxConfig = {
113
+ apiKey?: string;
114
+ eotThreshold?: number;
115
+ eagerEotThreshold?: number;
116
+ eotTimeoutMs?: number;
117
+ };
118
+ declare class DeepgramFluxSTT extends MagmaFlowSpeechToText {
119
+ private ws;
120
+ private apiKey;
121
+ private eotThreshold;
122
+ private eagerEotThreshold?;
123
+ private eotTimeoutMs;
124
+ private audioQueue;
125
+ private connecting;
126
+ private killed;
127
+ private reconnectAttempts;
128
+ private reconnectTimer?;
129
+ constructor(config?: DeepgramFluxConfig);
130
+ private connect;
131
+ private scheduleReconnect;
132
+ private clearConnection;
133
+ private handleMessage;
134
+ private handleTurnInfo;
135
+ input(audio: Buffer): void;
136
+ flush(): void;
137
+ kill(): void;
138
+ onEagerEndOfTurn(transcript: string): void;
139
+ onTurnResumed(): void;
140
+ }
141
+
110
142
  declare enum GladiaModel {
111
143
  SOLARIA_1 = "solaria-1"
112
144
  }
@@ -278,9 +310,21 @@ declare class ElevenLabsTTS extends MagmaFlowTextToSpeech {
278
310
 
279
311
  type HumeTTSArgs = {
280
312
  client?: HumeClient;
313
+ voice?: {
314
+ name?: string;
315
+ id?: string;
316
+ provider?: string;
317
+ };
318
+ description?: string;
319
+ speed?: number;
320
+ version?: string;
281
321
  };
282
322
  declare class HumeTTS extends MagmaFlowTextToSpeech {
283
323
  private client;
324
+ private voice?;
325
+ private description?;
326
+ private speed?;
327
+ private version?;
284
328
  constructor(args: HumeTTSArgs);
285
329
  setup(): Promise<void>;
286
330
  input(text: string | null, requestId: string): void;
@@ -302,4 +346,4 @@ declare class WhisperTTS extends MagmaFlowTextToSpeech {
302
346
 
303
347
  declare function splitTextIntoChunks(text: string, targetLength?: number): string[];
304
348
 
305
- export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
349
+ export { type DeepgramFluxConfig, DeepgramFluxSTT, DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
package/dist/voice.d.ts CHANGED
@@ -43,6 +43,7 @@ type MagmaFlowArgs = {
43
43
  onSpeechDetected: () => void;
44
44
  onTranscription: (transcription: MagmaFlowSTTOutput) => void;
45
45
  onAudioOutput: (audio: Buffer) => void;
46
+ onNormalizedAudio?: (audio: Buffer) => void;
46
47
  config?: MagmaFlowConfig;
47
48
  };
48
49
  declare class MagmaFlow {
@@ -51,6 +52,7 @@ declare class MagmaFlow {
51
52
  private inputFormat;
52
53
  private outputFormat;
53
54
  private onAudioOutput;
55
+ private onNormalizedAudio?;
54
56
  private textBuffer;
55
57
  private textQueue;
56
58
  private generatingAudio;
@@ -107,6 +109,36 @@ declare class DeepgramSTT extends MagmaFlowSpeechToText {
107
109
  private computeTurns;
108
110
  }
109
111
 
112
+ type DeepgramFluxConfig = {
113
+ apiKey?: string;
114
+ eotThreshold?: number;
115
+ eagerEotThreshold?: number;
116
+ eotTimeoutMs?: number;
117
+ };
118
+ declare class DeepgramFluxSTT extends MagmaFlowSpeechToText {
119
+ private ws;
120
+ private apiKey;
121
+ private eotThreshold;
122
+ private eagerEotThreshold?;
123
+ private eotTimeoutMs;
124
+ private audioQueue;
125
+ private connecting;
126
+ private killed;
127
+ private reconnectAttempts;
128
+ private reconnectTimer?;
129
+ constructor(config?: DeepgramFluxConfig);
130
+ private connect;
131
+ private scheduleReconnect;
132
+ private clearConnection;
133
+ private handleMessage;
134
+ private handleTurnInfo;
135
+ input(audio: Buffer): void;
136
+ flush(): void;
137
+ kill(): void;
138
+ onEagerEndOfTurn(transcript: string): void;
139
+ onTurnResumed(): void;
140
+ }
141
+
110
142
  declare enum GladiaModel {
111
143
  SOLARIA_1 = "solaria-1"
112
144
  }
@@ -278,9 +310,21 @@ declare class ElevenLabsTTS extends MagmaFlowTextToSpeech {
278
310
 
279
311
  type HumeTTSArgs = {
280
312
  client?: HumeClient;
313
+ voice?: {
314
+ name?: string;
315
+ id?: string;
316
+ provider?: string;
317
+ };
318
+ description?: string;
319
+ speed?: number;
320
+ version?: string;
281
321
  };
282
322
  declare class HumeTTS extends MagmaFlowTextToSpeech {
283
323
  private client;
324
+ private voice?;
325
+ private description?;
326
+ private speed?;
327
+ private version?;
284
328
  constructor(args: HumeTTSArgs);
285
329
  setup(): Promise<void>;
286
330
  input(text: string | null, requestId: string): void;
@@ -302,4 +346,4 @@ declare class WhisperTTS extends MagmaFlowTextToSpeech {
302
346
 
303
347
  declare function splitTextIntoChunks(text: string, targetLength?: number): string[];
304
348
 
305
- export { DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
349
+ export { type DeepgramFluxConfig, DeepgramFluxSTT, DeepgramLanguage, DeepgramModel, DeepgramSTT, type DeepgramSTTArgs, DeepgramTTS, type DeepgramTTSArgs, type DeepgramWord, ElevenLabsTTS, type ElevenLabsTTSArgs, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, type GladiaSTTArgs, HumeTTS, type HumeTTSArgs, MagmaFlow, type MagmaFlowArgs, type MagmaFlowSTTOutput, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, type Turn, WhisperTTS, type WhisperTTSArgs, splitTextIntoChunks };
package/dist/voice.js CHANGED
@@ -1,12 +1,13 @@
1
1
  'use strict';
2
2
 
3
3
  var sdk = require('@deepgram/sdk');
4
- var ws = require('ws');
4
+ var WebSocket2 = require('ws');
5
5
  var hume = require('hume');
6
6
  var OpenAI = require('openai');
7
7
 
8
8
  function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
9
9
 
10
+ var WebSocket2__default = /*#__PURE__*/_interopDefault(WebSocket2);
10
11
  var OpenAI__default = /*#__PURE__*/_interopDefault(OpenAI);
11
12
 
12
13
  // src/helpers/bufferToInt16Array.ts
@@ -424,6 +425,7 @@ var MagmaFlow = class {
424
425
  inputFormat;
425
426
  outputFormat;
426
427
  onAudioOutput;
428
+ onNormalizedAudio;
427
429
  textBuffer = "";
428
430
  textQueue = [];
429
431
  generatingAudio = false;
@@ -440,6 +442,7 @@ var MagmaFlow = class {
440
442
  this.inputFormat = args.inputFormat;
441
443
  this.outputFormat = args.outputFormat;
442
444
  this.onAudioOutput = args.onAudioOutput;
445
+ this.onNormalizedAudio = args.onNormalizedAudio;
443
446
  this.config = { ...this.config, ...args.config };
444
447
  this.tts.onOutput = (audio, requestId) => {
445
448
  if (this.currentRequestId !== requestId) {
@@ -478,7 +481,9 @@ var MagmaFlow = class {
478
481
  inputAudio(audio) {
479
482
  const decodedAudio = decodeToPcm(audio, this.inputFormat.encoding);
480
483
  const resampledPCM = resamplePcm(decodedAudio, this.inputFormat.sampleRate, 48e3);
481
- this.stt.input(int16ArrayToBuffer(resampledPCM));
484
+ const pcmBuffer = int16ArrayToBuffer(resampledPCM);
485
+ this.onNormalizedAudio?.(pcmBuffer);
486
+ this.stt.input(pcmBuffer);
482
487
  }
483
488
  inputText(text) {
484
489
  if (text === void 0 || text === null) {
@@ -717,6 +722,156 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
717
722
  }
718
723
  }
719
724
  };
725
+ var kReconnectBaseMs = 500;
726
+ var kReconnectMaxMs = 1e4;
727
+ var DeepgramFluxSTT = class extends MagmaFlowSpeechToText {
728
+ ws = null;
729
+ apiKey;
730
+ eotThreshold;
731
+ eagerEotThreshold;
732
+ eotTimeoutMs;
733
+ audioQueue = [];
734
+ connecting = false;
735
+ killed = false;
736
+ reconnectAttempts = 0;
737
+ reconnectTimer;
738
+ constructor(config) {
739
+ super();
740
+ this.apiKey = config?.apiKey ?? process.env.DEEPGRAM_API_KEY;
741
+ this.eotThreshold = config?.eotThreshold ?? 0.7;
742
+ this.eagerEotThreshold = config?.eagerEotThreshold;
743
+ this.eotTimeoutMs = config?.eotTimeoutMs ?? 5e3;
744
+ }
745
+ connect() {
746
+ if (this.connecting || this.killed) return;
747
+ this.connecting = true;
748
+ const params = new URLSearchParams({
749
+ model: "flux-general-en",
750
+ encoding: "linear16",
751
+ sample_rate: "48000",
752
+ eot_threshold: this.eotThreshold.toString(),
753
+ eot_timeout_ms: this.eotTimeoutMs.toString()
754
+ });
755
+ if (this.eagerEotThreshold !== void 0) {
756
+ params.set("eager_eot_threshold", this.eagerEotThreshold.toString());
757
+ }
758
+ const url = `wss://api.deepgram.com/v2/listen?${params.toString()}`;
759
+ this.ws = new WebSocket2__default.default(url, {
760
+ headers: { Authorization: `Token ${this.apiKey}` }
761
+ });
762
+ this.ws.on("open", () => {
763
+ console.log("[DeepgramFlux] Connected");
764
+ this.connecting = false;
765
+ this.reconnectAttempts = 0;
766
+ for (const queued of this.audioQueue) {
767
+ this.ws.send(queued);
768
+ }
769
+ this.audioQueue = [];
770
+ });
771
+ this.ws.on("message", (data) => {
772
+ try {
773
+ const msg = JSON.parse(data.toString());
774
+ this.handleMessage(msg);
775
+ } catch (err) {
776
+ console.error(`[DeepgramFlux] Parse error: ${err.message}`);
777
+ }
778
+ });
779
+ this.ws.on("error", (err) => {
780
+ console.error(`[DeepgramFlux] Error: ${err.message}`);
781
+ this.connecting = false;
782
+ });
783
+ this.ws.on("close", (code, reason) => {
784
+ console.log(`[DeepgramFlux] Closed: ${code} ${reason.toString()}`);
785
+ this.clearConnection();
786
+ if (!this.killed) {
787
+ this.scheduleReconnect();
788
+ }
789
+ });
790
+ }
791
+ scheduleReconnect() {
792
+ const delay = Math.min(
793
+ kReconnectBaseMs * Math.pow(2, this.reconnectAttempts),
794
+ kReconnectMaxMs
795
+ );
796
+ this.reconnectAttempts++;
797
+ console.log(
798
+ `[DeepgramFlux] Reconnecting in ${delay}ms (attempt ${this.reconnectAttempts})`
799
+ );
800
+ this.reconnectTimer = setTimeout(() => {
801
+ this.reconnectTimer = void 0;
802
+ if (!this.killed && !this.ws) {
803
+ this.connect();
804
+ }
805
+ }, delay);
806
+ }
807
+ clearConnection() {
808
+ this.ws = null;
809
+ this.connecting = false;
810
+ }
811
+ handleMessage(msg) {
812
+ switch (msg.type) {
813
+ case "TurnInfo":
814
+ this.handleTurnInfo(msg);
815
+ break;
816
+ case "Connected":
817
+ break;
818
+ case "Error":
819
+ console.error(`[DeepgramFlux] Server error: ${JSON.stringify(msg)}`);
820
+ break;
821
+ default:
822
+ console.log(`[DeepgramFlux] Unknown message: ${JSON.stringify(msg)}`);
823
+ break;
824
+ }
825
+ }
826
+ handleTurnInfo(msg) {
827
+ switch (msg.event) {
828
+ case "StartOfTurn":
829
+ this.onSpeechDetected();
830
+ break;
831
+ case "EndOfTurn":
832
+ this.onOutput({ text: msg.transcript });
833
+ break;
834
+ case "EagerEndOfTurn":
835
+ this.onEagerEndOfTurn(msg.transcript);
836
+ break;
837
+ case "TurnResumed":
838
+ this.onTurnResumed();
839
+ break;
840
+ }
841
+ }
842
+ input(audio) {
843
+ if (!this.ws && !this.connecting && !this.reconnectTimer) {
844
+ this.connect();
845
+ }
846
+ if (this.ws?.readyState === WebSocket2__default.default.OPEN) {
847
+ this.ws.send(audio);
848
+ } else {
849
+ this.audioQueue.push(audio);
850
+ }
851
+ }
852
+ flush() {
853
+ if (this.ws?.readyState === WebSocket2__default.default.OPEN) {
854
+ this.ws.send(JSON.stringify({ type: "Finalize" }));
855
+ }
856
+ }
857
+ kill() {
858
+ this.killed = true;
859
+ if (this.reconnectTimer) {
860
+ clearTimeout(this.reconnectTimer);
861
+ this.reconnectTimer = void 0;
862
+ }
863
+ if (this.ws?.readyState === WebSocket2__default.default.OPEN) {
864
+ this.ws.send(JSON.stringify({ type: "CloseStream" }));
865
+ this.ws.close();
866
+ }
867
+ this.clearConnection();
868
+ this.audioQueue = [];
869
+ }
870
+ onEagerEndOfTurn(transcript) {
871
+ }
872
+ onTurnResumed() {
873
+ }
874
+ };
720
875
  var NATIVE_WEBSOCKET_AVAILABLE = typeof WebSocket !== "undefined";
721
876
  var DummyWebSocket = class {
722
877
  url;
@@ -745,7 +900,7 @@ var QueueWebSocket = class {
745
900
  if (NATIVE_WEBSOCKET_AVAILABLE) {
746
901
  this.ws = new WebSocket(this.url);
747
902
  } else {
748
- this.ws = new ws.WebSocket(this.url);
903
+ this.ws = new WebSocket2.WebSocket(this.url);
749
904
  }
750
905
  } else {
751
906
  this.ws = new DummyWebSocket(null);
@@ -759,7 +914,7 @@ var QueueWebSocket = class {
759
914
  if (NATIVE_WEBSOCKET_AVAILABLE) {
760
915
  this.ws = new WebSocket(this.url);
761
916
  } else {
762
- this.ws = new ws.WebSocket(this.url);
917
+ this.ws = new WebSocket2.WebSocket(this.url);
763
918
  }
764
919
  } else {
765
920
  this.ws = new DummyWebSocket(null);
@@ -1084,7 +1239,6 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
1084
1239
  ).then(async (response) => {
1085
1240
  const reader = response.body?.getReader();
1086
1241
  if (!reader) return;
1087
- new TextDecoder();
1088
1242
  while (true) {
1089
1243
  const { done, value } = await reader.read();
1090
1244
  if (done) break;
@@ -1101,9 +1255,17 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
1101
1255
  };
1102
1256
  var HumeTTS = class extends MagmaFlowTextToSpeech {
1103
1257
  client;
1258
+ voice;
1259
+ description;
1260
+ speed;
1261
+ version;
1104
1262
  constructor(args) {
1105
1263
  super();
1106
1264
  this.client = args.client ?? new hume.HumeClient({ apiKey: process.env.HUME_API_KEY });
1265
+ this.voice = args.voice;
1266
+ this.description = args.description;
1267
+ this.speed = args.speed;
1268
+ this.version = args.version;
1107
1269
  }
1108
1270
  async setup() {
1109
1271
  }
@@ -1111,22 +1273,28 @@ var HumeTTS = class extends MagmaFlowTextToSpeech {
1111
1273
  if (!text) {
1112
1274
  return;
1113
1275
  }
1114
- this.client.tts.synthesizeJsonStreaming({
1115
- utterances: [
1116
- {
1117
- text
1118
- }
1119
- ],
1120
- format: {
1121
- type: "pcm"
1122
- },
1123
- instantMode: true
1124
- }).then(async (stream) => {
1276
+ const utterance = { text };
1277
+ if (this.voice) utterance.voice = this.voice;
1278
+ if (this.description) utterance.description = this.description;
1279
+ if (this.speed !== void 0) utterance.speed = this.speed;
1280
+ const params = {
1281
+ utterances: [utterance],
1282
+ format: { type: "pcm" },
1283
+ instantMode: true,
1284
+ stripHeaders: true
1285
+ };
1286
+ if (this.version) params.version = this.version;
1287
+ this.client.tts.synthesizeJsonStreaming(params).then(async (stream) => {
1125
1288
  for await (const chunk of stream) {
1126
- this.onOutput(Buffer.from(chunk.audio, "base64"), requestId);
1289
+ if (chunk.type === "audio") {
1290
+ this.onOutput(Buffer.from(chunk.audio, "base64"), requestId);
1291
+ }
1127
1292
  }
1128
1293
  this.onOutput(null, requestId);
1129
1294
  console.log("[Hume] Finished:", text);
1295
+ }).catch((err) => {
1296
+ console.error(`[Hume] Error: ${err.message}`);
1297
+ this.onOutput(null, requestId);
1130
1298
  });
1131
1299
  }
1132
1300
  kill() {
@@ -1169,6 +1337,7 @@ var WhisperTTS = class extends MagmaFlowTextToSpeech {
1169
1337
  }
1170
1338
  };
1171
1339
 
1340
+ exports.DeepgramFluxSTT = DeepgramFluxSTT;
1172
1341
  exports.DeepgramLanguage = DeepgramLanguage;
1173
1342
  exports.DeepgramModel = DeepgramModel;
1174
1343
  exports.DeepgramSTT = DeepgramSTT;
package/dist/voice.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  import { DeepgramClient, LiveTranscriptionEvents } from '@deepgram/sdk';
2
- import { WebSocket as WebSocket$1 } from 'ws';
2
+ import WebSocket2, { WebSocket as WebSocket$1 } from 'ws';
3
3
  import { HumeClient } from 'hume';
4
4
  import OpenAI from 'openai';
5
5
 
@@ -418,6 +418,7 @@ var MagmaFlow = class {
418
418
  inputFormat;
419
419
  outputFormat;
420
420
  onAudioOutput;
421
+ onNormalizedAudio;
421
422
  textBuffer = "";
422
423
  textQueue = [];
423
424
  generatingAudio = false;
@@ -434,6 +435,7 @@ var MagmaFlow = class {
434
435
  this.inputFormat = args.inputFormat;
435
436
  this.outputFormat = args.outputFormat;
436
437
  this.onAudioOutput = args.onAudioOutput;
438
+ this.onNormalizedAudio = args.onNormalizedAudio;
437
439
  this.config = { ...this.config, ...args.config };
438
440
  this.tts.onOutput = (audio, requestId) => {
439
441
  if (this.currentRequestId !== requestId) {
@@ -472,7 +474,9 @@ var MagmaFlow = class {
472
474
  inputAudio(audio) {
473
475
  const decodedAudio = decodeToPcm(audio, this.inputFormat.encoding);
474
476
  const resampledPCM = resamplePcm(decodedAudio, this.inputFormat.sampleRate, 48e3);
475
- this.stt.input(int16ArrayToBuffer(resampledPCM));
477
+ const pcmBuffer = int16ArrayToBuffer(resampledPCM);
478
+ this.onNormalizedAudio?.(pcmBuffer);
479
+ this.stt.input(pcmBuffer);
476
480
  }
477
481
  inputText(text) {
478
482
  if (text === void 0 || text === null) {
@@ -711,6 +715,156 @@ var DeepgramSTT = class extends MagmaFlowSpeechToText {
711
715
  }
712
716
  }
713
717
  };
718
+ var kReconnectBaseMs = 500;
719
+ var kReconnectMaxMs = 1e4;
720
+ var DeepgramFluxSTT = class extends MagmaFlowSpeechToText {
721
+ ws = null;
722
+ apiKey;
723
+ eotThreshold;
724
+ eagerEotThreshold;
725
+ eotTimeoutMs;
726
+ audioQueue = [];
727
+ connecting = false;
728
+ killed = false;
729
+ reconnectAttempts = 0;
730
+ reconnectTimer;
731
+ constructor(config) {
732
+ super();
733
+ this.apiKey = config?.apiKey ?? process.env.DEEPGRAM_API_KEY;
734
+ this.eotThreshold = config?.eotThreshold ?? 0.7;
735
+ this.eagerEotThreshold = config?.eagerEotThreshold;
736
+ this.eotTimeoutMs = config?.eotTimeoutMs ?? 5e3;
737
+ }
738
+ connect() {
739
+ if (this.connecting || this.killed) return;
740
+ this.connecting = true;
741
+ const params = new URLSearchParams({
742
+ model: "flux-general-en",
743
+ encoding: "linear16",
744
+ sample_rate: "48000",
745
+ eot_threshold: this.eotThreshold.toString(),
746
+ eot_timeout_ms: this.eotTimeoutMs.toString()
747
+ });
748
+ if (this.eagerEotThreshold !== void 0) {
749
+ params.set("eager_eot_threshold", this.eagerEotThreshold.toString());
750
+ }
751
+ const url = `wss://api.deepgram.com/v2/listen?${params.toString()}`;
752
+ this.ws = new WebSocket2(url, {
753
+ headers: { Authorization: `Token ${this.apiKey}` }
754
+ });
755
+ this.ws.on("open", () => {
756
+ console.log("[DeepgramFlux] Connected");
757
+ this.connecting = false;
758
+ this.reconnectAttempts = 0;
759
+ for (const queued of this.audioQueue) {
760
+ this.ws.send(queued);
761
+ }
762
+ this.audioQueue = [];
763
+ });
764
+ this.ws.on("message", (data) => {
765
+ try {
766
+ const msg = JSON.parse(data.toString());
767
+ this.handleMessage(msg);
768
+ } catch (err) {
769
+ console.error(`[DeepgramFlux] Parse error: ${err.message}`);
770
+ }
771
+ });
772
+ this.ws.on("error", (err) => {
773
+ console.error(`[DeepgramFlux] Error: ${err.message}`);
774
+ this.connecting = false;
775
+ });
776
+ this.ws.on("close", (code, reason) => {
777
+ console.log(`[DeepgramFlux] Closed: ${code} ${reason.toString()}`);
778
+ this.clearConnection();
779
+ if (!this.killed) {
780
+ this.scheduleReconnect();
781
+ }
782
+ });
783
+ }
784
+ scheduleReconnect() {
785
+ const delay = Math.min(
786
+ kReconnectBaseMs * Math.pow(2, this.reconnectAttempts),
787
+ kReconnectMaxMs
788
+ );
789
+ this.reconnectAttempts++;
790
+ console.log(
791
+ `[DeepgramFlux] Reconnecting in ${delay}ms (attempt ${this.reconnectAttempts})`
792
+ );
793
+ this.reconnectTimer = setTimeout(() => {
794
+ this.reconnectTimer = void 0;
795
+ if (!this.killed && !this.ws) {
796
+ this.connect();
797
+ }
798
+ }, delay);
799
+ }
800
+ clearConnection() {
801
+ this.ws = null;
802
+ this.connecting = false;
803
+ }
804
+ handleMessage(msg) {
805
+ switch (msg.type) {
806
+ case "TurnInfo":
807
+ this.handleTurnInfo(msg);
808
+ break;
809
+ case "Connected":
810
+ break;
811
+ case "Error":
812
+ console.error(`[DeepgramFlux] Server error: ${JSON.stringify(msg)}`);
813
+ break;
814
+ default:
815
+ console.log(`[DeepgramFlux] Unknown message: ${JSON.stringify(msg)}`);
816
+ break;
817
+ }
818
+ }
819
+ handleTurnInfo(msg) {
820
+ switch (msg.event) {
821
+ case "StartOfTurn":
822
+ this.onSpeechDetected();
823
+ break;
824
+ case "EndOfTurn":
825
+ this.onOutput({ text: msg.transcript });
826
+ break;
827
+ case "EagerEndOfTurn":
828
+ this.onEagerEndOfTurn(msg.transcript);
829
+ break;
830
+ case "TurnResumed":
831
+ this.onTurnResumed();
832
+ break;
833
+ }
834
+ }
835
+ input(audio) {
836
+ if (!this.ws && !this.connecting && !this.reconnectTimer) {
837
+ this.connect();
838
+ }
839
+ if (this.ws?.readyState === WebSocket2.OPEN) {
840
+ this.ws.send(audio);
841
+ } else {
842
+ this.audioQueue.push(audio);
843
+ }
844
+ }
845
+ flush() {
846
+ if (this.ws?.readyState === WebSocket2.OPEN) {
847
+ this.ws.send(JSON.stringify({ type: "Finalize" }));
848
+ }
849
+ }
850
+ kill() {
851
+ this.killed = true;
852
+ if (this.reconnectTimer) {
853
+ clearTimeout(this.reconnectTimer);
854
+ this.reconnectTimer = void 0;
855
+ }
856
+ if (this.ws?.readyState === WebSocket2.OPEN) {
857
+ this.ws.send(JSON.stringify({ type: "CloseStream" }));
858
+ this.ws.close();
859
+ }
860
+ this.clearConnection();
861
+ this.audioQueue = [];
862
+ }
863
+ onEagerEndOfTurn(transcript) {
864
+ }
865
+ onTurnResumed() {
866
+ }
867
+ };
714
868
  var NATIVE_WEBSOCKET_AVAILABLE = typeof WebSocket !== "undefined";
715
869
  var DummyWebSocket = class {
716
870
  url;
@@ -1078,7 +1232,6 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
1078
1232
  ).then(async (response) => {
1079
1233
  const reader = response.body?.getReader();
1080
1234
  if (!reader) return;
1081
- new TextDecoder();
1082
1235
  while (true) {
1083
1236
  const { done, value } = await reader.read();
1084
1237
  if (done) break;
@@ -1095,9 +1248,17 @@ var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
1095
1248
  };
1096
1249
  var HumeTTS = class extends MagmaFlowTextToSpeech {
1097
1250
  client;
1251
+ voice;
1252
+ description;
1253
+ speed;
1254
+ version;
1098
1255
  constructor(args) {
1099
1256
  super();
1100
1257
  this.client = args.client ?? new HumeClient({ apiKey: process.env.HUME_API_KEY });
1258
+ this.voice = args.voice;
1259
+ this.description = args.description;
1260
+ this.speed = args.speed;
1261
+ this.version = args.version;
1101
1262
  }
1102
1263
  async setup() {
1103
1264
  }
@@ -1105,22 +1266,28 @@ var HumeTTS = class extends MagmaFlowTextToSpeech {
1105
1266
  if (!text) {
1106
1267
  return;
1107
1268
  }
1108
- this.client.tts.synthesizeJsonStreaming({
1109
- utterances: [
1110
- {
1111
- text
1112
- }
1113
- ],
1114
- format: {
1115
- type: "pcm"
1116
- },
1117
- instantMode: true
1118
- }).then(async (stream) => {
1269
+ const utterance = { text };
1270
+ if (this.voice) utterance.voice = this.voice;
1271
+ if (this.description) utterance.description = this.description;
1272
+ if (this.speed !== void 0) utterance.speed = this.speed;
1273
+ const params = {
1274
+ utterances: [utterance],
1275
+ format: { type: "pcm" },
1276
+ instantMode: true,
1277
+ stripHeaders: true
1278
+ };
1279
+ if (this.version) params.version = this.version;
1280
+ this.client.tts.synthesizeJsonStreaming(params).then(async (stream) => {
1119
1281
  for await (const chunk of stream) {
1120
- this.onOutput(Buffer.from(chunk.audio, "base64"), requestId);
1282
+ if (chunk.type === "audio") {
1283
+ this.onOutput(Buffer.from(chunk.audio, "base64"), requestId);
1284
+ }
1121
1285
  }
1122
1286
  this.onOutput(null, requestId);
1123
1287
  console.log("[Hume] Finished:", text);
1288
+ }).catch((err) => {
1289
+ console.error(`[Hume] Error: ${err.message}`);
1290
+ this.onOutput(null, requestId);
1124
1291
  });
1125
1292
  }
1126
1293
  kill() {
@@ -1163,4 +1330,4 @@ var WhisperTTS = class extends MagmaFlowTextToSpeech {
1163
1330
  }
1164
1331
  };
1165
1332
 
1166
- export { DeepgramLanguage, DeepgramModel, DeepgramSTT, DeepgramTTS, ElevenLabsTTS, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, HumeTTS, MagmaFlow, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, WhisperTTS, splitTextIntoChunks };
1333
+ export { DeepgramFluxSTT, DeepgramLanguage, DeepgramModel, DeepgramSTT, DeepgramTTS, ElevenLabsTTS, ElevenLabsVoice, GladiaLanguage, GladiaModel, GladiaSTT, HumeTTS, MagmaFlow, MagmaFlowSpeechToText, MagmaFlowTextToSpeech, WhisperTTS, splitTextIntoChunks };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pompeii-labs/audio",
3
- "version": "0.2.1",
3
+ "version": "0.3.1",
4
4
  "description": "The Audio SDK from Pompeii Labs",
5
5
  "keywords": [
6
6
  "Pompeii",
@@ -42,7 +42,7 @@
42
42
  },
43
43
  "dependencies": {
44
44
  "@deepgram/sdk": "4.2.0",
45
- "hume": "0.11.1",
45
+ "hume": "0.15.13",
46
46
  "openai": "4.86.2"
47
47
  },
48
48
  "devDependencies": {