@fonoster/apiserver 0.9.31 → 0.9.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,22 +5,22 @@ declare function createGetFnUtil(prisma: Prisma): (ref: string) => Promise<{
5
5
  };
6
6
  textToSpeech: {
7
7
  ref: string;
8
- config: import("@prisma/client/runtime/library").JsonValue;
9
8
  credentials: string | null;
9
+ config: import("@prisma/client/runtime/library").JsonValue;
10
10
  productRef: string;
11
11
  applicationRef: string;
12
12
  };
13
13
  speechToText: {
14
14
  ref: string;
15
- config: import("@prisma/client/runtime/library").JsonValue;
16
15
  credentials: string | null;
16
+ config: import("@prisma/client/runtime/library").JsonValue;
17
17
  productRef: string;
18
18
  applicationRef: string;
19
19
  };
20
20
  intelligence: {
21
21
  ref: string;
22
- config: import("@prisma/client/runtime/library").JsonValue;
23
22
  credentials: string | null;
23
+ config: import("@prisma/client/runtime/library").JsonValue;
24
24
  productRef: string;
25
25
  applicationRef: string;
26
26
  };
@@ -32,8 +32,13 @@ function httpBridge(identityConfig, params) {
32
32
  }
33
33
  res.setHeader("content-type", CONTENT_TYPE);
34
34
  stream.on("error", (error) => {
35
- logger.error(`Error reading file: ${error.message}`);
36
- res.status(500).send("Error reading file!");
35
+ logger.error(`error reading file: ${error.message}`);
36
+ if (!res.headersSent) {
37
+ res.status(500).send("Error reading file!");
38
+ }
39
+ else {
40
+ res.end();
41
+ }
37
42
  });
38
43
  stream.on("end", () => {
39
44
  res.end();
@@ -61,6 +66,11 @@ function httpBridge(identityConfig, params) {
61
66
  streamMap.set(id, stream);
62
67
  },
63
68
  removeStream: (id) => {
69
+ logger.verbose(`removing stream with id: ${id}`);
70
+ const stream = streamMap.get(id);
71
+ if (stream) {
72
+ stream.destroy();
73
+ }
64
74
  streamMap.delete(id);
65
75
  },
66
76
  getStream: (id) => {
@@ -52,7 +52,7 @@ declare class VoiceClientImpl implements VoiceClient {
52
52
  startSpeechGather(callback: (stream: {
53
53
  speech: string;
54
54
  responseTime: number;
55
- }) => void): Promise<void>;
55
+ }) => void): void;
56
56
  startDtmfGather(sessionRef: string, callback: (event: {
57
57
  digit: string;
58
58
  }) => void): Promise<void>;
@@ -106,7 +106,6 @@ class VoiceClientImpl {
106
106
  }
107
107
  catch (e) {
108
108
  logger.error("authz service error", e);
109
- // TODO: Play a different sound
110
109
  yield ari.channels.answer({ channelId });
111
110
  yield ari.channels.play({ channelId, media: "sound:unavailable" });
112
111
  yield new Promise((resolve) => setTimeout(resolve, 2000));
@@ -186,7 +185,13 @@ class VoiceClientImpl {
186
185
  }
187
186
  synthesize(text, options) {
188
187
  return __awaiter(this, void 0, void 0, function* () {
189
- const { ref, stream } = yield this.tts.synthesize(text, options);
188
+ const { ref, stream } = this.tts.synthesize(text, options);
189
+ stream.on("error", (error) => __awaiter(this, void 0, void 0, function* () {
190
+ logger.error(`stream error for ref ${ref}: ${error.message}`, {
191
+ errorDetails: error.stack || "No stack trace"
192
+ });
193
+ this.filesServer.removeStream(ref);
194
+ }));
190
195
  this.filesServer.addStream(ref, stream);
191
196
  return ref;
192
197
  });
@@ -203,15 +208,14 @@ class VoiceClientImpl {
203
208
  });
204
209
  }
205
210
  startSpeechGather(callback) {
206
- return __awaiter(this, void 0, void 0, function* () {
207
- try {
208
- const out = this.stt.streamTranscribe(this.transcriptionsStream);
209
- out.on("data", callback);
210
- }
211
- catch (e) {
212
- logger.error(e);
213
- }
214
- });
211
+ const out = this.stt.streamTranscribe(this.transcriptionsStream);
212
+ out.on("data", callback);
213
+ out.on("error", (error) => __awaiter(this, void 0, void 0, function* () {
214
+ logger.error("speech recognition error", { error });
215
+ const { sessionRef: channelId } = this.config;
216
+ const { ari } = this;
217
+ ari.channels.hangup({ channelId });
218
+ }));
215
219
  }
216
220
  startDtmfGather(sessionRef, callback) {
217
221
  return __awaiter(this, void 0, void 0, function* () {
@@ -83,15 +83,34 @@ class Deepgram extends AbstractSpeechToText_1.AbstractSpeechToText {
83
83
  streamTranscribe(stream) {
84
84
  const connection = this.client.listen.live(buildTranscribeConfig(this.engineConfig.config));
85
85
  const out = new stream_1.Stream();
86
+ // Add error handler immediately to catch any connection errors
87
+ connection.on(LiveTranscriptionEvents.Error, (err) => {
88
+ logger.error("error on Deepgram connection", { err });
89
+ // Emit error properly for handling upstream
90
+ out.emit("error", new Error("Speech recognition service error"));
91
+ try {
92
+ connection.destroy();
93
+ }
94
+ catch (destroyErr) {
95
+ logger.error("error destroying connection", { destroyErr });
96
+ }
97
+ });
86
98
  connection.on(LiveTranscriptionEvents.Open, () => {
87
99
  stream.on("data", (chunk) => {
88
- connection.send(chunk);
100
+ try {
101
+ connection.send(chunk);
102
+ }
103
+ catch (err) {
104
+ logger.error("error sending chunk to Deepgram", { err });
105
+ }
89
106
  });
90
107
  connection.on(LiveTranscriptionEvents.Transcript, (data) => {
91
- if (!data.channel.alternatives[0].transcript || !data.speech_final) {
108
+ var _a, _b, _c;
109
+ if (!((_c = (_b = (_a = data.channel) === null || _a === void 0 ? void 0 : _a.alternatives) === null || _b === void 0 ? void 0 : _b[0]) === null || _c === void 0 ? void 0 : _c.transcript) ||
110
+ !data.speech_final) {
92
111
  return;
93
112
  }
94
- const words = data.channel.alternatives[0].words;
113
+ const words = data.channel.alternatives[0].words || [];
95
114
  const responseTime = words.length > 0
96
115
  ? (words.reduce((acc, word) => acc + (word.end - word.start), 0) *
97
116
  1000) /
@@ -106,10 +125,30 @@ class Deepgram extends AbstractSpeechToText_1.AbstractSpeechToText {
106
125
  responseTime
107
126
  });
108
127
  });
109
- connection.on(LiveTranscriptionEvents.Error, (err) => {
110
- logger.warn("error on Deepgram connection", { err });
111
- connection.destroy();
128
+ });
129
+ // Handle stream errors and cleanup
130
+ stream.on("error", (err) => {
131
+ logger.warn("error on input stream", { err });
132
+ // Instead of emitting an error, just end the stream with a message
133
+ out.emit("data", {
134
+ speech: "Error with audio input stream",
135
+ responseTime: 0
112
136
  });
137
+ out.emit("end");
138
+ try {
139
+ connection.destroy();
140
+ }
141
+ catch (destroyErr) {
142
+ logger.warn("error destroying connection", { destroyErr });
143
+ }
144
+ });
145
+ stream.on("end", () => {
146
+ try {
147
+ connection.destroy();
148
+ }
149
+ catch (err) {
150
+ logger.error("error destroying connection on stream end", { err });
151
+ }
113
152
  });
114
153
  return out;
115
154
  }
@@ -143,10 +182,20 @@ class Deepgram extends AbstractSpeechToText_1.AbstractSpeechToText {
143
182
  });
144
183
  });
145
184
  stream.on("end", () => {
146
- connection.destroy();
185
+ try {
186
+ connection.destroy();
187
+ }
188
+ catch (destroyErr) {
189
+ logger.error("error destroying connection", { destroyErr });
190
+ }
147
191
  });
148
192
  stream.on("error", (err) => {
149
- connection.destroy();
193
+ try {
194
+ connection.destroy();
195
+ }
196
+ catch (destroyErr) {
197
+ logger.error("error destroying connection", { destroyErr });
198
+ }
150
199
  reject(err);
151
200
  });
152
201
  });
@@ -23,13 +23,18 @@ declare abstract class AbstractTextToSpeech<E, S extends SynthOptions = SynthOpt
23
23
  abstract readonly engineName: E;
24
24
  protected abstract OUTPUT_FORMAT: "wav" | "sln16";
25
25
  protected abstract CACHING_FIELDS: string[];
26
- abstract synthesize(text: string, options: S): Promise<{
26
+ abstract synthesize(text: string, options: S): {
27
27
  ref: string;
28
28
  stream: Readable;
29
- }>;
29
+ };
30
30
  static getConfigValidationSchema(): z.Schema;
31
31
  static getCredentialsValidationSchema(): z.Schema;
32
32
  protected createMediaReference(): string;
33
33
  getName(): E;
34
+ protected logSynthesisRequest(text: string, options: S): void;
35
+ protected safeSynthesize(ref: string, synthesisFunction: () => Promise<Readable>): Promise<{
36
+ ref: string;
37
+ stream: Readable;
38
+ }>;
34
39
  }
35
40
  export { AbstractTextToSpeech };
@@ -1,8 +1,21 @@
1
1
  "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
2
11
  Object.defineProperty(exports, "__esModule", { value: true });
3
12
  exports.AbstractTextToSpeech = void 0;
13
+ const logger_1 = require("@fonoster/logger");
4
14
  const uuid_1 = require("uuid");
5
15
  const MethodNotImplementedError_1 = require("../errors/MethodNotImplementedError");
16
+ const createErrorStream_1 = require("./utils/createErrorStream");
17
+ const isSsml_1 = require("./utils/isSsml");
18
+ const logger = (0, logger_1.getLogger)({ service: "apiserver", filePath: __filename });
6
19
  class AbstractTextToSpeech {
7
20
  static getConfigValidationSchema() {
8
21
  throw new MethodNotImplementedError_1.MethodNotImplementedError();
@@ -16,5 +29,22 @@ class AbstractTextToSpeech {
16
29
  getName() {
17
30
  return this.engineName;
18
31
  }
32
+ logSynthesisRequest(text, options) {
33
+ logger.verbose(`synthesize [input: ${text}, isSsml=${(0, isSsml_1.isSsml)(text)} options: ${JSON.stringify(options)}]`);
34
+ }
35
+ safeSynthesize(ref, synthesisFunction) {
36
+ return __awaiter(this, void 0, void 0, function* () {
37
+ try {
38
+ const stream = yield synthesisFunction();
39
+ return { ref, stream };
40
+ }
41
+ catch (error) {
42
+ return {
43
+ ref,
44
+ stream: (0, createErrorStream_1.createErrorStream)(`${this.engineName} synthesis failed: ${error.message}`)
45
+ };
46
+ }
47
+ });
48
+ }
19
49
  }
20
50
  exports.AbstractTextToSpeech = AbstractTextToSpeech;
@@ -19,25 +19,18 @@
19
19
  import { Readable } from "stream";
20
20
  import * as z from "zod";
21
21
  import { AbstractTextToSpeech } from "./AbstractTextToSpeech";
22
- import { SynthOptions } from "./types";
22
+ import { AzureTTSConfig, SynthOptions } from "./types";
23
23
  declare const ENGINE_NAME = "tts.azure";
24
- type AzureTTSConfig = {
25
- [key: string]: Record<string, string>;
26
- credentials: {
27
- subscriptionKey: string;
28
- serviceRegion: string;
29
- };
30
- };
31
24
  declare class Azure extends AbstractTextToSpeech<typeof ENGINE_NAME> {
32
25
  config: AzureTTSConfig;
33
26
  readonly engineName = "tts.azure";
34
27
  protected readonly OUTPUT_FORMAT = "sln16";
35
28
  protected readonly CACHING_FIELDS: string[];
36
29
  constructor(config: AzureTTSConfig);
37
- synthesize(text: string, options: SynthOptions): Promise<{
30
+ synthesize(text: string, options: SynthOptions): {
38
31
  ref: string;
39
32
  stream: Readable;
40
- }>;
33
+ };
41
34
  static getConfigValidationSchema(): z.Schema;
42
35
  static getCredentialsValidationSchema(): z.Schema;
43
36
  }
@@ -43,34 +43,14 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
43
43
  };
44
44
  Object.defineProperty(exports, "__esModule", { value: true });
45
45
  exports.ENGINE_NAME = exports.Azure = void 0;
46
- /**
47
- * Copyright (C) 2025 by Fonoster Inc (https://fonoster.com)
48
- * http://github.com/fonoster/fonoster
49
- *
50
- * This file is part of Fonoster
51
- *
52
- * Licensed under the MIT License (the "License");
53
- * you may not use this file except in compliance with
54
- * the License. You may obtain a copy of the License at
55
- *
56
- * https://opensource.org/licenses/MIT
57
- *
58
- * Unless required by applicable law or agreed to in writing, software
59
- * distributed under the License is distributed on an "AS IS" BASIS,
60
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
61
- * See the License for the specific language governing permissions and
62
- * limitations under the License.
63
- */
64
- const stream_1 = require("stream");
65
46
  const common_1 = require("@fonoster/common");
66
- const logger_1 = require("@fonoster/logger");
67
47
  const sdk = __importStar(require("microsoft-cognitiveservices-speech-sdk"));
68
48
  const z = __importStar(require("zod"));
69
49
  const AbstractTextToSpeech_1 = require("./AbstractTextToSpeech");
70
- const isSsml_1 = require("./isSsml");
50
+ const createChunkedSynthesisStream_1 = require("./utils/createChunkedSynthesisStream");
51
+ const isSsml_1 = require("./utils/isSsml");
71
52
  const ENGINE_NAME = "tts.azure";
72
53
  exports.ENGINE_NAME = ENGINE_NAME;
73
- const logger = (0, logger_1.getLogger)({ service: "apiserver", filePath: __filename });
74
54
  class Azure extends AbstractTextToSpeech_1.AbstractTextToSpeech {
75
55
  constructor(config) {
76
56
  super();
@@ -80,36 +60,45 @@ class Azure extends AbstractTextToSpeech_1.AbstractTextToSpeech {
80
60
  this.config = config;
81
61
  }
82
62
  synthesize(text, options) {
83
- return __awaiter(this, void 0, void 0, function* () {
84
- logger.verbose(`synthesize [input: ${text}, isSsml=${(0, isSsml_1.isSsml)(text)} options: ${JSON.stringify(options)}]`);
85
- const { subscriptionKey, serviceRegion } = this.config.credentials;
86
- const speechConfig = sdk.SpeechConfig.fromSubscription(subscriptionKey, serviceRegion);
87
- speechConfig.speechSynthesisVoiceName = options.voice;
88
- speechConfig.speechSynthesisOutputFormat =
89
- sdk.SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm;
63
+ this.logSynthesisRequest(text, options);
64
+ const ref = this.createMediaReference();
65
+ const { subscriptionKey, serviceRegion } = this.config.credentials;
66
+ const voice = options.voice || this.config.config.voice;
67
+ const speechConfig = sdk.SpeechConfig.fromSubscription(subscriptionKey, serviceRegion);
68
+ speechConfig.speechSynthesisVoiceName = voice;
69
+ speechConfig.speechSynthesisOutputFormat =
70
+ sdk.SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm;
71
+ const stream = (0, createChunkedSynthesisStream_1.createChunkedSynthesisStream)(text, (chunkText) => __awaiter(this, void 0, void 0, function* () {
90
72
  const synthesizer = new sdk.SpeechSynthesizer(speechConfig);
91
- const isSSML = (0, isSsml_1.isSsml)(text);
73
+ const isSSML = (0, isSsml_1.isSsml)(chunkText);
92
74
  const func = isSSML ? "speakSsmlAsync" : "speakTextAsync";
93
- const audioData = yield new Promise((resolve, reject) => {
94
- const audioChunks = [];
95
- synthesizer[func](text, (result) => {
96
- if (result.reason === sdk.ResultReason.SynthesizingAudioCompleted) {
97
- audioChunks.push(Buffer.from(result.audioData));
98
- resolve(Buffer.concat(audioChunks));
99
- }
100
- else {
101
- reject(new Error("Speech synthesis canceled: " + result.errorDetails));
102
- }
103
- synthesizer.close();
104
- }, (err) => {
105
- synthesizer.close();
106
- reject(new Error(err));
75
+ try {
76
+ const audioData = yield new Promise((resolve, reject) => {
77
+ const audioChunks = [];
78
+ synthesizer[func](chunkText, (result) => {
79
+ if (result.reason === sdk.ResultReason.SynthesizingAudioCompleted) {
80
+ audioChunks.push(Buffer.from(result.audioData));
81
+ resolve(Buffer.concat(audioChunks));
82
+ }
83
+ else {
84
+ reject(new Error("Speech synthesis canceled: " + result.errorDetails));
85
+ }
86
+ synthesizer.close();
87
+ }, (err) => {
88
+ synthesizer.close();
89
+ reject(new Error(err));
90
+ });
107
91
  });
108
- });
109
- const ref = this.createMediaReference();
110
- const stream = stream_1.Readable.from(audioData);
111
- return { ref, stream };
112
- });
92
+ // Ignore the first 44 bytes of the response to avoid the WAV header
93
+ return audioData.subarray(44);
94
+ }
95
+ catch (error) {
96
+ // Make sure synthesizer is closed in case of error
97
+ synthesizer.close();
98
+ throw error;
99
+ }
100
+ }));
101
+ return { ref, stream };
113
102
  }
114
103
  static getConfigValidationSchema() {
115
104
  return z.object({
@@ -20,14 +20,8 @@ import { Readable } from "stream";
20
20
  import { DeepgramClient } from "@deepgram/sdk";
21
21
  import * as z from "zod";
22
22
  import { AbstractTextToSpeech } from "./AbstractTextToSpeech";
23
- import { SynthOptions } from "./types";
23
+ import { DeepgramTtsConfig, SynthOptions } from "./types";
24
24
  declare const ENGINE_NAME = "tts.deepgram";
25
- type DeepgramTtsConfig = {
26
- [key: string]: Record<string, string>;
27
- credentials: {
28
- apiKey: string;
29
- };
30
- };
31
25
  declare class Deepgram extends AbstractTextToSpeech<typeof ENGINE_NAME> {
32
26
  client: DeepgramClient;
33
27
  engineConfig: DeepgramTtsConfig;
@@ -37,11 +31,10 @@ declare class Deepgram extends AbstractTextToSpeech<typeof ENGINE_NAME> {
37
31
  protected readonly AUDIO_ENCODING: "linear16";
38
32
  protected readonly SAMPLE_RATE_HERTZ = 16000;
39
33
  constructor(config: DeepgramTtsConfig);
40
- synthesize(text: string, options: SynthOptions): Promise<{
34
+ synthesize(text: string, options: SynthOptions): {
41
35
  ref: string;
42
36
  stream: Readable;
43
- }>;
44
- private doSynthesize;
37
+ };
45
38
  static getConfigValidationSchema(): z.Schema;
46
39
  static getCredentialsValidationSchema(): z.Schema;
47
40
  }
@@ -43,36 +43,14 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
43
43
  };
44
44
  Object.defineProperty(exports, "__esModule", { value: true });
45
45
  exports.ENGINE_NAME = exports.Deepgram = void 0;
46
- /**
47
- * Copyright (C) 2025 by Fonoster Inc (https://fonoster.com)
48
- * http://github.com/fonoster/fonoster
49
- *
50
- * This file is part of Fonoster
51
- *
52
- * Licensed under the MIT License (the "License");
53
- * you may not use this file except in compliance with
54
- * the License. You may obtain a copy of the License at
55
- *
56
- * https://opensource.org/licenses/MIT
57
- *
58
- * Unless required by applicable law or agreed to in writing, software
59
- * distributed under the License is distributed on an "AS IS" BASIS,
60
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
61
- * See the License for the specific language governing permissions and
62
- * limitations under the License.
63
- */
64
- const stream_1 = require("stream");
65
46
  const sdk_1 = require("@deepgram/sdk");
66
47
  const common_1 = require("@fonoster/common");
67
- const logger_1 = require("@fonoster/logger");
68
48
  const z = __importStar(require("zod"));
69
- const textChunksByFirstNaturalPause_1 = require("../handlers/utils/textChunksByFirstNaturalPause");
70
49
  const AbstractTextToSpeech_1 = require("./AbstractTextToSpeech");
71
- const isSsml_1 = require("./isSsml");
72
- const streamToBuffer_1 = require("./streamToBuffer");
50
+ const createChunkedSynthesisStream_1 = require("./utils/createChunkedSynthesisStream");
51
+ const streamToBuffer_1 = require("./utils/streamToBuffer");
73
52
  const ENGINE_NAME = "tts.deepgram";
74
53
  exports.ENGINE_NAME = ENGINE_NAME;
75
- const logger = (0, logger_1.getLogger)({ service: "apiserver", filePath: __filename });
76
54
  class Deepgram extends AbstractTextToSpeech_1.AbstractTextToSpeech {
77
55
  constructor(config) {
78
56
  super();
@@ -85,51 +63,20 @@ class Deepgram extends AbstractTextToSpeech_1.AbstractTextToSpeech {
85
63
  this.engineConfig = config;
86
64
  }
87
65
  synthesize(text, options) {
88
- return __awaiter(this, void 0, void 0, function* () {
89
- logger.verbose(`synthesize [input: ${text}, isSsml=${(0, isSsml_1.isSsml)(text)} options: ${JSON.stringify(options)}]`);
90
- const { voice } = this.engineConfig.config;
91
- const ref = this.createMediaReference();
92
- const chunks = (0, textChunksByFirstNaturalPause_1.textChunksByFirstNaturalPause)(text);
93
- const stream = new stream_1.Readable({ read() { } });
94
- const results = new Array(chunks.length);
95
- let nextIndexToPush = 0;
96
- function observeQueue() {
97
- if (nextIndexToPush < results.length &&
98
- results[nextIndexToPush] !== undefined) {
99
- stream.push(results[nextIndexToPush]);
100
- nextIndexToPush++;
101
- setImmediate(observeQueue);
102
- }
103
- else if (nextIndexToPush < results.length) {
104
- setTimeout(observeQueue, 10);
105
- }
106
- else {
107
- stream.push(null);
108
- }
109
- }
110
- observeQueue();
111
- chunks.forEach((text, index) => {
112
- this.doSynthesize(text, voice)
113
- .then((synthesizedText) => {
114
- results[index] = synthesizedText;
115
- })
116
- .catch((error) => {
117
- stream.emit("error", error);
118
- });
119
- });
120
- return { ref, stream };
121
- });
122
- }
123
- doSynthesize(text, voice) {
124
- return __awaiter(this, void 0, void 0, function* () {
125
- const response = yield this.client.speak.request({ text }, {
126
- model: voice || common_1.DeepgramVoice.AURA_ASTERIA_EN,
66
+ this.logSynthesisRequest(text, options);
67
+ const { voice } = this.engineConfig.config;
68
+ const ref = this.createMediaReference();
69
+ const selectedVoice = voice || common_1.DeepgramVoice.AURA_ASTERIA_EN;
70
+ const stream = (0, createChunkedSynthesisStream_1.createChunkedSynthesisStream)(text, (chunkText) => __awaiter(this, void 0, void 0, function* () {
71
+ const response = yield this.client.speak.request({ text: chunkText }, {
72
+ model: selectedVoice,
127
73
  encoding: this.AUDIO_ENCODING,
128
74
  sample_rate: this.SAMPLE_RATE_HERTZ,
129
75
  container: "none"
130
76
  });
131
77
  return (yield (0, streamToBuffer_1.streamToBuffer)(yield response.getStream()));
132
- });
78
+ }));
79
+ return { ref, stream };
133
80
  }
134
81
  static getConfigValidationSchema() {
135
82
  return z.object({
@@ -20,14 +20,8 @@ import { Readable } from "stream";
20
20
  import { ElevenLabsClient } from "elevenlabs";
21
21
  import * as z from "zod";
22
22
  import { AbstractTextToSpeech } from "./AbstractTextToSpeech";
23
- import { SynthOptions } from "./types";
23
+ import { ElevenLabsTtsConfig, SynthOptions } from "./types";
24
24
  declare const ENGINE_NAME = "tts.elevenlabs";
25
- type ElevenLabsTtsConfig = {
26
- [key: string]: Record<string, string>;
27
- credentials: {
28
- apiKey: string;
29
- };
30
- };
31
25
  declare class ElevenLabs extends AbstractTextToSpeech<typeof ENGINE_NAME> {
32
26
  client: ElevenLabsClient;
33
27
  engineConfig: ElevenLabsTtsConfig;
@@ -35,11 +29,10 @@ declare class ElevenLabs extends AbstractTextToSpeech<typeof ENGINE_NAME> {
35
29
  protected readonly OUTPUT_FORMAT = "sln16";
36
30
  protected readonly CACHING_FIELDS: string[];
37
31
  constructor(config: ElevenLabsTtsConfig);
38
- synthesize(text: string, options: SynthOptions): Promise<{
32
+ synthesize(text: string, options: SynthOptions): {
39
33
  ref: string;
40
34
  stream: Readable;
41
- }>;
42
- private doSynthesize;
35
+ };
43
36
  static getConfigValidationSchema(): z.Schema;
44
37
  static getCredentialsValidationSchema(): z.Schema;
45
38
  }
@@ -43,35 +43,13 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
43
43
  };
44
44
  Object.defineProperty(exports, "__esModule", { value: true });
45
45
  exports.ElevenLabs = exports.ENGINE_NAME = void 0;
46
- /**
47
- * Copyright (C) 2025 by Fonoster Inc (https://fonoster.com)
48
- * http://github.com/fonoster/fonoster
49
- *
50
- * This file is part of Fonoster
51
- *
52
- * Licensed under the MIT License (the "License");
53
- * you may not use this file except in compliance with
54
- * the License. You may obtain a copy of the License at
55
- *
56
- * https://opensource.org/licenses/MIT
57
- *
58
- * Unless required by applicable law or agreed to in writing, software
59
- * distributed under the License is distributed on an "AS IS" BASIS,
60
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
61
- * See the License for the specific language governing permissions and
62
- * limitations under the License.
63
- */
64
- const stream_1 = require("stream");
65
- const logger_1 = require("@fonoster/logger");
66
46
  const elevenlabs_1 = require("elevenlabs");
67
47
  const z = __importStar(require("zod"));
68
- const textChunksByFirstNaturalPause_1 = require("../handlers/utils/textChunksByFirstNaturalPause"); // Assuming this is the chunking function
69
48
  const AbstractTextToSpeech_1 = require("./AbstractTextToSpeech");
70
- const isSsml_1 = require("./isSsml");
71
- const streamToBuffer_1 = require("./streamToBuffer");
49
+ const createChunkedSynthesisStream_1 = require("./utils/createChunkedSynthesisStream");
50
+ const streamToBuffer_1 = require("./utils/streamToBuffer");
72
51
  const ENGINE_NAME = "tts.elevenlabs";
73
52
  exports.ENGINE_NAME = ENGINE_NAME;
74
- const logger = (0, logger_1.getLogger)({ service: "apiserver", filePath: __filename });
75
53
  class ElevenLabs extends AbstractTextToSpeech_1.AbstractTextToSpeech {
76
54
  constructor(config) {
77
55
  super();
@@ -82,55 +60,24 @@ class ElevenLabs extends AbstractTextToSpeech_1.AbstractTextToSpeech {
82
60
  this.engineConfig = config;
83
61
  }
84
62
  synthesize(text, options) {
85
- return __awaiter(this, void 0, void 0, function* () {
86
- logger.verbose(`synthesize [input: ${text}, isSsml=${(0, isSsml_1.isSsml)(text)} options: ${JSON.stringify(options)}]`);
87
- const { voice, model } = this.engineConfig.config;
88
- const ref = this.createMediaReference();
89
- const chunks = (0, textChunksByFirstNaturalPause_1.textChunksByFirstNaturalPause)(text);
90
- const stream = new stream_1.Readable({ read() { } });
91
- const results = new Array(chunks.length);
92
- let nextIndexToPush = 0;
93
- function observeQueue() {
94
- if (nextIndexToPush < results.length &&
95
- results[nextIndexToPush] !== undefined) {
96
- stream.push(results[nextIndexToPush]);
97
- nextIndexToPush++;
98
- setImmediate(observeQueue);
99
- }
100
- else if (nextIndexToPush < results.length) {
101
- setTimeout(observeQueue, 10);
102
- }
103
- else {
104
- stream.push(null);
105
- }
106
- }
107
- observeQueue();
108
- chunks.forEach((text, index) => {
109
- this.doSynthesize({ text, voice, model })
110
- .then((synthesizedText) => {
111
- results[index] = synthesizedText;
112
- })
113
- .catch((error) => {
114
- stream.emit("error", error);
115
- });
116
- });
117
- return { ref, stream };
118
- });
119
- }
120
- doSynthesize(params) {
121
- return __awaiter(this, void 0, void 0, function* () {
122
- const { text, voice, model } = params;
63
+ this.logSynthesisRequest(text, options);
64
+ const { voice, model } = this.engineConfig.config;
65
+ const ref = this.createMediaReference();
66
+ const stream = (0, createChunkedSynthesisStream_1.createChunkedSynthesisStream)(text, (chunkText) => __awaiter(this, void 0, void 0, function* () {
123
67
  const response = yield this.client.generate({
124
68
  stream: true,
125
69
  voice,
126
- text,
70
+ text: chunkText,
127
71
  model_id: model !== null && model !== void 0 ? model : "eleven_flash_v2_5",
128
72
  output_format: "pcm_16000",
129
73
  // TODO: Make this configurable
130
74
  optimize_streaming_latency: 2
75
+ }, {
76
+ maxRetries: 3
131
77
  });
132
78
  return (yield (0, streamToBuffer_1.streamToBuffer)(response));
133
- });
79
+ }));
80
+ return { ref, stream };
134
81
  }
135
82
  static getConfigValidationSchema() {
136
83
  return z.object({});
@@ -20,15 +20,8 @@ import { Readable } from "stream";
20
20
  import { TextToSpeechClient } from "@google-cloud/text-to-speech";
21
21
  import * as z from "zod";
22
22
  import { AbstractTextToSpeech } from "./AbstractTextToSpeech";
23
- import { SynthOptions } from "./types";
23
+ import { GoogleTtsConfig, SynthOptions } from "./types";
24
24
  declare const ENGINE_NAME = "tts.google";
25
- type GoogleTtsConfig = {
26
- [key: string]: Record<string, string>;
27
- credentials: {
28
- client_email: string;
29
- private_key: string;
30
- };
31
- };
32
25
  declare class Google extends AbstractTextToSpeech<typeof ENGINE_NAME> {
33
26
  client: TextToSpeechClient;
34
27
  engineConfig: GoogleTtsConfig;
@@ -38,10 +31,10 @@ declare class Google extends AbstractTextToSpeech<typeof ENGINE_NAME> {
38
31
  protected readonly AUDIO_ENCODING: "LINEAR16";
39
32
  protected readonly SAMPLE_RATE_HERTZ = 16000;
40
33
  constructor(config: GoogleTtsConfig);
41
- synthesize(text: string, options: SynthOptions): Promise<{
34
+ synthesize(text: string, options: SynthOptions): {
42
35
  ref: string;
43
36
  stream: Readable;
44
- }>;
37
+ };
45
38
  static getConfigValidationSchema(): z.Schema;
46
39
  static getCredentialsValidationSchema(): z.Schema;
47
40
  }
@@ -43,34 +43,14 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
43
43
  };
44
44
  Object.defineProperty(exports, "__esModule", { value: true });
45
45
  exports.Google = exports.ENGINE_NAME = void 0;
46
- /**
47
- * Copyright (C) 2025 by Fonoster Inc (https://fonoster.com)
48
- * http://github.com/fonoster/fonoster
49
- *
50
- * This file is part of Fonoster
51
- *
52
- * Licensed under the MIT License (the "License");
53
- * you may not use this file except in compliance with
54
- * the License. You may obtain a copy of the License at
55
- *
56
- * https://opensource.org/licenses/MIT
57
- *
58
- * Unless required by applicable law or agreed to in writing, software
59
- * distributed under the License is distributed on an "AS IS" BASIS,
60
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
61
- * See the License for the specific language governing permissions and
62
- * limitations under the License.
63
- */
64
- const stream_1 = require("stream");
65
46
  const common_1 = require("@fonoster/common");
66
- const logger_1 = require("@fonoster/logger");
67
47
  const text_to_speech_1 = require("@google-cloud/text-to-speech");
68
48
  const z = __importStar(require("zod"));
69
49
  const AbstractTextToSpeech_1 = require("./AbstractTextToSpeech");
70
- const isSsml_1 = require("./isSsml");
50
+ const createChunkedSynthesisStream_1 = require("./utils/createChunkedSynthesisStream");
51
+ const isSsml_1 = require("./utils/isSsml");
71
52
  const ENGINE_NAME = "tts.google";
72
53
  exports.ENGINE_NAME = ENGINE_NAME;
73
- const logger = (0, logger_1.getLogger)({ service: "apiserver", filePath: __filename });
74
54
  class Google extends AbstractTextToSpeech_1.AbstractTextToSpeech {
75
55
  constructor(config) {
76
56
  super();
@@ -83,12 +63,13 @@ class Google extends AbstractTextToSpeech_1.AbstractTextToSpeech {
83
63
  this.engineConfig = config;
84
64
  }
85
65
  synthesize(text, options) {
86
- return __awaiter(this, void 0, void 0, function* () {
87
- logger.verbose(`synthesize [input: ${text}, isSsml=${(0, isSsml_1.isSsml)(text)} options: ${JSON.stringify(options)}]`);
88
- const { voice } = this.engineConfig.config;
89
- const lang = `${voice.split("-")[0]}-${voice.split("-")[1]}`;
66
+ this.logSynthesisRequest(text, options);
67
+ const ref = this.createMediaReference();
68
+ const { voice } = this.engineConfig.config;
69
+ const lang = `${voice.split("-")[0]}-${voice.split("-")[1]}`;
70
+ const stream = (0, createChunkedSynthesisStream_1.createChunkedSynthesisStream)(text, (chunkText) => __awaiter(this, void 0, void 0, function* () {
90
71
  const request = {
91
- input: (0, isSsml_1.isSsml)(text) ? { ssml: text } : { text },
72
+ input: (0, isSsml_1.isSsml)(chunkText) ? { ssml: chunkText } : { text: chunkText },
92
73
  audioConfig: {
93
74
  audioEncoding: this.AUDIO_ENCODING,
94
75
  sampleRateHertz: this.SAMPLE_RATE_HERTZ
@@ -99,9 +80,11 @@ class Google extends AbstractTextToSpeech_1.AbstractTextToSpeech {
99
80
  }
100
81
  };
101
82
  const [response] = yield this.client.synthesizeSpeech(request);
102
- const ref = this.createMediaReference();
103
- return { ref, stream: stream_1.Readable.from(response.audioContent) };
104
- });
83
+ const audioContent = response.audioContent;
84
+ // Skip the 44-byte RIFF/WAV header so only raw PCM audio is streamed
85
+ return audioContent.subarray(44);
86
+ }));
87
+ return { ref, stream };
105
88
  }
106
89
  static getConfigValidationSchema() {
107
90
  return z.object({
@@ -19,4 +19,30 @@
19
19
/**
 * Options accepted by a TTS engine's synthesize call.
 */
type SynthOptions = {
    voice: string;
};
/**
 * Deepgram TTS engine configuration.
 * NOTE(review): the string index signature maps any extra key to
 * Record<string, string>; confirm callers rely on it before tightening.
 */
type DeepgramTtsConfig = {
    [key: string]: Record<string, string>;
    credentials: {
        apiKey: string;
    };
};
/**
 * ElevenLabs TTS engine configuration.
 */
type ElevenLabsTtsConfig = {
    [key: string]: Record<string, string>;
    credentials: {
        apiKey: string;
    };
};
/**
 * Google Cloud TTS engine configuration.
 * Credential keys use snake_case to match Google service-account JSON.
 */
type GoogleTtsConfig = {
    [key: string]: Record<string, string>;
    credentials: {
        client_email: string;
        private_key: string;
    };
};
/**
 * Azure Speech TTS engine configuration.
 * NOTE(review): casing ("TTS" vs the "Tts" used by the other aliases) is
 * inconsistent, but renaming would break the public export.
 */
type AzureTTSConfig = {
    [key: string]: Record<string, string>;
    credentials: {
        subscriptionKey: string;
        serviceRegion: string;
    };
};
export { SynthOptions, AzureTTSConfig, DeepgramTtsConfig, ElevenLabsTtsConfig, GoogleTtsConfig };
@@ -0,0 +1,28 @@
1
+ /**
2
+ * Copyright (C) 2025 by Fonoster Inc (https://fonoster.com)
3
+ * http://github.com/fonoster/fonoster
4
+ *
5
+ * This file is part of Fonoster
6
+ *
7
+ * Licensed under the MIT License (the "License");
8
+ * you may not use this file except in compliance with
9
+ * the License. You may obtain a copy of the License at
10
+ *
11
+ * https://opensource.org/licenses/MIT
12
+ *
13
+ * Unless required by applicable law or agreed to in writing, software
14
+ * distributed under the License is distributed on an "AS IS" BASIS,
15
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ * See the License for the specific language governing permissions and
17
+ * limitations under the License.
18
+ */
19
+ import { Readable } from "stream";
20
+ /**
21
+ * Creates a readable stream that processes text in chunks for better streaming performance.
22
+ * This utility ensures that chunks are processed in parallel but streamed in the correct order.
23
+ *
24
+ * @param text - The text to be synthesized
25
+ * @param synthesizeChunk - Function that processes each chunk and returns a Buffer or Readable
26
+ * @returns A readable stream containing the synthesized audio
27
+ */
28
+ export declare function createChunkedSynthesisStream(text: string, synthesizeChunk: (text: string, index: number) => Promise<Buffer | Readable>): Readable;
@@ -0,0 +1,78 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.createChunkedSynthesisStream = createChunkedSynthesisStream;
4
+ /**
5
+ * Copyright (C) 2025 by Fonoster Inc (https://fonoster.com)
6
+ * http://github.com/fonoster/fonoster
7
+ *
8
+ * This file is part of Fonoster
9
+ *
10
+ * Licensed under the MIT License (the "License");
11
+ * you may not use this file except in compliance with
12
+ * the License. You may obtain a copy of the License at
13
+ *
14
+ * https://opensource.org/licenses/MIT
15
+ *
16
+ * Unless required by applicable law or agreed to in writing, software
17
+ * distributed under the License is distributed on an "AS IS" BASIS,
18
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ * See the License for the specific language governing permissions and
20
+ * limitations under the License.
21
+ */
22
+ const stream_1 = require("stream");
23
+ const logger_1 = require("@fonoster/logger");
24
+ const textChunksByFirstNaturalPause_1 = require("../../handlers/utils/textChunksByFirstNaturalPause");
25
+ const logger = (0, logger_1.getLogger)({ service: "apiserver", filePath: __filename });
26
+ /**
27
+ * Creates a readable stream that processes text in chunks for better streaming performance.
28
+ * This utility ensures that chunks are processed in parallel but streamed in the correct order.
29
+ *
30
+ * @param text - The text to be synthesized
31
+ * @param synthesizeChunk - Function that processes each chunk and returns a Buffer or Readable
32
+ * @returns A readable stream containing the synthesized audio
33
+ */
34
/**
 * Creates a readable stream that processes text in chunks for better
 * streaming performance. Chunks are synthesized in parallel but pushed to
 * the stream strictly in their original order.
 *
 * @param text - The text to be synthesized
 * @param synthesizeChunk - Function that processes each chunk and returns a Buffer or Readable
 * @returns A readable stream containing the synthesized audio
 */
function createChunkedSynthesisStream(text, synthesizeChunk) {
    const chunks = (0, textChunksByFirstNaturalPause_1.textChunksByFirstNaturalPause)(text);
    const stream = new stream_1.Readable({ read() { } });
    if (chunks.length === 0) {
        logger.verbose("no text chunks to synthesize, returning empty stream");
        stream.push(null);
        return stream;
    }
    logger.verbose(`processing ${chunks.length} text chunks for synthesis`);
    // One slot per chunk; a slot is filled when its synthesis promise resolves.
    const results = new Array(chunks.length);
    let nextIndexToPush = 0;
    let hasError = false;
    // Push every contiguous completed result starting at nextIndexToPush.
    // Invoked from each resolution callback, so ordering is preserved without
    // polling. (The previous setTimeout-based observer never terminated after
    // an error — results could no longer fill — leaking a timer that kept the
    // event loop alive.)
    const flushReadyChunks = () => {
        if (hasError) {
            return;
        }
        while (nextIndexToPush < results.length &&
            results[nextIndexToPush] !== undefined) {
            stream.push(results[nextIndexToPush]);
            // Drop the reference so the pushed buffer can be garbage collected.
            results[nextIndexToPush] = undefined;
            nextIndexToPush++;
        }
        if (nextIndexToPush === results.length) {
            stream.push(null);
        }
    };
    // Kick off all chunk syntheses in parallel; completions flush in order.
    chunks.forEach((chunkText, index) => {
        synthesizeChunk(chunkText, index)
            .then((audio) => {
            if (!hasError) {
                results[index] = audio;
                flushReadyChunks();
            }
        })
            .catch((error) => {
            // First failure wins: surface it once and end the stream.
            if (!hasError) {
                hasError = true;
                logger.error(`chunk synthesis failed: ${error.message}`);
                stream.emit("error", new Error(`Synthesis failed: ${error.message}`));
                stream.push(null);
            }
        });
    });
    return stream;
}
@@ -0,0 +1,21 @@
1
+ /**
2
+ * Copyright (C) 2025 by Fonoster Inc (https://fonoster.com)
3
+ * http://github.com/fonoster/fonoster
4
+ *
5
+ * This file is part of Fonoster
6
+ *
7
+ * Licensed under the MIT License (the "License");
8
+ * you may not use this file except in compliance with
9
+ * the License. You may obtain a copy of the License at
10
+ *
11
+ * https://opensource.org/licenses/MIT
12
+ *
13
+ * Unless required by applicable law or agreed to in writing, software
14
+ * distributed under the License is distributed on an "AS IS" BASIS,
15
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ * See the License for the specific language governing permissions and
17
+ * limitations under the License.
18
+ */
19
import { Readable } from "stream";
/**
 * Creates a Readable that delivers the given message as a stream 'error'
 * event rather than data, so failures can flow through a caller's
 * existing stream error handling.
 *
 * @param errorMessage - Message used to construct the emitted Error
 */
declare function createErrorStream(errorMessage: string): Readable;
export { createErrorStream };
@@ -0,0 +1,28 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.createErrorStream = createErrorStream;
4
+ /**
5
+ * Copyright (C) 2025 by Fonoster Inc (https://fonoster.com)
6
+ * http://github.com/fonoster/fonoster
7
+ *
8
+ * This file is part of Fonoster
9
+ *
10
+ * Licensed under the MIT License (the "License");
11
+ * you may not use this file except in compliance with
12
+ * the License. You may obtain a copy of the License at
13
+ *
14
+ * https://opensource.org/licenses/MIT
15
+ *
16
+ * Unless required by applicable law or agreed to in writing, software
17
+ * distributed under the License is distributed on an "AS IS" BASIS,
18
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ * See the License for the specific language governing permissions and
20
+ * limitations under the License.
21
+ */
22
+ const stream_1 = require("stream");
23
+ function createErrorStream(errorMessage) {
24
+ const errorStream = new stream_1.Readable({ read() { } });
25
+ errorStream.emit("error", new Error(errorMessage));
26
+ errorStream.push(null);
27
+ return errorStream;
28
+ }
@@ -48,10 +48,10 @@ type VoiceClient = {
48
48
  getTranscriptionsStream: () => Stream;
49
49
  };
50
50
// Engine-agnostic text-to-speech contract. synthesize is synchronous in
// this version: it returns a media reference and a Readable immediately,
// and audio is pushed onto the stream as chunks become available.
type TextToSpeech = {
    synthesize: (text: string, options: Record<string, unknown>) => {
        ref: string;
        stream: Readable;
    };
};
56
56
  type SpeechToText = {
57
57
  transcribe: (stream: Stream) => Promise<SpeechResult>;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@fonoster/apiserver",
3
- "version": "0.9.31",
3
+ "version": "0.9.38",
4
4
  "description": "APIServer for Fonoster",
5
5
  "author": "Pedro Sanders <psanders@fonoster.com>",
6
6
  "homepage": "https://github.com/fonoster/fonoster#readme",
@@ -21,11 +21,11 @@
21
21
  },
22
22
  "dependencies": {
23
23
  "@deepgram/sdk": "^3.5.1",
24
- "@fonoster/authz": "^0.9.31",
25
- "@fonoster/common": "^0.9.31",
26
- "@fonoster/identity": "^0.9.31",
24
+ "@fonoster/authz": "^0.9.35",
25
+ "@fonoster/common": "^0.9.35",
26
+ "@fonoster/identity": "^0.9.35",
27
27
  "@fonoster/logger": "^0.9.30",
28
- "@fonoster/sipnet": "^0.9.31",
28
+ "@fonoster/sipnet": "^0.9.35",
29
29
  "@fonoster/streams": "^0.9.30",
30
30
  "@fonoster/types": "^0.9.30",
31
31
  "@google-cloud/speech": "^6.6.0",
@@ -73,5 +73,5 @@
73
73
  "@types/uuid": "^10.0.0",
74
74
  "@types/validator": "^13.12.0"
75
75
  },
76
- "gitHead": "8d324aaed02811c1b143e60fbd4a6a8091ec164e"
76
+ "gitHead": "a32db703d073809843cd2060bc4c6c88f939c083"
77
77
  }
File without changes