@juspay/neurolink 8.14.0 → 8.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/adapters/tts/googleTTSHandler.d.ts +36 -24
- package/dist/adapters/tts/googleTTSHandler.js +169 -18
- package/dist/lib/adapters/tts/googleTTSHandler.d.ts +36 -24
- package/dist/lib/adapters/tts/googleTTSHandler.js +169 -18
- package/dist/lib/types/ttsTypes.d.ts +15 -0
- package/dist/lib/utils/ttsProcessor.d.ts +1 -0
- package/dist/lib/utils/ttsProcessor.js +1 -0
- package/dist/types/ttsTypes.d.ts +15 -0
- package/dist/utils/ttsProcessor.d.ts +1 -0
- package/dist/utils/ttsProcessor.js +1 -0
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,9 @@
|
|
|
1
|
+
## [8.15.0](https://github.com/juspay/neurolink/compare/v8.14.0...v8.15.0) (2025-12-14)
|
|
2
|
+
|
|
3
|
+
### Features
|
|
4
|
+
|
|
5
|
+
- **(tts):** Implement synthesize method in GoogleTTSHandler ([9262e37](https://github.com/juspay/neurolink/commit/9262e37a08ef856eb5d16fd65fa922bd700897cb))
|
|
6
|
+
|
|
1
7
|
## [8.14.0](https://github.com/juspay/neurolink/compare/v8.13.2...v8.14.0) (2025-12-14)
|
|
2
8
|
|
|
3
9
|
### Features
|
|
package/dist/adapters/tts/googleTTSHandler.d.ts
CHANGED
|
@@ -1,35 +1,28 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Google Cloud Text-to-Speech Handler
|
|
3
|
-
*
|
|
4
|
-
* Handler for Google Cloud Text-to-Speech API integration.
|
|
5
|
-
* Supports Neural2 and WaveNet voice models with 220+ voices across 40+ languages.
|
|
6
|
-
*
|
|
7
|
-
* @module adapters/tts/googleTTSHandler
|
|
8
|
-
* @see https://cloud.google.com/text-to-speech/docs
|
|
9
|
-
*/
|
|
10
1
|
import type { TTSHandler } from "../../utils/ttsProcessor.js";
|
|
11
2
|
import type { TTSOptions, TTSResult, TTSVoice } from "../../types/ttsTypes.js";
|
|
12
|
-
/**
|
|
13
|
-
* Google Cloud TTS handler implementation
|
|
14
|
-
*
|
|
15
|
-
* Integrates with Google Cloud Text-to-Speech API for voice synthesis.
|
|
16
|
-
* Supports authentication via:
|
|
17
|
-
* - Explicit service account JSON key path
|
|
18
|
-
* - GOOGLE_APPLICATION_CREDENTIALS environment variable
|
|
19
|
-
*/
|
|
20
3
|
export declare class GoogleTTSHandler implements TTSHandler {
|
|
21
4
|
private client;
|
|
22
5
|
/**
|
|
23
|
-
*
|
|
24
|
-
*
|
|
6
|
+
* Google Cloud TTS maximum input size.
|
|
7
|
+
* ~5000 bytes INCLUDING SSML tags.
|
|
25
8
|
*/
|
|
26
9
|
private static readonly DEFAULT_MAX_TEXT_LENGTH;
|
|
27
|
-
maxTextLength: number;
|
|
28
10
|
/**
|
|
29
|
-
*
|
|
11
|
+
* Default timeout for Google Cloud TTS API calls (milliseconds)
|
|
30
12
|
*
|
|
31
|
-
*
|
|
13
|
+
* Google typically responds within:
|
|
14
|
+
* - 1–5 seconds for short or normal text
|
|
15
|
+
* - 5–10 seconds for longer text or Neural2 voices
|
|
32
16
|
*/
|
|
17
|
+
private static readonly DEFAULT_API_TIMEOUT_MS;
|
|
18
|
+
/**
|
|
19
|
+
* Maximum text length supported by Google Cloud TTS (in bytes).
|
|
20
|
+
*
|
|
21
|
+
* NOTE:
|
|
22
|
+
* Validation against this limit is performed by the shared TTS processor
|
|
23
|
+
* before invoking provider handlers, not inside this class.
|
|
24
|
+
*/
|
|
25
|
+
readonly maxTextLength: number;
|
|
33
26
|
constructor(credentialsPath?: string);
|
|
34
27
|
/**
|
|
35
28
|
* Validate that the provider is properly configured
|
|
@@ -42,6 +35,7 @@ export declare class GoogleTTSHandler implements TTSHandler {
|
|
|
42
35
|
*
|
|
43
36
|
* Note: This method is optional in the TTSHandler interface, but Google Cloud TTS
|
|
44
37
|
* fully implements it to provide comprehensive voice discovery capabilities.
|
|
38
|
+
* Will be Implemented in ISSUE - TTS-014
|
|
45
39
|
*
|
|
46
40
|
* @param languageCode - Optional language filter (e.g., "en-US")
|
|
47
41
|
* @returns List of available voices
|
|
@@ -50,9 +44,27 @@ export declare class GoogleTTSHandler implements TTSHandler {
|
|
|
50
44
|
/**
|
|
51
45
|
* Generate audio from text using provider-specific TTS API
|
|
52
46
|
*
|
|
53
|
-
* @param text - Text to convert to speech
|
|
47
|
+
* @param text - Text or SSML to convert to speech
|
|
54
48
|
* @param options - TTS configuration options
|
|
55
49
|
* @returns Audio buffer with metadata
|
|
56
50
|
*/
|
|
57
|
-
synthesize(
|
|
51
|
+
synthesize(text: string, options: TTSOptions): Promise<TTSResult>;
|
|
52
|
+
/**
|
|
53
|
+
* Extract language code from a Google Cloud voice name
|
|
54
|
+
*
|
|
55
|
+
* Example:
|
|
56
|
+
* "en-US-Neural2-C" -> "en-US"
|
|
57
|
+
*
|
|
58
|
+
* @param voiceId - Google Cloud voice identifier
|
|
59
|
+
* @returns Language code compatible with Google TTS
|
|
60
|
+
*/
|
|
61
|
+
private extractLanguageCode;
|
|
62
|
+
/**
|
|
63
|
+
* Map application audio format to Google Cloud audio encoding
|
|
64
|
+
*
|
|
65
|
+
* @param format - Audio format requested by the caller
|
|
66
|
+
* @returns Google Cloud AudioEncoding enum value
|
|
67
|
+
* @throws Error if format is unsupported
|
|
68
|
+
*/
|
|
69
|
+
private mapFormat;
|
|
58
70
|
}
|
|
package/dist/adapters/tts/googleTTSHandler.js
CHANGED
|
@@ -2,33 +2,36 @@
|
|
|
2
2
|
* Google Cloud Text-to-Speech Handler
|
|
3
3
|
*
|
|
4
4
|
* Handler for Google Cloud Text-to-Speech API integration.
|
|
5
|
-
* Supports Neural2 and WaveNet voice models with 220+ voices across 40+ languages.
|
|
6
5
|
*
|
|
7
6
|
* @module adapters/tts/googleTTSHandler
|
|
8
7
|
* @see https://cloud.google.com/text-to-speech/docs
|
|
9
8
|
*/
|
|
10
9
|
import { TextToSpeechClient } from "@google-cloud/text-to-speech";
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
*
|
|
14
|
-
* Integrates with Google Cloud Text-to-Speech API for voice synthesis.
|
|
15
|
-
* Supports authentication via:
|
|
16
|
-
* - Explicit service account JSON key path
|
|
17
|
-
* - GOOGLE_APPLICATION_CREDENTIALS environment variable
|
|
18
|
-
*/
|
|
10
|
+
import { TTSError, TTS_ERROR_CODES } from "../../utils/ttsProcessor.js";
|
|
11
|
+
import { ErrorCategory, ErrorSeverity } from "../../constants/enums.js";
|
|
19
12
|
export class GoogleTTSHandler {
|
|
20
13
|
client = null;
|
|
21
14
|
/**
|
|
22
|
-
*
|
|
23
|
-
*
|
|
15
|
+
* Google Cloud TTS maximum input size.
|
|
16
|
+
* ~5000 bytes INCLUDING SSML tags.
|
|
24
17
|
*/
|
|
25
18
|
static DEFAULT_MAX_TEXT_LENGTH = 5000;
|
|
26
|
-
maxTextLength = GoogleTTSHandler.DEFAULT_MAX_TEXT_LENGTH;
|
|
27
19
|
/**
|
|
28
|
-
*
|
|
20
|
+
* Default timeout for Google Cloud TTS API calls (milliseconds)
|
|
21
|
+
*
|
|
22
|
+
* Google typically responds within:
|
|
23
|
+
* - 1–5 seconds for short or normal text
|
|
24
|
+
* - 5–10 seconds for longer text or Neural2 voices
|
|
25
|
+
*/
|
|
26
|
+
static DEFAULT_API_TIMEOUT_MS = 30 * 1000;
|
|
27
|
+
/**
|
|
28
|
+
* Maximum text length supported by Google Cloud TTS (in bytes).
|
|
29
29
|
*
|
|
30
|
-
*
|
|
30
|
+
* NOTE:
|
|
31
|
+
* Validation against this limit is performed by the shared TTS processor
|
|
32
|
+
* before invoking provider handlers, not inside this class.
|
|
31
33
|
*/
|
|
34
|
+
maxTextLength = GoogleTTSHandler.DEFAULT_MAX_TEXT_LENGTH;
|
|
32
35
|
constructor(credentialsPath) {
|
|
33
36
|
const path = credentialsPath ?? process.env.GOOGLE_APPLICATION_CREDENTIALS;
|
|
34
37
|
if (path) {
|
|
@@ -41,13 +44,14 @@ export class GoogleTTSHandler {
|
|
|
41
44
|
* @returns True if provider can generate TTS
|
|
42
45
|
*/
|
|
43
46
|
isConfigured() {
|
|
44
|
-
|
|
47
|
+
return this.client !== null;
|
|
45
48
|
}
|
|
46
49
|
/**
|
|
47
50
|
* Get available voices for the provider
|
|
48
51
|
*
|
|
49
52
|
* Note: This method is optional in the TTSHandler interface, but Google Cloud TTS
|
|
50
53
|
* fully implements it to provide comprehensive voice discovery capabilities.
|
|
54
|
+
* Will be Implemented in ISSUE - TTS-014
|
|
51
55
|
*
|
|
52
56
|
* @param languageCode - Optional language filter (e.g., "en-US")
|
|
53
57
|
* @returns List of available voices
|
|
@@ -58,11 +62,158 @@ export class GoogleTTSHandler {
|
|
|
58
62
|
/**
|
|
59
63
|
* Generate audio from text using provider-specific TTS API
|
|
60
64
|
*
|
|
61
|
-
* @param text - Text to convert to speech
|
|
65
|
+
* @param text - Text or SSML to convert to speech
|
|
62
66
|
* @param options - TTS configuration options
|
|
63
67
|
* @returns Audio buffer with metadata
|
|
64
68
|
*/
|
|
65
|
-
async synthesize(
|
|
66
|
-
|
|
69
|
+
async synthesize(text, options) {
|
|
70
|
+
if (!this.client) {
|
|
71
|
+
throw new TTSError({
|
|
72
|
+
code: TTS_ERROR_CODES.PROVIDER_NOT_CONFIGURED,
|
|
73
|
+
message: "Google Cloud TTS client not initialized. Set GOOGLE_APPLICATION_CREDENTIALS or pass credentials path.",
|
|
74
|
+
category: ErrorCategory.CONFIGURATION,
|
|
75
|
+
severity: ErrorSeverity.HIGH,
|
|
76
|
+
retriable: false,
|
|
77
|
+
});
|
|
78
|
+
}
|
|
79
|
+
const startTime = Date.now();
|
|
80
|
+
try {
|
|
81
|
+
const isSSML = text.startsWith("<speak>") && text.endsWith("</speak>");
|
|
82
|
+
// Note: This validation only checks for the presence of opening and closing <speak> tags.
|
|
83
|
+
// Other SSML validation, such as malformed structure, unclosed inner tags, or invalid elements,
|
|
84
|
+
// will be handled by Google's API.
|
|
85
|
+
if ((text.startsWith("<speak>") && !text.endsWith("</speak>")) ||
|
|
86
|
+
(!text.startsWith("<speak>") && text.endsWith("</speak>"))) {
|
|
87
|
+
throw new TTSError({
|
|
88
|
+
code: TTS_ERROR_CODES.INVALID_INPUT,
|
|
89
|
+
message: "Malformed SSML: missing opening <speak> or closing </speak> tag.",
|
|
90
|
+
category: ErrorCategory.VALIDATION,
|
|
91
|
+
severity: ErrorSeverity.MEDIUM,
|
|
92
|
+
retriable: false,
|
|
93
|
+
});
|
|
94
|
+
}
|
|
95
|
+
const voiceId = options.voice ?? "en-US-Neural2-C";
|
|
96
|
+
const languageCode = this.extractLanguageCode(voiceId);
|
|
97
|
+
const audioEncoding = this.mapFormat(options.format ?? "mp3");
|
|
98
|
+
const request = {
|
|
99
|
+
input: isSSML ? { ssml: text } : { text },
|
|
100
|
+
voice: {
|
|
101
|
+
name: voiceId,
|
|
102
|
+
languageCode,
|
|
103
|
+
},
|
|
104
|
+
audioConfig: {
|
|
105
|
+
audioEncoding,
|
|
106
|
+
speakingRate: options.speed ?? 1.0,
|
|
107
|
+
pitch: options.pitch ?? 0.0,
|
|
108
|
+
volumeGainDb: options.volumeGainDb ?? 0.0,
|
|
109
|
+
},
|
|
110
|
+
};
|
|
111
|
+
const [response] = await this.client.synthesizeSpeech(request, {
|
|
112
|
+
timeout: GoogleTTSHandler.DEFAULT_API_TIMEOUT_MS,
|
|
113
|
+
});
|
|
114
|
+
const audioContent = response.audioContent;
|
|
115
|
+
if (!audioContent) {
|
|
116
|
+
throw new TTSError({
|
|
117
|
+
code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
|
|
118
|
+
message: "Google TTS returned empty audio content",
|
|
119
|
+
category: ErrorCategory.EXECUTION,
|
|
120
|
+
severity: ErrorSeverity.HIGH,
|
|
121
|
+
retriable: true,
|
|
122
|
+
});
|
|
123
|
+
}
|
|
124
|
+
const buffer = audioContent instanceof Uint8Array
|
|
125
|
+
? Buffer.from(audioContent)
|
|
126
|
+
: typeof audioContent === "string"
|
|
127
|
+
? Buffer.from(audioContent, "base64")
|
|
128
|
+
: (() => {
|
|
129
|
+
throw new TTSError({
|
|
130
|
+
code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
|
|
131
|
+
message: "Unsupported audioContent type returned by Google TTS",
|
|
132
|
+
category: ErrorCategory.EXECUTION,
|
|
133
|
+
severity: ErrorSeverity.HIGH,
|
|
134
|
+
retriable: true,
|
|
135
|
+
context: { type: typeof audioContent },
|
|
136
|
+
});
|
|
137
|
+
})();
|
|
138
|
+
const latency = Date.now() - startTime;
|
|
139
|
+
return {
|
|
140
|
+
buffer,
|
|
141
|
+
format: options.format ?? "mp3",
|
|
142
|
+
size: buffer.length,
|
|
143
|
+
voice: voiceId,
|
|
144
|
+
metadata: {
|
|
145
|
+
latency,
|
|
146
|
+
provider: "google-ai",
|
|
147
|
+
},
|
|
148
|
+
};
|
|
149
|
+
}
|
|
150
|
+
catch (err) {
|
|
151
|
+
if (err instanceof TTSError) {
|
|
152
|
+
throw err;
|
|
153
|
+
}
|
|
154
|
+
const latency = Date.now() - startTime;
|
|
155
|
+
const message = err instanceof Error ? err.message : "Unknown error";
|
|
156
|
+
throw new TTSError({
|
|
157
|
+
code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
|
|
158
|
+
message: `Google TTS failed after ${latency}ms: ${message}`,
|
|
159
|
+
category: ErrorCategory.EXECUTION,
|
|
160
|
+
severity: ErrorSeverity.HIGH,
|
|
161
|
+
retriable: true,
|
|
162
|
+
context: { latency },
|
|
163
|
+
originalError: err instanceof Error ? err : undefined,
|
|
164
|
+
});
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
/**
|
|
168
|
+
* Extract language code from a Google Cloud voice name
|
|
169
|
+
*
|
|
170
|
+
* Example:
|
|
171
|
+
* "en-US-Neural2-C" -> "en-US"
|
|
172
|
+
*
|
|
173
|
+
* @param voiceId - Google Cloud voice identifier
|
|
174
|
+
* @returns Language code compatible with Google TTS
|
|
175
|
+
*/
|
|
176
|
+
extractLanguageCode(voiceId) {
|
|
177
|
+
const parts = voiceId.split("-");
|
|
178
|
+
if (parts.length >= 2) {
|
|
179
|
+
return `${parts[0]}-${parts[1]}`;
|
|
180
|
+
}
|
|
181
|
+
else {
|
|
182
|
+
throw new TTSError({
|
|
183
|
+
code: TTS_ERROR_CODES.INVALID_INPUT,
|
|
184
|
+
message: `Invalid Google TTS voiceId format: "${voiceId}". Expected format like "en-US-Neural2-C".`,
|
|
185
|
+
category: ErrorCategory.VALIDATION,
|
|
186
|
+
severity: ErrorSeverity.MEDIUM,
|
|
187
|
+
retriable: false,
|
|
188
|
+
context: { voiceId },
|
|
189
|
+
});
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
/**
|
|
193
|
+
* Map application audio format to Google Cloud audio encoding
|
|
194
|
+
*
|
|
195
|
+
* @param format - Audio format requested by the caller
|
|
196
|
+
* @returns Google Cloud AudioEncoding enum value
|
|
197
|
+
* @throws Error if format is unsupported
|
|
198
|
+
*/
|
|
199
|
+
mapFormat(format) {
|
|
200
|
+
switch (format.toLowerCase()) {
|
|
201
|
+
case "mp3":
|
|
202
|
+
return "MP3";
|
|
203
|
+
case "wav":
|
|
204
|
+
return "LINEAR16";
|
|
205
|
+
case "ogg":
|
|
206
|
+
case "opus":
|
|
207
|
+
return "OGG_OPUS";
|
|
208
|
+
default:
|
|
209
|
+
throw new TTSError({
|
|
210
|
+
code: TTS_ERROR_CODES.INVALID_INPUT,
|
|
211
|
+
message: `Unsupported audio format: ${format}`,
|
|
212
|
+
category: ErrorCategory.VALIDATION,
|
|
213
|
+
severity: ErrorSeverity.MEDIUM,
|
|
214
|
+
retriable: false,
|
|
215
|
+
context: { format },
|
|
216
|
+
});
|
|
217
|
+
}
|
|
67
218
|
}
|
|
68
219
|
}
|
|
package/dist/lib/adapters/tts/googleTTSHandler.d.ts
CHANGED
|
@@ -1,35 +1,28 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Google Cloud Text-to-Speech Handler
|
|
3
|
-
*
|
|
4
|
-
* Handler for Google Cloud Text-to-Speech API integration.
|
|
5
|
-
* Supports Neural2 and WaveNet voice models with 220+ voices across 40+ languages.
|
|
6
|
-
*
|
|
7
|
-
* @module adapters/tts/googleTTSHandler
|
|
8
|
-
* @see https://cloud.google.com/text-to-speech/docs
|
|
9
|
-
*/
|
|
10
1
|
import type { TTSHandler } from "../../utils/ttsProcessor.js";
|
|
11
2
|
import type { TTSOptions, TTSResult, TTSVoice } from "../../types/ttsTypes.js";
|
|
12
|
-
/**
|
|
13
|
-
* Google Cloud TTS handler implementation
|
|
14
|
-
*
|
|
15
|
-
* Integrates with Google Cloud Text-to-Speech API for voice synthesis.
|
|
16
|
-
* Supports authentication via:
|
|
17
|
-
* - Explicit service account JSON key path
|
|
18
|
-
* - GOOGLE_APPLICATION_CREDENTIALS environment variable
|
|
19
|
-
*/
|
|
20
3
|
export declare class GoogleTTSHandler implements TTSHandler {
|
|
21
4
|
private client;
|
|
22
5
|
/**
|
|
23
|
-
*
|
|
24
|
-
*
|
|
6
|
+
* Google Cloud TTS maximum input size.
|
|
7
|
+
* ~5000 bytes INCLUDING SSML tags.
|
|
25
8
|
*/
|
|
26
9
|
private static readonly DEFAULT_MAX_TEXT_LENGTH;
|
|
27
|
-
maxTextLength: number;
|
|
28
10
|
/**
|
|
29
|
-
*
|
|
11
|
+
* Default timeout for Google Cloud TTS API calls (milliseconds)
|
|
30
12
|
*
|
|
31
|
-
*
|
|
13
|
+
* Google typically responds within:
|
|
14
|
+
* - 1–5 seconds for short or normal text
|
|
15
|
+
* - 5–10 seconds for longer text or Neural2 voices
|
|
32
16
|
*/
|
|
17
|
+
private static readonly DEFAULT_API_TIMEOUT_MS;
|
|
18
|
+
/**
|
|
19
|
+
* Maximum text length supported by Google Cloud TTS (in bytes).
|
|
20
|
+
*
|
|
21
|
+
* NOTE:
|
|
22
|
+
* Validation against this limit is performed by the shared TTS processor
|
|
23
|
+
* before invoking provider handlers, not inside this class.
|
|
24
|
+
*/
|
|
25
|
+
readonly maxTextLength: number;
|
|
33
26
|
constructor(credentialsPath?: string);
|
|
34
27
|
/**
|
|
35
28
|
* Validate that the provider is properly configured
|
|
@@ -42,6 +35,7 @@ export declare class GoogleTTSHandler implements TTSHandler {
|
|
|
42
35
|
*
|
|
43
36
|
* Note: This method is optional in the TTSHandler interface, but Google Cloud TTS
|
|
44
37
|
* fully implements it to provide comprehensive voice discovery capabilities.
|
|
38
|
+
* Will be Implemented in ISSUE - TTS-014
|
|
45
39
|
*
|
|
46
40
|
* @param languageCode - Optional language filter (e.g., "en-US")
|
|
47
41
|
* @returns List of available voices
|
|
@@ -50,9 +44,27 @@ export declare class GoogleTTSHandler implements TTSHandler {
|
|
|
50
44
|
/**
|
|
51
45
|
* Generate audio from text using provider-specific TTS API
|
|
52
46
|
*
|
|
53
|
-
* @param text - Text to convert to speech
|
|
47
|
+
* @param text - Text or SSML to convert to speech
|
|
54
48
|
* @param options - TTS configuration options
|
|
55
49
|
* @returns Audio buffer with metadata
|
|
56
50
|
*/
|
|
57
|
-
synthesize(
|
|
51
|
+
synthesize(text: string, options: TTSOptions): Promise<TTSResult>;
|
|
52
|
+
/**
|
|
53
|
+
* Extract language code from a Google Cloud voice name
|
|
54
|
+
*
|
|
55
|
+
* Example:
|
|
56
|
+
* "en-US-Neural2-C" -> "en-US"
|
|
57
|
+
*
|
|
58
|
+
* @param voiceId - Google Cloud voice identifier
|
|
59
|
+
* @returns Language code compatible with Google TTS
|
|
60
|
+
*/
|
|
61
|
+
private extractLanguageCode;
|
|
62
|
+
/**
|
|
63
|
+
* Map application audio format to Google Cloud audio encoding
|
|
64
|
+
*
|
|
65
|
+
* @param format - Audio format requested by the caller
|
|
66
|
+
* @returns Google Cloud AudioEncoding enum value
|
|
67
|
+
* @throws Error if format is unsupported
|
|
68
|
+
*/
|
|
69
|
+
private mapFormat;
|
|
58
70
|
}
|
|
package/dist/lib/adapters/tts/googleTTSHandler.js
CHANGED
|
@@ -2,33 +2,36 @@
|
|
|
2
2
|
* Google Cloud Text-to-Speech Handler
|
|
3
3
|
*
|
|
4
4
|
* Handler for Google Cloud Text-to-Speech API integration.
|
|
5
|
-
* Supports Neural2 and WaveNet voice models with 220+ voices across 40+ languages.
|
|
6
5
|
*
|
|
7
6
|
* @module adapters/tts/googleTTSHandler
|
|
8
7
|
* @see https://cloud.google.com/text-to-speech/docs
|
|
9
8
|
*/
|
|
10
9
|
import { TextToSpeechClient } from "@google-cloud/text-to-speech";
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
*
|
|
14
|
-
* Integrates with Google Cloud Text-to-Speech API for voice synthesis.
|
|
15
|
-
* Supports authentication via:
|
|
16
|
-
* - Explicit service account JSON key path
|
|
17
|
-
* - GOOGLE_APPLICATION_CREDENTIALS environment variable
|
|
18
|
-
*/
|
|
10
|
+
import { TTSError, TTS_ERROR_CODES } from "../../utils/ttsProcessor.js";
|
|
11
|
+
import { ErrorCategory, ErrorSeverity } from "../../constants/enums.js";
|
|
19
12
|
export class GoogleTTSHandler {
|
|
20
13
|
client = null;
|
|
21
14
|
/**
|
|
22
|
-
*
|
|
23
|
-
*
|
|
15
|
+
* Google Cloud TTS maximum input size.
|
|
16
|
+
* ~5000 bytes INCLUDING SSML tags.
|
|
24
17
|
*/
|
|
25
18
|
static DEFAULT_MAX_TEXT_LENGTH = 5000;
|
|
26
|
-
maxTextLength = GoogleTTSHandler.DEFAULT_MAX_TEXT_LENGTH;
|
|
27
19
|
/**
|
|
28
|
-
*
|
|
20
|
+
* Default timeout for Google Cloud TTS API calls (milliseconds)
|
|
21
|
+
*
|
|
22
|
+
* Google typically responds within:
|
|
23
|
+
* - 1–5 seconds for short or normal text
|
|
24
|
+
* - 5–10 seconds for longer text or Neural2 voices
|
|
25
|
+
*/
|
|
26
|
+
static DEFAULT_API_TIMEOUT_MS = 30 * 1000;
|
|
27
|
+
/**
|
|
28
|
+
* Maximum text length supported by Google Cloud TTS (in bytes).
|
|
29
29
|
*
|
|
30
|
-
*
|
|
30
|
+
* NOTE:
|
|
31
|
+
* Validation against this limit is performed by the shared TTS processor
|
|
32
|
+
* before invoking provider handlers, not inside this class.
|
|
31
33
|
*/
|
|
34
|
+
maxTextLength = GoogleTTSHandler.DEFAULT_MAX_TEXT_LENGTH;
|
|
32
35
|
constructor(credentialsPath) {
|
|
33
36
|
const path = credentialsPath ?? process.env.GOOGLE_APPLICATION_CREDENTIALS;
|
|
34
37
|
if (path) {
|
|
@@ -41,13 +44,14 @@ export class GoogleTTSHandler {
|
|
|
41
44
|
* @returns True if provider can generate TTS
|
|
42
45
|
*/
|
|
43
46
|
isConfigured() {
|
|
44
|
-
|
|
47
|
+
return this.client !== null;
|
|
45
48
|
}
|
|
46
49
|
/**
|
|
47
50
|
* Get available voices for the provider
|
|
48
51
|
*
|
|
49
52
|
* Note: This method is optional in the TTSHandler interface, but Google Cloud TTS
|
|
50
53
|
* fully implements it to provide comprehensive voice discovery capabilities.
|
|
54
|
+
* Will be Implemented in ISSUE - TTS-014
|
|
51
55
|
*
|
|
52
56
|
* @param languageCode - Optional language filter (e.g., "en-US")
|
|
53
57
|
* @returns List of available voices
|
|
@@ -58,12 +62,159 @@ export class GoogleTTSHandler {
|
|
|
58
62
|
/**
|
|
59
63
|
* Generate audio from text using provider-specific TTS API
|
|
60
64
|
*
|
|
61
|
-
* @param text - Text to convert to speech
|
|
65
|
+
* @param text - Text or SSML to convert to speech
|
|
62
66
|
* @param options - TTS configuration options
|
|
63
67
|
* @returns Audio buffer with metadata
|
|
64
68
|
*/
|
|
65
|
-
async synthesize(
|
|
66
|
-
|
|
69
|
+
async synthesize(text, options) {
|
|
70
|
+
if (!this.client) {
|
|
71
|
+
throw new TTSError({
|
|
72
|
+
code: TTS_ERROR_CODES.PROVIDER_NOT_CONFIGURED,
|
|
73
|
+
message: "Google Cloud TTS client not initialized. Set GOOGLE_APPLICATION_CREDENTIALS or pass credentials path.",
|
|
74
|
+
category: ErrorCategory.CONFIGURATION,
|
|
75
|
+
severity: ErrorSeverity.HIGH,
|
|
76
|
+
retriable: false,
|
|
77
|
+
});
|
|
78
|
+
}
|
|
79
|
+
const startTime = Date.now();
|
|
80
|
+
try {
|
|
81
|
+
const isSSML = text.startsWith("<speak>") && text.endsWith("</speak>");
|
|
82
|
+
// Note: This validation only checks for the presence of opening and closing <speak> tags.
|
|
83
|
+
// Other SSML validation, such as malformed structure, unclosed inner tags, or invalid elements,
|
|
84
|
+
// will be handled by Google's API.
|
|
85
|
+
if ((text.startsWith("<speak>") && !text.endsWith("</speak>")) ||
|
|
86
|
+
(!text.startsWith("<speak>") && text.endsWith("</speak>"))) {
|
|
87
|
+
throw new TTSError({
|
|
88
|
+
code: TTS_ERROR_CODES.INVALID_INPUT,
|
|
89
|
+
message: "Malformed SSML: missing opening <speak> or closing </speak> tag.",
|
|
90
|
+
category: ErrorCategory.VALIDATION,
|
|
91
|
+
severity: ErrorSeverity.MEDIUM,
|
|
92
|
+
retriable: false,
|
|
93
|
+
});
|
|
94
|
+
}
|
|
95
|
+
const voiceId = options.voice ?? "en-US-Neural2-C";
|
|
96
|
+
const languageCode = this.extractLanguageCode(voiceId);
|
|
97
|
+
const audioEncoding = this.mapFormat(options.format ?? "mp3");
|
|
98
|
+
const request = {
|
|
99
|
+
input: isSSML ? { ssml: text } : { text },
|
|
100
|
+
voice: {
|
|
101
|
+
name: voiceId,
|
|
102
|
+
languageCode,
|
|
103
|
+
},
|
|
104
|
+
audioConfig: {
|
|
105
|
+
audioEncoding,
|
|
106
|
+
speakingRate: options.speed ?? 1.0,
|
|
107
|
+
pitch: options.pitch ?? 0.0,
|
|
108
|
+
volumeGainDb: options.volumeGainDb ?? 0.0,
|
|
109
|
+
},
|
|
110
|
+
};
|
|
111
|
+
const [response] = await this.client.synthesizeSpeech(request, {
|
|
112
|
+
timeout: GoogleTTSHandler.DEFAULT_API_TIMEOUT_MS,
|
|
113
|
+
});
|
|
114
|
+
const audioContent = response.audioContent;
|
|
115
|
+
if (!audioContent) {
|
|
116
|
+
throw new TTSError({
|
|
117
|
+
code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
|
|
118
|
+
message: "Google TTS returned empty audio content",
|
|
119
|
+
category: ErrorCategory.EXECUTION,
|
|
120
|
+
severity: ErrorSeverity.HIGH,
|
|
121
|
+
retriable: true,
|
|
122
|
+
});
|
|
123
|
+
}
|
|
124
|
+
const buffer = audioContent instanceof Uint8Array
|
|
125
|
+
? Buffer.from(audioContent)
|
|
126
|
+
: typeof audioContent === "string"
|
|
127
|
+
? Buffer.from(audioContent, "base64")
|
|
128
|
+
: (() => {
|
|
129
|
+
throw new TTSError({
|
|
130
|
+
code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
|
|
131
|
+
message: "Unsupported audioContent type returned by Google TTS",
|
|
132
|
+
category: ErrorCategory.EXECUTION,
|
|
133
|
+
severity: ErrorSeverity.HIGH,
|
|
134
|
+
retriable: true,
|
|
135
|
+
context: { type: typeof audioContent },
|
|
136
|
+
});
|
|
137
|
+
})();
|
|
138
|
+
const latency = Date.now() - startTime;
|
|
139
|
+
return {
|
|
140
|
+
buffer,
|
|
141
|
+
format: options.format ?? "mp3",
|
|
142
|
+
size: buffer.length,
|
|
143
|
+
voice: voiceId,
|
|
144
|
+
metadata: {
|
|
145
|
+
latency,
|
|
146
|
+
provider: "google-ai",
|
|
147
|
+
},
|
|
148
|
+
};
|
|
149
|
+
}
|
|
150
|
+
catch (err) {
|
|
151
|
+
if (err instanceof TTSError) {
|
|
152
|
+
throw err;
|
|
153
|
+
}
|
|
154
|
+
const latency = Date.now() - startTime;
|
|
155
|
+
const message = err instanceof Error ? err.message : "Unknown error";
|
|
156
|
+
throw new TTSError({
|
|
157
|
+
code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
|
|
158
|
+
message: `Google TTS failed after ${latency}ms: ${message}`,
|
|
159
|
+
category: ErrorCategory.EXECUTION,
|
|
160
|
+
severity: ErrorSeverity.HIGH,
|
|
161
|
+
retriable: true,
|
|
162
|
+
context: { latency },
|
|
163
|
+
originalError: err instanceof Error ? err : undefined,
|
|
164
|
+
});
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
/**
|
|
168
|
+
* Extract language code from a Google Cloud voice name
|
|
169
|
+
*
|
|
170
|
+
* Example:
|
|
171
|
+
* "en-US-Neural2-C" -> "en-US"
|
|
172
|
+
*
|
|
173
|
+
* @param voiceId - Google Cloud voice identifier
|
|
174
|
+
* @returns Language code compatible with Google TTS
|
|
175
|
+
*/
|
|
176
|
+
extractLanguageCode(voiceId) {
|
|
177
|
+
const parts = voiceId.split("-");
|
|
178
|
+
if (parts.length >= 2) {
|
|
179
|
+
return `${parts[0]}-${parts[1]}`;
|
|
180
|
+
}
|
|
181
|
+
else {
|
|
182
|
+
throw new TTSError({
|
|
183
|
+
code: TTS_ERROR_CODES.INVALID_INPUT,
|
|
184
|
+
message: `Invalid Google TTS voiceId format: "${voiceId}". Expected format like "en-US-Neural2-C".`,
|
|
185
|
+
category: ErrorCategory.VALIDATION,
|
|
186
|
+
severity: ErrorSeverity.MEDIUM,
|
|
187
|
+
retriable: false,
|
|
188
|
+
context: { voiceId },
|
|
189
|
+
});
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
/**
|
|
193
|
+
* Map application audio format to Google Cloud audio encoding
|
|
194
|
+
*
|
|
195
|
+
* @param format - Audio format requested by the caller
|
|
196
|
+
* @returns Google Cloud AudioEncoding enum value
|
|
197
|
+
* @throws Error if format is unsupported
|
|
198
|
+
*/
|
|
199
|
+
mapFormat(format) {
|
|
200
|
+
switch (format.toLowerCase()) {
|
|
201
|
+
case "mp3":
|
|
202
|
+
return "MP3";
|
|
203
|
+
case "wav":
|
|
204
|
+
return "LINEAR16";
|
|
205
|
+
case "ogg":
|
|
206
|
+
case "opus":
|
|
207
|
+
return "OGG_OPUS";
|
|
208
|
+
default:
|
|
209
|
+
throw new TTSError({
|
|
210
|
+
code: TTS_ERROR_CODES.INVALID_INPUT,
|
|
211
|
+
message: `Unsupported audio format: ${format}`,
|
|
212
|
+
category: ErrorCategory.VALIDATION,
|
|
213
|
+
severity: ErrorSeverity.MEDIUM,
|
|
214
|
+
retriable: false,
|
|
215
|
+
context: { format },
|
|
216
|
+
});
|
|
217
|
+
}
|
|
67
218
|
}
|
|
68
219
|
}
|
|
69
220
|
//# sourceMappingURL=googleTTSHandler.js.map
|
|
package/dist/lib/types/ttsTypes.d.ts
CHANGED
|
@@ -25,6 +25,10 @@ export type TTSOptions = {
|
|
|
25
25
|
format?: AudioFormat;
|
|
26
26
|
/** Speaking rate 0.25-4.0 (default: 1.0) */
|
|
27
27
|
speed?: number;
|
|
28
|
+
/** Voice pitch adjustment -20.0 to 20.0 semitones (default: 0.0) */
|
|
29
|
+
pitch?: number;
|
|
30
|
+
/** Volume gain in dB -96.0 to 16.0 (default: 0.0) */
|
|
31
|
+
volumeGainDb?: number;
|
|
28
32
|
/** Audio quality (default: standard) */
|
|
29
33
|
quality?: TTSQuality;
|
|
30
34
|
/** Output file path (optional) */
|
|
@@ -48,6 +52,15 @@ export type TTSResult = {
|
|
|
48
52
|
voice?: string;
|
|
49
53
|
/** Sample rate in Hz */
|
|
50
54
|
sampleRate?: number;
|
|
55
|
+
/** Performance and request metadata */
|
|
56
|
+
metadata?: {
|
|
57
|
+
/** Request latency in milliseconds */
|
|
58
|
+
latency: number;
|
|
59
|
+
/** Provider name */
|
|
60
|
+
provider?: string;
|
|
61
|
+
/** Additional provider-specific metadata */
|
|
62
|
+
[key: string]: unknown;
|
|
63
|
+
};
|
|
51
64
|
};
|
|
52
65
|
/**
|
|
53
66
|
* Result of saving audio to file
|
|
@@ -81,6 +94,8 @@ export type TTSVoice = {
|
|
|
81
94
|
export declare const VALID_AUDIO_FORMATS: readonly AudioFormat[];
|
|
82
95
|
/** Valid TTS quality levels as an array for runtime validation */
|
|
83
96
|
export declare const VALID_TTS_QUALITIES: readonly TTSQuality[];
|
|
97
|
+
/** Valid Google TTS audio formats */
|
|
98
|
+
export type GoogleAudioEncoding = "MP3" | "LINEAR16" | "OGG_OPUS";
|
|
84
99
|
/**
|
|
85
100
|
* Type guard to check if an object is a TTSResult
|
|
86
101
|
*/
|
|
package/dist/lib/utils/ttsProcessor.d.ts
CHANGED
|
@@ -18,6 +18,7 @@ export declare const TTS_ERROR_CODES: {
|
|
|
18
18
|
readonly PROVIDER_NOT_SUPPORTED: "TTS_PROVIDER_NOT_SUPPORTED";
|
|
19
19
|
readonly PROVIDER_NOT_CONFIGURED: "TTS_PROVIDER_NOT_CONFIGURED";
|
|
20
20
|
readonly SYNTHESIS_FAILED: "TTS_SYNTHESIS_FAILED";
|
|
21
|
+
readonly INVALID_INPUT: "TTS_INVALID_INPUT";
|
|
21
22
|
};
|
|
22
23
|
/**
|
|
23
24
|
* TTS Error class for text-to-speech specific errors
|
|
package/dist/lib/utils/ttsProcessor.js
CHANGED
|
@@ -18,6 +18,7 @@ export const TTS_ERROR_CODES = {
|
|
|
18
18
|
PROVIDER_NOT_SUPPORTED: "TTS_PROVIDER_NOT_SUPPORTED",
|
|
19
19
|
PROVIDER_NOT_CONFIGURED: "TTS_PROVIDER_NOT_CONFIGURED",
|
|
20
20
|
SYNTHESIS_FAILED: "TTS_SYNTHESIS_FAILED",
|
|
21
|
+
INVALID_INPUT: "TTS_INVALID_INPUT",
|
|
21
22
|
};
|
|
22
23
|
/**
|
|
23
24
|
* TTS Error class for text-to-speech specific errors
|
package/dist/types/ttsTypes.d.ts
CHANGED
|
@@ -25,6 +25,10 @@ export type TTSOptions = {
|
|
|
25
25
|
format?: AudioFormat;
|
|
26
26
|
/** Speaking rate 0.25-4.0 (default: 1.0) */
|
|
27
27
|
speed?: number;
|
|
28
|
+
/** Voice pitch adjustment -20.0 to 20.0 semitones (default: 0.0) */
|
|
29
|
+
pitch?: number;
|
|
30
|
+
/** Volume gain in dB -96.0 to 16.0 (default: 0.0) */
|
|
31
|
+
volumeGainDb?: number;
|
|
28
32
|
/** Audio quality (default: standard) */
|
|
29
33
|
quality?: TTSQuality;
|
|
30
34
|
/** Output file path (optional) */
|
|
@@ -48,6 +52,15 @@ export type TTSResult = {
|
|
|
48
52
|
voice?: string;
|
|
49
53
|
/** Sample rate in Hz */
|
|
50
54
|
sampleRate?: number;
|
|
55
|
+
/** Performance and request metadata */
|
|
56
|
+
metadata?: {
|
|
57
|
+
/** Request latency in milliseconds */
|
|
58
|
+
latency: number;
|
|
59
|
+
/** Provider name */
|
|
60
|
+
provider?: string;
|
|
61
|
+
/** Additional provider-specific metadata */
|
|
62
|
+
[key: string]: unknown;
|
|
63
|
+
};
|
|
51
64
|
};
|
|
52
65
|
/**
|
|
53
66
|
* Result of saving audio to file
|
|
@@ -81,6 +94,8 @@ export type TTSVoice = {
|
|
|
81
94
|
export declare const VALID_AUDIO_FORMATS: readonly AudioFormat[];
|
|
82
95
|
/** Valid TTS quality levels as an array for runtime validation */
|
|
83
96
|
export declare const VALID_TTS_QUALITIES: readonly TTSQuality[];
|
|
97
|
+
/** Valid Google TTS audio formats */
|
|
98
|
+
export type GoogleAudioEncoding = "MP3" | "LINEAR16" | "OGG_OPUS";
|
|
84
99
|
/**
|
|
85
100
|
* Type guard to check if an object is a TTSResult
|
|
86
101
|
*/
|
|
package/dist/utils/ttsProcessor.d.ts
CHANGED
|
@@ -18,6 +18,7 @@ export declare const TTS_ERROR_CODES: {
|
|
|
18
18
|
readonly PROVIDER_NOT_SUPPORTED: "TTS_PROVIDER_NOT_SUPPORTED";
|
|
19
19
|
readonly PROVIDER_NOT_CONFIGURED: "TTS_PROVIDER_NOT_CONFIGURED";
|
|
20
20
|
readonly SYNTHESIS_FAILED: "TTS_SYNTHESIS_FAILED";
|
|
21
|
+
readonly INVALID_INPUT: "TTS_INVALID_INPUT";
|
|
21
22
|
};
|
|
22
23
|
/**
|
|
23
24
|
* TTS Error class for text-to-speech specific errors
|
|
package/dist/utils/ttsProcessor.js
CHANGED
|
@@ -18,6 +18,7 @@ export const TTS_ERROR_CODES = {
|
|
|
18
18
|
PROVIDER_NOT_SUPPORTED: "TTS_PROVIDER_NOT_SUPPORTED",
|
|
19
19
|
PROVIDER_NOT_CONFIGURED: "TTS_PROVIDER_NOT_CONFIGURED",
|
|
20
20
|
SYNTHESIS_FAILED: "TTS_SYNTHESIS_FAILED",
|
|
21
|
+
INVALID_INPUT: "TTS_INVALID_INPUT",
|
|
21
22
|
};
|
|
22
23
|
/**
|
|
23
24
|
* TTS Error class for text-to-speech specific errors
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@juspay/neurolink",
|
|
3
|
-
"version": "8.14.0",
|
|
3
|
+
"version": "8.15.0",
|
|
4
4
|
"description": "Universal AI Development Platform with working MCP integration, multi-provider support, and professional CLI. Built-in tools operational, 58+ external MCP servers discoverable. Connect to filesystem, GitHub, database operations, and more. Build, test, and deploy AI applications with 9 major providers: OpenAI, Anthropic, Google AI, AWS Bedrock, Azure, Hugging Face, Ollama, and Mistral AI.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Juspay Technologies",
|