@juspay/neurolink 8.14.0 → 8.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
1
+ ## [8.16.0](https://github.com/juspay/neurolink/compare/v8.15.0...v8.16.0) (2025-12-16)
2
+
3
+ ### Features
4
+
5
+ - **(tts):** Implement GoogleTTSHandler.getVoices() API ([15d39f7](https://github.com/juspay/neurolink/commit/15d39f7e6bfe093971bc822e8f4251b7e8711bb9))
6
+
7
+ ## [8.15.0](https://github.com/juspay/neurolink/compare/v8.14.0...v8.15.0) (2025-12-14)
8
+
9
+ ### Features
10
+
11
+ - **(tts):** Implement synthesize method in GoogleTTSHandler ([9262e37](https://github.com/juspay/neurolink/commit/9262e37a08ef856eb5d16fd65fa922bd700897cb))
12
+
1
13
  ## [8.14.0](https://github.com/juspay/neurolink/compare/v8.13.2...v8.14.0) (2025-12-14)
2
14
 
3
15
  ### Features
@@ -1,35 +1,30 @@
1
- /**
2
- * Google Cloud Text-to-Speech Handler
3
- *
4
- * Handler for Google Cloud Text-to-Speech API integration.
5
- * Supports Neural2 and WaveNet voice models with 220+ voices across 40+ languages.
6
- *
7
- * @module adapters/tts/googleTTSHandler
8
- * @see https://cloud.google.com/text-to-speech/docs
9
- */
10
1
  import type { TTSHandler } from "../../utils/ttsProcessor.js";
11
2
  import type { TTSOptions, TTSResult, TTSVoice } from "../../types/ttsTypes.js";
12
- /**
13
- * Google Cloud TTS handler implementation
14
- *
15
- * Integrates with Google Cloud Text-to-Speech API for voice synthesis.
16
- * Supports authentication via:
17
- * - Explicit service account JSON key path
18
- * - GOOGLE_APPLICATION_CREDENTIALS environment variable
19
- */
20
3
  export declare class GoogleTTSHandler implements TTSHandler {
21
4
  private client;
5
+ private voicesCache;
6
+ private static readonly CACHE_TTL_MS;
22
7
  /**
23
- * Maximum text length supported by Google Cloud TTS (5000 bytes)
24
- * Different providers have different limits
8
+ * Google Cloud TTS maximum input size.
9
+ * ~5000 bytes INCLUDING SSML tags.
25
10
  */
26
11
  private static readonly DEFAULT_MAX_TEXT_LENGTH;
27
- maxTextLength: number;
28
12
  /**
29
- * Constructor for GoogleTTSHandler
13
+ * Default timeout for Google Cloud TTS API calls (milliseconds)
30
14
  *
31
- * @param credentialsPath - Optional path to Google Cloud credentials JSON file
15
+ * Google typically responds within:
16
+ * - 1–5 seconds for short or normal text
17
+ * - 5–10 seconds for longer text or Neural2 voices
32
18
  */
19
+ private static readonly DEFAULT_API_TIMEOUT_MS;
20
+ /**
21
+ * Maximum text length supported by Google Cloud TTS (in bytes).
22
+ *
23
+ * NOTE:
24
+ * Validation against this limit is performed by the shared TTS processor
25
+ * before invoking provider handlers, not inside this class.
26
+ */
27
+ readonly maxTextLength: number;
33
28
  constructor(credentialsPath?: string);
34
29
  /**
35
30
  * Validate that the provider is properly configured
@@ -46,13 +41,48 @@ export declare class GoogleTTSHandler implements TTSHandler {
46
41
  * @param languageCode - Optional language filter (e.g., "en-US")
47
42
  * @returns List of available voices; an empty list if the API returns no voices or the request fails
48
43
  */
49
- getVoices(_languageCode?: string): Promise<TTSVoice[]>;
44
+ getVoices(languageCode?: string): Promise<TTSVoice[]>;
50
45
  /**
51
46
  * Generate audio from text using provider-specific TTS API
52
47
  *
53
- * @param text - Text to convert to speech
48
+ * @param text - Text or SSML to convert to speech
54
49
  * @param options - TTS configuration options
55
50
  * @returns Audio buffer with metadata
56
51
  */
57
- synthesize(_text: string, _options: TTSOptions): Promise<TTSResult>;
52
+ synthesize(text: string, options: TTSOptions): Promise<TTSResult>;
53
+ /**
54
+ * Extract language code from a Google Cloud voice name
55
+ *
56
+ * Example:
57
+ * "en-US-Neural2-C" -> "en-US"
58
+ *
59
+ * @param voiceId - Google Cloud voice identifier
60
+ * @returns The first two hyphen-separated segments of the voice name (e.g., "en-US")
61
+ */
62
+ private extractLanguageCode;
63
+ /**
64
+ * Map application audio format to Google Cloud audio encoding
65
+ *
66
+ * @param format - Audio format requested by the caller
67
+ * @returns Google Cloud AudioEncoding enum value
68
+ * @throws TTSError with code INVALID_INPUT if the format is unsupported
69
+ */
70
+ private mapFormat;
71
+ /**
72
+ * Detect the voice type from a Google Cloud TTS voice name
73
+ *
74
+ * Parses the voice name to identify the underlying voice technology/model type.
75
+ * Google Cloud TTS offers different voice types with varying quality and pricing.
76
+ *
77
+ * @param name - The full Google Cloud voice name (e.g., "en-US-Neural2-C")
78
+ * @returns The detected voice type
79
+ *
80
+ * @example
81
+ * detectVoiceType("en-US-Neural2-C") // returns "neural"
82
+ * detectVoiceType("en-US-Wavenet-A") // returns "wavenet"
83
+ * detectVoiceType("en-US-Standard-B") // returns "standard"
84
+ * detectVoiceType("en-US-Chirp-A") // returns "chirp"
85
+ * detectVoiceType("en-US-Journey-D") // returns "unknown" (unrecognized type)
86
+ */
87
+ private detectVoiceType;
58
88
  }
@@ -2,33 +2,39 @@
2
2
  * Google Cloud Text-to-Speech Handler
3
3
  *
4
4
  * Handler for Google Cloud Text-to-Speech API integration.
5
- * Supports Neural2 and WaveNet voice models with 220+ voices across 40+ languages.
6
5
  *
7
6
  * @module adapters/tts/googleTTSHandler
8
7
  * @see https://cloud.google.com/text-to-speech/docs
9
8
  */
10
9
  import { TextToSpeechClient } from "@google-cloud/text-to-speech";
11
- /**
12
- * Google Cloud TTS handler implementation
13
- *
14
- * Integrates with Google Cloud Text-to-Speech API for voice synthesis.
15
- * Supports authentication via:
16
- * - Explicit service account JSON key path
17
- * - GOOGLE_APPLICATION_CREDENTIALS environment variable
18
- */
10
+ import { TTSError, TTS_ERROR_CODES } from "../../utils/ttsProcessor.js";
11
+ import { ErrorCategory, ErrorSeverity } from "../../constants/enums.js";
12
+ import { logger } from "../../utils/logger.js";
19
13
  export class GoogleTTSHandler {
20
14
  client = null;
15
+ voicesCache = null;
16
+ static CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
21
17
  /**
22
- * Maximum text length supported by Google Cloud TTS (5000 bytes)
23
- * Different providers have different limits
18
+ * Google Cloud TTS maximum input size.
19
+ * ~5000 bytes INCLUDING SSML tags.
24
20
  */
25
21
  static DEFAULT_MAX_TEXT_LENGTH = 5000;
26
- maxTextLength = GoogleTTSHandler.DEFAULT_MAX_TEXT_LENGTH;
27
22
  /**
28
- * Constructor for GoogleTTSHandler
23
+ * Default timeout for Google Cloud TTS API calls (milliseconds)
29
24
  *
30
- * @param credentialsPath - Optional path to Google Cloud credentials JSON file
25
+ * Google typically responds within:
26
+ * - 1–5 seconds for short or normal text
27
+ * - 5–10 seconds for longer text or Neural2 voices
31
28
  */
29
+ static DEFAULT_API_TIMEOUT_MS = 30 * 1000;
30
+ /**
31
+ * Maximum text length supported by Google Cloud TTS (in bytes).
32
+ *
33
+ * NOTE:
34
+ * Validation against this limit is performed by the shared TTS processor
35
+ * before invoking provider handlers, not inside this class.
36
+ */
37
+ maxTextLength = GoogleTTSHandler.DEFAULT_MAX_TEXT_LENGTH;
32
38
  constructor(credentialsPath) {
33
39
  const path = credentialsPath ?? process.env.GOOGLE_APPLICATION_CREDENTIALS;
34
40
  if (path) {
@@ -41,7 +47,7 @@ export class GoogleTTSHandler {
41
47
  * @returns True if provider can generate TTS
42
48
  */
43
49
  isConfigured() {
44
- throw new Error("Not implemented yet");
50
+ return this.client !== null;
45
51
  }
46
52
  /**
47
53
  * Get available voices for the provider
@@ -52,17 +58,262 @@ export class GoogleTTSHandler {
52
58
  * @param languageCode - Optional language filter (e.g., "en-US")
53
59
  * @returns List of available voices; an empty list if the API returns no voices or the request fails
54
60
  */
55
- async getVoices(_languageCode) {
56
- throw new Error("Not implemented yet");
61
+ async getVoices(languageCode) {
62
+ if (!this.client) {
63
+ throw new TTSError({
64
+ code: TTS_ERROR_CODES.PROVIDER_NOT_CONFIGURED,
65
+ message: "Google Cloud TTS client not initialized. Set GOOGLE_APPLICATION_CREDENTIALS or pass credentials path.",
66
+ category: ErrorCategory.CONFIGURATION,
67
+ severity: ErrorSeverity.HIGH,
68
+ retriable: false,
69
+ });
70
+ }
71
+ try {
72
+ // Return cached voices if available, valid, and no language filter is specified
73
+ if (this.voicesCache &&
74
+ Date.now() - this.voicesCache.timestamp <
75
+ GoogleTTSHandler.CACHE_TTL_MS &&
76
+ !languageCode) {
77
+ return this.voicesCache.voices;
78
+ }
79
+ // Call Google Cloud listVoices API
80
+ const [response] = await this.client.listVoices(languageCode ? { languageCode } : {});
81
+ if (!response.voices || response.voices.length === 0) {
82
+ logger.warn("Google Cloud TTS returned no voices");
83
+ return [];
84
+ }
85
+ const voices = [];
86
+ for (const voice of response.voices ?? []) {
87
+ // Validate required fields
88
+ if (!voice.name ||
89
+ !Array.isArray(voice.languageCodes) ||
90
+ voice.languageCodes.length === 0) {
91
+ logger.warn("Skipping voice with missing required fields", {
92
+ name: voice.name,
93
+ languageCodesCount: voice.languageCodes?.length,
94
+ });
95
+ continue;
96
+ }
97
+ const voiceName = voice.name;
98
+ const languageCodes = voice.languageCodes;
99
+ const primaryLanguageCode = languageCodes[0];
100
+ const voiceType = this.detectVoiceType(voiceName);
101
+ // Map Google's ssmlGender → internal Gender
102
+ const gender = voice.ssmlGender === "MALE"
103
+ ? "male"
104
+ : voice.ssmlGender === "FEMALE"
105
+ ? "female"
106
+ : "neutral";
107
+ voices.push({
108
+ id: voiceName,
109
+ name: voiceName,
110
+ languageCode: primaryLanguageCode,
111
+ languageCodes,
112
+ gender,
113
+ type: voiceType,
114
+ naturalSampleRateHertz: voice.naturalSampleRateHertz ?? undefined,
115
+ });
116
+ }
117
+ // Cache the result with timestamp if no language filter
118
+ if (!languageCode) {
119
+ this.voicesCache = { voices, timestamp: Date.now() };
120
+ }
121
+ return voices;
122
+ }
123
+ catch (err) {
124
+ // Log error but return empty array for graceful degradation
125
+ const message = err instanceof Error ? err.message : "Unknown error";
126
+ logger.error(`Failed to fetch Google TTS voices: ${message}`);
127
+ return [];
128
+ }
57
129
  }
58
130
  /**
59
131
  * Generate audio from text using provider-specific TTS API
60
132
  *
61
- * @param text - Text to convert to speech
133
+ * @param text - Text or SSML to convert to speech
62
134
  * @param options - TTS configuration options
63
135
  * @returns Audio buffer with metadata
64
136
  */
65
- async synthesize(_text, _options) {
66
- throw new Error("Not implemented yet");
137
+ async synthesize(text, options) {
138
+ if (!this.client) {
139
+ throw new TTSError({
140
+ code: TTS_ERROR_CODES.PROVIDER_NOT_CONFIGURED,
141
+ message: "Google Cloud TTS client not initialized. Set GOOGLE_APPLICATION_CREDENTIALS or pass credentials path.",
142
+ category: ErrorCategory.CONFIGURATION,
143
+ severity: ErrorSeverity.HIGH,
144
+ retriable: false,
145
+ });
146
+ }
147
+ const startTime = Date.now();
148
+ try {
149
+ const isSSML = text.startsWith("<speak>") && text.endsWith("</speak>");
150
+ // Note: This validation only checks for the presence of opening and closing <speak> tags.
151
+ // Other SSML validation, such as malformed structure, unclosed inner tags, or invalid elements,
152
+ // will be handled by Google's API.
153
+ if ((text.startsWith("<speak>") && !text.endsWith("</speak>")) ||
154
+ (!text.startsWith("<speak>") && text.endsWith("</speak>"))) {
155
+ throw new TTSError({
156
+ code: TTS_ERROR_CODES.INVALID_INPUT,
157
+ message: "Malformed SSML: missing opening <speak> or closing </speak> tag.",
158
+ category: ErrorCategory.VALIDATION,
159
+ severity: ErrorSeverity.MEDIUM,
160
+ retriable: false,
161
+ });
162
+ }
163
+ const voiceId = options.voice ?? "en-US-Neural2-C";
164
+ const languageCode = this.extractLanguageCode(voiceId);
165
+ const audioEncoding = this.mapFormat(options.format ?? "mp3");
166
+ const request = {
167
+ input: isSSML ? { ssml: text } : { text },
168
+ voice: {
169
+ name: voiceId,
170
+ languageCode,
171
+ },
172
+ audioConfig: {
173
+ audioEncoding,
174
+ speakingRate: options.speed ?? 1.0,
175
+ pitch: options.pitch ?? 0.0,
176
+ volumeGainDb: options.volumeGainDb ?? 0.0,
177
+ },
178
+ };
179
+ const [response] = await this.client.synthesizeSpeech(request, {
180
+ timeout: GoogleTTSHandler.DEFAULT_API_TIMEOUT_MS,
181
+ });
182
+ const audioContent = response.audioContent;
183
+ if (!audioContent) {
184
+ throw new TTSError({
185
+ code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
186
+ message: "Google TTS returned empty audio content",
187
+ category: ErrorCategory.EXECUTION,
188
+ severity: ErrorSeverity.HIGH,
189
+ retriable: true,
190
+ });
191
+ }
192
+ const buffer = audioContent instanceof Uint8Array
193
+ ? Buffer.from(audioContent)
194
+ : typeof audioContent === "string"
195
+ ? Buffer.from(audioContent, "base64")
196
+ : (() => {
197
+ throw new TTSError({
198
+ code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
199
+ message: "Unsupported audioContent type returned by Google TTS",
200
+ category: ErrorCategory.EXECUTION,
201
+ severity: ErrorSeverity.HIGH,
202
+ retriable: true,
203
+ context: { type: typeof audioContent },
204
+ });
205
+ })();
206
+ const latency = Date.now() - startTime;
207
+ return {
208
+ buffer,
209
+ format: options.format ?? "mp3",
210
+ size: buffer.length,
211
+ voice: voiceId,
212
+ metadata: {
213
+ latency,
214
+ provider: "google-ai",
215
+ },
216
+ };
217
+ }
218
+ catch (err) {
219
+ if (err instanceof TTSError) {
220
+ throw err;
221
+ }
222
+ const latency = Date.now() - startTime;
223
+ const message = err instanceof Error ? err.message : "Unknown error";
224
+ throw new TTSError({
225
+ code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
226
+ message: `Google TTS failed after ${latency}ms: ${message}`,
227
+ category: ErrorCategory.EXECUTION,
228
+ severity: ErrorSeverity.HIGH,
229
+ retriable: true,
230
+ context: { latency },
231
+ originalError: err instanceof Error ? err : undefined,
232
+ });
233
+ }
234
+ }
235
+ /**
236
+ * Extract language code from a Google Cloud voice name
237
+ *
238
+ * Example:
239
+ * "en-US-Neural2-C" -> "en-US"
240
+ *
241
+ * @param voiceId - Google Cloud voice identifier
242
+ * @returns The first two hyphen-separated segments of the voice name (e.g., "en-US")
243
+ */
244
+ extractLanguageCode(voiceId) {
245
+ const parts = voiceId.split("-");
246
+ if (parts.length >= 2) {
247
+ return `${parts[0]}-${parts[1]}`;
248
+ }
249
+ else {
250
+ throw new TTSError({
251
+ code: TTS_ERROR_CODES.INVALID_INPUT,
252
+ message: `Invalid Google TTS voiceId format: "${voiceId}". Expected format like "en-US-Neural2-C".`,
253
+ category: ErrorCategory.VALIDATION,
254
+ severity: ErrorSeverity.MEDIUM,
255
+ retriable: false,
256
+ context: { voiceId },
257
+ });
258
+ }
259
+ }
260
+ /**
261
+ * Map application audio format to Google Cloud audio encoding
262
+ *
263
+ * @param format - Audio format requested by the caller
264
+ * @returns Google Cloud AudioEncoding enum value
265
+ * @throws TTSError with code INVALID_INPUT if the format is unsupported
266
+ */
267
+ mapFormat(format) {
268
+ switch (format.toLowerCase()) {
269
+ case "mp3":
270
+ return "MP3";
271
+ case "wav":
272
+ return "LINEAR16";
273
+ case "ogg":
274
+ case "opus":
275
+ return "OGG_OPUS";
276
+ default:
277
+ throw new TTSError({
278
+ code: TTS_ERROR_CODES.INVALID_INPUT,
279
+ message: `Unsupported audio format: ${format}`,
280
+ category: ErrorCategory.VALIDATION,
281
+ severity: ErrorSeverity.MEDIUM,
282
+ retriable: false,
283
+ context: { format },
284
+ });
285
+ }
286
+ }
287
+ /**
288
+ * Detect the voice type from a Google Cloud TTS voice name
289
+ *
290
+ * Parses the voice name to identify the underlying voice technology/model type.
291
+ * Google Cloud TTS offers different voice types with varying quality and pricing.
292
+ *
293
+ * @param name - The full Google Cloud voice name (e.g., "en-US-Neural2-C")
294
+ * @returns The detected voice type
295
+ *
296
+ * @example
297
+ * detectVoiceType("en-US-Neural2-C") // returns "neural"
298
+ * detectVoiceType("en-US-Wavenet-A") // returns "wavenet"
299
+ * detectVoiceType("en-US-Standard-B") // returns "standard"
300
+ * detectVoiceType("en-US-Chirp-A") // returns "chirp"
301
+ * detectVoiceType("en-US-Journey-D") // returns "unknown" (unrecognized type)
302
+ */
303
+ detectVoiceType(name) {
304
+ const tokens = name.toLowerCase().split("-");
305
+ if (tokens.some((t) => t.startsWith("chirp"))) {
306
+ return "chirp";
307
+ }
308
+ if (tokens.includes("neural2")) {
309
+ return "neural";
310
+ }
311
+ if (tokens.includes("wavenet")) {
312
+ return "wavenet";
313
+ }
314
+ if (tokens.includes("standard")) {
315
+ return "standard";
316
+ }
317
+ return "unknown";
67
318
  }
68
319
  }
@@ -1,35 +1,30 @@
1
- /**
2
- * Google Cloud Text-to-Speech Handler
3
- *
4
- * Handler for Google Cloud Text-to-Speech API integration.
5
- * Supports Neural2 and WaveNet voice models with 220+ voices across 40+ languages.
6
- *
7
- * @module adapters/tts/googleTTSHandler
8
- * @see https://cloud.google.com/text-to-speech/docs
9
- */
10
1
  import type { TTSHandler } from "../../utils/ttsProcessor.js";
11
2
  import type { TTSOptions, TTSResult, TTSVoice } from "../../types/ttsTypes.js";
12
- /**
13
- * Google Cloud TTS handler implementation
14
- *
15
- * Integrates with Google Cloud Text-to-Speech API for voice synthesis.
16
- * Supports authentication via:
17
- * - Explicit service account JSON key path
18
- * - GOOGLE_APPLICATION_CREDENTIALS environment variable
19
- */
20
3
  export declare class GoogleTTSHandler implements TTSHandler {
21
4
  private client;
5
+ private voicesCache;
6
+ private static readonly CACHE_TTL_MS;
22
7
  /**
23
- * Maximum text length supported by Google Cloud TTS (5000 bytes)
24
- * Different providers have different limits
8
+ * Google Cloud TTS maximum input size.
9
+ * ~5000 bytes INCLUDING SSML tags.
25
10
  */
26
11
  private static readonly DEFAULT_MAX_TEXT_LENGTH;
27
- maxTextLength: number;
28
12
  /**
29
- * Constructor for GoogleTTSHandler
13
+ * Default timeout for Google Cloud TTS API calls (milliseconds)
30
14
  *
31
- * @param credentialsPath - Optional path to Google Cloud credentials JSON file
15
+ * Google typically responds within:
16
+ * - 1–5 seconds for short or normal text
17
+ * - 5–10 seconds for longer text or Neural2 voices
32
18
  */
19
+ private static readonly DEFAULT_API_TIMEOUT_MS;
20
+ /**
21
+ * Maximum text length supported by Google Cloud TTS (in bytes).
22
+ *
23
+ * NOTE:
24
+ * Validation against this limit is performed by the shared TTS processor
25
+ * before invoking provider handlers, not inside this class.
26
+ */
27
+ readonly maxTextLength: number;
33
28
  constructor(credentialsPath?: string);
34
29
  /**
35
30
  * Validate that the provider is properly configured
@@ -46,13 +41,48 @@ export declare class GoogleTTSHandler implements TTSHandler {
46
41
  * @param languageCode - Optional language filter (e.g., "en-US")
47
42
  * @returns List of available voices; an empty list if the API returns no voices or the request fails
48
43
  */
49
- getVoices(_languageCode?: string): Promise<TTSVoice[]>;
44
+ getVoices(languageCode?: string): Promise<TTSVoice[]>;
50
45
  /**
51
46
  * Generate audio from text using provider-specific TTS API
52
47
  *
53
- * @param text - Text to convert to speech
48
+ * @param text - Text or SSML to convert to speech
54
49
  * @param options - TTS configuration options
55
50
  * @returns Audio buffer with metadata
56
51
  */
57
- synthesize(_text: string, _options: TTSOptions): Promise<TTSResult>;
52
+ synthesize(text: string, options: TTSOptions): Promise<TTSResult>;
53
+ /**
54
+ * Extract language code from a Google Cloud voice name
55
+ *
56
+ * Example:
57
+ * "en-US-Neural2-C" -> "en-US"
58
+ *
59
+ * @param voiceId - Google Cloud voice identifier
60
+ * @returns The first two hyphen-separated segments of the voice name (e.g., "en-US")
61
+ */
62
+ private extractLanguageCode;
63
+ /**
64
+ * Map application audio format to Google Cloud audio encoding
65
+ *
66
+ * @param format - Audio format requested by the caller
67
+ * @returns Google Cloud AudioEncoding enum value
68
+ * @throws TTSError with code INVALID_INPUT if the format is unsupported
69
+ */
70
+ private mapFormat;
71
+ /**
72
+ * Detect the voice type from a Google Cloud TTS voice name
73
+ *
74
+ * Parses the voice name to identify the underlying voice technology/model type.
75
+ * Google Cloud TTS offers different voice types with varying quality and pricing.
76
+ *
77
+ * @param name - The full Google Cloud voice name (e.g., "en-US-Neural2-C")
78
+ * @returns The detected voice type
79
+ *
80
+ * @example
81
+ * detectVoiceType("en-US-Neural2-C") // returns "neural"
82
+ * detectVoiceType("en-US-Wavenet-A") // returns "wavenet"
83
+ * detectVoiceType("en-US-Standard-B") // returns "standard"
84
+ * detectVoiceType("en-US-Chirp-A") // returns "chirp"
85
+ * detectVoiceType("en-US-Journey-D") // returns "unknown" (unrecognized type)
86
+ */
87
+ private detectVoiceType;
58
88
  }
@@ -2,33 +2,39 @@
2
2
  * Google Cloud Text-to-Speech Handler
3
3
  *
4
4
  * Handler for Google Cloud Text-to-Speech API integration.
5
- * Supports Neural2 and WaveNet voice models with 220+ voices across 40+ languages.
6
5
  *
7
6
  * @module adapters/tts/googleTTSHandler
8
7
  * @see https://cloud.google.com/text-to-speech/docs
9
8
  */
10
9
  import { TextToSpeechClient } from "@google-cloud/text-to-speech";
11
- /**
12
- * Google Cloud TTS handler implementation
13
- *
14
- * Integrates with Google Cloud Text-to-Speech API for voice synthesis.
15
- * Supports authentication via:
16
- * - Explicit service account JSON key path
17
- * - GOOGLE_APPLICATION_CREDENTIALS environment variable
18
- */
10
+ import { TTSError, TTS_ERROR_CODES } from "../../utils/ttsProcessor.js";
11
+ import { ErrorCategory, ErrorSeverity } from "../../constants/enums.js";
12
+ import { logger } from "../../utils/logger.js";
19
13
  export class GoogleTTSHandler {
20
14
  client = null;
15
+ voicesCache = null;
16
+ static CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
21
17
  /**
22
- * Maximum text length supported by Google Cloud TTS (5000 bytes)
23
- * Different providers have different limits
18
+ * Google Cloud TTS maximum input size.
19
+ * ~5000 bytes INCLUDING SSML tags.
24
20
  */
25
21
  static DEFAULT_MAX_TEXT_LENGTH = 5000;
26
- maxTextLength = GoogleTTSHandler.DEFAULT_MAX_TEXT_LENGTH;
27
22
  /**
28
- * Constructor for GoogleTTSHandler
23
+ * Default timeout for Google Cloud TTS API calls (milliseconds)
29
24
  *
30
- * @param credentialsPath - Optional path to Google Cloud credentials JSON file
25
+ * Google typically responds within:
26
+ * - 1–5 seconds for short or normal text
27
+ * - 5–10 seconds for longer text or Neural2 voices
31
28
  */
29
+ static DEFAULT_API_TIMEOUT_MS = 30 * 1000;
30
+ /**
31
+ * Maximum text length supported by Google Cloud TTS (in bytes).
32
+ *
33
+ * NOTE:
34
+ * Validation against this limit is performed by the shared TTS processor
35
+ * before invoking provider handlers, not inside this class.
36
+ */
37
+ maxTextLength = GoogleTTSHandler.DEFAULT_MAX_TEXT_LENGTH;
32
38
  constructor(credentialsPath) {
33
39
  const path = credentialsPath ?? process.env.GOOGLE_APPLICATION_CREDENTIALS;
34
40
  if (path) {
@@ -41,7 +47,7 @@ export class GoogleTTSHandler {
41
47
  * @returns True if provider can generate TTS
42
48
  */
43
49
  isConfigured() {
44
- throw new Error("Not implemented yet");
50
+ return this.client !== null;
45
51
  }
46
52
  /**
47
53
  * Get available voices for the provider
@@ -52,18 +58,263 @@ export class GoogleTTSHandler {
52
58
  * @param languageCode - Optional language filter (e.g., "en-US")
53
59
  * @returns List of available voices; an empty list if the API returns no voices or the request fails
54
60
  */
55
- async getVoices(_languageCode) {
56
- throw new Error("Not implemented yet");
61
+ async getVoices(languageCode) {
62
+ if (!this.client) {
63
+ throw new TTSError({
64
+ code: TTS_ERROR_CODES.PROVIDER_NOT_CONFIGURED,
65
+ message: "Google Cloud TTS client not initialized. Set GOOGLE_APPLICATION_CREDENTIALS or pass credentials path.",
66
+ category: ErrorCategory.CONFIGURATION,
67
+ severity: ErrorSeverity.HIGH,
68
+ retriable: false,
69
+ });
70
+ }
71
+ try {
72
+ // Return cached voices if available, valid, and no language filter is specified
73
+ if (this.voicesCache &&
74
+ Date.now() - this.voicesCache.timestamp <
75
+ GoogleTTSHandler.CACHE_TTL_MS &&
76
+ !languageCode) {
77
+ return this.voicesCache.voices;
78
+ }
79
+ // Call Google Cloud listVoices API
80
+ const [response] = await this.client.listVoices(languageCode ? { languageCode } : {});
81
+ if (!response.voices || response.voices.length === 0) {
82
+ logger.warn("Google Cloud TTS returned no voices");
83
+ return [];
84
+ }
85
+ const voices = [];
86
+ for (const voice of response.voices ?? []) {
87
+ // Validate required fields
88
+ if (!voice.name ||
89
+ !Array.isArray(voice.languageCodes) ||
90
+ voice.languageCodes.length === 0) {
91
+ logger.warn("Skipping voice with missing required fields", {
92
+ name: voice.name,
93
+ languageCodesCount: voice.languageCodes?.length,
94
+ });
95
+ continue;
96
+ }
97
+ const voiceName = voice.name;
98
+ const languageCodes = voice.languageCodes;
99
+ const primaryLanguageCode = languageCodes[0];
100
+ const voiceType = this.detectVoiceType(voiceName);
101
+ // Map Google's ssmlGender → internal Gender
102
+ const gender = voice.ssmlGender === "MALE"
103
+ ? "male"
104
+ : voice.ssmlGender === "FEMALE"
105
+ ? "female"
106
+ : "neutral";
107
+ voices.push({
108
+ id: voiceName,
109
+ name: voiceName,
110
+ languageCode: primaryLanguageCode,
111
+ languageCodes,
112
+ gender,
113
+ type: voiceType,
114
+ naturalSampleRateHertz: voice.naturalSampleRateHertz ?? undefined,
115
+ });
116
+ }
117
+ // Cache the result with timestamp if no language filter
118
+ if (!languageCode) {
119
+ this.voicesCache = { voices, timestamp: Date.now() };
120
+ }
121
+ return voices;
122
+ }
123
+ catch (err) {
124
+ // Log error but return empty array for graceful degradation
125
+ const message = err instanceof Error ? err.message : "Unknown error";
126
+ logger.error(`Failed to fetch Google TTS voices: ${message}`);
127
+ return [];
128
+ }
57
129
  }
58
130
  /**
59
131
  * Generate audio from text using provider-specific TTS API
60
132
  *
61
- * @param text - Text to convert to speech
133
+ * @param text - Text or SSML to convert to speech
62
134
  * @param options - TTS configuration options
63
135
  * @returns Audio buffer with metadata
64
136
  */
65
- async synthesize(_text, _options) {
66
- throw new Error("Not implemented yet");
137
+ async synthesize(text, options) {
138
+ if (!this.client) {
139
+ throw new TTSError({
140
+ code: TTS_ERROR_CODES.PROVIDER_NOT_CONFIGURED,
141
+ message: "Google Cloud TTS client not initialized. Set GOOGLE_APPLICATION_CREDENTIALS or pass credentials path.",
142
+ category: ErrorCategory.CONFIGURATION,
143
+ severity: ErrorSeverity.HIGH,
144
+ retriable: false,
145
+ });
146
+ }
147
+ const startTime = Date.now();
148
+ try {
149
+ const isSSML = text.startsWith("<speak>") && text.endsWith("</speak>");
150
+ // Note: This validation only checks for the presence of opening and closing <speak> tags.
151
+ // Other SSML validation, such as malformed structure, unclosed inner tags, or invalid elements,
152
+ // will be handled by Google's API.
153
+ if ((text.startsWith("<speak>") && !text.endsWith("</speak>")) ||
154
+ (!text.startsWith("<speak>") && text.endsWith("</speak>"))) {
155
+ throw new TTSError({
156
+ code: TTS_ERROR_CODES.INVALID_INPUT,
157
+ message: "Malformed SSML: missing opening <speak> or closing </speak> tag.",
158
+ category: ErrorCategory.VALIDATION,
159
+ severity: ErrorSeverity.MEDIUM,
160
+ retriable: false,
161
+ });
162
+ }
163
+ const voiceId = options.voice ?? "en-US-Neural2-C";
164
+ const languageCode = this.extractLanguageCode(voiceId);
165
+ const audioEncoding = this.mapFormat(options.format ?? "mp3");
166
+ const request = {
167
+ input: isSSML ? { ssml: text } : { text },
168
+ voice: {
169
+ name: voiceId,
170
+ languageCode,
171
+ },
172
+ audioConfig: {
173
+ audioEncoding,
174
+ speakingRate: options.speed ?? 1.0,
175
+ pitch: options.pitch ?? 0.0,
176
+ volumeGainDb: options.volumeGainDb ?? 0.0,
177
+ },
178
+ };
179
+ const [response] = await this.client.synthesizeSpeech(request, {
180
+ timeout: GoogleTTSHandler.DEFAULT_API_TIMEOUT_MS,
181
+ });
182
+ const audioContent = response.audioContent;
183
+ if (!audioContent) {
184
+ throw new TTSError({
185
+ code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
186
+ message: "Google TTS returned empty audio content",
187
+ category: ErrorCategory.EXECUTION,
188
+ severity: ErrorSeverity.HIGH,
189
+ retriable: true,
190
+ });
191
+ }
192
+ const buffer = audioContent instanceof Uint8Array
193
+ ? Buffer.from(audioContent)
194
+ : typeof audioContent === "string"
195
+ ? Buffer.from(audioContent, "base64")
196
+ : (() => {
197
+ throw new TTSError({
198
+ code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
199
+ message: "Unsupported audioContent type returned by Google TTS",
200
+ category: ErrorCategory.EXECUTION,
201
+ severity: ErrorSeverity.HIGH,
202
+ retriable: true,
203
+ context: { type: typeof audioContent },
204
+ });
205
+ })();
206
+ const latency = Date.now() - startTime;
207
+ return {
208
+ buffer,
209
+ format: options.format ?? "mp3",
210
+ size: buffer.length,
211
+ voice: voiceId,
212
+ metadata: {
213
+ latency,
214
+ provider: "google-ai",
215
+ },
216
+ };
217
+ }
218
+ catch (err) {
219
+ if (err instanceof TTSError) {
220
+ throw err;
221
+ }
222
+ const latency = Date.now() - startTime;
223
+ const message = err instanceof Error ? err.message : "Unknown error";
224
+ throw new TTSError({
225
+ code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
226
+ message: `Google TTS failed after ${latency}ms: ${message}`,
227
+ category: ErrorCategory.EXECUTION,
228
+ severity: ErrorSeverity.HIGH,
229
+ retriable: true,
230
+ context: { latency },
231
+ originalError: err instanceof Error ? err : undefined,
232
+ });
233
+ }
234
+ }
235
+ /**
236
+ * Extract language code from a Google Cloud voice name
237
+ *
238
+ * Example:
239
+ * "en-US-Neural2-C" -> "en-US"
240
+ *
241
+ * @param voiceId - Google Cloud voice identifier
242
+ * @returns The first two hyphen-separated segments of the voice name (e.g., "en-US")
243
+ */
244
+ extractLanguageCode(voiceId) {
245
+ const parts = voiceId.split("-");
246
+ if (parts.length >= 2) {
247
+ return `${parts[0]}-${parts[1]}`;
248
+ }
249
+ else {
250
+ throw new TTSError({
251
+ code: TTS_ERROR_CODES.INVALID_INPUT,
252
+ message: `Invalid Google TTS voiceId format: "${voiceId}". Expected format like "en-US-Neural2-C".`,
253
+ category: ErrorCategory.VALIDATION,
254
+ severity: ErrorSeverity.MEDIUM,
255
+ retriable: false,
256
+ context: { voiceId },
257
+ });
258
+ }
259
+ }
260
+ /**
261
+ * Map application audio format to Google Cloud audio encoding
262
+ *
263
+ * @param format - Audio format requested by the caller
264
+ * @returns Google Cloud AudioEncoding enum value
265
+ * @throws TTSError with code INVALID_INPUT if the format is unsupported
266
+ */
267
+ mapFormat(format) {
268
+ switch (format.toLowerCase()) {
269
+ case "mp3":
270
+ return "MP3";
271
+ case "wav":
272
+ return "LINEAR16";
273
+ case "ogg":
274
+ case "opus":
275
+ return "OGG_OPUS";
276
+ default:
277
+ throw new TTSError({
278
+ code: TTS_ERROR_CODES.INVALID_INPUT,
279
+ message: `Unsupported audio format: ${format}`,
280
+ category: ErrorCategory.VALIDATION,
281
+ severity: ErrorSeverity.MEDIUM,
282
+ retriable: false,
283
+ context: { format },
284
+ });
285
+ }
286
+ }
287
+ /**
288
+ * Detect the voice type from a Google Cloud TTS voice name
289
+ *
290
+ * Parses the voice name to identify the underlying voice technology/model type.
291
+ * Google Cloud TTS offers different voice types with varying quality and pricing.
292
+ *
293
+ * @param name - The full Google Cloud voice name (e.g., "en-US-Neural2-C")
294
+ * @returns The detected voice type
295
+ *
296
+ * @example
297
+ * detectVoiceType("en-US-Neural2-C") // returns "neural"
298
+ * detectVoiceType("en-US-Wavenet-A") // returns "wavenet"
299
+ * detectVoiceType("en-US-Standard-B") // returns "standard"
300
+ * detectVoiceType("en-US-Chirp-A") // returns "chirp"
301
+ * detectVoiceType("en-US-Journey-D") // returns "unknown" (unrecognized type)
302
+ */
303
+ detectVoiceType(name) {
304
+ const tokens = name.toLowerCase().split("-");
305
+ if (tokens.some((t) => t.startsWith("chirp"))) {
306
+ return "chirp";
307
+ }
308
+ if (tokens.includes("neural2")) {
309
+ return "neural";
310
+ }
311
+ if (tokens.includes("wavenet")) {
312
+ return "wavenet";
313
+ }
314
+ if (tokens.includes("standard")) {
315
+ return "standard";
316
+ }
317
+ return "unknown";
67
318
  }
68
319
  }
69
320
  //# sourceMappingURL=googleTTSHandler.js.map
@@ -25,6 +25,10 @@ export type TTSOptions = {
25
25
  format?: AudioFormat;
26
26
  /** Speaking rate 0.25-4.0 (default: 1.0) */
27
27
  speed?: number;
28
+ /** Voice pitch adjustment -20.0 to 20.0 semitones (default: 0.0) */
29
+ pitch?: number;
30
+ /** Volume gain in dB -96.0 to 16.0 (default: 0.0) */
31
+ volumeGainDb?: number;
28
32
  /** Audio quality (default: standard) */
29
33
  quality?: TTSQuality;
30
34
  /** Output file path (optional) */
@@ -48,6 +52,15 @@ export type TTSResult = {
48
52
  voice?: string;
49
53
  /** Sample rate in Hz */
50
54
  sampleRate?: number;
55
+ /** Performance and request metadata */
56
+ metadata?: {
57
+ /** Request latency in milliseconds */
58
+ latency: number;
59
+ /** Provider name */
60
+ provider?: string;
61
+ /** Additional provider-specific metadata */
62
+ [key: string]: unknown;
63
+ };
51
64
  };
52
65
  /**
53
66
  * Result of saving audio to file
@@ -62,6 +75,10 @@ export type AudioSaveResult = {
62
75
  /** Error message if failed */
63
76
  error?: string;
64
77
  };
78
+ /** Voice model families a TTS voice may be tagged with; "unknown" marks a voice that fits none of the named families */
79
+ export type VoiceType = "standard" | "wavenet" | "neural" | "chirp" | "unknown";
80
+ /** Gender labels a TTS voice may carry (the type of the `gender` field on `TTSVoice`) */
81
+ export type Gender = "male" | "female" | "neutral";
65
82
  /**
66
83
  * TTS voice information
67
84
  */
@@ -70,17 +87,25 @@ export type TTSVoice = {
70
87
  id: string;
71
88
  /** Display name */
72
89
  name: string;
73
- /** Language code (e.g., "en-US") */
90
+ /** Primary language code (e.g., "en-US") */
74
91
  languageCode: string;
92
+ /** All supported language codes */
93
+ languageCodes: string[];
75
94
  /** Gender */
76
- gender: "male" | "female" | "neutral";
95
+ gender: Gender;
77
96
  /** Voice type */
78
- type: "neural" | "wavenet" | "standard";
97
+ type?: VoiceType;
98
+ /** Voice description (optional) */
99
+ description?: string;
100
+ /** Natural sample rate in Hz (optional) */
101
+ naturalSampleRateHertz?: number;
79
102
  };
80
103
  /** Valid audio formats as an array for runtime validation */
81
104
  export declare const VALID_AUDIO_FORMATS: readonly AudioFormat[];
82
105
  /** Valid TTS quality levels as an array for runtime validation */
83
106
  export declare const VALID_TTS_QUALITIES: readonly TTSQuality[];
107
+ /** Audio encoding identifiers in the form Google TTS uses; only the three members listed here are part of the type */
108
+ export type GoogleAudioEncoding = "MP3" | "LINEAR16" | "OGG_OPUS";
84
109
  /**
85
110
  * Type guard to check if an object is a TTSResult
86
111
  */
@@ -18,6 +18,7 @@ export declare const TTS_ERROR_CODES: {
18
18
  readonly PROVIDER_NOT_SUPPORTED: "TTS_PROVIDER_NOT_SUPPORTED";
19
19
  readonly PROVIDER_NOT_CONFIGURED: "TTS_PROVIDER_NOT_CONFIGURED";
20
20
  readonly SYNTHESIS_FAILED: "TTS_SYNTHESIS_FAILED";
21
+ readonly INVALID_INPUT: "TTS_INVALID_INPUT";
21
22
  };
22
23
  /**
23
24
  * TTS Error class for text-to-speech specific errors
@@ -18,6 +18,7 @@ export const TTS_ERROR_CODES = {
18
18
  PROVIDER_NOT_SUPPORTED: "TTS_PROVIDER_NOT_SUPPORTED",
19
19
  PROVIDER_NOT_CONFIGURED: "TTS_PROVIDER_NOT_CONFIGURED",
20
20
  SYNTHESIS_FAILED: "TTS_SYNTHESIS_FAILED",
21
+ INVALID_INPUT: "TTS_INVALID_INPUT",
21
22
  };
22
23
  /**
23
24
  * TTS Error class for text-to-speech specific errors
@@ -25,6 +25,10 @@ export type TTSOptions = {
25
25
  format?: AudioFormat;
26
26
  /** Speaking rate 0.25-4.0 (default: 1.0) */
27
27
  speed?: number;
28
+ /** Voice pitch adjustment -20.0 to 20.0 semitones (default: 0.0) */
29
+ pitch?: number;
30
+ /** Volume gain in dB -96.0 to 16.0 (default: 0.0) */
31
+ volumeGainDb?: number;
28
32
  /** Audio quality (default: standard) */
29
33
  quality?: TTSQuality;
30
34
  /** Output file path (optional) */
@@ -48,6 +52,15 @@ export type TTSResult = {
48
52
  voice?: string;
49
53
  /** Sample rate in Hz */
50
54
  sampleRate?: number;
55
+ /** Performance and request metadata */
56
+ metadata?: {
57
+ /** Request latency in milliseconds */
58
+ latency: number;
59
+ /** Provider name */
60
+ provider?: string;
61
+ /** Additional provider-specific metadata */
62
+ [key: string]: unknown;
63
+ };
51
64
  };
52
65
  /**
53
66
  * Result of saving audio to file
@@ -62,6 +75,10 @@ export type AudioSaveResult = {
62
75
  /** Error message if failed */
63
76
  error?: string;
64
77
  };
78
+ /** Voice model families a TTS voice may be tagged with; "unknown" marks a voice that fits none of the named families */
79
+ export type VoiceType = "standard" | "wavenet" | "neural" | "chirp" | "unknown";
80
+ /** Gender labels a TTS voice may carry (the type of the `gender` field on `TTSVoice`) */
81
+ export type Gender = "male" | "female" | "neutral";
65
82
  /**
66
83
  * TTS voice information
67
84
  */
@@ -70,17 +87,25 @@ export type TTSVoice = {
70
87
  id: string;
71
88
  /** Display name */
72
89
  name: string;
73
- /** Language code (e.g., "en-US") */
90
+ /** Primary language code (e.g., "en-US") */
74
91
  languageCode: string;
92
+ /** All supported language codes */
93
+ languageCodes: string[];
75
94
  /** Gender */
76
- gender: "male" | "female" | "neutral";
95
+ gender: Gender;
77
96
  /** Voice type */
78
- type: "neural" | "wavenet" | "standard";
97
+ type?: VoiceType;
98
+ /** Voice description (optional) */
99
+ description?: string;
100
+ /** Natural sample rate in Hz (optional) */
101
+ naturalSampleRateHertz?: number;
79
102
  };
80
103
  /** Valid audio formats as an array for runtime validation */
81
104
  export declare const VALID_AUDIO_FORMATS: readonly AudioFormat[];
82
105
  /** Valid TTS quality levels as an array for runtime validation */
83
106
  export declare const VALID_TTS_QUALITIES: readonly TTSQuality[];
107
+ /** Audio encoding identifiers in the form Google TTS uses; only the three members listed here are part of the type */
108
+ export type GoogleAudioEncoding = "MP3" | "LINEAR16" | "OGG_OPUS";
84
109
  /**
85
110
  * Type guard to check if an object is a TTSResult
86
111
  */
@@ -18,6 +18,7 @@ export declare const TTS_ERROR_CODES: {
18
18
  readonly PROVIDER_NOT_SUPPORTED: "TTS_PROVIDER_NOT_SUPPORTED";
19
19
  readonly PROVIDER_NOT_CONFIGURED: "TTS_PROVIDER_NOT_CONFIGURED";
20
20
  readonly SYNTHESIS_FAILED: "TTS_SYNTHESIS_FAILED";
21
+ readonly INVALID_INPUT: "TTS_INVALID_INPUT";
21
22
  };
22
23
  /**
23
24
  * TTS Error class for text-to-speech specific errors
@@ -18,6 +18,7 @@ export const TTS_ERROR_CODES = {
18
18
  PROVIDER_NOT_SUPPORTED: "TTS_PROVIDER_NOT_SUPPORTED",
19
19
  PROVIDER_NOT_CONFIGURED: "TTS_PROVIDER_NOT_CONFIGURED",
20
20
  SYNTHESIS_FAILED: "TTS_SYNTHESIS_FAILED",
21
+ INVALID_INPUT: "TTS_INVALID_INPUT",
21
22
  };
22
23
  /**
23
24
  * TTS Error class for text-to-speech specific errors
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@juspay/neurolink",
3
- "version": "8.14.0",
3
+ "version": "8.16.0",
4
4
  "description": "Universal AI Development Platform with working MCP integration, multi-provider support, and professional CLI. Built-in tools operational, 58+ external MCP servers discoverable. Connect to filesystem, GitHub, database operations, and more. Build, test, and deploy AI applications with 9 major providers: OpenAI, Anthropic, Google AI, AWS Bedrock, Azure, Hugging Face, Ollama, and Mistral AI.",
5
5
  "author": {
6
6
  "name": "Juspay Technologies",