@pie-players/tts-server-google 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -0
- package/README.md +333 -0
- package/dist/GoogleCloudTTSProvider.d.ts +153 -0
- package/dist/GoogleCloudTTSProvider.d.ts.map +1 -0
- package/dist/GoogleCloudTTSProvider.js +454 -0
- package/dist/GoogleCloudTTSProvider.js.map +1 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -0
- package/examples/INTEGRATION-GUIDE.md +532 -0
- package/package.json +38 -0
- package/src/GoogleCloudTTSProvider.ts +688 -0
- package/src/index.ts +7 -0
- package/tsconfig.json +9 -0
|
@@ -0,0 +1,688 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Google Cloud Text-to-Speech server-side TTS provider
|
|
3
|
+
* @module @pie-players/tts-server-google
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { v1beta1, protos } from "@google-cloud/text-to-speech";
|
|
7
|
+
|
|
8
|
+
import {
|
|
9
|
+
BaseTTSProvider,
|
|
10
|
+
type GetVoicesOptions,
|
|
11
|
+
type ServerProviderCapabilities,
|
|
12
|
+
type SpeechMark,
|
|
13
|
+
type SynthesizeRequest,
|
|
14
|
+
type SynthesizeResponse,
|
|
15
|
+
TTSError,
|
|
16
|
+
TTSErrorCode,
|
|
17
|
+
type TTSServerConfig,
|
|
18
|
+
type Voice,
|
|
19
|
+
} from "@pie-players/tts-server-core";
|
|
20
|
+
|
|
21
|
+
/**
 * Google Cloud Text-to-Speech provider configuration.
 *
 * This extends the base TTSServerConfig with Google Cloud-specific settings.
 */
export interface GoogleCloudTTSConfig extends TTSServerConfig {
  /**
   * Google Cloud project ID (required)
   *
   * @example 'my-project-123456'
   * @required
   */
  projectId: string;

  /**
   * Authentication credentials
   *
   * Supports multiple authentication methods:
   * - Service account JSON file path (recommended for production)
   * - Service account key object (for containers/serverless)
   * - API key (for simple applications)
   * - Omit to use Application Default Credentials (ADC) for local development
   *
   * @example '/path/to/service-account.json'
   * @example { client_email: '...', private_key: '...' }
   * @example { apiKey: 'AIza...' }
   * @see https://cloud.google.com/docs/authentication
   */
  credentials?:
    | string // Path to service account JSON file
    | {
        // Service account key object
        client_email: string;
        private_key: string;
        project_id?: string;
      }
    | {
        // API key
        apiKey: string;
      };

  /**
   * Voice type: 'wavenet' (neural), 'standard', or 'studio' (premium)
   *
   * @default 'wavenet'
   * @note WaveNet: $16/1M chars, Standard: $4/1M chars, Studio: $16/1M chars
   */
  voiceType?: "wavenet" | "standard" | "studio";

  /**
   * Default voice name if not specified in synthesis requests
   *
   * @default 'en-US-Wavenet-A'
   * @example 'en-US-Wavenet-A', 'en-GB-Standard-B', 'es-ES-Studio-C'
   * @see https://cloud.google.com/text-to-speech/docs/voices
   */
  defaultVoice?: string;

  /**
   * Audio encoding format
   *
   * @default 'MP3'
   */
  audioEncoding?: "MP3" | "LINEAR16" | "OGG_OPUS";

  /**
   * Enable detailed logging for debugging
   *
   * @default false
   */
  enableLogging?: boolean;
}
|
|
93
|
+
|
|
94
|
+
/**
|
|
95
|
+
* Google Cloud Text-to-Speech Server Provider
|
|
96
|
+
*
|
|
97
|
+
* Provides high-quality neural text-to-speech with precise word-level timing
|
|
98
|
+
* through Google Cloud Text-to-Speech API.
|
|
99
|
+
*
|
|
100
|
+
* Features:
|
|
101
|
+
* - Speech marks support via SSML mark injection (millisecond precision)
|
|
102
|
+
* - WaveNet (neural), Standard, and Studio voice types
|
|
103
|
+
* - 200+ voices across 50+ languages
|
|
104
|
+
* - Full SSML support
|
|
105
|
+
* - Single API call for audio + speech marks
|
|
106
|
+
*/
|
|
107
|
+
export class GoogleCloudTTSProvider extends BaseTTSProvider {
|
|
108
|
+
readonly providerId = "google-cloud-tts";
|
|
109
|
+
readonly providerName = "Google Cloud Text-to-Speech";
|
|
110
|
+
readonly version = "1.0.0";
|
|
111
|
+
|
|
112
|
+
private client!: v1beta1.TextToSpeechClient;
|
|
113
|
+
private voiceType: "wavenet" | "standard" | "studio" = "wavenet";
|
|
114
|
+
private defaultVoice = "en-US-Wavenet-A";
|
|
115
|
+
private audioEncoding: "MP3" | "LINEAR16" | "OGG_OPUS" = "MP3";
|
|
116
|
+
private enableLogging = false;
|
|
117
|
+
|
|
118
|
+
/**
|
|
119
|
+
* Initialize the Google Cloud TTS provider.
|
|
120
|
+
*
|
|
121
|
+
* This is FAST and lightweight - only validates config and creates the client.
|
|
122
|
+
* Does NOT fetch voices or make test API calls.
|
|
123
|
+
*
|
|
124
|
+
* @param config - Google Cloud TTS configuration
|
|
125
|
+
* @performance Completes in ~10-50ms
|
|
126
|
+
*/
|
|
127
|
+
async initialize(config: GoogleCloudTTSConfig): Promise<void> {
|
|
128
|
+
if (!config.projectId) {
|
|
129
|
+
throw new TTSError(
|
|
130
|
+
TTSErrorCode.INITIALIZATION_ERROR,
|
|
131
|
+
"Google Cloud project ID is required",
|
|
132
|
+
undefined,
|
|
133
|
+
this.providerId,
|
|
134
|
+
);
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
this.config = config;
|
|
138
|
+
this.voiceType = config.voiceType || "wavenet";
|
|
139
|
+
this.defaultVoice = config.defaultVoice || "en-US-Wavenet-A";
|
|
140
|
+
this.audioEncoding = config.audioEncoding || "MP3";
|
|
141
|
+
this.enableLogging = config.enableLogging || false;
|
|
142
|
+
|
|
143
|
+
try {
|
|
144
|
+
// Initialize Google Cloud TTS client
|
|
145
|
+
const clientConfig: any = {
|
|
146
|
+
projectId: config.projectId,
|
|
147
|
+
};
|
|
148
|
+
|
|
149
|
+
// Handle different credential types
|
|
150
|
+
if (config.credentials) {
|
|
151
|
+
if (typeof config.credentials === "string") {
|
|
152
|
+
// Path to service account JSON file
|
|
153
|
+
clientConfig.keyFilename = config.credentials;
|
|
154
|
+
} else if ("apiKey" in config.credentials) {
|
|
155
|
+
// API key authentication
|
|
156
|
+
clientConfig.apiKey = config.credentials.apiKey;
|
|
157
|
+
} else {
|
|
158
|
+
// Service account key object
|
|
159
|
+
clientConfig.credentials = config.credentials;
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
// Else: Use Application Default Credentials (ADC)
|
|
163
|
+
|
|
164
|
+
this.client = new v1beta1.TextToSpeechClient(clientConfig);
|
|
165
|
+
this.initialized = true;
|
|
166
|
+
|
|
167
|
+
if (this.enableLogging) {
|
|
168
|
+
console.log("[GoogleCloudTTS] Initialized successfully");
|
|
169
|
+
}
|
|
170
|
+
} catch (error) {
|
|
171
|
+
throw new TTSError(
|
|
172
|
+
TTSErrorCode.INITIALIZATION_ERROR,
|
|
173
|
+
`Failed to initialize Google Cloud TTS: ${error instanceof Error ? error.message : String(error)}`,
|
|
174
|
+
{ error },
|
|
175
|
+
this.providerId,
|
|
176
|
+
);
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
/**
|
|
181
|
+
* Synthesize speech with Google Cloud TTS
|
|
182
|
+
*/
|
|
183
|
+
async synthesize(request: SynthesizeRequest): Promise<SynthesizeResponse> {
|
|
184
|
+
this.ensureInitialized();
|
|
185
|
+
|
|
186
|
+
const capabilities = this.getCapabilities();
|
|
187
|
+
this.validateRequest(request, capabilities);
|
|
188
|
+
|
|
189
|
+
const voice = request.voice || this.defaultVoice;
|
|
190
|
+
const startTime = Date.now();
|
|
191
|
+
|
|
192
|
+
try {
|
|
193
|
+
// Check if speech marks are requested
|
|
194
|
+
if (request.includeSpeechMarks !== false) {
|
|
195
|
+
// Use SSML marks injection for precise word timing
|
|
196
|
+
const result = await this.synthesizeWithSpeechMarks(request, voice);
|
|
197
|
+
const duration = (Date.now() - startTime) / 1000;
|
|
198
|
+
|
|
199
|
+
return {
|
|
200
|
+
audio: result.audio,
|
|
201
|
+
contentType: result.contentType,
|
|
202
|
+
speechMarks: result.speechMarks,
|
|
203
|
+
metadata: {
|
|
204
|
+
providerId: this.providerId,
|
|
205
|
+
voice,
|
|
206
|
+
duration,
|
|
207
|
+
charCount: request.text.length,
|
|
208
|
+
cached: false,
|
|
209
|
+
timestamp: new Date().toISOString(),
|
|
210
|
+
},
|
|
211
|
+
};
|
|
212
|
+
} else {
|
|
213
|
+
// Audio only (no speech marks)
|
|
214
|
+
const result = await this.synthesizeAudio(request, voice);
|
|
215
|
+
const duration = (Date.now() - startTime) / 1000;
|
|
216
|
+
|
|
217
|
+
return {
|
|
218
|
+
audio: result.audio,
|
|
219
|
+
contentType: result.contentType,
|
|
220
|
+
speechMarks: [],
|
|
221
|
+
metadata: {
|
|
222
|
+
providerId: this.providerId,
|
|
223
|
+
voice,
|
|
224
|
+
duration,
|
|
225
|
+
charCount: request.text.length,
|
|
226
|
+
cached: false,
|
|
227
|
+
timestamp: new Date().toISOString(),
|
|
228
|
+
},
|
|
229
|
+
};
|
|
230
|
+
}
|
|
231
|
+
} catch (error) {
|
|
232
|
+
throw this.mapGoogleErrorToTTSError(error);
|
|
233
|
+
}
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
/**
|
|
237
|
+
* Synthesize audio stream only (no speech marks)
|
|
238
|
+
*/
|
|
239
|
+
private async synthesizeAudio(
|
|
240
|
+
request: SynthesizeRequest,
|
|
241
|
+
voice: string,
|
|
242
|
+
): Promise<{ audio: Buffer; contentType: string }> {
|
|
243
|
+
// Detect if text contains SSML tags
|
|
244
|
+
const isSsml = this.detectSSML(request.text);
|
|
245
|
+
|
|
246
|
+
if (isSsml && this.enableLogging) {
|
|
247
|
+
console.log("[GoogleCloudTTS] Detected SSML content");
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
// Parse voice name to extract language code
|
|
251
|
+
const languageCode = voice.split("-").slice(0, 2).join("-"); // e.g., "en-US" from "en-US-Wavenet-A"
|
|
252
|
+
|
|
253
|
+
// Map our audio encoding to Google's enum
|
|
254
|
+
const audioEncodingMap = {
|
|
255
|
+
MP3: "MP3" as const,
|
|
256
|
+
LINEAR16: "LINEAR16" as const,
|
|
257
|
+
OGG_OPUS: "OGG_OPUS" as const,
|
|
258
|
+
};
|
|
259
|
+
|
|
260
|
+
const [response] = await this.client.synthesizeSpeech({
|
|
261
|
+
input: isSsml ? { ssml: request.text } : { text: request.text },
|
|
262
|
+
voice: {
|
|
263
|
+
languageCode,
|
|
264
|
+
name: voice,
|
|
265
|
+
},
|
|
266
|
+
audioConfig: {
|
|
267
|
+
audioEncoding: audioEncodingMap[this.audioEncoding],
|
|
268
|
+
sampleRateHertz: request.sampleRate || 24000,
|
|
269
|
+
},
|
|
270
|
+
});
|
|
271
|
+
|
|
272
|
+
if (!response.audioContent) {
|
|
273
|
+
throw new Error("No audio content received from Google Cloud TTS");
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
// Convert Uint8Array to Buffer
|
|
277
|
+
const audioBuffer = Buffer.from(response.audioContent);
|
|
278
|
+
|
|
279
|
+
const contentTypeMap = {
|
|
280
|
+
MP3: "audio/mpeg",
|
|
281
|
+
LINEAR16: "audio/wav",
|
|
282
|
+
OGG_OPUS: "audio/ogg",
|
|
283
|
+
};
|
|
284
|
+
|
|
285
|
+
return {
|
|
286
|
+
audio: audioBuffer,
|
|
287
|
+
contentType: contentTypeMap[this.audioEncoding],
|
|
288
|
+
};
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
/**
|
|
292
|
+
* Synthesize with speech marks using SSML mark injection
|
|
293
|
+
*/
|
|
294
|
+
private async synthesizeWithSpeechMarks(
|
|
295
|
+
request: SynthesizeRequest,
|
|
296
|
+
voice: string,
|
|
297
|
+
): Promise<{
|
|
298
|
+
audio: Buffer;
|
|
299
|
+
contentType: string;
|
|
300
|
+
speechMarks: SpeechMark[];
|
|
301
|
+
}> {
|
|
302
|
+
// Check if the text is already SSML
|
|
303
|
+
const isUserSSML = this.detectSSML(request.text);
|
|
304
|
+
|
|
305
|
+
// If user provided SSML, we need to inject marks within the existing SSML
|
|
306
|
+
// For simplicity in v1, we'll inject marks for plain text only
|
|
307
|
+
const { ssml, wordMap } = isUserSSML
|
|
308
|
+
? this.extractWordsFromSSML(request.text)
|
|
309
|
+
: this.injectSSMLMarks(request.text);
|
|
310
|
+
|
|
311
|
+
if (this.enableLogging) {
|
|
312
|
+
console.log(`[GoogleCloudTTS] Injected ${wordMap.length} SSML marks`);
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
// Parse voice name to extract language code
|
|
316
|
+
const languageCode = voice.split("-").slice(0, 2).join("-");
|
|
317
|
+
|
|
318
|
+
// Map our audio encoding to Google's enum
|
|
319
|
+
const audioEncodingMap = {
|
|
320
|
+
MP3: "MP3" as const,
|
|
321
|
+
LINEAR16: "LINEAR16" as const,
|
|
322
|
+
OGG_OPUS: "OGG_OPUS" as const,
|
|
323
|
+
};
|
|
324
|
+
|
|
325
|
+
// Single API call with timepoint tracking enabled
|
|
326
|
+
const responseArray = await this.client.synthesizeSpeech({
|
|
327
|
+
input: { ssml },
|
|
328
|
+
voice: {
|
|
329
|
+
languageCode,
|
|
330
|
+
name: voice,
|
|
331
|
+
},
|
|
332
|
+
audioConfig: {
|
|
333
|
+
audioEncoding: audioEncodingMap[this.audioEncoding],
|
|
334
|
+
sampleRateHertz: request.sampleRate || 24000,
|
|
335
|
+
},
|
|
336
|
+
enableTimePointing: [
|
|
337
|
+
protos.google.cloud.texttospeech.v1beta1.SynthesizeSpeechRequest
|
|
338
|
+
.TimepointType.SSML_MARK,
|
|
339
|
+
],
|
|
340
|
+
});
|
|
341
|
+
const response = responseArray[0];
|
|
342
|
+
|
|
343
|
+
if (!response.audioContent) {
|
|
344
|
+
throw new Error("No audio content received from Google Cloud TTS");
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
// Convert Uint8Array to Buffer
|
|
348
|
+
const audioBuffer = Buffer.from(response.audioContent);
|
|
349
|
+
|
|
350
|
+
const contentTypeMap = {
|
|
351
|
+
MP3: "audio/mpeg",
|
|
352
|
+
LINEAR16: "audio/wav",
|
|
353
|
+
OGG_OPUS: "audio/ogg",
|
|
354
|
+
};
|
|
355
|
+
|
|
356
|
+
// Extract speech marks from timepoints
|
|
357
|
+
const speechMarks = this.extractSpeechMarksFromTimepoints(
|
|
358
|
+
response.timepoints || [],
|
|
359
|
+
wordMap,
|
|
360
|
+
);
|
|
361
|
+
|
|
362
|
+
if (this.enableLogging) {
|
|
363
|
+
console.log(
|
|
364
|
+
`[GoogleCloudTTS] Extracted ${speechMarks.length} speech marks`,
|
|
365
|
+
);
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
return {
|
|
369
|
+
audio: audioBuffer,
|
|
370
|
+
contentType: contentTypeMap[this.audioEncoding],
|
|
371
|
+
speechMarks,
|
|
372
|
+
};
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
/**
|
|
376
|
+
* Inject SSML marks before each word in plain text
|
|
377
|
+
*/
|
|
378
|
+
private injectSSMLMarks(text: string): {
|
|
379
|
+
ssml: string;
|
|
380
|
+
wordMap: Array<{
|
|
381
|
+
word: string;
|
|
382
|
+
start: number;
|
|
383
|
+
end: number;
|
|
384
|
+
markName: string;
|
|
385
|
+
}>;
|
|
386
|
+
} {
|
|
387
|
+
const words: Array<{
|
|
388
|
+
word: string;
|
|
389
|
+
start: number;
|
|
390
|
+
end: number;
|
|
391
|
+
markName: string;
|
|
392
|
+
}> = [];
|
|
393
|
+
const wordRegex = /\b[\w']+\b/g;
|
|
394
|
+
let match;
|
|
395
|
+
let markIndex = 0;
|
|
396
|
+
|
|
397
|
+
while ((match = wordRegex.exec(text)) !== null) {
|
|
398
|
+
const word = match[0];
|
|
399
|
+
const start = match.index;
|
|
400
|
+
const end = start + word.length;
|
|
401
|
+
const markName = `w${markIndex++}`;
|
|
402
|
+
|
|
403
|
+
words.push({ word, start, end, markName });
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
// Build SSML with marks
|
|
407
|
+
let ssml = "<speak>";
|
|
408
|
+
let lastEnd = 0;
|
|
409
|
+
|
|
410
|
+
for (const { word, start, end, markName } of words) {
|
|
411
|
+
// Add text before word (including whitespace and punctuation)
|
|
412
|
+
ssml += this.escapeSSML(text.slice(lastEnd, start));
|
|
413
|
+
// Add marked word
|
|
414
|
+
ssml += `<mark name="${markName}"/>${this.escapeSSML(word)}`;
|
|
415
|
+
lastEnd = end;
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
// Add remaining text
|
|
419
|
+
ssml += this.escapeSSML(text.slice(lastEnd)) + "</speak>";
|
|
420
|
+
|
|
421
|
+
return { ssml, wordMap: words };
|
|
422
|
+
}
|
|
423
|
+
|
|
424
|
+
/**
|
|
425
|
+
* Extract words from existing SSML (simplified version for v1)
|
|
426
|
+
*/
|
|
427
|
+
private extractWordsFromSSML(ssmlText: string): {
|
|
428
|
+
ssml: string;
|
|
429
|
+
wordMap: Array<{
|
|
430
|
+
word: string;
|
|
431
|
+
start: number;
|
|
432
|
+
end: number;
|
|
433
|
+
markName: string;
|
|
434
|
+
}>;
|
|
435
|
+
} {
|
|
436
|
+
// For now, just strip SSML tags and inject marks
|
|
437
|
+
// More sophisticated SSML parsing can be added in future versions
|
|
438
|
+
const plainText = ssmlText
|
|
439
|
+
.replace(/<[^>]+>/g, " ") // Remove all tags
|
|
440
|
+
.replace(/\s+/g, " ") // Normalize whitespace
|
|
441
|
+
.trim();
|
|
442
|
+
|
|
443
|
+
return this.injectSSMLMarks(plainText);
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
/**
|
|
447
|
+
* Escape special XML characters for SSML
|
|
448
|
+
*/
|
|
449
|
+
private escapeSSML(text: string): string {
|
|
450
|
+
return text
|
|
451
|
+
.replace(/&/g, "&")
|
|
452
|
+
.replace(/</g, "<")
|
|
453
|
+
.replace(/>/g, ">")
|
|
454
|
+
.replace(/"/g, """)
|
|
455
|
+
.replace(/'/g, "'");
|
|
456
|
+
}
|
|
457
|
+
|
|
458
|
+
/**
|
|
459
|
+
* Extract speech marks from Google's timepoints
|
|
460
|
+
*/
|
|
461
|
+
private extractSpeechMarksFromTimepoints(
|
|
462
|
+
timepoints:
|
|
463
|
+
| protos.google.cloud.texttospeech.v1beta1.ITimepoint[]
|
|
464
|
+
| null
|
|
465
|
+
| undefined,
|
|
466
|
+
wordMap: Array<{
|
|
467
|
+
word: string;
|
|
468
|
+
start: number;
|
|
469
|
+
end: number;
|
|
470
|
+
markName: string;
|
|
471
|
+
}>,
|
|
472
|
+
): SpeechMark[] {
|
|
473
|
+
if (!timepoints || timepoints.length === 0) {
|
|
474
|
+
return [];
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
const speechMarks: SpeechMark[] = [];
|
|
478
|
+
|
|
479
|
+
for (const timepoint of timepoints) {
|
|
480
|
+
// Find corresponding word in our map
|
|
481
|
+
const wordInfo = wordMap.find((w) => w.markName === timepoint.markName);
|
|
482
|
+
|
|
483
|
+
if (
|
|
484
|
+
wordInfo &&
|
|
485
|
+
timepoint.timeSeconds !== undefined &&
|
|
486
|
+
timepoint.timeSeconds !== null
|
|
487
|
+
) {
|
|
488
|
+
speechMarks.push({
|
|
489
|
+
time: Math.round(timepoint.timeSeconds * 1000), // Convert to ms
|
|
490
|
+
type: "word",
|
|
491
|
+
start: wordInfo.start,
|
|
492
|
+
end: wordInfo.end,
|
|
493
|
+
value: wordInfo.word,
|
|
494
|
+
});
|
|
495
|
+
}
|
|
496
|
+
}
|
|
497
|
+
|
|
498
|
+
// Sort by time
|
|
499
|
+
return speechMarks.sort((a, b) => a.time - b.time);
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
/**
|
|
503
|
+
* Detect if text contains SSML markup
|
|
504
|
+
*/
|
|
505
|
+
private detectSSML(text: string): boolean {
|
|
506
|
+
return (
|
|
507
|
+
text.includes("<speak") ||
|
|
508
|
+
text.includes("<prosody") ||
|
|
509
|
+
text.includes("<emphasis") ||
|
|
510
|
+
text.includes("<break") ||
|
|
511
|
+
text.includes("<phoneme") ||
|
|
512
|
+
text.includes("<say-as") ||
|
|
513
|
+
text.includes("<mark")
|
|
514
|
+
);
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
/**
|
|
518
|
+
* Get available voices from Google Cloud TTS
|
|
519
|
+
*/
|
|
520
|
+
async getVoices(options?: GetVoicesOptions): Promise<Voice[]> {
|
|
521
|
+
this.ensureInitialized();
|
|
522
|
+
|
|
523
|
+
try {
|
|
524
|
+
const [response] = await this.client.listVoices({
|
|
525
|
+
languageCode: options?.language,
|
|
526
|
+
});
|
|
527
|
+
|
|
528
|
+
if (!response.voices) {
|
|
529
|
+
return [];
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
return response.voices
|
|
533
|
+
.map((voice) => this.mapGoogleVoiceToVoice(voice))
|
|
534
|
+
.filter((voice) => {
|
|
535
|
+
// Apply filters
|
|
536
|
+
if (options?.gender && voice.gender !== options.gender) {
|
|
537
|
+
return false;
|
|
538
|
+
}
|
|
539
|
+
if (options?.quality && voice.quality !== options.quality) {
|
|
540
|
+
return false;
|
|
541
|
+
}
|
|
542
|
+
return true;
|
|
543
|
+
});
|
|
544
|
+
} catch (error) {
|
|
545
|
+
throw new TTSError(
|
|
546
|
+
TTSErrorCode.PROVIDER_ERROR,
|
|
547
|
+
`Failed to get voices: ${error instanceof Error ? error.message : String(error)}`,
|
|
548
|
+
{ error },
|
|
549
|
+
this.providerId,
|
|
550
|
+
);
|
|
551
|
+
}
|
|
552
|
+
}
|
|
553
|
+
|
|
554
|
+
/**
|
|
555
|
+
* Map Google Cloud voice to unified Voice interface
|
|
556
|
+
*/
|
|
557
|
+
private mapGoogleVoiceToVoice(
|
|
558
|
+
googleVoice: protos.google.cloud.texttospeech.v1beta1.IVoice,
|
|
559
|
+
): Voice {
|
|
560
|
+
const voiceName = googleVoice.name || "";
|
|
561
|
+
|
|
562
|
+
// Determine quality based on voice type
|
|
563
|
+
let quality: "standard" | "neural" | "premium" = "standard";
|
|
564
|
+
if (voiceName.includes("Wavenet")) {
|
|
565
|
+
quality = "neural";
|
|
566
|
+
} else if (voiceName.includes("Studio")) {
|
|
567
|
+
quality = "premium";
|
|
568
|
+
}
|
|
569
|
+
|
|
570
|
+
// Map SSML gender to our gender type
|
|
571
|
+
const genderMap: Record<string, "male" | "female" | "neutral"> = {
|
|
572
|
+
MALE: "male",
|
|
573
|
+
FEMALE: "female",
|
|
574
|
+
NEUTRAL: "neutral",
|
|
575
|
+
};
|
|
576
|
+
const gender = genderMap[googleVoice.ssmlGender || "NEUTRAL"] || "neutral";
|
|
577
|
+
|
|
578
|
+
return {
|
|
579
|
+
id: voiceName,
|
|
580
|
+
name: voiceName,
|
|
581
|
+
language: googleVoice.languageCodes?.[0] || "Unknown",
|
|
582
|
+
languageCode: googleVoice.languageCodes?.[0] || "",
|
|
583
|
+
gender,
|
|
584
|
+
quality,
|
|
585
|
+
supportedFeatures: {
|
|
586
|
+
ssml: true,
|
|
587
|
+
emotions: false, // Google doesn't have built-in emotions
|
|
588
|
+
styles: false, // Google doesn't have speaking styles
|
|
589
|
+
},
|
|
590
|
+
providerMetadata: {
|
|
591
|
+
naturalSampleRateHertz: googleVoice.naturalSampleRateHertz,
|
|
592
|
+
languageCodes: googleVoice.languageCodes,
|
|
593
|
+
ssmlGender: googleVoice.ssmlGender,
|
|
594
|
+
},
|
|
595
|
+
};
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
/**
|
|
599
|
+
* Get Google Cloud TTS capabilities
|
|
600
|
+
*/
|
|
601
|
+
getCapabilities(): ServerProviderCapabilities {
|
|
602
|
+
return {
|
|
603
|
+
// W3C Standard features
|
|
604
|
+
standard: {
|
|
605
|
+
supportsSSML: true, // ✅ Full SSML 1.1 support
|
|
606
|
+
supportsPitch: true, // ✅ Via SSML <prosody pitch>
|
|
607
|
+
supportsRate: true, // ✅ Via SSML <prosody rate>
|
|
608
|
+
supportsVolume: false, // ❌ Not supported (handle client-side)
|
|
609
|
+
supportsMultipleVoices: true, // ✅ 200+ voices across 50+ languages
|
|
610
|
+
maxTextLength: 5000, // Google Cloud TTS limit per request
|
|
611
|
+
},
|
|
612
|
+
|
|
613
|
+
// Provider-specific extensions
|
|
614
|
+
extensions: {
|
|
615
|
+
supportsSpeechMarks: true, // ✅ Via SSML marks + timepoints
|
|
616
|
+
supportedFormats: ["mp3", "wav", "ogg"], // MP3, LINEAR16, OGG_OPUS
|
|
617
|
+
supportsSampleRate: true, // ✅ Configurable sample rate
|
|
618
|
+
|
|
619
|
+
// Google Cloud-specific features
|
|
620
|
+
providerSpecific: {
|
|
621
|
+
voiceTypes: ["standard", "wavenet", "studio"],
|
|
622
|
+
voicesCount: 200, // ~200+ voices available
|
|
623
|
+
languagesCount: 50, // 50+ languages supported
|
|
624
|
+
supportsAudioProfiles: true, // Audio device profiles
|
|
625
|
+
supportsEffects: false, // No built-in effects
|
|
626
|
+
supportsEmotions: false, // No emotion control
|
|
627
|
+
supportsStyles: false, // No speaking styles
|
|
628
|
+
},
|
|
629
|
+
},
|
|
630
|
+
};
|
|
631
|
+
}
|
|
632
|
+
|
|
633
|
+
/**
|
|
634
|
+
* Map Google Cloud errors to TTSError codes
|
|
635
|
+
*/
|
|
636
|
+
private mapGoogleErrorToTTSError(error: any): TTSError {
|
|
637
|
+
const message = error.message || String(error);
|
|
638
|
+
|
|
639
|
+
// Check for specific Google Cloud error codes
|
|
640
|
+
if (error.code === 7) {
|
|
641
|
+
// PERMISSION_DENIED
|
|
642
|
+
return new TTSError(
|
|
643
|
+
TTSErrorCode.AUTHENTICATION_ERROR,
|
|
644
|
+
`Google Cloud authentication failed: ${message}`,
|
|
645
|
+
{ error },
|
|
646
|
+
this.providerId,
|
|
647
|
+
);
|
|
648
|
+
}
|
|
649
|
+
|
|
650
|
+
if (error.code === 8) {
|
|
651
|
+
// RESOURCE_EXHAUSTED
|
|
652
|
+
return new TTSError(
|
|
653
|
+
TTSErrorCode.RATE_LIMIT_EXCEEDED,
|
|
654
|
+
`Google Cloud rate limit exceeded: ${message}`,
|
|
655
|
+
{ error },
|
|
656
|
+
this.providerId,
|
|
657
|
+
);
|
|
658
|
+
}
|
|
659
|
+
|
|
660
|
+
if (error.code === 3) {
|
|
661
|
+
// INVALID_ARGUMENT
|
|
662
|
+
return new TTSError(
|
|
663
|
+
TTSErrorCode.INVALID_REQUEST,
|
|
664
|
+
`Invalid request to Google Cloud TTS: ${message}`,
|
|
665
|
+
{ error },
|
|
666
|
+
this.providerId,
|
|
667
|
+
);
|
|
668
|
+
}
|
|
669
|
+
|
|
670
|
+
// Default to provider error
|
|
671
|
+
return new TTSError(
|
|
672
|
+
TTSErrorCode.PROVIDER_ERROR,
|
|
673
|
+
`Google Cloud TTS error: ${message}`,
|
|
674
|
+
{ error },
|
|
675
|
+
this.providerId,
|
|
676
|
+
);
|
|
677
|
+
}
|
|
678
|
+
|
|
679
|
+
/**
|
|
680
|
+
* Clean up Google Cloud TTS client
|
|
681
|
+
*/
|
|
682
|
+
async destroy(): Promise<void> {
|
|
683
|
+
if (this.client) {
|
|
684
|
+
await this.client.close();
|
|
685
|
+
}
|
|
686
|
+
await super.destroy();
|
|
687
|
+
}
|
|
688
|
+
}
|
package/src/index.ts
ADDED