@pie-players/tts-server-google 0.1.0 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -3
- package/.turbo/turbo-build.log +0 -1
- package/examples/INTEGRATION-GUIDE.md +0 -532
- package/src/GoogleCloudTTSProvider.ts +0 -688
- package/src/index.ts +0 -7
- package/tsconfig.json +0 -9
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pie-players/tts-server-google",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.1.3",
|
|
4
4
|
"description": "Google Cloud Text-to-Speech provider for server-side TTS with speech marks support",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -11,10 +11,13 @@
|
|
|
11
11
|
"import": "./dist/index.js"
|
|
12
12
|
}
|
|
13
13
|
},
|
|
14
|
+
"files": [
|
|
15
|
+
"dist"
|
|
16
|
+
],
|
|
14
17
|
"scripts": {
|
|
15
18
|
"build": "tsc",
|
|
16
19
|
"dev": "tsc --watch",
|
|
17
|
-
"test": "vitest",
|
|
20
|
+
"test": "vitest run --passWithNoTests",
|
|
18
21
|
"test:coverage": "vitest --coverage"
|
|
19
22
|
},
|
|
20
23
|
"keywords": [
|
|
@@ -29,7 +32,7 @@
|
|
|
29
32
|
"license": "MIT",
|
|
30
33
|
"dependencies": {
|
|
31
34
|
"@google-cloud/text-to-speech": "^5.0.0",
|
|
32
|
-
"@pie-players/tts-server-core": "
|
|
35
|
+
"@pie-players/tts-server-core": "0.1.3"
|
|
33
36
|
},
|
|
34
37
|
"devDependencies": {
|
|
35
38
|
"typescript": "^5.3.3",
|
package/.turbo/turbo-build.log
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
$ tsc
|
|
package/examples/INTEGRATION-GUIDE.md
DELETED
|
@@ -1,532 +0,0 @@
|
|
|
1
|
-
# Google Cloud TTS Integration Guide
|
|
2
|
-
|
|
3
|
-
This guide shows how to integrate the Google Cloud Text-to-Speech provider into your server-side application.
|
|
4
|
-
|
|
5
|
-
## Prerequisites
|
|
6
|
-
|
|
7
|
-
1. **Google Cloud Project**: Create a project at [console.cloud.google.com](https://console.cloud.google.com)
|
|
8
|
-
2. **Enable Text-to-Speech API**: Go to APIs & Services → Enable "Cloud Text-to-Speech API"
|
|
9
|
-
3. **Authentication**: Set up one of the authentication methods below
|
|
10
|
-
|
|
11
|
-
## Authentication Setup
|
|
12
|
-
|
|
13
|
-
### Option 1: Service Account (Recommended for Production)
|
|
14
|
-
|
|
15
|
-
1. Go to IAM & Admin → Service Accounts
|
|
16
|
-
2. Create a service account
|
|
17
|
-
3. Grant the role: "Cloud Text-to-Speech User"
|
|
18
|
-
4. Create and download a JSON key file
|
|
19
|
-
|
|
20
|
-
```typescript
|
|
21
|
-
import { GoogleCloudTTSProvider } from '@pie-players/tts-server-google';
|
|
22
|
-
|
|
23
|
-
const provider = new GoogleCloudTTSProvider();
|
|
24
|
-
|
|
25
|
-
await provider.initialize({
|
|
26
|
-
projectId: 'your-project-id',
|
|
27
|
-
credentials: './config/service-account.json',
|
|
28
|
-
voiceType: 'wavenet',
|
|
29
|
-
});
|
|
30
|
-
```
|
|
31
|
-
|
|
32
|
-
### Option 2: API Key (Simple but Less Secure)
|
|
33
|
-
|
|
34
|
-
1. Go to APIs & Services → Credentials
|
|
35
|
-
2. Create credentials → API Key
|
|
36
|
-
3. Restrict the key to "Cloud Text-to-Speech API"
|
|
37
|
-
|
|
38
|
-
```typescript
|
|
39
|
-
await provider.initialize({
|
|
40
|
-
projectId: 'your-project-id',
|
|
41
|
-
credentials: {
|
|
42
|
-
apiKey: process.env.GOOGLE_TTS_API_KEY!,
|
|
43
|
-
},
|
|
44
|
-
});
|
|
45
|
-
```
|
|
46
|
-
|
|
47
|
-
### Option 3: Application Default Credentials (Local Development)
|
|
48
|
-
|
|
49
|
-
1. Install Google Cloud SDK
|
|
50
|
-
2. Run: `gcloud auth application-default login`
|
|
51
|
-
|
|
52
|
-
```typescript
|
|
53
|
-
await provider.initialize({
|
|
54
|
-
projectId: 'your-project-id',
|
|
55
|
-
// No credentials needed - uses ADC
|
|
56
|
-
});
|
|
57
|
-
```
|
|
58
|
-
|
|
59
|
-
## SvelteKit Integration
|
|
60
|
-
|
|
61
|
-
### 1. Install Dependencies
|
|
62
|
-
|
|
63
|
-
```bash
|
|
64
|
-
npm install @pie-players/tts-server-google
|
|
65
|
-
```
|
|
66
|
-
|
|
67
|
-
### 2. Create TTS API Route
|
|
68
|
-
|
|
69
|
-
**File: `src/routes/api/tts/synthesize/+server.ts`**
|
|
70
|
-
|
|
71
|
-
```typescript
|
|
72
|
-
import { GoogleCloudTTSProvider } from '@pie-players/tts-server-google';
|
|
73
|
-
import { json } from '@sveltejs/kit';
|
|
74
|
-
import type { RequestHandler } from './$types';
|
|
75
|
-
|
|
76
|
-
// Initialize provider once (singleton pattern)
|
|
77
|
-
let ttsProvider: GoogleCloudTTSProvider | null = null;
|
|
78
|
-
|
|
79
|
-
async function getTTSProvider() {
|
|
80
|
-
if (!ttsProvider) {
|
|
81
|
-
ttsProvider = new GoogleCloudTTSProvider();
|
|
82
|
-
await ttsProvider.initialize({
|
|
83
|
-
projectId: process.env.GOOGLE_CLOUD_PROJECT!,
|
|
84
|
-
credentials: process.env.GOOGLE_APPLICATION_CREDENTIALS,
|
|
85
|
-
voiceType: 'wavenet',
|
|
86
|
-
defaultVoice: 'en-US-Wavenet-A',
|
|
87
|
-
});
|
|
88
|
-
}
|
|
89
|
-
return ttsProvider;
|
|
90
|
-
}
|
|
91
|
-
|
|
92
|
-
export const POST: RequestHandler = async ({ request }) => {
|
|
93
|
-
try {
|
|
94
|
-
const { text, voice, includeSpeechMarks } = await request.json();
|
|
95
|
-
|
|
96
|
-
const provider = await getTTSProvider();
|
|
97
|
-
|
|
98
|
-
const result = await provider.synthesize({
|
|
99
|
-
text,
|
|
100
|
-
voice,
|
|
101
|
-
includeSpeechMarks: includeSpeechMarks ?? true,
|
|
102
|
-
});
|
|
103
|
-
|
|
104
|
-
// Convert audio buffer to base64 for JSON response
|
|
105
|
-
const audioBase64 = result.audio.toString('base64');
|
|
106
|
-
|
|
107
|
-
return json({
|
|
108
|
-
audio: audioBase64,
|
|
109
|
-
contentType: result.contentType,
|
|
110
|
-
speechMarks: result.speechMarks,
|
|
111
|
-
metadata: result.metadata,
|
|
112
|
-
});
|
|
113
|
-
} catch (error) {
|
|
114
|
-
console.error('TTS synthesis failed:', error);
|
|
115
|
-
return json(
|
|
116
|
-
{ error: error instanceof Error ? error.message : 'TTS synthesis failed' },
|
|
117
|
-
{ status: 500 }
|
|
118
|
-
);
|
|
119
|
-
}
|
|
120
|
-
};
|
|
121
|
-
```
|
|
122
|
-
|
|
123
|
-
### 3. Create Voices API Route
|
|
124
|
-
|
|
125
|
-
**File: `src/routes/api/tts/voices/+server.ts`**
|
|
126
|
-
|
|
127
|
-
```typescript
|
|
128
|
-
import { GoogleCloudTTSProvider } from '@pie-players/tts-server-google';
|
|
129
|
-
import { json } from '@sveltejs/kit';
|
|
130
|
-
import type { RequestHandler } from './$types';
|
|
131
|
-
|
|
132
|
-
let ttsProvider: GoogleCloudTTSProvider | null = null;
|
|
133
|
-
|
|
134
|
-
async function getTTSProvider() {
|
|
135
|
-
if (!ttsProvider) {
|
|
136
|
-
ttsProvider = new GoogleCloudTTSProvider();
|
|
137
|
-
await ttsProvider.initialize({
|
|
138
|
-
projectId: process.env.GOOGLE_CLOUD_PROJECT!,
|
|
139
|
-
credentials: process.env.GOOGLE_APPLICATION_CREDENTIALS,
|
|
140
|
-
voiceType: 'wavenet',
|
|
141
|
-
});
|
|
142
|
-
}
|
|
143
|
-
return ttsProvider;
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
export const GET: RequestHandler = async ({ url }) => {
|
|
147
|
-
try {
|
|
148
|
-
const language = url.searchParams.get('language') || undefined;
|
|
149
|
-
const gender = url.searchParams.get('gender') as 'male' | 'female' | 'neutral' | undefined;
|
|
150
|
-
|
|
151
|
-
const provider = await getTTSProvider();
|
|
152
|
-
|
|
153
|
-
const voices = await provider.getVoices({ language, gender });
|
|
154
|
-
|
|
155
|
-
return json({ voices });
|
|
156
|
-
} catch (error) {
|
|
157
|
-
console.error('Failed to fetch voices:', error);
|
|
158
|
-
return json(
|
|
159
|
-
{ error: error instanceof Error ? error.message : 'Failed to fetch voices' },
|
|
160
|
-
{ status: 500 }
|
|
161
|
-
);
|
|
162
|
-
}
|
|
163
|
-
};
|
|
164
|
-
```
|
|
165
|
-
|
|
166
|
-
### 4. Environment Variables
|
|
167
|
-
|
|
168
|
-
**File: `.env`**
|
|
169
|
-
|
|
170
|
-
```bash
|
|
171
|
-
# Google Cloud Configuration
|
|
172
|
-
GOOGLE_CLOUD_PROJECT=your-project-id
|
|
173
|
-
GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
|
|
174
|
-
|
|
175
|
-
# Or use API key
|
|
176
|
-
# GOOGLE_TTS_API_KEY=AIza...
|
|
177
|
-
```
|
|
178
|
-
|
|
179
|
-
### 5. Client-Side Usage
|
|
180
|
-
|
|
181
|
-
**File: `src/lib/tts-client.ts`**
|
|
182
|
-
|
|
183
|
-
```typescript
|
|
184
|
-
export interface TTSResult {
|
|
185
|
-
audio: string; // Base64 encoded
|
|
186
|
-
contentType: string;
|
|
187
|
-
speechMarks: Array<{
|
|
188
|
-
time: number;
|
|
189
|
-
type: string;
|
|
190
|
-
start: number;
|
|
191
|
-
end: number;
|
|
192
|
-
value: string;
|
|
193
|
-
}>;
|
|
194
|
-
metadata: {
|
|
195
|
-
providerId: string;
|
|
196
|
-
voice: string;
|
|
197
|
-
duration: number;
|
|
198
|
-
};
|
|
199
|
-
}
|
|
200
|
-
|
|
201
|
-
export async function synthesizeSpeech(
|
|
202
|
-
text: string,
|
|
203
|
-
voice?: string
|
|
204
|
-
): Promise<TTSResult> {
|
|
205
|
-
const response = await fetch('/api/tts/synthesize', {
|
|
206
|
-
method: 'POST',
|
|
207
|
-
headers: { 'Content-Type': 'application/json' },
|
|
208
|
-
body: JSON.stringify({ text, voice, includeSpeechMarks: true }),
|
|
209
|
-
});
|
|
210
|
-
|
|
211
|
-
if (!response.ok) {
|
|
212
|
-
throw new Error(`TTS failed: ${response.statusText}`);
|
|
213
|
-
}
|
|
214
|
-
|
|
215
|
-
return response.json();
|
|
216
|
-
}
|
|
217
|
-
|
|
218
|
-
export async function playAudio(result: TTSResult): Promise<void> {
|
|
219
|
-
// Convert base64 to blob
|
|
220
|
-
const audioData = atob(result.audio);
|
|
221
|
-
const audioArray = new Uint8Array(audioData.length);
|
|
222
|
-
for (let i = 0; i < audioData.length; i++) {
|
|
223
|
-
audioArray[i] = audioData.charCodeAt(i);
|
|
224
|
-
}
|
|
225
|
-
const blob = new Blob([audioArray], { type: result.contentType });
|
|
226
|
-
|
|
227
|
-
// Play audio
|
|
228
|
-
const url = URL.createObjectURL(blob);
|
|
229
|
-
const audio = new Audio(url);
|
|
230
|
-
await audio.play();
|
|
231
|
-
|
|
232
|
-
// Clean up
|
|
233
|
-
audio.onended = () => URL.revokeObjectURL(url);
|
|
234
|
-
}
|
|
235
|
-
|
|
236
|
-
export async function getVoices(language?: string) {
|
|
237
|
-
const params = new URLSearchParams();
|
|
238
|
-
if (language) params.set('language', language);
|
|
239
|
-
|
|
240
|
-
const response = await fetch(`/api/tts/voices?${params}`);
|
|
241
|
-
const { voices } = await response.json();
|
|
242
|
-
return voices;
|
|
243
|
-
}
|
|
244
|
-
```
|
|
245
|
-
|
|
246
|
-
**File: `src/routes/+page.svelte`**
|
|
247
|
-
|
|
248
|
-
```svelte
|
|
249
|
-
<script lang="ts">
|
|
250
|
-
import { synthesizeSpeech, playAudio, getVoices } from '$lib/tts-client';
|
|
251
|
-
import { onMount } from 'svelte';
|
|
252
|
-
|
|
253
|
-
let text = 'Hello world, this is Google Cloud Text to Speech!';
|
|
254
|
-
let voice = 'en-US-Wavenet-A';
|
|
255
|
-
let voices: any[] = [];
|
|
256
|
-
let speechMarks: any[] = [];
|
|
257
|
-
let isPlaying = false;
|
|
258
|
-
|
|
259
|
-
onMount(async () => {
|
|
260
|
-
voices = await getVoices('en-US');
|
|
261
|
-
});
|
|
262
|
-
|
|
263
|
-
async function handleSpeak() {
|
|
264
|
-
try {
|
|
265
|
-
isPlaying = true;
|
|
266
|
-
const result = await synthesizeSpeech(text, voice);
|
|
267
|
-
speechMarks = result.speechMarks;
|
|
268
|
-
await playAudio(result);
|
|
269
|
-
} catch (error) {
|
|
270
|
-
console.error('Speech failed:', error);
|
|
271
|
-
alert('Speech synthesis failed');
|
|
272
|
-
} finally {
|
|
273
|
-
isPlaying = false;
|
|
274
|
-
}
|
|
275
|
-
}
|
|
276
|
-
</script>
|
|
277
|
-
|
|
278
|
-
<div class="container">
|
|
279
|
-
<h1>Google Cloud TTS Demo</h1>
|
|
280
|
-
|
|
281
|
-
<div class="controls">
|
|
282
|
-
<label>
|
|
283
|
-
Text to speak:
|
|
284
|
-
<textarea bind:value={text} rows="4"></textarea>
|
|
285
|
-
</label>
|
|
286
|
-
|
|
287
|
-
<label>
|
|
288
|
-
Voice:
|
|
289
|
-
<select bind:value={voice}>
|
|
290
|
-
{#each voices as v}
|
|
291
|
-
<option value={v.id}>{v.name} ({v.gender})</option>
|
|
292
|
-
{/each}
|
|
293
|
-
</select>
|
|
294
|
-
</label>
|
|
295
|
-
|
|
296
|
-
<button on:click={handleSpeak} disabled={isPlaying}>
|
|
297
|
-
{isPlaying ? 'Speaking...' : 'Speak'}
|
|
298
|
-
</button>
|
|
299
|
-
</div>
|
|
300
|
-
|
|
301
|
-
{#if speechMarks.length > 0}
|
|
302
|
-
<div class="speech-marks">
|
|
303
|
-
<h2>Speech Marks</h2>
|
|
304
|
-
<ul>
|
|
305
|
-
{#each speechMarks as mark}
|
|
306
|
-
<li>
|
|
307
|
-
{mark.value} ({mark.time}ms)
|
|
308
|
-
</li>
|
|
309
|
-
{/each}
|
|
310
|
-
</ul>
|
|
311
|
-
</div>
|
|
312
|
-
{/if}
|
|
313
|
-
</div>
|
|
314
|
-
|
|
315
|
-
<style>
|
|
316
|
-
.container {
|
|
317
|
-
max-width: 800px;
|
|
318
|
-
margin: 2rem auto;
|
|
319
|
-
padding: 2rem;
|
|
320
|
-
}
|
|
321
|
-
|
|
322
|
-
.controls {
|
|
323
|
-
display: flex;
|
|
324
|
-
flex-direction: column;
|
|
325
|
-
gap: 1rem;
|
|
326
|
-
}
|
|
327
|
-
|
|
328
|
-
textarea {
|
|
329
|
-
width: 100%;
|
|
330
|
-
padding: 0.5rem;
|
|
331
|
-
font-family: inherit;
|
|
332
|
-
}
|
|
333
|
-
|
|
334
|
-
select {
|
|
335
|
-
width: 100%;
|
|
336
|
-
padding: 0.5rem;
|
|
337
|
-
}
|
|
338
|
-
|
|
339
|
-
button {
|
|
340
|
-
padding: 0.75rem 1.5rem;
|
|
341
|
-
background: #4285f4;
|
|
342
|
-
color: white;
|
|
343
|
-
border: none;
|
|
344
|
-
border-radius: 4px;
|
|
345
|
-
cursor: pointer;
|
|
346
|
-
font-size: 1rem;
|
|
347
|
-
}
|
|
348
|
-
|
|
349
|
-
button:disabled {
|
|
350
|
-
background: #ccc;
|
|
351
|
-
cursor: not-allowed;
|
|
352
|
-
}
|
|
353
|
-
|
|
354
|
-
.speech-marks {
|
|
355
|
-
margin-top: 2rem;
|
|
356
|
-
padding: 1rem;
|
|
357
|
-
background: #f5f5f5;
|
|
358
|
-
border-radius: 4px;
|
|
359
|
-
}
|
|
360
|
-
|
|
361
|
-
.speech-marks ul {
|
|
362
|
-
list-style: none;
|
|
363
|
-
padding: 0;
|
|
364
|
-
}
|
|
365
|
-
|
|
366
|
-
.speech-marks li {
|
|
367
|
-
padding: 0.25rem;
|
|
368
|
-
font-family: monospace;
|
|
369
|
-
}
|
|
370
|
-
</style>
|
|
371
|
-
```
|
|
372
|
-
|
|
373
|
-
## Express.js Integration
|
|
374
|
-
|
|
375
|
-
```typescript
|
|
376
|
-
import express from 'express';
|
|
377
|
-
import { GoogleCloudTTSProvider } from '@pie-players/tts-server-google';
|
|
378
|
-
|
|
379
|
-
const app = express();
|
|
380
|
-
app.use(express.json());
|
|
381
|
-
|
|
382
|
-
// Initialize provider
|
|
383
|
-
const ttsProvider = new GoogleCloudTTSProvider();
|
|
384
|
-
await ttsProvider.initialize({
|
|
385
|
-
projectId: process.env.GOOGLE_CLOUD_PROJECT!,
|
|
386
|
-
credentials: process.env.GOOGLE_APPLICATION_CREDENTIALS,
|
|
387
|
-
voiceType: 'wavenet',
|
|
388
|
-
});
|
|
389
|
-
|
|
390
|
-
// Synthesize endpoint
|
|
391
|
-
app.post('/api/tts/synthesize', async (req, res) => {
|
|
392
|
-
try {
|
|
393
|
-
const { text, voice, includeSpeechMarks } = req.body;
|
|
394
|
-
|
|
395
|
-
const result = await ttsProvider.synthesize({
|
|
396
|
-
text,
|
|
397
|
-
voice,
|
|
398
|
-
includeSpeechMarks: includeSpeechMarks ?? true,
|
|
399
|
-
});
|
|
400
|
-
|
|
401
|
-
// Return audio as buffer
|
|
402
|
-
res.json({
|
|
403
|
-
audio: result.audio.toString('base64'),
|
|
404
|
-
contentType: result.contentType,
|
|
405
|
-
speechMarks: result.speechMarks,
|
|
406
|
-
metadata: result.metadata,
|
|
407
|
-
});
|
|
408
|
-
} catch (error) {
|
|
409
|
-
console.error('TTS synthesis failed:', error);
|
|
410
|
-
res.status(500).json({ error: 'TTS synthesis failed' });
|
|
411
|
-
}
|
|
412
|
-
});
|
|
413
|
-
|
|
414
|
-
// Voices endpoint
|
|
415
|
-
app.get('/api/tts/voices', async (req, res) => {
|
|
416
|
-
try {
|
|
417
|
-
const { language, gender } = req.query;
|
|
418
|
-
|
|
419
|
-
const voices = await ttsProvider.getVoices({
|
|
420
|
-
language: language as string,
|
|
421
|
-
gender: gender as 'male' | 'female' | 'neutral',
|
|
422
|
-
});
|
|
423
|
-
|
|
424
|
-
res.json({ voices });
|
|
425
|
-
} catch (error) {
|
|
426
|
-
console.error('Failed to fetch voices:', error);
|
|
427
|
-
res.status(500).json({ error: 'Failed to fetch voices' });
|
|
428
|
-
}
|
|
429
|
-
});
|
|
430
|
-
|
|
431
|
-
app.listen(3000, () => {
|
|
432
|
-
console.log('Server running on http://localhost:3000');
|
|
433
|
-
});
|
|
434
|
-
```
|
|
435
|
-
|
|
436
|
-
## Next.js App Router Integration
|
|
437
|
-
|
|
438
|
-
**File: `app/api/tts/synthesize/route.ts`**
|
|
439
|
-
|
|
440
|
-
```typescript
|
|
441
|
-
import { GoogleCloudTTSProvider } from '@pie-players/tts-server-google';
|
|
442
|
-
import { NextResponse } from 'next/server';
|
|
443
|
-
|
|
444
|
-
let ttsProvider: GoogleCloudTTSProvider | null = null;
|
|
445
|
-
|
|
446
|
-
async function getTTSProvider() {
|
|
447
|
-
if (!ttsProvider) {
|
|
448
|
-
ttsProvider = new GoogleCloudTTSProvider();
|
|
449
|
-
await ttsProvider.initialize({
|
|
450
|
-
projectId: process.env.GOOGLE_CLOUD_PROJECT!,
|
|
451
|
-
credentials: process.env.GOOGLE_APPLICATION_CREDENTIALS,
|
|
452
|
-
voiceType: 'wavenet',
|
|
453
|
-
});
|
|
454
|
-
}
|
|
455
|
-
return ttsProvider;
|
|
456
|
-
}
|
|
457
|
-
|
|
458
|
-
export async function POST(request: Request) {
|
|
459
|
-
try {
|
|
460
|
-
const { text, voice, includeSpeechMarks } = await request.json();
|
|
461
|
-
|
|
462
|
-
const provider = await getTTSProvider();
|
|
463
|
-
|
|
464
|
-
const result = await provider.synthesize({
|
|
465
|
-
text,
|
|
466
|
-
voice,
|
|
467
|
-
includeSpeechMarks: includeSpeechMarks ?? true,
|
|
468
|
-
});
|
|
469
|
-
|
|
470
|
-
return NextResponse.json({
|
|
471
|
-
audio: result.audio.toString('base64'),
|
|
472
|
-
contentType: result.contentType,
|
|
473
|
-
speechMarks: result.speechMarks,
|
|
474
|
-
metadata: result.metadata,
|
|
475
|
-
});
|
|
476
|
-
} catch (error) {
|
|
477
|
-
console.error('TTS synthesis failed:', error);
|
|
478
|
-
return NextResponse.json(
|
|
479
|
-
{ error: 'TTS synthesis failed' },
|
|
480
|
-
{ status: 500 }
|
|
481
|
-
);
|
|
482
|
-
}
|
|
483
|
-
}
|
|
484
|
-
```
|
|
485
|
-
|
|
486
|
-
## Security Best Practices
|
|
487
|
-
|
|
488
|
-
1. **Never expose API keys in client code** - Always use server-side endpoints
|
|
489
|
-
2. **Restrict API keys** - Limit to specific APIs and IP addresses
|
|
490
|
-
3. **Use service accounts in production** - More secure than API keys
|
|
491
|
-
4. **Rate limiting** - Implement rate limiting to prevent abuse
|
|
492
|
-
5. **Cache results** - Cache TTS output to reduce API calls and costs
|
|
493
|
-
6. **Validate input** - Sanitize and validate user input before synthesis
|
|
494
|
-
|
|
495
|
-
## Cost Optimization
|
|
496
|
-
|
|
497
|
-
1. **Cache frequently used phrases** - Store audio for common text
|
|
498
|
-
2. **Use standard voices when possible** - $4/1M vs $16/1M for neural
|
|
499
|
-
3. **Batch requests** - Group multiple synthesis requests when feasible
|
|
500
|
-
4. **Monitor usage** - Set up billing alerts in Google Cloud Console
|
|
501
|
-
|
|
502
|
-
## Troubleshooting
|
|
503
|
-
|
|
504
|
-
### Authentication Errors
|
|
505
|
-
|
|
506
|
-
```
|
|
507
|
-
Error: Google Cloud authentication failed
|
|
508
|
-
```
|
|
509
|
-
|
|
510
|
-
**Solution**: Verify your credentials are correct and the service account has the "Cloud Text-to-Speech User" role.
|
|
511
|
-
|
|
512
|
-
### Rate Limit Exceeded
|
|
513
|
-
|
|
514
|
-
```
|
|
515
|
-
Error: Google Cloud rate limit exceeded
|
|
516
|
-
```
|
|
517
|
-
|
|
518
|
-
**Solution**: Implement exponential backoff and request rate limiting. Consider increasing your quota in Google Cloud Console.
|
|
519
|
-
|
|
520
|
-
### No Audio Content
|
|
521
|
-
|
|
522
|
-
```
|
|
523
|
-
Error: No audio content received from Google Cloud TTS
|
|
524
|
-
```
|
|
525
|
-
|
|
526
|
-
**Solution**: Check that your project has the Text-to-Speech API enabled and your billing is active.
|
|
527
|
-
|
|
528
|
-
## Support
|
|
529
|
-
|
|
530
|
-
For issues specific to this package, please file an issue on GitHub.
|
|
531
|
-
|
|
532
|
-
For Google Cloud TTS API issues, see the [official documentation](https://cloud.google.com/text-to-speech/docs).
|
|
package/src/GoogleCloudTTSProvider.ts
DELETED
|
@@ -1,688 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Google Cloud Text-to-Speech server-side TTS provider
|
|
3
|
-
* @module @pie-players/tts-server-google
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
import { v1beta1, protos } from "@google-cloud/text-to-speech";
|
|
7
|
-
|
|
8
|
-
import {
|
|
9
|
-
BaseTTSProvider,
|
|
10
|
-
type GetVoicesOptions,
|
|
11
|
-
type ServerProviderCapabilities,
|
|
12
|
-
type SpeechMark,
|
|
13
|
-
type SynthesizeRequest,
|
|
14
|
-
type SynthesizeResponse,
|
|
15
|
-
TTSError,
|
|
16
|
-
TTSErrorCode,
|
|
17
|
-
type TTSServerConfig,
|
|
18
|
-
type Voice,
|
|
19
|
-
} from "@pie-players/tts-server-core";
|
|
20
|
-
|
|
21
|
-
/**
|
|
22
|
-
* Google Cloud Text-to-Speech provider configuration.
|
|
23
|
-
*
|
|
24
|
-
* This extends the base TTSServerConfig with Google Cloud-specific settings.
|
|
25
|
-
*/
|
|
26
|
-
export interface GoogleCloudTTSConfig extends TTSServerConfig {
|
|
27
|
-
/**
|
|
28
|
-
* Google Cloud project ID (required)
|
|
29
|
-
*
|
|
30
|
-
* @example 'my-project-123456'
|
|
31
|
-
* @required
|
|
32
|
-
*/
|
|
33
|
-
projectId: string;
|
|
34
|
-
|
|
35
|
-
/**
|
|
36
|
-
* Authentication credentials
|
|
37
|
-
*
|
|
38
|
-
* Supports multiple authentication methods:
|
|
39
|
-
* - Service account JSON file path (recommended for production)
|
|
40
|
-
* - Service account key object (for containers/serverless)
|
|
41
|
-
* - API key (for simple applications)
|
|
42
|
-
* - Omit to use Application Default Credentials (ADC) for local development
|
|
43
|
-
*
|
|
44
|
-
* @example '/path/to/service-account.json'
|
|
45
|
-
* @example { client_email: '...', private_key: '...' }
|
|
46
|
-
* @example { apiKey: 'AIza...' }
|
|
47
|
-
* @see https://cloud.google.com/docs/authentication
|
|
48
|
-
*/
|
|
49
|
-
credentials?:
|
|
50
|
-
| string // Path to service account JSON file
|
|
51
|
-
| {
|
|
52
|
-
// Service account key object
|
|
53
|
-
client_email: string;
|
|
54
|
-
private_key: string;
|
|
55
|
-
project_id?: string;
|
|
56
|
-
}
|
|
57
|
-
| {
|
|
58
|
-
// API key
|
|
59
|
-
apiKey: string;
|
|
60
|
-
};
|
|
61
|
-
|
|
62
|
-
/**
|
|
63
|
-
* Voice type: 'wavenet' (neural), 'standard', or 'studio' (premium)
|
|
64
|
-
*
|
|
65
|
-
* @default 'wavenet'
|
|
66
|
-
* @note WaveNet: $16/1M chars, Standard: $4/1M chars, Studio: $16/1M chars
|
|
67
|
-
*/
|
|
68
|
-
voiceType?: "wavenet" | "standard" | "studio";
|
|
69
|
-
|
|
70
|
-
/**
|
|
71
|
-
* Default voice name if not specified in synthesis requests
|
|
72
|
-
*
|
|
73
|
-
* @default 'en-US-Wavenet-A'
|
|
74
|
-
* @example 'en-US-Wavenet-A', 'en-GB-Standard-B', 'es-ES-Studio-C'
|
|
75
|
-
* @see https://cloud.google.com/text-to-speech/docs/voices
|
|
76
|
-
*/
|
|
77
|
-
defaultVoice?: string;
|
|
78
|
-
|
|
79
|
-
/**
|
|
80
|
-
* Audio encoding format
|
|
81
|
-
*
|
|
82
|
-
* @default 'MP3'
|
|
83
|
-
*/
|
|
84
|
-
audioEncoding?: "MP3" | "LINEAR16" | "OGG_OPUS";
|
|
85
|
-
|
|
86
|
-
/**
|
|
87
|
-
* Enable detailed logging for debugging
|
|
88
|
-
*
|
|
89
|
-
* @default false
|
|
90
|
-
*/
|
|
91
|
-
enableLogging?: boolean;
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
/**
|
|
95
|
-
* Google Cloud Text-to-Speech Server Provider
|
|
96
|
-
*
|
|
97
|
-
* Provides high-quality neural text-to-speech with precise word-level timing
|
|
98
|
-
* through Google Cloud Text-to-Speech API.
|
|
99
|
-
*
|
|
100
|
-
* Features:
|
|
101
|
-
* - Speech marks support via SSML mark injection (millisecond precision)
|
|
102
|
-
* - WaveNet (neural), Standard, and Studio voice types
|
|
103
|
-
* - 200+ voices across 50+ languages
|
|
104
|
-
* - Full SSML support
|
|
105
|
-
* - Single API call for audio + speech marks
|
|
106
|
-
*/
|
|
107
|
-
export class GoogleCloudTTSProvider extends BaseTTSProvider {
|
|
108
|
-
readonly providerId = "google-cloud-tts";
|
|
109
|
-
readonly providerName = "Google Cloud Text-to-Speech";
|
|
110
|
-
readonly version = "1.0.0";
|
|
111
|
-
|
|
112
|
-
private client!: v1beta1.TextToSpeechClient;
|
|
113
|
-
private voiceType: "wavenet" | "standard" | "studio" = "wavenet";
|
|
114
|
-
private defaultVoice = "en-US-Wavenet-A";
|
|
115
|
-
private audioEncoding: "MP3" | "LINEAR16" | "OGG_OPUS" = "MP3";
|
|
116
|
-
private enableLogging = false;
|
|
117
|
-
|
|
118
|
-
/**
|
|
119
|
-
* Initialize the Google Cloud TTS provider.
|
|
120
|
-
*
|
|
121
|
-
* This is FAST and lightweight - only validates config and creates the client.
|
|
122
|
-
* Does NOT fetch voices or make test API calls.
|
|
123
|
-
*
|
|
124
|
-
* @param config - Google Cloud TTS configuration
|
|
125
|
-
* @performance Completes in ~10-50ms
|
|
126
|
-
*/
|
|
127
|
-
async initialize(config: GoogleCloudTTSConfig): Promise<void> {
|
|
128
|
-
if (!config.projectId) {
|
|
129
|
-
throw new TTSError(
|
|
130
|
-
TTSErrorCode.INITIALIZATION_ERROR,
|
|
131
|
-
"Google Cloud project ID is required",
|
|
132
|
-
undefined,
|
|
133
|
-
this.providerId,
|
|
134
|
-
);
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
this.config = config;
|
|
138
|
-
this.voiceType = config.voiceType || "wavenet";
|
|
139
|
-
this.defaultVoice = config.defaultVoice || "en-US-Wavenet-A";
|
|
140
|
-
this.audioEncoding = config.audioEncoding || "MP3";
|
|
141
|
-
this.enableLogging = config.enableLogging || false;
|
|
142
|
-
|
|
143
|
-
try {
|
|
144
|
-
// Initialize Google Cloud TTS client
|
|
145
|
-
const clientConfig: any = {
|
|
146
|
-
projectId: config.projectId,
|
|
147
|
-
};
|
|
148
|
-
|
|
149
|
-
// Handle different credential types
|
|
150
|
-
if (config.credentials) {
|
|
151
|
-
if (typeof config.credentials === "string") {
|
|
152
|
-
// Path to service account JSON file
|
|
153
|
-
clientConfig.keyFilename = config.credentials;
|
|
154
|
-
} else if ("apiKey" in config.credentials) {
|
|
155
|
-
// API key authentication
|
|
156
|
-
clientConfig.apiKey = config.credentials.apiKey;
|
|
157
|
-
} else {
|
|
158
|
-
// Service account key object
|
|
159
|
-
clientConfig.credentials = config.credentials;
|
|
160
|
-
}
|
|
161
|
-
}
|
|
162
|
-
// Else: Use Application Default Credentials (ADC)
|
|
163
|
-
|
|
164
|
-
this.client = new v1beta1.TextToSpeechClient(clientConfig);
|
|
165
|
-
this.initialized = true;
|
|
166
|
-
|
|
167
|
-
if (this.enableLogging) {
|
|
168
|
-
console.log("[GoogleCloudTTS] Initialized successfully");
|
|
169
|
-
}
|
|
170
|
-
} catch (error) {
|
|
171
|
-
throw new TTSError(
|
|
172
|
-
TTSErrorCode.INITIALIZATION_ERROR,
|
|
173
|
-
`Failed to initialize Google Cloud TTS: ${error instanceof Error ? error.message : String(error)}`,
|
|
174
|
-
{ error },
|
|
175
|
-
this.providerId,
|
|
176
|
-
);
|
|
177
|
-
}
|
|
178
|
-
}
|
|
179
|
-
|
|
180
|
-
/**
|
|
181
|
-
* Synthesize speech with Google Cloud TTS
|
|
182
|
-
*/
|
|
183
|
-
async synthesize(request: SynthesizeRequest): Promise<SynthesizeResponse> {
|
|
184
|
-
this.ensureInitialized();
|
|
185
|
-
|
|
186
|
-
const capabilities = this.getCapabilities();
|
|
187
|
-
this.validateRequest(request, capabilities);
|
|
188
|
-
|
|
189
|
-
const voice = request.voice || this.defaultVoice;
|
|
190
|
-
const startTime = Date.now();
|
|
191
|
-
|
|
192
|
-
try {
|
|
193
|
-
// Check if speech marks are requested
|
|
194
|
-
if (request.includeSpeechMarks !== false) {
|
|
195
|
-
// Use SSML marks injection for precise word timing
|
|
196
|
-
const result = await this.synthesizeWithSpeechMarks(request, voice);
|
|
197
|
-
const duration = (Date.now() - startTime) / 1000;
|
|
198
|
-
|
|
199
|
-
return {
|
|
200
|
-
audio: result.audio,
|
|
201
|
-
contentType: result.contentType,
|
|
202
|
-
speechMarks: result.speechMarks,
|
|
203
|
-
metadata: {
|
|
204
|
-
providerId: this.providerId,
|
|
205
|
-
voice,
|
|
206
|
-
duration,
|
|
207
|
-
charCount: request.text.length,
|
|
208
|
-
cached: false,
|
|
209
|
-
timestamp: new Date().toISOString(),
|
|
210
|
-
},
|
|
211
|
-
};
|
|
212
|
-
} else {
|
|
213
|
-
// Audio only (no speech marks)
|
|
214
|
-
const result = await this.synthesizeAudio(request, voice);
|
|
215
|
-
const duration = (Date.now() - startTime) / 1000;
|
|
216
|
-
|
|
217
|
-
return {
|
|
218
|
-
audio: result.audio,
|
|
219
|
-
contentType: result.contentType,
|
|
220
|
-
speechMarks: [],
|
|
221
|
-
metadata: {
|
|
222
|
-
providerId: this.providerId,
|
|
223
|
-
voice,
|
|
224
|
-
duration,
|
|
225
|
-
charCount: request.text.length,
|
|
226
|
-
cached: false,
|
|
227
|
-
timestamp: new Date().toISOString(),
|
|
228
|
-
},
|
|
229
|
-
};
|
|
230
|
-
}
|
|
231
|
-
} catch (error) {
|
|
232
|
-
throw this.mapGoogleErrorToTTSError(error);
|
|
233
|
-
}
|
|
234
|
-
}
|
|
235
|
-
|
|
236
|
-
/**
|
|
237
|
-
* Synthesize audio stream only (no speech marks)
|
|
238
|
-
*/
|
|
239
|
-
private async synthesizeAudio(
|
|
240
|
-
request: SynthesizeRequest,
|
|
241
|
-
voice: string,
|
|
242
|
-
): Promise<{ audio: Buffer; contentType: string }> {
|
|
243
|
-
// Detect if text contains SSML tags
|
|
244
|
-
const isSsml = this.detectSSML(request.text);
|
|
245
|
-
|
|
246
|
-
if (isSsml && this.enableLogging) {
|
|
247
|
-
console.log("[GoogleCloudTTS] Detected SSML content");
|
|
248
|
-
}
|
|
249
|
-
|
|
250
|
-
// Parse voice name to extract language code
|
|
251
|
-
const languageCode = voice.split("-").slice(0, 2).join("-"); // e.g., "en-US" from "en-US-Wavenet-A"
|
|
252
|
-
|
|
253
|
-
// Map our audio encoding to Google's enum
|
|
254
|
-
const audioEncodingMap = {
|
|
255
|
-
MP3: "MP3" as const,
|
|
256
|
-
LINEAR16: "LINEAR16" as const,
|
|
257
|
-
OGG_OPUS: "OGG_OPUS" as const,
|
|
258
|
-
};
|
|
259
|
-
|
|
260
|
-
const [response] = await this.client.synthesizeSpeech({
|
|
261
|
-
input: isSsml ? { ssml: request.text } : { text: request.text },
|
|
262
|
-
voice: {
|
|
263
|
-
languageCode,
|
|
264
|
-
name: voice,
|
|
265
|
-
},
|
|
266
|
-
audioConfig: {
|
|
267
|
-
audioEncoding: audioEncodingMap[this.audioEncoding],
|
|
268
|
-
sampleRateHertz: request.sampleRate || 24000,
|
|
269
|
-
},
|
|
270
|
-
});
|
|
271
|
-
|
|
272
|
-
if (!response.audioContent) {
|
|
273
|
-
throw new Error("No audio content received from Google Cloud TTS");
|
|
274
|
-
}
|
|
275
|
-
|
|
276
|
-
// Convert Uint8Array to Buffer
|
|
277
|
-
const audioBuffer = Buffer.from(response.audioContent);
|
|
278
|
-
|
|
279
|
-
const contentTypeMap = {
|
|
280
|
-
MP3: "audio/mpeg",
|
|
281
|
-
LINEAR16: "audio/wav",
|
|
282
|
-
OGG_OPUS: "audio/ogg",
|
|
283
|
-
};
|
|
284
|
-
|
|
285
|
-
return {
|
|
286
|
-
audio: audioBuffer,
|
|
287
|
-
contentType: contentTypeMap[this.audioEncoding],
|
|
288
|
-
};
|
|
289
|
-
}
|
|
290
|
-
|
|
291
|
-
/**
 * Synthesize audio and word-level speech marks in a single API call.
 *
 * The input is turned into SSML carrying one `<mark>` per word, either via
 * `extractWordsFromSSML` (when `detectSSML` reports markup) or via
 * `injectSSMLMarks` (plain text). The request enables SSML_MARK time
 * pointing, and the returned timepoints are converted to speech marks.
 *
 * @param request - Synthesis request; `text` and `sampleRate` are read here.
 * @param voice - Voice name; its first two dash-separated parts are sent as
 *   the language code.
 * @returns Audio bytes, the MIME type for the configured encoding, and the
 *   extracted speech marks.
 * @throws Error when the response carries no audio content.
 */
private async synthesizeWithSpeechMarks(
  request: SynthesizeRequest,
  voice: string,
): Promise<{
  audio: Buffer;
  contentType: string;
  speechMarks: SpeechMark[];
}> {
  // Choose the mark-injection path depending on whether the caller sent SSML.
  const marked = this.detectSSML(request.text)
    ? this.extractWordsFromSSML(request.text)
    : this.injectSSMLMarks(request.text);

  if (this.enableLogging) {
    console.log(`[GoogleCloudTTS] Injected ${marked.wordMap.length} SSML marks`);
  }

  // Lookup tables keyed by the provider's configured encoding.
  const encodingForGoogle = {
    MP3: "MP3" as const,
    LINEAR16: "LINEAR16" as const,
    OGG_OPUS: "OGG_OPUS" as const,
  };
  const mimeTypes = {
    MP3: "audio/mpeg",
    LINEAR16: "audio/wav",
    OGG_OPUS: "audio/ogg",
  };

  const { SSML_MARK } =
    protos.google.cloud.texttospeech.v1beta1.SynthesizeSpeechRequest
      .TimepointType;

  const [synthesis] = await this.client.synthesizeSpeech({
    input: { ssml: marked.ssml },
    voice: {
      // e.g. "en-US-Wavenet-A" -> "en-US"
      languageCode: voice.split("-").slice(0, 2).join("-"),
      name: voice,
    },
    audioConfig: {
      audioEncoding: encodingForGoogle[this.audioEncoding],
      sampleRateHertz: request.sampleRate || 24000,
    },
    enableTimePointing: [SSML_MARK],
  });

  if (!synthesis.audioContent) {
    throw new Error("No audio content received from Google Cloud TTS");
  }

  const speechMarks = this.extractSpeechMarksFromTimepoints(
    synthesis.timepoints ?? [],
    marked.wordMap,
  );

  if (this.enableLogging) {
    console.log(
      `[GoogleCloudTTS] Extracted ${speechMarks.length} speech marks`,
    );
  }

  return {
    audio: Buffer.from(synthesis.audioContent),
    contentType: mimeTypes[this.audioEncoding],
    speechMarks,
  };
}
|
|
374
|
-
|
|
375
|
-
/**
 * Wrap plain text in `<speak>` and place a `<mark name="wN"/>` before each word.
 *
 * A word is a run of letters, combining marks, digits and underscores that
 * may contain inner apostrophes (e.g. "don't"). Unicode property classes are
 * used instead of `\w`/`\b`, which only recognise ASCII and would split or
 * miss words such as "café" or non-Latin text.
 *
 * @param text - Plain (non-SSML) text to mark up.
 * @returns The generated SSML plus, for every word, its text, its start/end
 *   offsets in `text` (UTF-16 code units, end exclusive) and its mark name.
 */
private injectSSMLMarks(text: string): {
  ssml: string;
  wordMap: Array<{
    word: string;
    start: number;
    end: number;
    markName: string;
  }>;
} {
  const words: Array<{
    word: string;
    start: number;
    end: number;
    markName: string;
  }> = [];

  // Starts and ends on a word character; apostrophes are allowed only inside.
  const wordRegex =
    /[\p{L}\p{M}\p{N}_](?:[\p{L}\p{M}\p{N}_']*[\p{L}\p{M}\p{N}_])?/gu;

  const parts: string[] = ["<speak>"];
  let lastEnd = 0;
  let match: RegExpExecArray | null;

  while ((match = wordRegex.exec(text)) !== null) {
    const word = match[0];
    const start = match.index;
    const end = start + word.length;
    const markName = `w${words.length}`;

    words.push({ word, start, end, markName });

    // Text between the previous word and this one (whitespace, punctuation).
    parts.push(this.escapeSSML(text.slice(lastEnd, start)));
    // The mark precedes the word so its timepoint is the word's onset.
    parts.push(`<mark name="${markName}"/>${this.escapeSSML(word)}`);
    lastEnd = end;
  }

  // Trailing text after the last word, then close the document.
  parts.push(this.escapeSSML(text.slice(lastEnd)), "</speak>");

  return { ssml: parts.join(""), wordMap: words };
}
|
|
423
|
-
|
|
424
|
-
/**
 * Reduce caller-supplied SSML to plain text and inject word marks into it
 * (simplified v1 behaviour).
 *
 * All tags are removed, so the caller's markup (prosody, breaks, ...) is not
 * sent to the service, and the returned word offsets refer to the stripped
 * text rather than to `ssmlText`.
 *
 * XML character references in the text content are decoded before marks are
 * injected. Without this, `injectSSMLMarks` would escape them a second time
 * (`&amp;` -> `&amp;amp;`) and emit spurious words such as "amp".
 *
 * @param ssmlText - SSML document or fragment provided by the caller.
 * @returns Marked-up SSML and the word map produced by `injectSSMLMarks`.
 */
private extractWordsFromSSML(ssmlText: string): {
  ssml: string;
  wordMap: Array<{
    word: string;
    start: number;
    end: number;
    markName: string;
  }>;
} {
  const namedEntities: Record<string, string> = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
  };

  // Single pass so that "&amp;lt;" decodes to "&lt;" and not to "<".
  const decodeEntities = (value: string): string =>
    value.replace(
      /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g,
      (whole: string, dec?: string, hex?: string, named?: string) => {
        if (named !== undefined) {
          return namedEntities[named] ?? whole;
        }
        const codePoint =
          dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex ?? "", 16);
        // Leave out-of-range references untouched instead of throwing.
        return Number.isInteger(codePoint) && codePoint <= 0x10ffff
          ? String.fromCodePoint(codePoint)
          : whole;
      },
    );

  // Strip tags first so a decoded "<" is never mistaken for markup.
  const withoutTags = ssmlText.replace(/<[^>]+>/g, " ");
  const plainText = decodeEntities(withoutTags).replace(/\s+/g, " ").trim();

  return this.injectSSMLMarks(plainText);
}
|
|
445
|
-
|
|
446
|
-
/**
 * Escape the five XML special characters so arbitrary text can be embedded
 * in an SSML document.
 *
 * The replacement is done in a single pass, so an ampersand introduced by
 * one substitution is never escaped again.
 *
 * @param text - Raw text.
 * @returns `text` with `&`, `<`, `>`, `"` and `'` replaced by their entities.
 */
private escapeSSML(text: string): string {
  const entities: Record<string, string> = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&apos;",
  };

  return text.replace(/[&<>"']/g, (ch) => entities[ch] ?? ch);
}
|
|
457
|
-
|
|
458
|
-
/**
 * Convert the service's timepoints into word speech marks.
 *
 * Each timepoint is matched to a word by mark name. Timepoints with no mark
 * name, no time, or no matching word are skipped.
 *
 * @param timepoints - Timepoints from the synthesis response (may be absent).
 * @param wordMap - Words with their offsets and the mark name injected for each.
 * @returns Speech marks of type "word", with time in milliseconds, sorted by
 *   ascending time.
 */
private extractSpeechMarksFromTimepoints(
  timepoints:
    | protos.google.cloud.texttospeech.v1beta1.ITimepoint[]
    | null
    | undefined,
  wordMap: Array<{
    word: string;
    start: number;
    end: number;
    markName: string;
  }>,
): SpeechMark[] {
  if (!timepoints || timepoints.length === 0) {
    return [];
  }

  // Index words by mark name once instead of scanning wordMap per timepoint.
  // The first entry wins on duplicate names, matching Array.prototype.find.
  const wordsByMark = new Map<string, (typeof wordMap)[number]>();
  for (const entry of wordMap) {
    if (!wordsByMark.has(entry.markName)) {
      wordsByMark.set(entry.markName, entry);
    }
  }

  const speechMarks: SpeechMark[] = [];

  for (const { markName, timeSeconds } of timepoints) {
    if (markName == null || timeSeconds == null) {
      continue;
    }

    const wordInfo = wordsByMark.get(markName);
    if (!wordInfo) {
      continue;
    }

    speechMarks.push({
      time: Math.round(timeSeconds * 1000), // seconds -> milliseconds
      type: "word",
      start: wordInfo.start,
      end: wordInfo.end,
      value: wordInfo.word,
    });
  }

  // speechMarks is local, so sorting in place is safe.
  return speechMarks.sort((a, b) => a.time - b.time);
}
|
|
501
|
-
|
|
502
|
-
/**
 * Report whether the text appears to contain SSML.
 *
 * This is a substring check for the opening of a fixed set of SSML tags;
 * the text is not parsed.
 *
 * @param text - Text to inspect.
 * @returns `true` if any of the known tag openings occurs in `text`.
 */
private detectSSML(text: string): boolean {
  const tagOpenings = [
    "<speak",
    "<prosody",
    "<emphasis",
    "<break",
    "<phoneme",
    "<say-as",
    "<mark",
  ];

  return tagOpenings.some((opening) => text.includes(opening));
}
|
|
516
|
-
|
|
517
|
-
/**
 * List the voices offered by Google Cloud TTS, optionally filtered.
 *
 * `options.language` is forwarded to the service as the language code;
 * `options.gender` and `options.quality` are applied locally to the mapped
 * voices.
 *
 * @param options - Optional language, gender and quality filters.
 * @returns The matching voices, or an empty array when the response has no
 *   voice list.
 * @throws TTSError with code PROVIDER_ERROR when listing or mapping fails.
 */
async getVoices(options?: GetVoicesOptions): Promise<Voice[]> {
  this.ensureInitialized();

  try {
    const listing = (
      await this.client.listVoices({
        languageCode: options?.language,
      })
    )[0];

    if (!listing.voices) {
      return [];
    }

    // A filter that is not set accepts every voice.
    const passesFilters = (candidate: Voice): boolean => {
      const genderOk = !options?.gender || candidate.gender === options.gender;
      const qualityOk =
        !options?.quality || candidate.quality === options.quality;
      return genderOk && qualityOk;
    };

    const mapped = listing.voices.map((raw) => this.mapGoogleVoiceToVoice(raw));
    return mapped.filter(passesFilters);
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    throw new TTSError(
      TTSErrorCode.PROVIDER_ERROR,
      `Failed to get voices: ${detail}`,
      { error },
      this.providerId,
    );
  }
}
|
|
553
|
-
|
|
554
|
-
/**
 * Convert a Google Cloud voice description into the unified `Voice` shape.
 *
 * Quality is derived from the voice name: names containing "Wavenet" or
 * "Neural" (e.g. the Neural2 family) map to "neural", names containing
 * "Studio" map to "premium", and everything else is "standard". Neural2
 * voices were previously reported as "standard".
 *
 * @param googleVoice - Voice entry as returned by `listVoices`.
 * @returns The unified voice; the first language code is used for both
 *   `language` and `languageCode`, and the raw Google fields are kept in
 *   `providerMetadata`.
 */
private mapGoogleVoiceToVoice(
  googleVoice: protos.google.cloud.texttospeech.v1beta1.IVoice,
): Voice {
  const voiceName = googleVoice.name || "";

  let quality: "standard" | "neural" | "premium" = "standard";
  if (voiceName.includes("Wavenet") || voiceName.includes("Neural")) {
    quality = "neural";
  } else if (voiceName.includes("Studio")) {
    quality = "premium";
  }

  // Unknown or missing SSML genders fall back to "neutral".
  const genderMap: Record<string, "male" | "female" | "neutral"> = {
    MALE: "male",
    FEMALE: "female",
    NEUTRAL: "neutral",
  };
  const gender = genderMap[googleVoice.ssmlGender || "NEUTRAL"] || "neutral";

  const primaryLanguage = googleVoice.languageCodes?.[0];

  return {
    id: voiceName,
    name: voiceName,
    language: primaryLanguage || "Unknown",
    languageCode: primaryLanguage || "",
    gender,
    quality,
    supportedFeatures: {
      ssml: true,
      emotions: false,
      styles: false,
    },
    providerMetadata: {
      naturalSampleRateHertz: googleVoice.naturalSampleRateHertz,
      languageCodes: googleVoice.languageCodes,
      ssmlGender: googleVoice.ssmlGender,
    },
  };
}
|
|
597
|
-
|
|
598
|
-
/**
 * Describe what this provider supports.
 *
 * The result is a fixed description: it does not depend on provider state
 * and no service call is made.
 *
 * @returns Standard capability flags plus provider-specific extensions.
 */
getCapabilities(): ServerProviderCapabilities {
  // Standard feature flags.
  const standard: ServerProviderCapabilities["standard"] = {
    supportsSSML: true,
    supportsPitch: true, // via SSML <prosody pitch>
    supportsRate: true, // via SSML <prosody rate>
    supportsVolume: false, // left to the client
    supportsMultipleVoices: true,
    maxTextLength: 5000, // per-request limit
  };

  // Extensions beyond the standard set.
  const extensions: ServerProviderCapabilities["extensions"] = {
    supportsSpeechMarks: true, // SSML marks + timepoints
    supportedFormats: ["mp3", "wav", "ogg"], // MP3, LINEAR16, OGG_OPUS
    supportsSampleRate: true,

    // Details specific to Google Cloud; the counts are approximate.
    providerSpecific: {
      voiceTypes: ["standard", "wavenet", "studio"],
      voicesCount: 200,
      languagesCount: 50,
      supportsAudioProfiles: true,
      supportsEffects: false,
      supportsEmotions: false,
      supportsStyles: false,
    },
  };

  return { standard, extensions };
}
|
|
632
|
-
|
|
633
|
-
/**
 * Translate an error raised by the Google Cloud client into a `TTSError`.
 *
 * The numeric `code` on the error is interpreted as a gRPC status code:
 * - 7 (PERMISSION_DENIED) and 16 (UNAUTHENTICATED) -> AUTHENTICATION_ERROR
 * - 8 (RESOURCE_EXHAUSTED) -> RATE_LIMIT_EXCEEDED
 * - 3 (INVALID_ARGUMENT) -> INVALID_REQUEST
 * - anything else -> PROVIDER_ERROR
 *
 * Code 16 was previously reported as a generic provider error even though
 * it is the status for missing or invalid credentials.
 *
 * @param error - The caught value; may be any type, including null or undefined.
 * @returns A `TTSError` carrying the original error in its details.
 */
private mapGoogleErrorToTTSError(error: any): TTSError {
  // Optional chaining keeps a null/undefined error from throwing here.
  const message = error?.message || String(error);

  switch (error?.code) {
    case 7: // PERMISSION_DENIED
    case 16: // UNAUTHENTICATED
      return new TTSError(
        TTSErrorCode.AUTHENTICATION_ERROR,
        `Google Cloud authentication failed: ${message}`,
        { error },
        this.providerId,
      );

    case 8: // RESOURCE_EXHAUSTED
      return new TTSError(
        TTSErrorCode.RATE_LIMIT_EXCEEDED,
        `Google Cloud rate limit exceeded: ${message}`,
        { error },
        this.providerId,
      );

    case 3: // INVALID_ARGUMENT
      return new TTSError(
        TTSErrorCode.INVALID_REQUEST,
        `Invalid request to Google Cloud TTS: ${message}`,
        { error },
        this.providerId,
      );

    default:
      return new TTSError(
        TTSErrorCode.PROVIDER_ERROR,
        `Google Cloud TTS error: ${message}`,
        { error },
        this.providerId,
      );
  }
}
|
|
678
|
-
|
|
679
|
-
/**
 * Close the Google Cloud TTS client, then run the base class teardown.
 *
 * The base teardown runs in a `finally` block so it is not skipped when
 * closing the client rejects; the rejection still propagates to the caller.
 */
async destroy(): Promise<void> {
  try {
    if (this.client) {
      await this.client.close();
    }
  } finally {
    await super.destroy();
  }
}
|
|
688
|
-
}
|
package/src/index.ts
DELETED