@pie-players/tts-server-google 0.1.0 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pie-players/tts-server-google",
3
- "version": "0.1.0",
3
+ "version": "0.1.3",
4
4
  "description": "Google Cloud Text-to-Speech provider for server-side TTS with speech marks support",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -11,10 +11,13 @@
11
11
  "import": "./dist/index.js"
12
12
  }
13
13
  },
14
+ "files": [
15
+ "dist"
16
+ ],
14
17
  "scripts": {
15
18
  "build": "tsc",
16
19
  "dev": "tsc --watch",
17
- "test": "vitest",
20
+ "test": "vitest run --passWithNoTests",
18
21
  "test:coverage": "vitest --coverage"
19
22
  },
20
23
  "keywords": [
@@ -29,7 +32,7 @@
29
32
  "license": "MIT",
30
33
  "dependencies": {
31
34
  "@google-cloud/text-to-speech": "^5.0.0",
32
- "@pie-players/tts-server-core": "workspace:*"
35
+ "@pie-players/tts-server-core": "0.1.3"
33
36
  },
34
37
  "devDependencies": {
35
38
  "typescript": "^5.3.3",
@@ -1 +0,0 @@
1
- $ tsc
@@ -1,532 +0,0 @@
1
- # Google Cloud TTS Integration Guide
2
-
3
- This guide shows how to integrate the Google Cloud Text-to-Speech provider into your server-side application.
4
-
5
- ## Prerequisites
6
-
7
- 1. **Google Cloud Project**: Create a project at [console.cloud.google.com](https://console.cloud.google.com)
8
- 2. **Enable Text-to-Speech API**: Go to APIs & Services → Enable "Cloud Text-to-Speech API"
9
- 3. **Authentication**: Set up one of the authentication methods below
10
-
11
- ## Authentication Setup
12
-
13
- ### Option 1: Service Account (Recommended for Production)
14
-
15
- 1. Go to IAM & Admin → Service Accounts
16
- 2. Create a service account
17
- 3. Grant the role: "Cloud Text-to-Speech User"
18
- 4. Create and download a JSON key file
19
-
20
- ```typescript
21
- import { GoogleCloudTTSProvider } from '@pie-players/tts-server-google';
22
-
23
- const provider = new GoogleCloudTTSProvider();
24
-
25
- await provider.initialize({
26
- projectId: 'your-project-id',
27
- credentials: './config/service-account.json',
28
- voiceType: 'wavenet',
29
- });
30
- ```
31
-
32
- ### Option 2: API Key (Simple but Less Secure)
33
-
34
- 1. Go to APIs & Services → Credentials
35
- 2. Create credentials → API Key
36
- 3. Restrict the key to "Cloud Text-to-Speech API"
37
-
38
- ```typescript
39
- await provider.initialize({
40
- projectId: 'your-project-id',
41
- credentials: {
42
- apiKey: process.env.GOOGLE_TTS_API_KEY!,
43
- },
44
- });
45
- ```
46
-
47
- ### Option 3: Application Default Credentials (Local Development)
48
-
49
- 1. Install Google Cloud SDK
50
- 2. Run: `gcloud auth application-default login`
51
-
52
- ```typescript
53
- await provider.initialize({
54
- projectId: 'your-project-id',
55
- // No credentials needed - uses ADC
56
- });
57
- ```
58
-
59
- ## SvelteKit Integration
60
-
61
- ### 1. Install Dependencies
62
-
63
- ```bash
64
- npm install @pie-players/tts-server-google
65
- ```
66
-
67
- ### 2. Create TTS API Route
68
-
69
- **File: `src/routes/api/tts/synthesize/+server.ts`**
70
-
71
- ```typescript
72
- import { GoogleCloudTTSProvider } from '@pie-players/tts-server-google';
73
- import { json } from '@sveltejs/kit';
74
- import type { RequestHandler } from './$types';
75
-
76
- // Initialize provider once (singleton pattern)
77
- let ttsProvider: GoogleCloudTTSProvider | null = null;
78
-
79
- async function getTTSProvider() {
80
- if (!ttsProvider) {
81
- ttsProvider = new GoogleCloudTTSProvider();
82
- await ttsProvider.initialize({
83
- projectId: process.env.GOOGLE_CLOUD_PROJECT!,
84
- credentials: process.env.GOOGLE_APPLICATION_CREDENTIALS,
85
- voiceType: 'wavenet',
86
- defaultVoice: 'en-US-Wavenet-A',
87
- });
88
- }
89
- return ttsProvider;
90
- }
91
-
92
- export const POST: RequestHandler = async ({ request }) => {
93
- try {
94
- const { text, voice, includeSpeechMarks } = await request.json();
95
-
96
- const provider = await getTTSProvider();
97
-
98
- const result = await provider.synthesize({
99
- text,
100
- voice,
101
- includeSpeechMarks: includeSpeechMarks ?? true,
102
- });
103
-
104
- // Convert audio buffer to base64 for JSON response
105
- const audioBase64 = result.audio.toString('base64');
106
-
107
- return json({
108
- audio: audioBase64,
109
- contentType: result.contentType,
110
- speechMarks: result.speechMarks,
111
- metadata: result.metadata,
112
- });
113
- } catch (error) {
114
- console.error('TTS synthesis failed:', error);
115
- return json(
116
- { error: error instanceof Error ? error.message : 'TTS synthesis failed' },
117
- { status: 500 }
118
- );
119
- }
120
- };
121
- ```
122
-
123
- ### 3. Create Voices API Route
124
-
125
- **File: `src/routes/api/tts/voices/+server.ts`**
126
-
127
- ```typescript
128
- import { GoogleCloudTTSProvider } from '@pie-players/tts-server-google';
129
- import { json } from '@sveltejs/kit';
130
- import type { RequestHandler } from './$types';
131
-
132
- let ttsProvider: GoogleCloudTTSProvider | null = null;
133
-
134
- async function getTTSProvider() {
135
- if (!ttsProvider) {
136
- ttsProvider = new GoogleCloudTTSProvider();
137
- await ttsProvider.initialize({
138
- projectId: process.env.GOOGLE_CLOUD_PROJECT!,
139
- credentials: process.env.GOOGLE_APPLICATION_CREDENTIALS,
140
- voiceType: 'wavenet',
141
- });
142
- }
143
- return ttsProvider;
144
- }
145
-
146
- export const GET: RequestHandler = async ({ url }) => {
147
- try {
148
- const language = url.searchParams.get('language') || undefined;
149
- const gender = url.searchParams.get('gender') as 'male' | 'female' | 'neutral' | undefined;
150
-
151
- const provider = await getTTSProvider();
152
-
153
- const voices = await provider.getVoices({ language, gender });
154
-
155
- return json({ voices });
156
- } catch (error) {
157
- console.error('Failed to fetch voices:', error);
158
- return json(
159
- { error: error instanceof Error ? error.message : 'Failed to fetch voices' },
160
- { status: 500 }
161
- );
162
- }
163
- };
164
- ```
165
-
166
- ### 4. Environment Variables
167
-
168
- **File: `.env`**
169
-
170
- ```bash
171
- # Google Cloud Configuration
172
- GOOGLE_CLOUD_PROJECT=your-project-id
173
- GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
174
-
175
- # Or use API key
176
- # GOOGLE_TTS_API_KEY=AIza...
177
- ```
178
-
179
- ### 5. Client-Side Usage
180
-
181
- **File: `src/lib/tts-client.ts`**
182
-
183
- ```typescript
184
- export interface TTSResult {
185
- audio: string; // Base64 encoded
186
- contentType: string;
187
- speechMarks: Array<{
188
- time: number;
189
- type: string;
190
- start: number;
191
- end: number;
192
- value: string;
193
- }>;
194
- metadata: {
195
- providerId: string;
196
- voice: string;
197
- duration: number;
198
- };
199
- }
200
-
201
- export async function synthesizeSpeech(
202
- text: string,
203
- voice?: string
204
- ): Promise<TTSResult> {
205
- const response = await fetch('/api/tts/synthesize', {
206
- method: 'POST',
207
- headers: { 'Content-Type': 'application/json' },
208
- body: JSON.stringify({ text, voice, includeSpeechMarks: true }),
209
- });
210
-
211
- if (!response.ok) {
212
- throw new Error(`TTS failed: ${response.statusText}`);
213
- }
214
-
215
- return response.json();
216
- }
217
-
218
- export async function playAudio(result: TTSResult): Promise<void> {
219
- // Convert base64 to blob
220
- const audioData = atob(result.audio);
221
- const audioArray = new Uint8Array(audioData.length);
222
- for (let i = 0; i < audioData.length; i++) {
223
- audioArray[i] = audioData.charCodeAt(i);
224
- }
225
- const blob = new Blob([audioArray], { type: result.contentType });
226
-
227
- // Play audio
228
- const url = URL.createObjectURL(blob);
229
- const audio = new Audio(url);
230
- await audio.play();
231
-
232
- // Clean up
233
- audio.onended = () => URL.revokeObjectURL(url);
234
- }
235
-
236
- export async function getVoices(language?: string) {
237
- const params = new URLSearchParams();
238
- if (language) params.set('language', language);
239
-
240
- const response = await fetch(`/api/tts/voices?${params}`);
241
- const { voices } = await response.json();
242
- return voices;
243
- }
244
- ```
245
-
246
- **File: `src/routes/+page.svelte`**
247
-
248
- ```svelte
249
- <script lang="ts">
250
- import { synthesizeSpeech, playAudio, getVoices } from '$lib/tts-client';
251
- import { onMount } from 'svelte';
252
-
253
- let text = 'Hello world, this is Google Cloud Text to Speech!';
254
- let voice = 'en-US-Wavenet-A';
255
- let voices: any[] = [];
256
- let speechMarks: any[] = [];
257
- let isPlaying = false;
258
-
259
- onMount(async () => {
260
- voices = await getVoices('en-US');
261
- });
262
-
263
- async function handleSpeak() {
264
- try {
265
- isPlaying = true;
266
- const result = await synthesizeSpeech(text, voice);
267
- speechMarks = result.speechMarks;
268
- await playAudio(result);
269
- } catch (error) {
270
- console.error('Speech failed:', error);
271
- alert('Speech synthesis failed');
272
- } finally {
273
- isPlaying = false;
274
- }
275
- }
276
- </script>
277
-
278
- <div class="container">
279
- <h1>Google Cloud TTS Demo</h1>
280
-
281
- <div class="controls">
282
- <label>
283
- Text to speak:
284
- <textarea bind:value={text} rows="4"></textarea>
285
- </label>
286
-
287
- <label>
288
- Voice:
289
- <select bind:value={voice}>
290
- {#each voices as v}
291
- <option value={v.id}>{v.name} ({v.gender})</option>
292
- {/each}
293
- </select>
294
- </label>
295
-
296
- <button on:click={handleSpeak} disabled={isPlaying}>
297
- {isPlaying ? 'Speaking...' : 'Speak'}
298
- </button>
299
- </div>
300
-
301
- {#if speechMarks.length > 0}
302
- <div class="speech-marks">
303
- <h2>Speech Marks</h2>
304
- <ul>
305
- {#each speechMarks as mark}
306
- <li>
307
- {mark.value} ({mark.time}ms)
308
- </li>
309
- {/each}
310
- </ul>
311
- </div>
312
- {/if}
313
- </div>
314
-
315
- <style>
316
- .container {
317
- max-width: 800px;
318
- margin: 2rem auto;
319
- padding: 2rem;
320
- }
321
-
322
- .controls {
323
- display: flex;
324
- flex-direction: column;
325
- gap: 1rem;
326
- }
327
-
328
- textarea {
329
- width: 100%;
330
- padding: 0.5rem;
331
- font-family: inherit;
332
- }
333
-
334
- select {
335
- width: 100%;
336
- padding: 0.5rem;
337
- }
338
-
339
- button {
340
- padding: 0.75rem 1.5rem;
341
- background: #4285f4;
342
- color: white;
343
- border: none;
344
- border-radius: 4px;
345
- cursor: pointer;
346
- font-size: 1rem;
347
- }
348
-
349
- button:disabled {
350
- background: #ccc;
351
- cursor: not-allowed;
352
- }
353
-
354
- .speech-marks {
355
- margin-top: 2rem;
356
- padding: 1rem;
357
- background: #f5f5f5;
358
- border-radius: 4px;
359
- }
360
-
361
- .speech-marks ul {
362
- list-style: none;
363
- padding: 0;
364
- }
365
-
366
- .speech-marks li {
367
- padding: 0.25rem;
368
- font-family: monospace;
369
- }
370
- </style>
371
- ```
372
-
373
- ## Express.js Integration
374
-
375
- ```typescript
376
- import express from 'express';
377
- import { GoogleCloudTTSProvider } from '@pie-players/tts-server-google';
378
-
379
- const app = express();
380
- app.use(express.json());
381
-
382
- // Initialize provider
383
- const ttsProvider = new GoogleCloudTTSProvider();
384
- await ttsProvider.initialize({
385
- projectId: process.env.GOOGLE_CLOUD_PROJECT!,
386
- credentials: process.env.GOOGLE_APPLICATION_CREDENTIALS,
387
- voiceType: 'wavenet',
388
- });
389
-
390
- // Synthesize endpoint
391
- app.post('/api/tts/synthesize', async (req, res) => {
392
- try {
393
- const { text, voice, includeSpeechMarks } = req.body;
394
-
395
- const result = await ttsProvider.synthesize({
396
- text,
397
- voice,
398
- includeSpeechMarks: includeSpeechMarks ?? true,
399
- });
400
-
401
- // Return audio as buffer
402
- res.json({
403
- audio: result.audio.toString('base64'),
404
- contentType: result.contentType,
405
- speechMarks: result.speechMarks,
406
- metadata: result.metadata,
407
- });
408
- } catch (error) {
409
- console.error('TTS synthesis failed:', error);
410
- res.status(500).json({ error: 'TTS synthesis failed' });
411
- }
412
- });
413
-
414
- // Voices endpoint
415
- app.get('/api/tts/voices', async (req, res) => {
416
- try {
417
- const { language, gender } = req.query;
418
-
419
- const voices = await ttsProvider.getVoices({
420
- language: language as string,
421
- gender: gender as 'male' | 'female' | 'neutral',
422
- });
423
-
424
- res.json({ voices });
425
- } catch (error) {
426
- console.error('Failed to fetch voices:', error);
427
- res.status(500).json({ error: 'Failed to fetch voices' });
428
- }
429
- });
430
-
431
- app.listen(3000, () => {
432
- console.log('Server running on http://localhost:3000');
433
- });
434
- ```
435
-
436
- ## Next.js App Router Integration
437
-
438
- **File: `app/api/tts/synthesize/route.ts`**
439
-
440
- ```typescript
441
- import { GoogleCloudTTSProvider } from '@pie-players/tts-server-google';
442
- import { NextResponse } from 'next/server';
443
-
444
- let ttsProvider: GoogleCloudTTSProvider | null = null;
445
-
446
- async function getTTSProvider() {
447
- if (!ttsProvider) {
448
- ttsProvider = new GoogleCloudTTSProvider();
449
- await ttsProvider.initialize({
450
- projectId: process.env.GOOGLE_CLOUD_PROJECT!,
451
- credentials: process.env.GOOGLE_APPLICATION_CREDENTIALS,
452
- voiceType: 'wavenet',
453
- });
454
- }
455
- return ttsProvider;
456
- }
457
-
458
- export async function POST(request: Request) {
459
- try {
460
- const { text, voice, includeSpeechMarks } = await request.json();
461
-
462
- const provider = await getTTSProvider();
463
-
464
- const result = await provider.synthesize({
465
- text,
466
- voice,
467
- includeSpeechMarks: includeSpeechMarks ?? true,
468
- });
469
-
470
- return NextResponse.json({
471
- audio: result.audio.toString('base64'),
472
- contentType: result.contentType,
473
- speechMarks: result.speechMarks,
474
- metadata: result.metadata,
475
- });
476
- } catch (error) {
477
- console.error('TTS synthesis failed:', error);
478
- return NextResponse.json(
479
- { error: 'TTS synthesis failed' },
480
- { status: 500 }
481
- );
482
- }
483
- }
484
- ```
485
-
486
- ## Security Best Practices
487
-
488
- 1. **Never expose API keys in client code** - Always use server-side endpoints
489
- 2. **Restrict API keys** - Limit to specific APIs and IP addresses
490
- 3. **Use service accounts in production** - More secure than API keys
491
- 4. **Rate limiting** - Implement rate limiting to prevent abuse
492
- 5. **Cache results** - Cache TTS output to reduce API calls and costs
493
- 6. **Validate input** - Sanitize and validate user input before synthesis
494
-
495
- ## Cost Optimization
496
-
497
- 1. **Cache frequently used phrases** - Store audio for common text
498
- 2. **Use standard voices when possible** - $4/1M vs $16/1M for neural
499
- 3. **Batch requests** - Group multiple synthesis requests when feasible
500
- 4. **Monitor usage** - Set up billing alerts in Google Cloud Console
501
-
502
- ## Troubleshooting
503
-
504
- ### Authentication Errors
505
-
506
- ```
507
- Error: Google Cloud authentication failed
508
- ```
509
-
510
- **Solution**: Verify your credentials are correct and the service account has the "Cloud Text-to-Speech User" role.
511
-
512
- ### Rate Limit Exceeded
513
-
514
- ```
515
- Error: Google Cloud rate limit exceeded
516
- ```
517
-
518
- **Solution**: Implement exponential backoff and request rate limiting. Consider increasing your quota in Google Cloud Console.
519
-
520
- ### No Audio Content
521
-
522
- ```
523
- Error: No audio content received from Google Cloud TTS
524
- ```
525
-
526
- **Solution**: Check that your project has the Text-to-Speech API enabled and your billing is active.
527
-
528
- ## Support
529
-
530
- For issues specific to this package, please file an issue on GitHub.
531
-
532
- For Google Cloud TTS API issues, see the [official documentation](https://cloud.google.com/text-to-speech/docs).
@@ -1,688 +0,0 @@
1
- /**
2
- * Google Cloud Text-to-Speech server-side TTS provider
3
- * @module @pie-players/tts-server-google
4
- */
5
-
6
- import { v1beta1, protos } from "@google-cloud/text-to-speech";
7
-
8
- import {
9
- BaseTTSProvider,
10
- type GetVoicesOptions,
11
- type ServerProviderCapabilities,
12
- type SpeechMark,
13
- type SynthesizeRequest,
14
- type SynthesizeResponse,
15
- TTSError,
16
- TTSErrorCode,
17
- type TTSServerConfig,
18
- type Voice,
19
- } from "@pie-players/tts-server-core";
20
-
21
- /**
22
- * Google Cloud Text-to-Speech provider configuration.
23
- *
24
- * This extends the base TTSServerConfig with Google Cloud-specific settings.
25
- */
26
- export interface GoogleCloudTTSConfig extends TTSServerConfig {
27
- /**
28
- * Google Cloud project ID (required)
29
- *
30
- * @example 'my-project-123456'
31
- * @required
32
- */
33
- projectId: string;
34
-
35
- /**
36
- * Authentication credentials
37
- *
38
- * Supports multiple authentication methods:
39
- * - Service account JSON file path (recommended for production)
40
- * - Service account key object (for containers/serverless)
41
- * - API key (for simple applications)
42
- * - Omit to use Application Default Credentials (ADC) for local development
43
- *
44
- * @example '/path/to/service-account.json'
45
- * @example { client_email: '...', private_key: '...' }
46
- * @example { apiKey: 'AIza...' }
47
- * @see https://cloud.google.com/docs/authentication
48
- */
49
- credentials?:
50
- | string // Path to service account JSON file
51
- | {
52
- // Service account key object
53
- client_email: string;
54
- private_key: string;
55
- project_id?: string;
56
- }
57
- | {
58
- // API key
59
- apiKey: string;
60
- };
61
-
62
- /**
63
- * Voice type: 'wavenet' (neural), 'standard', or 'studio' (premium)
64
- *
65
- * @default 'wavenet'
66
- * @note Approximate list prices: WaveNet $16/1M chars, Standard $4/1M chars, Studio $160/1M chars (check Google Cloud pricing for current rates)
67
- */
68
- voiceType?: "wavenet" | "standard" | "studio";
69
-
70
- /**
71
- * Default voice name if not specified in synthesis requests
72
- *
73
- * @default 'en-US-Wavenet-A'
74
- * @example 'en-US-Wavenet-A', 'en-GB-Standard-B', 'es-ES-Studio-C'
75
- * @see https://cloud.google.com/text-to-speech/docs/voices
76
- */
77
- defaultVoice?: string;
78
-
79
- /**
80
- * Audio encoding format
81
- *
82
- * @default 'MP3'
83
- */
84
- audioEncoding?: "MP3" | "LINEAR16" | "OGG_OPUS";
85
-
86
- /**
87
- * Enable detailed logging for debugging
88
- *
89
- * @default false
90
- */
91
- enableLogging?: boolean;
92
- }
93
-
94
- /**
95
- * Google Cloud Text-to-Speech Server Provider
96
- *
97
- * Provides high-quality neural text-to-speech with precise word-level timing
98
- * through Google Cloud Text-to-Speech API.
99
- *
100
- * Features:
101
- * - Speech marks support via SSML mark injection (millisecond precision)
102
- * - WaveNet (neural), Standard, and Studio voice types
103
- * - 200+ voices across 50+ languages
104
- * - Full SSML support
105
- * - Single API call for audio + speech marks
106
- */
107
- export class GoogleCloudTTSProvider extends BaseTTSProvider {
108
- readonly providerId = "google-cloud-tts";
109
- readonly providerName = "Google Cloud Text-to-Speech";
110
- readonly version = "1.0.0";
111
-
112
- private client!: v1beta1.TextToSpeechClient;
113
- private voiceType: "wavenet" | "standard" | "studio" = "wavenet";
114
- private defaultVoice = "en-US-Wavenet-A";
115
- private audioEncoding: "MP3" | "LINEAR16" | "OGG_OPUS" = "MP3";
116
- private enableLogging = false;
117
-
118
- /**
119
- * Initialize the Google Cloud TTS provider.
120
- *
121
- * This is FAST and lightweight - only validates config and creates the client.
122
- * Does NOT fetch voices or make test API calls.
123
- *
124
- * @param config - Google Cloud TTS configuration
125
- * @performance Completes in ~10-50ms
126
- */
127
- async initialize(config: GoogleCloudTTSConfig): Promise<void> {
128
- if (!config.projectId) {
129
- throw new TTSError(
130
- TTSErrorCode.INITIALIZATION_ERROR,
131
- "Google Cloud project ID is required",
132
- undefined,
133
- this.providerId,
134
- );
135
- }
136
-
137
- this.config = config;
138
- this.voiceType = config.voiceType || "wavenet";
139
- this.defaultVoice = config.defaultVoice || "en-US-Wavenet-A";
140
- this.audioEncoding = config.audioEncoding || "MP3";
141
- this.enableLogging = config.enableLogging || false;
142
-
143
- try {
144
- // Initialize Google Cloud TTS client
145
- const clientConfig: any = {
146
- projectId: config.projectId,
147
- };
148
-
149
- // Handle different credential types
150
- if (config.credentials) {
151
- if (typeof config.credentials === "string") {
152
- // Path to service account JSON file
153
- clientConfig.keyFilename = config.credentials;
154
- } else if ("apiKey" in config.credentials) {
155
- // API key authentication
156
- clientConfig.apiKey = config.credentials.apiKey;
157
- } else {
158
- // Service account key object
159
- clientConfig.credentials = config.credentials;
160
- }
161
- }
162
- // Else: Use Application Default Credentials (ADC)
163
-
164
- this.client = new v1beta1.TextToSpeechClient(clientConfig);
165
- this.initialized = true;
166
-
167
- if (this.enableLogging) {
168
- console.log("[GoogleCloudTTS] Initialized successfully");
169
- }
170
- } catch (error) {
171
- throw new TTSError(
172
- TTSErrorCode.INITIALIZATION_ERROR,
173
- `Failed to initialize Google Cloud TTS: ${error instanceof Error ? error.message : String(error)}`,
174
- { error },
175
- this.providerId,
176
- );
177
- }
178
- }
179
-
180
- /**
181
- * Synthesize speech with Google Cloud TTS
182
- */
183
- async synthesize(request: SynthesizeRequest): Promise<SynthesizeResponse> {
184
- this.ensureInitialized();
185
-
186
- const capabilities = this.getCapabilities();
187
- this.validateRequest(request, capabilities);
188
-
189
- const voice = request.voice || this.defaultVoice;
190
- const startTime = Date.now();
191
-
192
- try {
193
- // Check if speech marks are requested
194
- if (request.includeSpeechMarks !== false) {
195
- // Use SSML marks injection for precise word timing
196
- const result = await this.synthesizeWithSpeechMarks(request, voice);
197
- const duration = (Date.now() - startTime) / 1000;
198
-
199
- return {
200
- audio: result.audio,
201
- contentType: result.contentType,
202
- speechMarks: result.speechMarks,
203
- metadata: {
204
- providerId: this.providerId,
205
- voice,
206
- duration,
207
- charCount: request.text.length,
208
- cached: false,
209
- timestamp: new Date().toISOString(),
210
- },
211
- };
212
- } else {
213
- // Audio only (no speech marks)
214
- const result = await this.synthesizeAudio(request, voice);
215
- const duration = (Date.now() - startTime) / 1000;
216
-
217
- return {
218
- audio: result.audio,
219
- contentType: result.contentType,
220
- speechMarks: [],
221
- metadata: {
222
- providerId: this.providerId,
223
- voice,
224
- duration,
225
- charCount: request.text.length,
226
- cached: false,
227
- timestamp: new Date().toISOString(),
228
- },
229
- };
230
- }
231
- } catch (error) {
232
- throw this.mapGoogleErrorToTTSError(error);
233
- }
234
- }
235
-
236
- /**
237
- * Synthesize audio stream only (no speech marks)
238
- */
239
- private async synthesizeAudio(
240
- request: SynthesizeRequest,
241
- voice: string,
242
- ): Promise<{ audio: Buffer; contentType: string }> {
243
- // Detect if text contains SSML tags
244
- const isSsml = this.detectSSML(request.text);
245
-
246
- if (isSsml && this.enableLogging) {
247
- console.log("[GoogleCloudTTS] Detected SSML content");
248
- }
249
-
250
- // Parse voice name to extract language code
251
- const languageCode = voice.split("-").slice(0, 2).join("-"); // e.g., "en-US" from "en-US-Wavenet-A"
252
-
253
- // Map our audio encoding to Google's enum
254
- const audioEncodingMap = {
255
- MP3: "MP3" as const,
256
- LINEAR16: "LINEAR16" as const,
257
- OGG_OPUS: "OGG_OPUS" as const,
258
- };
259
-
260
- const [response] = await this.client.synthesizeSpeech({
261
- input: isSsml ? { ssml: request.text } : { text: request.text },
262
- voice: {
263
- languageCode,
264
- name: voice,
265
- },
266
- audioConfig: {
267
- audioEncoding: audioEncodingMap[this.audioEncoding],
268
- sampleRateHertz: request.sampleRate || 24000,
269
- },
270
- });
271
-
272
- if (!response.audioContent) {
273
- throw new Error("No audio content received from Google Cloud TTS");
274
- }
275
-
276
- // Convert Uint8Array to Buffer
277
- const audioBuffer = Buffer.from(response.audioContent);
278
-
279
- const contentTypeMap = {
280
- MP3: "audio/mpeg",
281
- LINEAR16: "audio/wav",
282
- OGG_OPUS: "audio/ogg",
283
- };
284
-
285
- return {
286
- audio: audioBuffer,
287
- contentType: contentTypeMap[this.audioEncoding],
288
- };
289
- }
290
-
291
- /**
292
- * Synthesize with speech marks using SSML mark injection
293
- */
294
- private async synthesizeWithSpeechMarks(
295
- request: SynthesizeRequest,
296
- voice: string,
297
- ): Promise<{
298
- audio: Buffer;
299
- contentType: string;
300
- speechMarks: SpeechMark[];
301
- }> {
302
- // Check if the text is already SSML
303
- const isUserSSML = this.detectSSML(request.text);
304
-
305
- // If the user provided SSML, v1 does not preserve it: extractWordsFromSSML strips
306
- // every tag and marks are injected into the remaining plain text only
307
- const { ssml, wordMap } = isUserSSML
308
- ? this.extractWordsFromSSML(request.text)
309
- : this.injectSSMLMarks(request.text);
310
-
311
- if (this.enableLogging) {
312
- console.log(`[GoogleCloudTTS] Injected ${wordMap.length} SSML marks`);
313
- }
314
-
315
- // Parse voice name to extract language code
316
- const languageCode = voice.split("-").slice(0, 2).join("-");
317
-
318
- // Map our audio encoding to Google's enum
319
- const audioEncodingMap = {
320
- MP3: "MP3" as const,
321
- LINEAR16: "LINEAR16" as const,
322
- OGG_OPUS: "OGG_OPUS" as const,
323
- };
324
-
325
- // Single API call with timepoint tracking enabled
326
- const responseArray = await this.client.synthesizeSpeech({
327
- input: { ssml },
328
- voice: {
329
- languageCode,
330
- name: voice,
331
- },
332
- audioConfig: {
333
- audioEncoding: audioEncodingMap[this.audioEncoding],
334
- sampleRateHertz: request.sampleRate || 24000,
335
- },
336
- enableTimePointing: [
337
- protos.google.cloud.texttospeech.v1beta1.SynthesizeSpeechRequest
338
- .TimepointType.SSML_MARK,
339
- ],
340
- });
341
- const response = responseArray[0];
342
-
343
- if (!response.audioContent) {
344
- throw new Error("No audio content received from Google Cloud TTS");
345
- }
346
-
347
- // Convert Uint8Array to Buffer
348
- const audioBuffer = Buffer.from(response.audioContent);
349
-
350
- const contentTypeMap = {
351
- MP3: "audio/mpeg",
352
- LINEAR16: "audio/wav",
353
- OGG_OPUS: "audio/ogg",
354
- };
355
-
356
- // Extract speech marks from timepoints
357
- const speechMarks = this.extractSpeechMarksFromTimepoints(
358
- response.timepoints || [],
359
- wordMap,
360
- );
361
-
362
- if (this.enableLogging) {
363
- console.log(
364
- `[GoogleCloudTTS] Extracted ${speechMarks.length} speech marks`,
365
- );
366
- }
367
-
368
- return {
369
- audio: audioBuffer,
370
- contentType: contentTypeMap[this.audioEncoding],
371
- speechMarks,
372
- };
373
- }
374
-
375
- /**
376
- * Inject SSML marks before each word in plain text
377
- */
378
- private injectSSMLMarks(text: string): {
379
- ssml: string;
380
- wordMap: Array<{
381
- word: string;
382
- start: number;
383
- end: number;
384
- markName: string;
385
- }>;
386
- } {
387
- const words: Array<{
388
- word: string;
389
- start: number;
390
- end: number;
391
- markName: string;
392
- }> = [];
393
- const wordRegex = /\b[\w']+\b/g;
394
- let match;
395
- let markIndex = 0;
396
-
397
- while ((match = wordRegex.exec(text)) !== null) {
398
- const word = match[0];
399
- const start = match.index;
400
- const end = start + word.length;
401
- const markName = `w${markIndex++}`;
402
-
403
- words.push({ word, start, end, markName });
404
- }
405
-
406
- // Build SSML with marks
407
- let ssml = "<speak>";
408
- let lastEnd = 0;
409
-
410
- for (const { word, start, end, markName } of words) {
411
- // Add text before word (including whitespace and punctuation)
412
- ssml += this.escapeSSML(text.slice(lastEnd, start));
413
- // Add marked word
414
- ssml += `<mark name="${markName}"/>${this.escapeSSML(word)}`;
415
- lastEnd = end;
416
- }
417
-
418
- // Add remaining text
419
- ssml += this.escapeSSML(text.slice(lastEnd)) + "</speak>";
420
-
421
- return { ssml, wordMap: words };
422
- }
423
-
424
- /**
425
- * Extract words from existing SSML (simplified version for v1)
426
- */
427
- private extractWordsFromSSML(ssmlText: string): {
428
- ssml: string;
429
- wordMap: Array<{
430
- word: string;
431
- start: number;
432
- end: number;
433
- markName: string;
434
- }>;
435
- } {
436
- // For now, just strip SSML tags and inject marks
437
- // More sophisticated SSML parsing can be added in future versions
438
- const plainText = ssmlText
439
- .replace(/<[^>]+>/g, " ") // Remove all tags
440
- .replace(/\s+/g, " ") // Normalize whitespace
441
- .trim();
442
-
443
- return this.injectSSMLMarks(plainText);
444
- }
445
-
446
- /**
447
- * Escape special XML characters for SSML
448
- */
449
- private escapeSSML(text: string): string {
450
- return text
451
- .replace(/&/g, "&amp;")
452
- .replace(/</g, "&lt;")
453
- .replace(/>/g, "&gt;")
454
- .replace(/"/g, "&quot;")
455
- .replace(/'/g, "&apos;");
456
- }
457
-
458
- /**
459
- * Extract speech marks from Google's timepoints
460
- */
461
- private extractSpeechMarksFromTimepoints(
462
- timepoints:
463
- | protos.google.cloud.texttospeech.v1beta1.ITimepoint[]
464
- | null
465
- | undefined,
466
- wordMap: Array<{
467
- word: string;
468
- start: number;
469
- end: number;
470
- markName: string;
471
- }>,
472
- ): SpeechMark[] {
473
- if (!timepoints || timepoints.length === 0) {
474
- return [];
475
- }
476
-
477
- const speechMarks: SpeechMark[] = [];
478
-
479
- for (const timepoint of timepoints) {
480
- // Find corresponding word in our map
481
- const wordInfo = wordMap.find((w) => w.markName === timepoint.markName);
482
-
483
- if (
484
- wordInfo &&
485
- timepoint.timeSeconds !== undefined &&
486
- timepoint.timeSeconds !== null
487
- ) {
488
- speechMarks.push({
489
- time: Math.round(timepoint.timeSeconds * 1000), // Convert to ms
490
- type: "word",
491
- start: wordInfo.start,
492
- end: wordInfo.end,
493
- value: wordInfo.word,
494
- });
495
- }
496
- }
497
-
498
- // Sort by time
499
- return speechMarks.sort((a, b) => a.time - b.time);
500
- }
501
-
502
- /**
503
- * Detect if text contains SSML markup
504
- */
505
- private detectSSML(text: string): boolean {
506
- return (
507
- text.includes("<speak") ||
508
- text.includes("<prosody") ||
509
- text.includes("<emphasis") ||
510
- text.includes("<break") ||
511
- text.includes("<phoneme") ||
512
- text.includes("<say-as") ||
513
- text.includes("<mark")
514
- );
515
- }
516
-
517
- /**
518
- * Get available voices from Google Cloud TTS
519
- */
520
- async getVoices(options?: GetVoicesOptions): Promise<Voice[]> {
521
- this.ensureInitialized();
522
-
523
- try {
524
- const [response] = await this.client.listVoices({
525
- languageCode: options?.language,
526
- });
527
-
528
- if (!response.voices) {
529
- return [];
530
- }
531
-
532
- return response.voices
533
- .map((voice) => this.mapGoogleVoiceToVoice(voice))
534
- .filter((voice) => {
535
- // Apply filters
536
- if (options?.gender && voice.gender !== options.gender) {
537
- return false;
538
- }
539
- if (options?.quality && voice.quality !== options.quality) {
540
- return false;
541
- }
542
- return true;
543
- });
544
- } catch (error) {
545
- throw new TTSError(
546
- TTSErrorCode.PROVIDER_ERROR,
547
- `Failed to get voices: ${error instanceof Error ? error.message : String(error)}`,
548
- { error },
549
- this.providerId,
550
- );
551
- }
552
- }
553
-
554
- /**
555
- * Map Google Cloud voice to unified Voice interface
556
- */
557
- private mapGoogleVoiceToVoice(
558
- googleVoice: protos.google.cloud.texttospeech.v1beta1.IVoice,
559
- ): Voice {
560
- const voiceName = googleVoice.name || "";
561
-
562
- // Determine quality based on voice type
563
- let quality: "standard" | "neural" | "premium" = "standard";
564
- if (voiceName.includes("Wavenet")) {
565
- quality = "neural";
566
- } else if (voiceName.includes("Studio")) {
567
- quality = "premium";
568
- }
569
-
570
- // Map SSML gender to our gender type
571
- const genderMap: Record<string, "male" | "female" | "neutral"> = {
572
- MALE: "male",
573
- FEMALE: "female",
574
- NEUTRAL: "neutral",
575
- };
576
- const gender = genderMap[googleVoice.ssmlGender || "NEUTRAL"] || "neutral";
577
-
578
- return {
579
- id: voiceName,
580
- name: voiceName,
581
- language: googleVoice.languageCodes?.[0] || "Unknown",
582
- languageCode: googleVoice.languageCodes?.[0] || "",
583
- gender,
584
- quality,
585
- supportedFeatures: {
586
- ssml: true,
587
- emotions: false, // Google doesn't have built-in emotions
588
- styles: false, // Google doesn't have speaking styles
589
- },
590
- providerMetadata: {
591
- naturalSampleRateHertz: googleVoice.naturalSampleRateHertz,
592
- languageCodes: googleVoice.languageCodes,
593
- ssmlGender: googleVoice.ssmlGender,
594
- },
595
- };
596
- }
597
-
598
- /**
599
- * Get Google Cloud TTS capabilities
600
- */
601
- getCapabilities(): ServerProviderCapabilities {
602
- return {
603
- // W3C Standard features
604
- standard: {
605
- supportsSSML: true, // ✅ Full SSML 1.1 support
606
- supportsPitch: true, // ✅ Via SSML <prosody pitch>
607
- supportsRate: true, // ✅ Via SSML <prosody rate>
608
- supportsVolume: false, // ❌ Not supported (handle client-side)
609
- supportsMultipleVoices: true, // ✅ 200+ voices across 50+ languages
610
- maxTextLength: 5000, // Google Cloud TTS limit per request
611
- },
612
-
613
- // Provider-specific extensions
614
- extensions: {
615
- supportsSpeechMarks: true, // ✅ Via SSML marks + timepoints
616
- supportedFormats: ["mp3", "wav", "ogg"], // MP3, LINEAR16, OGG_OPUS
617
- supportsSampleRate: true, // ✅ Configurable sample rate
618
-
619
- // Google Cloud-specific features
620
- providerSpecific: {
621
- voiceTypes: ["standard", "wavenet", "studio"],
622
- voicesCount: 200, // ~200+ voices available
623
- languagesCount: 50, // 50+ languages supported
624
- supportsAudioProfiles: true, // Audio device profiles
625
- supportsEffects: false, // No built-in effects
626
- supportsEmotions: false, // No emotion control
627
- supportsStyles: false, // No speaking styles
628
- },
629
- },
630
- };
631
- }
632
-
633
- /**
634
- * Map Google Cloud errors to TTSError codes
635
- */
636
- private mapGoogleErrorToTTSError(error: any): TTSError {
637
- const message = error.message || String(error);
638
-
639
- // Check for specific Google Cloud error codes
640
- if (error.code === 7) {
641
- // PERMISSION_DENIED
642
- return new TTSError(
643
- TTSErrorCode.AUTHENTICATION_ERROR,
644
- `Google Cloud authentication failed: ${message}`,
645
- { error },
646
- this.providerId,
647
- );
648
- }
649
-
650
- if (error.code === 8) {
651
- // RESOURCE_EXHAUSTED
652
- return new TTSError(
653
- TTSErrorCode.RATE_LIMIT_EXCEEDED,
654
- `Google Cloud rate limit exceeded: ${message}`,
655
- { error },
656
- this.providerId,
657
- );
658
- }
659
-
660
- if (error.code === 3) {
661
- // INVALID_ARGUMENT
662
- return new TTSError(
663
- TTSErrorCode.INVALID_REQUEST,
664
- `Invalid request to Google Cloud TTS: ${message}`,
665
- { error },
666
- this.providerId,
667
- );
668
- }
669
-
670
- // Default to provider error
671
- return new TTSError(
672
- TTSErrorCode.PROVIDER_ERROR,
673
- `Google Cloud TTS error: ${message}`,
674
- { error },
675
- this.providerId,
676
- );
677
- }
678
-
679
- /**
680
- * Clean up Google Cloud TTS client
681
- */
682
- async destroy(): Promise<void> {
683
- if (this.client) {
684
- await this.client.close();
685
- }
686
- await super.destroy();
687
- }
688
- }
package/src/index.ts DELETED
@@ -1,7 +0,0 @@
1
- /**
2
- * Google Cloud Text-to-Speech server-side TTS provider
3
- * @module @pie-players/tts-server-google
4
- */
5
-
6
- export type { GoogleCloudTTSConfig } from "./GoogleCloudTTSProvider.js";
7
- export { GoogleCloudTTSProvider } from "./GoogleCloudTTSProvider.js";
package/tsconfig.json DELETED
@@ -1,9 +0,0 @@
1
- {
2
- "extends": "../tts-server-core/tsconfig.json",
3
- "compilerOptions": {
4
- "outDir": "./dist",
5
- "rootDir": "./src"
6
- },
7
- "include": ["src/**/*"],
8
- "exclude": ["node_modules", "dist", "**/*.test.ts"]
9
- }