@pie-players/tts-server-google 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ $ tsc
package/README.md ADDED
@@ -0,0 +1,333 @@
1
+ # @pie-players/tts-server-google
2
+
3
+ Google Cloud Text-to-Speech provider for server-side text-to-speech with speech marks support.
4
+
5
+ ## Overview
6
+
7
+ This package provides a server-side TTS provider that uses Google Cloud Text-to-Speech to generate high-quality neural speech with millisecond-precise word timing through SSML mark injection.
8
+
9
+ ## Features
10
+
11
+ - ✅ **Speech Marks Support** - Millisecond-accurate word timing via SSML marks + timepoints
12
+ - ✅ **WaveNet Neural Voices** - High-quality neural TTS with Google's WaveNet technology
13
+ - ✅ **50+ Languages** - Extensive language support
14
+ - ✅ **Full SSML** - Supports Speech Synthesis Markup Language 1.1
15
+ - ✅ **Single API Call** - Audio and speech marks in one request (more efficient than AWS Polly)
16
+ - ✅ **200+ Voices** - Multiple voice types per language (Standard, WaveNet, Studio)
17
+ - ✅ **Flexible Authentication** - Service account, API key, or Application Default Credentials
18
+
19
+ ## Installation
20
+
21
+ ```bash
22
+ npm install @pie-players/tts-server-google
23
+ ```
24
+
25
+ ## Usage
26
+
27
+ ### Basic Setup
28
+
29
+ ```typescript
30
+ import { GoogleCloudTTSProvider } from '@pie-players/tts-server-google';
31
+
32
+ const provider = new GoogleCloudTTSProvider();
33
+
34
+ await provider.initialize({
35
+ projectId: 'my-gcp-project',
36
+ credentials: '/path/to/service-account.json', // Or use other auth methods
37
+ voiceType: 'wavenet', // 'wavenet', 'standard', or 'studio'
38
+ defaultVoice: 'en-US-Wavenet-A',
39
+ });
40
+ ```
41
+
42
+ ### Authentication Methods
43
+
44
+ #### 1. Service Account JSON File (Recommended for Production)
45
+
46
+ ```typescript
47
+ await provider.initialize({
48
+ projectId: 'my-project',
49
+ credentials: '/path/to/service-account.json',
50
+ });
51
+ ```
52
+
53
+ #### 2. Service Account Object (For Containers/Serverless)
54
+
55
+ ```typescript
56
+ await provider.initialize({
57
+ projectId: 'my-project',
58
+ credentials: {
59
+ client_email: 'service-account@my-project.iam.gserviceaccount.com',
60
+ private_key: '-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n',
61
+ },
62
+ });
63
+ ```
64
+
65
+ #### 3. API Key (For Simple Applications)
66
+
67
+ ```typescript
68
+ await provider.initialize({
69
+ projectId: 'my-project',
70
+ credentials: {
71
+ apiKey: 'AIza...',
72
+ },
73
+ });
74
+ ```
75
+
76
+ #### 4. Application Default Credentials (For Local Development)
77
+
78
+ ```typescript
79
+ // Omit credentials to use Application Default Credentials (set up locally with `gcloud auth application-default login`)
80
+ await provider.initialize({
81
+ projectId: 'my-project',
82
+ });
83
+ ```
84
+
85
+ ### Synthesize Speech
86
+
87
+ ```typescript
88
+ const result = await provider.synthesize({
89
+ text: 'Hello world, this is a test of Google Cloud Text to Speech.',
90
+ voice: 'en-US-Wavenet-A', // Optional, uses defaultVoice if not specified
91
+ includeSpeechMarks: true,
92
+ });
93
+
94
+ console.log('Audio:', result.audio); // Buffer
95
+ console.log('Speech marks:', result.speechMarks); // Array of word timings
96
+ console.log('Duration:', result.metadata.duration, 'seconds');
97
+ ```
98
+
99
+ ### List Available Voices
100
+
101
+ ```typescript
102
+ // Get all voices
103
+ const voices = await provider.getVoices();
104
+
105
+ // Filter by language
106
+ const spanishVoices = await provider.getVoices({ language: 'es-ES' });
107
+
108
+ // Filter by gender
109
+ const femaleVoices = await provider.getVoices({ gender: 'female' });
110
+
111
+ // Filter by quality
112
+ const neuralVoices = await provider.getVoices({ quality: 'neural' });
113
+ ```
114
+
115
+ ### Speech Marks Example
116
+
117
+ ```typescript
118
+ const result = await provider.synthesize({
119
+ text: 'Hello world',
120
+ includeSpeechMarks: true,
121
+ });
122
+
123
+ // result.speechMarks:
124
+ // [
125
+ // { time: 0, type: 'word', start: 0, end: 5, value: 'Hello' },
126
+ // { time: 420, type: 'word', start: 6, end: 11, value: 'world' }
127
+ // ]
128
+ ```
129
+
130
+ ### SSML Support
131
+
132
+ ```typescript
133
+ const result = await provider.synthesize({
134
+ text: `
135
+ <speak>
136
+ Hello, <break time="500ms"/> this is a test.
137
+ <prosody rate="slow" pitch="+2st">
138
+ I can speak slowly with higher pitch.
139
+ </prosody>
140
+ </speak>
141
+ `,
142
+ includeSpeechMarks: true,
143
+ });
144
+ ```
145
+
146
+ ## Configuration
147
+
148
+ ### GoogleCloudTTSConfig
149
+
150
+ ```typescript
151
+ interface GoogleCloudTTSConfig {
152
+ projectId: string; // Google Cloud project ID (required)
153
+ credentials?: // Authentication (optional if using ADC)
154
+ | string // Path to service account JSON
155
+ | { // Service account object
156
+ client_email: string;
157
+ private_key: string;
158
+ }
159
+ | { apiKey: string }; // API key
160
+ voiceType?: 'wavenet' | 'standard' | 'studio'; // Voice type (default: 'wavenet')
161
+ defaultVoice?: string; // Default voice (default: 'en-US-Wavenet-A')
162
+ audioEncoding?: 'MP3' | 'LINEAR16' | 'OGG_OPUS'; // Audio format (default: 'MP3')
163
+ enableLogging?: boolean; // Debug logging (default: false)
164
+ }
165
+ ```
166
+
167
+ ### Environment Variables
168
+
169
+ ```bash
170
+ GOOGLE_CLOUD_PROJECT=my-project
171
+ GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
172
+ ```
173
+
174
+ ## Capabilities
175
+
176
+ | Feature | Support |
177
+ |---------|---------|
178
+ | Speech Marks | ✅ Via SSML marks |
179
+ | SSML | ✅ Full 1.1 |
180
+ | Pitch Control | ✅ SSML |
181
+ | Rate Control | ✅ SSML |
182
+ | Volume Control | ❌ Client-side |
183
+ | Max Text Length | 5000 chars |
184
+ | Audio Formats | MP3, WAV (`LINEAR16`), OGG (`OGG_OPUS`) |
185
+
186
+ ## Cost
187
+
188
+ - **Standard voices:** $4 per 1M characters
189
+ - **WaveNet (neural) voices:** $16 per 1M characters
190
+ - **Studio voices:** $160 per 1M characters
191
+ - **Speech marks (timepoints):** Included (no extra charge)
192
+
193
+ Pricing is competitive with AWS Polly.
194
+
195
+ ## Supported Voices
196
+
197
+ Popular voices include:
198
+
199
+ ### Standard Voices
200
+ - **English (US):** en-US-Standard-A/B/C/D/E/F/G/H/I/J
201
+ - **English (UK):** en-GB-Standard-A/B/C/D/F
202
+ - **Spanish:** es-ES-Standard-A/B/C/D
203
+ - **French:** fr-FR-Standard-A/B/C/D/E
204
+ - **German:** de-DE-Standard-A/B/C/D/E/F
205
+ - **Italian:** it-IT-Standard-A/B/C/D
206
+ - **Portuguese:** pt-BR-Standard-A/B/C
207
+
208
+ ### WaveNet (Neural) Voices
209
+ - **English (US):** en-US-Wavenet-A/B/C/D/E/F/G/H/I/J
210
+ - **English (UK):** en-GB-Wavenet-A/B/C/D/F
211
+ - **Spanish:** es-ES-Wavenet-B/C/D
212
+ - **French:** fr-FR-Wavenet-A/B/C/D/E
213
+ - **German:** de-DE-Wavenet-A/B/C/D/E/F
214
+ - **Italian:** it-IT-Wavenet-A/B/C/D
215
+ - **Portuguese:** pt-BR-Wavenet-A/B/C
216
+
217
+ ### Studio Voices
218
+ - **English (US):** en-US-Studio-O/Q
219
+ - **English (UK):** en-GB-Studio-B/C
220
+
221
+ Use `getVoices()` for the complete list of 200+ voices.
222
+
223
+ ## Voice Naming Convention
224
+
225
+ Google Cloud voices follow the pattern: `{languageCode}-{voiceType}-{variant}`
226
+
227
+ Examples:
228
+ - `en-US-Wavenet-A` - US English, WaveNet (neural), variant A
229
+ - `es-ES-Standard-B` - Spanish (Spain), Standard, variant B
230
+ - `fr-FR-Studio-A` - French, Studio (premium), variant A
231
+
232
+ ## Error Handling
233
+
234
+ ```typescript
235
+ import { TTSError, TTSErrorCode } from '@pie-players/tts-server-core';
236
+
237
+ try {
238
+ const result = await provider.synthesize({ text: 'Hello' });
239
+ } catch (error) {
240
+ if (error instanceof TTSError) {
241
+ console.error('Error code:', error.code);
242
+ console.error('Message:', error.message);
243
+ console.error('Provider:', error.providerId);
244
+
245
+ // Handle specific error types
246
+ if (error.code === TTSErrorCode.AUTHENTICATION_ERROR) {
247
+ console.error('Check your Google Cloud credentials');
248
+ } else if (error.code === TTSErrorCode.RATE_LIMIT_EXCEEDED) {
249
+ console.error('Rate limit exceeded, retry after some time');
250
+ }
251
+ }
252
+ }
253
+ ```
254
+
255
+ ## Google Cloud IAM Permissions
256
+
257
+ Required IAM permissions for the service account:
258
+
259
+ ```json
260
+ {
261
+ "role": "roles/cloudtexttospeech.user",
262
+ "permissions": [
263
+ "texttospeech.operations.get",
264
+ "texttospeech.voices.list",
265
+ "texttospeech.voices.synthesize"
266
+ ]
267
+ }
268
+ ```
269
+
270
+ Or use the predefined role:
271
+
272
+ ```bash
273
+ gcloud projects add-iam-policy-binding PROJECT_ID \
274
+ --member="serviceAccount:SERVICE_ACCOUNT_EMAIL" \
275
+ --role="roles/cloudtexttospeech.user"
276
+ ```
277
+
278
+ ## Comparison with AWS Polly
279
+
280
+ | Feature | Google Cloud TTS | AWS Polly |
281
+ |---------|------------------|-----------|
282
+ | **Voices** | 200+ voices | 60+ voices |
283
+ | **Languages** | 50+ languages | 25+ languages |
284
+ | **Speech Marks** | Via SSML marks | Native |
285
+ | **API Calls** | Single call | Two parallel calls |
286
+ | **Max Text** | 5000 chars | 3000 chars |
287
+ | **Neural Cost** | $16/1M chars | $16/1M chars |
288
+ | **Standard Cost** | $4/1M chars | $4/1M chars |
289
+ | **Authentication** | Flexible (4 methods) | AWS credentials |
290
+ | **Region** | Global service | Region-specific |
291
+
292
+ ## How Speech Marks Work
293
+
294
+ Unlike AWS Polly (which provides native speech marks), Google Cloud TTS requires SSML mark injection:
295
+
296
+ 1. The provider automatically parses your text and injects `<mark>` tags before each word
297
+ 2. Google Cloud TTS returns timepoints corresponding to these marks
298
+ 3. The provider converts timepoints to the unified speech mark format
299
+
300
+ This process is transparent to the user - just set `includeSpeechMarks: true`.
301
+
302
+ ## Advanced Configuration
303
+
304
+ ### Custom Audio Encoding
305
+
306
+ ```typescript
307
+ await provider.initialize({
308
+ projectId: 'my-project',
309
+ audioEncoding: 'LINEAR16', // For WAV format
310
+ });
311
+ ```
312
+
313
+ ### Enable Debug Logging
314
+
315
+ ```typescript
316
+ await provider.initialize({
317
+ projectId: 'my-project',
318
+ enableLogging: true, // Logs SSML injection and speech marks extraction
319
+ });
320
+ ```
321
+
322
+ ### Custom Sample Rate
323
+
324
+ ```typescript
325
+ const result = await provider.synthesize({
326
+ text: 'Hello',
327
+ sampleRate: 48000, // 48kHz (default is 24kHz)
328
+ });
329
+ ```
330
+
331
+ ## License
332
+
333
+ MIT
@@ -0,0 +1,153 @@
1
+ /**
2
+ * Google Cloud Text-to-Speech server-side TTS provider
3
+ * @module @pie-players/tts-server-google
4
+ */
5
+ import { BaseTTSProvider, type GetVoicesOptions, type ServerProviderCapabilities, type SynthesizeRequest, type SynthesizeResponse, type TTSServerConfig, type Voice } from "@pie-players/tts-server-core";
6
+ /**
7
+ * Google Cloud Text-to-Speech provider configuration.
8
+ *
9
+ * This extends the base TTSServerConfig with Google Cloud-specific settings.
10
+ */
11
+ export interface GoogleCloudTTSConfig extends TTSServerConfig {
12
+ /**
13
+ * Google Cloud project ID (required)
14
+ *
15
+ * @example 'my-project-123456'
16
+ * @required
17
+ */
18
+ projectId: string;
19
+ /**
20
+ * Authentication credentials (optional)
21
+ *
22
+ * Supports multiple authentication methods:
23
+ * - Service account JSON file path, given as a string (recommended for production)
24
+ * - Service account key object with `client_email`, `private_key` and optional `project_id` (for containers/serverless)
25
+ * - API key object `{ apiKey }` (for simple applications)
26
+ * - Omit to use Application Default Credentials (ADC) for local development
27
+ *
28
+ * @example '/path/to/service-account.json'
29
+ * @example { client_email: '...', private_key: '...' }
30
+ * @example { apiKey: 'AIza...' }
31
+ * @see https://cloud.google.com/docs/authentication
32
+ */
33
+ credentials?: string | {
34
+ client_email: string;
35
+ private_key: string;
36
+ project_id?: string;
37
+ } | {
38
+ apiKey: string;
39
+ };
40
+ /**
41
+ * Voice type: 'wavenet' (neural), 'standard', or 'studio' (premium)
42
+ *
43
+ * @default 'wavenet'
44
+ * @note WaveNet: $16/1M chars, Standard: $4/1M chars, Studio: $16/1M chars (NOTE(review): figures not verifiable from this file; confirm against current Google Cloud pricing)
45
+ */
46
+ voiceType?: "wavenet" | "standard" | "studio";
47
+ /**
48
+ * Default voice name, used when a synthesis request does not specify a voice
49
+ *
50
+ * @default 'en-US-Wavenet-A'
51
+ * @example 'en-US-Wavenet-A', 'en-GB-Standard-B', 'es-ES-Studio-C'
52
+ * @see https://cloud.google.com/text-to-speech/docs/voices
53
+ */
54
+ defaultVoice?: string;
55
+ /**
56
+ * Audio encoding format for synthesized audio: 'MP3', 'LINEAR16' or 'OGG_OPUS'
57
+ *
58
+ * @default 'MP3'
59
+ */
60
+ audioEncoding?: "MP3" | "LINEAR16" | "OGG_OPUS";
61
+ /**
62
+ * Enable detailed logging for debugging
63
+ *
64
+ * @default false
65
+ */
66
+ enableLogging?: boolean;
67
+ }
68
+ /**
69
+ * Google Cloud Text-to-Speech Server Provider
70
+ *
71
+ * Provides high-quality neural text-to-speech with precise word-level timing
72
+ * through Google Cloud Text-to-Speech API.
73
+ *
74
+ * Features:
75
+ * - Speech marks support via SSML mark injection (millisecond precision)
76
+ * - WaveNet (neural), Standard, and Studio voice types
77
+ * - 200+ voices across 50+ languages
78
+ * - Full SSML support
79
+ * - Single API call for audio + speech marks
80
+ */
81
+ export declare class GoogleCloudTTSProvider extends BaseTTSProvider {
82
+ readonly providerId = "google-cloud-tts";
83
+ readonly providerName = "Google Cloud Text-to-Speech";
84
+ readonly version = "1.0.0"; // NOTE(review): the package itself is published as 0.1.0; confirm this provider version is intentionally separate
85
+ private client; // per the method docs below: created by initialize(), cleaned up by destroy()
86
+ private voiceType;
87
+ private defaultVoice;
88
+ private audioEncoding;
89
+ private enableLogging;
90
+ /**
91
+ * Initialize the Google Cloud TTS provider.
92
+ *
93
+ * This is FAST and lightweight - only validates config and creates the client.
94
+ * Does NOT fetch voices or make test API calls.
95
+ *
96
+ * @param config - Google Cloud TTS configuration
97
+ * @performance Completes in ~10-50ms
98
+ */
99
+ initialize(config: GoogleCloudTTSConfig): Promise<void>;
100
+ /**
101
+ * Synthesize speech for the given request with Google Cloud TTS
102
+ */
103
+ synthesize(request: SynthesizeRequest): Promise<SynthesizeResponse>;
104
+ /**
105
+ * Synthesize audio stream only (no speech marks)
106
+ */
107
+ private synthesizeAudio;
108
+ /**
109
+ * Synthesize with speech marks using SSML mark injection
110
+ */
111
+ private synthesizeWithSpeechMarks;
112
+ /**
113
+ * Inject SSML marks before each word in plain text
114
+ */
115
+ private injectSSMLMarks;
116
+ /**
117
+ * Extract words from existing SSML (simplified version for v1)
118
+ */
119
+ private extractWordsFromSSML;
120
+ /**
121
+ * Escape special XML characters for SSML
122
+ */
123
+ private escapeSSML;
124
+ /**
125
+ * Extract speech marks from Google's timepoints
126
+ */
127
+ private extractSpeechMarksFromTimepoints;
128
+ /**
129
+ * Detect if text contains SSML markup
130
+ */
131
+ private detectSSML;
132
+ /**
133
+ * Get available voices from Google Cloud TTS, optionally filtered by the given options
134
+ */
135
+ getVoices(options?: GetVoicesOptions): Promise<Voice[]>;
136
+ /**
137
+ * Map Google Cloud voice to unified Voice interface
138
+ */
139
+ private mapGoogleVoiceToVoice;
140
+ /**
141
+ * Get Google Cloud TTS capabilities
142
+ */
143
+ getCapabilities(): ServerProviderCapabilities;
144
+ /**
145
+ * Map Google Cloud errors to TTSError codes
146
+ */
147
+ private mapGoogleErrorToTTSError;
148
+ /**
149
+ * Clean up Google Cloud TTS client
150
+ */
151
+ destroy(): Promise<void>;
152
+ }
153
+ //# sourceMappingURL=GoogleCloudTTSProvider.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"GoogleCloudTTSProvider.d.ts","sourceRoot":"","sources":["../src/GoogleCloudTTSProvider.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAIH,OAAO,EACN,eAAe,EACf,KAAK,gBAAgB,EACrB,KAAK,0BAA0B,EAE/B,KAAK,iBAAiB,EACtB,KAAK,kBAAkB,EAGvB,KAAK,eAAe,EACpB,KAAK,KAAK,EACV,MAAM,8BAA8B,CAAC;AAEtC;;;;GAIG;AACH,MAAM,WAAW,oBAAqB,SAAQ,eAAe;IAC5D;;;;;OAKG;IACH,SAAS,EAAE,MAAM,CAAC;IAElB;;;;;;;;;;;;;OAaG;IACH,WAAW,CAAC,EACT,MAAM,GACN;QAEA,YAAY,EAAE,MAAM,CAAC;QACrB,WAAW,EAAE,MAAM,CAAC;QACpB,UAAU,CAAC,EAAE,MAAM,CAAC;KACnB,GACD;QAEA,MAAM,EAAE,MAAM,CAAC;KACd,CAAC;IAEL;;;;;OAKG;IACH,SAAS,CAAC,EAAE,SAAS,GAAG,UAAU,GAAG,QAAQ,CAAC;IAE9C;;;;;;OAMG;IACH,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB;;;;OAIG;IACH,aAAa,CAAC,EAAE,KAAK,GAAG,UAAU,GAAG,UAAU,CAAC;IAEhD;;;;OAIG;IACH,aAAa,CAAC,EAAE,OAAO,CAAC;CACxB;AAED;;;;;;;;;;;;GAYG;AACH,qBAAa,sBAAuB,SAAQ,eAAe;IAC1D,QAAQ,CAAC,UAAU,sBAAsB;IACzC,QAAQ,CAAC,YAAY,iCAAiC;IACtD,QAAQ,CAAC,OAAO,WAAW;IAE3B,OAAO,CAAC,MAAM,CAA8B;IAC5C,OAAO,CAAC,SAAS,CAAgD;IACjE,OAAO,CAAC,YAAY,CAAqB;IACzC,OAAO,CAAC,aAAa,CAA0C;IAC/D,OAAO,CAAC,aAAa,CAAS;IAE9B;;;;;;;;OAQG;IACG,UAAU,CAAC,MAAM,EAAE,oBAAoB,GAAG,OAAO,CAAC,IAAI,CAAC;IAqD7D;;OAEG;IACG,UAAU,CAAC,OAAO,EAAE,iBAAiB,GAAG,OAAO,CAAC,kBAAkB,CAAC;IAqDzE;;OAEG;YACW,eAAe;IAoD7B;;OAEG;YACW,yBAAyB;IAiFvC;;OAEG;IACH,OAAO,CAAC,eAAe;IA8CvB;;OAEG;IACH,OAAO,CAAC,oBAAoB;IAmB5B;;OAEG;IACH,OAAO,CAAC,UAAU;IASlB;;OAEG;IACH,OAAO,CAAC,gCAAgC;IAyCxC;;OAEG;IACH,OAAO,CAAC,UAAU;IAYlB;;OAEG;IACG,SAAS,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,KAAK,EAAE,CAAC;IAkC7D;;OAEG;IACH,OAAO,CAAC,qBAAqB;IAyC7B;;OAEG;IACH,eAAe,IAAI,0BAA0B;IAgC7C;;OAEG;IACH,OAAO,CAAC,wBAAwB;IA2ChC;;OAEG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAM9B"}