npm - @pie-players/tts-server-google - Versions diffs - 0.1.0 - Mend

@pie-players/tts-server-google 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/.turbo/turbo-build.log +1 -0
package/README.md +333 -0
package/dist/GoogleCloudTTSProvider.d.ts +153 -0
package/dist/GoogleCloudTTSProvider.d.ts.map +1 -0
package/dist/GoogleCloudTTSProvider.js +454 -0
package/dist/GoogleCloudTTSProvider.js.map +1 -0
package/dist/index.d.ts +7 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +6 -0
package/dist/index.js.map +1 -0
package/examples/INTEGRATION-GUIDE.md +532 -0
package/package.json +38 -0
package/src/GoogleCloudTTSProvider.ts +688 -0
package/src/index.ts +7 -0
package/tsconfig.json +9 -0

package/.turbo/turbo-build.log ADDED Viewed

	@@ -0,0 +1 @@
1	+ $ tsc

package/README.md ADDED Viewed

@@ -0,0 +1,333 @@
+# @pie-players/tts-server-google
+Google Cloud Text-to-Speech provider for server-side text-to-speech with speech marks support.
+## Overview
+This package provides a server-side TTS provider that uses Google Cloud Text-to-Speech to generate high-quality neural speech with millisecond-precise word timing through SSML mark injection.
+## Features
+- ✅ **Speech Marks Support** - Millisecond-accurate word timing via SSML marks + timepoints
+- ✅ **WaveNet Neural Voices** - High-quality neural TTS with Google's WaveNet technology
+- ✅ **50+ Languages** - Extensive language support
+- ✅ **Full SSML** - Supports Speech Synthesis Markup Language 1.1
+- ✅ **Single API Call** - Audio and speech marks in one request (more efficient than AWS Polly)
+- ✅ **200+ Voices** - Multiple voice types per language (Standard, WaveNet, Studio)
+- ✅ **Flexible Authentication** - Service account (JSON file path or key object), API key, or Application Default Credentials
+## Installation
+```bash
+npm install @pie-players/tts-server-google
+```
+## Usage
+### Basic Setup
+```typescript
+import { GoogleCloudTTSProvider } from '@pie-players/tts-server-google';
+const provider = new GoogleCloudTTSProvider();
+await provider.initialize({
+  projectId: 'my-gcp-project',
+  credentials: '/path/to/service-account.json', // Or use other auth methods
+  voiceType: 'wavenet', // 'wavenet', 'standard', or 'studio'
+  defaultVoice: 'en-US-Wavenet-A',
+});
+```
+### Authentication Methods
+#### 1. Service Account JSON File (Recommended for Production)
+```typescript
+await provider.initialize({
+  projectId: 'my-project',
+  credentials: '/path/to/service-account.json',
+});
+```
+#### 2. Service Account Object (For Containers/Serverless)
+```typescript
+await provider.initialize({
+  projectId: 'my-project',
+  credentials: {
+    client_email: 'service-account@my-project.iam.gserviceaccount.com',
+    private_key: '-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n',
+  },
+});
+```
+#### 3. API Key (For Simple Applications)
+```typescript
+await provider.initialize({
+  projectId: 'my-project',
+  credentials: {
+    apiKey: 'AIza...',
+  },
+});
+```
+#### 4. Application Default Credentials (For Local Development)
+```typescript
+// Omit credentials to fall back to Application Default Credentials (e.g. after running `gcloud auth application-default login`)
+await provider.initialize({
+  projectId: 'my-project',
+});
+```
+### Synthesize Speech
+```typescript
+const result = await provider.synthesize({
+  text: 'Hello world, this is a test of Google Cloud Text to Speech.',
+  voice: 'en-US-Wavenet-A', // Optional, uses defaultVoice if not specified
+  includeSpeechMarks: true,
+});
+console.log('Audio:', result.audio); // Buffer
+console.log('Speech marks:', result.speechMarks); // Array of word timings
+console.log('Duration:', result.metadata.duration, 'seconds');
+```
+### List Available Voices
+```typescript
+// Get all voices
+const voices = await provider.getVoices();
+// Filter by language
+const spanishVoices = await provider.getVoices({ language: 'es-ES' });
+// Filter by gender
+const femaleVoices = await provider.getVoices({ gender: 'female' });
+// Filter by quality
+const neuralVoices = await provider.getVoices({ quality: 'neural' });
+```
+### Speech Marks Example
+```typescript
+const result = await provider.synthesize({
+  text: 'Hello world',
+  includeSpeechMarks: true,
+});
+// result.speechMarks:
+// [
+//   { time: 0, type: 'word', start: 0, end: 5, value: 'Hello' },
+//   { time: 420, type: 'word', start: 6, end: 11, value: 'world' }
+// ]
+```
+### SSML Support
+```typescript
+const result = await provider.synthesize({
+  text: `
+    <speak>
+      Hello, <break time="500ms"/> this is a test.
+      <prosody rate="slow" pitch="+2st">
+        I can speak slowly with higher pitch.
+      </prosody>
+    </speak>
+  `,
+  includeSpeechMarks: true,
+});
+```
+## Configuration
+### GoogleCloudTTSConfig
+```typescript
+interface GoogleCloudTTSConfig {
+  projectId: string;                    // Google Cloud project ID (required)
+  credentials?:                         // Authentication (optional if using ADC)
+    | string                            // Path to service account JSON
+    | {                                 // Service account object
+        client_email: string;
+        private_key: string;
+        project_id?: string;            // Optional
+      }
+    | { apiKey: string };               // API key
+  voiceType?: 'wavenet' | 'standard' | 'studio';  // Voice type (default: 'wavenet')
+  defaultVoice?: string;                // Default voice (default: 'en-US-Wavenet-A')
+  audioEncoding?: 'MP3' | 'LINEAR16' | 'OGG_OPUS';  // Audio format (default: 'MP3')
+  enableLogging?: boolean;              // Debug logging (default: false)
+}
+```
+### Environment Variables
+```bash
+GOOGLE_CLOUD_PROJECT=my-project
+GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
+```
+## Capabilities
+| Feature | Support |
+|---------|---------|
+| Speech Marks | ✅ Via SSML marks |
+| SSML | ✅ Full 1.1 |
+| Pitch Control | ✅ SSML |
+| Rate Control | ✅ SSML |
+| Volume Control | ❌ Not supported server-side (apply on the client) |
+| Max Text Length | 5000 chars |
+| Audio Formats | MP3, WAV (`LINEAR16`), OGG (`OGG_OPUS`) |
+## Cost
+- **Standard voices:** $4 per 1M characters
+- **WaveNet (neural) voices:** $16 per 1M characters
+- **Studio voices:** $16 per 1M characters
+- **Speech marks (timepoints):** Included (no extra charge)
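+Pricing is comparable to AWS Polly (see the comparison table below). Rates change over time, so confirm the figures above against the official Google Cloud Text-to-Speech pricing page before budgeting.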
+## Supported Voices
+Popular voices include:
+### Standard Voices
+- **English (US):** en-US-Standard-A/B/C/D/E/F/G/H/I/J
+- **English (UK):** en-GB-Standard-A/B/C/D/F
+- **Spanish:** es-ES-Standard-A/B/C/D
+- **French:** fr-FR-Standard-A/B/C/D/E
+- **German:** de-DE-Standard-A/B/C/D/E/F
+- **Italian:** it-IT-Standard-A/B/C/D
+- **Portuguese:** pt-BR-Standard-A/B/C
+### WaveNet (Neural) Voices
+- **English (US):** en-US-Wavenet-A/B/C/D/E/F/G/H/I/J
+- **English (UK):** en-GB-Wavenet-A/B/C/D/F
+- **Spanish:** es-ES-Wavenet-B/C/D
+- **French:** fr-FR-Wavenet-A/B/C/D/E
+- **German:** de-DE-Wavenet-A/B/C/D/E/F
+- **Italian:** it-IT-Wavenet-A/B/C/D
+- **Portuguese:** pt-BR-Wavenet-A/B/C
+### Studio Voices
+- **English (US):** en-US-Studio-O/Q
+- **English (UK):** en-GB-Studio-B/C
+Use `getVoices()` for the complete list of 200+ voices.
+## Voice Naming Convention
+Google Cloud voices follow the pattern: `{languageCode}-{voiceType}-{variant}`
+Examples:
+- `en-US-Wavenet-A` - US English, WaveNet (neural), variant A
+- `es-ES-Standard-B` - Spanish (Spain), Standard, variant B
+- `fr-FR-Studio-A` - French, Studio (premium), variant A
+## Error Handling
+```typescript
+import { TTSError, TTSErrorCode } from '@pie-players/tts-server-core';
+try {
+  const result = await provider.synthesize({ text: 'Hello' });
+} catch (error) {
+  if (error instanceof TTSError) {
+    console.error('Error code:', error.code);
+    console.error('Message:', error.message);
+    console.error('Provider:', error.providerId);
+    // Handle specific error types
+    if (error.code === TTSErrorCode.AUTHENTICATION_ERROR) {
+      console.error('Check your Google Cloud credentials');
+    } else if (error.code === TTSErrorCode.RATE_LIMIT_EXCEEDED) {
+      console.error('Rate limit exceeded, retry after some time');
+    }
+  }
+}
+```
+## Google Cloud IAM Permissions
+Required IAM permissions for the service account:
+```json
+{
+  "role": "roles/cloudtexttospeech.user",
+  "permissions": [
+    "texttospeech.operations.get",
+    "texttospeech.voices.list",
+    "texttospeech.voices.synthesize"
+  ]
+}
+```
+Grant this predefined role to the service account with:
+```bash
+gcloud projects add-iam-policy-binding PROJECT_ID \
+  --member="serviceAccount:SERVICE_ACCOUNT_EMAIL" \
+  --role="roles/cloudtexttospeech.user"
+```
+## Comparison with AWS Polly
+| Feature | Google Cloud TTS | AWS Polly |
+|---------|------------------|-----------|
+| **Voices** | 200+ voices | 60+ voices |
+| **Languages** | 50+ languages | 25+ languages |
+| **Speech Marks** | Via SSML marks | Native |
+| **API Calls** | Single call | Two parallel calls |
+| **Max Text** | 5000 chars | 3000 chars |
+| **Neural Cost** | $16/1M chars | $16/1M chars |
+| **Standard Cost** | $4/1M chars | $4/1M chars |
+| **Authentication** | Flexible (4 methods) | AWS credentials |
+| **Region** | Global service | Region-specific |
+## How Speech Marks Work
+Unlike AWS Polly (which provides native speech marks), Google Cloud TTS requires SSML mark injection:
+1. The provider automatically parses your text and injects `<mark>` tags before each word
+2. Google Cloud TTS returns timepoints corresponding to these marks
+3. The provider converts timepoints to the unified speech mark format
+This process is transparent to the user - just set `includeSpeechMarks: true`.
+## Advanced Configuration
+### Custom Audio Encoding
+```typescript
+await provider.initialize({
+  projectId: 'my-project',
+  audioEncoding: 'LINEAR16', // For WAV format
+});
+```
+### Enable Debug Logging
+```typescript
+await provider.initialize({
+  projectId: 'my-project',
+  enableLogging: true, // Logs SSML injection and speech marks extraction
+});
+```
+### Custom Sample Rate
+```typescript
+const result = await provider.synthesize({
+  text: 'Hello',
+  sampleRate: 48000, // 48kHz (default is 24kHz)
+});
+```
+## License
+MIT

package/dist/GoogleCloudTTSProvider.d.ts ADDED Viewed

@@ -0,0 +1,153 @@
+/**
+ * Google Cloud Text-to-Speech server-side TTS provider
+ * @module @pie-players/tts-server-google
+ */
+import { BaseTTSProvider, type GetVoicesOptions, type ServerProviderCapabilities, type SynthesizeRequest, type SynthesizeResponse, type TTSServerConfig, type Voice } from "@pie-players/tts-server-core";
+/**
+ * Google Cloud Text-to-Speech provider configuration.
+ *
+ * This extends the base TTSServerConfig with Google Cloud-specific settings.
+ *
+ * Only `projectId` is a required member of this interface; `credentials`,
+ * `voiceType`, `defaultVoice`, `audioEncoding` and `enableLogging` are all
+ * optional. Any members inherited from TTSServerConfig are declared in
+ * `@pie-players/tts-server-core` and are not visible here.
+ */
+export interface GoogleCloudTTSConfig extends TTSServerConfig {
+    /**
+     * Google Cloud project ID (required)
+     *
+     * @example 'my-project-123456'
+     * @required
+     */
+    projectId: string;
+    /**
+     * Authentication credentials
+     *
+     * Supports multiple authentication methods:
+     * - Service account JSON file path (recommended for production)
+     * - Service account key object (for containers/serverless)
+     * - API key (for simple applications)
+     * - Omit to use Application Default Credentials (ADC) for local development
+     *
+     * Accepted shapes, as declared in the union type below:
+     * - `string`: the service account JSON file path
+     * - `{ client_email, private_key, project_id? }`: the key object;
+     *   `client_email` and `private_key` are required, `project_id` is optional
+     * - `{ apiKey }`: the API key form
+     *
+     * @example '/path/to/service-account.json'
+     * @example { client_email: '...', private_key: '...' }
+     * @example { apiKey: 'AIza...' }
+     * @see https://cloud.google.com/docs/authentication
+     */
+    credentials?: string | {
+        client_email: string;
+        private_key: string;
+        project_id?: string;
+    } | {
+        apiKey: string;
+    };
+    /**
+     * Voice type: 'wavenet' (neural), 'standard', or 'studio' (premium)
+     *
+     * @default 'wavenet'
+     * @note WaveNet: $16/1M chars, Standard: $4/1M chars, Studio: $16/1M chars
+     */
+    voiceType?: "wavenet" | "standard" | "studio";
+    /**
+     * Default voice name if not specified in synthesis requests
+     *
+     * @default 'en-US-Wavenet-A'
+     * @example 'en-US-Wavenet-A', 'en-GB-Standard-B', 'es-ES-Studio-C'
+     * @see https://cloud.google.com/text-to-speech/docs/voices
+     */
+    defaultVoice?: string;
+    /**
+     * Audio encoding format
+     *
+     * One of 'MP3', 'LINEAR16' or 'OGG_OPUS'. The package README uses
+     * 'LINEAR16' when WAV output is wanted.
+     *
+     * @default 'MP3'
+     */
+    audioEncoding?: "MP3" | "LINEAR16" | "OGG_OPUS";
+    /**
+     * Enable detailed logging for debugging
+     *
+     * Per the package README, this logs SSML injection and speech marks
+     * extraction.
+     *
+     * @default false
+     */
+    enableLogging?: boolean;
+}
+/**
+ * Google Cloud Text-to-Speech Server Provider
+ *
+ * Provides high-quality neural text-to-speech with precise word-level timing
+ * through Google Cloud Text-to-Speech API.
+ *
+ * Features:
+ * - Speech marks support via SSML mark injection (millisecond precision)
+ * - WaveNet (neural), Standard, and Studio voice types
+ * - 200+ voices across 50+ languages
+ * - Full SSML support
+ * - Single API call for audio + speech marks
+ *
+ * Usage order shown in the package README: construct the provider, await
+ * `initialize(config)`, then call `synthesize()` / `getVoices()`.
+ *
+ * Public surface declared here: `initialize`, `synthesize`, `getVoices`,
+ * `getCapabilities` and `destroy`, plus the readonly identity fields
+ * `providerId`, `providerName` and `version`. Everything else is private.
+ * The private fields (`client`, `voiceType`, `defaultVoice`, `audioEncoding`,
+ * `enableLogging`) appear without types because this declaration file omits
+ * the types of private members.
+ *
+ * NOTE(review): `version` is declared as "1.0.0" while the package itself is
+ * published as 0.1.0 - confirm which value is intended.
+ */
+export declare class GoogleCloudTTSProvider extends BaseTTSProvider {
+    readonly providerId = "google-cloud-tts";
+    readonly providerName = "Google Cloud Text-to-Speech";
+    readonly version = "1.0.0";
+    private client;
+    private voiceType;
+    private defaultVoice;
+    private audioEncoding;
+    private enableLogging;
+    /**
+     * Initialize the Google Cloud TTS provider.
+     *
+     * This is FAST and lightweight - only validates config and creates the client.
+     * Does NOT fetch voices or make test API calls.
+     *
+     * @param config - Google Cloud TTS configuration
+     * @returns Promise that resolves with no value
+     * @performance Completes in ~10-50ms
+     */
+    initialize(config: GoogleCloudTTSConfig): Promise<void>;
+    /**
+     * Synthesize speech with Google Cloud TTS
+     *
+     * @param request - Synthesis request. The README examples pass `text`,
+     *   and optionally `voice`, `includeSpeechMarks` and `sampleRate`.
+     * @returns Promise of the synthesis response. The README examples read
+     *   `audio`, `speechMarks` and `metadata.duration` from it.
+     */
+    synthesize(request: SynthesizeRequest): Promise<SynthesizeResponse>;
+    /**
+     * Synthesize audio stream only (no speech marks)
+     */
+    private synthesizeAudio;
+    /**
+     * Synthesize with speech marks using SSML mark injection
+     */
+    private synthesizeWithSpeechMarks;
+    /**
+     * Inject SSML marks before each word in plain text
+     */
+    private injectSSMLMarks;
+    /**
+     * Extract words from existing SSML (simplified version for v1)
+     */
+    private extractWordsFromSSML;
+    /**
+     * Escape special XML characters for SSML
+     */
+    private escapeSSML;
+    /**
+     * Extract speech marks from Google's timepoints
+     */
+    private extractSpeechMarksFromTimepoints;
+    /**
+     * Detect if text contains SSML markup
+     */
+    private detectSSML;
+    /**
+     * Get available voices from Google Cloud TTS
+     *
+     * @param options - Optional filter. The README examples filter by
+     *   `language`, `gender` or `quality`; omit it to list all voices.
+     * @returns Promise of the matching voices
+     */
+    getVoices(options?: GetVoicesOptions): Promise<Voice[]>;
+    /**
+     * Map Google Cloud voice to unified Voice interface
+     */
+    private mapGoogleVoiceToVoice;
+    /**
+     * Get Google Cloud TTS capabilities
+     *
+     * Declared as synchronous: the capabilities object is returned directly,
+     * not wrapped in a Promise.
+     */
+    getCapabilities(): ServerProviderCapabilities;
+    /**
+     * Map Google Cloud errors to TTSError codes
+     */
+    private mapGoogleErrorToTTSError;
+    /**
+     * Clean up Google Cloud TTS client
+     *
+     * @returns Promise that resolves with no value
+     */
+    destroy(): Promise<void>;
+}
+//# sourceMappingURL=GoogleCloudTTSProvider.d.ts.map

package/dist/GoogleCloudTTSProvider.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"GoogleCloudTTSProvider.d.ts","sourceRoot":"","sources":["../src/GoogleCloudTTSProvider.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAIH,OAAO,EACN,eAAe,EACf,KAAK,gBAAgB,EACrB,KAAK,0BAA0B,EAE/B,KAAK,iBAAiB,EACtB,KAAK,kBAAkB,EAGvB,KAAK,eAAe,EACpB,KAAK,KAAK,EACV,MAAM,8BAA8B,CAAC;AAEtC;;;;GAIG;AACH,MAAM,WAAW,oBAAqB,SAAQ,eAAe;IAC5D;;;;;OAKG;IACH,SAAS,EAAE,MAAM,CAAC;IAElB;;;;;;;;;;;;;OAaG;IACH,WAAW,CAAC,EACT,MAAM,GACN;QAEA,YAAY,EAAE,MAAM,CAAC;QACrB,WAAW,EAAE,MAAM,CAAC;QACpB,UAAU,CAAC,EAAE,MAAM,CAAC;KACnB,GACD;QAEA,MAAM,EAAE,MAAM,CAAC;KACd,CAAC;IAEL;;;;;OAKG;IACH,SAAS,CAAC,EAAE,SAAS,GAAG,UAAU,GAAG,QAAQ,CAAC;IAE9C;;;;;;OAMG;IACH,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB;;;;OAIG;IACH,aAAa,CAAC,EAAE,KAAK,GAAG,UAAU,GAAG,UAAU,CAAC;IAEhD;;;;OAIG;IACH,aAAa,CAAC,EAAE,OAAO,CAAC;CACxB;AAED;;;;;;;;;;;;GAYG;AACH,qBAAa,sBAAuB,SAAQ,eAAe;IAC1D,QAAQ,CAAC,UAAU,sBAAsB;IACzC,QAAQ,CAAC,YAAY,iCAAiC;IACtD,QAAQ,CAAC,OAAO,WAAW;IAE3B,OAAO,CAAC,MAAM,CAA8B;IAC5C,OAAO,CAAC,SAAS,CAAgD;IACjE,OAAO,CAAC,YAAY,CAAqB;IACzC,OAAO,CAAC,aAAa,CAA0C;IAC/D,OAAO,CAAC,aAAa,CAAS;IAE9B;;;;;;;;OAQG;IACG,UAAU,CAAC,MAAM,EAAE,oBAAoB,GAAG,OAAO,CAAC,IAAI,CAAC;IAqD7D;;OAEG;IACG,UAAU,CAAC,OAAO,EAAE,iBAAiB,GAAG,OAAO,CAAC,kBAAkB,CAAC;IAqDzE;;OAEG;YACW,eAAe;IAoD7B;;OAEG;YACW,yBAAyB;IAiFvC;;OAEG;IACH,OAAO,CAAC,eAAe;IA8CvB;;OAEG;IACH,OAAO,CAAC,oBAAoB;IAmB5B;;OAEG;IACH,OAAO,CAAC,UAAU;IASlB;;OAEG;IACH,OAAO,CAAC,gCAAgC;IAyCxC;;OAEG;IACH,OAAO,CAAC,UAAU;IAYlB;;OAEG;IACG,SAAS,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,KAAK,EAAE,CAAC;IAkC7D;;OAEG;IACH,OAAO,CAAC,qBAAqB;IAyC7B;;OAEG;IACH,eAAe,IAAI,0BAA0B;IAgC7C;;OAEG;IACH,OAAO,CAAC,wBAAwB;IA2ChC;;OAEG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAM9B"}