@loonylabs/tts-middleware 0.8.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -2
- package/dist/middleware/services/tts/index.d.ts +4 -3
- package/dist/middleware/services/tts/index.d.ts.map +1 -1
- package/dist/middleware/services/tts/index.js +3 -1
- package/dist/middleware/services/tts/index.js.map +1 -1
- package/dist/middleware/services/tts/providers/gemini-provider.d.ts +142 -0
- package/dist/middleware/services/tts/providers/gemini-provider.d.ts.map +1 -0
- package/dist/middleware/services/tts/providers/gemini-provider.js +358 -0
- package/dist/middleware/services/tts/providers/gemini-provider.js.map +1 -0
- package/dist/middleware/services/tts/providers/index.d.ts +2 -0
- package/dist/middleware/services/tts/providers/index.d.ts.map +1 -1
- package/dist/middleware/services/tts/providers/index.js +3 -1
- package/dist/middleware/services/tts/providers/index.js.map +1 -1
- package/dist/middleware/services/tts/providers/vertex-ai-tts-provider.d.ts +168 -0
- package/dist/middleware/services/tts/providers/vertex-ai-tts-provider.d.ts.map +1 -0
- package/dist/middleware/services/tts/providers/vertex-ai-tts-provider.js +416 -0
- package/dist/middleware/services/tts/providers/vertex-ai-tts-provider.js.map +1 -0
- package/dist/middleware/services/tts/tts.service.d.ts.map +1 -1
- package/dist/middleware/services/tts/tts.service.js +13 -0
- package/dist/middleware/services/tts/tts.service.js.map +1 -1
- package/dist/middleware/services/tts/types/common.types.d.ts +12 -1
- package/dist/middleware/services/tts/types/common.types.d.ts.map +1 -1
- package/dist/middleware/services/tts/types/common.types.js +1 -0
- package/dist/middleware/services/tts/types/common.types.js.map +1 -1
- package/dist/middleware/services/tts/types/index.d.ts +2 -2
- package/dist/middleware/services/tts/types/index.d.ts.map +1 -1
- package/dist/middleware/services/tts/types/index.js +2 -1
- package/dist/middleware/services/tts/types/index.js.map +1 -1
- package/dist/middleware/services/tts/types/provider-options.types.d.ts +85 -1
- package/dist/middleware/services/tts/types/provider-options.types.d.ts.map +1 -1
- package/dist/middleware/services/tts/types/provider-options.types.js +13 -0
- package/dist/middleware/services/tts/types/provider-options.types.js.map +1 -1
- package/dist/middleware/services/tts/utils/retry.utils.d.ts +11 -0
- package/dist/middleware/services/tts/utils/retry.utils.d.ts.map +1 -1
- package/dist/middleware/services/tts/utils/retry.utils.js +23 -0
- package/dist/middleware/services/tts/utils/retry.utils.js.map +1 -1
- package/dist/middleware/shared/config/tts.config.d.ts +17 -0
- package/dist/middleware/shared/config/tts.config.d.ts.map +1 -1
- package/dist/middleware/shared/config/tts.config.js +14 -0
- package/dist/middleware/shared/config/tts.config.js.map +1 -1
- package/package.json +3 -2
package/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
# TTS Middleware
|
|
4
4
|
|
|
5
|
-
*Provider-agnostic Text-to-Speech middleware with **GDPR compliance** support. Currently supports Azure Speech Services, EdenAI, Google Cloud TTS, Fish Audio, and
|
|
5
|
+
*Provider-agnostic Text-to-Speech middleware with **GDPR compliance** support. Currently supports Azure Speech Services, EdenAI, Google Cloud TTS, Fish Audio, Inworld AI, and Vertex AI TTS. Features EU data residency via Azure and Google Cloud, pluggable logging, character-based billing, and comprehensive error handling.*
|
|
6
6
|
|
|
7
7
|
<!-- Horizontal Badge Navigation Bar -->
|
|
8
8
|
[](https://www.npmjs.com/package/@loonylabs/tts-middleware)
|
|
@@ -43,6 +43,7 @@
|
|
|
43
43
|
- **Google Cloud TTS**: Neural2, WaveNet, Studio voices with EU data residency
|
|
44
44
|
- **Fish Audio**: S1 model with 13 languages & 64+ emotions (test/admin only)
|
|
45
45
|
- **Inworld AI**: TTS 1.5 Max/Mini with 15 languages & voice cloning (test/admin only)
|
|
46
|
+
- **Vertex AI TTS**: Gemini Flash/Pro models with 30 voices, 90+ languages & style prompts (test/admin only)
|
|
46
47
|
- **Ready for:** OpenAI, ElevenLabs, Deepgram (interfaces prepared)
|
|
47
48
|
- **GDPR/DSGVO Compliance**: Built-in EU region support for Azure and Google Cloud
|
|
48
49
|
- **SSML Abstraction**: Auto-generates provider-specific SSML from simple JSON options
|
|
@@ -137,6 +138,14 @@ const inworld = await ttsService.synthesize({
|
|
|
137
138
|
voice: { id: 'Ashley' },
|
|
138
139
|
providerOptions: { modelId: 'inworld-tts-1.5-max', temperature: 1.1 },
|
|
139
140
|
});
|
|
141
|
+
|
|
142
|
+
// Vertex AI TTS (test/admin only)
|
|
143
|
+
const vertexAI = await ttsService.synthesize({
|
|
144
|
+
text: 'Have a wonderful day!',
|
|
145
|
+
provider: TTSProvider.VERTEX_AI,
|
|
146
|
+
voice: { id: 'Kore' },
|
|
147
|
+
providerOptions: { model: 'gemini-2.5-flash-preview-tts', stylePrompt: 'Say cheerfully:' },
|
|
148
|
+
});
|
|
140
149
|
```
|
|
141
150
|
|
|
142
151
|
</details>
|
|
@@ -240,6 +249,10 @@ FISH_AUDIO_API_KEY=your-fish-audio-api-key
|
|
|
240
249
|
# Inworld AI (test/admin only – no EU data residency)
|
|
241
250
|
INWORLD_API_KEY=your-inworld-api-key
|
|
242
251
|
|
|
252
|
+
# Vertex AI TTS (test/admin only – no EU data residency)
|
|
253
|
+
# Reuses GOOGLE_APPLICATION_CREDENTIALS and GOOGLE_CLOUD_PROJECT from above
|
|
254
|
+
VERTEX_AI_TTS_REGION=us-central1
|
|
255
|
+
|
|
243
256
|
# Logging
|
|
244
257
|
TTS_DEBUG=false
|
|
245
258
|
LOG_LEVEL=info
|
|
@@ -304,6 +317,20 @@ LOG_LEVEL=info
|
|
|
304
317
|
| **Pricing** | $10/1M chars (Max), $5/1M chars (Mini) |
|
|
305
318
|
| **EU Compliance** | No data residency guarantees |
|
|
306
319
|
|
|
320
|
+
### Vertex AI TTS (Test/Admin Only)
|
|
321
|
+
|
|
322
|
+
| Feature | Details |
|
|
323
|
+
|---------|---------|
|
|
324
|
+
| **Models** | `gemini-2.5-flash-preview-tts` (budget, fast), `gemini-2.5-pro-preview-tts` (premium, natural) |
|
|
325
|
+
| **Languages** | 90+ with auto-detection |
|
|
326
|
+
| **Voices** | 30 multilingual: Kore, Puck, Charon, Zephyr, Fenrir, Sulafat, etc. |
|
|
327
|
+
| **Style Control** | Natural language prompts: "Say cheerfully:", "Read in a spooky whisper:" |
|
|
328
|
+
| **Audio** | MP3 (via ffmpeg), WAV (fallback) |
|
|
329
|
+
| **Auth** | Service Account OAuth2 (reuses `GOOGLE_APPLICATION_CREDENTIALS`) |
|
|
330
|
+
| **Region** | `VERTEX_AI_TTS_REGION` env var (default: `us-central1`) |
|
|
331
|
+
| **Pricing** | $0.50-1.00/M input tokens + $10-20/M audio output tokens |
|
|
332
|
+
| **EU Compliance** | Preview models currently `us-central1` only — no EU data residency yet |
|
|
333
|
+
|
|
307
334
|
## GDPR / Compliance
|
|
308
335
|
|
|
309
336
|
### Provider Compliance Overview
|
|
@@ -315,9 +342,12 @@ LOG_LEVEL=info
|
|
|
315
342
|
| **EdenAI** | Yes | Depends* | Depends* | Depends on underlying provider |
|
|
316
343
|
| **Fish Audio** | No | No | No | Test/admin only |
|
|
317
344
|
| **Inworld AI** | No | No | No | Test/admin only |
|
|
345
|
+
| **Vertex AI TTS** | Yes (Vertex DPA) | Partial | No* | Test/admin only |
|
|
318
346
|
|
|
319
347
|
*EdenAI is an aggregator - compliance depends on the underlying provider.
|
|
320
348
|
|
|
349
|
+
\*Vertex AI TTS: DPA available, no model training on customer data — but preview models are currently `us-central1` only (no EU data residency until GA with EU region support).
|
|
350
|
+
|
|
321
351
|
## API Reference
|
|
322
352
|
|
|
323
353
|
### TTSService
|
|
@@ -503,12 +533,14 @@ graph TD
|
|
|
503
533
|
Registry -->|Select| Eden[EdenAIProvider]
|
|
504
534
|
Registry -->|Select| Fish[FishAudioProvider]
|
|
505
535
|
Registry -->|Select| Inworld[InworldProvider]
|
|
536
|
+
Registry -->|Select| VertexAI[VertexAITTSProvider]
|
|
506
537
|
|
|
507
538
|
Azure -->|SSML/SDK| AzureAPI[Azure Speech API]
|
|
508
539
|
GCloud -->|gRPC/SDK| GoogleAPI[Google Cloud TTS API]
|
|
509
540
|
Eden -->|REST| EdenAPI[EdenAI API]
|
|
510
541
|
Fish -->|REST| FishAPI[Fish Audio API]
|
|
511
542
|
Inworld -->|REST| InworldAPI[Inworld AI API]
|
|
543
|
+
VertexAI -->|REST/OAuth2| VertexAPI[Vertex AI API]
|
|
512
544
|
|
|
513
545
|
GoogleAPI -->|EU Endpoint| EU[eu-texttospeech.googleapis.com]
|
|
514
546
|
EdenAPI -.-> OpenAI[OpenAI TTS]
|
|
@@ -518,7 +550,7 @@ graph TD
|
|
|
518
550
|
## Testing
|
|
519
551
|
|
|
520
552
|
```bash
|
|
521
|
-
# Run all tests (
|
|
553
|
+
# Run all tests (600+ tests, >90% coverage)
|
|
522
554
|
npm test
|
|
523
555
|
|
|
524
556
|
# Unit tests only
|
|
@@ -534,6 +566,8 @@ npm run test:coverage
|
|
|
534
566
|
npx ts-node scripts/manual-test-edenai.ts
|
|
535
567
|
npx ts-node scripts/manual-test-google-cloud-tts.ts
|
|
536
568
|
npx ts-node scripts/manual-test-fish-audio.ts [en] [de]
|
|
569
|
+
npx ts-node scripts/manual-test-inworld.ts [en] [de] [mini]
|
|
570
|
+
npx ts-node scripts/manual-test-vertex-ai.ts [en] [de] [pro] [style]
|
|
537
571
|
|
|
538
572
|
# List available Google Cloud voices
|
|
539
573
|
npx ts-node scripts/list-google-voices.ts de-DE
|
|
@@ -20,9 +20,10 @@
|
|
|
20
20
|
*/
|
|
21
21
|
export { TTSService, ttsService } from './tts.service';
|
|
22
22
|
export { TTSProvider, TTSErrorCode, AudioFormat, } from './types';
|
|
23
|
-
export type { AudioOptions, VoiceConfig, TTSSynthesizeRequest, TTSResponse, TTSResponseMetadata, TTSBillingInfo, TTSVoice, TTSVoiceMetadata, AzureProviderOptions, OpenAIProviderOptions, ElevenLabsProviderOptions, GoogleCloudProviderOptions, GoogleCloudTTSProviderOptions, DeepgramProviderOptions, EdenAIProviderOptions, FishAudioProviderOptions, InworldProviderOptions, ProviderOptions, } from './types';
|
|
24
|
-
export { isAzureOptions, isOpenAIOptions, isElevenLabsOptions, isGoogleCloudOptions, isGoogleCloudTTSOptions, isDeepgramOptions, isEdenAIOptions, isFishAudioOptions, isInworldOptions, } from './types';
|
|
25
|
-
export { BaseTTSProvider, AzureProvider, EdenAIProvider, FishAudioProvider, GoogleCloudTTSProvider, InworldProvider, } from './providers';
|
|
23
|
+
export type { AudioOptions, VoiceConfig, TTSSynthesizeRequest, TTSResponse, TTSResponseMetadata, TTSBillingInfo, TTSVoice, TTSVoiceMetadata, AzureProviderOptions, OpenAIProviderOptions, ElevenLabsProviderOptions, GoogleCloudProviderOptions, GoogleCloudTTSProviderOptions, DeepgramProviderOptions, EdenAIProviderOptions, FishAudioProviderOptions, InworldProviderOptions, VertexAITTSProviderOptions, ProviderOptions, } from './types';
|
|
24
|
+
export { isAzureOptions, isOpenAIOptions, isElevenLabsOptions, isGoogleCloudOptions, isGoogleCloudTTSOptions, isDeepgramOptions, isEdenAIOptions, isFishAudioOptions, isInworldOptions, isVertexAITTSOptions, } from './types';
|
|
25
|
+
export { BaseTTSProvider, AzureProvider, EdenAIProvider, FishAudioProvider, GoogleCloudTTSProvider, InworldProvider, VertexAITTSProvider, } from './providers';
|
|
26
|
+
export type { VertexAITTSConfig } from './providers';
|
|
26
27
|
export type { GoogleCloudTTSRegion, GoogleCloudTTSConfig, } from './providers';
|
|
27
28
|
export { TTSError, InvalidConfigError, InvalidVoiceError, QuotaExceededError, ProviderUnavailableError, SynthesisFailedError, NetworkError, } from './providers';
|
|
28
29
|
export { countCharacters, countCharactersWithoutSSML, validateCharacterCount, countBillableCharacters, estimateAudioDuration, formatCharacterCount, } from './utils';
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/middleware/services/tts/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAGH,OAAO,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,eAAe,CAAC;AAGvD,OAAO,EACL,WAAW,EACX,YAAY,EACZ,WAAW,GACZ,MAAM,SAAS,CAAC;AAEjB,YAAY,EACV,YAAY,EACZ,WAAW,EACX,oBAAoB,EACpB,WAAW,EACX,mBAAmB,EACnB,cAAc,EACd,QAAQ,EACR,gBAAgB,EAChB,oBAAoB,EACpB,qBAAqB,EACrB,yBAAyB,EACzB,0BAA0B,EAC1B,6BAA6B,EAC7B,uBAAuB,EACvB,qBAAqB,EACrB,wBAAwB,EACxB,sBAAsB,EACtB,eAAe,GAChB,MAAM,SAAS,CAAC;AAEjB,OAAO,EACL,cAAc,EACd,eAAe,EACf,mBAAmB,EACnB,oBAAoB,EACpB,uBAAuB,EACvB,iBAAiB,EACjB,eAAe,EACf,kBAAkB,EAClB,gBAAgB,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/middleware/services/tts/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAGH,OAAO,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,eAAe,CAAC;AAGvD,OAAO,EACL,WAAW,EACX,YAAY,EACZ,WAAW,GACZ,MAAM,SAAS,CAAC;AAEjB,YAAY,EACV,YAAY,EACZ,WAAW,EACX,oBAAoB,EACpB,WAAW,EACX,mBAAmB,EACnB,cAAc,EACd,QAAQ,EACR,gBAAgB,EAChB,oBAAoB,EACpB,qBAAqB,EACrB,yBAAyB,EACzB,0BAA0B,EAC1B,6BAA6B,EAC7B,uBAAuB,EACvB,qBAAqB,EACrB,wBAAwB,EACxB,sBAAsB,EACtB,0BAA0B,EAC1B,eAAe,GAChB,MAAM,SAAS,CAAC;AAEjB,OAAO,EACL,cAAc,EACd,eAAe,EACf,mBAAmB,EACnB,oBAAoB,EACpB,uBAAuB,EACvB,iBAAiB,EACjB,eAAe,EACf,kBAAkB,EAClB,gBAAgB,EAChB,oBAAoB,GACrB,MAAM,SAAS,CAAC;AAGjB,OAAO,EACL,eAAe,EACf,aAAa,EACb,cAAc,EACd,iBAAiB,EACjB,sBAAsB,EACtB,eAAe,EACf,mBAAmB,GACpB,MAAM,aAAa,CAAC;AAErB,YAAY,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAC;AAErD,YAAY,EACV,oBAAoB,EACpB,oBAAoB,GACrB,MAAM,aAAa,CAAC;AAGrB,OAAO,EACL,QAAQ,EACR,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,wBAAwB,EACxB,oBAAoB,EACpB,YAAY,GACb,MAAM,aAAa,CAAC;AAGrB,OAAO,EACL,eAAe,EACf,0BAA0B,EAC1B,sBAAsB,EACtB,uBAAuB,EACvB,qBAAqB,EACrB,oBAAoB,GACrB,MAAM,SAAS,CAAC;AAGjB,OAAO,EACL,SAAS,EACT,SAAS,EACT,WAAW,EACX,WAAW,EACX,WAAW,EACX,YAAY,GACb,MAAM,SAAS,CAAC;AAEjB,YAAY,EAAE,SAAS,EAAE,QAAQ,EAAE,MAAM,SAAS,CAAC;AAGnD,OAAO,EACL,gBAAgB,EAChB,gBAAgB,EAChB,oBAAoB,GACrB,MAAM,SAAS,CAAC;AAEjB,YAAY,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC"}
|
|
@@ -20,7 +20,7 @@
|
|
|
20
20
|
* @module @loonylabs/tts-middleware
|
|
21
21
|
*/
|
|
22
22
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
23
|
-
exports.DEFAULT_RETRY_CONFIG = exports.isRetryableError = exports.executeWithRetry = exports.silentLogger = exports.getLogLevel = exports.setLogLevel = exports.resetLogger = exports.getLogger = exports.setLogger = exports.formatCharacterCount = exports.estimateAudioDuration = exports.countBillableCharacters = exports.validateCharacterCount = exports.countCharactersWithoutSSML = exports.countCharacters = exports.NetworkError = exports.SynthesisFailedError = exports.ProviderUnavailableError = exports.QuotaExceededError = exports.InvalidVoiceError = exports.InvalidConfigError = exports.TTSError = exports.InworldProvider = exports.GoogleCloudTTSProvider = exports.FishAudioProvider = exports.EdenAIProvider = exports.AzureProvider = exports.BaseTTSProvider = exports.isInworldOptions = exports.isFishAudioOptions = exports.isEdenAIOptions = exports.isDeepgramOptions = exports.isGoogleCloudTTSOptions = exports.isGoogleCloudOptions = exports.isElevenLabsOptions = exports.isOpenAIOptions = exports.isAzureOptions = exports.TTSErrorCode = exports.TTSProvider = exports.ttsService = exports.TTSService = void 0;
|
|
23
|
+
exports.DEFAULT_RETRY_CONFIG = exports.isRetryableError = exports.executeWithRetry = exports.silentLogger = exports.getLogLevel = exports.setLogLevel = exports.resetLogger = exports.getLogger = exports.setLogger = exports.formatCharacterCount = exports.estimateAudioDuration = exports.countBillableCharacters = exports.validateCharacterCount = exports.countCharactersWithoutSSML = exports.countCharacters = exports.NetworkError = exports.SynthesisFailedError = exports.ProviderUnavailableError = exports.QuotaExceededError = exports.InvalidVoiceError = exports.InvalidConfigError = exports.TTSError = exports.VertexAITTSProvider = exports.InworldProvider = exports.GoogleCloudTTSProvider = exports.FishAudioProvider = exports.EdenAIProvider = exports.AzureProvider = exports.BaseTTSProvider = exports.isVertexAITTSOptions = exports.isInworldOptions = exports.isFishAudioOptions = exports.isEdenAIOptions = exports.isDeepgramOptions = exports.isGoogleCloudTTSOptions = exports.isGoogleCloudOptions = exports.isElevenLabsOptions = exports.isOpenAIOptions = exports.isAzureOptions = exports.TTSErrorCode = exports.TTSProvider = exports.ttsService = exports.TTSService = void 0;
|
|
24
24
|
// ===== Main Service =====
|
|
25
25
|
var tts_service_1 = require("./tts.service");
|
|
26
26
|
Object.defineProperty(exports, "TTSService", { enumerable: true, get: function () { return tts_service_1.TTSService; } });
|
|
@@ -39,6 +39,7 @@ Object.defineProperty(exports, "isDeepgramOptions", { enumerable: true, get: fun
|
|
|
39
39
|
Object.defineProperty(exports, "isEdenAIOptions", { enumerable: true, get: function () { return types_2.isEdenAIOptions; } });
|
|
40
40
|
Object.defineProperty(exports, "isFishAudioOptions", { enumerable: true, get: function () { return types_2.isFishAudioOptions; } });
|
|
41
41
|
Object.defineProperty(exports, "isInworldOptions", { enumerable: true, get: function () { return types_2.isInworldOptions; } });
|
|
42
|
+
Object.defineProperty(exports, "isVertexAITTSOptions", { enumerable: true, get: function () { return types_2.isVertexAITTSOptions; } });
|
|
42
43
|
// ===== Providers =====
|
|
43
44
|
var providers_1 = require("./providers");
|
|
44
45
|
Object.defineProperty(exports, "BaseTTSProvider", { enumerable: true, get: function () { return providers_1.BaseTTSProvider; } });
|
|
@@ -47,6 +48,7 @@ Object.defineProperty(exports, "EdenAIProvider", { enumerable: true, get: functi
|
|
|
47
48
|
Object.defineProperty(exports, "FishAudioProvider", { enumerable: true, get: function () { return providers_1.FishAudioProvider; } });
|
|
48
49
|
Object.defineProperty(exports, "GoogleCloudTTSProvider", { enumerable: true, get: function () { return providers_1.GoogleCloudTTSProvider; } });
|
|
49
50
|
Object.defineProperty(exports, "InworldProvider", { enumerable: true, get: function () { return providers_1.InworldProvider; } });
|
|
51
|
+
Object.defineProperty(exports, "VertexAITTSProvider", { enumerable: true, get: function () { return providers_1.VertexAITTSProvider; } });
|
|
50
52
|
// ===== Errors =====
|
|
51
53
|
var providers_2 = require("./providers");
|
|
52
54
|
Object.defineProperty(exports, "TTSError", { enumerable: true, get: function () { return providers_2.TTSError; } });
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/middleware/services/tts/index.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;;;;;GAmBG;;;AAEH,2BAA2B;AAC3B,6CAAuD;AAA9C,yGAAA,UAAU,OAAA;AAAE,yGAAA,UAAU,OAAA;AAE/B,oBAAoB;AACpB,iCAIiB;AAHf,oGAAA,WAAW,OAAA;AACX,qGAAA,YAAY,OAAA;
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/middleware/services/tts/index.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;;;;;GAmBG;;;AAEH,2BAA2B;AAC3B,6CAAuD;AAA9C,yGAAA,UAAU,OAAA;AAAE,yGAAA,UAAU,OAAA;AAE/B,oBAAoB;AACpB,iCAIiB;AAHf,oGAAA,WAAW,OAAA;AACX,qGAAA,YAAY,OAAA;AA0Bd,iCAWiB;AAVf,uGAAA,cAAc,OAAA;AACd,wGAAA,eAAe,OAAA;AACf,4GAAA,mBAAmB,OAAA;AACnB,6GAAA,oBAAoB,OAAA;AACpB,gHAAA,uBAAuB,OAAA;AACvB,0GAAA,iBAAiB,OAAA;AACjB,wGAAA,eAAe,OAAA;AACf,2GAAA,kBAAkB,OAAA;AAClB,yGAAA,gBAAgB,OAAA;AAChB,6GAAA,oBAAoB,OAAA;AAGtB,wBAAwB;AACxB,yCAQqB;AAPnB,4GAAA,eAAe,OAAA;AACf,0GAAA,aAAa,OAAA;AACb,2GAAA,cAAc,OAAA;AACd,8GAAA,iBAAiB,OAAA;AACjB,mHAAA,sBAAsB,OAAA;AACtB,4GAAA,eAAe,OAAA;AACf,gHAAA,mBAAmB,OAAA;AAUrB,qBAAqB;AACrB,yCAQqB;AAPnB,qGAAA,QAAQ,OAAA;AACR,+GAAA,kBAAkB,OAAA;AAClB,8GAAA,iBAAiB,OAAA;AACjB,+GAAA,kBAAkB,OAAA;AAClB,qHAAA,wBAAwB,OAAA;AACxB,iHAAA,oBAAoB,OAAA;AACpB,yGAAA,YAAY,OAAA;AAGd,wBAAwB;AACxB,iCAOiB;AANf,wGAAA,eAAe,OAAA;AACf,mHAAA,0BAA0B,OAAA;AAC1B,+GAAA,sBAAsB,OAAA;AACtB,gHAAA,uBAAuB,OAAA;AACvB,8GAAA,qBAAqB,OAAA;AACrB,6GAAA,oBAAoB,OAAA;AAGtB,qBAAqB;AACrB,iCAOiB;AANf,kGAAA,SAAS,OAAA;AACT,kGAAA,SAAS,OAAA;AACT,oGAAA,WAAW,OAAA;AACX,oGAAA,WAAW,OAAA;AACX,oGAAA,WAAW,OAAA;AACX,qGAAA,YAAY,OAAA;AAKd,oBAAoB;AACpB,iCAIiB;AAHf,yGAAA,gBAAgB,OAAA;AAChB,yGAAA,gBAAgB,OAAA;AAChB,6GAAA,oBAAoB,OAAA"}
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Gemini TTS Provider
|
|
3
|
+
*
|
|
4
|
+
* @description Provider for Google Gemini TTS via Vertex AI, using the generateContent
|
|
5
|
+
* endpoint with responseModalities: ['AUDIO']. Authenticates via Service Account
|
|
6
|
+
* (same as Google Cloud TTS — reuses GOOGLE_APPLICATION_CREDENTIALS).
|
|
7
|
+
*
|
|
8
|
+
* Supports 30 multilingual voices with auto-detect language and natural language
|
|
9
|
+
* style control. Output is raw PCM (24kHz, 16-bit, mono) which is converted to
|
|
10
|
+
* MP3 via ffmpeg or WAV as fallback.
|
|
11
|
+
*
|
|
12
|
+
* Test/Admin only -- no EU data residency guarantees.
|
|
13
|
+
*
|
|
14
|
+
* @see https://cloud.google.com/vertex-ai/generative-ai/docs/text-to-speech
|
|
15
|
+
*/
|
|
16
|
+
import type { TTSSynthesizeRequest, TTSResponse } from '../types';
|
|
17
|
+
import { BaseTTSProvider } from './base-tts-provider';
|
|
18
|
+
/**
|
|
19
|
+
* Gemini TTS configuration (Vertex AI)
|
|
20
|
+
*/
|
|
21
|
+
export interface GeminiConfig {
|
|
22
|
+
/**
|
|
23
|
+
* Path to Service Account JSON file
|
|
24
|
+
* @env GOOGLE_APPLICATION_CREDENTIALS
|
|
25
|
+
*/
|
|
26
|
+
keyFilename?: string;
|
|
27
|
+
/**
|
|
28
|
+
* Google Cloud Project ID
|
|
29
|
+
* @env GOOGLE_CLOUD_PROJECT
|
|
30
|
+
*/
|
|
31
|
+
projectId?: string;
|
|
32
|
+
/**
|
|
33
|
+
* Vertex AI region
|
|
34
|
+
* @env GEMINI_REGION
|
|
35
|
+
* @default 'us-central1'
|
|
36
|
+
*/
|
|
37
|
+
region?: string;
|
|
38
|
+
}
|
|
39
|
+
/**
|
|
40
|
+
* Gemini TTS provider implementation
|
|
41
|
+
*
|
|
42
|
+
* @description Provides TTS synthesis using Google's Gemini generateContent API
|
|
43
|
+
* via Vertex AI. Authenticates with Service Account OAuth2 (same credentials as
|
|
44
|
+
* Google Cloud TTS). Gemini outputs raw PCM which is converted to MP3 (via ffmpeg)
|
|
45
|
+
* or WAV (pure Node.js fallback).
|
|
46
|
+
*
|
|
47
|
+
* Billing: Token-based ($0.50-1.00/M input + $10-20/M audio output tokens).
|
|
48
|
+
* For billing compatibility, reports character count like all other providers.
|
|
49
|
+
*
|
|
50
|
+
* @example
|
|
51
|
+
* ```typescript
|
|
52
|
+
* const provider = new GeminiProvider();
|
|
53
|
+
* const response = await provider.synthesize(
|
|
54
|
+
* "Hello World",
|
|
55
|
+
* "Kore",
|
|
56
|
+
* {
|
|
57
|
+
* text: "Hello World",
|
|
58
|
+
* voice: { id: "Kore" },
|
|
59
|
+
* audio: { format: "mp3" },
|
|
60
|
+
* providerOptions: {
|
|
61
|
+
* model: "gemini-2.5-flash-preview-tts",
|
|
62
|
+
* stylePrompt: "Say cheerfully:"
|
|
63
|
+
* }
|
|
64
|
+
* }
|
|
65
|
+
* );
|
|
66
|
+
* ```
|
|
67
|
+
*/
|
|
68
|
+
export declare class GeminiProvider extends BaseTTSProvider {
|
|
69
|
+
private config;
|
|
70
|
+
private authClient;
|
|
71
|
+
/**
|
|
72
|
+
* Creates a new Gemini TTS provider
|
|
73
|
+
*
|
|
74
|
+
* @param config - Optional configuration (uses env vars if not provided)
|
|
75
|
+
* @throws {InvalidConfigError} If credentials are missing
|
|
76
|
+
*/
|
|
77
|
+
constructor(config?: Partial<GeminiConfig>);
|
|
78
|
+
/**
|
|
79
|
+
* Validate Gemini configuration
|
|
80
|
+
*
|
|
81
|
+
* @private
|
|
82
|
+
* @throws {InvalidConfigError} If configuration is invalid
|
|
83
|
+
*/
|
|
84
|
+
private validateGeminiConfig;
|
|
85
|
+
/**
|
|
86
|
+
* Get an authenticated access token via Service Account
|
|
87
|
+
*
|
|
88
|
+
* @private
|
|
89
|
+
* @returns OAuth2 access token
|
|
90
|
+
*/
|
|
91
|
+
private getAccessToken;
|
|
92
|
+
/**
|
|
93
|
+
* Synthesize text to speech using Gemini TTS
|
|
94
|
+
*
|
|
95
|
+
* @param text - The input text to synthesize
|
|
96
|
+
* @param voiceId - The voice name (e.g. "Kore", "Puck", "Charon")
|
|
97
|
+
* @param request - The full synthesis request with options
|
|
98
|
+
* @returns Promise resolving to the synthesis response
|
|
99
|
+
*/
|
|
100
|
+
synthesize(text: string, voiceId: string, request: TTSSynthesizeRequest): Promise<TTSResponse>;
|
|
101
|
+
/**
|
|
102
|
+
* Build Gemini generateContent request payload
|
|
103
|
+
*
|
|
104
|
+
* @private
|
|
105
|
+
*/
|
|
106
|
+
private buildRequest;
|
|
107
|
+
/**
|
|
108
|
+
* Call Gemini generateContent API via Vertex AI
|
|
109
|
+
*
|
|
110
|
+
* @private
|
|
111
|
+
* @param requestBody - The request payload
|
|
112
|
+
* @param model - The Gemini model to use
|
|
113
|
+
* @returns Promise resolving to raw PCM audio buffer
|
|
114
|
+
*/
|
|
115
|
+
private callAPI;
|
|
116
|
+
/**
|
|
117
|
+
* Convert raw PCM audio to the requested format
|
|
118
|
+
*
|
|
119
|
+
* @private
|
|
120
|
+
* @param pcmBuffer - Raw PCM buffer (24kHz, 16-bit, mono, little-endian)
|
|
121
|
+
* @param requestedFormat - The desired output format ('mp3', 'wav', etc.)
|
|
122
|
+
* @returns The converted audio buffer and actual format used
|
|
123
|
+
*/
|
|
124
|
+
private convertPcmAudio;
|
|
125
|
+
/**
|
|
126
|
+
* Convert raw PCM to MP3 using ffmpeg via child_process
|
|
127
|
+
*
|
|
128
|
+
* @private
|
|
129
|
+
* @param pcmBuffer - Raw PCM buffer (24kHz, 16-bit, mono, little-endian)
|
|
130
|
+
* @returns Promise resolving to MP3 buffer
|
|
131
|
+
*/
|
|
132
|
+
private pcmToMp3;
|
|
133
|
+
/**
|
|
134
|
+
* Convert raw PCM to WAV by prepending a 44-byte WAV header
|
|
135
|
+
*
|
|
136
|
+
* @private
|
|
137
|
+
* @param pcmBuffer - Raw PCM buffer (24kHz, 16-bit, mono, little-endian)
|
|
138
|
+
* @returns WAV buffer
|
|
139
|
+
*/
|
|
140
|
+
private pcmToWav;
|
|
141
|
+
}
|
|
142
|
+
//# sourceMappingURL=gemini-provider.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"gemini-provider.d.ts","sourceRoot":"","sources":["../../../../../src/middleware/services/tts/providers/gemini-provider.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAGH,OAAO,KAAK,EAAE,oBAAoB,EAAE,WAAW,EAAE,MAAM,UAAU,CAAC;AAGlE,OAAO,EACL,eAAe,EAEhB,MAAM,qBAAqB,CAAC;AAG7B;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;;OAGG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB;;;;OAIG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAMD;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AACH,qBAAa,cAAe,SAAQ,eAAe;IACjD,OAAO,CAAC,MAAM,CAAe;IAC7B,OAAO,CAAC,UAAU,CAA6E;IAE/F;;;;;OAKG;gBACS,MAAM,CAAC,EAAE,OAAO,CAAC,YAAY,CAAC;IAkB1C;;;;;OAKG;IACH,OAAO,CAAC,oBAAoB;IAgB5B;;;;;OAKG;YACW,cAAc;IAqB5B;;;;;;;OAOG;IACG,UAAU,CACd,IAAI,EAAE,MAAM,EACZ,OAAO,EAAE,MAAM,EACf,OAAO,EAAE,oBAAoB,GAC5B,OAAO,CAAC,WAAW,CAAC;IAsDvB;;;;OAIG;IACH,OAAO,CAAC,YAAY;IA6BpB;;;;;;;OAOG;YACW,OAAO;IA6CrB;;;;;;;OAOG;YACW,eAAe;IA0B7B;;;;;;OAMG;IACH,OAAO,CAAC,QAAQ;IAkChB;;;;;;OAMG;IACH,OAAO,CAAC,QAAQ;CAwBjB"}
|
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Gemini TTS Provider
|
|
4
|
+
*
|
|
5
|
+
* @description Provider for Google Gemini TTS via Vertex AI, using the generateContent
|
|
6
|
+
* endpoint with responseModalities: ['AUDIO']. Authenticates via Service Account
|
|
7
|
+
* (same as Google Cloud TTS — reuses GOOGLE_APPLICATION_CREDENTIALS).
|
|
8
|
+
*
|
|
9
|
+
* Supports 30 multilingual voices with auto-detect language and natural language
|
|
10
|
+
* style control. Output is raw PCM (24kHz, 16-bit, mono) which is converted to
|
|
11
|
+
* MP3 via ffmpeg or WAV as fallback.
|
|
12
|
+
*
|
|
13
|
+
* Test/Admin only -- no EU data residency guarantees.
|
|
14
|
+
*
|
|
15
|
+
* @see https://cloud.google.com/vertex-ai/generative-ai/docs/text-to-speech
|
|
16
|
+
*/
|
|
17
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
18
|
+
if (k2 === undefined) k2 = k;
|
|
19
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
20
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
21
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
22
|
+
}
|
|
23
|
+
Object.defineProperty(o, k2, desc);
|
|
24
|
+
}) : (function(o, m, k, k2) {
|
|
25
|
+
if (k2 === undefined) k2 = k;
|
|
26
|
+
o[k2] = m[k];
|
|
27
|
+
}));
|
|
28
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
29
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
30
|
+
}) : function(o, v) {
|
|
31
|
+
o["default"] = v;
|
|
32
|
+
});
|
|
33
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
34
|
+
var ownKeys = function(o) {
|
|
35
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
36
|
+
var ar = [];
|
|
37
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
38
|
+
return ar;
|
|
39
|
+
};
|
|
40
|
+
return ownKeys(o);
|
|
41
|
+
};
|
|
42
|
+
return function (mod) {
|
|
43
|
+
if (mod && mod.__esModule) return mod;
|
|
44
|
+
var result = {};
|
|
45
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
46
|
+
__setModuleDefault(result, mod);
|
|
47
|
+
return result;
|
|
48
|
+
};
|
|
49
|
+
})();
|
|
50
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
51
|
+
exports.GeminiProvider = void 0;
|
|
52
|
+
const child_process_1 = require("child_process");
|
|
53
|
+
const types_1 = require("../types");
|
|
54
|
+
const mp3_duration_utils_1 = require("../utils/mp3-duration.utils");
|
|
55
|
+
const base_tts_provider_1 = require("./base-tts-provider");
|
|
56
|
+
const DEFAULT_MODEL = 'gemini-2.5-flash-preview-tts';
|
|
57
|
+
const DEFAULT_SAMPLE_RATE = 24000;
|
|
58
|
+
const DEFAULT_REGION = 'us-central1';
|
|
59
|
+
/**
|
|
60
|
+
* Gemini TTS provider implementation
|
|
61
|
+
*
|
|
62
|
+
* @description Provides TTS synthesis using Google's Gemini generateContent API
|
|
63
|
+
* via Vertex AI. Authenticates with Service Account OAuth2 (same credentials as
|
|
64
|
+
* Google Cloud TTS). Gemini outputs raw PCM which is converted to MP3 (via ffmpeg)
|
|
65
|
+
* or WAV (pure Node.js fallback).
|
|
66
|
+
*
|
|
67
|
+
* Billing: Token-based ($0.50-1.00/M input + $10-20/M audio output tokens).
|
|
68
|
+
* For billing compatibility, reports character count like all other providers.
|
|
69
|
+
*
|
|
70
|
+
* @example
|
|
71
|
+
* ```typescript
|
|
72
|
+
* const provider = new GeminiProvider();
|
|
73
|
+
* const response = await provider.synthesize(
|
|
74
|
+
* "Hello World",
|
|
75
|
+
* "Kore",
|
|
76
|
+
* {
|
|
77
|
+
* text: "Hello World",
|
|
78
|
+
* voice: { id: "Kore" },
|
|
79
|
+
* audio: { format: "mp3" },
|
|
80
|
+
* providerOptions: {
|
|
81
|
+
* model: "gemini-2.5-flash-preview-tts",
|
|
82
|
+
* stylePrompt: "Say cheerfully:"
|
|
83
|
+
* }
|
|
84
|
+
* }
|
|
85
|
+
* );
|
|
86
|
+
* ```
|
|
87
|
+
*/
|
|
88
|
+
class GeminiProvider extends base_tts_provider_1.BaseTTSProvider {
    /**
     * Creates a new Gemini TTS provider.
     *
     * Each setting is taken from `config` when it is truthy, otherwise from
     * the environment: GOOGLE_APPLICATION_CREDENTIALS, GOOGLE_CLOUD_PROJECT
     * and GEMINI_REGION (which itself falls back to DEFAULT_REGION).
     *
     * @param config - Optional configuration (uses env vars if not provided)
     * @throws {InvalidConfigError} If credentials or the project ID are missing
     */
    constructor(config) {
        super(types_1.TTSProvider.GEMINI);
        // Created lazily by getAccessToken() on first use.
        this.authClient = null;
        this.config = {
            keyFilename: config?.keyFilename || process.env.GOOGLE_APPLICATION_CREDENTIALS,
            projectId: config?.projectId || process.env.GOOGLE_CLOUD_PROJECT,
            region: config?.region || process.env.GEMINI_REGION || DEFAULT_REGION,
        };
        this.validateGeminiConfig();
        // The project ID is masked and the key path reduced to a boolean so
        // that neither value ends up in the logs.
        this.log('info', 'Gemini TTS provider initialized', {
            hasCredentials: !!this.config.keyFilename,
            projectId: this.config.projectId ? '***' : undefined,
            region: this.config.region,
        });
    }
    /**
     * Validate Gemini configuration
     *
     * Checks that both a credentials key file and a project ID were resolved
     * by the constructor.
     *
     * @private
     * @throws {InvalidConfigError} If configuration is invalid
     */
    validateGeminiConfig() {
        if (!this.config.keyFilename) {
            throw new base_tts_provider_1.InvalidConfigError(this.providerName, 'Google Cloud credentials are required for Gemini TTS (GOOGLE_APPLICATION_CREDENTIALS)');
        }
        if (!this.config.projectId) {
            throw new base_tts_provider_1.InvalidConfigError(this.providerName, 'Google Cloud Project ID is required for Gemini TTS (GOOGLE_CLOUD_PROJECT)');
        }
    }
    /**
     * Get an authenticated access token via Service Account
     *
     * The auth client is built on the first call and cached on the instance;
     * later calls only ask that client for a token.
     *
     * @private
     * @returns OAuth2 access token
     * @throws {InvalidConfigError} If the auth client yields no token
     */
    async getAccessToken() {
        if (!this.authClient) {
            // Loaded on demand so the dependency is only required when this
            // provider is actually used.
            const { GoogleAuth } = await Promise.resolve().then(() => __importStar(require('google-auth-library')));
            const auth = new GoogleAuth({
                keyFilename: this.config.keyFilename,
                scopes: ['https://www.googleapis.com/auth/cloud-platform'],
            });
            this.authClient = await auth.getClient();
        }
        const tokenResponse = await this.authClient.getAccessToken();
        if (!tokenResponse.token) {
            throw new base_tts_provider_1.InvalidConfigError(this.providerName, 'Failed to obtain access token from Service Account');
        }
        return tokenResponse.token;
    }
    /**
     * Synthesize text to speech using Gemini TTS
     *
     * Builds the request, calls the API, converts the returned PCM audio to
     * the requested container and assembles the response object.
     *
     * @param text - The input text to synthesize
     * @param voiceId - The voice name (e.g. "Kore", "Puck", "Charon")
     * @param request - The full synthesis request with options
     * @returns Promise resolving to the synthesis response
     * @throws The result of `handleError` for any failure during the call
     */
    async synthesize(text, voiceId, request) {
        this.validateConfig(request);
        const startTime = Date.now();
        const options = (request.providerOptions || {});
        const model = options.model || DEFAULT_MODEL;
        const requestedFormat = request.audio?.format || 'mp3';
        const requestBody = this.buildRequest(text, voiceId, options);
        this.log('debug', 'Synthesizing with Gemini TTS', {
            voiceId,
            model,
            textLength: text.length,
            requestedFormat,
        });
        try {
            const pcmBuffer = await this.callAPI(requestBody, model);
            // audioFormat is the format actually produced; it can differ from
            // requestedFormat when the MP3 conversion falls back to WAV.
            const { audioBuffer, audioFormat } = await this.convertPcmAudio(pcmBuffer, requestedFormat);
            const duration = Date.now() - startTime;
            this.log('info', 'Synthesis successful', {
                voiceId,
                characters: text.length,
                duration,
                audioSize: audioBuffer.length,
                audioFormat,
            });
            return {
                audio: audioBuffer,
                metadata: {
                    provider: this.providerName,
                    voice: voiceId,
                    duration,
                    // Only MP3 output is measured; other formats report undefined.
                    audioDuration: audioFormat === 'mp3' ? (0, mp3_duration_utils_1.getMp3Duration)(audioBuffer) : undefined,
                    audioFormat,
                    sampleRate: DEFAULT_SAMPLE_RATE,
                },
                billing: {
                    // Counted on the caller's text only, without any style prompt.
                    characters: this.countCharacters(text),
                },
            };
        }
        catch (error) {
            this.log('error', 'Synthesis failed', {
                voiceId,
                // Tolerate thrown values that are not Error instances so the
                // handler itself cannot throw and hide the original failure.
                error: error?.message ?? String(error),
            });
            throw this.handleError(error, 'during Gemini TTS API call');
        }
    }
    /**
     * Build Gemini generateContent request payload
     *
     * When `options.stylePrompt` is set it is prepended to the text,
     * separated by a single space.
     *
     * @private
     * @param text - The text to synthesize
     * @param voiceId - Name placed in `prebuiltVoiceConfig.voiceName`
     * @param options - Provider options; only `stylePrompt` is read here
     * @returns The request body for the generateContent call
     */
    buildRequest(text, voiceId, options) {
        const synthesisText = options.stylePrompt
            ? `${options.stylePrompt} ${text}`
            : text;
        return {
            contents: [
                {
                    role: 'user',
                    parts: [{ text: synthesisText }],
                },
            ],
            generationConfig: {
                responseModalities: ['AUDIO'],
                speechConfig: {
                    voiceConfig: {
                        prebuiltVoiceConfig: {
                            voiceName: voiceId,
                        },
                    },
                },
            },
        };
    }
    /**
     * Call Gemini generateContent API via Vertex AI
     *
     * @private
     * @param requestBody - The request payload
     * @param model - The Gemini model to use
     * @returns Promise resolving to raw PCM audio buffer
     * @throws {Error} On a non-2xx response or when no part carries audio data
     */
    async callAPI(requestBody, model) {
        const accessToken = await this.getAccessToken();
        const region = this.config.region || DEFAULT_REGION;
        const projectId = this.config.projectId;
        // The "global" location is served from the un-prefixed host; every
        // other location uses its "<region>-aiplatform" regional host.
        const host = region === 'global'
            ? 'aiplatform.googleapis.com'
            : `${region}-aiplatform.googleapis.com`;
        const url = `https://${host}/v1/projects/${projectId}/locations/${region}/publishers/google/models/${model}:generateContent`;
        const response = await fetch(url, {
            method: 'POST',
            headers: {
                'Authorization': `Bearer ${accessToken}`,
                'Content-Type': 'application/json',
            },
            body: JSON.stringify(requestBody),
        });
        if (!response.ok) {
            const errorText = await response.text();
            throw new Error(`Gemini API error (${response.status}): ${errorText}`);
        }
        const responseJson = await response.json();
        // Scan every part of the first candidate instead of assuming the
        // audio is at index 0, so a leading non-audio part is skipped.
        const parts = responseJson.candidates?.[0]?.content?.parts;
        const audioPart = Array.isArray(parts)
            ? parts.find((part) => part?.inlineData?.data)
            : undefined;
        if (!audioPart) {
            throw new Error('Gemini API returned no audio data');
        }
        return Buffer.from(audioPart.inlineData.data, 'base64');
    }
    /**
     * Convert raw PCM audio to the requested format
     *
     * 'wav' is produced directly. Every other requested format is encoded as
     * MP3 through ffmpeg; if that fails the audio is returned as WAV instead.
     *
     * @private
     * @param pcmBuffer - Raw PCM buffer (24kHz, 16-bit, mono, little-endian)
     * @param requestedFormat - The desired output format ('mp3', 'wav', etc.)
     * @returns The converted audio buffer and actual format used
     */
    async convertPcmAudio(pcmBuffer, requestedFormat) {
        if (requestedFormat === 'wav') {
            return {
                audioBuffer: this.pcmToWav(pcmBuffer),
                audioFormat: 'wav',
            };
        }
        // For mp3 (and any other format), try ffmpeg first, fall back to WAV
        try {
            const mp3Buffer = await this.pcmToMp3(pcmBuffer);
            return { audioBuffer: mp3Buffer, audioFormat: 'mp3' };
        }
        catch (error) {
            this.log('warn', 'ffmpeg not available, falling back to WAV output', {
                error: error?.message ?? String(error),
            });
            return {
                audioBuffer: this.pcmToWav(pcmBuffer),
                audioFormat: 'wav',
            };
        }
    }
    /**
     * Convert raw PCM to MP3 using ffmpeg via child_process
     *
     * The PCM data is piped to ffmpeg's stdin and the encoded MP3 is
     * collected from its stdout.
     *
     * @private
     * @param pcmBuffer - Raw PCM buffer (24kHz, 16-bit, mono, little-endian)
     * @returns Promise resolving to MP3 buffer; rejects if ffmpeg cannot be
     *          spawned, the input pipe fails, or ffmpeg exits non-zero
     */
    pcmToMp3(pcmBuffer) {
        return new Promise((resolve, reject) => {
            const ffmpeg = (0, child_process_1.spawn)('ffmpeg', [
                '-f', 's16le',
                '-ar', String(DEFAULT_SAMPLE_RATE),
                '-ac', '1',
                '-i', 'pipe:0',
                '-codec:a', 'libmp3lame',
                '-b:a', '128k',
                '-f', 'mp3',
                'pipe:1',
            ]);
            const chunks = [];
            const stderrChunks = [];
            let stdinError = null;
            ffmpeg.stdout.on('data', (chunk) => chunks.push(chunk));
            // Keep stderr so a failed run can say why it failed.
            ffmpeg.stderr.on('data', (chunk) => stderrChunks.push(chunk));
            ffmpeg.on('error', (err) => {
                reject(new Error(`ffmpeg spawn failed: ${err.message}`));
            });
            // Without a listener, an EPIPE on the input pipe (ffmpeg missing
            // or exiting early) is an unhandled stream error that can take
            // down the process. Record it and let 'error'/'close' settle the
            // promise with the full picture.
            ffmpeg.stdin.on('error', (err) => {
                stdinError = err;
            });
            ffmpeg.on('close', (code) => {
                if (code === 0 && !stdinError) {
                    resolve(Buffer.concat(chunks));
                    return;
                }
                const details = [
                    stdinError ? `stdin: ${stdinError.message}` : '',
                    // Only the tail of stderr is kept to bound the message size.
                    Buffer.concat(stderrChunks).toString('utf8').trim().slice(-500),
                ].filter(Boolean).join(' | ');
                const suffix = details ? `: ${details}` : '';
                reject(new Error(`ffmpeg exited with code ${code}${suffix}`));
            });
            ffmpeg.stdin.write(pcmBuffer);
            ffmpeg.stdin.end();
        });
    }
    /**
     * Convert raw PCM to WAV by prepending a 44-byte WAV header
     *
     * The header describes mono, 16-bit PCM at DEFAULT_SAMPLE_RATE.
     *
     * @private
     * @param pcmBuffer - Raw PCM buffer (24kHz, 16-bit, mono, little-endian)
     * @returns WAV buffer
     */
    pcmToWav(pcmBuffer) {
        const channels = 1;
        const bitsPerSample = 16;
        const byteRate = DEFAULT_SAMPLE_RATE * channels * (bitsPerSample / 8);
        const blockAlign = channels * (bitsPerSample / 8);
        const dataLength = pcmBuffer.length;
        const header = Buffer.alloc(44);
        header.write('RIFF', 0);
        // RIFF chunk size = everything after this field: 36 header bytes + data.
        header.writeUInt32LE(36 + dataLength, 4);
        header.write('WAVE', 8);
        header.write('fmt ', 12);
        header.writeUInt32LE(16, 16); // PCM chunk size
        header.writeUInt16LE(1, 20); // PCM format
        header.writeUInt16LE(channels, 22);
        header.writeUInt32LE(DEFAULT_SAMPLE_RATE, 24);
        header.writeUInt32LE(byteRate, 28);
        header.writeUInt16LE(blockAlign, 32);
        header.writeUInt16LE(bitsPerSample, 34);
        header.write('data', 36);
        header.writeUInt32LE(dataLength, 40);
        return Buffer.concat([header, pcmBuffer]);
    }
}
|
|
357
|
+
exports.GeminiProvider = GeminiProvider;
|
|
358
|
+
//# sourceMappingURL=gemini-provider.js.map
|