@loonylabs/tts-middleware 0.8.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/README.md +36 -2
  2. package/dist/middleware/services/tts/index.d.ts +4 -3
  3. package/dist/middleware/services/tts/index.d.ts.map +1 -1
  4. package/dist/middleware/services/tts/index.js +3 -1
  5. package/dist/middleware/services/tts/index.js.map +1 -1
  6. package/dist/middleware/services/tts/providers/gemini-provider.d.ts +142 -0
  7. package/dist/middleware/services/tts/providers/gemini-provider.d.ts.map +1 -0
  8. package/dist/middleware/services/tts/providers/gemini-provider.js +358 -0
  9. package/dist/middleware/services/tts/providers/gemini-provider.js.map +1 -0
  10. package/dist/middleware/services/tts/providers/index.d.ts +2 -0
  11. package/dist/middleware/services/tts/providers/index.d.ts.map +1 -1
  12. package/dist/middleware/services/tts/providers/index.js +3 -1
  13. package/dist/middleware/services/tts/providers/index.js.map +1 -1
  14. package/dist/middleware/services/tts/providers/vertex-ai-tts-provider.d.ts +168 -0
  15. package/dist/middleware/services/tts/providers/vertex-ai-tts-provider.d.ts.map +1 -0
  16. package/dist/middleware/services/tts/providers/vertex-ai-tts-provider.js +416 -0
  17. package/dist/middleware/services/tts/providers/vertex-ai-tts-provider.js.map +1 -0
  18. package/dist/middleware/services/tts/tts.service.d.ts.map +1 -1
  19. package/dist/middleware/services/tts/tts.service.js +13 -0
  20. package/dist/middleware/services/tts/tts.service.js.map +1 -1
  21. package/dist/middleware/services/tts/types/common.types.d.ts +12 -1
  22. package/dist/middleware/services/tts/types/common.types.d.ts.map +1 -1
  23. package/dist/middleware/services/tts/types/common.types.js +1 -0
  24. package/dist/middleware/services/tts/types/common.types.js.map +1 -1
  25. package/dist/middleware/services/tts/types/index.d.ts +2 -2
  26. package/dist/middleware/services/tts/types/index.d.ts.map +1 -1
  27. package/dist/middleware/services/tts/types/index.js +2 -1
  28. package/dist/middleware/services/tts/types/index.js.map +1 -1
  29. package/dist/middleware/services/tts/types/provider-options.types.d.ts +85 -1
  30. package/dist/middleware/services/tts/types/provider-options.types.d.ts.map +1 -1
  31. package/dist/middleware/services/tts/types/provider-options.types.js +13 -0
  32. package/dist/middleware/services/tts/types/provider-options.types.js.map +1 -1
  33. package/dist/middleware/services/tts/utils/retry.utils.d.ts +11 -0
  34. package/dist/middleware/services/tts/utils/retry.utils.d.ts.map +1 -1
  35. package/dist/middleware/services/tts/utils/retry.utils.js +23 -0
  36. package/dist/middleware/services/tts/utils/retry.utils.js.map +1 -1
  37. package/dist/middleware/shared/config/tts.config.d.ts +17 -0
  38. package/dist/middleware/shared/config/tts.config.d.ts.map +1 -1
  39. package/dist/middleware/shared/config/tts.config.js +14 -0
  40. package/dist/middleware/shared/config/tts.config.js.map +1 -1
  41. package/package.json +3 -2
package/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  # TTS Middleware
4
4
 
5
- *Provider-agnostic Text-to-Speech middleware with **GDPR compliance** support. Currently supports Azure Speech Services, EdenAI, Google Cloud TTS, Fish Audio, and Inworld AI. Features EU data residency via Azure and Google Cloud, pluggable logging, character-based billing, and comprehensive error handling.*
5
+ *Provider-agnostic Text-to-Speech middleware with **GDPR compliance** support. Currently supports Azure Speech Services, EdenAI, Google Cloud TTS, Fish Audio, Inworld AI, and Vertex AI TTS. Features EU data residency via Azure and Google Cloud, pluggable logging, character-based billing, and comprehensive error handling.*
6
6
 
7
7
  <!-- Horizontal Badge Navigation Bar -->
8
8
  [![npm version](https://img.shields.io/npm/v/@loonylabs/tts-middleware.svg?style=for-the-badge&logo=npm&logoColor=white)](https://www.npmjs.com/package/@loonylabs/tts-middleware)
@@ -43,6 +43,7 @@
43
43
  - **Google Cloud TTS**: Neural2, WaveNet, Studio voices with EU data residency
44
44
  - **Fish Audio**: S1 model with 13 languages & 64+ emotions (test/admin only)
45
45
  - **Inworld AI**: TTS 1.5 Max/Mini with 15 languages & voice cloning (test/admin only)
46
+ - **Vertex AI TTS**: Gemini Flash/Pro models with 30 voices, 90+ languages & style prompts (test/admin only)
46
47
  - **Ready for:** OpenAI, ElevenLabs, Deepgram (interfaces prepared)
47
48
  - **GDPR/DSGVO Compliance**: Built-in EU region support for Azure and Google Cloud
48
49
  - **SSML Abstraction**: Auto-generates provider-specific SSML from simple JSON options
@@ -137,6 +138,14 @@ const inworld = await ttsService.synthesize({
137
138
  voice: { id: 'Ashley' },
138
139
  providerOptions: { modelId: 'inworld-tts-1.5-max', temperature: 1.1 },
139
140
  });
141
+
142
+ // Vertex AI TTS (test/admin only)
143
+ const vertexAI = await ttsService.synthesize({
144
+ text: 'Have a wonderful day!',
145
+ provider: TTSProvider.VERTEX_AI,
146
+ voice: { id: 'Kore' },
147
+ providerOptions: { model: 'gemini-2.5-flash-preview-tts', stylePrompt: 'Say cheerfully:' },
148
+ });
140
149
  ```
141
150
 
142
151
  </details>
@@ -240,6 +249,10 @@ FISH_AUDIO_API_KEY=your-fish-audio-api-key
240
249
  # Inworld AI (test/admin only – no EU data residency)
241
250
  INWORLD_API_KEY=your-inworld-api-key
242
251
 
252
+ # Vertex AI TTS (test/admin only – no EU data residency)
253
+ # Reuses GOOGLE_APPLICATION_CREDENTIALS and GOOGLE_CLOUD_PROJECT from above
254
+ VERTEX_AI_TTS_REGION=us-central1
255
+
243
256
  # Logging
244
257
  TTS_DEBUG=false
245
258
  LOG_LEVEL=info
@@ -304,6 +317,20 @@ LOG_LEVEL=info
304
317
  | **Pricing** | $10/1M chars (Max), $5/1M chars (Mini) |
305
318
  | **EU Compliance** | No data residency guarantees |
306
319
 
320
+ ### Vertex AI TTS (Test/Admin Only)
321
+
322
+ | Feature | Details |
323
+ |---------|---------|
324
+ | **Models** | `gemini-2.5-flash-preview-tts` (budget, fast), `gemini-2.5-pro-preview-tts` (premium, natural) |
325
+ | **Languages** | 90+ with auto-detection |
326
+ | **Voices** | 30 multilingual: Kore, Puck, Charon, Zephyr, Fenrir, Sulafat, etc. |
327
+ | **Style Control** | Natural language prompts: "Say cheerfully:", "Read in a spooky whisper:" |
328
+ | **Audio** | MP3 (via ffmpeg), WAV (fallback) |
329
+ | **Auth** | Service Account OAuth2 (reuses `GOOGLE_APPLICATION_CREDENTIALS`) |
330
+ | **Region** | `VERTEX_AI_TTS_REGION` env var (default: `us-central1`) |
331
+ | **Pricing** | $0.50-1.00/M input tokens + $10-20/M audio output tokens |
332
+ | **EU Compliance** | Preview models currently `us-central1` only — no EU data residency yet |
333
+
307
334
  ## GDPR / Compliance
308
335
 
309
336
  ### Provider Compliance Overview
@@ -315,9 +342,12 @@ LOG_LEVEL=info
315
342
  | **EdenAI** | Yes | Depends* | Depends* | Depends on underlying provider |
316
343
  | **Fish Audio** | No | No | No | Test/admin only |
317
344
  | **Inworld AI** | No | No | No | Test/admin only |
345
+ | **Vertex AI TTS** | Yes (Vertex DPA) | Partial | No* | Test/admin only |
318
346
 
319
347
  *EdenAI is an aggregator - compliance depends on the underlying provider.
320
348
 
349
+ \*Vertex AI TTS: DPA available, no model training on customer data — but preview models are currently `us-central1` only (no EU data residency until GA with EU region support).
350
+
321
351
  ## API Reference
322
352
 
323
353
  ### TTSService
@@ -503,12 +533,14 @@ graph TD
503
533
  Registry -->|Select| Eden[EdenAIProvider]
504
534
  Registry -->|Select| Fish[FishAudioProvider]
505
535
  Registry -->|Select| Inworld[InworldProvider]
536
+ Registry -->|Select| VertexAI[VertexAITTSProvider]
506
537
 
507
538
  Azure -->|SSML/SDK| AzureAPI[Azure Speech API]
508
539
  GCloud -->|gRPC/SDK| GoogleAPI[Google Cloud TTS API]
509
540
  Eden -->|REST| EdenAPI[EdenAI API]
510
541
  Fish -->|REST| FishAPI[Fish Audio API]
511
542
  Inworld -->|REST| InworldAPI[Inworld AI API]
543
+ VertexAI -->|REST/OAuth2| VertexAPI[Vertex AI API]
512
544
 
513
545
  GoogleAPI -->|EU Endpoint| EU[eu-texttospeech.googleapis.com]
514
546
  EdenAPI -.-> OpenAI[OpenAI TTS]
@@ -518,7 +550,7 @@ graph TD
518
550
  ## Testing
519
551
 
520
552
  ```bash
521
- # Run all tests (555 tests, >90% coverage)
553
+ # Run all tests (600+ tests, >90% coverage)
522
554
  npm test
523
555
 
524
556
  # Unit tests only
@@ -534,6 +566,8 @@ npm run test:coverage
534
566
  npx ts-node scripts/manual-test-edenai.ts
535
567
  npx ts-node scripts/manual-test-google-cloud-tts.ts
536
568
  npx ts-node scripts/manual-test-fish-audio.ts [en] [de]
569
+ npx ts-node scripts/manual-test-inworld.ts [en] [de] [mini]
570
+ npx ts-node scripts/manual-test-vertex-ai.ts [en] [de] [pro] [style]
537
571
 
538
572
  # List available Google Cloud voices
539
573
  npx ts-node scripts/list-google-voices.ts de-DE
@@ -20,9 +20,10 @@
20
20
  */
21
21
  export { TTSService, ttsService } from './tts.service';
22
22
  export { TTSProvider, TTSErrorCode, AudioFormat, } from './types';
23
- export type { AudioOptions, VoiceConfig, TTSSynthesizeRequest, TTSResponse, TTSResponseMetadata, TTSBillingInfo, TTSVoice, TTSVoiceMetadata, AzureProviderOptions, OpenAIProviderOptions, ElevenLabsProviderOptions, GoogleCloudProviderOptions, GoogleCloudTTSProviderOptions, DeepgramProviderOptions, EdenAIProviderOptions, FishAudioProviderOptions, InworldProviderOptions, ProviderOptions, } from './types';
24
- export { isAzureOptions, isOpenAIOptions, isElevenLabsOptions, isGoogleCloudOptions, isGoogleCloudTTSOptions, isDeepgramOptions, isEdenAIOptions, isFishAudioOptions, isInworldOptions, } from './types';
25
- export { BaseTTSProvider, AzureProvider, EdenAIProvider, FishAudioProvider, GoogleCloudTTSProvider, InworldProvider, } from './providers';
23
+ export type { AudioOptions, VoiceConfig, TTSSynthesizeRequest, TTSResponse, TTSResponseMetadata, TTSBillingInfo, TTSVoice, TTSVoiceMetadata, AzureProviderOptions, OpenAIProviderOptions, ElevenLabsProviderOptions, GoogleCloudProviderOptions, GoogleCloudTTSProviderOptions, DeepgramProviderOptions, EdenAIProviderOptions, FishAudioProviderOptions, InworldProviderOptions, VertexAITTSProviderOptions, ProviderOptions, } from './types';
24
+ export { isAzureOptions, isOpenAIOptions, isElevenLabsOptions, isGoogleCloudOptions, isGoogleCloudTTSOptions, isDeepgramOptions, isEdenAIOptions, isFishAudioOptions, isInworldOptions, isVertexAITTSOptions, } from './types';
25
+ export { BaseTTSProvider, AzureProvider, EdenAIProvider, FishAudioProvider, GoogleCloudTTSProvider, InworldProvider, VertexAITTSProvider, } from './providers';
26
+ export type { VertexAITTSConfig } from './providers';
26
27
  export type { GoogleCloudTTSRegion, GoogleCloudTTSConfig, } from './providers';
27
28
  export { TTSError, InvalidConfigError, InvalidVoiceError, QuotaExceededError, ProviderUnavailableError, SynthesisFailedError, NetworkError, } from './providers';
28
29
  export { countCharacters, countCharactersWithoutSSML, validateCharacterCount, countBillableCharacters, estimateAudioDuration, formatCharacterCount, } from './utils';
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/middleware/services/tts/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAGH,OAAO,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,eAAe,CAAC;AAGvD,OAAO,EACL,WAAW,EACX,YAAY,EACZ,WAAW,GACZ,MAAM,SAAS,CAAC;AAEjB,YAAY,EACV,YAAY,EACZ,WAAW,EACX,oBAAoB,EACpB,WAAW,EACX,mBAAmB,EACnB,cAAc,EACd,QAAQ,EACR,gBAAgB,EAChB,oBAAoB,EACpB,qBAAqB,EACrB,yBAAyB,EACzB,0BAA0B,EAC1B,6BAA6B,EAC7B,uBAAuB,EACvB,qBAAqB,EACrB,wBAAwB,EACxB,sBAAsB,EACtB,eAAe,GAChB,MAAM,SAAS,CAAC;AAEjB,OAAO,EACL,cAAc,EACd,eAAe,EACf,mBAAmB,EACnB,oBAAoB,EACpB,uBAAuB,EACvB,iBAAiB,EACjB,eAAe,EACf,kBAAkB,EAClB,gBAAgB,GACjB,MAAM,SAAS,CAAC;AAGjB,OAAO,EACL,eAAe,EACf,aAAa,EACb,cAAc,EACd,iBAAiB,EACjB,sBAAsB,EACtB,eAAe,GAChB,MAAM,aAAa,CAAC;AAErB,YAAY,EACV,oBAAoB,EACpB,oBAAoB,GACrB,MAAM,aAAa,CAAC;AAGrB,OAAO,EACL,QAAQ,EACR,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,wBAAwB,EACxB,oBAAoB,EACpB,YAAY,GACb,MAAM,aAAa,CAAC;AAGrB,OAAO,EACL,eAAe,EACf,0BAA0B,EAC1B,sBAAsB,EACtB,uBAAuB,EACvB,qBAAqB,EACrB,oBAAoB,GACrB,MAAM,SAAS,CAAC;AAGjB,OAAO,EACL,SAAS,EACT,SAAS,EACT,WAAW,EACX,WAAW,EACX,WAAW,EACX,YAAY,GACb,MAAM,SAAS,CAAC;AAEjB,YAAY,EAAE,SAAS,EAAE,QAAQ,EAAE,MAAM,SAAS,CAAC;AAGnD,OAAO,EACL,gBAAgB,EAChB,gBAAgB,EAChB,oBAAoB,GACrB,MAAM,SAAS,CAAC;AAEjB,YAAY,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/middleware/services/tts/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAGH,OAAO,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,eAAe,CAAC;AAGvD,OAAO,EACL,WAAW,EACX,YAAY,EACZ,WAAW,GACZ,MAAM,SAAS,CAAC;AAEjB,YAAY,EACV,YAAY,EACZ,WAAW,EACX,oBAAoB,EACpB,WAAW,EACX,mBAAmB,EACnB,cAAc,EACd,QAAQ,EACR,gBAAgB,EAChB,oBAAoB,EACpB,qBAAqB,EACrB,yBAAyB,EACzB,0BAA0B,EAC1B,6BAA6B,EAC7B,uBAAuB,EACvB,qBAAqB,EACrB,wBAAwB,EACxB,sBAAsB,EACtB,0BAA0B,EAC1B,eAAe,GAChB,MAAM,SAAS,CAAC;AAEjB,OAAO,EACL,cAAc,EACd,eAAe,EACf,mBAAmB,EACnB,oBAAoB,EACpB,uBAAuB,EACvB,iBAAiB,EACjB,eAAe,EACf,kBAAkB,EAClB,gBAAgB,EAChB,oBAAoB,GACrB,MAAM,SAAS,CAAC;AAGjB,OAAO,EACL,eAAe,EACf,aAAa,EACb,cAAc,EACd,iBAAiB,EACjB,sBAAsB,EACtB,eAAe,EACf,mBAAmB,GACpB,MAAM,aAAa,CAAC;AAErB,YAAY,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAC;AAErD,YAAY,EACV,oBAAoB,EACpB,oBAAoB,GACrB,MAAM,aAAa,CAAC;AAGrB,OAAO,EACL,QAAQ,EACR,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,wBAAwB,EACxB,oBAAoB,EACpB,YAAY,GACb,MAAM,aAAa,CAAC;AAGrB,OAAO,EACL,eAAe,EACf,0BAA0B,EAC1B,sBAAsB,EACtB,uBAAuB,EACvB,qBAAqB,EACrB,oBAAoB,GACrB,MAAM,SAAS,CAAC;AAGjB,OAAO,EACL,SAAS,EACT,SAAS,EACT,WAAW,EACX,WAAW,EACX,WAAW,EACX,YAAY,GACb,MAAM,SAAS,CAAC;AAEjB,YAAY,EAAE,SAAS,EAAE,QAAQ,EAAE,MAAM,SAAS,CAAC;AAGnD,OAAO,EACL,gBAAgB,EAChB,gBAAgB,EAChB,oBAAoB,GACrB,MAAM,SAAS,CAAC;AAEjB,YAAY,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC"}
@@ -20,7 +20,7 @@
20
20
  * @module @loonylabs/tts-middleware
21
21
  */
22
22
  Object.defineProperty(exports, "__esModule", { value: true });
23
- exports.DEFAULT_RETRY_CONFIG = exports.isRetryableError = exports.executeWithRetry = exports.silentLogger = exports.getLogLevel = exports.setLogLevel = exports.resetLogger = exports.getLogger = exports.setLogger = exports.formatCharacterCount = exports.estimateAudioDuration = exports.countBillableCharacters = exports.validateCharacterCount = exports.countCharactersWithoutSSML = exports.countCharacters = exports.NetworkError = exports.SynthesisFailedError = exports.ProviderUnavailableError = exports.QuotaExceededError = exports.InvalidVoiceError = exports.InvalidConfigError = exports.TTSError = exports.InworldProvider = exports.GoogleCloudTTSProvider = exports.FishAudioProvider = exports.EdenAIProvider = exports.AzureProvider = exports.BaseTTSProvider = exports.isInworldOptions = exports.isFishAudioOptions = exports.isEdenAIOptions = exports.isDeepgramOptions = exports.isGoogleCloudTTSOptions = exports.isGoogleCloudOptions = exports.isElevenLabsOptions = exports.isOpenAIOptions = exports.isAzureOptions = exports.TTSErrorCode = exports.TTSProvider = exports.ttsService = exports.TTSService = void 0;
23
+ exports.DEFAULT_RETRY_CONFIG = exports.isRetryableError = exports.executeWithRetry = exports.silentLogger = exports.getLogLevel = exports.setLogLevel = exports.resetLogger = exports.getLogger = exports.setLogger = exports.formatCharacterCount = exports.estimateAudioDuration = exports.countBillableCharacters = exports.validateCharacterCount = exports.countCharactersWithoutSSML = exports.countCharacters = exports.NetworkError = exports.SynthesisFailedError = exports.ProviderUnavailableError = exports.QuotaExceededError = exports.InvalidVoiceError = exports.InvalidConfigError = exports.TTSError = exports.VertexAITTSProvider = exports.InworldProvider = exports.GoogleCloudTTSProvider = exports.FishAudioProvider = exports.EdenAIProvider = exports.AzureProvider = exports.BaseTTSProvider = exports.isVertexAITTSOptions = exports.isInworldOptions = exports.isFishAudioOptions = exports.isEdenAIOptions = exports.isDeepgramOptions = exports.isGoogleCloudTTSOptions = exports.isGoogleCloudOptions = exports.isElevenLabsOptions = exports.isOpenAIOptions = exports.isAzureOptions = exports.TTSErrorCode = exports.TTSProvider = exports.ttsService = exports.TTSService = void 0;
24
24
  // ===== Main Service =====
25
25
  var tts_service_1 = require("./tts.service");
26
26
  Object.defineProperty(exports, "TTSService", { enumerable: true, get: function () { return tts_service_1.TTSService; } });
@@ -39,6 +39,7 @@ Object.defineProperty(exports, "isDeepgramOptions", { enumerable: true, get: fun
39
39
  Object.defineProperty(exports, "isEdenAIOptions", { enumerable: true, get: function () { return types_2.isEdenAIOptions; } });
40
40
  Object.defineProperty(exports, "isFishAudioOptions", { enumerable: true, get: function () { return types_2.isFishAudioOptions; } });
41
41
  Object.defineProperty(exports, "isInworldOptions", { enumerable: true, get: function () { return types_2.isInworldOptions; } });
42
+ Object.defineProperty(exports, "isVertexAITTSOptions", { enumerable: true, get: function () { return types_2.isVertexAITTSOptions; } });
42
43
  // ===== Providers =====
43
44
  var providers_1 = require("./providers");
44
45
  Object.defineProperty(exports, "BaseTTSProvider", { enumerable: true, get: function () { return providers_1.BaseTTSProvider; } });
@@ -47,6 +48,7 @@ Object.defineProperty(exports, "EdenAIProvider", { enumerable: true, get: functi
47
48
  Object.defineProperty(exports, "FishAudioProvider", { enumerable: true, get: function () { return providers_1.FishAudioProvider; } });
48
49
  Object.defineProperty(exports, "GoogleCloudTTSProvider", { enumerable: true, get: function () { return providers_1.GoogleCloudTTSProvider; } });
49
50
  Object.defineProperty(exports, "InworldProvider", { enumerable: true, get: function () { return providers_1.InworldProvider; } });
51
+ Object.defineProperty(exports, "VertexAITTSProvider", { enumerable: true, get: function () { return providers_1.VertexAITTSProvider; } });
50
52
  // ===== Errors =====
51
53
  var providers_2 = require("./providers");
52
54
  Object.defineProperty(exports, "TTSError", { enumerable: true, get: function () { return providers_2.TTSError; } });
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/middleware/services/tts/index.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;;;;;GAmBG;;;AAEH,2BAA2B;AAC3B,6CAAuD;AAA9C,yGAAA,UAAU,OAAA;AAAE,yGAAA,UAAU,OAAA;AAE/B,oBAAoB;AACpB,iCAIiB;AAHf,oGAAA,WAAW,OAAA;AACX,qGAAA,YAAY,OAAA;AAyBd,iCAUiB;AATf,uGAAA,cAAc,OAAA;AACd,wGAAA,eAAe,OAAA;AACf,4GAAA,mBAAmB,OAAA;AACnB,6GAAA,oBAAoB,OAAA;AACpB,gHAAA,uBAAuB,OAAA;AACvB,0GAAA,iBAAiB,OAAA;AACjB,wGAAA,eAAe,OAAA;AACf,2GAAA,kBAAkB,OAAA;AAClB,yGAAA,gBAAgB,OAAA;AAGlB,wBAAwB;AACxB,yCAOqB;AANnB,4GAAA,eAAe,OAAA;AACf,0GAAA,aAAa,OAAA;AACb,2GAAA,cAAc,OAAA;AACd,8GAAA,iBAAiB,OAAA;AACjB,mHAAA,sBAAsB,OAAA;AACtB,4GAAA,eAAe,OAAA;AAQjB,qBAAqB;AACrB,yCAQqB;AAPnB,qGAAA,QAAQ,OAAA;AACR,+GAAA,kBAAkB,OAAA;AAClB,8GAAA,iBAAiB,OAAA;AACjB,+GAAA,kBAAkB,OAAA;AAClB,qHAAA,wBAAwB,OAAA;AACxB,iHAAA,oBAAoB,OAAA;AACpB,yGAAA,YAAY,OAAA;AAGd,wBAAwB;AACxB,iCAOiB;AANf,wGAAA,eAAe,OAAA;AACf,mHAAA,0BAA0B,OAAA;AAC1B,+GAAA,sBAAsB,OAAA;AACtB,gHAAA,uBAAuB,OAAA;AACvB,8GAAA,qBAAqB,OAAA;AACrB,6GAAA,oBAAoB,OAAA;AAGtB,qBAAqB;AACrB,iCAOiB;AANf,kGAAA,SAAS,OAAA;AACT,kGAAA,SAAS,OAAA;AACT,oGAAA,WAAW,OAAA;AACX,oGAAA,WAAW,OAAA;AACX,oGAAA,WAAW,OAAA;AACX,qGAAA,YAAY,OAAA;AAKd,oBAAoB;AACpB,iCAIiB;AAHf,yGAAA,gBAAgB,OAAA;AAChB,yGAAA,gBAAgB,OAAA;AAChB,6GAAA,oBAAoB,OAAA"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/middleware/services/tts/index.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;;;;;GAmBG;;;AAEH,2BAA2B;AAC3B,6CAAuD;AAA9C,yGAAA,UAAU,OAAA;AAAE,yGAAA,UAAU,OAAA;AAE/B,oBAAoB;AACpB,iCAIiB;AAHf,oGAAA,WAAW,OAAA;AACX,qGAAA,YAAY,OAAA;AA0Bd,iCAWiB;AAVf,uGAAA,cAAc,OAAA;AACd,wGAAA,eAAe,OAAA;AACf,4GAAA,mBAAmB,OAAA;AACnB,6GAAA,oBAAoB,OAAA;AACpB,gHAAA,uBAAuB,OAAA;AACvB,0GAAA,iBAAiB,OAAA;AACjB,wGAAA,eAAe,OAAA;AACf,2GAAA,kBAAkB,OAAA;AAClB,yGAAA,gBAAgB,OAAA;AAChB,6GAAA,oBAAoB,OAAA;AAGtB,wBAAwB;AACxB,yCAQqB;AAPnB,4GAAA,eAAe,OAAA;AACf,0GAAA,aAAa,OAAA;AACb,2GAAA,cAAc,OAAA;AACd,8GAAA,iBAAiB,OAAA;AACjB,mHAAA,sBAAsB,OAAA;AACtB,4GAAA,eAAe,OAAA;AACf,gHAAA,mBAAmB,OAAA;AAUrB,qBAAqB;AACrB,yCAQqB;AAPnB,qGAAA,QAAQ,OAAA;AACR,+GAAA,kBAAkB,OAAA;AAClB,8GAAA,iBAAiB,OAAA;AACjB,+GAAA,kBAAkB,OAAA;AAClB,qHAAA,wBAAwB,OAAA;AACxB,iHAAA,oBAAoB,OAAA;AACpB,yGAAA,YAAY,OAAA;AAGd,wBAAwB;AACxB,iCAOiB;AANf,wGAAA,eAAe,OAAA;AACf,mHAAA,0BAA0B,OAAA;AAC1B,+GAAA,sBAAsB,OAAA;AACtB,gHAAA,uBAAuB,OAAA;AACvB,8GAAA,qBAAqB,OAAA;AACrB,6GAAA,oBAAoB,OAAA;AAGtB,qBAAqB;AACrB,iCAOiB;AANf,kGAAA,SAAS,OAAA;AACT,kGAAA,SAAS,OAAA;AACT,oGAAA,WAAW,OAAA;AACX,oGAAA,WAAW,OAAA;AACX,oGAAA,WAAW,OAAA;AACX,qGAAA,YAAY,OAAA;AAKd,oBAAoB;AACpB,iCAIiB;AAHf,yGAAA,gBAAgB,OAAA;AAChB,yGAAA,gBAAgB,OAAA;AAChB,6GAAA,oBAAoB,OAAA"}
@@ -0,0 +1,142 @@
1
+ /**
2
+ * Gemini TTS Provider
3
+ *
4
+ * @description Provider for Google Gemini TTS via Vertex AI, using the generateContent
5
+ * endpoint with responseModalities: ['AUDIO']. Authenticates via Service Account
6
+ * (same as Google Cloud TTS — reuses GOOGLE_APPLICATION_CREDENTIALS).
7
+ *
8
+ * Supports 30 multilingual voices with auto-detect language and natural language
9
+ * style control. Output is raw PCM (24kHz, 16-bit, mono) which is converted to
10
+ * MP3 via ffmpeg or WAV as fallback.
11
+ *
12
+ * Test/Admin only -- no EU data residency guarantees.
13
+ *
14
+ * @see https://cloud.google.com/vertex-ai/generative-ai/docs/text-to-speech
15
+ */
16
+ import type { TTSSynthesizeRequest, TTSResponse } from '../types';
17
+ import { BaseTTSProvider } from './base-tts-provider';
18
+ /**
19
+ * Gemini TTS configuration (Vertex AI)
20
+ */
21
+ export interface GeminiConfig {
22
+ /**
23
+ * Path to Service Account JSON file
24
+ * @env GOOGLE_APPLICATION_CREDENTIALS
25
+ */
26
+ keyFilename?: string;
27
+ /**
28
+ * Google Cloud Project ID
29
+ * @env GOOGLE_CLOUD_PROJECT
30
+ */
31
+ projectId?: string;
32
+ /**
33
+ * Vertex AI region
34
+ * @env GEMINI_REGION
35
+ * @default 'us-central1'
36
+ */
37
+ region?: string;
38
+ }
39
+ /**
40
+ * Gemini TTS provider implementation
41
+ *
42
+ * @description Provides TTS synthesis using Google's Gemini generateContent API
43
+ * via Vertex AI. Authenticates with Service Account OAuth2 (same credentials as
44
+ * Google Cloud TTS). Gemini outputs raw PCM which is converted to MP3 (via ffmpeg)
45
+ * or WAV (pure Node.js fallback).
46
+ *
47
+ * Billing: Token-based ($0.50-1.00/M input + $10-20/M audio output tokens).
48
+ * For billing compatibility, reports character count like all other providers.
49
+ *
50
+ * @example
51
+ * ```typescript
52
+ * const provider = new GeminiProvider();
53
+ * const response = await provider.synthesize(
54
+ * "Hello World",
55
+ * "Kore",
56
+ * {
57
+ * text: "Hello World",
58
+ * voice: { id: "Kore" },
59
+ * audio: { format: "mp3" },
60
+ * providerOptions: {
61
+ * model: "gemini-2.5-flash-preview-tts",
62
+ * stylePrompt: "Say cheerfully:"
63
+ * }
64
+ * }
65
+ * );
66
+ * ```
67
+ */
68
+ export declare class GeminiProvider extends BaseTTSProvider {
69
+ private config;
70
+ private authClient;
71
+ /**
72
+ * Creates a new Gemini TTS provider
73
+ *
74
+ * @param config - Optional configuration (uses env vars if not provided)
75
+ * @throws {InvalidConfigError} If credentials or the Google Cloud project ID are missing
76
+ */
77
+ constructor(config?: Partial<GeminiConfig>);
78
+ /**
79
+ * Validate Gemini configuration
80
+ *
81
+ * @private
82
+ * @throws {InvalidConfigError} If configuration is invalid
83
+ */
84
+ private validateGeminiConfig;
85
+ /**
86
+ * Get an authenticated access token via Service Account
87
+ *
88
+ * @private
89
+ * @returns OAuth2 access token
90
+ */
91
+ private getAccessToken;
92
+ /**
93
+ * Synthesize text to speech using Gemini TTS
94
+ *
95
+ * @param text - The input text to synthesize
96
+ * @param voiceId - The voice name (e.g. "Kore", "Puck", "Charon")
97
+ * @param request - The full synthesis request with options
98
+ * @returns Promise resolving to the synthesis response
99
+ */
100
+ synthesize(text: string, voiceId: string, request: TTSSynthesizeRequest): Promise<TTSResponse>;
101
+ /**
102
+ * Build Gemini generateContent request payload
103
+ *
104
+ * @private
105
+ */
106
+ private buildRequest;
107
+ /**
108
+ * Call Gemini generateContent API via Vertex AI
109
+ *
110
+ * @private
111
+ * @param requestBody - The request payload
112
+ * @param model - The Gemini model to use
113
+ * @returns Promise resolving to raw PCM audio buffer
114
+ */
115
+ private callAPI;
116
+ /**
117
+ * Convert raw PCM audio to the requested format
118
+ *
119
+ * @private
120
+ * @param pcmBuffer - Raw PCM buffer (24kHz, 16-bit, mono, little-endian)
121
+ * @param requestedFormat - The desired output format ('mp3', 'wav', etc.)
122
+ * @returns The converted audio buffer and actual format used
123
+ */
124
+ private convertPcmAudio;
125
+ /**
126
+ * Convert raw PCM to MP3 using ffmpeg via child_process
127
+ *
128
+ * @private
129
+ * @param pcmBuffer - Raw PCM buffer (24kHz, 16-bit, mono, little-endian)
130
+ * @returns Promise resolving to MP3 buffer
131
+ */
132
+ private pcmToMp3;
133
+ /**
134
+ * Convert raw PCM to WAV by prepending a 44-byte WAV header
135
+ *
136
+ * @private
137
+ * @param pcmBuffer - Raw PCM buffer (24kHz, 16-bit, mono, little-endian)
138
+ * @returns WAV buffer
139
+ */
140
+ private pcmToWav;
141
+ }
142
+ //# sourceMappingURL=gemini-provider.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"gemini-provider.d.ts","sourceRoot":"","sources":["../../../../../src/middleware/services/tts/providers/gemini-provider.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAGH,OAAO,KAAK,EAAE,oBAAoB,EAAE,WAAW,EAAE,MAAM,UAAU,CAAC;AAGlE,OAAO,EACL,eAAe,EAEhB,MAAM,qBAAqB,CAAC;AAG7B;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;;OAGG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB;;;;OAIG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAMD;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AACH,qBAAa,cAAe,SAAQ,eAAe;IACjD,OAAO,CAAC,MAAM,CAAe;IAC7B,OAAO,CAAC,UAAU,CAA6E;IAE/F;;;;;OAKG;gBACS,MAAM,CAAC,EAAE,OAAO,CAAC,YAAY,CAAC;IAkB1C;;;;;OAKG;IACH,OAAO,CAAC,oBAAoB;IAgB5B;;;;;OAKG;YACW,cAAc;IAqB5B;;;;;;;OAOG;IACG,UAAU,CACd,IAAI,EAAE,MAAM,EACZ,OAAO,EAAE,MAAM,EACf,OAAO,EAAE,oBAAoB,GAC5B,OAAO,CAAC,WAAW,CAAC;IAsDvB;;;;OAIG;IACH,OAAO,CAAC,YAAY;IA6BpB;;;;;;;OAOG;YACW,OAAO;IA6CrB;;;;;;;OAOG;YACW,eAAe;IA0B7B;;;;;;OAMG;IACH,OAAO,CAAC,QAAQ;IAkChB;;;;;;OAMG;IACH,OAAO,CAAC,QAAQ;CAwBjB"}
@@ -0,0 +1,358 @@
1
+ "use strict";
2
+ /**
3
+ * Gemini TTS Provider
4
+ *
5
+ * @description Provider for Google Gemini TTS via Vertex AI, using the generateContent
6
+ * endpoint with responseModalities: ['AUDIO']. Authenticates via Service Account
7
+ * (same as Google Cloud TTS — reuses GOOGLE_APPLICATION_CREDENTIALS).
8
+ *
9
+ * Supports 30 multilingual voices with auto-detect language and natural language
10
+ * style control. Output is raw PCM (24kHz, 16-bit, mono) which is converted to
11
+ * MP3 via ffmpeg or WAV as fallback.
12
+ *
13
+ * Test/Admin only -- no EU data residency guarantees.
14
+ *
15
+ * @see https://cloud.google.com/vertex-ai/generative-ai/docs/text-to-speech
16
+ */
17
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
18
+ if (k2 === undefined) k2 = k;
19
+ var desc = Object.getOwnPropertyDescriptor(m, k);
20
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
21
+ desc = { enumerable: true, get: function() { return m[k]; } };
22
+ }
23
+ Object.defineProperty(o, k2, desc);
24
+ }) : (function(o, m, k, k2) {
25
+ if (k2 === undefined) k2 = k;
26
+ o[k2] = m[k];
27
+ }));
28
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
29
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
30
+ }) : function(o, v) {
31
+ o["default"] = v;
32
+ });
33
+ var __importStar = (this && this.__importStar) || (function () {
34
+ var ownKeys = function(o) {
35
+ ownKeys = Object.getOwnPropertyNames || function (o) {
36
+ var ar = [];
37
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
38
+ return ar;
39
+ };
40
+ return ownKeys(o);
41
+ };
42
+ return function (mod) {
43
+ if (mod && mod.__esModule) return mod;
44
+ var result = {};
45
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
46
+ __setModuleDefault(result, mod);
47
+ return result;
48
+ };
49
+ })();
50
+ Object.defineProperty(exports, "__esModule", { value: true });
51
+ exports.GeminiProvider = void 0;
52
+ const child_process_1 = require("child_process");
53
+ const types_1 = require("../types");
54
+ const mp3_duration_utils_1 = require("../utils/mp3-duration.utils");
55
+ const base_tts_provider_1 = require("./base-tts-provider");
56
+ const DEFAULT_MODEL = 'gemini-2.5-flash-preview-tts';
57
+ const DEFAULT_SAMPLE_RATE = 24000;
58
+ const DEFAULT_REGION = 'us-central1';
59
+ /**
60
+ * Gemini TTS provider implementation
61
+ *
62
+ * @description Provides TTS synthesis using Google's Gemini generateContent API
63
+ * via Vertex AI. Authenticates with Service Account OAuth2 (same credentials as
64
+ * Google Cloud TTS). Gemini outputs raw PCM which is converted to MP3 (via ffmpeg)
65
+ * or WAV (pure Node.js fallback).
66
+ *
67
+ * Billing: Token-based ($0.50-1.00/M input + $10-20/M audio output tokens).
68
+ * For billing compatibility, reports character count like all other providers.
69
+ *
70
+ * @example
71
+ * ```typescript
72
+ * const provider = new GeminiProvider();
73
+ * const response = await provider.synthesize(
74
+ * "Hello World",
75
+ * "Kore",
76
+ * {
77
+ * text: "Hello World",
78
+ * voice: { id: "Kore" },
79
+ * audio: { format: "mp3" },
80
+ * providerOptions: {
81
+ * model: "gemini-2.5-flash-preview-tts",
82
+ * stylePrompt: "Say cheerfully:"
83
+ * }
84
+ * }
85
+ * );
86
+ * ```
87
+ */
88
+ class GeminiProvider extends base_tts_provider_1.BaseTTSProvider {
89
+ /**
90
+ * Creates a new Gemini TTS provider
91
+ *
92
+ * @param config - Optional configuration (uses env vars if not provided)
93
+ * @throws {InvalidConfigError} If credentials or the Google Cloud project ID are missing
94
+ */
95
+ constructor(config) {
96
+ super(types_1.TTSProvider.GEMINI);
97
+ this.authClient = null;
98
+ this.config = {
99
+ keyFilename: config?.keyFilename || process.env.GOOGLE_APPLICATION_CREDENTIALS,
100
+ projectId: config?.projectId || process.env.GOOGLE_CLOUD_PROJECT,
101
+ region: config?.region || process.env.GEMINI_REGION || DEFAULT_REGION,
102
+ };
103
+ this.validateGeminiConfig();
104
+ this.log('info', 'Gemini TTS provider initialized', {
105
+ hasCredentials: !!this.config.keyFilename,
106
+ projectId: this.config.projectId ? '***' : undefined,
107
+ region: this.config.region,
108
+ });
109
+ }
110
+ /**
111
+ * Validate Gemini configuration
112
+ *
113
+ * @private
114
+ * @throws {InvalidConfigError} If configuration is invalid
115
+ */
116
+ validateGeminiConfig() {
117
+ if (!this.config.keyFilename) {
118
+ throw new base_tts_provider_1.InvalidConfigError(this.providerName, 'Google Cloud credentials are required for Gemini TTS (GOOGLE_APPLICATION_CREDENTIALS)');
119
+ }
120
+ if (!this.config.projectId) {
121
+ throw new base_tts_provider_1.InvalidConfigError(this.providerName, 'Google Cloud Project ID is required for Gemini TTS (GOOGLE_CLOUD_PROJECT)');
122
+ }
123
+ }
124
+ /**
125
+ * Get an authenticated access token via Service Account
126
+ *
127
+ * @private
128
+ * @returns OAuth2 access token
129
+ */
130
+ async getAccessToken() {
131
+ if (!this.authClient) {
132
+ const { GoogleAuth } = await Promise.resolve().then(() => __importStar(require('google-auth-library')));
133
+ const auth = new GoogleAuth({
134
+ keyFilename: this.config.keyFilename,
135
+ scopes: ['https://www.googleapis.com/auth/cloud-platform'],
136
+ });
137
+ this.authClient = await auth.getClient();
138
+ }
139
+ const tokenResponse = await this.authClient.getAccessToken();
140
+ if (!tokenResponse.token) {
141
+ throw new base_tts_provider_1.InvalidConfigError(this.providerName, 'Failed to obtain access token from Service Account');
142
+ }
143
+ return tokenResponse.token;
144
+ }
145
+ /**
146
+ * Synthesize text to speech using Gemini TTS
147
+ *
148
+ * @param text - The input text to synthesize
149
+ * @param voiceId - The voice name (e.g. "Kore", "Puck", "Charon")
150
+ * @param request - The full synthesis request with options
151
+ * @returns Promise resolving to the synthesis response
152
+ */
153
+ async synthesize(text, voiceId, request) {
154
+ this.validateConfig(request);
155
+ const startTime = Date.now();
156
+ const options = (request.providerOptions || {});
157
+ const model = options.model || DEFAULT_MODEL;
158
+ const requestedFormat = request.audio?.format || 'mp3';
159
+ const requestBody = this.buildRequest(text, voiceId, options);
160
+ this.log('debug', 'Synthesizing with Gemini TTS', {
161
+ voiceId,
162
+ model,
163
+ textLength: text.length,
164
+ requestedFormat,
165
+ });
166
+ try {
167
+ const pcmBuffer = await this.callAPI(requestBody, model);
168
+ const { audioBuffer, audioFormat } = await this.convertPcmAudio(pcmBuffer, requestedFormat);
169
+ const duration = Date.now() - startTime;
170
+ this.log('info', 'Synthesis successful', {
171
+ voiceId,
172
+ characters: text.length,
173
+ duration,
174
+ audioSize: audioBuffer.length,
175
+ audioFormat,
176
+ });
177
+ return {
178
+ audio: audioBuffer,
179
+ metadata: {
180
+ provider: this.providerName,
181
+ voice: voiceId,
182
+ duration,
183
+ audioDuration: audioFormat === 'mp3' ? (0, mp3_duration_utils_1.getMp3Duration)(audioBuffer) : undefined,
184
+ audioFormat,
185
+ sampleRate: DEFAULT_SAMPLE_RATE,
186
+ },
187
+ billing: {
188
+ characters: this.countCharacters(text),
189
+ },
190
+ };
191
+ }
192
+ catch (error) {
193
+ this.log('error', 'Synthesis failed', {
194
+ voiceId,
195
+ error: error.message,
196
+ });
197
+ throw this.handleError(error, 'during Gemini TTS API call');
198
+ }
199
+ }
200
+ /**
201
+ * Build Gemini generateContent request payload
202
+ *
203
+ * @private
204
+ */
205
+ buildRequest(text, voiceId, options) {
206
+ const synthesisText = options.stylePrompt
207
+ ? `${options.stylePrompt} ${text}`
208
+ : text;
209
+ return {
210
+ contents: [
211
+ {
212
+ role: 'user',
213
+ parts: [{ text: synthesisText }],
214
+ },
215
+ ],
216
+ generationConfig: {
217
+ responseModalities: ['AUDIO'],
218
+ speechConfig: {
219
+ voiceConfig: {
220
+ prebuiltVoiceConfig: {
221
+ voiceName: voiceId,
222
+ },
223
+ },
224
+ },
225
+ },
226
+ };
227
+ }
228
+ /**
229
+ * Call Gemini generateContent API via Vertex AI
230
+ *
231
+ * @private
232
+ * @param requestBody - The request payload
233
+ * @param model - The Gemini model to use
234
+ * @returns Promise resolving to raw PCM audio buffer
235
+ */
236
+ async callAPI(requestBody, model) {
237
+ const accessToken = await this.getAccessToken();
238
+ const region = this.config.region || DEFAULT_REGION;
239
+ const projectId = this.config.projectId;
240
+ const url = `https://${region}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${region}/publishers/google/models/${model}:generateContent`;
241
+ const response = await fetch(url, {
242
+ method: 'POST',
243
+ headers: {
244
+ 'Authorization': `Bearer ${accessToken}`,
245
+ 'Content-Type': 'application/json',
246
+ },
247
+ body: JSON.stringify(requestBody),
248
+ });
249
+ if (!response.ok) {
250
+ const errorText = await response.text();
251
+ throw new Error(`Gemini API error (${response.status}): ${errorText}`);
252
+ }
253
+ const responseJson = await response.json();
254
+ const inlineData = responseJson.candidates?.[0]?.content?.parts?.[0]?.inlineData;
255
+ if (!inlineData?.data) {
256
+ throw new Error('Gemini API returned no audio data');
257
+ }
258
+ return Buffer.from(inlineData.data, 'base64');
259
+ }
260
+ /**
261
+ * Convert raw PCM audio to the requested format
262
+ *
263
+ * @private
264
+ * @param pcmBuffer - Raw PCM buffer (24kHz, 16-bit, mono, little-endian)
265
+ * @param requestedFormat - The desired output format ('mp3', 'wav', etc.)
266
+ * @returns The converted audio buffer and actual format used
267
+ */
268
+ async convertPcmAudio(pcmBuffer, requestedFormat) {
269
+ if (requestedFormat === 'wav') {
270
+ return {
271
+ audioBuffer: this.pcmToWav(pcmBuffer),
272
+ audioFormat: 'wav',
273
+ };
274
+ }
275
+ // For mp3 (and any other format), try ffmpeg first, fall back to WAV
276
+ try {
277
+ const mp3Buffer = await this.pcmToMp3(pcmBuffer);
278
+ return { audioBuffer: mp3Buffer, audioFormat: 'mp3' };
279
+ }
280
+ catch (error) {
281
+ this.log('warn', 'ffmpeg not available, falling back to WAV output', {
282
+ error: error.message,
283
+ });
284
+ return {
285
+ audioBuffer: this.pcmToWav(pcmBuffer),
286
+ audioFormat: 'wav',
287
+ };
288
+ }
289
+ }
290
+ /**
291
+ * Convert raw PCM to MP3 using ffmpeg via child_process
292
+ *
293
+ * @private
294
+ * @param pcmBuffer - Raw PCM buffer (24kHz, 16-bit, mono, little-endian)
295
+ * @returns Promise resolving to MP3 buffer
296
+ */
297
+ pcmToMp3(pcmBuffer) {
298
+ return new Promise((resolve, reject) => {
299
+ const ffmpeg = (0, child_process_1.spawn)('ffmpeg', [
300
+ '-f', 's16le',
301
+ '-ar', String(DEFAULT_SAMPLE_RATE),
302
+ '-ac', '1',
303
+ '-i', 'pipe:0',
304
+ '-codec:a', 'libmp3lame',
305
+ '-b:a', '128k',
306
+ '-f', 'mp3',
307
+ 'pipe:1',
308
+ ]);
309
+ const chunks = [];
310
+ ffmpeg.stdout.on('data', (chunk) => chunks.push(chunk));
311
+ ffmpeg.stderr.on('data', () => { });
312
+ ffmpeg.on('error', (err) => {
313
+ reject(new Error(`ffmpeg spawn failed: ${err.message}`));
314
+ });
315
+ ffmpeg.on('close', (code) => {
316
+ if (code === 0) {
317
+ resolve(Buffer.concat(chunks));
318
+ }
319
+ else {
320
+ reject(new Error(`ffmpeg exited with code ${code}`));
321
+ }
322
+ });
323
+ ffmpeg.stdin.write(pcmBuffer);
324
+ ffmpeg.stdin.end();
325
+ });
326
+ }
327
+ /**
328
+ * Convert raw PCM to WAV by prepending a 44-byte WAV header
329
+ *
330
+ * @private
331
+ * @param pcmBuffer - Raw PCM buffer (24kHz, 16-bit, mono, little-endian)
332
+ * @returns WAV buffer
333
+ */
334
+ pcmToWav(pcmBuffer) {
335
+ const channels = 1;
336
+ const bitsPerSample = 16;
337
+ const byteRate = DEFAULT_SAMPLE_RATE * channels * (bitsPerSample / 8);
338
+ const blockAlign = channels * (bitsPerSample / 8);
339
+ const dataLength = pcmBuffer.length;
340
+ const header = Buffer.alloc(44);
341
+ header.write('RIFF', 0);
342
+ header.writeUInt32LE(36 + dataLength, 4);
343
+ header.write('WAVE', 8);
344
+ header.write('fmt ', 12);
345
+ header.writeUInt32LE(16, 16); // PCM chunk size
346
+ header.writeUInt16LE(1, 20); // PCM format
347
+ header.writeUInt16LE(channels, 22);
348
+ header.writeUInt32LE(DEFAULT_SAMPLE_RATE, 24);
349
+ header.writeUInt32LE(byteRate, 28);
350
+ header.writeUInt16LE(blockAlign, 32);
351
+ header.writeUInt16LE(bitsPerSample, 34);
352
+ header.write('data', 36);
353
+ header.writeUInt32LE(dataLength, 40);
354
+ return Buffer.concat([header, pcmBuffer]);
355
+ }
356
+ }
357
+ exports.GeminiProvider = GeminiProvider;
358
+ //# sourceMappingURL=gemini-provider.js.map