dvgateway-adapters 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/dist/index.d.ts +8 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +5 -2
- package/dist/index.js.map +1 -1
- package/dist/stt/google-chirp3.d.ts +76 -0
- package/dist/stt/google-chirp3.d.ts.map +1 -0
- package/dist/stt/google-chirp3.js +191 -0
- package/dist/stt/google-chirp3.js.map +1 -0
- package/dist/stt/index.d.ts +2 -0
- package/dist/stt/index.d.ts.map +1 -1
- package/dist/stt/index.js +1 -0
- package/dist/stt/index.js.map +1 -1
- package/dist/tts/cosyvoice-tts.d.ts +69 -0
- package/dist/tts/cosyvoice-tts.d.ts.map +1 -0
- package/dist/tts/cosyvoice-tts.js +163 -0
- package/dist/tts/cosyvoice-tts.js.map +1 -0
- package/dist/tts/gemini-tts.d.ts +86 -0
- package/dist/tts/gemini-tts.d.ts.map +1 -0
- package/dist/tts/gemini-tts.js +186 -0
- package/dist/tts/gemini-tts.js.map +1 -0
- package/dist/tts/index.d.ts +4 -0
- package/dist/tts/index.d.ts.map +1 -1
- package/dist/tts/index.js +2 -0
- package/dist/tts/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -13,8 +13,11 @@ npm install dvgateway-sdk dvgateway-adapters
|
|
|
13
13
|
| 분류 | 어댑터 | 서비스 |
|
|
14
14
|
|------|--------|--------|
|
|
15
15
|
| STT | `DeepgramAdapter` | Deepgram Nova-3 |
|
|
16
|
+
| STT | `GoogleChirp3Adapter` | Google Cloud STT Chirp 3 |
|
|
16
17
|
| TTS | `ElevenLabsAdapter` | ElevenLabs Flash v2.5 |
|
|
17
18
|
| TTS | `OpenAITtsAdapter` | OpenAI gpt-4o-mini-tts |
|
|
19
|
+
| TTS | `GeminiTtsAdapter` | Google Gemini TTS |
|
|
20
|
+
| TTS | `CosyVoiceAdapter` | Alibaba CosyVoice |
|
|
18
21
|
| TTS | `CachedTtsAdapter` | 디스크 기반 TTS 캐시 래퍼 |
|
|
19
22
|
| LLM | `AnthropicAdapter` | Anthropic Claude |
|
|
20
23
|
| LLM | `OpenAILlmAdapter` | OpenAI GPT |
|
|
@@ -24,9 +27,9 @@ npm install dvgateway-sdk dvgateway-adapters
|
|
|
24
27
|
|
|
25
28
|
```typescript
|
|
26
29
|
import { DVGatewayClient } from 'dvgateway-sdk';
|
|
27
|
-
import { DeepgramAdapter } from 'dvgateway-adapters/stt';
|
|
30
|
+
import { DeepgramAdapter, GoogleChirp3Adapter } from 'dvgateway-adapters/stt';
|
|
28
31
|
import { AnthropicAdapter } from 'dvgateway-adapters/llm';
|
|
29
|
-
import { ElevenLabsAdapter } from 'dvgateway-adapters/tts';
|
|
32
|
+
import { ElevenLabsAdapter, GeminiTtsAdapter, CosyVoiceAdapter } from 'dvgateway-adapters/tts';
|
|
30
33
|
|
|
31
34
|
const gw = new DVGatewayClient({
|
|
32
35
|
baseUrl: 'http://localhost:8080',
|
package/dist/index.d.ts
CHANGED
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
* Import only what you need — each adapter is tree-shakeable.
|
|
6
6
|
*
|
|
7
7
|
* Adapter overview:
|
|
8
|
-
* STT (Speech-to-Text) — Deepgram Nova-3
|
|
9
|
-
* TTS (Text-to-Speech) — ElevenLabs Flash v2.5, OpenAI TTS
|
|
8
|
+
* STT (Speech-to-Text) — Deepgram Nova-3, Google Chirp3
|
|
9
|
+
* TTS (Text-to-Speech) — ElevenLabs Flash v2.5, OpenAI TTS, Gemini TTS, CosyVoice
|
|
10
10
|
* LLM (Language Model) — Anthropic Claude, OpenAI GPT
|
|
11
11
|
* Realtime (Speech-to-Speech) — OpenAI Realtime API (audio 1.5)
|
|
12
12
|
*
|
|
@@ -24,6 +24,8 @@
|
|
|
24
24
|
*/
|
|
25
25
|
export { DeepgramAdapter } from './stt/deepgram.js';
|
|
26
26
|
export type { DeepgramAdapterOptions } from './stt/deepgram.js';
|
|
27
|
+
export { GoogleChirp3Adapter } from './stt/google-chirp3.js';
|
|
28
|
+
export type { GoogleChirp3AdapterOptions } from './stt/google-chirp3.js';
|
|
27
29
|
export type { HumanVoiceOptions, SttOptions, TtsOptions, } from 'dvgateway-sdk';
|
|
28
30
|
export { HUMAN_VOICE_DEFAULTS_KO, HUMAN_VOICE_DEFAULTS_EN, } from 'dvgateway-sdk';
|
|
29
31
|
export { ElevenLabsAdapter, ELEVENLABS_KOREAN_VOICES } from './tts/elevenlabs.js';
|
|
@@ -32,6 +34,10 @@ export { OpenAITtsAdapter } from './tts/openai-tts.js';
|
|
|
32
34
|
export type { OpenAITtsAdapterOptions, OpenAITtsVoice, OpenAITtsModel, } from './tts/openai-tts.js';
|
|
33
35
|
export { CachedTtsAdapter } from './tts/cached-tts.js';
|
|
34
36
|
export type { CachedTtsAdapterOptions, WarmupEntry, } from './tts/cached-tts.js';
|
|
37
|
+
export { GeminiTtsAdapter, GEMINI_TTS_VOICES } from './tts/gemini-tts.js';
|
|
38
|
+
export type { GeminiTtsAdapterOptions, GeminiTtsVoice, GeminiTtsModel, } from './tts/gemini-tts.js';
|
|
39
|
+
export { CosyVoiceAdapter, COSYVOICE_VOICES } from './tts/cosyvoice-tts.js';
|
|
40
|
+
export type { CosyVoiceAdapterOptions, CosyVoiceVoice, CosyVoiceModel, } from './tts/cosyvoice-tts.js';
|
|
35
41
|
export { AnthropicAdapter } from './llm/anthropic.js';
|
|
36
42
|
export type { AnthropicAdapterOptions } from './llm/anthropic.js';
|
|
37
43
|
export { OpenAILlmAdapter } from './llm/openai-llm.js';
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAGH,OAAO,EAAE,eAAe,EAAE,MAAmB,mBAAmB,CAAC;AACjE,YAAY,EAAE,sBAAsB,EAAE,MAAO,mBAAmB,CAAC;
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAGH,OAAO,EAAE,eAAe,EAAE,MAAmB,mBAAmB,CAAC;AACjE,YAAY,EAAE,sBAAsB,EAAE,MAAO,mBAAmB,CAAC;AAEjE,OAAO,EAAE,mBAAmB,EAAE,MAAe,wBAAwB,CAAC;AACtE,YAAY,EAAE,0BAA0B,EAAE,MAAM,wBAAwB,CAAC;AAGzE,YAAY,EACV,iBAAiB,EACjB,UAAU,EACV,UAAU,GACX,MAA4C,eAAe,CAAC;AAC7D,OAAO,EACL,uBAAuB,EACvB,uBAAuB,GACxB,MAA4C,eAAe,CAAC;AAG7D,OAAO,EAAE,iBAAiB,EAAE,wBAAwB,EAAE,MAAM,qBAAqB,CAAC;AAClF,YAAY,EAAE,wBAAwB,EAAE,MAAoB,qBAAqB,CAAC;AAElF,OAAO,EAAE,gBAAgB,EAAE,MAAsB,qBAAqB,CAAC;AACvE,YAAY,EACV,uBAAuB,EACvB,cAAc,EACd,cAAc,GACf,MAAgD,qBAAqB,CAAC;AAEvE,OAAO,EAAE,gBAAgB,EAAE,MAAsB,qBAAqB,CAAC;AACvE,YAAY,EACV,uBAAuB,EACvB,WAAW,GACZ,MAAgD,qBAAqB,CAAC;AAEvE,OAAO,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AAC1E,YAAY,EACV,uBAAuB,EACvB,cAAc,EACd,cAAc,GACf,MAAgD,qBAAqB,CAAC;AAEvE,OAAO,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC5E,YAAY,EACV,uBAAuB,EACvB,cAAc,EACd,cAAc,GACf,MAAgD,wBAAwB,CAAC;AAG1E,OAAO,EAAE,gBAAgB,EAAE,MAAsB,oBAAoB,CAAC;AACtE,YAAY,EAAE,uBAAuB,EAAE,MAAU,oBAAoB,CAAC;AAEtE,OAAO,EAAE,gBAAgB,EAAE,MAAsB,qBAAqB,CAAC;AACvE,YAAY,EAAE,uBAAuB,EAAE,MAAU,qBAAqB,CAAC;AAGvE,OAAO,EAAE,qBAAqB,EAAE,MAAiB,+BAA+B,CAAC;AACjF,YAAY,EACV,4BAA4B,EAC5B,mBAAmB,EACnB,+BAA+B,EAC/B,kCAAkC,EAClC,qBAAqB,GACtB,MAAgD,+BAA+B,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
* Import only what you need — each adapter is tree-shakeable.
|
|
6
6
|
*
|
|
7
7
|
* Adapter overview:
|
|
8
|
-
* STT (Speech-to-Text) — Deepgram Nova-3
|
|
9
|
-
* TTS (Text-to-Speech) — ElevenLabs Flash v2.5, OpenAI TTS
|
|
8
|
+
* STT (Speech-to-Text) — Deepgram Nova-3, Google Chirp3
|
|
9
|
+
* TTS (Text-to-Speech) — ElevenLabs Flash v2.5, OpenAI TTS, Gemini TTS, CosyVoice
|
|
10
10
|
* LLM (Language Model) — Anthropic Claude, OpenAI GPT
|
|
11
11
|
* Realtime (Speech-to-Speech) — OpenAI Realtime API (audio 1.5)
|
|
12
12
|
*
|
|
@@ -24,11 +24,14 @@
|
|
|
24
24
|
*/
|
|
25
25
|
// ── STT (Speech-to-Text) ──────────────────────────────────────────────────────
|
|
26
26
|
export { DeepgramAdapter } from './stt/deepgram.js';
|
|
27
|
+
export { GoogleChirp3Adapter } from './stt/google-chirp3.js';
|
|
27
28
|
export { HUMAN_VOICE_DEFAULTS_KO, HUMAN_VOICE_DEFAULTS_EN, } from 'dvgateway-sdk';
|
|
28
29
|
// ── TTS (Text-to-Speech) ─────────────────────────────────────────────────────
|
|
29
30
|
export { ElevenLabsAdapter, ELEVENLABS_KOREAN_VOICES } from './tts/elevenlabs.js';
|
|
30
31
|
export { OpenAITtsAdapter } from './tts/openai-tts.js';
|
|
31
32
|
export { CachedTtsAdapter } from './tts/cached-tts.js';
|
|
33
|
+
export { GeminiTtsAdapter, GEMINI_TTS_VOICES } from './tts/gemini-tts.js';
|
|
34
|
+
export { CosyVoiceAdapter, COSYVOICE_VOICES } from './tts/cosyvoice-tts.js';
|
|
32
35
|
// ── LLM (Language Model) ─────────────────────────────────────────────────────
|
|
33
36
|
export { AnthropicAdapter } from './llm/anthropic.js';
|
|
34
37
|
export { OpenAILlmAdapter } from './llm/openai-llm.js';
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,iFAAiF;AACjF,OAAO,EAAE,eAAe,EAAE,MAAmB,mBAAmB,CAAC;
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,iFAAiF;AACjF,OAAO,EAAE,eAAe,EAAE,MAAmB,mBAAmB,CAAC;AAGjE,OAAO,EAAE,mBAAmB,EAAE,MAAe,wBAAwB,CAAC;AAStE,OAAO,EACL,uBAAuB,EACvB,uBAAuB,GACxB,MAA4C,eAAe,CAAC;AAE7D,gFAAgF;AAChF,OAAO,EAAE,iBAAiB,EAAE,wBAAwB,EAAE,MAAM,qBAAqB,CAAC;AAGlF,OAAO,EAAE,gBAAgB,EAAE,MAAsB,qBAAqB,CAAC;AAOvE,OAAO,EAAE,gBAAgB,EAAE,MAAsB,qBAAqB,CAAC;AAMvE,OAAO,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AAO1E,OAAO,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAO5E,gFAAgF;AAChF,OAAO,EAAE,gBAAgB,EAAE,MAAsB,oBAAoB,CAAC;AAGtE,OAAO,EAAE,gBAAgB,EAAE,MAAsB,qBAAqB,CAAC;AAGvE,iFAAiF;AACjF,OAAO,EAAE,qBAAqB,EAAE,MAAiB,+BAA+B,CAAC"}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Google Cloud Speech-to-Text V2 Adapter (Chirp 3 model)
|
|
3
|
+
*
|
|
4
|
+
* Sends 16kHz slin16 PCM audio to Google Cloud Speech-to-Text REST API
|
|
5
|
+
* in 2-second chunks and fires onTranscript callbacks for final results.
|
|
6
|
+
*
|
|
7
|
+
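* Chirp 3 model features (note: this adapter's requests currently set only language, model and automatic punctuation):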
|
|
8
|
+
* - Chirp 3 model (best multilingual accuracy, GA — 100+ languages)
|
|
9
|
+
* - Speaker diarization support
|
|
10
|
+
* - Automatic language detection
|
|
11
|
+
* - Speech adaptation / denoiser
|
|
12
|
+
* - Automatic punctuation
|
|
13
|
+
*
|
|
14
|
+
* API versions:
|
|
15
|
+
* V2 (default): POST https://speech.googleapis.com/v2/projects/{projectId}/locations/global/recognizers/_:recognize
|
|
16
|
+
* V1 (fallback): POST https://speech.googleapis.com/v1/speech:recognize
|
|
17
|
+
*
|
|
18
|
+
* Audio format:
|
|
19
|
+
* Input: raw 16-bit PCM, 16kHz, mono (matches DVGateway slin16)
|
|
20
|
+
* Output: JSON transcript results with confidence scores
|
|
21
|
+
*
|
|
22
|
+
* API key format:
|
|
23
|
+
* V2: "project_id:api_key" — split at the first colon into project ID and API key
|
|
24
|
+
* V1: plain "api_key" — used when no colon separator is present
|
|
25
|
+
*
|
|
26
|
+
* Docs: https://cloud.google.com/speech-to-text/v2/docs
|
|
27
|
+
*/
|
|
28
|
+
import type { SttAdapter, AudioChunk, TranscriptResult } from 'dvgateway-sdk';
|
|
29
|
+
export interface GoogleChirp3AdapterOptions {
|
|
30
|
+
/**
|
|
31
|
+
* API key for Google Cloud Speech-to-Text.
|
|
32
|
+
* Format: "project_id:api_key" for V2 API, or plain "api_key" for V1 fallback.
|
|
33
|
+
*/
|
|
34
|
+
apiKey: string;
|
|
35
|
+
/** Language code, e.g. "ko-KR", "en-US" (default: "ko-KR") */
|
|
36
|
+
language?: string;
|
|
37
|
+
/**
|
|
38
|
+
* Google Cloud STT model (default: "chirp_3")
|
|
39
|
+
* Options: chirp_3, chirp_2, long, short, telephony, medical_dictation, medical_conversation
|
|
40
|
+
*/
|
|
41
|
+
model?: string;
|
|
42
|
+
/** Enable automatic punctuation (default: true) */
|
|
43
|
+
punctuate?: boolean;
|
|
44
|
+
}
|
|
45
|
+
export declare class GoogleChirp3Adapter implements SttAdapter {
|
|
46
|
+
private readonly opts;
|
|
47
|
+
private readonly projectId;
|
|
48
|
+
private readonly apiKey;
|
|
49
|
+
private transcriptHandler;
|
|
50
|
+
private stopped;
|
|
51
|
+
private audioBuffer;
|
|
52
|
+
constructor(opts: GoogleChirp3AdapterOptions);
|
|
53
|
+
onTranscript(handler: (result: TranscriptResult) => void): void;
|
|
54
|
+
startStream(linkedId: string, audioStream: AsyncIterable<AudioChunk>): Promise<void>;
|
|
55
|
+
stop(): Promise<void>;
|
|
56
|
+
/**
|
|
57
|
+
* Send a PCM audio chunk to Google Cloud Speech-to-Text for recognition.
|
|
58
|
+
* Tries V2 API first (if project ID is available), falls back to V1.
|
|
59
|
+
*/
|
|
60
|
+
private recognizeChunk;
|
|
61
|
+
/**
|
|
62
|
+
* Call Google Cloud Speech-to-Text V2 API.
|
|
63
|
+
* Endpoint: POST https://speech.googleapis.com/v2/projects/{projectId}/locations/global/recognizers/_:recognize
|
|
64
|
+
*/
|
|
65
|
+
private callV2Api;
|
|
66
|
+
/**
|
|
67
|
+
* Call Google Cloud Speech-to-Text V1 API.
|
|
68
|
+
* Endpoint: POST https://speech.googleapis.com/v1/speech:recognize
|
|
69
|
+
*/
|
|
70
|
+
private callV1Api;
|
|
71
|
+
/**
|
|
72
|
+
* Parse Google STT response and invoke the transcript handler.
|
|
73
|
+
*/
|
|
74
|
+
private handleResponse;
|
|
75
|
+
}
|
|
76
|
+
//# sourceMappingURL=google-chirp3.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"google-chirp3.d.ts","sourceRoot":"","sources":["../../src/stt/google-chirp3.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,UAAU,EAAE,gBAAgB,EAAE,MAAM,eAAe,CAAC;AAG9E,MAAM,WAAW,0BAA0B;IACzC;;;OAGG;IACH,MAAM,EAAE,MAAM,CAAC;IACf,8DAA8D;IAC9D,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,mDAAmD;IACnD,SAAS,CAAC,EAAE,OAAO,CAAC;CACrB;AAyBD,qBAAa,mBAAoB,YAAW,UAAU;IACpD,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAuC;IAC5D,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAgB;IAC1C,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAS;IAChC,OAAO,CAAC,iBAAiB,CAAqD;IAC9E,OAAO,CAAC,OAAO,CAAS;IACxB,OAAO,CAAC,WAAW,CAA2B;gBAElC,IAAI,EAAE,0BAA0B;IAmB5C,YAAY,CAAC,OAAO,EAAE,CAAC,MAAM,EAAE,gBAAgB,KAAK,IAAI,GAAG,IAAI;IAIzD,WAAW,CAAC,QAAQ,EAAE,MAAM,EAAE,WAAW,EAAE,aAAa,CAAC,UAAU,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC;IA0BpF,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAK3B;;;OAGG;YACW,cAAc;IAmC5B;;;OAGG;YACW,SAAS;IAyBvB;;;OAGG;YACW,SAAS;IAyBvB;;OAEG;IACH,OAAO,CAAC,cAAc;CAmBvB"}
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Google Cloud Speech-to-Text V2 Adapter (Chirp 3 model)
|
|
3
|
+
*
|
|
4
|
+
* Sends 16kHz slin16 PCM audio to Google Cloud Speech-to-Text REST API
|
|
5
|
+
* in 2-second chunks and fires onTranscript callbacks for final results.
|
|
6
|
+
*
|
|
7
|
+
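* Chirp 3 model features (note: this adapter's requests currently set only language, model and automatic punctuation):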
|
|
8
|
+
* - Chirp 3 model (best multilingual accuracy, GA — 100+ languages)
|
|
9
|
+
* - Speaker diarization support
|
|
10
|
+
* - Automatic language detection
|
|
11
|
+
* - Speech adaptation / denoiser
|
|
12
|
+
* - Automatic punctuation
|
|
13
|
+
*
|
|
14
|
+
* API versions:
|
|
15
|
+
* V2 (default): POST https://speech.googleapis.com/v2/projects/{projectId}/locations/global/recognizers/_:recognize
|
|
16
|
+
* V1 (fallback): POST https://speech.googleapis.com/v1/speech:recognize
|
|
17
|
+
*
|
|
18
|
+
* Audio format:
|
|
19
|
+
* Input: raw 16-bit PCM, 16kHz, mono (matches DVGateway slin16)
|
|
20
|
+
* Output: JSON transcript results with confidence scores
|
|
21
|
+
*
|
|
22
|
+
* API key format:
|
|
23
|
+
* V2: "project_id:api_key" — split at the first colon into project ID and API key
|
|
24
|
+
* V1: plain "api_key" — used when no colon separator is present
|
|
25
|
+
*
|
|
26
|
+
* Docs: https://cloud.google.com/speech-to-text/v2/docs
|
|
27
|
+
*/
|
|
28
|
+
import { float32ToSlin16 } from 'dvgateway-sdk';
|
|
29
|
+
/** Audio chunk accumulation settings */
const CHUNK_DURATION_MS = 2000;
const SAMPLE_RATE = 16000;
const BYTES_PER_SAMPLE = 2; // 16-bit PCM
// Number of PCM bytes sent per recognition request (one CHUNK_DURATION_MS window).
const CHUNK_BYTE_SIZE = SAMPLE_RATE * BYTES_PER_SAMPLE * (CHUNK_DURATION_MS / 1000);

/**
 * POST a JSON body and return the parsed JSON response.
 *
 * A response that cannot be parsed as a JSON object, or a non-2xx response
 * whose body carries no `error` field, is normalised to
 * `{ error: { code, message } }` built from the HTTP status. This lets callers
 * treat every failed request the same way (log it, fall back) instead of
 * receiving a JSON parse exception.
 *
 * Network-level failures (a rejected `fetch`) still propagate to the caller.
 *
 * @param {string} url - Fully built request URL.
 * @param {object} body - Request payload; serialised with JSON.stringify.
 * @returns {Promise<object>} Parsed response, or a synthesised `{ error }` object.
 */
async function postJson(url, body) {
    const res = await fetch(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(body),
    });
    let payload = null;
    try {
        payload = await res.json();
    }
    catch {
        // Body was not JSON (e.g. an HTML error page); handled below.
        payload = null;
    }
    const isObject = payload !== null && typeof payload === 'object';
    if (isObject && (res.ok || payload.error)) {
        return payload;
    }
    return {
        error: {
            code: res.status,
            message: res.statusText || 'unexpected non-JSON response',
        },
    };
}

/**
 * Speech-to-text adapter backed by the Google Cloud Speech-to-Text REST API.
 *
 * Audio delivered to `startStream` is converted to 16-bit PCM, accumulated
 * into CHUNK_BYTE_SIZE pieces and sent one request at a time. Each recognised
 * alternative is reported through the handler registered with `onTranscript`.
 *
 * The `apiKey` option selects the API version: "project_id:api_key" uses the
 * V2 endpoint (with V1 as a per-chunk fallback on error), a plain "api_key"
 * uses V1 only.
 */
export class GoogleChirp3Adapter {
    opts;
    projectId;
    apiKey;
    transcriptHandler = null;
    stopped = false;
    audioBuffer = Buffer.alloc(0);

    /**
     * @param {object} opts
     * @param {string} opts.apiKey - "project_id:api_key" (V2) or "api_key" (V1).
     * @param {string} [opts.language="ko-KR"] - Language code sent with each request.
     * @param {string} [opts.model="chirp_3"] - Recognition model name.
     * @param {boolean} [opts.punctuate=true] - Enable automatic punctuation.
     * @throws {TypeError} If `opts.apiKey` is not a string.
     */
    constructor(opts) {
        if (typeof opts?.apiKey !== 'string') {
            throw new TypeError('[GoogleChirp3Adapter] opts.apiKey must be a string ("project_id:api_key" or "api_key")');
        }
        this.opts = {
            language: opts.language ?? 'ko-KR',
            model: opts.model ?? 'chirp_3',
            punctuate: opts.punctuate ?? true,
            apiKey: opts.apiKey,
        };
        // Parse API key: "project_id:api_key" for V2, plain "api_key" for V1.
        // The split happens at the FIRST colon; a leading colon counts as "no project".
        const colonIdx = opts.apiKey.indexOf(':');
        if (colonIdx > 0) {
            this.projectId = opts.apiKey.substring(0, colonIdx);
            this.apiKey = opts.apiKey.substring(colonIdx + 1);
        }
        else {
            this.projectId = null;
            this.apiKey = opts.apiKey;
        }
    }

    /**
     * Register the callback that receives transcript results.
     * A later call replaces the previously registered handler.
     *
     * @param {(result: object) => void} handler
     */
    onTranscript(handler) {
        this.transcriptHandler = handler;
    }

    /**
     * Consume an audio stream until it ends or `stop()` is called.
     *
     * Requests are awaited one after another, so audio keeps accumulating in
     * the stream while a request is in flight. Whatever is left in the buffer
     * when the stream ends is sent as a final, shorter chunk.
     *
     * @param {string} linkedId - Identifier copied into every transcript result.
     * @param {AsyncIterable<{samples: Float32Array}>} audioStream - Audio chunks; `samples` is passed to `float32ToSlin16`.
     * @returns {Promise<void>}
     */
    async startStream(linkedId, audioStream) {
        this.stopped = false;
        this.audioBuffer = Buffer.alloc(0);
        for await (const chunk of audioStream) {
            if (this.stopped) {
                break;
            }
            // Convert Float32 samples to slin16 PCM
            const pcm = float32ToSlin16(chunk.samples);
            this.audioBuffer = Buffer.concat([
                this.audioBuffer,
                Buffer.from(pcm.buffer, pcm.byteOffset, pcm.byteLength),
            ]);
            // When we have accumulated enough audio, send a chunk for recognition
            while (this.audioBuffer.length >= CHUNK_BYTE_SIZE && !this.stopped) {
                const chunkData = this.audioBuffer.subarray(0, CHUNK_BYTE_SIZE);
                this.audioBuffer = this.audioBuffer.subarray(CHUNK_BYTE_SIZE);
                await this.recognizeChunk(linkedId, chunkData);
            }
        }
        // Process any remaining audio in the buffer
        if (!this.stopped && this.audioBuffer.length > 0) {
            await this.recognizeChunk(linkedId, this.audioBuffer);
            this.audioBuffer = Buffer.alloc(0);
        }
    }

    /**
     * Ask the running `startStream` loop to exit and drop buffered audio.
     *
     * @returns {Promise<void>}
     */
    async stop() {
        this.stopped = true;
        this.audioBuffer = Buffer.alloc(0);
    }

    /**
     * Send a PCM audio chunk to Google Cloud Speech-to-Text for recognition.
     * Tries V2 API first (if project ID is available), falls back to V1.
     *
     * Never rejects: API errors and thrown exceptions are written to stderr
     * and the chunk is dropped.
     *
     * @param {string} linkedId - Identifier copied into transcript results.
     * @param {Buffer} pcmData - Raw 16-bit PCM bytes for one request.
     * @returns {Promise<void>}
     */
    async recognizeChunk(linkedId, pcmData) {
        const base64Audio = pcmData.toString('base64');
        try {
            let response;
            if (this.projectId) {
                // Try V2 API first
                response = await this.callV2Api(base64Audio);
                if (response.error) {
                    // Fallback to V1 on error
                    process.stderr.write(`[GoogleChirp3Adapter] V2 API error (${response.error.code}): ${response.error.message}, falling back to V1\n`);
                    response = await this.callV1Api(base64Audio);
                }
            }
            else {
                // No project ID — use V1 directly
                response = await this.callV1Api(base64Audio);
            }
            if (response.error) {
                process.stderr.write(`[GoogleChirp3Adapter] API error (${response.error.code}): ${response.error.message}\n`);
                return;
            }
            this.handleResponse(linkedId, response);
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            process.stderr.write(`[GoogleChirp3Adapter] recognition error: ${message}\n`);
        }
    }

    /**
     * Call Google Cloud Speech-to-Text V2 API.
     * Endpoint: POST https://speech.googleapis.com/v2/projects/{projectId}/locations/global/recognizers/_:recognize
     *
     * @param {string} base64Audio - Base64-encoded PCM audio.
     * @returns {Promise<object>} Parsed response, or `{ error }` on failure.
     */
    async callV2Api(base64Audio) {
        // Percent-encode both values so reserved characters cannot alter the path or query.
        const project = encodeURIComponent(this.projectId);
        const key = encodeURIComponent(this.apiKey);
        const url = `https://speech.googleapis.com/v2/projects/${project}/locations/global/recognizers/_:recognize?key=${key}`;
        return postJson(url, {
            config: {
                languageCodes: [this.opts.language],
                model: this.opts.model,
                autoDecodingConfig: {},
                features: {
                    enableAutomaticPunctuation: this.opts.punctuate,
                },
            },
            content: base64Audio,
        });
    }

    /**
     * Call Google Cloud Speech-to-Text V1 API.
     * Endpoint: POST https://speech.googleapis.com/v1/speech:recognize
     *
     * NOTE(review): the configured model (default "chirp_3") is sent to V1
     * unchanged — confirm that the V1 API accepts the model names used here.
     *
     * @param {string} base64Audio - Base64-encoded PCM audio.
     * @returns {Promise<object>} Parsed response, or `{ error }` on failure.
     */
    async callV1Api(base64Audio) {
        const key = encodeURIComponent(this.apiKey);
        const url = `https://speech.googleapis.com/v1/speech:recognize?key=${key}`;
        return postJson(url, {
            config: {
                encoding: 'LINEAR16',
                sampleRateHertz: SAMPLE_RATE,
                languageCode: this.opts.language,
                enableAutomaticPunctuation: this.opts.punctuate,
                model: this.opts.model,
            },
            audio: {
                content: base64Audio,
            },
        });
    }

    /**
     * Parse Google STT response and invoke the transcript handler.
     *
     * Only the first alternative of each result is used; results without a
     * non-empty transcript are skipped.
     *
     * @param {string} linkedId - Identifier copied into each transcript result.
     * @param {object} response - Parsed API response (`results[].alternatives[]`).
     */
    handleResponse(linkedId, response) {
        if (!response.results || response.results.length === 0) {
            return;
        }
        for (const result of response.results) {
            const alt = result.alternatives?.[0];
            if (!alt || !alt.transcript) {
                continue;
            }
            const transcriptResult = {
                linkedId,
                text: alt.transcript,
                isFinal: result.isFinal !== false, // REST API results are final by default
                confidence: alt.confidence,
                language: result.languageCode ?? this.opts.language,
                timestampMs: Date.now(),
            };
            this.transcriptHandler?.(transcriptResult);
        }
    }
}
|
|
191
|
+
//# sourceMappingURL=google-chirp3.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"google-chirp3.js","sourceRoot":"","sources":["../../src/stt/google-chirp3.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAGH,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAoChD,wCAAwC;AACxC,MAAM,iBAAiB,GAAG,IAAI,CAAC;AAC/B,MAAM,WAAW,GAAG,KAAK,CAAC;AAC1B,MAAM,gBAAgB,GAAG,CAAC,CAAC,CAAC,aAAa;AACzC,MAAM,eAAe,GAAG,WAAW,GAAG,gBAAgB,GAAG,CAAC,iBAAiB,GAAG,IAAI,CAAC,CAAC;AAEpF,MAAM,OAAO,mBAAmB;IACb,IAAI,CAAuC;IAC3C,SAAS,CAAgB;IACzB,MAAM,CAAS;IACxB,iBAAiB,GAAgD,IAAI,CAAC;IACtE,OAAO,GAAG,KAAK,CAAC;IAChB,WAAW,GAAW,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IAE9C,YAAY,IAAgC;QAC1C,IAAI,CAAC,IAAI,GAAG;YACV,QAAQ,EAAE,IAAI,CAAC,QAAQ,IAAI,OAAO;YAClC,KAAK,EAAK,IAAI,CAAC,KAAK,IAAO,SAAS;YACpC,SAAS,EAAE,IAAI,CAAC,SAAS,IAAI,IAAI;YACjC,MAAM,EAAI,IAAI,CAAC,MAAM;SACtB,CAAC;QAEF,qEAAqE;QACrE,MAAM,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAC1C,IAAI,QAAQ,GAAG,CAAC,EAAE,CAAC;YACjB,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;YACpD,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,QAAQ,GAAG,CAAC,CAAC,CAAC;QACpD,CAAC;aAAM,CAAC;YACN,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC;YACtB,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC;QAC5B,CAAC;IACH,CAAC;IAED,YAAY,CAAC,OAA2C;QACtD,IAAI,CAAC,iBAAiB,GAAG,OAAO,CAAC;IACnC,CAAC;IAED,KAAK,CAAC,WAAW,CAAC,QAAgB,EAAE,WAAsC;QACxE,IAAI,CAAC,OAAO,GAAG,KAAK,CAAC;QACrB,IAAI,CAAC,WAAW,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAEnC,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,WAAW,EAAE,CAAC;YACtC,IAAI,IAAI,CAAC,OAAO;gBAAE,MAAM;YAExB,wCAAwC;YACxC,MAAM,GAAG,GAAG,eAAe,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;YAC3C,IAAI,CAAC,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,EAAE,GAAG,CAAC,UAAU,EAAE,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;YAE9G,sEAAsE;YACtE,OAAO,IAAI,CAAC,WAAW,CAAC,MAAM,IAAI,eAAe,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;gBACnE,MAAM,SAAS,GAAG,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC,EAAE,eAAe,CAAC,CAAC;gBAChE,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAC;gBAC9D,MAAM,IAAI,CAAC,cAAc,CAAC,QAAQ,EAAE,SAAS
,CAAC,CAAC;YACjD,CAAC;QACH,CAAC;QAED,4CAA4C;QAC5C,IAAI,CAAC,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACjD,MAAM,IAAI,CAAC,cAAc,CAAC,QAAQ,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;YACtD,IAAI,CAAC,WAAW,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACrC,CAAC;IACH,CAAC;IAED,KAAK,CAAC,IAAI;QACR,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACpB,IAAI,CAAC,WAAW,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACrC,CAAC;IAED;;;OAGG;IACK,KAAK,CAAC,cAAc,CAAC,QAAgB,EAAE,OAAe;QAC5D,MAAM,WAAW,GAAG,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;QAE/C,IAAI,CAAC;YACH,IAAI,QAA2B,CAAC;YAEhC,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC;gBACnB,mBAAmB;gBACnB,QAAQ,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC;gBAC7C,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;oBACnB,0BAA0B;oBAC1B,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,uCAAuC,QAAQ,CAAC,KAAK,CAAC,IAAI,MAAM,QAAQ,CAAC,KAAK,CAAC,OAAO,wBAAwB,CAC/G,CAAC;oBACF,QAAQ,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC;gBAC/C,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,kCAAkC;gBAClC,QAAQ,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC;YAC/C,CAAC;YAED,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;gBACnB,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,oCAAoC,QAAQ,CAAC,KAAK,CAAC,IAAI,MAAM,QAAQ,CAAC,KAAK,CAAC,OAAO,IAAI,CACxF,CAAC;gBACF,OAAO;YACT,CAAC;YAED,IAAI,CAAC,cAAc,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;QAC1C,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,MAAM,OAAO,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YACjE,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,4CAA4C,OAAO,IAAI,CAAC,CAAC;QAChF,CAAC;IACH,CAAC;IAED;;;OAGG;IACK,KAAK,CAAC,SAAS,CAAC,WAAmB;QACzC,MAAM,GAAG,GACP,6CAA6C,IAAI,CAAC,SAAS,iDAAiD,IAAI,CAAC,MAAM,EAAE,CAAC;QAE5H,MAAM,IAAI,GAAG;YACX,MAAM,EAAE;gBACN,aAAa,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC;gBACnC,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,KAAK;gBACtB,kBAAkB,EAAE,EAAE;gBACtB,QAAQ,EAAE;oBACR,0BAA0B,EAAE,IAAI,CAAC,IAAI,CAAC,SAAS;iBAChD;aACF;YACD,OAAO,EAAE,WAAW;SACrB,CAAC;QAEF,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAC3B,MAAM,EAAE,MAAM;YACd,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;YAC/C,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC;SAC3B,CAAC,CAAC;QAEH,OAAO,CAAC,MA
AM,GAAG,CAAC,IAAI,EAAE,CAAsB,CAAC;IACjD,CAAC;IAED;;;OAGG;IACK,KAAK,CAAC,SAAS,CAAC,WAAmB;QACzC,MAAM,GAAG,GAAG,yDAAyD,IAAI,CAAC,MAAM,EAAE,CAAC;QAEnF,MAAM,IAAI,GAAG;YACX,MAAM,EAAE;gBACN,QAAQ,EAAE,UAAmB;gBAC7B,eAAe,EAAE,WAAW;gBAC5B,YAAY,EAAE,IAAI,CAAC,IAAI,CAAC,QAAQ;gBAChC,0BAA0B,EAAE,IAAI,CAAC,IAAI,CAAC,SAAS;gBAC/C,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,KAAK;aACvB;YACD,KAAK,EAAE;gBACL,OAAO,EAAE,WAAW;aACrB;SACF,CAAC;QAEF,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAC3B,MAAM,EAAE,MAAM;YACd,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;YAC/C,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC;SAC3B,CAAC,CAAC;QAEH,OAAO,CAAC,MAAM,GAAG,CAAC,IAAI,EAAE,CAAsB,CAAC;IACjD,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,QAAgB,EAAE,QAA2B;QAClE,IAAI,CAAC,QAAQ,CAAC,OAAO,IAAI,QAAQ,CAAC,OAAO,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO;QAE/D,KAAK,MAAM,MAAM,IAAI,QAAQ,CAAC,OAAO,EAAE,CAAC;YACtC,MAAM,GAAG,GAAG,MAAM,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC,CAAC;YACrC,IAAI,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,UAAU;gBAAE,SAAS;YAEtC,MAAM,gBAAgB,GAAqB;gBACzC,QAAQ;gBACR,IAAI,EAAE,GAAG,CAAC,UAAU;gBACpB,OAAO,EAAE,MAAM,CAAC,OAAO,KAAK,KAAK,EAAE,wCAAwC;gBAC3E,UAAU,EAAE,GAAG,CAAC,UAAU;gBAC1B,QAAQ,EAAE,MAAM,CAAC,YAAY,IAAI,IAAI,CAAC,IAAI,CAAC,QAAQ;gBACnD,WAAW,EAAE,IAAI,CAAC,GAAG,EAAE;aACxB,CAAC;YAEF,IAAI,CAAC,iBAAiB,EAAE,CAAC,gBAAgB,CAAC,CAAC;QAC7C,CAAC;IACH,CAAC;CACF"}
|
package/dist/stt/index.d.ts
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
1
|
export { DeepgramAdapter } from './deepgram.js';
|
|
2
2
|
export type { DeepgramAdapterOptions } from './deepgram.js';
|
|
3
|
+
export { GoogleChirp3Adapter } from './google-chirp3.js';
|
|
4
|
+
export type { GoogleChirp3AdapterOptions } from './google-chirp3.js';
|
|
3
5
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/stt/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/stt/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAChD,YAAY,EAAE,sBAAsB,EAAE,MAAM,eAAe,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/stt/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAChD,YAAY,EAAE,sBAAsB,EAAE,MAAM,eAAe,CAAC;AAC5D,OAAO,EAAE,mBAAmB,EAAE,MAAM,oBAAoB,CAAC;AACzD,YAAY,EAAE,0BAA0B,EAAE,MAAM,oBAAoB,CAAC"}
|
package/dist/stt/index.js
CHANGED
package/dist/stt/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/stt/index.ts"],"names":[],"mappings":"AAAA,gCAAgC;AAChC,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/stt/index.ts"],"names":[],"mappings":"AAAA,gCAAgC;AAChC,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAEhD,OAAO,EAAE,mBAAmB,EAAE,MAAM,oBAAoB,CAAC"}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Alibaba CosyVoice TTS Adapter (DashScope API)
|
|
3
|
+
*
|
|
4
|
+
* Synthesizes text to speech using Alibaba's CosyVoice models via the DashScope API.
|
|
5
|
+
* Returns 16kHz slin16 PCM chunks for direct injection into DVGateway.
|
|
6
|
+
*
|
|
7
|
+
* Features:
|
|
8
|
+
* - High-quality Chinese and multilingual TTS via CosyVoice
|
|
9
|
+
* - MP3 → 16kHz PCM conversion via ffmpeg subprocess
|
|
10
|
+
* - Supports 11 languages including Korean via language_hints parameter
|
|
11
|
+
* - Multiple voice presets optimized for different use cases
|
|
12
|
+
*
|
|
13
|
+
* Model Reference (2026-03):
|
|
14
|
+
* cosyvoice-v3.5-plus — Highest quality, best prosody and naturalness (default)
|
|
15
|
+
* cosyvoice-v3.5-flash — Low latency, optimized for real-time applications
|
|
16
|
+
*
|
|
17
|
+
* Voice options:
|
|
18
|
+
* longxiaochun — Recommended, versatile female voice (default)
|
|
19
|
+
* longxiaochun_v2 — Updated version of longxiaochun
|
|
20
|
+
* longyue — Female, gentle tone
|
|
21
|
+
* longwan — Female, warm tone
|
|
22
|
+
* longjing — Female, clear and professional
|
|
23
|
+
* longshuo — Male, steady and authoritative
|
|
24
|
+
* longhua — Male, warm and friendly
|
|
25
|
+
* longfei — Male, energetic
|
|
26
|
+
* longshu — Male, calm and measured
|
|
27
|
+
*
|
|
28
|
+
* Supported languages via language_hints:
|
|
29
|
+
* zh (Chinese), en (English), ja (Japanese), ko (Korean),
|
|
30
|
+
* yue (Cantonese), and more.
|
|
31
|
+
*
|
|
32
|
+
* Note: CosyVoice outputs MP3; we decode to PCM and resample to 16kHz for DVGateway.
|
|
33
|
+
*
|
|
34
|
+
* API Endpoint: POST https://dashscope.aliyuncs.com/api/v1/services/aigc/text2audio/generation
|
|
35
|
+
* Docs: https://help.aliyun.com/document_detail/2712195.html
|
|
36
|
+
*/
|
|
37
|
+
import type { TtsAdapter, TtsOptions, VoiceInfo } from 'dvgateway-sdk';
|
|
38
|
+
export type CosyVoiceVoice = 'longxiaochun' | 'longxiaochun_v2' | 'longyue' | 'longwan' | 'longjing' | 'longshuo' | 'longhua' | 'longfei' | 'longshu';
|
|
39
|
+
export type CosyVoiceModel = 'cosyvoice-v3.5-plus' | 'cosyvoice-v3.5-flash';
|
|
40
|
+
export interface CosyVoiceAdapterOptions {
|
|
41
|
+
apiKey: string;
|
|
42
|
+
/** Voice preset (default: "longxiaochun") — see voice options above */
|
|
43
|
+
voice?: CosyVoiceVoice;
|
|
44
|
+
/**
|
|
45
|
+
* Model (default: "cosyvoice-v3.5-plus")
|
|
46
|
+
* cosyvoice-v3.5-plus — Highest quality, best prosody
|
|
47
|
+
* cosyvoice-v3.5-flash — Low latency, real-time optimized
|
|
48
|
+
*/
|
|
49
|
+
model?: CosyVoiceModel;
|
|
50
|
+
/**
|
|
51
|
+
* Language hints for multilingual synthesis.
|
|
52
|
+
* Array of language codes, e.g. ["ko"] for Korean, ["zh"] for Chinese.
|
|
53
|
+
* When not specified, the model auto-detects the language.
|
|
54
|
+
*/
|
|
55
|
+
language?: string[];
|
|
56
|
+
/**
|
|
57
|
+
* Audio sample rate in Hz (default: 16000).
|
|
58
|
+
* Supported: 8000, 16000, 22050, 24000, 44100, 48000
|
|
59
|
+
*/
|
|
60
|
+
sampleRate?: number;
|
|
61
|
+
}
|
|
62
|
+
/** Available CosyVoice voice presets with descriptions */
|
|
63
|
+
export declare const COSYVOICE_VOICES: ReadonlyArray<VoiceInfo>;
|
|
64
|
+
export declare class CosyVoiceAdapter implements TtsAdapter {
|
|
65
|
+
private readonly opts;
|
|
66
|
+
constructor(opts: CosyVoiceAdapterOptions);
|
|
67
|
+
synthesize(text: string, options?: TtsOptions): AsyncIterable<Buffer>;
|
|
68
|
+
}
|
|
69
|
+
//# sourceMappingURL=cosyvoice-tts.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cosyvoice-tts.d.ts","sourceRoot":"","sources":["../../src/tts/cosyvoice-tts.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAmCG;AAGH,OAAO,KAAK,EAAE,UAAU,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,eAAe,CAAC;AAEvE,MAAM,MAAM,cAAc,GACtB,cAAc,GACd,iBAAiB,GACjB,SAAS,GACT,SAAS,GACT,UAAU,GACV,UAAU,GACV,SAAS,GACT,SAAS,GACT,SAAS,CAAC;AAEd,MAAM,MAAM,cAAc,GAAG,qBAAqB,GAAG,sBAAsB,CAAC;AAE5E,MAAM,WAAW,uBAAuB;IACtC,MAAM,EAAE,MAAM,CAAC;IACf,uEAAuE;IACvE,KAAK,CAAC,EAAE,cAAc,CAAC;IACvB;;;;OAIG;IACH,KAAK,CAAC,EAAE,cAAc,CAAC;IACvB;;;;OAIG;IACH,QAAQ,CAAC,EAAE,MAAM,EAAE,CAAC;IACpB;;;OAGG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED,0DAA0D;AAC1D,eAAO,MAAM,gBAAgB,EAAE,aAAa,CAAC,SAAS,CAU5C,CAAC;AAMX,qBAAa,gBAAiB,YAAW,UAAU;IACjD,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAoC;gBAE7C,IAAI,EAAE,uBAAuB;IAUlC,UAAU,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,UAAU,GAAG,aAAa,CAAC,MAAM,CAAC;CA+D7E"}
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Alibaba CosyVoice TTS Adapter (DashScope API)
|
|
3
|
+
*
|
|
4
|
+
* Synthesizes text to speech using Alibaba's CosyVoice models via the DashScope API.
|
|
5
|
+
* Returns 16kHz slin16 PCM chunks for direct injection into DVGateway.
|
|
6
|
+
*
|
|
7
|
+
* Features:
|
|
8
|
+
* - High-quality Chinese and multilingual TTS via CosyVoice
|
|
9
|
+
* - MP3 → 16kHz PCM conversion via ffmpeg subprocess
|
|
10
|
+
* - Supports 11 languages including Korean via language_hints parameter
|
|
11
|
+
* - Multiple voice presets optimized for different use cases
|
|
12
|
+
*
|
|
13
|
+
* Model Reference (2026-03):
|
|
14
|
+
* cosyvoice-v3.5-plus — Highest quality, best prosody and naturalness (default)
|
|
15
|
+
* cosyvoice-v3.5-flash — Low latency, optimized for real-time applications
|
|
16
|
+
*
|
|
17
|
+
* Voice options:
|
|
18
|
+
* longxiaochun — Recommended, versatile female voice (default)
|
|
19
|
+
* longxiaochun_v2 — Updated version of longxiaochun
|
|
20
|
+
* longyue — Female, gentle tone
|
|
21
|
+
* longwan — Female, warm tone
|
|
22
|
+
* longjing — Female, clear and professional
|
|
23
|
+
* longshuo — Male, steady and authoritative
|
|
24
|
+
* longhua — Male, warm and friendly
|
|
25
|
+
* longfei — Male, energetic
|
|
26
|
+
* longshu — Male, calm and measured
|
|
27
|
+
*
|
|
28
|
+
* Supported languages via language_hints:
|
|
29
|
+
* zh (Chinese), en (English), ja (Japanese), ko (Korean),
|
|
30
|
+
* yue (Cantonese), and more.
|
|
31
|
+
*
|
|
32
|
+
* Note: CosyVoice outputs MP3; we decode to PCM and resample to 16kHz for DVGateway.
|
|
33
|
+
*
|
|
34
|
+
* API Endpoint: POST https://dashscope.aliyuncs.com/api/v1/services/aigc/text2audio/generation
|
|
35
|
+
* Docs: https://help.aliyun.com/document_detail/2712195.html
|
|
36
|
+
*/
|
|
37
|
+
import { spawn } from 'child_process';
|
|
38
|
+
/** Available CosyVoice voice presets with descriptions */
|
|
39
|
+
export const COSYVOICE_VOICES = [
|
|
40
|
+
{ id: 'longxiaochun', label: 'longxiaochun (여성, 추천)' },
|
|
41
|
+
{ id: 'longxiaochun_v2', label: 'longxiaochun_v2 (여성, 업데이트)' },
|
|
42
|
+
{ id: 'longyue', label: 'longyue (여성, 부드러운 톤)' },
|
|
43
|
+
{ id: 'longwan', label: 'longwan (여성, 따뜻한 톤)' },
|
|
44
|
+
{ id: 'longjing', label: 'longjing (여성, 전문적)' },
|
|
45
|
+
{ id: 'longshuo', label: 'longshuo (남성, 안정적)' },
|
|
46
|
+
{ id: 'longhua', label: 'longhua (남성, 친근한)' },
|
|
47
|
+
{ id: 'longfei', label: 'longfei (남성, 에너지)' },
|
|
48
|
+
{ id: 'longshu', label: 'longshu (남성, 차분한)' },
|
|
49
|
+
];
|
|
50
|
+
const DASHSCOPE_TTS_URL = 'https://dashscope.aliyuncs.com/api/v1/services/aigc/text2audio/generation';
|
|
51
|
+
const DV_SAMPLE_RATE = 16000;
|
|
52
|
+
const PCM_CHUNK_BYTES = 640; // 20ms at 16kHz, 16-bit PCM
|
|
53
|
+
/**
 * Text-to-speech adapter for Alibaba CosyVoice (DashScope HTTP API).
 *
 * Requests MP3 audio from DashScope, transcodes it with this module's
 * `mp3ToPcm` helper at `DV_SAMPLE_RATE`, and yields the resulting PCM in
 * frames of at most `PCM_CHUNK_BYTES` bytes.
 */
export class CosyVoiceAdapter {
  /** Resolved adapter options (constructor input merged with defaults). */
  opts;

  /**
   * @param {object} opts
   * @param {string} opts.apiKey - DashScope API key, sent as a Bearer token.
   * @param {string} [opts.voice='longxiaochun'] - Default voice preset.
   * @param {string} [opts.model='cosyvoice-v3.5-plus'] - Model identifier.
   * @param {string[]} [opts.language=[]] - Default `language_hints`; left out
   *   of the request when empty.
   * @param {number} [opts.sampleRate] - `sample_rate` requested from the API
   *   (defaults to `DV_SAMPLE_RATE`). The yielded PCM is always transcoded
   *   to `DV_SAMPLE_RATE` regardless of this value.
   */
  constructor(opts) {
    this.opts = {
      apiKey: opts.apiKey,
      voice: opts.voice ?? 'longxiaochun',
      model: opts.model ?? 'cosyvoice-v3.5-plus',
      language: opts.language ?? [],
      sampleRate: opts.sampleRate ?? DV_SAMPLE_RATE,
    };
  }

  /**
   * Synthesize `text` and yield PCM frames.
   *
   * @param {string} text - Text to speak. Blank text yields nothing and
   *   performs no API request.
   * @param {object} [options] - Per-call overrides.
   * @param {string} [options.voiceId] - Voice to use instead of the default.
   * @param {string} [options.language] - Language tag such as "ko-KR"; only
   *   the primary subtag ("ko") is sent as the single language hint.
   * @returns {AsyncGenerator<Buffer>} PCM frames of at most `PCM_CHUNK_BYTES`.
   * @throws {Error} On a non-2xx response, a response without usable audio,
   *   or a transcoding failure.
   */
  async *synthesize(text, options) {
    // Nothing to speak: avoid a pointless API round-trip that can only fail.
    if (typeof text === 'string' && text.trim() === '') {
      return;
    }

    const voice = options?.voiceId ?? this.opts.voice;
    const language = options?.language
      ? [options.language.split('-')[0]] // "ko-KR" → ["ko"]
      : this.opts.language;

    // Build DashScope request body
    const requestBody = {
      model: this.opts.model,
      input: {
        text,
        voice,
        ...(language.length > 0 ? { language_hints: language } : {}),
      },
      parameters: {
        format: 'mp3',
        sample_rate: this.opts.sampleRate,
      },
    };

    const response = await fetch(DASHSCOPE_TTS_URL, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${this.opts.apiKey}`,
        'Content-Type': 'application/json',
        'Accept': '*/*',
      },
      body: JSON.stringify(requestBody),
    });
    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`CosyVoice API error ${response.status}: ${errorText}`);
    }

    // The service may answer with JSON (base64 audio) or with raw MP3 bytes.
    const contentType = response.headers.get('content-type') ?? '';
    let mp3Buffer;
    if (contentType.includes('application/json')) {
      const json = await response.json();
      const audio = json?.output?.audio;
      if (!audio) {
        throw new Error(`CosyVoice API returned no audio data: ${json?.message ?? 'unknown error'}`);
      }
      // Buffer.from(<object>, 'base64') fails with an opaque TypeError, so
      // report an unexpected payload shape explicitly instead.
      if (typeof audio !== 'string') {
        throw new Error(
          `CosyVoice API returned audio in an unexpected format (${typeof audio}); expected a base64 string`,
        );
      }
      mp3Buffer = Buffer.from(audio, 'base64');
    } else {
      mp3Buffer = Buffer.from(await response.arrayBuffer());
    }

    // Fail here with a clear message rather than letting ffmpeg choke on
    // zero bytes of input.
    if (mp3Buffer.length === 0) {
      throw new Error('CosyVoice API returned empty audio data');
    }

    // Convert MP3 → 16kHz 16-bit signed LE PCM via ffmpeg
    const pcmBuffer = await mp3ToPcm(mp3Buffer, DV_SAMPLE_RATE);

    // Yield PCM chunks; the final frame may be shorter than PCM_CHUNK_BYTES.
    for (let offset = 0; offset < pcmBuffer.length; offset += PCM_CHUNK_BYTES) {
      yield pcmBuffer.subarray(offset, Math.min(offset + PCM_CHUNK_BYTES, pcmBuffer.length));
    }
  }
}
|
|
122
|
+
/**
 * Convert an MP3 audio buffer to raw mono 16-bit signed little-endian PCM at
 * the target sample rate by piping it through an `ffmpeg` subprocess
 * (stdin → stdout).
 *
 * @param {Buffer} mp3Data - Encoded MP3 bytes; must be non-empty.
 * @param {number} sampleRate - Output sample rate in Hz (passed to `-ar`).
 * @returns {Promise<Buffer>} All PCM bytes ffmpeg wrote to stdout.
 *   Rejects with an `Error` when the input is empty, ffmpeg cannot be
 *   spawned, or ffmpeg exits unsuccessfully (the message then includes
 *   whatever ffmpeg printed on stderr).
 */
function mp3ToPcm(mp3Data, sampleRate) {
  return new Promise((resolve, reject) => {
    // ffmpeg can only fail on zero bytes of input; say so directly instead
    // of paying for a subprocess and surfacing its less helpful message.
    if (!mp3Data || mp3Data.length === 0) {
      reject(new Error('ffmpeg input is empty: no MP3 data to convert'));
      return;
    }

    const ffmpeg = spawn('ffmpeg', [
      '-i', 'pipe:0', // Read from stdin
      '-f', 's16le', // Output format: signed 16-bit little-endian
      '-acodec', 'pcm_s16le',
      '-ar', String(sampleRate),
      '-ac', '1', // Mono
      '-loglevel', 'error',
      'pipe:1', // Write to stdout
    ]);

    const chunks = [];
    let stderrText = '';

    ffmpeg.stdout.on('data', (chunk) => {
      chunks.push(chunk);
    });

    ffmpeg.stderr.on('data', (data) => {
      // Log ffmpeg output but don't reject here — some messages are
      // non-fatal. Keep the text so a failing exit can explain itself.
      const msg = data.toString().trim();
      if (msg) {
        stderrText += `${msg}\n`;
        console.error(`[TTS][CosyVoice] ffmpeg: ${msg}`);
      }
    });

    ffmpeg.on('close', (code, signal) => {
      if (code !== 0) {
        // `code` is null when the process was terminated by a signal.
        const reason = code === null
          ? `was terminated by signal ${signal}`
          : `exited with code ${code}`;
        const detail = stderrText.trim();
        reject(new Error(`ffmpeg ${reason}${detail ? `: ${detail}` : ''}`));
        return;
      }
      resolve(Buffer.concat(chunks));
    });

    ffmpeg.on('error', (err) => {
      reject(new Error(`ffmpeg spawn error: ${err.message}`));
    });

    // If ffmpeg exits before draining its stdin (bad input, spawn failure),
    // the write fails with EPIPE. Without a listener that stream error is
    // unhandled and can take down the whole process; the actual failure is
    // already reported through the 'close' / 'error' handlers above.
    ffmpeg.stdin.on('error', () => {});

    // Write MP3 data to ffmpeg stdin and close
    ffmpeg.stdin.end(mp3Data);
  });
}
|
|
163
|
+
//# sourceMappingURL=cosyvoice-tts.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cosyvoice-tts.js","sourceRoot":"","sources":["../../src/tts/cosyvoice-tts.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAmCG;AAEH,OAAO,EAAE,KAAK,EAAE,MAAM,eAAe,CAAC;AAuCtC,0DAA0D;AAC1D,MAAM,CAAC,MAAM,gBAAgB,GAA6B;IACxD,EAAE,EAAE,EAAE,cAAc,EAAK,KAAK,EAAE,uBAAuB,EAAE;IACzD,EAAE,EAAE,EAAE,iBAAiB,EAAE,KAAK,EAAE,4BAA4B,EAAE;IAC9D,EAAE,EAAE,EAAE,SAAS,EAAU,KAAK,EAAE,sBAAsB,EAAE;IACxD,EAAE,EAAE,EAAE,SAAS,EAAU,KAAK,EAAE,qBAAqB,EAAE;IACvD,EAAE,EAAE,EAAE,UAAU,EAAS,KAAK,EAAE,oBAAoB,EAAE;IACtD,EAAE,EAAE,EAAE,UAAU,EAAS,KAAK,EAAE,oBAAoB,EAAE;IACtD,EAAE,EAAE,EAAE,SAAS,EAAU,KAAK,EAAE,mBAAmB,EAAE;IACrD,EAAE,EAAE,EAAE,SAAS,EAAU,KAAK,EAAE,mBAAmB,EAAE;IACrD,EAAE,EAAE,EAAE,SAAS,EAAU,KAAK,EAAE,mBAAmB,EAAE;CAC7C,CAAC;AAEX,MAAM,iBAAiB,GAAG,2EAA2E,CAAC;AACtG,MAAM,cAAc,GAAG,KAAK,CAAC;AAC7B,MAAM,eAAe,GAAG,GAAG,CAAC,CAAC,4BAA4B;AAEzD,MAAM,OAAO,gBAAgB;IACV,IAAI,CAAoC;IAEzD,YAAY,IAA6B;QACvC,IAAI,CAAC,IAAI,GAAG;YACV,MAAM,EAAM,IAAI,CAAC,MAAM;YACvB,KAAK,EAAO,IAAI,CAAC,KAAK,IAAS,cAAc;YAC7C,KAAK,EAAO,IAAI,CAAC,KAAK,IAAS,qBAAqB;YACpD,QAAQ,EAAI,IAAI,CAAC,QAAQ,IAAM,EAAE;YACjC,UAAU,EAAE,IAAI,CAAC,UAAU,IAAI,cAAc;SAC9C,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,CAAC,UAAU,CAAC,IAAY,EAAE,OAAoB;QAClD,MAAM,KAAK,GAAI,OAAO,EAAE,OAAsC,IAAI,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC;QAClF,MAAM,QAAQ,GAAG,OAAO,EAAE,QAAQ;YAChC,CAAC,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAE,mBAAmB;YACvD,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC;QAEvB,+BAA+B;QAC/B,MAAM,WAAW,GAA4B;YAC3C,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,KAAK;YACtB,KAAK,EAAE;gBACL,IAAI;gBACJ,KAAK;gBACL,GAAG,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;aAC7D;YACD,UAAU,EAAE;gBACV,MAAM,EAAE,KAAK;gBACb,WAAW,EAAE,IAAI,CAAC,IAAI,CAAC,UAAU;aAClC;SACF,CAAC;QAEF,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,iBAAiB,EAAE;YAC9C,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,eAAe,EAAE,UAAU,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE;gBAC7C,cAAc,EAAE,kBAAkB;gBAClC,QAAQ,EAAE,KAAK;aAChB;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC;SACl
C,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,SAAS,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACxC,MAAM,IAAI,KAAK,CAAC,uBAAuB,QAAQ,CAAC,MAAM,KAAK,SAAS,EAAE,CAAC,CAAC;QAC1E,CAAC;QAED,6DAA6D;QAC7D,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;QAC/D,IAAI,SAAiB,CAAC;QAEtB,IAAI,WAAW,CAAC,QAAQ,CAAC,kBAAkB,CAAC,EAAE,CAAC;YAC7C,0CAA0C;YAC1C,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAuD,CAAC;YACxF,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,KAAK,EAAE,CAAC;gBACxB,MAAM,IAAI,KAAK,CAAC,yCAAyC,IAAI,CAAC,OAAO,IAAI,eAAe,EAAE,CAAC,CAAC;YAC9F,CAAC;YACD,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;QACvD,CAAC;aAAM,CAAC;YACN,sBAAsB;YACtB,MAAM,WAAW,GAAG,MAAM,QAAQ,CAAC,WAAW,EAAE,CAAC;YACjD,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACvC,CAAC;QAED,sDAAsD;QACtD,MAAM,SAAS,GAAG,MAAM,QAAQ,CAAC,SAAS,EAAE,cAAc,CAAC,CAAC;QAE5D,iCAAiC;QACjC,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,OAAO,MAAM,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC;YACjC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,eAAe,EAAE,SAAS,CAAC,MAAM,CAAC,CAAC;YACjE,MAAM,SAAS,CAAC,QAAQ,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;YACtC,MAAM,GAAG,GAAG,CAAC;QACf,CAAC;IACH,CAAC;CACF;AAED;;;GAGG;AACH,SAAS,QAAQ,CAAC,OAAe,EAAE,UAAkB;IACnD,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACrC,MAAM,MAAM,GAAG,KAAK,CAAC,QAAQ,EAAE;YAC7B,IAAI,EAAE,QAAQ,EAAS,kBAAkB;YACzC,IAAI,EAAE,OAAO,EAAU,6CAA6C;YACpE,SAAS,EAAE,WAAW;YACtB,KAAK,EAAE,MAAM,CAAC,UAAU,CAAC;YACzB,KAAK,EAAE,GAAG,EAAa,OAAO;YAC9B,WAAW,EAAE,OAAO;YACpB,QAAQ,EAAe,kBAAkB;SAC1C,CAAC,CAAC;QAEH,MAAM,MAAM,GAAa,EAAE,CAAC;QAE5B,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,KAAa,EAAE,EAAE;YACzC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACrB,CAAC,CAAC,CAAC;QAEH,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,IAAY,EAAE,EAAE;YACxC,mEAAmE;YACnE,MAAM,GAAG,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAC;YACnC,IAAI,GAAG,EAAE,CAAC;gBACR,OAAO,CAAC,KAAK,CAAC,4BAA4B,GAAG,EAAE,CAAC,CAAC;YACnD,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;YAC1B,IAAI,IAAI,KAAK,CAAC,EAAE,CAAC;gBACf,MAAM,
CAAC,IAAI,KAAK,CAAC,2BAA2B,IAAI,EAAE,CAAC,CAAC,CAAC;gBACrD,OAAO;YACT,CAAC;YACD,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;QACjC,CAAC,CAAC,CAAC;QAEH,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,GAAG,EAAE,EAAE;YACzB,MAAM,CAAC,IAAI,KAAK,CAAC,uBAAuB,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QAC1D,CAAC,CAAC,CAAC;QAEH,2CAA2C;QAC3C,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAC5B,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC;IACrB,CAAC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Google Gemini TTS Adapter
|
|
3
|
+
*
|
|
4
|
+
* Uses Google Cloud Text-to-Speech API with Gemini models for speech synthesis.
|
|
5
|
+
* Returns 16kHz slin16 PCM chunks for direct injection into DVGateway.
|
|
6
|
+
*
|
|
7
|
+
* The API returns base64-encoded MP3 audio which is decoded and converted
|
|
8
|
+
* to 16kHz 16-bit PCM via ffmpeg subprocess for DVGateway compatibility.
|
|
9
|
+
*
|
|
10
|
+
* Model Reference (2026-03):
|
|
11
|
+
* gemini-2.5-flash-tts — Low-latency, optimized for real-time voice (default)
|
|
12
|
+
* gemini-2.5-pro-tts — Highest quality, richer prosody and expressiveness
|
|
13
|
+
*
|
|
14
|
+
* Voice options (30 voices):
|
|
15
|
+
* Kore — Recommended female voice (natural, warm)
|
|
16
|
+
* Puck — Recommended male voice (clear, friendly)
|
|
17
|
+
* Aoede — Melodic, expressive female
|
|
18
|
+
* Charon — Deep, authoritative male
|
|
19
|
+
* Fenrir — Strong, commanding male
|
|
20
|
+
* Leda — Soft, gentle female
|
|
21
|
+
* Orus — Calm, measured male
|
|
22
|
+
* Zephyr — Light, airy, gender-neutral
|
|
23
|
+
* Achernar — Crisp, professional
|
|
24
|
+
* Achird — Warm, conversational
|
|
25
|
+
* Algenib — Bright, energetic
|
|
26
|
+
* Algieba — Smooth, refined
|
|
27
|
+
* Alnilam — Clear, precise
|
|
28
|
+
* Autonoe — Expressive, dynamic
|
|
29
|
+
* Callirhoe — Graceful, flowing
|
|
30
|
+
* Despina — Cheerful, lively
|
|
31
|
+
* Enceladus — Rich, resonant
|
|
32
|
+
* Erinome — Gentle, soothing
|
|
33
|
+
* Gacrux — Steady, reliable
|
|
34
|
+
* Iapetus — Bold, confident
|
|
35
|
+
* Laomedeia — Elegant, poised
|
|
36
|
+
* Pulcherrima — Beautiful, melodic
|
|
37
|
+
* Rasalgethi — Warm, inviting
|
|
38
|
+
* Sadachbia — Calm, reassuring
|
|
39
|
+
* Sadaltager — Neutral, versatile
|
|
40
|
+
* Schedar — Crisp, articulate
|
|
41
|
+
* Sulafar — Deep, thoughtful
|
|
42
|
+
* Umbriel — Soft, subtle
|
|
43
|
+
* Vindemiatrix — Bright, clear
|
|
44
|
+
* Zubenelgenubi — Unique, distinctive
|
|
45
|
+
*
|
|
46
|
+
* Supports 24+ languages including Korean (ko-KR).
|
|
47
|
+
* Natural language prompts can control style, tone, pace, and emotion
|
|
48
|
+
* via the `prompt` field in the input object.
|
|
49
|
+
*
|
|
50
|
+
* API Endpoint: POST https://texttospeech.googleapis.com/v1/text:synthesize?key={apiKey}
|
|
51
|
+
* Docs: https://cloud.google.com/text-to-speech/docs/reference/rest
|
|
52
|
+
*/
|
|
53
|
+
import type { TtsAdapter, TtsOptions, VoiceInfo } from 'dvgateway-sdk';
|
|
54
|
+
export type GeminiTtsVoice = 'Kore' | 'Puck' | 'Aoede' | 'Charon' | 'Fenrir' | 'Leda' | 'Orus' | 'Zephyr' | 'Achernar' | 'Achird' | 'Algenib' | 'Algieba' | 'Alnilam' | 'Autonoe' | 'Callirhoe' | 'Despina' | 'Enceladus' | 'Erinome' | 'Gacrux' | 'Iapetus' | 'Laomedeia' | 'Pulcherrima' | 'Rasalgethi' | 'Sadachbia' | 'Sadaltager' | 'Schedar' | 'Sulafar' | 'Umbriel' | 'Vindemiatrix' | 'Zubenelgenubi';
|
|
55
|
+
export type GeminiTtsModel = 'gemini-2.5-flash-tts' | 'gemini-2.5-pro-tts';
|
|
56
|
+
export interface GeminiTtsAdapterOptions {
|
|
57
|
+
/** Google Cloud API key */
|
|
58
|
+
apiKey: string;
|
|
59
|
+
/** Voice name (default: "Kore") — see voice options above */
|
|
60
|
+
voice?: GeminiTtsVoice;
|
|
61
|
+
/**
|
|
62
|
+
* Model (default: "gemini-2.5-flash-tts")
|
|
63
|
+
* gemini-2.5-flash-tts — Low-latency, real-time optimized
|
|
64
|
+
* gemini-2.5-pro-tts — Highest quality, richer prosody
|
|
65
|
+
*/
|
|
66
|
+
model?: GeminiTtsModel;
|
|
67
|
+
/**
|
|
68
|
+
* BCP-47 language code (default: "ko-KR")
|
|
69
|
+
* Examples: "en-US", "ja-JP", "zh-CN", "ko-KR"
|
|
70
|
+
*/
|
|
71
|
+
languageCode?: string;
|
|
72
|
+
/**
|
|
73
|
+
* Natural language prompt for style control.
|
|
74
|
+
* Controls tone, pace, emotion, and speaking style.
|
|
75
|
+
* E.g. "Speak warmly and calmly, with gentle pauses between sentences."
|
|
76
|
+
*/
|
|
77
|
+
prompt?: string;
|
|
78
|
+
}
|
|
79
|
+
/** All available Gemini TTS voices with descriptive labels */
|
|
80
|
+
export declare const GEMINI_TTS_VOICES: ReadonlyArray<VoiceInfo>;
|
|
81
|
+
export declare class GeminiTtsAdapter implements TtsAdapter {
|
|
82
|
+
private readonly opts;
|
|
83
|
+
constructor(opts: GeminiTtsAdapterOptions);
|
|
84
|
+
synthesize(text: string, options?: TtsOptions): AsyncIterable<Buffer>;
|
|
85
|
+
}
|
|
86
|
+
//# sourceMappingURL=gemini-tts.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"gemini-tts.d.ts","sourceRoot":"","sources":["../../src/tts/gemini-tts.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAmDG;AAGH,OAAO,KAAK,EAAE,UAAU,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,eAAe,CAAC;AAEvE,MAAM,MAAM,cAAc,GACtB,MAAM,GAAG,MAAM,GAAG,OAAO,GAAG,QAAQ,GAAG,QAAQ,GAAG,MAAM,GAAG,MAAM,GAAG,QAAQ,GAC5E,UAAU,GAAG,QAAQ,GAAG,SAAS,GAAG,SAAS,GAAG,SAAS,GAAG,SAAS,GACrE,WAAW,GAAG,SAAS,GAAG,WAAW,GAAG,SAAS,GAAG,QAAQ,GAAG,SAAS,GACxE,WAAW,GAAG,aAAa,GAAG,YAAY,GAAG,WAAW,GAAG,YAAY,GACvE,SAAS,GAAG,SAAS,GAAG,SAAS,GAAG,cAAc,GAAG,eAAe,CAAC;AAEzE,MAAM,MAAM,cAAc,GAAG,sBAAsB,GAAG,oBAAoB,CAAC;AAE3E,MAAM,WAAW,uBAAuB;IACtC,2BAA2B;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,6DAA6D;IAC7D,KAAK,CAAC,EAAE,cAAc,CAAC;IACvB;;;;OAIG;IACH,KAAK,CAAC,EAAE,cAAc,CAAC;IACvB;;;OAGG;IACH,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB;;;;OAIG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,8DAA8D;AAC9D,eAAO,MAAM,iBAAiB,EAAE,aAAa,CAAC,SAAS,CA+B7C,CAAC;AAMX,qBAAa,gBAAiB,YAAW,UAAU;IACjD,OAAO,CAAC,QAAQ,CAAC,IAAI,CAA0E;gBAEnF,IAAI,EAAE,uBAAuB;IAUlC,UAAU,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,UAAU,GAAG,aAAa,CAAC,MAAM,CAAC;CAuD7E"}
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Google Gemini TTS Adapter
|
|
3
|
+
*
|
|
4
|
+
* Uses Google Cloud Text-to-Speech API with Gemini models for speech synthesis.
|
|
5
|
+
* Returns 16kHz slin16 PCM chunks for direct injection into DVGateway.
|
|
6
|
+
*
|
|
7
|
+
* The API returns base64-encoded MP3 audio which is decoded and converted
|
|
8
|
+
* to 16kHz 16-bit PCM via ffmpeg subprocess for DVGateway compatibility.
|
|
9
|
+
*
|
|
10
|
+
* Model Reference (2026-03):
|
|
11
|
+
* gemini-2.5-flash-tts — Low-latency, optimized for real-time voice (default)
|
|
12
|
+
* gemini-2.5-pro-tts — Highest quality, richer prosody and expressiveness
|
|
13
|
+
*
|
|
14
|
+
* Voice options (30 voices):
|
|
15
|
+
* Kore — Recommended female voice (natural, warm)
|
|
16
|
+
* Puck — Recommended male voice (clear, friendly)
|
|
17
|
+
* Aoede — Melodic, expressive female
|
|
18
|
+
* Charon — Deep, authoritative male
|
|
19
|
+
* Fenrir — Strong, commanding male
|
|
20
|
+
* Leda — Soft, gentle female
|
|
21
|
+
* Orus — Calm, measured male
|
|
22
|
+
* Zephyr — Light, airy, gender-neutral
|
|
23
|
+
* Achernar — Crisp, professional
|
|
24
|
+
* Achird — Warm, conversational
|
|
25
|
+
* Algenib — Bright, energetic
|
|
26
|
+
* Algieba — Smooth, refined
|
|
27
|
+
* Alnilam — Clear, precise
|
|
28
|
+
* Autonoe — Expressive, dynamic
|
|
29
|
+
* Callirhoe — Graceful, flowing
|
|
30
|
+
* Despina — Cheerful, lively
|
|
31
|
+
* Enceladus — Rich, resonant
|
|
32
|
+
* Erinome — Gentle, soothing
|
|
33
|
+
* Gacrux — Steady, reliable
|
|
34
|
+
* Iapetus — Bold, confident
|
|
35
|
+
* Laomedeia — Elegant, poised
|
|
36
|
+
* Pulcherrima — Beautiful, melodic
|
|
37
|
+
* Rasalgethi — Warm, inviting
|
|
38
|
+
* Sadachbia — Calm, reassuring
|
|
39
|
+
* Sadaltager — Neutral, versatile
|
|
40
|
+
* Schedar — Crisp, articulate
|
|
41
|
+
* Sulafar — Deep, thoughtful
|
|
42
|
+
* Umbriel — Soft, subtle
|
|
43
|
+
* Vindemiatrix — Bright, clear
|
|
44
|
+
* Zubenelgenubi — Unique, distinctive
|
|
45
|
+
*
|
|
46
|
+
* Supports 24+ languages including Korean (ko-KR).
|
|
47
|
+
* Natural language prompts can control style, tone, pace, and emotion
|
|
48
|
+
* via the `prompt` field in the input object.
|
|
49
|
+
*
|
|
50
|
+
* API Endpoint: POST https://texttospeech.googleapis.com/v1/text:synthesize?key={apiKey}
|
|
51
|
+
* Docs: https://cloud.google.com/text-to-speech/docs/reference/rest
|
|
52
|
+
*/
|
|
53
|
+
import { spawn } from 'node:child_process';
|
|
54
|
+
/** All available Gemini TTS voices with descriptive labels */
|
|
55
|
+
export const GEMINI_TTS_VOICES = [
|
|
56
|
+
{ id: 'Kore', label: 'Kore (recommended female, natural)' },
|
|
57
|
+
{ id: 'Puck', label: 'Puck (recommended male, clear)' },
|
|
58
|
+
{ id: 'Aoede', label: 'Aoede (melodic, expressive)' },
|
|
59
|
+
{ id: 'Charon', label: 'Charon (deep, authoritative)' },
|
|
60
|
+
{ id: 'Fenrir', label: 'Fenrir (strong, commanding)' },
|
|
61
|
+
{ id: 'Leda', label: 'Leda (soft, gentle)' },
|
|
62
|
+
{ id: 'Orus', label: 'Orus (calm, measured)' },
|
|
63
|
+
{ id: 'Zephyr', label: 'Zephyr (light, airy)' },
|
|
64
|
+
{ id: 'Achernar', label: 'Achernar (crisp, professional)' },
|
|
65
|
+
{ id: 'Achird', label: 'Achird (warm, conversational)' },
|
|
66
|
+
{ id: 'Algenib', label: 'Algenib (bright, energetic)' },
|
|
67
|
+
{ id: 'Algieba', label: 'Algieba (smooth, refined)' },
|
|
68
|
+
{ id: 'Alnilam', label: 'Alnilam (clear, precise)' },
|
|
69
|
+
{ id: 'Autonoe', label: 'Autonoe (expressive, dynamic)' },
|
|
70
|
+
{ id: 'Callirhoe', label: 'Callirhoe (graceful, flowing)' },
|
|
71
|
+
{ id: 'Despina', label: 'Despina (cheerful, lively)' },
|
|
72
|
+
{ id: 'Enceladus', label: 'Enceladus (rich, resonant)' },
|
|
73
|
+
{ id: 'Erinome', label: 'Erinome (gentle, soothing)' },
|
|
74
|
+
{ id: 'Gacrux', label: 'Gacrux (steady, reliable)' },
|
|
75
|
+
{ id: 'Iapetus', label: 'Iapetus (bold, confident)' },
|
|
76
|
+
{ id: 'Laomedeia', label: 'Laomedeia (elegant, poised)' },
|
|
77
|
+
{ id: 'Pulcherrima', label: 'Pulcherrima (beautiful, melodic)' },
|
|
78
|
+
{ id: 'Rasalgethi', label: 'Rasalgethi (warm, inviting)' },
|
|
79
|
+
{ id: 'Sadachbia', label: 'Sadachbia (calm, reassuring)' },
|
|
80
|
+
{ id: 'Sadaltager', label: 'Sadaltager (neutral, versatile)' },
|
|
81
|
+
{ id: 'Schedar', label: 'Schedar (crisp, articulate)' },
|
|
82
|
+
{ id: 'Sulafar', label: 'Sulafar (deep, thoughtful)' },
|
|
83
|
+
{ id: 'Umbriel', label: 'Umbriel (soft, subtle)' },
|
|
84
|
+
{ id: 'Vindemiatrix', label: 'Vindemiatrix (bright, clear)' },
|
|
85
|
+
{ id: 'Zubenelgenubi', label: 'Zubenelgenubi (unique, distinctive)' },
|
|
86
|
+
];
|
|
87
|
+
const API_BASE_URL = 'https://texttospeech.googleapis.com/v1/text:synthesize';
|
|
88
|
+
const DV_SAMPLE_RATE = 16000;
|
|
89
|
+
const PCM_CHUNK_BYTES = 640; // 20ms at 16kHz, 16-bit PCM (16000 * 2 * 0.02)
|
|
90
|
+
/**
 * Text-to-speech adapter for Google Cloud Text-to-Speech with Gemini models.
 *
 * Requests base64-encoded MP3 from the `text:synthesize` endpoint,
 * transcodes it with this module's `mp3ToPcm16k` helper, and yields the PCM
 * in frames of at most `PCM_CHUNK_BYTES` bytes.
 */
export class GeminiTtsAdapter {
  /** Resolved adapter options (constructor input merged with defaults). */
  opts;

  /**
   * @param {object} opts
   * @param {string} opts.apiKey - Google Cloud API key.
   * @param {string} [opts.voice='Kore'] - Default voice name.
   * @param {string} [opts.model='gemini-2.5-flash-tts'] - Model name.
   * @param {string} [opts.languageCode='ko-KR'] - Default BCP-47 language code.
   * @param {string} [opts.prompt] - Optional natural-language style prompt,
   *   sent as `input.prompt` when set.
   */
  constructor(opts) {
    this.opts = {
      apiKey: opts.apiKey,
      voice: opts.voice ?? 'Kore',
      model: opts.model ?? 'gemini-2.5-flash-tts',
      languageCode: opts.languageCode ?? 'ko-KR',
      prompt: opts.prompt,
    };
  }

  /**
   * Synthesize `text` and yield PCM frames.
   *
   * @param {string} text - Text to speak. Blank text yields nothing and
   *   performs no API request.
   * @param {object} [options] - Per-call overrides.
   * @param {string} [options.voiceId] - Voice to use instead of the default.
   * @param {string} [options.language] - Language code (e.g. "en-US") to use
   *   instead of the adapter's `languageCode`.
   * @returns {AsyncGenerator<Buffer>} PCM frames of at most `PCM_CHUNK_BYTES`.
   * @throws {Error} On a non-2xx response, a response without usable
   *   `audioContent`, or a transcoding failure.
   */
  async *synthesize(text, options) {
    // Nothing to speak: avoid a pointless API round-trip that can only fail.
    if (typeof text === 'string' && text.trim() === '') {
      return;
    }

    const voice = options?.voiceId ?? this.opts.voice;
    // Honour the per-call language the same way the CosyVoice adapter does;
    // an empty/absent value falls back to the configured default.
    const languageCode = options?.language || this.opts.languageCode;

    // Build request body for Google Cloud Text-to-Speech API
    const input = this.opts.prompt
      ? { text, prompt: this.opts.prompt }
      : { text };
    const requestBody = {
      input,
      voice: {
        languageCode,
        name: voice,
        modelName: this.opts.model,
      },
      audioConfig: {
        audioEncoding: 'MP3',
      },
    };

    // Let URLSearchParams encode the key: raw concatenation corrupts the
    // query string if the key contains reserved characters (&, =, #, space).
    const url = new URL(API_BASE_URL);
    url.searchParams.set('key', this.opts.apiKey);

    const response = await fetch(url.href, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(requestBody),
    });
    if (!response.ok) {
      const errorBody = await response.text();
      throw new Error(`Gemini TTS API error (${response.status}): ${errorBody}`);
    }

    const data = await response.json();
    if (!data?.audioContent) {
      throw new Error('Gemini TTS API returned empty audioContent');
    }
    // Buffer.from(<object>, 'base64') fails with an opaque TypeError, so
    // report an unexpected payload shape explicitly instead.
    if (typeof data.audioContent !== 'string') {
      throw new Error(
        `Gemini TTS API returned audioContent in an unexpected format (${typeof data.audioContent}); expected a base64 string`,
      );
    }

    // Decode base64 MP3 audio
    const mp3Buffer = Buffer.from(data.audioContent, 'base64');

    // Convert MP3 to 16kHz 16-bit signed little-endian PCM via ffmpeg
    const pcmBuffer = await mp3ToPcm16k(mp3Buffer);

    // Yield PCM chunks; the final frame may be shorter than PCM_CHUNK_BYTES.
    for (let offset = 0; offset < pcmBuffer.length; offset += PCM_CHUNK_BYTES) {
      yield pcmBuffer.subarray(offset, Math.min(offset + PCM_CHUNK_BYTES, pcmBuffer.length));
    }
  }
}
|
|
146
|
+
/**
 * Convert an MP3 audio buffer to mono 16-bit signed little-endian PCM at
 * `DV_SAMPLE_RATE` by piping it through an `ffmpeg` subprocess
 * (stdin → stdout).
 *
 * @param {Buffer} mp3Data - Encoded MP3 bytes; must be non-empty.
 * @returns {Promise<Buffer>} All PCM bytes ffmpeg wrote to stdout.
 *   Rejects with an `Error` when the input is empty, ffmpeg cannot be
 *   spawned, or ffmpeg exits unsuccessfully (the message then includes
 *   ffmpeg's stderr output).
 */
function mp3ToPcm16k(mp3Data) {
  return new Promise((resolve, reject) => {
    // ffmpeg can only fail on zero bytes of input; say so directly instead
    // of paying for a subprocess and surfacing its less helpful message.
    if (!mp3Data || mp3Data.length === 0) {
      reject(new Error('ffmpeg MP3→PCM conversion failed: input is empty'));
      return;
    }

    const ffmpeg = spawn('ffmpeg', [
      '-i', 'pipe:0', // Read from stdin
      '-f', 's16le', // Output format: signed 16-bit little-endian
      '-ar', String(DV_SAMPLE_RATE), // Sample rate: 16kHz
      '-ac', '1', // Mono channel
      '-acodec', 'pcm_s16le', // PCM codec
      // Errors only: otherwise the banner and stream info ffmpeg prints on
      // stderr would be embedded in every rejection message below.
      '-loglevel', 'error',
      'pipe:1', // Write to stdout
    ], {
      stdio: ['pipe', 'pipe', 'pipe'],
    });

    const chunks = [];
    let stderrOutput = '';

    ffmpeg.stdout.on('data', (chunk) => {
      chunks.push(chunk);
    });

    ffmpeg.stderr.on('data', (data) => {
      stderrOutput += data.toString();
    });

    ffmpeg.on('close', (code, signal) => {
      if (code !== 0) {
        // `code` is null when the process was terminated by a signal.
        const status = code === null ? `signal ${signal}` : `exit code ${code}`;
        reject(new Error(`ffmpeg MP3→PCM conversion failed (${status}): ${stderrOutput}`));
        return;
      }
      resolve(Buffer.concat(chunks));
    });

    ffmpeg.on('error', (err) => {
      reject(new Error(`Failed to spawn ffmpeg: ${err.message}`));
    });

    // If ffmpeg exits before draining its stdin (bad input, spawn failure),
    // the write fails with EPIPE. Without a listener that stream error is
    // unhandled and can take down the whole process; the actual failure is
    // already reported through the 'close' / 'error' handlers above.
    ffmpeg.stdin.on('error', () => {});

    // Write MP3 data to ffmpeg stdin and close
    ffmpeg.stdin.end(mp3Data);
  });
}
|
|
186
|
+
//# sourceMappingURL=gemini-tts.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"gemini-tts.js","sourceRoot":"","sources":["../../src/tts/gemini-tts.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAmDG;AAEH,OAAO,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAoC3C,8DAA8D;AAC9D,MAAM,CAAC,MAAM,iBAAiB,GAA6B;IACzD,EAAE,EAAE,EAAE,MAAM,EAAW,KAAK,EAAE,oCAAoC,EAAE;IACpE,EAAE,EAAE,EAAE,MAAM,EAAW,KAAK,EAAE,gCAAgC,EAAE;IAChE,EAAE,EAAE,EAAE,OAAO,EAAU,KAAK,EAAE,6BAA6B,EAAE;IAC7D,EAAE,EAAE,EAAE,QAAQ,EAAS,KAAK,EAAE,8BAA8B,EAAE;IAC9D,EAAE,EAAE,EAAE,QAAQ,EAAS,KAAK,EAAE,6BAA6B,EAAE;IAC7D,EAAE,EAAE,EAAE,MAAM,EAAW,KAAK,EAAE,qBAAqB,EAAE;IACrD,EAAE,EAAE,EAAE,MAAM,EAAW,KAAK,EAAE,uBAAuB,EAAE;IACvD,EAAE,EAAE,EAAE,QAAQ,EAAS,KAAK,EAAE,sBAAsB,EAAE;IACtD,EAAE,EAAE,EAAE,UAAU,EAAO,KAAK,EAAE,gCAAgC,EAAE;IAChE,EAAE,EAAE,EAAE,QAAQ,EAAS,KAAK,EAAE,+BAA+B,EAAE;IAC/D,EAAE,EAAE,EAAE,SAAS,EAAQ,KAAK,EAAE,6BAA6B,EAAE;IAC7D,EAAE,EAAE,EAAE,SAAS,EAAQ,KAAK,EAAE,2BAA2B,EAAE;IAC3D,EAAE,EAAE,EAAE,SAAS,EAAQ,KAAK,EAAE,0BAA0B,EAAE;IAC1D,EAAE,EAAE,EAAE,SAAS,EAAQ,KAAK,EAAE,+BAA+B,EAAE;IAC/D,EAAE,EAAE,EAAE,WAAW,EAAM,KAAK,EAAE,+BAA+B,EAAE;IAC/D,EAAE,EAAE,EAAE,SAAS,EAAQ,KAAK,EAAE,4BAA4B,EAAE;IAC5D,EAAE,EAAE,EAAE,WAAW,EAAM,KAAK,EAAE,4BAA4B,EAAE;IAC5D,EAAE,EAAE,EAAE,SAAS,EAAQ,KAAK,EAAE,4BAA4B,EAAE;IAC5D,EAAE,EAAE,EAAE,QAAQ,EAAS,KAAK,EAAE,2BAA2B,EAAE;IAC3D,EAAE,EAAE,EAAE,SAAS,EAAQ,KAAK,EAAE,2BAA2B,EAAE;IAC3D,EAAE,EAAE,EAAE,WAAW,EAAM,KAAK,EAAE,6BAA6B,EAAE;IAC7D,EAAE,EAAE,EAAE,aAAa,EAAI,KAAK,EAAE,kCAAkC,EAAE;IAClE,EAAE,EAAE,EAAE,YAAY,EAAK,KAAK,EAAE,6BAA6B,EAAE;IAC7D,EAAE,EAAE,EAAE,WAAW,EAAM,KAAK,EAAE,8BAA8B,EAAE;IAC9D,EAAE,EAAE,EAAE,YAAY,EAAK,KAAK,EAAE,iCAAiC,EAAE;IACjE,EAAE,EAAE,EAAE,SAAS,EAAQ,KAAK,EAAE,6BAA6B,EAAE;IAC7D,EAAE,EAAE,EAAE,SAAS,EAAQ,KAAK,EAAE,4BAA4B,EAAE;IAC5D,EAAE,EAAE,EAAE,SAAS,EAAQ,KAAK,EAAE,wBAAwB,EAAE;IACxD,EAAE,EAAE,EAAE,cAAc,EAAG,KAAK,EAAE,8BAA8B,EAAE;IAC9D,EAAE,EAAE,EAAE,eAAe,EAAE,KAAK,EAAE,qCAAqC,EAAE;CAC7D,CAAC;AAEX,MAAM,YAAY,GAAG,wDAAwD,CAAC;AAC9E,MAAM,cAAc,GAAG,KAAK,CAAC;AAC7B,MAAM,eAAe,GAAG,GAAG,CAAC,CAAC,+CAA+C;AAE5E,MAAM,OAAO,gBAAgB;IA
CV,IAAI,CAA0E;IAE/F,YAAY,IAA6B;QACvC,IAAI,CAAC,IAAI,GAAG;YACV,MAAM,EAAQ,IAAI,CAAC,MAAM;YACzB,KAAK,EAAS,IAAI,CAAC,KAAK,IAAW,MAAM;YACzC,KAAK,EAAS,IAAI,CAAC,KAAK,IAAW,sBAAsB;YACzD,YAAY,EAAE,IAAI,CAAC,YAAY,IAAI,OAAO;YAC1C,MAAM,EAAQ,IAAI,CAAC,MAAM;SAC1B,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,CAAC,UAAU,CAAC,IAAY,EAAE,OAAoB;QAClD,MAAM,KAAK,GAAI,OAAO,EAAE,OAAsC,IAAI,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC;QAElF,yDAAyD;QACzD,MAAM,KAAK,GAA2B,EAAE,IAAI,EAAE,CAAC;QAC/C,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;YACrB,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC;QAClC,CAAC;QAED,MAAM,WAAW,GAAG;YAClB,KAAK;YACL,KAAK,EAAE;gBACL,YAAY,EAAE,IAAI,CAAC,IAAI,CAAC,YAAY;gBACpC,IAAI,EAAU,KAAK;gBACnB,SAAS,EAAK,IAAI,CAAC,IAAI,CAAC,KAAK;aAC9B;YACD,WAAW,EAAE;gBACX,aAAa,EAAE,KAAK;aACrB;SACF,CAAC;QAEF,MAAM,GAAG,GAAG,GAAG,YAAY,QAAQ,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;QAEtD,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAChC,MAAM,EAAG,MAAM;YACf,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;YAC/C,IAAI,EAAK,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC;SACrC,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,SAAS,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACxC,MAAM,IAAI,KAAK,CACb,yBAAyB,QAAQ,CAAC,MAAM,MAAM,SAAS,EAAE,CAC1D,CAAC;QACJ,CAAC;QAED,MAAM,IAAI,GAAG,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAA6B,CAAC;QACjE,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,CAAC;YACvB,MAAM,IAAI,KAAK,CAAC,4CAA4C,CAAC,CAAC;QAChE,CAAC;QAED,0BAA0B;QAC1B,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,QAAQ,CAAC,CAAC;QAE3D,kEAAkE;QAClE,MAAM,SAAS,GAAG,MAAM,WAAW,CAAC,SAAS,CAAC,CAAC;QAE/C,iCAAiC;QACjC,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,OAAO,MAAM,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC;YACjC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,eAAe,EAAE,SAAS,CAAC,MAAM,CAAC,CAAC;YACjE,MAAM,SAAS,CAAC,QAAQ,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;YACtC,MAAM,GAAG,GAAG,CAAC;QACf,CAAC;IACH,CAAC;CACF;AAED;;;;GAIG;AACH,SAAS,WAAW,CAAC,OAAe;IAClC,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACrC,MAAM,MAAM,GAAG,KAAK,CAAC,QAAQ,EAAE;YAC7B,IAAI,EAAE,QAAQ,EAAY,kBAAkB;YAC5C,IAAI,EAAE,OAAO,EAAa,6CAA6C;YACvE,KAAK,EAAE,MAAM,CAAC,cAAc,CAA
C,EAAE,qBAAqB;YACpD,KAAK,EAAE,GAAG,EAAgB,eAAe;YACzC,SAAS,EAAE,WAAW,EAAI,YAAY;YACtC,QAAQ,EAAkB,kBAAkB;SAC7C,EAAE;YACD,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;SAChC,CAAC,CAAC;QAEH,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,YAAY,GAAG,EAAE,CAAC;QAEtB,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,KAAa,EAAE,EAAE;YACzC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACrB,CAAC,CAAC,CAAC;QAEH,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,IAAY,EAAE,EAAE;YACxC,YAAY,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;YAC1B,IAAI,IAAI,KAAK,CAAC,EAAE,CAAC;gBACf,MAAM,CAAC,IAAI,KAAK,CACd,+CAA+C,IAAI,MAAM,YAAY,EAAE,CACxE,CAAC,CAAC;gBACH,OAAO;YACT,CAAC;YACD,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;QACjC,CAAC,CAAC,CAAC;QAEH,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,GAAG,EAAE,EAAE;YACzB,MAAM,CAAC,IAAI,KAAK,CAAC,2BAA2B,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QAC9D,CAAC,CAAC,CAAC;QAEH,2CAA2C;QAC3C,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAC5B,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC;IACrB,CAAC,CAAC,CAAC;AACL,CAAC"}
|
package/dist/tts/index.d.ts
CHANGED
|
@@ -2,6 +2,10 @@ export { ElevenLabsAdapter, ELEVENLABS_KOREAN_VOICES } from './elevenlabs.js';
|
|
|
2
2
|
export type { ElevenLabsAdapterOptions } from './elevenlabs.js';
|
|
3
3
|
export { OpenAITtsAdapter } from './openai-tts.js';
|
|
4
4
|
export type { OpenAITtsAdapterOptions, OpenAITtsVoice, OpenAITtsModel } from './openai-tts.js';
|
|
5
|
+
export { GeminiTtsAdapter, GEMINI_TTS_VOICES } from './gemini-tts.js';
|
|
6
|
+
export type { GeminiTtsAdapterOptions, GeminiTtsVoice, GeminiTtsModel } from './gemini-tts.js';
|
|
5
7
|
export { CachedTtsAdapter } from './cached-tts.js';
|
|
6
8
|
export type { CachedTtsAdapterOptions, WarmupEntry } from './cached-tts.js';
|
|
9
|
+
export { CosyVoiceAdapter, COSYVOICE_VOICES } from './cosyvoice-tts.js';
|
|
10
|
+
export type { CosyVoiceAdapterOptions, CosyVoiceVoice, CosyVoiceModel } from './cosyvoice-tts.js';
|
|
7
11
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/tts/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/tts/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,wBAAwB,EAAE,MAAM,iBAAiB,CAAC;AAC9E,YAAY,EAAE,wBAAwB,EAAE,MAAM,iBAAiB,CAAC;AAEhE,OAAO,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAC;AACnD,YAAY,EAAE,uBAAuB,EAAE,cAAc,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAE/F,OAAO,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAC;AACnD,YAAY,EAAE,uBAAuB,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/tts/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,wBAAwB,EAAE,MAAM,iBAAiB,CAAC;AAC9E,YAAY,EAAE,wBAAwB,EAAE,MAAM,iBAAiB,CAAC;AAEhE,OAAO,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAC;AACnD,YAAY,EAAE,uBAAuB,EAAE,cAAc,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAE/F,OAAO,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AACtE,YAAY,EAAE,uBAAuB,EAAE,cAAc,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAE/F,OAAO,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAC;AACnD,YAAY,EAAE,uBAAuB,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAE5E,OAAO,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AACxE,YAAY,EAAE,uBAAuB,EAAE,cAAc,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAC"}
|
package/dist/tts/index.js
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
// TTS (Text-to-Speech) adapters
|
|
2
2
|
export { ElevenLabsAdapter, ELEVENLABS_KOREAN_VOICES } from './elevenlabs.js';
|
|
3
3
|
export { OpenAITtsAdapter } from './openai-tts.js';
|
|
4
|
+
export { GeminiTtsAdapter, GEMINI_TTS_VOICES } from './gemini-tts.js';
|
|
4
5
|
export { CachedTtsAdapter } from './cached-tts.js';
|
|
6
|
+
export { CosyVoiceAdapter, COSYVOICE_VOICES } from './cosyvoice-tts.js';
|
|
5
7
|
//# sourceMappingURL=index.js.map
|
package/dist/tts/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/tts/index.ts"],"names":[],"mappings":"AAAA,gCAAgC;AAChC,OAAO,EAAE,iBAAiB,EAAE,wBAAwB,EAAE,MAAM,iBAAiB,CAAC;AAG9E,OAAO,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAC;AAGnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/tts/index.ts"],"names":[],"mappings":"AAAA,gCAAgC;AAChC,OAAO,EAAE,iBAAiB,EAAE,wBAAwB,EAAE,MAAM,iBAAiB,CAAC;AAG9E,OAAO,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAC;AAGnD,OAAO,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AAGtE,OAAO,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAC;AAGnD,OAAO,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC"}
|
package/package.json
CHANGED