unspeech 0.1.6 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,82 @@
1
+ # unSpeech TypeScript Client
2
+
3
+ > Your Text-to-Speech Services, All-in-One.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ npm i unspeech
9
+ ```
10
+
11
+ ## Getting Started
12
+
13
+ ### List voices
14
+
15
+ Besides the `/audio/speech` endpoint, we also support listing all available voices from providers:
16
+
17
+ ```ts
18
+ import { createUnSpeech, listVoices } from 'unspeech'
19
+
20
+ const unspeech = createUnSpeech('YOUR_EXTERNAL_PROVIDER_API_KEY', 'http://localhost:5933/v1/')
21
+
22
+ const voices = await listVoices(
23
+ unspeech.voice({ backend: 'elevenlabs' })
24
+ )
25
+ ```
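+ 
+ Some backends take extra parameters when listing voices. For example, the Microsoft / Azure backend also needs a `region`; a minimal sketch (the region value is just an example):
+ 
+ ```ts
+ import { createUnSpeech, listVoices } from 'unspeech'
+ 
+ const unspeech = createUnSpeech('YOUR_EXTERNAL_PROVIDER_API_KEY', 'http://localhost:5933/v1/')
+ 
+ // `region` is required for the 'microsoft' / 'azure' backends
+ const voices = await listVoices(
+   unspeech.voice({ backend: 'microsoft', region: 'eastus' })
+ )
+ ```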
26
+
27
+ ### Speech synthesis
28
+
29
+ For general-purpose `/audio/speech` requests, you can use `@xsai/generate-speech` (or xsAI), since unSpeech exposes a compatible API:
30
+
31
+ ```bash
32
+ npm i @xsai/generate-speech
33
+ ```
34
+
35
+ ```ts
36
+ import { generateSpeech } from '@xsai/generate-speech'
37
+ import { createUnSpeech } from 'unspeech'
38
+
39
+ const unspeech = createUnSpeech('YOUR_EXTERNAL_PROVIDER_API_KEY', 'http://localhost:5933/v1/')
40
+ const speech = await generateSpeech({
41
+ ...unspeech.speech('elevenlabs/eleven_multilingual_v2'),
42
+ input: 'Hello, World!',
43
+ voice: '9BWtsMINqrJLrRacOk9x',
44
+ })
45
+ ```
46
+
47
+ For the other providers, you can import the corresponding helpers as needed:
48
+
49
+ ```ts
50
+ import {
51
+ createUnAlibabaCloud,
52
+ createUnElevenLabs,
53
+ createUnMicrosoft,
54
+ createUnSpeech,
55
+ createUnVolcengine,
56
+ } from 'unspeech'
57
+ ```
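+ 
+ These helpers plug into `@xsai/generate-speech` the same way and accept provider-specific options. A minimal sketch with `createUnElevenLabs`; the voice settings shown are simply the defaults the client falls back to when none are given:
+ 
+ ```ts
+ import { generateSpeech } from '@xsai/generate-speech'
+ import { createUnElevenLabs } from 'unspeech'
+ 
+ const elevenlabs = createUnElevenLabs('YOUR_ELEVENLABS_API_KEY', 'http://localhost:5933/v1/')
+ 
+ const speech = await generateSpeech({
+   ...elevenlabs.speech('eleven_multilingual_v2', {
+     // per-request voice settings; similarityBoost and stability are required when voiceSettings is set
+     voiceSettings: { similarityBoost: 0.75, stability: 0.5 },
+   }),
+   input: 'Hello, World!',
+   voice: '9BWtsMINqrJLrRacOk9x',
+ })
+ ```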
58
+
59
+ When using
60
+
61
+ - [Microsoft / Azure AI Speech service](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/text-to-speech)
62
+ - [Alibaba Cloud Model Studio / 阿里云百炼 / CosyVoice](https://www.alibabacloud.com/en/product/modelstudio)
63
+ - [Volcano Engine / 火山引擎语音技术](https://www.volcengine.com/product/voice-tech)
64
+ - [ElevenLabs](https://elevenlabs.io/docs/api-reference/text-to-speech/convert)
65
+
66
+ providers, [SSML](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup) is supported for fine-grained control over pitch, volume, rate, and more (see the sketch below).
67
+
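+ For example, with the Microsoft / Azure backend you can either let unSpeech build the SSML from `lang`, `gender`, and `voice`, or set `disableSsml: true` and pass your own SSML as the input. A minimal sketch, assuming an Azure Speech key and the `eastus` region (both placeholders):
+ 
+ ```ts
+ import { generateSpeech } from '@xsai/generate-speech'
+ import { createUnMicrosoft } from 'unspeech'
+ 
+ const microsoft = createUnMicrosoft('YOUR_AZURE_SPEECH_KEY', 'http://localhost:5933/v1/')
+ 
+ const speech = await generateSpeech({
+   ...microsoft.speech('v1', { // the helper prefixes this to 'microsoft/v1' on the wire
+     region: 'eastus',
+     disableSsml: true, // treat the input below as hand-written SSML instead of converting plain text
+   }),
+   input: '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">'
+     + '<voice name="en-US-AriaNeural"><prosody rate="-10%" pitch="+5%">Hello, World!</prosody></voice>'
+     + '</speak>',
+   voice: 'en-US-AriaNeural',
+ })
+ ```
+ 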
68
+ ## Related Projects
69
+
70
+ Looking for something like unSpeech, but for local TTS? Check these out:
71
+
72
+ - [erew123/alltalk_tts/alltalkbeta](https://github.com/erew123/alltalk_tts/tree/alltalkbeta)
73
+ - [astramind-ai/Auralis](https://github.com/astramind-ai/Auralis)
74
+ - [matatonic/openedai-speech](https://github.com/matatonic/openedai-speech)
75
+
76
+ Or to use free Edge TTS:
77
+
78
+ - [travisvn/openai-edge-tts](https://github.com/travisvn/openai-edge-tts)
79
+
80
+ ## License
81
+
82
+ [AGPL-3.0](./LICENSE)
@@ -0,0 +1,395 @@
1
+ import * as _xsai_ext_providers_utils0 from "@xsai-ext/providers/utils";
2
+ import { SpeechProviderWithExtraOptions } from "@xsai-ext/providers/utils";
3
+ import { CommonRequestOptions } from "@xsai/shared";
4
+
5
+ //#region src/types/voice.d.ts
6
+ interface Voice {
7
+ compatible_models: string[];
8
+ description: string;
9
+ formats: VoiceFormat[];
10
+ id: string;
11
+ labels: Record<string, any> & {
12
+ accent?: string;
13
+ age?: string;
14
+ gender?: string;
15
+ type?: string;
16
+ };
17
+ languages: VoiceLanguage[];
18
+ name: string;
19
+ predefined_options?: Record<string, any>;
20
+ preview_audio_url?: string;
21
+ tags: string[];
22
+ }
23
+ interface VoiceFormat {
24
+ bitrate: number;
25
+ extension: string;
26
+ format_code: string;
27
+ mime_type: string;
28
+ name: string;
29
+ sample_rate: number;
30
+ }
31
+ interface VoiceLanguage {
32
+ code: string;
33
+ title: string;
34
+ }
35
+ interface VoiceProvider {
36
+ voice: () => Omit<CommonRequestOptions, 'model'> & {
37
+ query?: string;
38
+ };
39
+ }
40
+ interface VoiceProviderWithExtraOptions<T = undefined> {
41
+ voice: (options?: T) => Omit<CommonRequestOptions, 'model'> & Partial<T> & {
42
+ query?: string;
43
+ };
44
+ }
45
+ //#endregion
46
+ //#region src/types/index.d.ts
47
+ interface UnSpeechOptions {
48
+ /** @experimental */
49
+ extraBody?: Record<string, unknown>;
50
+ }
51
+ //#endregion
52
+ //#region src/backend/microsoft.d.ts
53
+ type MicrosoftRegions = 'australiaeast' | 'brazilsouth' | 'canadacentral' | 'centralindia' | 'centralus' | 'eastasia' | 'eastus2' | 'eastus' | 'francecentral' | 'germanywestcentral' | 'japaneast' | 'japanwest' | 'jioindiawest' | 'koreacentral' | 'northcentralus' | 'northeurope' | 'norwayeast' | 'southcentralus' | 'southeastasia' | 'swedencentral' | 'switzerlandnorth' | 'switzerlandwest' | 'uaenorth' | 'uksouth' | 'usgovarizona' | 'usgovvirginia' | 'westcentralus' | 'westeurope' | 'westus2' | 'westus3' | 'westus';
54
+ interface UnMicrosoftOptionAutoSSML {
55
+ gender: 'Female' | 'Male' | 'Neutral' | string;
56
+ lang: 'en-US' | string;
57
+ /**
58
+ * Speech Studio - Voice Gallery
59
+ * https://speech.microsoft.com/portal/018ba84135d64cf79106cc99c75ffa6a/voicegallery
60
+ */
61
+ voice: 'en-US-AndrewMultilingualNeural' | 'en-US-AriaNeural' | 'en-US-AvaMultilingualNeural' | 'en-US-BrianMultilingualNeural' | 'en-US-ChristopherMultilingualNeural' | 'en-US-EmmaMultilingualNeural' | 'en-US-JaneNeural' | string;
62
+ }
63
+ interface UnMicrosoftOptionCommon {
64
+ /**
65
+ * Text to speech API reference (REST) - Speech service - Azure AI services | Microsoft Learn
66
+ * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#custom-neural-voices
67
+ */
68
+ deploymentId?: string;
69
+ /**
70
+ * Text to speech API reference (REST) - Speech service - Azure AI services | Microsoft Learn
71
+ * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#prebuilt-neural-voices
72
+ *
73
+ * NOTICE: Voices in preview are available in only these three regions: East US, West Europe, and Southeast Asia.
74
+ */
75
+ region: MicrosoftRegions | string;
76
+ sampleRate?: 8000 | 16000 | 22050 | 24000 | 44100 | 48000 | number;
77
+ }
78
+ interface UnMicrosoftOptionCustomSSML {
79
+ /**
80
+ * By default, the unSpeech service automatically converts OpenAI-style plain-text input
81
+ * into SSML using the lang, gender, and voice parameters. If you want to provide your own SSML
82
+ * with fully customizable parameters, set this option to `true` to disable the automatic
83
+ * conversion and use your own SSML instead.
84
+ *
85
+ * About SSML (Speech Synthesis Markup Language), @see {@link https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup}
86
+ */
87
+ disableSsml?: boolean;
88
+ }
89
+ /** @see {@link https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech} */
90
+ type UnMicrosoftOptions = (UnMicrosoftOptionAutoSSML | UnMicrosoftOptionCustomSSML) & UnMicrosoftOptionCommon;
91
+ /**
92
+ * [Microsoft / Azure AI](https://speech.microsoft.com/portal) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
93
+ * only.
94
+ *
95
+ * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
96
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
97
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
98
+ *
99
+ * @param apiKey - Microsoft / Azure AI subscription key
100
+ * @param baseURL - UnSpeech Instance URL
101
+ * @returns SpeechProviderWithExtraOptions
102
+ */
103
+ declare function createUnMicrosoft(apiKey: string, baseURL?: string): SpeechProviderWithExtraOptions<"microsoft/v1", UnMicrosoftOptions> & VoiceProviderWithExtraOptions<UnMicrosoftOptions>;
104
+ //#endregion
105
+ //#region src/backend/alibabacloud.d.ts
106
+ interface UnAlibabaCloudOptions {
107
+ /**
108
+ * Speech pitch. Range: 0.5 to 2.0.
109
+ * @default 1.0
110
+ */
111
+ pitch?: number;
112
+ /**
113
+ * Speech rate. Range: 0.5 to 2.0.
114
+ * @default 1.0
115
+ */
116
+ rate?: number;
117
+ /**
118
+ * Sampling rate of the synthesized audio.
119
+ * @default 22050
120
+ */
121
+ sampleRate?: 8000 | 16000 | 22050 | 24000 | 44100 | 48000 | number;
122
+ /**
123
+ * Volume of the synthesized audio. Range: 0 to 100.
124
+ * @default 50
125
+ */
126
+ volume?: number;
127
+ }
128
+ /**
129
+ * [Alibaba Cloud / 阿里云 通义听悟](https://tingwu.aliyun.com/) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
130
+ * only.
131
+ *
132
+ * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
133
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
134
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
135
+ *
136
+ * @param apiKey - Alibaba Cloud AccessKey Token (see https://help.aliyun.com/document_detail/72153.html)
137
+ * @param baseURL - UnSpeech Instance URL
138
+ * @returns SpeechProviderWithExtraOptions & VoiceProviderWithExtraOptions
139
+ */
140
+ declare function createUnAlibabaCloud(apiKey: string, baseURL?: string): SpeechProviderWithExtraOptions<"alibaba/v1", UnAlibabaCloudOptions> & VoiceProviderWithExtraOptions<UnAlibabaCloudOptions>;
141
+ //#endregion
142
+ //#region src/backend/deepgram.d.ts
143
+ /** @see {@link https://developers.deepgram.com/docs/text-to-speech} */
144
+ interface UnDeepgramOptions {}
145
+ /**
146
+ * [Deepgram](https://deepgram.com/) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
147
+ * only.
148
+ *
149
+ * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
150
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
151
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
152
+ *
153
+ * @param apiKey - Deepgram API Key
154
+ * @param baseURL - UnSpeech Instance URL
155
+ * @returns SpeechProviderWithExtraOptions
156
+ */
157
+ declare function createUnDeepgram(apiKey: string, baseURL?: string): SpeechProviderWithExtraOptions<string, UnDeepgramOptions> & VoiceProviderWithExtraOptions<UnDeepgramOptions>;
158
+ //#endregion
159
+ //#region src/backend/elevenlabs.d.ts
160
+ /** @see {@link https://elevenlabs.io/docs/api-reference/text-to-speech/convert#request} */
161
+ interface UnElevenLabsOptions {
162
+ /**
163
+ * This parameter controls text normalization with three modes: 'auto', 'on', and 'off'. When set to 'auto',
164
+ * the system will automatically decide whether to apply text normalization (e.g., spelling out numbers).
165
+ * With 'on', text normalization will always be applied, while with 'off', it will be skipped. Cannot be
166
+ * turned on for 'eleven_turbo_v2_5' model.
167
+ */
168
+ applyTextNormalization?: 'auto' | 'off' | 'on';
169
+ /**
170
+ * Language code (ISO 639-1) used to enforce a language for the model. Currently only Turbo v2.5
171
+ * supports language enforcement. For other models, an error will be returned if language code is provided.
172
+ */
173
+ languageCode?: string;
174
+ /**
175
+ * A list of request_id of the samples that were generated before this generation. Can
176
+ * be used to improve the flow of prosody when splitting up a large task into multiple
177
+ * requests. The results will be best when the same model is used across the generations.
178
+ *
179
+ * In case both next_text and next_request_ids are sent, next_text will be ignored.
180
+ * A maximum of 3 request_ids can be sent.
181
+ */
182
+ nextRequestIds?: string[];
183
+ /**
184
+ * The text that comes after the text of the current request. Can be used to improve
185
+ * the flow of prosody when concatenating together multiple generations or to influence
186
+ * the prosody in the current generation.
187
+ */
188
+ nextText?: string;
189
+ /**
190
+ * A list of request_id of the samples that were generated before this generation. Can be
191
+ * used to improve the flow of prosody when splitting up a large task into multiple requests.
192
+ * The results will be best when the same model is used across the generations. In case both
193
+ * previous_text and previous_request_ids are sent, previous_text will be ignored. A maximum
194
+ * of 3 request_ids can be sent.
195
+ */
196
+ previousRequestIds?: string[];
197
+ /**
198
+ * The text that came before the text of the current request. Can be used to improve the
199
+ * flow of prosody when concatenating together multiple generations or to influence the
200
+ * prosody in the current generation.
201
+ */
202
+ previousText?: string;
203
+ /**
204
+ * A list of pronunciation dictionary locators (id, version_id) to be applied to the text.
205
+ * They will be applied in order. You may have up to 3 locators per request
206
+ */
207
+ pronunciationDictionaryLocators?: {
208
+ pronunciationDictionaryId: string;
209
+ versionId: string;
210
+ }[];
211
+ /**
212
+ * If specified, our system will make a best effort to sample deterministically, such that
213
+ * repeated requests with the same seed and parameters should return the same result.
214
+ * Determinism is not guaranteed. Must be an integer between 0 and 4294967295.
215
+ */
216
+ seed?: number;
217
+ /**
218
+ * Voice settings overriding stored settings for the given voice. They are applied only on the given request.
219
+ */
220
+ voiceSettings?: {
221
+ /**
222
+ * Determines how closely the AI should adhere to the original voice when attempting to replicate it.
223
+ */
224
+ similarityBoost: number;
225
+ /**
226
+ * Controls the speed of the generated speech. Values range from 0.7 to 1.2, with 1.0 being the default
227
+ * speed. Lower values create slower, more deliberate speech while higher values produce faster-paced
228
+ * speech. Extreme values can impact the quality of the generated speech.
229
+ *
230
+ * @default 1.0
231
+ */
232
+ speed?: number;
233
+ /**
234
+ * Determines how stable the voice is and the randomness between each generation. Lower values introduce
235
+ * broader emotional range for the voice. Higher values can result in a monotonous voice with limited
236
+ * emotion.
237
+ */
238
+ stability: number;
239
+ /**
240
+ * Determines the style exaggeration of the voice. This setting attempts to amplify the style of the original
241
+ * speaker. It does consume additional computational resources and might increase latency if set to anything
242
+ * other than 0.
243
+ *
244
+ * @default 0
245
+ */
246
+ style?: number;
247
+ /**
248
+ * This setting boosts the similarity to the original speaker. Using this setting requires a slightly higher
249
+ * computational load, which in turn increases latency.
250
+ *
251
+ * @default true
252
+ */
253
+ useSpeakerBoost?: boolean;
254
+ };
255
+ }
256
+ /**
257
+ * [ElevenLabs](https://elevenlabs.io/) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
258
+ * only.
259
+ *
260
+ * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
261
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
262
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
263
+ *
264
+ * @param apiKey - ElevenLabs API Key
265
+ * @param baseURL - UnSpeech Instance URL
266
+ * @returns SpeechProviderWithExtraOptions
267
+ */
268
+ declare function createUnElevenLabs(apiKey: string, baseURL?: string): SpeechProviderWithExtraOptions<"eleven_english_sts_v2" | "eleven_flash_v2" | "eleven_flash_v2_5" | "eleven_multilingual_sts_v2" | "eleven_multilingual_v2", UnElevenLabsOptions> & VoiceProviderWithExtraOptions<UnElevenLabsOptions>;
269
+ //#endregion
270
+ //#region src/backend/volcengine.d.ts
271
+ interface UnVolcengineOptions {
272
+ app?: {
273
+ appId?: string;
274
+ cluster?: 'volcano_tts' | string;
275
+ };
276
+ audio?: {
277
+ /**
278
+ * @default 160
279
+ */
280
+ bitRate?: 160 | number;
281
+ /**
282
+ * Languages that are contextual to the model
283
+ */
284
+ contextLanguage?: 'es' | 'id' | 'pt' | string;
285
+ emotion?: 'angry' | string;
286
+ /**
287
+ * After setting the emotion parameter, you can use emotion_scale to
288
+ * further adjust the emotion intensity; the range is 1~5, and the default is 4 when
289
+ * not set.
290
+ *
291
+ * Note: in theory, the larger the value, the more pronounced the emotion.
292
+ * In practice the 1~5 scale grows non-linearly; beyond a certain point the
293
+ * increase is barely noticeable, so values such as 3 and 5 may end up
294
+ * sounding very similar.
295
+ *
296
+ * 1~5
297
+ *
298
+ * @default 4
299
+ */
300
+ emotionScale?: number;
301
+ enableEmotion?: boolean;
302
+ /**
303
+ * @default 'mp3'
304
+ */
305
+ encoding?: 'mp3' | 'ogg_opus' | 'pcm' | 'wav';
306
+ /**
307
+ * - undefined: General mixed bilingual
308
+ * - crosslingual: mix with zh/en/ja/es-ms/id/pt-br
309
+ * - zh: primarily Chinese, supports mixed Chinese and English
310
+ * - en: only English
311
+ * - ja: only Japanese
312
+ * - es-mx: only Mexican Spanish
313
+ * - id: only Indonesian
314
+ * - pt-br: only Brazilian Portuguese
315
+ *
316
+ * @default 'en'
317
+ */
318
+ explicitLanguage?: 'crosslingual' | 'en' | 'es-mx' | 'id' | 'jp' | 'pt-br' | 'zh' | string;
319
+ /**
320
+ * 0.5 ~ 2
321
+ *
322
+ * @default 1
323
+ */
324
+ loudnessRatio?: number;
325
+ /**
326
+ * @default 24000
327
+ */
328
+ rate?: 8000 | 16000 | 24000 | number;
329
+ /**
330
+ * 0.8~2
331
+ *
332
+ * @default 1
333
+ */
334
+ speedRatio?: number;
335
+ };
336
+ request?: {
337
+ cacheConfig?: Record<string, unknown>;
338
+ disableMarkdownFilter?: boolean;
339
+ enableLatexTone?: boolean;
340
+ extraParam?: string;
341
+ reqid?: string;
342
+ /**
343
+ * 0 ~ 30000ms
344
+ */
345
+ silenceDuration?: number;
346
+ /**
347
+ * - set to `ssml` to use SSML
348
+ */
349
+ textType?: 'ssml' | string;
350
+ useCache?: boolean;
351
+ withTimestamp?: string;
352
+ };
353
+ user?: {
354
+ uid?: string;
355
+ };
356
+ }
357
+ /**
358
+ * [Volcengine / 火山引擎](https://www.volcengine.com/docs/6561/162929) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
359
+ * only.
360
+ *
361
+ * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
362
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
363
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
364
+ *
365
+ * @param apiKey - Volcano Engine Speech Service Token
366
+ * @param baseURL - UnSpeech Instance URL
367
+ * @returns SpeechProviderWithExtraOptions
368
+ */
369
+ declare function createUnVolcengine(apiKey: string, baseURL?: string): SpeechProviderWithExtraOptions<"volcengine/v1", UnVolcengineOptions> & VoiceProviderWithExtraOptions<UnVolcengineOptions>;
370
+ //#endregion
371
+ //#region src/backend/index.d.ts
372
+ /** @see {@link https://github.com/moeru-ai/unspeech} */
373
+ declare function createUnSpeech(apiKey: string, baseURL?: string): _xsai_ext_providers_utils0.SpeechProviderWithExtraOptions<`alibaba/${string}` | `aliyun/${string}` | `deepgram/${string}` | `elevenlabs/${string}` | `koemotion/${string}` | `openai/${string}` | `volcano/${string}` | `volcengine/${string}`, UnSpeechOptions> & VoiceProviderWithExtraOptions<{
374
+ appId: string;
375
+ backend: "volcano";
376
+ } | {
377
+ appId: string;
378
+ backend: "volcengine";
379
+ } | {
380
+ backend: "azure" | "microsoft";
381
+ region: MicrosoftRegions | string;
382
+ } | {
383
+ backend: "ali" | "alibaba" | "alibaba-model-studio" | "aliyun" | "bailian" | "deepgram" | "elevenlabs" | "koemotion" | "openai";
384
+ }>;
385
+ //#endregion
386
+ //#region src/utils/list-voices.d.ts
387
+ interface ListVoicesOptions extends Omit<CommonRequestOptions, 'model'> {
388
+ query?: string;
389
+ }
390
+ interface ListVoicesResponse {
391
+ voices: Voice[];
392
+ }
393
+ declare function listVoices(options: ListVoicesOptions): Promise<Voice[]>;
394
+ //#endregion
395
+ export { ListVoicesOptions, ListVoicesResponse, MicrosoftRegions, UnAlibabaCloudOptions, UnDeepgramOptions, UnElevenLabsOptions, UnMicrosoftOptionAutoSSML, UnMicrosoftOptionCommon, UnMicrosoftOptionCustomSSML, UnMicrosoftOptions, UnSpeechOptions, UnVolcengineOptions, Voice, VoiceFormat, VoiceLanguage, VoiceProvider, VoiceProviderWithExtraOptions, createUnAlibabaCloud, createUnDeepgram, createUnElevenLabs, createUnMicrosoft, createUnSpeech, createUnVolcengine, listVoices };
package/dist/index.mjs ADDED
@@ -0,0 +1,250 @@
1
+ import { createSpeechProviderWithExtraOptions, merge } from "@xsai-ext/providers/utils";
2
+ import { objCamelToSnake, requestHeaders, requestURL, responseJSON } from "@xsai/shared";
3
+
4
+ //#region src/backend/alibabacloud.ts
5
+ /**
6
+ * [Alibaba Cloud / 阿里云 通义听悟](https://tingwu.aliyun.com/) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
7
+ * only.
8
+ *
9
+ * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
10
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
11
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
12
+ *
13
+ * @param apiKey - Alibaba Cloud AccessKey Token (see https://help.aliyun.com/document_detail/72153.html)
14
+ * @param baseURL - UnSpeech Instance URL
15
+ * @returns SpeechProviderWithExtraOptions & VoiceProviderWithExtraOptions
16
+ */
17
+ function createUnAlibabaCloud(apiKey, baseURL = "http://localhost:5933/v1/") {
18
+ const toUnSpeechOptions = (options) => {
19
+ const { pitch, rate, sampleRate, volume } = options;
20
+ const extraBody = {
21
+ pitch,
22
+ rate,
23
+ sampleRate,
24
+ volume
25
+ };
26
+ Object.keys(extraBody).forEach((key) => extraBody[key] === void 0 && delete extraBody[key]);
27
+ return { extraBody: objCamelToSnake(extraBody) };
28
+ };
29
+ return merge({ speech: (model, options) => ({
30
+ ...options ? toUnSpeechOptions(options) : {},
31
+ apiKey,
32
+ baseURL,
33
+ model: `alibaba/${model}`
34
+ }) }, { voice: (options) => {
35
+ let adjustedBaseURL = baseURL;
36
+ if (adjustedBaseURL.endsWith("v1/")) adjustedBaseURL = adjustedBaseURL.slice(0, -3);
37
+ else if (adjustedBaseURL.endsWith("v1")) adjustedBaseURL = adjustedBaseURL.slice(0, -2);
38
+ return {
39
+ query: "provider=alibaba",
40
+ ...options ? toUnSpeechOptions(options) : {},
41
+ apiKey,
42
+ baseURL: adjustedBaseURL
43
+ };
44
+ } });
45
+ }
46
+
47
+ //#endregion
48
+ //#region src/backend/deepgram.ts
49
+ /**
50
+ * [Deepgram](https://deepgram.com/) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
51
+ * only.
52
+ *
53
+ * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
54
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
55
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
56
+ *
57
+ * @param apiKey - Deepgram API Key
58
+ * @param baseURL - UnSpeech Instance URL
59
+ * @returns SpeechProviderWithExtraOptions
60
+ */
61
+ function createUnDeepgram(apiKey, baseURL = "http://localhost:5933/v1/") {
62
+ return merge({ speech: (model, _options) => ({
63
+ apiKey,
64
+ baseURL,
65
+ model: model.startsWith("deepgram/") ? model : `deepgram/${model}`
66
+ }) }, { voice: (_options) => {
67
+ if (baseURL.endsWith("v1/")) baseURL = baseURL.slice(0, -3);
68
+ else if (baseURL.endsWith("v1")) baseURL = baseURL.slice(0, -2);
69
+ return {
70
+ query: "provider=deepgram",
71
+ apiKey,
72
+ baseURL
73
+ };
74
+ } });
75
+ }
76
+
77
+ //#endregion
78
+ //#region src/backend/elevenlabs.ts
79
+ /**
80
+ * [ElevenLabs](https://elevenlabs.io/) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
81
+ * only.
82
+ *
83
+ * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
84
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
85
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
86
+ *
87
+ * @param apiKey - ElevenLabs API Key
88
+ * @param baseURL - UnSpeech Instance URL
89
+ * @returns SpeechProviderWithExtraOptions
90
+ */
91
+ function createUnElevenLabs(apiKey, baseURL = "http://localhost:5933/v1/") {
92
+ const toUnSpeechOptions = ({ applyTextNormalization, languageCode, nextRequestIds, nextText, previousRequestIds, previousText, pronunciationDictionaryLocators, seed, voiceSettings }) => ({ extraBody: objCamelToSnake({
93
+ applyTextNormalization,
94
+ languageCode,
95
+ nextRequestIds,
96
+ nextText,
97
+ previousRequestIds,
98
+ previousText,
99
+ pronunciationDictionaryLocators: pronunciationDictionaryLocators ? pronunciationDictionaryLocators.map((pdl) => objCamelToSnake(pdl)) : void 0,
100
+ seed,
101
+ voiceSettings: objCamelToSnake(voiceSettings != null ? voiceSettings : {
102
+ similarityBoost: .75,
103
+ stability: .5
104
+ })
105
+ }) });
106
+ return merge({ speech: (model, options) => ({
107
+ ...options ? toUnSpeechOptions(options) : {},
108
+ apiKey,
109
+ baseURL,
110
+ model: `elevenlabs/${model}`
111
+ }) }, { voice: (options) => {
112
+ if (baseURL.endsWith("v1/")) baseURL = baseURL.slice(0, -3);
113
+ else if (baseURL.endsWith("v1")) baseURL = baseURL.slice(0, -2);
114
+ return {
115
+ query: "provider=elevenlabs",
116
+ ...options ? toUnSpeechOptions(options) : {},
117
+ apiKey,
118
+ baseURL
119
+ };
120
+ } });
121
+ }
122
+
123
+ //#endregion
124
+ //#region src/backend/microsoft.ts
125
+ /**
126
+ * [Microsoft / Azure AI](https://speech.microsoft.com/portal) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
127
+ * only.
128
+ *
129
+ * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
130
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
131
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
132
+ *
133
+ * @param apiKey - Microsoft / Azure AI subscription key
134
+ * @param baseURL - UnSpeech Instance URL
135
+ * @returns SpeechProviderWithExtraOptions
136
+ */
137
+ function createUnMicrosoft(apiKey, baseURL = "http://localhost:5933/v1/") {
138
+ const toUnSpeechOptions = (options) => {
139
+ const { deploymentId, region, sampleRate } = options;
140
+ const extraBody = {
141
+ deploymentId,
142
+ region,
143
+ sampleRate
144
+ };
145
+ if ("disableSsml" in options) extraBody.disableSsml = options.disableSsml;
146
+ else if ("lang" in options) {
147
+ extraBody.lang = options.lang;
148
+ extraBody.gender = options.gender;
149
+ extraBody.voice = options.voice;
150
+ }
151
+ return { extraBody: objCamelToSnake(extraBody) };
152
+ };
153
+ return merge({ speech: (model, options) => ({
154
+ ...options ? toUnSpeechOptions(options) : {},
155
+ apiKey,
156
+ baseURL,
157
+ model: `microsoft/${model}`
158
+ }) }, { voice: (options) => {
159
+ if (baseURL.endsWith("v1/")) baseURL = baseURL.slice(0, -3);
160
+ else if (baseURL.endsWith("v1")) baseURL = baseURL.slice(0, -2);
161
+ return {
162
+ query: `region=${options?.region}&provider=microsoft`,
163
+ ...options ? toUnSpeechOptions(options) : {},
164
+ apiKey,
165
+ baseURL
166
+ };
167
+ } });
168
+ }
169
+
170
+ //#endregion
171
+ //#region src/backend/volcengine.ts
172
+ /**
173
+ * [Volcengine / 火山引擎](https://www.volcengine.com/docs/6561/162929) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
174
+ * only.
175
+ *
176
+ * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
177
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
178
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
179
+ *
180
+ * @param apiKey - Volcano Engine Speech Service Token
181
+ * @param baseURL - UnSpeech Instance URL
182
+ * @returns SpeechProviderWithExtraOptions
183
+ */
184
+ function createUnVolcengine(apiKey, baseURL = "http://localhost:5933/v1/") {
185
+ const toUnSpeechOptions = (options) => {
186
+ const extraBody = { app: {
187
+ appid: options.app?.appId,
188
+ token: apiKey
189
+ } };
190
+ if (typeof options.app !== "undefined") extraBody.app = {
191
+ ...options.app,
192
+ appid: options.app?.appId,
193
+ token: apiKey
194
+ };
195
+ if (typeof options.user !== "undefined") extraBody.user = options.user;
196
+ if (typeof options.audio !== "undefined") extraBody.audio = options.audio;
197
+ return { extraBody: objCamelToSnake(extraBody) };
198
+ };
199
+ return merge({ speech: (model, options) => ({
200
+ ...options ? toUnSpeechOptions(options) : {},
201
+ apiKey,
202
+ baseURL,
203
+ model: `volcengine/${model}`
204
+ }) }, { voice: (options) => {
205
+ if (baseURL.endsWith("v1/")) baseURL = baseURL.slice(0, -3);
206
+ else if (baseURL.endsWith("v1")) baseURL = baseURL.slice(0, -2);
207
+ return {
208
+ query: "provider=volcengine",
209
+ ...options ? toUnSpeechOptions(options) : {},
210
+ apiKey,
211
+ baseURL
212
+ };
213
+ } });
214
+ }
215
+
216
+ //#endregion
217
+ //#region src/backend/index.ts
218
+ /** @see {@link https://github.com/moeru-ai/unspeech} */
219
+ function createUnSpeech(apiKey, baseURL = "http://localhost:5933/v1/") {
220
+ return merge(createSpeechProviderWithExtraOptions({
221
+ apiKey,
222
+ baseURL
223
+ }), { voice: (options) => {
224
+ if (baseURL.endsWith("v1/")) baseURL = baseURL.slice(0, -3);
225
+ else if (baseURL.endsWith("v1")) baseURL = baseURL.slice(0, -2);
226
+ if (options?.backend === "microsoft" || options?.backend === "azure") return {
227
+ apiKey,
228
+ baseURL,
229
+ query: `region=${options.region}&provider=${options.backend}`
230
+ };
231
+ return {
232
+ apiKey,
233
+ baseURL,
234
+ query: `provider=${options?.backend}`
235
+ };
236
+ } });
237
+ }
238
+
239
+ //#endregion
240
+ //#region src/utils/list-voices.ts
241
+ async function listVoices(options) {
242
+ return (options.fetch ?? globalThis.fetch)(requestURL(options.query ? `api/voices?${options.query}` : "api/voices", options.baseURL), {
243
+ headers: requestHeaders({ ...options.headers }, options.apiKey),
244
+ method: "GET",
245
+ signal: options.abortSignal
246
+ }).then(responseJSON).then(({ voices }) => voices);
247
+ }
248
+
249
+ //#endregion
250
+ export { createUnAlibabaCloud, createUnDeepgram, createUnElevenLabs, createUnMicrosoft, createUnSpeech, createUnVolcengine, listVoices };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "unspeech",
3
3
  "type": "module",
4
- "version": "0.1.6",
4
+ "version": "0.1.10",
5
5
  "description": "Client SDK for unSpeech",
6
6
  "author": "Moeru AI",
7
7
  "license": "MIT",
@@ -24,26 +24,12 @@
24
24
  "dist"
25
25
  ],
26
26
  "dependencies": {
27
- "@xsai-ext/shared-providers": "^0.2.0-beta.2",
28
- "@xsai/shared": "^0.2.0-beta.2"
29
- },
30
- "devDependencies": {
31
- "@antfu/eslint-config": "^4.11.0",
32
- "@importantimport/eslint-config": "1.0.0-beta.2",
33
- "@importantimport/tsconfig": "1.0.0-beta.2",
34
- "@types/node": "^22.13.15",
35
- "bumpp": "^10.1.0",
36
- "eslint": "^9.23.0",
37
- "jiti": "^2.4.2",
38
- "tsup": "^8.4.0",
39
- "tsx": "^4.19.3",
40
- "typescript": "^5.8.2"
27
+ "@xsai-ext/providers": "^0.4.0",
28
+ "@xsai/shared": "^0.4.0"
41
29
  },
42
30
  "scripts": {
43
- "build": "tsup",
44
- "bump": "bumpp",
45
- "lint": "eslint --cache .",
46
- "test": "vitest"
31
+ "build": "tsdown",
32
+ "typecheck": "tsc --noEmit"
47
33
  },
48
34
  "main": "./dist/index.js",
49
35
  "types": "./dist/index.d.ts"
package/dist/index.d.ts DELETED
@@ -1,371 +0,0 @@
1
- import * as _xsai_ext_shared_providers from '@xsai-ext/shared-providers';
2
- import { SpeechProviderWithExtraOptions } from '@xsai-ext/shared-providers';
3
- import { CommonRequestOptions } from '@xsai/shared';
4
-
5
- interface Voice {
6
- compatible_models: string[];
7
- description: string;
8
- formats: VoiceFormat[];
9
- id: string;
10
- labels: Record<string, any> & {
11
- accent?: string;
12
- age?: string;
13
- gender?: string;
14
- type?: string;
15
- };
16
- languages: VoiceLanguage[];
17
- name: string;
18
- predefined_options?: Record<string, any>;
19
- preview_audio_url?: string;
20
- tags: string[];
21
- }
22
- interface VoiceFormat {
23
- bitrate: number;
24
- extension: string;
25
- format_code: string;
26
- mime_type: string;
27
- name: string;
28
- sample_rate: number;
29
- }
30
- interface VoiceLanguage {
31
- code: string;
32
- title: string;
33
- }
34
- interface VoiceProvider {
35
- voice: () => Omit<CommonRequestOptions, 'model'> & {
36
- query?: string;
37
- };
38
- }
39
- interface VoiceProviderWithExtraOptions<T = undefined> {
40
- voice: (options?: T) => Omit<CommonRequestOptions, 'model'> & {
41
- query?: string;
42
- } & Partial<T>;
43
- }
44
-
45
- interface UnSpeechOptions {
46
- /** @experimental */
47
- extraBody?: Record<string, unknown>;
48
- }
49
-
50
- type MicrosoftRegions = 'australiaeast' | 'brazilsouth' | 'canadacentral' | 'centralindia' | 'centralus' | 'eastasia' | 'eastus2' | 'eastus' | 'francecentral' | 'germanywestcentral' | 'japaneast' | 'japanwest' | 'jioindiawest' | 'koreacentral' | 'northcentralus' | 'northeurope' | 'norwayeast' | 'southcentralus' | 'southeastasia' | 'swedencentral' | 'switzerlandnorth' | 'switzerlandwest' | 'uaenorth' | 'uksouth' | 'usgovarizona' | 'usgovvirginia' | 'westcentralus' | 'westeurope' | 'westus2' | 'westus3' | 'westus';
51
- interface UnMicrosoftOptionAutoSSML {
52
- gender: 'Female' | 'Male' | 'Neutral' | string;
53
- lang: 'en-US' | string;
54
- /**
55
- * Speech Studio - Voice Gallery
56
- * https://speech.microsoft.com/portal/018ba84135d64cf79106cc99c75ffa6a/voicegallery
57
- */
58
- voice: 'en-US-AndrewMultilingualNeural' | 'en-US-AriaNeural' | 'en-US-AvaMultilingualNeural' | 'en-US-BrianMultilingualNeural' | 'en-US-ChristopherMultilingualNeural' | 'en-US-EmmaMultilingualNeural' | 'en-US-JaneNeural' | string;
59
- }
60
- interface UnMicrosoftOptionCommon {
61
- /**
62
- * Text to speech API reference (REST) - Speech service - Azure AI services | Microsoft Learn
63
- * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#custom-neural-voices
64
- */
65
- deploymentId?: string;
66
- /**
67
- * Text to speech API reference (REST) - Speech service - Azure AI services | Microsoft Learn
68
- * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#prebuilt-neural-voices
69
- *
70
- * NOTICE: Voices in preview are available in only these three regions: East US, West Europe, and Southeast Asia.
71
- */
72
- region: MicrosoftRegions | string;
73
- sampleRate?: 8000 | 16000 | 22050 | 24000 | 44100 | 48000 | number;
74
- }
75
- interface UnMicrosoftOptionCustomSSML {
76
- /**
77
- * By default, unspeech service will help you automatically convert OpenAI style plain text input
78
- * into SSML with lang, gender, voice parameters, but if you ever wanted to provide your own SSML
79
- * with all customizable parameters, you can set this option to `true` to disable the automatic
80
- * conversion and use your own SSML instead.
81
- *
82
- * About SSML (Speech Synthesis Markup Language), @see {@link https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup}
83
- */
84
- disableSsml?: boolean;
85
- }
86
- /** @see {@link https://elevenlabs.io/docs/api-reference/text-to-speech/convert#request} */
87
- type UnMicrosoftOptions = (UnMicrosoftOptionAutoSSML | UnMicrosoftOptionCustomSSML) & UnMicrosoftOptionCommon;
88
- /**
89
- * [Microsoft / Azure AI](https://speech.microsoft.com/portal) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
90
- * only.
91
- *
92
- * [UnSpeech](https://github.com/moeru-ai/unspeech) is a open-source project that provides a
93
- * OpenAI-compatible audio & speech related API that can be used with various providers such
94
- * as ElevenLabs, Azure TTS, Google TTS, etc.
95
- *
96
- * @param apiKey - Microsoft / Azure AI subscription key
97
- * @param baseURL - UnSpeech Instance URL
98
- * @returns SpeechProviderWithExtraOptions
99
- */
100
- declare const createUnMicrosoft: (apiKey: string, baseURL?: string) => SpeechProviderWithExtraOptions<"microsoft/v1", UnMicrosoftOptions> & VoiceProviderWithExtraOptions<UnMicrosoftOptions>;
101
-
102
- /** @see {@link https://elevenlabs.io/docs/api-reference/text-to-speech/convert#request} */
103
- interface UnElevenLabsOptions {
104
- /**
105
- * This parameter controls text normalization with three modes: 'auto', 'on', and 'off'. When set to 'auto',
106
- * the system will automatically decide whether to apply text normalization (e.g., spelling out numbers).
107
- * With 'on', text normalization will always be applied, while with 'off', it will be skipped. Cannot be
108
- * turned on for 'eleven_turbo_v2_5' model.
109
- */
110
- applyTextNormalization?: 'auto' | 'off' | 'on';
111
- /**
112
- * Language code (ISO 639-1) used to enforce a language for the model. Currently only Turbo v2.5
113
- * supports language enforcement. For other models, an error will be returned if language code is provided.
114
- */
115
- languageCode?: string;
116
- /**
117
- * A list of request_id of the samples that were generated before this generation. Can
118
- * be used to improve the flow of prosody when splitting up a large task into multiple
119
- * requests. The results will be best when the same model is used across the generations.
120
- *
121
- * In case both next_text and next_request_ids is send, next_text will be ignored.
122
- * A maximum of 3 request_ids can be send.
123
- */
124
- nextRequestIds?: string[];
125
- /**
126
- * The text that comes after the text of the current request. Can be used to improve
127
- * the flow of prosody when concatenating together multiple generations or to influence
128
- * the prosody in the current generation.
129
- */
130
- nextText?: string;
131
- /**
132
- * A list of request_id of the samples that were generated before this generation. Can be
133
- * used to improve the flow of prosody when splitting up a large task into multiple requests.
134
- * The results will be best when the same model is used across the generations. In case both
135
- * previous_text and previous_request_ids is send, previous_text will be ignored. A maximum
136
- * of 3 request_ids can be send.
137
- */
138
- previousRequestIds?: string[];
139
- /**
140
- * The text that came before the text of the current request. Can be used to improve the
141
- * flow of prosody when concatenating together multiple generations or to influence the
142
- * prosody in the current generation.
143
- */
144
- previousText?: string;
145
- /**
146
- * A list of pronunciation dictionary locators (id, version_id) to be applied to the text.
147
- * They will be applied in order. You may have up to 3 locators per request
148
- */
149
- pronunciationDictionaryLocators?: {
150
- pronunciationDictionaryId: string;
151
- versionId: string;
152
- }[];
153
- /**
154
- * If specified, our system will make a best effort to sample deterministically, such that
155
- * repeated requests with the same seed and parameters should return the same result.
156
- * Determinism is not guaranteed. Must be integer between 0 and 4294967295.
157
- */
158
- seed?: number;
159
- /**
160
- * Voice settings overriding stored settings for the given voice. They are applied only on the given request.
161
- */
162
- voiceSettings?: {
163
- /**
164
- * Determines how closely the AI should adhere to the original voice when attempting to replicate it.
165
- */
166
- similarityBoost: number;
167
- /**
168
- * Controls the speed of the generated speech. Values range from 0.7 to 1.2, with 1.0 being the default
169
- * speed. Lower values create slower, more deliberate speech while higher values produce faster-paced
170
- * speech. Extreme values can impact the quality of the generated speech.
171
- *
172
- * @default 1.0
173
- */
174
- speed?: number;
175
- /**
176
- * Determines how stable the voice is and the randomness between each generation. Lower values introduce
177
- * broader emotional range for the voice. Higher values can result in a monotonous voice with limited
178
- * emotion.
179
- */
180
- stability: number;
181
- /**
182
- * Determines the style exaggeration of the voice. This setting attempts to amplify the style of the original
183
- * speaker. It does consume additional computational resources and might increase latency if set to anything
184
- * other than 0.
185
- *
186
- * @default 0
187
- */
188
- style?: number;
189
- /**
190
- * This setting boosts the similarity to the original speaker. Using this setting requires a slightly higher
191
- * computational load, which in turn increases latency.
192
- *
193
- * @default true
194
- */
195
- useSpeakerBoost?: boolean;
196
- };
197
- }
198
- /**
199
- * [ElevenLabs](https://elevenlabs.io/) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
200
- * only.
201
- *
202
- * [UnSpeech](https://github.com/moeru-ai/unspeech) is a open-source project that provides a
203
- * OpenAI-compatible audio & speech related API that can be used with various providers such
204
- * as ElevenLabs, Azure TTS, Google TTS, etc.
205
- *
206
- * @param apiKey - ElevenLabs API Key
207
- * @param baseURL - UnSpeech Instance URL
208
- * @returns SpeechProviderWithExtraOptions
209
- */
210
- declare const createUnElevenLabs: (apiKey: string, baseURL?: string) => SpeechProviderWithExtraOptions<"eleven_english_sts_v2" | "eleven_flash_v2" | "eleven_flash_v2_5" | "eleven_multilingual_sts_v2" | "eleven_multilingual_v2", UnElevenLabsOptions> & VoiceProviderWithExtraOptions<UnElevenLabsOptions>;
211
-
212
- interface UnVolcanoEngineOptions {
213
- app?: {
214
- appId?: string;
215
- cluster?: string | 'volcano_tts';
216
- };
217
- user?: {
218
- uid?: string;
219
- };
220
- audio?: {
221
- emotion?: string | 'angry';
222
- enableEmotion?: boolean;
223
- /**
224
- * After calling emotion to set the emotion parameter you can use emotion_scale to
225
- * further set the emotion value, the range is 1~5, the default value is 4 when not
226
- * set.
227
- *
228
- * Note: Theoretically, the larger the emotion value is, the more obvious the emotion
229
- * is. However, the emotion value 1~5 is actually non-linear growth, there may be
230
- * more than a certain value, the increase in emotion is not obvious, for example,
231
- * set 3 and 5 when the emotion value may be close.
232
- *
233
- * 1~5
234
- *
235
- * @default 4
236
- */
237
- emotionScale?: number;
238
- /**
239
- * @default 'mp3'
240
- */
241
- encoding?: 'wav' | 'pcm' | 'ogg_opus' | 'mp3';
242
- /**
243
- * 0.8~2
244
- *
245
- * @default 1
246
- */
247
- speedRatio?: number;
248
- /**
249
- * @default 24000
250
- */
251
- rate?: number | 24000 | 8000 | 16000;
252
- /**
253
- * @default 160
254
- */
255
- bitRate?: number | 160;
256
- /**
257
- * - undefined: General mixed bilingual
258
- * - crosslingual: mix with zh/en/ja/es-ms/id/pt-br
259
- * - zh: primarily Chinese, supports mixed Chinese and English
260
- * - en: only English
261
- * - ja: only Japanese
262
- * - es-mx: only Mexican Spanish
263
- * - id: only Indonesian
264
- * - pt-br: only Brazilian Portuguese
265
- *
266
- * @default 'en'
267
- */
268
- explicitLanguage?: string | 'crosslingual' | 'zh' | 'en' | 'jp' | 'es-mx' | 'id' | 'pt-br';
269
- /**
270
- * Languages that contextual to the model
271
- */
272
- contextLanguage?: string | 'id' | 'es' | 'pt';
273
- /**
274
- * 0.5 ~ 2
275
- *
276
- * @default 1
277
- */
278
- loudnessRatio?: number;
279
- };
280
- request?: {
281
- reqid?: string;
282
- /**
283
- * - set to `ssml` to use SSML
284
- */
285
- textType?: string | 'ssml';
286
- /**
287
- * 0 ~ 30000ms
288
- */
289
- silenceDuration?: number;
290
- withTimestamp?: string;
291
- extraParam?: string;
292
- disableMarkdownFilter?: boolean;
293
- enableLatexTone?: boolean;
294
- cacheConfig?: Record<string, unknown>;
295
- useCache?: boolean;
296
- };
297
- }
298
- /**
299
- * [Volcengine / 火山引擎](https://www.volcengine.com/docs/6561/162929) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
300
- * only.
301
- *
302
- * [UnSpeech](https://github.com/moeru-ai/unspeech) is a open-source project that provides a
303
- * OpenAI-compatible audio & speech related API that can be used with various providers such
304
- * as ElevenLabs, Azure TTS, Google TTS, etc.
305
- *
306
- * @param apiKey - Volcano Engine Speech Service Token
307
- * @param appId - Volcano Engine Speech Service App ID
308
- * @param baseURL - UnSpeech Instance URL
309
- * @returns SpeechProviderWithExtraOptions
310
- */
311
- declare const createUnVolcanoEngine: (apiKey: string, baseURL?: string) => SpeechProviderWithExtraOptions<"volcengine/v1", UnVolcanoEngineOptions> & VoiceProviderWithExtraOptions<UnVolcanoEngineOptions>;
312
-
313
- interface UnAlibabaOptions {
314
- /**
315
- * Sampling rate of the synthesized audio.
316
- * @default 22050
317
- */
318
- sampleRate?: 8000 | 16000 | 22050 | 24000 | 44100 | 48000 | number;
319
- /**
320
- * Volume of the synthesized audio. Range: 0 to 100.
321
- * @default 50
322
- */
323
- volume?: number;
324
- /**
325
- * Speech rate. Range: 0.5 to 2.0.
326
- * @default 1.0
327
- */
328
- rate?: number;
329
- /**
330
- * Speech pitch. Range: 0.5 to 2.0.
331
- * @default 1.0
332
- */
333
- pitch?: number;
334
- }
335
- /**
336
- * [Alibaba Cloud / 阿里云 通义听悟](https://tingwu.aliyun.com/) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
337
- * only.
338
- *
339
- * [UnSpeech](https://github.com/moeru-ai/unspeech) is a open-source project that provides a
340
- * OpenAI-compatible audio & speech related API that can be used with various providers such
341
- * as ElevenLabs, Azure TTS, Google TTS, etc.
342
- *
343
- * @param apiKey - Alibaba Cloud AccessKey Token (see https://help.aliyun.com/document_detail/72153.html)
344
- * @param baseURL - UnSpeech Instance URL
345
- * @returns SpeechProviderWithExtraOptions & VoiceProviderWithExtraOptions
346
- */
347
- declare const createUnAlibaba: (apiKey: string, baseURL?: string) => SpeechProviderWithExtraOptions<"alibaba/v1", UnAlibabaOptions> & VoiceProviderWithExtraOptions<UnAlibabaOptions>;
348
-
349
- /** @see {@link https://github.com/moeru-ai/unspeech} */
350
- declare const createUnSpeech: (apiKey: string, baseURL?: string) => _xsai_ext_shared_providers.SpeechProviderWithExtraOptions<`elevenlabs/${string}` | `koemotion/${string}` | `openai/${string}` | `volcengine/${string}` | `volcano/${string}` | `aliyun/${string}` | `alibaba/${string}`, UnSpeechOptions> & VoiceProviderWithExtraOptions<{
351
- backend: "elevenlabs" | "koemotion" | "openai" | "alibaba" | "aliyun" | "ali" | "bailian" | "alibaba-model-studio";
352
- } | {
353
- backend: "microsoft" | "azure";
354
- region: MicrosoftRegions | string;
355
- } | {
356
- backend: "volcengine";
357
- appId: string;
358
- } | {
359
- backend: "volcano";
360
- appId: string;
361
- }>;
362
-
363
- interface ListVoicesOptions extends Omit<CommonRequestOptions, 'model'> {
364
- query?: string;
365
- }
366
- interface ListVoicesResponse {
367
- voices: Voice[];
368
- }
369
- declare const listVoices: (options: ListVoicesOptions) => Promise<Voice[]>;
370
-
371
- export { type ListVoicesOptions, type ListVoicesResponse, type MicrosoftRegions, type UnAlibabaOptions, type UnElevenLabsOptions, type UnMicrosoftOptionAutoSSML, type UnMicrosoftOptionCommon, type UnMicrosoftOptionCustomSSML, type UnMicrosoftOptions, type UnSpeechOptions, type UnVolcanoEngineOptions, type Voice, type VoiceFormat, type VoiceLanguage, type VoiceProvider, type VoiceProviderWithExtraOptions, createUnAlibaba, createUnElevenLabs, createUnMicrosoft, createUnSpeech, createUnVolcanoEngine, listVoices };
package/dist/index.js DELETED
@@ -1,153 +0,0 @@
1
- // src/backend/index.ts
2
- import { createSpeechProviderWithExtraOptions, merge as merge3 } from "@xsai-ext/shared-providers";
3
-
4
- // src/backend/elevenlabs.ts
5
- import { merge } from "@xsai-ext/shared-providers";
6
- import { objCamelToSnake } from "@xsai/shared";
7
- var createUnElevenLabs = (apiKey, baseURL = "http://localhost:5933/v1/") => {
8
- const toUnSpeechOptions = ({
9
- applyTextNormalization,
10
- languageCode,
11
- nextRequestIds,
12
- nextText,
13
- previousRequestIds,
14
- previousText,
15
- pronunciationDictionaryLocators,
16
- seed,
17
- voiceSettings
18
- }) => ({
19
- extraBody: objCamelToSnake({
20
- applyTextNormalization,
21
- languageCode,
22
- nextRequestIds,
23
- nextText,
24
- previousRequestIds,
25
- previousText,
26
- pronunciationDictionaryLocators: pronunciationDictionaryLocators ? pronunciationDictionaryLocators.map((pdl) => objCamelToSnake(pdl)) : void 0,
27
- seed,
28
- voiceSettings: objCamelToSnake(voiceSettings != null ? voiceSettings : {
29
- similarityBoost: 0.75,
30
- stability: 0.5
31
- })
32
- })
33
- });
34
- const speechProvider = {
35
- speech: (model, options) => ({
36
- ...options ? toUnSpeechOptions(options) : {},
37
- apiKey,
38
- baseURL,
39
- model: `elevenlabs/${model}`
40
- })
41
- };
42
- const voiceProvider = {
43
- voice: (options) => {
44
- if (baseURL.endsWith("v1/")) {
45
- baseURL = baseURL.slice(0, -3);
46
- } else if (baseURL.endsWith("v1")) {
47
- baseURL = baseURL.slice(0, -2);
48
- }
49
- return {
50
- query: `provider=elevenlabs`,
51
- ...options ? toUnSpeechOptions(options) : {},
52
- apiKey,
53
- baseURL
54
- };
55
- }
56
- };
57
- return merge(
58
- speechProvider,
59
- voiceProvider
60
- );
61
- };
62
-
63
- // src/backend/microsoft.ts
64
- import { merge as merge2 } from "@xsai-ext/shared-providers";
65
- import { objCamelToSnake as objCamelToSnake2 } from "@xsai/shared";
66
- var createUnMicrosoft = (apiKey, baseURL = "http://localhost:5933/v1/") => {
67
- const toUnSpeechOptions = (options) => {
68
- const { deploymentId, region, sampleRate } = options;
69
- const extraBody = {
70
- deploymentId,
71
- region,
72
- sampleRate
73
- };
74
- if ("disableSsml" in options) {
75
- extraBody.disableSsml = options.disableSsml;
76
- } else if ("lang" in options) {
77
- extraBody.lang = options.lang;
78
- extraBody.gender = options.gender;
79
- extraBody.voice = options.voice;
80
- }
81
- return { extraBody: objCamelToSnake2(extraBody) };
82
- };
83
- const speechProvider = {
84
- speech: (model, options) => ({
85
- ...options ? toUnSpeechOptions(options) : {},
86
- apiKey,
87
- baseURL,
88
- model: `microsoft/${model}`
89
- })
90
- };
91
- const voiceProvider = {
92
- voice: (options) => {
93
- if (baseURL.endsWith("v1/")) {
94
- baseURL = baseURL.slice(0, -3);
95
- } else if (baseURL.endsWith("v1")) {
96
- baseURL = baseURL.slice(0, -2);
97
- }
98
- return {
99
- query: `region=${options?.region}&provider=microsoft`,
100
- ...options ? toUnSpeechOptions(options) : {},
101
- apiKey,
102
- baseURL
103
- };
104
- }
105
- };
106
- return merge2(
107
- speechProvider,
108
- voiceProvider
109
- );
110
- };
111
-
112
- // src/backend/index.ts
113
- var createUnSpeech = (apiKey, baseURL = "http://localhost:5933/v1/") => {
114
- const voiceProvider = {
115
- voice: (options) => {
116
- if (baseURL.endsWith("v1/")) {
117
- baseURL = baseURL.slice(0, -3);
118
- } else if (baseURL.endsWith("v1")) {
119
- baseURL = baseURL.slice(0, -2);
120
- }
121
- if (options?.backend === "microsoft" || options?.backend === "azure") {
122
- return {
123
- query: `region=${options.region}&provider=${options.backend}`,
124
- baseURL,
125
- apiKey
126
- };
127
- }
128
- return {
129
- query: `provider=${options?.backend}`,
130
- baseURL,
131
- apiKey
132
- };
133
- }
134
- };
135
- return merge3(
136
- createSpeechProviderWithExtraOptions({ apiKey, baseURL }),
137
- voiceProvider
138
- );
139
- };
140
-
141
- // src/utils/list-voices.ts
142
- import { requestHeaders, requestURL, responseJSON } from "@xsai/shared";
143
- var listVoices = async (options) => (options.fetch ?? globalThis.fetch)(requestURL(options.query ? `api/voices?${options.query}` : "api/voices", options.baseURL), {
144
- headers: requestHeaders({ ...options.headers }, options.apiKey),
145
- method: "GET",
146
- signal: options.abortSignal
147
- }).then(responseJSON).then(({ voices }) => voices);
148
- export {
149
- createUnElevenLabs,
150
- createUnMicrosoft,
151
- createUnSpeech,
152
- listVoices
153
- };