unspeech 0.1.7 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,82 @@
1
+ # unSpeech TypeScript Client
2
+
3
+ > Your Text-to-Speech Services, All-in-One.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ npm i unspeech
9
+ ```
10
+
11
+ ## Getting Started
12
+
13
+ ### List voices
14
+
15
+ Besides the `/audio/speech` endpoint, we support listing all the available voices from providers as well:
16
+
17
+ ```ts
18
+ import { createUnSpeech, listVoices } from 'unspeech'
19
+
20
+ const unspeech = createUnSpeech('YOUR_EXTERNAL_PROVIDER_API_KEY', 'http://localhost:5933/v1/')
21
+
22
+ const voices = await listVoices(
23
+ unspeech.voice({ backend: 'elevenlabs' })
24
+ )
25
+ ```
26
+
27
+ ### Speech synthesis
28
+
29
+ For general-purpose `/audio/speech` requests, `@xsai/generate-speech` or xsAI can be used, as they are compatible:
30
+
31
+ ```bash
32
+ npm i @xsai/generate-speech
33
+ ```
34
+
35
+ ```ts
36
+ import { generateSpeech } from '@xsai/generate-speech'
37
+ import { createUnSpeech } from 'unspeech'
38
+
39
+ const unspeech = createUnSpeech('YOUR_EXTERNAL_PROVIDER_API_KEY', 'http://localhost:5933/v1/')
40
+ const speech = await generateSpeech({
41
+ ...unspeech.speech('elevenlabs/eleven_multilingual_v2'),
42
+ input: 'Hello, World!',
43
+ voice: '9BWtsMINqrJLrRacOk9x',
44
+ })
45
+ ```
46
+
47
+ For the other providers, you can import them as needed:
48
+
49
+ ```ts
50
+ import {
51
+ createUnAlibabaCloud,
52
+ createUnElevenLabs,
53
+ createUnMicrosoft,
54
+ createUnSpeech,
55
+ createUnVolcengine,
56
+ } from 'unspeech'
57
+ ```
58
+
59
+ When using
60
+
61
+ - [Microsoft / Azure AI Speech service](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/text-to-speech)
62
+ - [Alibaba Cloud Model Studio / 阿里云百炼 / CosyVoice](https://www.alibabacloud.com/en/product/modelstudio)
63
+ - [Volcano Engine / 火山引擎语音技术](https://www.volcengine.com/product/voice-tech)
64
+ - [ElevenLabs](https://elevenlabs.io/docs/api-reference/text-to-speech/convert)
65
+
66
+ providers, [SSML](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup) is supported to control in fine grain level for pitch, volume, rate, etc.
67
+
68
+ ## Related Projects
69
+
70
+ Looking for something like unSpeech, but for local TTS? Check these out:
71
+
72
+ - [erew123/alltalk_tts/alltalkbeta](https://github.com/erew123/alltalk_tts/tree/alltalkbeta)
73
+ - [astramind-ai/Auralis](https://github.com/astramind-ai/Auralis)
74
+ - [matatonic/openedai-speech](https://github.com/matatonic/openedai-speech)
75
+
76
+ Or to use free Edge TTS:
77
+
78
+ - [travisvn/openai-edge-tts](https://github.com/travisvn/openai-edge-tts)
79
+
80
+ ## License
81
+
82
+ [AGPL-3.0](./LICENSE)
@@ -0,0 +1,395 @@
1
+ import * as _xsai_ext_providers_utils0 from "@xsai-ext/providers/utils";
2
+ import { SpeechProviderWithExtraOptions } from "@xsai-ext/providers/utils";
3
+ import { CommonRequestOptions } from "@xsai/shared";
4
+
5
+ //#region src/types/voice.d.ts
6
+ interface Voice {
7
+ compatible_models: string[];
8
+ description: string;
9
+ formats: VoiceFormat[];
10
+ id: string;
11
+ labels: Record<string, any> & {
12
+ accent?: string;
13
+ age?: string;
14
+ gender?: string;
15
+ type?: string;
16
+ };
17
+ languages: VoiceLanguage[];
18
+ name: string;
19
+ predefined_options?: Record<string, any>;
20
+ preview_audio_url?: string;
21
+ tags: string[];
22
+ }
23
+ interface VoiceFormat {
24
+ bitrate: number;
25
+ extension: string;
26
+ format_code: string;
27
+ mime_type: string;
28
+ name: string;
29
+ sample_rate: number;
30
+ }
31
+ interface VoiceLanguage {
32
+ code: string;
33
+ title: string;
34
+ }
35
+ interface VoiceProvider {
36
+ voice: () => Omit<CommonRequestOptions, 'model'> & {
37
+ query?: string;
38
+ };
39
+ }
40
+ interface VoiceProviderWithExtraOptions<T = undefined> {
41
+ voice: (options?: T) => Omit<CommonRequestOptions, 'model'> & Partial<T> & {
42
+ query?: string;
43
+ };
44
+ }
45
+ //#endregion
46
+ //#region src/types/index.d.ts
47
+ interface UnSpeechOptions {
48
+ /** @experimental */
49
+ extraBody?: Record<string, unknown>;
50
+ }
51
+ //#endregion
52
+ //#region src/backend/microsoft.d.ts
53
+ type MicrosoftRegions = 'australiaeast' | 'brazilsouth' | 'canadacentral' | 'centralindia' | 'centralus' | 'eastasia' | 'eastus2' | 'eastus' | 'francecentral' | 'germanywestcentral' | 'japaneast' | 'japanwest' | 'jioindiawest' | 'koreacentral' | 'northcentralus' | 'northeurope' | 'norwayeast' | 'southcentralus' | 'southeastasia' | 'swedencentral' | 'switzerlandnorth' | 'switzerlandwest' | 'uaenorth' | 'uksouth' | 'usgovarizona' | 'usgovvirginia' | 'westcentralus' | 'westeurope' | 'westus2' | 'westus3' | 'westus';
54
+ interface UnMicrosoftOptionAutoSSML {
55
+ gender: 'Female' | 'Male' | 'Neutral' | string;
56
+ lang: 'en-US' | string;
57
+ /**
58
+ * Speech Studio - Voice Gallery
59
+ * https://speech.microsoft.com/portal/018ba84135d64cf79106cc99c75ffa6a/voicegallery
60
+ */
61
+ voice: 'en-US-AndrewMultilingualNeural' | 'en-US-AriaNeural' | 'en-US-AvaMultilingualNeural' | 'en-US-BrianMultilingualNeural' | 'en-US-ChristopherMultilingualNeural' | 'en-US-EmmaMultilingualNeural' | 'en-US-JaneNeural' | string;
62
+ }
63
+ interface UnMicrosoftOptionCommon {
64
+ /**
65
+ * Text to speech API reference (REST) - Speech service - Azure AI services | Microsoft Learn
66
+ * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#custom-neural-voices
67
+ */
68
+ deploymentId?: string;
69
+ /**
70
+ * Text to speech API reference (REST) - Speech service - Azure AI services | Microsoft Learn
71
+ * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#prebuilt-neural-voices
72
+ *
73
+ * NOTICE: Voices in preview are available in only these three regions: East US, West Europe, and Southeast Asia.
74
+ */
75
+ region: MicrosoftRegions | string;
76
+ sampleRate?: 8000 | 16000 | 22050 | 24000 | 44100 | 48000 | number;
77
+ }
78
+ interface UnMicrosoftOptionCustomSSML {
79
+ /**
80
+ * By default, unspeech service will help you automatically convert OpenAI style plain text input
81
+ * into SSML with lang, gender, voice parameters, but if you ever wanted to provide your own SSML
82
+ * with all customizable parameters, you can set this option to `true` to disable the automatic
83
+ * conversion and use your own SSML instead.
84
+ *
85
+ * About SSML (Speech Synthesis Markup Language), @see {@link https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup}
86
+ */
87
+ disableSsml?: boolean;
88
+ }
89
+ /** @see {@link https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech} */
90
+ type UnMicrosoftOptions = (UnMicrosoftOptionAutoSSML | UnMicrosoftOptionCustomSSML) & UnMicrosoftOptionCommon;
91
+ /**
92
+ * [Microsoft / Azure AI](https://speech.microsoft.com/portal) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
93
+ * only.
94
+ *
95
+  * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
96
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
97
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
98
+ *
99
+ * @param apiKey - Microsoft / Azure AI subscription key
100
+ * @param baseURL - UnSpeech Instance URL
101
+ * @returns SpeechProviderWithExtraOptions
102
+ */
103
+ declare function createUnMicrosoft(apiKey: string, baseURL?: string): SpeechProviderWithExtraOptions<"microsoft/v1", UnMicrosoftOptions> & VoiceProviderWithExtraOptions<UnMicrosoftOptions>;
104
+ //#endregion
105
+ //#region src/backend/alibabacloud.d.ts
106
+ interface UnAlibabaCloudOptions {
107
+ /**
108
+ * Speech pitch. Range: 0.5 to 2.0.
109
+ * @default 1.0
110
+ */
111
+ pitch?: number;
112
+ /**
113
+ * Speech rate. Range: 0.5 to 2.0.
114
+ * @default 1.0
115
+ */
116
+ rate?: number;
117
+ /**
118
+ * Sampling rate of the synthesized audio.
119
+ * @default 22050
120
+ */
121
+ sampleRate?: 8000 | 16000 | 22050 | 24000 | 44100 | 48000 | number;
122
+ /**
123
+ * Volume of the synthesized audio. Range: 0 to 100.
124
+ * @default 50
125
+ */
126
+ volume?: number;
127
+ }
128
+ /**
129
+ * [Alibaba Cloud / 阿里云 通义听悟](https://tingwu.aliyun.com/) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
130
+ * only.
131
+ *
132
+  * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
133
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
134
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
135
+ *
136
+ * @param apiKey - Alibaba Cloud AccessKey Token (see https://help.aliyun.com/document_detail/72153.html)
137
+ * @param baseURL - UnSpeech Instance URL
138
+ * @returns SpeechProviderWithExtraOptions & VoiceProviderWithExtraOptions
139
+ */
140
+ declare function createUnAlibabaCloud(apiKey: string, baseURL?: string): SpeechProviderWithExtraOptions<"alibaba/v1", UnAlibabaCloudOptions> & VoiceProviderWithExtraOptions<UnAlibabaCloudOptions>;
141
+ //#endregion
142
+ //#region src/backend/deepgram.d.ts
143
+ /** @see {@link https://developers.deepgram.com/docs/text-to-speech} */
144
+ interface UnDeepgramOptions {}
145
+ /**
146
+ * [Deepgram](https://deepgram.com/) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
147
+ * only.
148
+ *
149
+  * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
150
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
151
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
152
+ *
153
+ * @param apiKey - Deepgram API Key
154
+ * @param baseURL - UnSpeech Instance URL
155
+ * @returns SpeechProviderWithExtraOptions
156
+ */
157
+ declare function createUnDeepgram(apiKey: string, baseURL?: string): SpeechProviderWithExtraOptions<string, UnDeepgramOptions> & VoiceProviderWithExtraOptions<UnDeepgramOptions>;
158
+ //#endregion
159
+ //#region src/backend/elevenlabs.d.ts
160
+ /** @see {@link https://elevenlabs.io/docs/api-reference/text-to-speech/convert#request} */
161
+ interface UnElevenLabsOptions {
162
+ /**
163
+ * This parameter controls text normalization with three modes: 'auto', 'on', and 'off'. When set to 'auto',
164
+ * the system will automatically decide whether to apply text normalization (e.g., spelling out numbers).
165
+ * With 'on', text normalization will always be applied, while with 'off', it will be skipped. Cannot be
166
+ * turned on for 'eleven_turbo_v2_5' model.
167
+ */
168
+ applyTextNormalization?: 'auto' | 'off' | 'on';
169
+ /**
170
+ * Language code (ISO 639-1) used to enforce a language for the model. Currently only Turbo v2.5
171
+ * supports language enforcement. For other models, an error will be returned if language code is provided.
172
+ */
173
+ languageCode?: string;
174
+ /**
175
+ * A list of request_id of the samples that were generated before this generation. Can
176
+ * be used to improve the flow of prosody when splitting up a large task into multiple
177
+ * requests. The results will be best when the same model is used across the generations.
178
+ *
179
+  * In case both next_text and next_request_ids are sent, next_text will be ignored.
180
+  * A maximum of 3 request_ids can be sent.
181
+ */
182
+ nextRequestIds?: string[];
183
+ /**
184
+ * The text that comes after the text of the current request. Can be used to improve
185
+ * the flow of prosody when concatenating together multiple generations or to influence
186
+ * the prosody in the current generation.
187
+ */
188
+ nextText?: string;
189
+ /**
190
+ * A list of request_id of the samples that were generated before this generation. Can be
191
+ * used to improve the flow of prosody when splitting up a large task into multiple requests.
192
+ * The results will be best when the same model is used across the generations. In case both
193
+  * previous_text and previous_request_ids are sent, previous_text will be ignored. A maximum
194
+  * of 3 request_ids can be sent.
195
+ */
196
+ previousRequestIds?: string[];
197
+ /**
198
+ * The text that came before the text of the current request. Can be used to improve the
199
+ * flow of prosody when concatenating together multiple generations or to influence the
200
+ * prosody in the current generation.
201
+ */
202
+ previousText?: string;
203
+ /**
204
+ * A list of pronunciation dictionary locators (id, version_id) to be applied to the text.
205
+ * They will be applied in order. You may have up to 3 locators per request
206
+ */
207
+ pronunciationDictionaryLocators?: {
208
+ pronunciationDictionaryId: string;
209
+ versionId: string;
210
+ }[];
211
+ /**
212
+ * If specified, our system will make a best effort to sample deterministically, such that
213
+ * repeated requests with the same seed and parameters should return the same result.
214
+ * Determinism is not guaranteed. Must be integer between 0 and 4294967295.
215
+ */
216
+ seed?: number;
217
+ /**
218
+ * Voice settings overriding stored settings for the given voice. They are applied only on the given request.
219
+ */
220
+ voiceSettings?: {
221
+ /**
222
+ * Determines how closely the AI should adhere to the original voice when attempting to replicate it.
223
+ */
224
+ similarityBoost: number;
225
+ /**
226
+ * Controls the speed of the generated speech. Values range from 0.7 to 1.2, with 1.0 being the default
227
+ * speed. Lower values create slower, more deliberate speech while higher values produce faster-paced
228
+ * speech. Extreme values can impact the quality of the generated speech.
229
+ *
230
+ * @default 1.0
231
+ */
232
+ speed?: number;
233
+ /**
234
+ * Determines how stable the voice is and the randomness between each generation. Lower values introduce
235
+ * broader emotional range for the voice. Higher values can result in a monotonous voice with limited
236
+ * emotion.
237
+ */
238
+ stability: number;
239
+ /**
240
+ * Determines the style exaggeration of the voice. This setting attempts to amplify the style of the original
241
+ * speaker. It does consume additional computational resources and might increase latency if set to anything
242
+ * other than 0.
243
+ *
244
+ * @default 0
245
+ */
246
+ style?: number;
247
+ /**
248
+ * This setting boosts the similarity to the original speaker. Using this setting requires a slightly higher
249
+ * computational load, which in turn increases latency.
250
+ *
251
+ * @default true
252
+ */
253
+ useSpeakerBoost?: boolean;
254
+ };
255
+ }
256
+ /**
257
+ * [ElevenLabs](https://elevenlabs.io/) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
258
+ * only.
259
+ *
260
+  * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
261
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
262
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
263
+ *
264
+ * @param apiKey - ElevenLabs API Key
265
+ * @param baseURL - UnSpeech Instance URL
266
+ * @returns SpeechProviderWithExtraOptions
267
+ */
268
+ declare function createUnElevenLabs(apiKey: string, baseURL?: string): SpeechProviderWithExtraOptions<"eleven_english_sts_v2" | "eleven_flash_v2" | "eleven_flash_v2_5" | "eleven_multilingual_sts_v2" | "eleven_multilingual_v2", UnElevenLabsOptions> & VoiceProviderWithExtraOptions<UnElevenLabsOptions>;
269
+ //#endregion
270
+ //#region src/backend/volcengine.d.ts
271
+ interface UnVolcengineOptions {
272
+ app?: {
273
+ appId?: string;
274
+ cluster?: 'volcano_tts' | string;
275
+ };
276
+ audio?: {
277
+ /**
278
+ * @default 160
279
+ */
280
+ bitRate?: 160 | number;
281
+ /**
282
+  * Languages that are contextual to the model
283
+ */
284
+ contextLanguage?: 'es' | 'id' | 'pt' | string;
285
+ emotion?: 'angry' | string;
286
+ /**
287
+ * After calling emotion to set the emotion parameter you can use emotion_scale to
288
+ * further set the emotion value, the range is 1~5, the default value is 4 when not
289
+ * set.
290
+ *
291
+ * Note: Theoretically, the larger the emotion value is, the more obvious the emotion
292
+ * is. However, the emotion value 1~5 is actually non-linear growth, there may be
293
+ * more than a certain value, the increase in emotion is not obvious, for example,
294
+ * set 3 and 5 when the emotion value may be close.
295
+ *
296
+ * 1~5
297
+ *
298
+ * @default 4
299
+ */
300
+ emotionScale?: number;
301
+ enableEmotion?: boolean;
302
+ /**
303
+ * @default 'mp3'
304
+ */
305
+ encoding?: 'mp3' | 'ogg_opus' | 'pcm' | 'wav';
306
+ /**
307
+ * - undefined: General mixed bilingual
308
+  * - crosslingual: mix with zh/en/ja/es-mx/id/pt-br
309
+ * - zh: primarily Chinese, supports mixed Chinese and English
310
+ * - en: only English
311
+  * - jp: only Japanese (matches the `'jp'` literal in the type union)
312
+ * - es-mx: only Mexican Spanish
313
+ * - id: only Indonesian
314
+ * - pt-br: only Brazilian Portuguese
315
+ *
316
+ * @default 'en'
317
+ */
318
+ explicitLanguage?: 'crosslingual' | 'en' | 'es-mx' | 'id' | 'jp' | 'pt-br' | 'zh' | string;
319
+ /**
320
+ * 0.5 ~ 2
321
+ *
322
+ * @default 1
323
+ */
324
+ loudnessRatio?: number;
325
+ /**
326
+ * @default 24000
327
+ */
328
+ rate?: 8000 | 16000 | 24000 | number;
329
+ /**
330
+ * 0.8~2
331
+ *
332
+ * @default 1
333
+ */
334
+ speedRatio?: number;
335
+ };
336
+ request?: {
337
+ cacheConfig?: Record<string, unknown>;
338
+ disableMarkdownFilter?: boolean;
339
+ enableLatexTone?: boolean;
340
+ extraParam?: string;
341
+ reqid?: string;
342
+ /**
343
+ * 0 ~ 30000ms
344
+ */
345
+ silenceDuration?: number;
346
+ /**
347
+ * - set to `ssml` to use SSML
348
+ */
349
+ textType?: 'ssml' | string;
350
+ useCache?: boolean;
351
+ withTimestamp?: string;
352
+ };
353
+ user?: {
354
+ uid?: string;
355
+ };
356
+ }
357
+ /**
358
+ * [Volcengine / 火山引擎](https://www.volcengine.com/docs/6561/162929) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
359
+ * only.
360
+ *
361
+  * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
362
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
363
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
364
+ *
365
+ * @param apiKey - Volcano Engine Speech Service Token
366
+ * @param baseURL - UnSpeech Instance URL
367
+ * @returns SpeechProviderWithExtraOptions
368
+ */
369
+ declare function createUnVolcengine(apiKey: string, baseURL?: string): SpeechProviderWithExtraOptions<"volcengine/v1", UnVolcengineOptions> & VoiceProviderWithExtraOptions<UnVolcengineOptions>;
370
+ //#endregion
371
+ //#region src/backend/index.d.ts
372
+ /** @see {@link https://github.com/moeru-ai/unspeech} */
373
+ declare function createUnSpeech(apiKey: string, baseURL?: string): _xsai_ext_providers_utils0.SpeechProviderWithExtraOptions<`alibaba/${string}` | `aliyun/${string}` | `deepgram/${string}` | `elevenlabs/${string}` | `koemotion/${string}` | `openai/${string}` | `volcano/${string}` | `volcengine/${string}`, UnSpeechOptions> & VoiceProviderWithExtraOptions<{
374
+ appId: string;
375
+ backend: "volcano";
376
+ } | {
377
+ appId: string;
378
+ backend: "volcengine";
379
+ } | {
380
+ backend: "azure" | "microsoft";
381
+ region: MicrosoftRegions | string;
382
+ } | {
383
+ backend: "ali" | "alibaba" | "alibaba-model-studio" | "aliyun" | "bailian" | "deepgram" | "elevenlabs" | "koemotion" | "openai";
384
+ }>;
385
+ //#endregion
386
+ //#region src/utils/list-voices.d.ts
387
+ interface ListVoicesOptions extends Omit<CommonRequestOptions, 'model'> {
388
+ query?: string;
389
+ }
390
+ interface ListVoicesResponse {
391
+ voices: Voice[];
392
+ }
393
+ declare function listVoices(options: ListVoicesOptions): Promise<Voice[]>;
394
+ //#endregion
395
+ export { ListVoicesOptions, ListVoicesResponse, MicrosoftRegions, UnAlibabaCloudOptions, UnDeepgramOptions, UnElevenLabsOptions, UnMicrosoftOptionAutoSSML, UnMicrosoftOptionCommon, UnMicrosoftOptionCustomSSML, UnMicrosoftOptions, UnSpeechOptions, UnVolcengineOptions, Voice, VoiceFormat, VoiceLanguage, VoiceProvider, VoiceProviderWithExtraOptions, createUnAlibabaCloud, createUnDeepgram, createUnElevenLabs, createUnMicrosoft, createUnSpeech, createUnVolcengine, listVoices };