unspeech 0.1.7 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,82 @@
1
+ # unSpeech TypeScript Client
2
+
3
+ > Your Text-to-Speech Services, All-in-One.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ npm i unspeech
9
+ ```
10
+
11
+ ## Getting Started
12
+
13
+ ### List voices
14
+
15
+ Besides the `/audio/speech` endpoint, we support listing all the available voices from providers as well:
16
+
17
+ ```ts
18
+ import { createUnSpeech, listVoices } from 'unspeech'
19
+
20
+ const unspeech = createUnSpeech('YOUR_EXTERNAL_PROVIDER_API_KEY', 'http://localhost:5933/v1/')
21
+
22
+ const voices = await listVoices(
23
+ unspeech.voice({ backend: 'elevenlabs' })
24
+ )
25
+ ```
26
+
27
+ ### Speech synthesis
28
+
29
+ For general-purpose `/audio/speech` requests, `@xsai/generate-speech` or xsAI can be used, as they are compatible:
30
+
31
+ ```bash
32
+ npm i @xsai/generate-speech
33
+ ```
34
+
35
+ ```ts
36
+ import { generateSpeech } from '@xsai/generate-speech'
37
+ import { createUnSpeech } from 'unspeech'
38
+
39
+ const unspeech = createUnSpeech('YOUR_EXTERNAL_PROVIDER_API_KEY', 'http://localhost:5933/v1/')
40
+ const speech = await generateSpeech({
41
+ ...unspeech.speech('elevenlabs/eleven_multilingual_v2'),
42
+ input: 'Hello, World!',
43
+ voice: '9BWtsMINqrJLrRacOk9x',
44
+ })
45
+ ```
46
+
47
+ For the other providers, you can import them as needed:
48
+
49
+ ```ts
50
+ import {
51
+ createUnAlibabaCloud,
52
+ createUnElevenLabs,
53
+ createUnMicrosoft,
54
+ createUnSpeech,
55
+ createUnVolcengine,
56
+ } from 'unspeech'
57
+ ```
58
+
59
+ When using
60
+
61
+ - [Microsoft / Azure AI Speech service](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/text-to-speech)
62
+ - [Alibaba Cloud Model Studio / 阿里云百炼 / CosyVoice](https://www.alibabacloud.com/en/product/modelstudio)
63
+ - [Volcano Engine / 火山引擎语音技术](https://www.volcengine.com/product/voice-tech)
64
+ - [ElevenLabs](https://elevenlabs.io/docs/api-reference/text-to-speech/convert)
65
+
66
+ providers, [SSML](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup) is supported to control in fine grain level for pitch, volume, rate, etc.
67
+
68
+ ## Related Projects
69
+
70
+ Looking for something like unSpeech, but for local TTS? Check these out:
71
+
72
+ - [erew123/alltalk_tts/alltalkbeta](https://github.com/erew123/alltalk_tts/tree/alltalkbeta)
73
+ - [astramind-ai/Auralis](https://github.com/astramind-ai/Auralis)
74
+ - [matatonic/openedai-speech](https://github.com/matatonic/openedai-speech)
75
+
76
+ Or to use free Edge TTS:
77
+
78
+ - [travisvn/openai-edge-tts](https://github.com/travisvn/openai-edge-tts)
79
+
80
+ ## License
81
+
82
+ [AGPL-3.0](./LICENSE)
@@ -0,0 +1,395 @@
1
+ import * as _xsai_ext_providers_utils0 from "@xsai-ext/providers/utils";
2
+ import { SpeechProviderWithExtraOptions } from "@xsai-ext/providers/utils";
3
+ import { CommonRequestOptions } from "@xsai/shared";
4
+
5
+ //#region src/types/voice.d.ts
6
+ interface Voice {
7
+ compatible_models: string[];
8
+ description: string;
9
+ formats: VoiceFormat[];
10
+ id: string;
11
+ labels: Record<string, any> & {
12
+ accent?: string;
13
+ age?: string;
14
+ gender?: string;
15
+ type?: string;
16
+ };
17
+ languages: VoiceLanguage[];
18
+ name: string;
19
+ predefined_options?: Record<string, any>;
20
+ preview_audio_url?: string;
21
+ tags: string[];
22
+ }
23
+ interface VoiceFormat {
24
+ bitrate: number;
25
+ extension: string;
26
+ format_code: string;
27
+ mime_type: string;
28
+ name: string;
29
+ sample_rate: number;
30
+ }
31
+ interface VoiceLanguage {
32
+ code: string;
33
+ title: string;
34
+ }
35
+ interface VoiceProvider {
36
+ voice: () => Omit<CommonRequestOptions, 'model'> & {
37
+ query?: string;
38
+ };
39
+ }
40
+ interface VoiceProviderWithExtraOptions<T = undefined> {
41
+ voice: (options?: T) => Omit<CommonRequestOptions, 'model'> & Partial<T> & {
42
+ query?: string;
43
+ };
44
+ }
45
+ //#endregion
46
+ //#region src/types/index.d.ts
47
+ interface UnSpeechOptions {
48
+ /** @experimental */
49
+ extraBody?: Record<string, unknown>;
50
+ }
51
+ //#endregion
52
+ //#region src/backend/microsoft.d.ts
53
+ type MicrosoftRegions = 'australiaeast' | 'brazilsouth' | 'canadacentral' | 'centralindia' | 'centralus' | 'eastasia' | 'eastus2' | 'eastus' | 'francecentral' | 'germanywestcentral' | 'japaneast' | 'japanwest' | 'jioindiawest' | 'koreacentral' | 'northcentralus' | 'northeurope' | 'norwayeast' | 'southcentralus' | 'southeastasia' | 'swedencentral' | 'switzerlandnorth' | 'switzerlandwest' | 'uaenorth' | 'uksouth' | 'usgovarizona' | 'usgovvirginia' | 'westcentralus' | 'westeurope' | 'westus2' | 'westus3' | 'westus';
54
+ interface UnMicrosoftOptionAutoSSML {
55
+ gender: 'Female' | 'Male' | 'Neutral' | string;
56
+ lang: 'en-US' | string;
57
+ /**
58
+ * Speech Studio - Voice Gallery
59
+ * https://speech.microsoft.com/portal/018ba84135d64cf79106cc99c75ffa6a/voicegallery
60
+ */
61
+ voice: 'en-US-AndrewMultilingualNeural' | 'en-US-AriaNeural' | 'en-US-AvaMultilingualNeural' | 'en-US-BrianMultilingualNeural' | 'en-US-ChristopherMultilingualNeural' | 'en-US-EmmaMultilingualNeural' | 'en-US-JaneNeural' | string;
62
+ }
63
+ interface UnMicrosoftOptionCommon {
64
+ /**
65
+ * Text to speech API reference (REST) - Speech service - Azure AI services | Microsoft Learn
66
+ * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#custom-neural-voices
67
+ */
68
+ deploymentId?: string;
69
+ /**
70
+ * Text to speech API reference (REST) - Speech service - Azure AI services | Microsoft Learn
71
+ * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#prebuilt-neural-voices
72
+ *
73
+ * NOTICE: Voices in preview are available in only these three regions: East US, West Europe, and Southeast Asia.
74
+ */
75
+ region: MicrosoftRegions | string;
76
+ sampleRate?: 8000 | 16000 | 22050 | 24000 | 44100 | 48000 | number;
77
+ }
78
+ interface UnMicrosoftOptionCustomSSML {
79
+ /**
80
+ * By default, unspeech service will help you automatically convert OpenAI style plain text input
81
+ * into SSML with lang, gender, voice parameters, but if you ever wanted to provide your own SSML
82
+ * with all customizable parameters, you can set this option to `true` to disable the automatic
83
+ * conversion and use your own SSML instead.
84
+ *
85
+ * About SSML (Speech Synthesis Markup Language), @see {@link https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup}
86
+ */
87
+ disableSsml?: boolean;
88
+ }
89
+ /** @see {@link https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech} */
90
+ type UnMicrosoftOptions = (UnMicrosoftOptionAutoSSML | UnMicrosoftOptionCustomSSML) & UnMicrosoftOptionCommon;
91
+ /**
92
+ * [Microsoft / Azure AI](https://speech.microsoft.com/portal) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
93
+ * only.
94
+ *
95
+  * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
96
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
97
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
98
+ *
99
+ * @param apiKey - Microsoft / Azure AI subscription key
100
+ * @param baseURL - UnSpeech Instance URL
101
+ * @returns SpeechProviderWithExtraOptions
102
+ */
103
+ declare function createUnMicrosoft(apiKey: string, baseURL?: string): SpeechProviderWithExtraOptions<"microsoft/v1", UnMicrosoftOptions> & VoiceProviderWithExtraOptions<UnMicrosoftOptions>;
104
+ //#endregion
105
+ //#region src/backend/alibabacloud.d.ts
106
+ interface UnAlibabaCloudOptions {
107
+ /**
108
+ * Speech pitch. Range: 0.5 to 2.0.
109
+ * @default 1.0
110
+ */
111
+ pitch?: number;
112
+ /**
113
+ * Speech rate. Range: 0.5 to 2.0.
114
+ * @default 1.0
115
+ */
116
+ rate?: number;
117
+ /**
118
+ * Sampling rate of the synthesized audio.
119
+ * @default 22050
120
+ */
121
+ sampleRate?: 8000 | 16000 | 22050 | 24000 | 44100 | 48000 | number;
122
+ /**
123
+ * Volume of the synthesized audio. Range: 0 to 100.
124
+ * @default 50
125
+ */
126
+ volume?: number;
127
+ }
128
+ /**
129
+ * [Alibaba Cloud / 阿里云 通义听悟](https://tingwu.aliyun.com/) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
130
+ * only.
131
+ *
132
+  * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
133
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
134
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
135
+ *
136
+ * @param apiKey - Alibaba Cloud AccessKey Token (see https://help.aliyun.com/document_detail/72153.html)
137
+ * @param baseURL - UnSpeech Instance URL
138
+ * @returns SpeechProviderWithExtraOptions & VoiceProviderWithExtraOptions
139
+ */
140
+ declare function createUnAlibabaCloud(apiKey: string, baseURL?: string): SpeechProviderWithExtraOptions<"alibaba/v1", UnAlibabaCloudOptions> & VoiceProviderWithExtraOptions<UnAlibabaCloudOptions>;
141
+ //#endregion
142
+ //#region src/backend/deepgram.d.ts
143
+ /** @see {@link https://developers.deepgram.com/docs/text-to-speech} */
144
+ interface UnDeepgramOptions {}
145
+ /**
146
+ * [Deepgram](https://deepgram.com/) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
147
+ * only.
148
+ *
149
+  * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
150
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
151
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
152
+ *
153
+ * @param apiKey - Deepgram API Key
154
+ * @param baseURL - UnSpeech Instance URL
155
+ * @returns SpeechProviderWithExtraOptions
156
+ */
157
+ declare function createUnDeepgram(apiKey: string, baseURL?: string): SpeechProviderWithExtraOptions<string, UnDeepgramOptions> & VoiceProviderWithExtraOptions<UnDeepgramOptions>;
158
+ //#endregion
159
+ //#region src/backend/elevenlabs.d.ts
160
+ /** @see {@link https://elevenlabs.io/docs/api-reference/text-to-speech/convert#request} */
161
+ interface UnElevenLabsOptions {
162
+ /**
163
+ * This parameter controls text normalization with three modes: 'auto', 'on', and 'off'. When set to 'auto',
164
+ * the system will automatically decide whether to apply text normalization (e.g., spelling out numbers).
165
+ * With 'on', text normalization will always be applied, while with 'off', it will be skipped. Cannot be
166
+ * turned on for 'eleven_turbo_v2_5' model.
167
+ */
168
+ applyTextNormalization?: 'auto' | 'off' | 'on';
169
+ /**
170
+ * Language code (ISO 639-1) used to enforce a language for the model. Currently only Turbo v2.5
171
+ * supports language enforcement. For other models, an error will be returned if language code is provided.
172
+ */
173
+ languageCode?: string;
174
+ /**
175
+ * A list of request_id of the samples that were generated before this generation. Can
176
+ * be used to improve the flow of prosody when splitting up a large task into multiple
177
+ * requests. The results will be best when the same model is used across the generations.
178
+ *
179
+  * In case both next_text and next_request_ids are sent, next_text will be ignored.
180
+  * A maximum of 3 request_ids can be sent.
181
+ */
182
+ nextRequestIds?: string[];
183
+ /**
184
+ * The text that comes after the text of the current request. Can be used to improve
185
+ * the flow of prosody when concatenating together multiple generations or to influence
186
+ * the prosody in the current generation.
187
+ */
188
+ nextText?: string;
189
+ /**
190
+ * A list of request_id of the samples that were generated before this generation. Can be
191
+ * used to improve the flow of prosody when splitting up a large task into multiple requests.
192
+ * The results will be best when the same model is used across the generations. In case both
193
+  * previous_text and previous_request_ids are sent, previous_text will be ignored. A maximum
194
+  * of 3 request_ids can be sent.
195
+ */
196
+ previousRequestIds?: string[];
197
+ /**
198
+ * The text that came before the text of the current request. Can be used to improve the
199
+ * flow of prosody when concatenating together multiple generations or to influence the
200
+ * prosody in the current generation.
201
+ */
202
+ previousText?: string;
203
+ /**
204
+ * A list of pronunciation dictionary locators (id, version_id) to be applied to the text.
205
+ * They will be applied in order. You may have up to 3 locators per request
206
+ */
207
+ pronunciationDictionaryLocators?: {
208
+ pronunciationDictionaryId: string;
209
+ versionId: string;
210
+ }[];
211
+ /**
212
+ * If specified, our system will make a best effort to sample deterministically, such that
213
+ * repeated requests with the same seed and parameters should return the same result.
214
+ * Determinism is not guaranteed. Must be integer between 0 and 4294967295.
215
+ */
216
+ seed?: number;
217
+ /**
218
+ * Voice settings overriding stored settings for the given voice. They are applied only on the given request.
219
+ */
220
+ voiceSettings?: {
221
+ /**
222
+ * Determines how closely the AI should adhere to the original voice when attempting to replicate it.
223
+ */
224
+ similarityBoost: number;
225
+ /**
226
+ * Controls the speed of the generated speech. Values range from 0.7 to 1.2, with 1.0 being the default
227
+ * speed. Lower values create slower, more deliberate speech while higher values produce faster-paced
228
+ * speech. Extreme values can impact the quality of the generated speech.
229
+ *
230
+ * @default 1.0
231
+ */
232
+ speed?: number;
233
+ /**
234
+ * Determines how stable the voice is and the randomness between each generation. Lower values introduce
235
+ * broader emotional range for the voice. Higher values can result in a monotonous voice with limited
236
+ * emotion.
237
+ */
238
+ stability: number;
239
+ /**
240
+ * Determines the style exaggeration of the voice. This setting attempts to amplify the style of the original
241
+ * speaker. It does consume additional computational resources and might increase latency if set to anything
242
+ * other than 0.
243
+ *
244
+ * @default 0
245
+ */
246
+ style?: number;
247
+ /**
248
+ * This setting boosts the similarity to the original speaker. Using this setting requires a slightly higher
249
+ * computational load, which in turn increases latency.
250
+ *
251
+ * @default true
252
+ */
253
+ useSpeakerBoost?: boolean;
254
+ };
255
+ }
256
+ /**
257
+ * [ElevenLabs](https://elevenlabs.io/) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
258
+ * only.
259
+ *
260
+  * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
261
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
262
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
263
+ *
264
+ * @param apiKey - ElevenLabs API Key
265
+ * @param baseURL - UnSpeech Instance URL
266
+ * @returns SpeechProviderWithExtraOptions
267
+ */
268
+ declare function createUnElevenLabs(apiKey: string, baseURL?: string): SpeechProviderWithExtraOptions<"eleven_english_sts_v2" | "eleven_flash_v2" | "eleven_flash_v2_5" | "eleven_multilingual_sts_v2" | "eleven_multilingual_v2", UnElevenLabsOptions> & VoiceProviderWithExtraOptions<UnElevenLabsOptions>;
269
+ //#endregion
270
+ //#region src/backend/volcengine.d.ts
271
+ interface UnVolcengineOptions {
272
+ app?: {
273
+ appId?: string;
274
+ cluster?: 'volcano_tts' | string;
275
+ };
276
+ audio?: {
277
+ /**
278
+ * @default 160
279
+ */
280
+ bitRate?: 160 | number;
281
+ /**
282
+  * Languages that are contextual to the model
283
+ */
284
+ contextLanguage?: 'es' | 'id' | 'pt' | string;
285
+ emotion?: 'angry' | string;
286
+ /**
287
+ * After calling emotion to set the emotion parameter you can use emotion_scale to
288
+ * further set the emotion value, the range is 1~5, the default value is 4 when not
289
+ * set.
290
+ *
291
+ * Note: Theoretically, the larger the emotion value is, the more obvious the emotion
292
+ * is. However, the emotion value 1~5 is actually non-linear growth, there may be
293
+ * more than a certain value, the increase in emotion is not obvious, for example,
294
+ * set 3 and 5 when the emotion value may be close.
295
+ *
296
+ * 1~5
297
+ *
298
+ * @default 4
299
+ */
300
+ emotionScale?: number;
301
+ enableEmotion?: boolean;
302
+ /**
303
+ * @default 'mp3'
304
+ */
305
+ encoding?: 'mp3' | 'ogg_opus' | 'pcm' | 'wav';
306
+ /**
307
+ * - undefined: General mixed bilingual
308
+  * - crosslingual: mix with zh/en/ja/es-mx/id/pt-br
309
+ * - zh: primarily Chinese, supports mixed Chinese and English
310
+ * - en: only English
311
+  * - jp: only Japanese (matches the `'jp'` literal in the type union)
312
+ * - es-mx: only Mexican Spanish
313
+ * - id: only Indonesian
314
+ * - pt-br: only Brazilian Portuguese
315
+ *
316
+ * @default 'en'
317
+ */
318
+ explicitLanguage?: 'crosslingual' | 'en' | 'es-mx' | 'id' | 'jp' | 'pt-br' | 'zh' | string;
319
+ /**
320
+ * 0.5 ~ 2
321
+ *
322
+ * @default 1
323
+ */
324
+ loudnessRatio?: number;
325
+ /**
326
+ * @default 24000
327
+ */
328
+ rate?: 8000 | 16000 | 24000 | number;
329
+ /**
330
+ * 0.8~2
331
+ *
332
+ * @default 1
333
+ */
334
+ speedRatio?: number;
335
+ };
336
+ request?: {
337
+ cacheConfig?: Record<string, unknown>;
338
+ disableMarkdownFilter?: boolean;
339
+ enableLatexTone?: boolean;
340
+ extraParam?: string;
341
+ reqid?: string;
342
+ /**
343
+ * 0 ~ 30000ms
344
+ */
345
+ silenceDuration?: number;
346
+ /**
347
+ * - set to `ssml` to use SSML
348
+ */
349
+ textType?: 'ssml' | string;
350
+ useCache?: boolean;
351
+ withTimestamp?: string;
352
+ };
353
+ user?: {
354
+ uid?: string;
355
+ };
356
+ }
357
+ /**
358
+ * [Volcengine / 火山引擎](https://www.volcengine.com/docs/6561/162929) provider for [UnSpeech](https://github.com/moeru-ai/unspeech)
359
+ * only.
360
+ *
361
+  * [UnSpeech](https://github.com/moeru-ai/unspeech) is an open-source project that provides an
362
+ * OpenAI-compatible audio & speech related API that can be used with various providers such
363
+ * as ElevenLabs, Azure TTS, Google TTS, etc.
364
+ *
365
+ * @param apiKey - Volcano Engine Speech Service Token
366
+ * @param baseURL - UnSpeech Instance URL
367
+ * @returns SpeechProviderWithExtraOptions
368
+ */
369
+ declare function createUnVolcengine(apiKey: string, baseURL?: string): SpeechProviderWithExtraOptions<"volcengine/v1", UnVolcengineOptions> & VoiceProviderWithExtraOptions<UnVolcengineOptions>;
370
+ //#endregion
371
+ //#region src/backend/index.d.ts
372
+ /** @see {@link https://github.com/moeru-ai/unspeech} */
373
+ declare function createUnSpeech(apiKey: string, baseURL?: string): _xsai_ext_providers_utils0.SpeechProviderWithExtraOptions<`alibaba/${string}` | `aliyun/${string}` | `deepgram/${string}` | `elevenlabs/${string}` | `koemotion/${string}` | `openai/${string}` | `volcano/${string}` | `volcengine/${string}`, UnSpeechOptions> & VoiceProviderWithExtraOptions<{
374
+ appId: string;
375
+ backend: "volcano";
376
+ } | {
377
+ appId: string;
378
+ backend: "volcengine";
379
+ } | {
380
+ backend: "azure" | "microsoft";
381
+ region: MicrosoftRegions | string;
382
+ } | {
383
+ backend: "ali" | "alibaba" | "alibaba-model-studio" | "aliyun" | "bailian" | "deepgram" | "elevenlabs" | "koemotion" | "openai";
384
+ }>;
385
+ //#endregion
386
+ //#region src/utils/list-voices.d.ts
387
+ interface ListVoicesOptions extends Omit<CommonRequestOptions, 'model'> {
388
+ query?: string;
389
+ }
390
+ interface ListVoicesResponse {
391
+ voices: Voice[];
392
+ }
393
+ declare function listVoices(options: ListVoicesOptions): Promise<Voice[]>;
394
+ //#endregion
395
+ export { ListVoicesOptions, ListVoicesResponse, MicrosoftRegions, UnAlibabaCloudOptions, UnDeepgramOptions, UnElevenLabsOptions, UnMicrosoftOptionAutoSSML, UnMicrosoftOptionCommon, UnMicrosoftOptionCustomSSML, UnMicrosoftOptions, UnSpeechOptions, UnVolcengineOptions, Voice, VoiceFormat, VoiceLanguage, VoiceProvider, VoiceProviderWithExtraOptions, createUnAlibabaCloud, createUnDeepgram, createUnElevenLabs, createUnMicrosoft, createUnSpeech, createUnVolcengine, listVoices };