PyPI - xinference - Versions diffs - 1.0.1__py3-none-any.whl → 1.2.1__py3-none-any.whl - Mend

xinference 1.0.1__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (343) hide show

xinference/thirdparty/fish_speech/tools/webui/inference.py ADDED Viewed

@@ -0,0 +1,91 @@
+import html
+from functools import partial
+from typing import Any, Callable
+from fish_speech.i18n import i18n
+from tools.schema import ServeReferenceAudio, ServeTTSRequest
+def inference_wrapper(
+    text,
+    normalize,
+    reference_id,
+    reference_audio,
+    reference_text,
+    max_new_tokens,
+    chunk_length,
+    top_p,
+    repetition_penalty,
+    temperature,
+    seed,
+    use_memory_cache,
+    engine,
+):
+    """
+    Wrapper for the inference function.
+    Used in the Gradio interface.
+    Builds a ServeTTSRequest from the arguments and consumes
+    engine.inference(req) until a "final" or "error" result appears.
+    Returns a 2-tuple: (result.audio, None) on "final",
+    (None, HTML error block) on "error", and
+    (None, i18n("No audio generated")) if neither code is ever seen.
+    """
+    # Only read the reference clip from disk when one was supplied.
+    if reference_audio:
+        references = get_reference_audio(reference_audio, reference_text)
+    else:
+        references = []
+    # A falsy reference_id is sent as None. A falsy seed (including 0) is
+    # also sent as None; any other seed is coerced to int.
+    req = ServeTTSRequest(
+        text=text,
+        normalize=normalize,
+        reference_id=reference_id if reference_id else None,
+        references=references,
+        max_new_tokens=max_new_tokens,
+        chunk_length=chunk_length,
+        top_p=top_p,
+        repetition_penalty=repetition_penalty,
+        temperature=temperature,
+        seed=int(seed) if seed else None,
+        use_memory_cache=use_memory_cache,
+    )
+    # Stop at the first terminal result; every other result code is skipped.
+    for result in engine.inference(req):
+        match result.code:
+            case "final":
+                return result.audio, None
+            case "error":
+                return None, build_html_error_message(i18n(result.error))
+            case _:
+                pass
+    # The engine finished without yielding a "final" or "error" result.
+    return None, i18n("No audio generated")
+def get_reference_audio(reference_audio: str, reference_text: str) -> list:
+    """
+    Get the reference audio bytes.
+    Opens the file at path ``reference_audio`` in binary mode, reads it fully,
+    and returns a one-element list holding a ServeReferenceAudio built from
+    those bytes and ``reference_text``.
+    """
+    with open(reference_audio, "rb") as audio_file:
+        audio_bytes = audio_file.read()
+    return [ServeReferenceAudio(audio=audio_bytes, text=reference_text)]
+def build_html_error_message(error: Any) -> str:
+    """
+    Render an error as a red, bold HTML <div>.
+    The message text is str(error), HTML-escaped before interpolation.
+    Any value that is not an Exception instance is replaced by
+    Exception("Unknown error").
+    """
+    # NOTE(review): a plain string message is not an Exception, so it would be
+    # displayed as "Unknown error" - confirm callers always pass exceptions.
+    error = error if isinstance(error, Exception) else Exception("Unknown error")
+    return f"""
+    <div style="color: red;
+    font-weight: bold;">
+        {html.escape(str(error))}
+    </div>
+    """
+def get_inference_wrapper(engine) -> Callable:
+    """
+    Get the inference function with the immutable arguments.
+    Returns a functools.partial of inference_wrapper with ``engine`` bound as
+    a keyword argument; the remaining parameters are supplied by the caller.
+    """
+    return partial(
+        inference_wrapper,
+        engine=engine,
+    )

xinference/thirdparty/fish_speech/tools/webui/variables.py ADDED Viewed

@@ -0,0 +1,14 @@
+from fish_speech.i18n import i18n
+# Markdown header text. Each sentence is passed through i18n() inside the
+# f-string, so the lookups happen once, when this module is imported.
+HEADER_MD = f"""# Fish Speech
+{i18n("A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).")}
+{i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).")}
+{i18n("Related code and weights are released under CC BY-NC-SA 4.0 License.")}
+{i18n("We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.")}
+"""
+# Placeholder string, also resolved through i18n() at import time.
+TEXTBOX_PLACEHOLDER = i18n("Put your text here.")

xinference/thirdparty/matcha/utils/utils.py CHANGED Viewed

@@ -7,10 +7,10 @@ from pathlib import Path
 from typing import Any, Callable, Dict, Tuple
 import gdown
-import matplotlib.pyplot as plt
+# import matplotlib.pyplot as plt
 import numpy as np
 import torch
-import wget
+# import wget
 from omegaconf import DictConfig
 from matcha.utils import pylogger, rich_utils

xinference/thirdparty/melo/api.py ADDED Viewed

@@ -0,0 +1,135 @@
+import os
+import re
+import json
+import torch
+import librosa
+import soundfile
+import torchaudio
+import numpy as np
+import torch.nn as nn
+from tqdm import tqdm
+import torch
+from . import utils
+from . import commons
+from .models import SynthesizerTrn
+from .split_utils import split_sentence
+from .mel_processing import spectrogram_torch, spectrogram_torch_conv
+from .download_utils import load_or_download_config, load_or_download_model
+class TTS(nn.Module):
+    """
+    Text-to-speech wrapper around a SynthesizerTrn model.
+    The constructor loads the hyper-parameters and checkpoint for a language;
+    tts_to_file() synthesizes text sentence by sentence and either returns the
+    waveform or writes it with soundfile.
+    """
+    def __init__(self,
+                language,
+                device='auto',
+                use_hf=True,
+                config_path=None,
+                ckpt_path=None):
+        """
+        Load config and weights for ``language`` and place the model on ``device``.
+        With device='auto' the choice starts at 'cpu', becomes 'cuda' when
+        torch.cuda.is_available(), and becomes 'mps' when
+        torch.backends.mps.is_available() (checked last, so it wins if both hold).
+        """
+        super().__init__()
+        if device == 'auto':
+            device = 'cpu'
+            if torch.cuda.is_available(): device = 'cuda'
+            if torch.backends.mps.is_available(): device = 'mps'
+        # Fail early when a CUDA device was requested but CUDA is unavailable.
+        if 'cuda' in device:
+            assert torch.cuda.is_available()
+        # config_path =
+        hps = load_or_download_config(language, use_hf=use_hf, config_path=config_path)
+        num_languages = hps.num_languages
+        num_tones = hps.num_tones
+        symbols = hps.symbols
+        model = SynthesizerTrn(
+            len(symbols),
+            hps.data.filter_length // 2 + 1,
+            hps.train.segment_size // hps.data.hop_length,
+            n_speakers=hps.data.n_speakers,
+            num_tones=num_tones,
+            num_languages=num_languages,
+            **hps.model,
+        ).to(device)
+        model.eval()
+        self.model = model
+        # Map each symbol to its index in hps.symbols.
+        self.symbol_to_id = {s: i for i, s in enumerate(symbols)}
+        self.hps = hps
+        self.device = device
+        # load state_dict
+        checkpoint_dict = load_or_download_model(language, device, use_hf=use_hf, ckpt_path=ckpt_path)
+        self.model.load_state_dict(checkpoint_dict['model'], strict=True)
+        # Keep only the part of the language code before the first underscore.
+        language = language.split('_')[0]
+        self.language = 'ZH_MIX_EN' if language == 'ZH' else language # we support a ZH_MIX_EN model
+    @staticmethod
+    def audio_numpy_concat(segment_data_list, sr, speed=1.):
+        """
+        Flatten and join the segments into one float32 array.
+        After every segment (including the last) int((sr * 0.05) / speed)
+        zero samples are appended.
+        """
+        audio_segments = []
+        for segment_data in segment_data_list:
+            audio_segments += segment_data.reshape(-1).tolist()
+            audio_segments += [0] * int((sr * 0.05) / speed)
+        audio_segments = np.array(audio_segments).astype(np.float32)
+        return audio_segments
+    @staticmethod
+    def split_sentences_into_pieces(text, language, quiet=False):
+        """
+        Split ``text`` with split_sentence() and return the pieces.
+        Unless ``quiet`` is true, the pieces are also printed to stdout.
+        """
+        texts = split_sentence(text, language_str=language)
+        if not quiet:
+            print(" > Text split to sentences.")
+            print('\n'.join(texts))
+            print(" > ===========================")
+        return texts
+    def tts_to_file(self, text, speaker_id, output_path=None, sdp_ratio=0.2, noise_scale=0.6, noise_scale_w=0.8, speed=1.0, pbar=None, format=None, position=None, quiet=False,):
+        """
+        Synthesize ``text`` for ``speaker_id``.
+        Returns the concatenated float32 waveform when ``output_path`` is None.
+        Otherwise the waveform is written with soundfile.write (passing
+        ``format`` only when it is truthy) and nothing is returned.
+        """
+        language = self.language
+        texts = self.split_sentences_into_pieces(text, language, quiet)
+        audio_list = []
+        # Choose the iterable wrapper: a caller-supplied pbar takes priority,
+        # then tqdm at a given position, then the bare list when quiet, else
+        # plain tqdm. A position of 0 is falsy and falls through to the
+        # quiet/else branches.
+        if pbar:
+            tx = pbar(texts)
+        else:
+            if position:
+                tx = tqdm(texts, position=position)
+            elif quiet:
+                tx = texts
+            else:
+                tx = tqdm(texts)
+        for t in tx:
+            # Insert a space wherever a lowercase letter is directly followed
+            # by an uppercase one.
+            if language in ['EN', 'ZH_MIX_EN']:
+                t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
+            device = self.device
+            bert, ja_bert, phones, tones, lang_ids = utils.get_text_for_tts_infer(t, language, self.hps, device, self.symbol_to_id)
+            with torch.no_grad():
+                # Move each input to the device and add a leading dimension of size 1.
+                x_tst = phones.to(device).unsqueeze(0)
+                tones = tones.to(device).unsqueeze(0)
+                lang_ids = lang_ids.to(device).unsqueeze(0)
+                bert = bert.to(device).unsqueeze(0)
+                ja_bert = ja_bert.to(device).unsqueeze(0)
+                x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
+                del phones
+                speakers = torch.LongTensor([speaker_id]).to(device)
+                # length_scale is the reciprocal of the requested speed.
+                audio = self.model.infer(
+                        x_tst,
+                        x_tst_lengths,
+                        speakers,
+                        tones,
+                        lang_ids,
+                        bert,
+                        ja_bert,
+                        sdp_ratio=sdp_ratio,
+                        noise_scale=noise_scale,
+                        noise_scale_w=noise_scale_w,
+                        length_scale=1. / speed,
+                    )[0][0, 0].data.cpu().float().numpy()
+                # Drop the references to this sentence's input tensors.
+                del x_tst, tones, lang_ids, bert, ja_bert, x_tst_lengths, speakers
+                #
+            audio_list.append(audio)
+        # Called once after the loop, whichever device is in use.
+        torch.cuda.empty_cache()
+        audio = self.audio_numpy_concat(audio_list, sr=self.hps.data.sampling_rate, speed=speed)
+        if output_path is None:
+            return audio
+        else:
+            if format:
+                soundfile.write(output_path, audio, self.hps.data.sampling_rate, format=format)
+            else:
+                soundfile.write(output_path, audio, self.hps.data.sampling_rate)

xinference/thirdparty/melo/app.py ADDED Viewed

@@ -0,0 +1,61 @@
+# WebUI by mrfakename <X @realmrfakename / HF @mrfakename>
+# Demo also available on HF Spaces: https://huggingface.co/spaces/mrfakename/MeloTTS
+import gradio as gr
+import os, torch, io
+# os.system('python -m unidic download')
+print("Make sure you've downloaded unidic (python -m unidic download) for this WebUI to work.")
+from melo.api import TTS
+# Module-level default; the name `speed` is rebound to the Gradio slider
+# inside the `gr.Blocks()` section further down.
+speed = 1.0
+import tempfile
+import click
+device = 'auto'
+# One TTS instance per supported language, all constructed at import time.
+models = {
+    'EN': TTS(language='EN', device=device),
+    'ES': TTS(language='ES', device=device),
+    'FR': TTS(language='FR', device=device),
+    'ZH': TTS(language='ZH', device=device),
+    'JP': TTS(language='JP', device=device),
+    'KR': TTS(language='KR', device=device),
+}
+# Speaker-name -> id mapping of the English model; used for the initial dropdown.
+speaker_ids = models['EN'].hps.data.spk2id
+# Sample sentence shown in the text box for each language.
+default_text_dict = {
+    'EN': 'The field of text-to-speech has seen rapid development recently.',
+    'ES': 'El campo de la conversión de texto a voz ha experimentado un rápido desarrollo recientemente.',
+    'FR': 'Le domaine de la synthèse vocale a connu un développement rapide récemment',
+    'ZH': 'text-to-speech 领域近年来发展迅速',
+    'JP': 'テキスト読み上げの分野は最近急速な発展を遂げています',
+    'KR': '최근 텍스트 음성 변환 분야가 급속도로 발전하고 있습니다.',
+}
+def synthesize(speaker, text, speed, language, progress=gr.Progress()):
+    """
+    Synthesize ``text`` with the model for ``language`` and return WAV bytes.
+    The speaker name is mapped to its id through that model's spk2id table;
+    audio is written into an in-memory buffer using progress.tqdm as the
+    progress wrapper.
+    """
+    bio = io.BytesIO()
+    models[language].tts_to_file(text, models[language].hps.data.spk2id[speaker], bio, speed=speed, pbar=progress.tqdm, format='wav')
+    return bio.getvalue()
+def load_speakers(language, text):
+    """
+    Refresh the speaker dropdown and text box after a language change.
+    If ``text`` is one of the default sample sentences it is swapped for the
+    sample of the selected language; any other text is kept unchanged.
+    Returns (gr.update listing that language's speakers with the first one
+    selected, the resulting text).
+    """
+    if text in list(default_text_dict.values()):
+        newtext = default_text_dict[language]
+    else:
+        newtext = text
+    return gr.update(value=list(models[language].hps.data.spk2id.keys())[0], choices=list(models[language].hps.data.spk2id.keys())), newtext
+# Build the Gradio UI: input controls, a synthesize button and an audio output.
+with gr.Blocks() as demo:
+    gr.Markdown('# MeloTTS WebUI\n\nA WebUI for MeloTTS.')
+    with gr.Group():
+        speaker = gr.Dropdown(speaker_ids.keys(), interactive=True, value='EN-US', label='Speaker')
+        language = gr.Radio(['EN', 'ES', 'FR', 'ZH', 'JP', 'KR'], label='Language', value='EN')
+        # Rebinds the module-level name `speed` to the slider component.
+        speed = gr.Slider(label='Speed', minimum=0.1, maximum=10.0, value=1.0, interactive=True, step=0.1)
+        text = gr.Textbox(label="Text to speak", value=default_text_dict['EN'])
+        # Changing the language updates the speaker list and, possibly, the sample text.
+        language.input(load_speakers, inputs=[language, text], outputs=[speaker, text])
+    btn = gr.Button('Synthesize', variant='primary')
+    aud = gr.Audio(interactive=False)
+    btn.click(synthesize, inputs=[speaker, text, speed, language], outputs=[aud])
+    gr.Markdown('WebUI by [mrfakename](https://twitter.com/realmrfakename).')
+# Command-line entry point: launches the Gradio demo with the given share,
+# host and port settings. Documented with comments rather than a docstring
+# because click shows a command's docstring as its --help text.
+@click.command()
+@click.option('--share', '-s', is_flag=True, show_default=True, default=False, help="Expose a publicly-accessible shared Gradio link usable by anyone with the link. Only share the link with people you trust.")
+@click.option('--host', '-h', default=None)
+@click.option('--port', '-p', type=int, default=None)
+def main(share, host, port):
+    # Queue is enabled with api_open=False, and show_api=False is passed to launch().
+    demo.queue(api_open=False).launch(show_api=False, share=share, server_name=host, server_port=port)
+# Run the CLI only when the file is executed as a script.
+if __name__ == "__main__":
+    main()

xinference 1.0.1__py3-none-any.whl → 1.2.1__py3-none-any.whl

Potentially problematic release.

xinference 1.0.1__py3-none-any.whl → 1.2.1__py3-none-any.whl