PyPI - xinference - Versions diffs - 0.14.2__py3-none-any.whl → 0.14.4__py3-none-any.whl - Mend

xinference 0.14.2__py3-none-any.whl → 0.14.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (191) hide show

xinference/thirdparty/matcha/onnx/export.py ADDED Viewed

@@ -0,0 +1,181 @@
+import argparse
+import random
+from pathlib import Path
+import numpy as np
+import torch
+from lightning import LightningModule
+from matcha.cli import VOCODER_URLS, load_matcha, load_vocoder
+# Default ONNX opset version; used as the default of the `--opset` CLI option in main().
+DEFAULT_OPSET = 15
+# Fixed seed applied at import time to every random number generator below.
+SEED = 1234
+random.seed(SEED)
+np.random.seed(SEED)
+torch.manual_seed(SEED)
+torch.cuda.manual_seed(SEED)
+# cuDNN settings: request deterministic kernels and turn benchmark mode off.
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = False
+class MatchaWithVocoder(LightningModule):
+    def __init__(self, matcha, vocoder):
+        super().__init__()
+        self.matcha = matcha
+        self.vocoder = vocoder
+    def forward(self, x, x_lengths, scales, spks=None):
+        mel, mel_lengths = self.matcha(x, x_lengths, scales, spks)
+        wavs = self.vocoder(mel).clamp(-1, 1)
+        lengths = mel_lengths * 256
+        return wavs.squeeze(1), lengths
+def get_exportable_module(matcha, vocoder, n_timesteps):
+    """
+    Return an appropriate `LighteningModule` and output-node names
+    based on whether the vocoder is embedded in  the final graph
+    """
+    def onnx_forward_func(x, x_lengths, scales, spks=None):
+        """
+        Custom forward function for accepting
+        scaler parameters as tensors
+        """
+        # Extract scaler parameters from tensors
+        temperature = scales[0]
+        length_scale = scales[1]
+        output = matcha.synthesise(x, x_lengths, n_timesteps, temperature, spks, length_scale)
+        return output["mel"], output["mel_lengths"]
+    # Monkey-patch Matcha's forward function
+    matcha.forward = onnx_forward_func
+    if vocoder is None:
+        model, output_names = matcha, ["mel", "mel_lengths"]
+    else:
+        model = MatchaWithVocoder(matcha, vocoder)
+        output_names = ["wav", "wav_lengths"]
+    return model, output_names
+def get_inputs(is_multi_speaker):
+    """
+    Create dummy inputs for tracing
+    """
+    dummy_input_length = 50
+    x = torch.randint(low=0, high=20, size=(1, dummy_input_length), dtype=torch.long)
+    x_lengths = torch.LongTensor([dummy_input_length])
+    # Scales
+    temperature = 0.667
+    length_scale = 1.0
+    scales = torch.Tensor([temperature, length_scale])
+    model_inputs = [x, x_lengths, scales]
+    input_names = [
+        "x",
+        "x_lengths",
+        "scales",
+    ]
+    if is_multi_speaker:
+        spks = torch.LongTensor([1])
+        model_inputs.append(spks)
+        input_names.append("spks")
+    return tuple(model_inputs), input_names
+def main():
+    parser = argparse.ArgumentParser(description="Export 🍵 Matcha-TTS to ONNX")
+    parser.add_argument(
+        "checkpoint_path",
+        type=str,
+        help="Path to the model checkpoint",
+    )
+    parser.add_argument("output", type=str, help="Path to output `.onnx` file")
+    parser.add_argument(
+        "--n-timesteps", type=int, default=5, help="Number of steps to use for reverse diffusion in decoder (default 5)"
+    )
+    parser.add_argument(
+        "--vocoder-name",
+        type=str,
+        choices=list(VOCODER_URLS.keys()),
+        default=None,
+        help="Name of the vocoder to embed in the ONNX graph",
+    )
+    parser.add_argument(
+        "--vocoder-checkpoint-path",
+        type=str,
+        default=None,
+        help="Vocoder checkpoint to embed  in the ONNX graph for an `e2e` like experience",
+    )
+    parser.add_argument("--opset", type=int, default=DEFAULT_OPSET, help="ONNX opset version to use (default 15")
+    args = parser.parse_args()
+    print(f"[🍵] Loading Matcha checkpoint from {args.checkpoint_path}")
+    print(f"Setting n_timesteps to {args.n_timesteps}")
+    checkpoint_path = Path(args.checkpoint_path)
+    matcha = load_matcha(checkpoint_path.stem, checkpoint_path, "cpu")
+    if args.vocoder_name or args.vocoder_checkpoint_path:
+        assert (
+            args.vocoder_name and args.vocoder_checkpoint_path
+        ), "Both vocoder_name and vocoder-checkpoint are required when embedding the vocoder in the ONNX graph."
+        vocoder, _ = load_vocoder(args.vocoder_name, args.vocoder_checkpoint_path, "cpu")
+    else:
+        vocoder = None
+    is_multi_speaker = matcha.n_spks > 1
+    dummy_input, input_names = get_inputs(is_multi_speaker)
+    model, output_names = get_exportable_module(matcha, vocoder, args.n_timesteps)
+    # Set dynamic shape for inputs/outputs
+    dynamic_axes = {
+        "x": {0: "batch_size", 1: "time"},
+        "x_lengths": {0: "batch_size"},
+    }
+    if vocoder is None:
+        dynamic_axes.update(
+            {
+                "mel": {0: "batch_size", 2: "time"},
+                "mel_lengths": {0: "batch_size"},
+            }
+        )
+    else:
+        print("Embedding the vocoder in the ONNX graph")
+        dynamic_axes.update(
+            {
+                "wav": {0: "batch_size", 1: "time"},
+                "wav_lengths": {0: "batch_size"},
+            }
+        )
+    if is_multi_speaker:
+        dynamic_axes["spks"] = {0: "batch_size"}
+    # Create the output directory (if not exists)
+    Path(args.output).parent.mkdir(parents=True, exist_ok=True)
+    model.to_onnx(
+        args.output,
+        dummy_input,
+        input_names=input_names,
+        output_names=output_names,
+        dynamic_axes=dynamic_axes,
+        opset_version=args.opset,
+        export_params=True,
+        do_constant_folding=True,
+    )
+    print(f"[🍵] ONNX model exported to  {args.output}")
+if __name__ == "__main__":
+    main()

xinference/thirdparty/matcha/onnx/infer.py ADDED Viewed

@@ -0,0 +1,168 @@
+import argparse
+import os
+import warnings
+from pathlib import Path
+from time import perf_counter
+import numpy as np
+import onnxruntime as ort
+import soundfile as sf
+import torch
+from matcha.cli import plot_spectrogram_to_numpy, process_text
+def validate_args(args):
+    assert (
+        args.text or args.file
+    ), "Either text or file must be provided Matcha-T(ea)TTS need sometext to whisk the waveforms."
+    assert args.temperature >= 0, "Sampling temperature cannot be negative"
+    assert args.speaking_rate >= 0, "Speaking rate must be greater than 0"
+    return args
+def write_wavs(model, inputs, output_dir, external_vocoder=None):
+    if external_vocoder is None:
+        print("The provided model has the vocoder embedded in the graph.\nGenerating waveform directly")
+        t0 = perf_counter()
+        wavs, wav_lengths = model.run(None, inputs)
+        infer_secs = perf_counter() - t0
+        mel_infer_secs = vocoder_infer_secs = None
+    else:
+        print("[🍵] Generating mel using Matcha")
+        mel_t0 = perf_counter()
+        mels, mel_lengths = model.run(None, inputs)
+        mel_infer_secs = perf_counter() - mel_t0
+        print("Generating waveform from mel using external vocoder")
+        vocoder_inputs = {external_vocoder.get_inputs()[0].name: mels}
+        vocoder_t0 = perf_counter()
+        wavs = external_vocoder.run(None, vocoder_inputs)[0]
+        vocoder_infer_secs = perf_counter() - vocoder_t0
+        wavs = wavs.squeeze(1)
+        wav_lengths = mel_lengths * 256
+        infer_secs = mel_infer_secs + vocoder_infer_secs
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    for i, (wav, wav_length) in enumerate(zip(wavs, wav_lengths)):
+        output_filename = output_dir.joinpath(f"output_{i + 1}.wav")
+        audio = wav[:wav_length]
+        print(f"Writing audio to {output_filename}")
+        sf.write(output_filename, audio, 22050, "PCM_24")
+    wav_secs = wav_lengths.sum() / 22050
+    print(f"Inference seconds: {infer_secs}")
+    print(f"Generated wav seconds: {wav_secs}")
+    rtf = infer_secs / wav_secs
+    if mel_infer_secs is not None:
+        mel_rtf = mel_infer_secs / wav_secs
+        print(f"Matcha RTF: {mel_rtf}")
+    if vocoder_infer_secs is not None:
+        vocoder_rtf = vocoder_infer_secs / wav_secs
+        print(f"Vocoder RTF: {vocoder_rtf}")
+    print(f"Overall RTF: {rtf}")
+def write_mels(model, inputs, output_dir):
+    t0 = perf_counter()
+    mels, mel_lengths = model.run(None, inputs)
+    infer_secs = perf_counter() - t0
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    for i, mel in enumerate(mels):
+        output_stem = output_dir.joinpath(f"output_{i + 1}")
+        plot_spectrogram_to_numpy(mel.squeeze(), output_stem.with_suffix(".png"))
+        np.save(output_stem.with_suffix(".numpy"), mel)
+    wav_secs = (mel_lengths * 256).sum() / 22050
+    print(f"Inference seconds: {infer_secs}")
+    print(f"Generated wav seconds: {wav_secs}")
+    rtf = infer_secs / wav_secs
+    print(f"RTF: {rtf}")
+def main():
+    parser = argparse.ArgumentParser(
+        description=" 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching"
+    )
+    parser.add_argument(
+        "model",
+        type=str,
+        help="ONNX model to use",
+    )
+    parser.add_argument("--vocoder", type=str, default=None, help="Vocoder to use (defaults to None)")
+    parser.add_argument("--text", type=str, default=None, help="Text to synthesize")
+    parser.add_argument("--file", type=str, default=None, help="Text file to synthesize")
+    parser.add_argument("--spk", type=int, default=None, help="Speaker ID")
+    parser.add_argument(
+        "--temperature",
+        type=float,
+        default=0.667,
+        help="Variance of the x0 noise (default: 0.667)",
+    )
+    parser.add_argument(
+        "--speaking-rate",
+        type=float,
+        default=1.0,
+        help="change the speaking rate, a higher value means slower speaking rate (default: 1.0)",
+    )
+    parser.add_argument("--gpu", action="store_true", help="Use CPU for inference (default: use GPU if available)")
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=os.getcwd(),
+        help="Output folder to save results (default: current dir)",
+    )
+    args = parser.parse_args()
+    args = validate_args(args)
+    if args.gpu:
+        providers = ["GPUExecutionProvider"]
+    else:
+        providers = ["CPUExecutionProvider"]
+    model = ort.InferenceSession(args.model, providers=providers)
+    model_inputs = model.get_inputs()
+    model_outputs = list(model.get_outputs())
+    if args.text:
+        text_lines = args.text.splitlines()
+    else:
+        with open(args.file, encoding="utf-8") as file:
+            text_lines = file.read().splitlines()
+    processed_lines = [process_text(0, line, "cpu") for line in text_lines]
+    x = [line["x"].squeeze() for line in processed_lines]
+    # Pad
+    x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True)
+    x = x.detach().cpu().numpy()
+    x_lengths = np.array([line["x_lengths"].item() for line in processed_lines], dtype=np.int64)
+    inputs = {
+        "x": x,
+        "x_lengths": x_lengths,
+        "scales": np.array([args.temperature, args.speaking_rate], dtype=np.float32),
+    }
+    is_multi_speaker = len(model_inputs) == 4
+    if is_multi_speaker:
+        if args.spk is None:
+            args.spk = 0
+            warn = "[!] Speaker ID not provided! Using speaker ID 0"
+            warnings.warn(warn, UserWarning)
+        inputs["spks"] = np.repeat(args.spk, x.shape[0]).astype(np.int64)
+    has_vocoder_embedded = model_outputs[0].name == "wav"
+    if has_vocoder_embedded:
+        write_wavs(model, inputs, args.output_dir)
+    elif args.vocoder:
+        external_vocoder = ort.InferenceSession(args.vocoder, providers=providers)
+        write_wavs(model, inputs, args.output_dir, external_vocoder=external_vocoder)
+    else:
+        warn = "[!] A vocoder is not embedded in the graph nor an external vocoder is provided. The mel output will be written as numpy arrays to `*.npy` files in the output directory"
+        warnings.warn(warn, UserWarning)
+        write_mels(model, inputs, args.output_dir)
+if __name__ == "__main__":
+    main()

xinference/thirdparty/matcha/text/__init__.py ADDED Viewed

@@ -0,0 +1,53 @@
+""" from https://github.com/keithito/tacotron """
+from matcha.text import cleaners
+from matcha.text.symbols import symbols
+# Mappings from symbol to numeric ID and vice versa:
+_symbol_to_id = {s: i for i, s in enumerate(symbols)}
+_id_to_symbol = {i: s for i, s in enumerate(symbols)}  # pylint: disable=unnecessary-comprehension
+def text_to_sequence(text, cleaner_names):
+    """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+    Args:
+      text: string to convert to a sequence
+      cleaner_names: names of the cleaner functions to run the text through
+    Returns:
+      List of integers corresponding to the symbols in the text
+    """
+    sequence = []
+    clean_text = _clean_text(text, cleaner_names)
+    for symbol in clean_text:
+        symbol_id = _symbol_to_id[symbol]
+        sequence += [symbol_id]
+    return sequence, clean_text
+def cleaned_text_to_sequence(cleaned_text):
+    """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+    Args:
+      text: string to convert to a sequence
+    Returns:
+      List of integers corresponding to the symbols in the text
+    """
+    sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
+    return sequence
+def sequence_to_text(sequence):
+    """Converts a sequence of IDs back to a string"""
+    result = ""
+    for symbol_id in sequence:
+        s = _id_to_symbol[symbol_id]
+        result += s
+    return result
+def _clean_text(text, cleaner_names):
+    for name in cleaner_names:
+        cleaner = getattr(cleaners, name)
+        if not cleaner:
+            raise Exception("Unknown cleaner: %s" % name)
+        text = cleaner(text)
+    return text

xinference/thirdparty/matcha/text/cleaners.py ADDED Viewed

@@ -0,0 +1,121 @@
+""" from https://github.com/keithito/tacotron
+Cleaners are transformations that run over the input text at both training and eval time.
+Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
+hyperparameter. Some cleaners are English-specific. You'll typically want to use:
+  1. "english_cleaners" for English text
+  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
+     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
+  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
+     the symbols in symbols.py to match your data).
+"""
+import logging
+import re
+import phonemizer
+from unidecode import unidecode
+# To avoid excessive logging we set the log level of the phonemizer package to Critical
+critical_logger = logging.getLogger("phonemizer")
+critical_logger.setLevel(logging.CRITICAL)
+# Initializing the phonemizer once, globally, significantly reduces the run time:
+# the phonemizer is no longer initialised at every call.
+# Might be less flexible, but it is much-much faster
+global_phonemizer = phonemizer.backend.EspeakBackend(
+    language="en-us",
+    preserve_punctuation=True,
+    with_stress=True,
+    language_switch="remove-flags",
+    logger=critical_logger,
+)
+# Regular expression matching whitespace:
+_whitespace_re = re.compile(r"\s+")
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = [
+    (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+    for x in [
+        ("mrs", "misess"),
+        ("mr", "mister"),
+        ("dr", "doctor"),
+        ("st", "saint"),
+        ("co", "company"),
+        ("jr", "junior"),
+        ("maj", "major"),
+        ("gen", "general"),
+        ("drs", "doctors"),
+        ("rev", "reverend"),
+        ("lt", "lieutenant"),
+        ("hon", "honorable"),
+        ("sgt", "sergeant"),
+        ("capt", "captain"),
+        ("esq", "esquire"),
+        ("ltd", "limited"),
+        ("col", "colonel"),
+        ("ft", "fort"),
+    ]
+]
+def expand_abbreviations(text):
+    for regex, replacement in _abbreviations:
+        text = re.sub(regex, replacement, text)
+    return text
+def lowercase(text):
+    """Return `text` converted to lowercase."""
+    return text.lower()
+def collapse_whitespace(text):
+    return re.sub(_whitespace_re, " ", text)
+def convert_to_ascii(text):
+    """Return `text` passed through `unidecode` (transliteration to ASCII)."""
+    return unidecode(text)
+def basic_cleaners(text):
+    """Basic pipeline that lowercases and collapses whitespace without transliteration."""
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+def transliteration_cleaners(text):
+    """Pipeline for non-English text that transliterates to ASCII."""
+    text = convert_to_ascii(text)
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+def english_cleaners2(text):
+    """Pipeline for English text, including abbreviation expansion. + punctuation + stress"""
+    text = convert_to_ascii(text)
+    text = lowercase(text)
+    text = expand_abbreviations(text)
+    phonemes = global_phonemizer.phonemize([text], strip=True, njobs=1)[0]
+    phonemes = collapse_whitespace(phonemes)
+    return phonemes
+# I am removing this due to incompatibility with several versions of Python
+# However, if you want to use it, you can uncomment it
+# and install piper-phonemize with the following command:
+# pip install piper-phonemize
+# import piper_phonemize
+# def english_cleaners_piper(text):
+#     """Pipeline for English text, including abbreviation expansion. + punctuation + stress"""
+#     text = convert_to_ascii(text)
+#     text = lowercase(text)
+#     text = expand_abbreviations(text)
+#     phonemes = "".join(piper_phonemize.phonemize_espeak(text=text, voice="en-US")[0])
+#     phonemes = collapse_whitespace(phonemes)
+#     return phonemes

xinference/thirdparty/matcha/text/numbers.py ADDED Viewed

@@ -0,0 +1,71 @@
+""" from https://github.com/keithito/tacotron """
+import re
+import inflect
+# Shared inflect engine used to spell numbers out as words.
+_inflect = inflect.engine()
+# Digits with embedded commas, e.g. "1,234" (starts and ends with a digit).
+_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
+# Digits on both sides of a decimal point, e.g. "3.14".
+_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
+# Amount preceded by a pound sign; the amount is captured without the sign.
+_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
+# Amount preceded by a dollar sign; the capture may contain dots and commas.
+_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
+# Digits followed by an ordinal suffix, e.g. "1st", "22nd".
+_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
+# Any run of digits.
+_number_re = re.compile(r"[0-9]+")
+def _remove_commas(m):
+    return m.group(1).replace(",", "")
+def _expand_decimal_point(m):
+    return m.group(1).replace(".", " point ")
+def _expand_dollars(m):
+    match = m.group(1)
+    parts = match.split(".")
+    if len(parts) > 2:
+        return match + " dollars"
+    dollars = int(parts[0]) if parts[0] else 0
+    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
+    if dollars and cents:
+        dollar_unit = "dollar" if dollars == 1 else "dollars"
+        cent_unit = "cent" if cents == 1 else "cents"
+        return f"{dollars} {dollar_unit}, {cents} {cent_unit}"
+    elif dollars:
+        dollar_unit = "dollar" if dollars == 1 else "dollars"
+        return f"{dollars} {dollar_unit}"
+    elif cents:
+        cent_unit = "cent" if cents == 1 else "cents"
+        return f"{cents} {cent_unit}"
+    else:
+        return "zero dollars"
+def _expand_ordinal(m):
+    """Return the whole matched text of `m` converted to words by the inflect engine."""
+    return _inflect.number_to_words(m.group(0))
+def _expand_number(m):
+    num = int(m.group(0))
+    if num > 1000 and num < 3000:
+        if num == 2000:
+            return "two thousand"
+        elif num > 2000 and num < 2010:
+            return "two thousand " + _inflect.number_to_words(num % 100)
+        elif num % 100 == 0:
+            return _inflect.number_to_words(num // 100) + " hundred"
+        else:
+            return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
+    else:
+        return _inflect.number_to_words(num, andword="")
+def normalize_numbers(text):
+    text = re.sub(_comma_number_re, _remove_commas, text)
+    text = re.sub(_pounds_re, r"\1 pounds", text)
+    text = re.sub(_dollars_re, _expand_dollars, text)
+    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
+    text = re.sub(_ordinal_re, _expand_ordinal, text)
+    text = re.sub(_number_re, _expand_number, text)
+    return text

xinference/thirdparty/matcha/text/symbols.py ADDED Viewed

@@ -0,0 +1,17 @@
+""" from https://github.com/keithito/tacotron
+Defines the set of symbols used in text input to the model.
+"""
+# Padding symbol; it is placed first in `symbols`, so its ID is 0.
+_pad = "_"
+# Punctuation characters, ending with the space character.
+_punctuation = ';:,.!?¡¿—…"«»“” '
+# Upper- and lowercase ASCII letters.
+_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+# Phonetic (IPA) characters, including stress and length marks.
+_letters_ipa = (
+    "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
+)
+# Export all symbols: a symbol's ID is its position in this list.
+symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
+# Special symbol ids
+SPACE_ID = symbols.index(" ")

xinference 0.14.2__py3-none-any.whl → 0.14.4__py3-none-any.whl

Potentially problematic release.

xinference 0.14.2__py3-none-any.whl → 0.14.4__py3-none-any.whl