xinference 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +50 -1
- xinference/client/restful/restful_client.py +82 -2
- xinference/constants.py +3 -0
- xinference/core/chat_interface.py +297 -83
- xinference/core/model.py +1 -0
- xinference/core/progress_tracker.py +16 -8
- xinference/core/supervisor.py +45 -1
- xinference/core/worker.py +262 -37
- xinference/deploy/cmdline.py +33 -1
- xinference/model/audio/core.py +11 -1
- xinference/model/audio/megatts.py +105 -0
- xinference/model/audio/model_spec.json +24 -1
- xinference/model/audio/model_spec_modelscope.json +26 -1
- xinference/model/core.py +14 -0
- xinference/model/embedding/core.py +6 -1
- xinference/model/flexible/core.py +6 -1
- xinference/model/image/core.py +6 -1
- xinference/model/image/model_spec.json +17 -1
- xinference/model/image/model_spec_modelscope.json +17 -1
- xinference/model/llm/__init__.py +0 -4
- xinference/model/llm/core.py +4 -0
- xinference/model/llm/llama_cpp/core.py +40 -16
- xinference/model/llm/llm_family.json +413 -84
- xinference/model/llm/llm_family.py +24 -1
- xinference/model/llm/llm_family_modelscope.json +447 -0
- xinference/model/llm/mlx/core.py +16 -2
- xinference/model/llm/transformers/__init__.py +14 -0
- xinference/model/llm/transformers/core.py +30 -6
- xinference/model/llm/transformers/gemma3.py +17 -2
- xinference/model/llm/transformers/intern_vl.py +28 -18
- xinference/model/llm/transformers/minicpmv26.py +21 -2
- xinference/model/llm/transformers/qwen-omni.py +308 -0
- xinference/model/llm/transformers/qwen2_audio.py +1 -1
- xinference/model/llm/transformers/qwen2_vl.py +20 -4
- xinference/model/llm/utils.py +11 -1
- xinference/model/llm/vllm/core.py +35 -0
- xinference/model/llm/vllm/distributed_executor.py +8 -2
- xinference/model/rerank/core.py +6 -1
- xinference/model/utils.py +118 -1
- xinference/model/video/core.py +6 -1
- xinference/thirdparty/megatts3/__init__.py +0 -0
- xinference/thirdparty/megatts3/tts/frontend_function.py +175 -0
- xinference/thirdparty/megatts3/tts/gradio_api.py +93 -0
- xinference/thirdparty/megatts3/tts/infer_cli.py +277 -0
- xinference/thirdparty/megatts3/tts/modules/aligner/whisper_small.py +318 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/ar_dur_predictor.py +362 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/layers.py +64 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/nar_tts_modules.py +73 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rel_transformer.py +403 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rot_transformer.py +649 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/seq_utils.py +342 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/transformer.py +767 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/cfm.py +309 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/dit.py +180 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/time_embedding.py +44 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/transformer.py +230 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/diag_gaussian.py +67 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/hifigan_modules.py +283 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/seanet_encoder.py +38 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/wavvae_v3.py +60 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/conv.py +154 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/lstm.py +51 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/seanet.py +126 -0
- xinference/thirdparty/megatts3/tts/utils/audio_utils/align.py +36 -0
- xinference/thirdparty/megatts3/tts/utils/audio_utils/io.py +95 -0
- xinference/thirdparty/megatts3/tts/utils/audio_utils/plot.py +90 -0
- xinference/thirdparty/megatts3/tts/utils/commons/ckpt_utils.py +171 -0
- xinference/thirdparty/megatts3/tts/utils/commons/hparams.py +215 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/dict.json +1 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/ph_tone_convert.py +94 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/split_text.py +90 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/text_encoder.py +280 -0
- xinference/types.py +10 -0
- xinference/utils.py +54 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.0f6523be.css +2 -0
- xinference/web/ui/build/static/css/main.0f6523be.css.map +1 -0
- xinference/web/ui/build/static/js/main.58bd483c.js +3 -0
- xinference/web/ui/build/static/js/main.58bd483c.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3bff8cbe9141f937f4d98879a9771b0f48e0e4e0dbee8e647adbfe23859e7048.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4500b1a622a031011f0a291701e306b87e08cbc749c50e285103536b85b6a914.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/69081049f0c7447544b7cfd73dd13d8846c02fe5febe4d81587e95c89a412d5b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bf2b211b0d1b6465eff512d64c869d748f803c5651a7c24e48de6ea3484a7bfe.json +1 -0
- xinference/web/ui/src/locales/en.json +2 -1
- xinference/web/ui/src/locales/zh.json +2 -1
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/METADATA +127 -114
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/RECORD +96 -60
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/WHEEL +1 -1
- xinference/web/ui/build/static/css/main.b494ae7e.css +0 -2
- xinference/web/ui/build/static/css/main.b494ae7e.css.map +0 -1
- xinference/web/ui/build/static/js/main.5ca4eea1.js +0 -3
- xinference/web/ui/build/static/js/main.5ca4eea1.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f0967acaec5df1d45b80010949c258d64297ebbb0f44b8bb3afcbd45c6f0ec4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/68249645124f37d01eef83b1d897e751f895bea919b6fb466f907c1f87cebc84.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +0 -1
- /xinference/web/ui/build/static/js/{main.5ca4eea1.js.LICENSE.txt → main.58bd483c.js.LICENSE.txt} +0 -0
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info/licenses}/LICENSE +0 -0
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/top_level.txt +0 -0
xinference/model/utils.py
CHANGED
```diff
@@ -11,17 +11,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+import asyncio
 import json
 import logging
 import os
 import random
+import threading
 from json import JSONDecodeError
 from pathlib import Path
-from typing import Any, Callable, Dict, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Optional, Set, Tuple, Type, Union
 
 import huggingface_hub
 import numpy as np
 import torch
+from tqdm.auto import tqdm
 
 from ..constants import (
     XINFERENCE_CACHE_DIR,
```
```diff
@@ -343,3 +347,116 @@ def set_all_random_seed(seed: int):
     np.random.seed(seed)
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
+
+
+class CancellableDownloader:
+    def __init__(
+        self,
+        cancel_error_cls: Type[BaseException] = asyncio.CancelledError,
+        cancelled_event: Optional[threading.Event] = None,
+    ):
+        self._cancelled = cancelled_event
+        if self._cancelled is None:
+            self._cancelled = threading.Event()
+        self._done_event = threading.Event()
+        self._cancel_error_cls = cancel_error_cls
+        self._original_update = None
+        # progress for tqdm that is main
+        self._main_progresses: Set[tqdm] = set()
+        # progress for file downloader
+        # mainly when tqdm unit is set
+        self._download_progresses: Set[tqdm] = set()
+        # tqdm original update
+        self._original_tqdm_update = None
+
+    def reset(self):
+        self._main_progresses.clear()
+        self._download_progresses.clear()
+
+    def get_progress(self) -> float:
+        if self.cancelled or self.done:
+            # directly return 1.0 when cancelled or finished
+            return 1.0
+
+        tasks = finished_tasks = 0
+        for main_progress in self._main_progresses:
+            tasks += main_progress.total or 0
+            finished_tasks += main_progress.n
+
+        if tasks == 0:
+            # we assumed at least 1 task
+            tasks = 1
+
+        finished_ratio = finished_tasks / tasks
+
+        all_download_progress = finished_download_progress = 0
+        for download_progress in self._download_progresses:
+            # we skip finished download
+            if download_progress.n == download_progress.total:
+                continue
+            all_download_progress += download_progress.total or (
+                download_progress.n * 10
+            )
+            finished_download_progress += download_progress.n
+
+        if all_download_progress > 0:
+            rest_ratio = (
+                (tasks - finished_tasks)
+                / tasks
+                * (finished_download_progress / all_download_progress)
+            )
+            return finished_ratio + rest_ratio
+        else:
+            return finished_ratio
+
+    def cancel(self):
+        self._cancelled.set()
+
+    @property
+    def cancelled(self):
+        return self._cancelled.is_set()
+
+    @property
+    def done(self):
+        return self._done_event.is_set()
+
+    def wait(self, timeout: float):
+        self._done_event.wait(timeout)
+
+    def raise_error(self, error_msg: str = "Download cancelled"):
+        raise self._cancel_error_cls(error_msg)
+
+    def patch_tqdm(self):
+        # patch tqdm
+        # raise error if cancelled
+        self._original_update = original_update = tqdm.update
+        downloader = self
+
+        def patched_update(self, n):
+            if downloader.cancelled:
+                downloader.raise_error()
+            if not self.disable:
+                progresses = (
+                    downloader._main_progresses
+                    if getattr(self, "unit", "it") == "it"
+                    else downloader._download_progresses
+                )
+                progresses.add(self)
+            return original_update(self, n)
+
+        tqdm.update = patched_update
+
+    def unpatch_tqdm(self):
+        from tqdm.auto import tqdm
+
+        if self._original_update:
+            tqdm.update = self._original_update
+
+    def __enter__(self):
+        self.patch_tqdm()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.unpatch_tqdm()
+        self._done_event.set()
+        self.reset()
```
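Taken together, `patch_tqdm` and the context-manager protocol make any tqdm-driven download that runs inside the `with` block observable and cancellable from another thread: `get_progress()` combines the main task bars with the in-flight download bars (e.g. 4 of 10 tasks done and the remaining downloads half complete gives 4/10 + 6/10 × 0.5 = 0.7). A minimal sketch of the intended flow; the `snapshot_download` workload and the thread wiring here are illustrative, not taken from this diff:

```python
import threading

import huggingface_hub

from xinference.model.utils import CancellableDownloader

downloader = CancellableDownloader(cancel_error_cls=RuntimeError)

def run_download():
    # while the context manager is active, every tqdm.update() call is
    # intercepted: the bar is registered for progress accounting, and
    # cancel_error_cls is raised as soon as cancel() has been requested
    with downloader:
        huggingface_hub.snapshot_download("some-org/some-model")

thread = threading.Thread(target=run_download)
thread.start()
print(f"progress: {downloader.get_progress():.0%}")
downloader.cancel()  # the next tqdm.update() inside the thread raises
thread.join()
```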
xinference/model/video/core.py
CHANGED
```diff
@@ -17,7 +17,7 @@ from collections import defaultdict
 from typing import Any, Dict, List, Literal, Optional, Tuple
 
 from ...constants import XINFERENCE_CACHE_DIR
-from ..core import CacheableModelSpec, ModelDescription
+from ..core import CacheableModelSpec, ModelDescription, VirtualEnvSettings
 from ..utils import valid_model_revision
 from .diffusers import DiffUsersVideoModel
 
@@ -44,6 +44,7 @@ class VideoModelFamilyV1(CacheableModelSpec):
     model_ability: Optional[List[str]]
     default_model_config: Optional[Dict[str, Any]]
     default_generate_config: Optional[Dict[str, Any]]
+    virtualenv: Optional[VirtualEnvSettings]
 
 
 class VideoModelDescription(ModelDescription):
@@ -57,6 +58,10 @@ class VideoModelDescription(ModelDescription):
         super().__init__(address, devices, model_path=model_path)
         self._model_spec = model_spec
 
+    @property
+    def spec(self):
+        return self._model_spec
+
     def to_dict(self):
         return {
             "model_type": "video",
```
xinference/thirdparty/megatts3/__init__.py
File without changes
xinference/thirdparty/megatts3/tts/frontend_function.py
ADDED
```diff
@@ -0,0 +1,175 @@
+# Copyright 2025 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn.functional as F
+import whisper
+import librosa
+from copy import deepcopy
+from tts.utils.text_utils.ph_tone_convert import split_ph_timestamp, split_ph
+from tts.utils.audio_utils.align import mel2token_to_dur
+
+''' Grapheme to phoneme function '''
+def g2p(self, text_inp):
+    # prepare inputs
+    txt_token = self.g2p_tokenizer('<BOT>' + text_inp + '<BOS>')['input_ids']
+    input_ids = torch.LongTensor([txt_token + [145 + self.speech_start_idx]]).to(self.device)
+
+    # model forward
+    with torch.cuda.amp.autocast(dtype=self.precision, enabled=True):
+        outputs = self.g2p_model.generate(input_ids, max_new_tokens=256, do_sample=True, top_k=1, eos_token_id=800 + 1 + self.speech_start_idx)
+
+    # process outputs
+    ph_tokens = outputs[:, len(txt_token):-1] - self.speech_start_idx
+    ph_pred, tone_pred = split_ph(ph_tokens[0])
+    ph_pred, tone_pred = ph_pred[None, :].to(self.device), tone_pred[None, :].to(self.device)
+    return ph_pred, tone_pred
+
+''' Get phoneme2mel align of prompt speech '''
+def align(self, wav):
+    with torch.inference_mode():
+        whisper_wav = librosa.resample(wav, orig_sr=self.sr, target_sr=16000)
+        mel = torch.FloatTensor(whisper.log_mel_spectrogram(whisper_wav).T).to(self.device)[None].transpose(1, 2)
+        prompt_max_frame = mel.size(2) // self.fm * self.fm
+        mel = mel[:, :, :prompt_max_frame]
+        token = torch.LongTensor([[798]]).to(self.device)
+        audio_features = self.aligner_lm.embed_audio(mel)
+        for i in range(768):
+            with torch.cuda.amp.autocast(dtype=self.precision, enabled=True):
+                logits = self.aligner_lm.logits(token, audio_features, None)
+            token_pred = torch.argmax(F.softmax(logits[:, -1], dim=-1), 1)[None]
+            token = torch.cat([token, token_pred], dim=1)
+            if token_pred[0] == 799:
+                break
+        alignment_tokens = token
+
+    ph_ref, tone_ref, dur_ref, _ = split_ph_timestamp(deepcopy(alignment_tokens)[0, 1:-1])
+    ph_ref = torch.Tensor(ph_ref)[None].to(self.device)
+    tone_ref = torch.Tensor(tone_ref)[None].to(self.device)
+    if dur_ref.sum() < prompt_max_frame:
+        dur_ref[-1] += prompt_max_frame - dur_ref.sum()
+    elif dur_ref.sum() > prompt_max_frame:
+        len_diff = dur_ref.sum() - prompt_max_frame
+        while True:
+            for i in range(len(dur_ref)):
+                dur_ref[i] -= 1
+                len_diff -= 1
+                if len_diff == 0:
+                    break
+            if len_diff == 0:
+                break
+    mel2ph_ref = self.length_regulator(dur_ref[None]).to(self.device)
+    mel2ph_ref = mel2ph_ref[:, :mel2ph_ref.size(1) // self.fm * self.fm]
+    return ph_ref, tone_ref, mel2ph_ref
+
+''' Duration Prompting '''
+def make_dur_prompt(self, mel2ph_ref, ph_ref, tone_ref):
+    dur_tokens_2d_ = mel2token_to_dur(mel2ph_ref, ph_ref.shape[1]).clamp(
+        max=self.hp_dur_model['dur_code_size'] - 1) + 1
+
+    ctx_dur_tokens = dur_tokens_2d_.clone().flatten(0, 1).to(self.device)
+    txt_tokens_flat_ = ph_ref.flatten(0, 1)
+    ctx_dur_tokens = ctx_dur_tokens[txt_tokens_flat_ > 0][None]
+
+    last_dur_pos_prompt = ctx_dur_tokens.shape[1]
+    dur_spk_pos_ids_flat = range(0, last_dur_pos_prompt)
+    dur_spk_pos_ids_flat = torch.LongTensor([dur_spk_pos_ids_flat]).to(self.device)
+    with torch.cuda.amp.autocast(dtype=self.precision, enabled=True):
+        _, incremental_state_dur_prompt = self.dur_model.infer(
+            ph_ref, {'tone': tone_ref}, None, None, None,
+            ctx_vqcodes=ctx_dur_tokens, spk_pos_ids_flat=dur_spk_pos_ids_flat, return_state=True)
+    return incremental_state_dur_prompt, ctx_dur_tokens
+
+''' Duration Prediction '''
+def dur_pred(self, ctx_dur_tokens, incremental_state_dur_prompt, ph_pred, tone_pred, seg_i, dur_disturb, dur_alpha, is_first, is_final):
+    last_dur_token = ctx_dur_tokens[:, -1:]
+    last_dur_pos_prompt = ctx_dur_tokens.shape[1]
+    incremental_state_dur = deepcopy(incremental_state_dur_prompt)
+    txt_len = ph_pred.shape[1]
+    dur_spk_pos_ids_flat = range(last_dur_pos_prompt, last_dur_pos_prompt + txt_len)
+    dur_spk_pos_ids_flat = torch.LongTensor([dur_spk_pos_ids_flat]).to(self.device)
+    last_dur_pos_prompt = last_dur_pos_prompt + txt_len
+
+    with torch.cuda.amp.autocast(dtype=self.precision, enabled=True):
+        dur_pred = self.dur_model.infer(
+            ph_pred, {'tone': tone_pred}, None, None, None,
+            incremental_state=incremental_state_dur,
+            first_decoder_inp=last_dur_token,
+            spk_pos_ids_flat=dur_spk_pos_ids_flat,
+        )
+
+    dur_pred = dur_pred - 1
+    dur_pred = dur_pred.clamp(0, self.hp_dur_model['dur_code_size'] - 1)
+    # if is_final:
+    #     dur_pred[:, -1] = dur_pred[:, -1].clamp(64, 128)
+    # else:
+    #     dur_pred[:, -1] = dur_pred[:, -1].clamp(48, 128)
+    # if seg_i > 0:
+    #     dur_pred[:, 0] = 0
+    # ['。', '!', '?', 'sil']
+    for sil_token in [148, 153, 166, 145]:
+        dur_pred[ph_pred == sil_token].clamp_min(32)
+    # [',', ';']
+    for sil_token in [163, 165]:
+        dur_pred[ph_pred == sil_token].clamp_min(16)
+    if not is_final:
+        # add 0.32ms for crossfade
+        dur_pred[:, -1] = dur_pred[:, -1] + 32
+    else:
+        dur_pred[:, -1] = dur_pred[:, -1].clamp(64, 128)
+
+    ''' DiT target speech generation '''
+    dur_disturb_choice = (torch.rand_like(dur_pred.float()) > 0.5).float()
+    dur_disturb_r = 1 + torch.rand_like(dur_pred.float()) * dur_disturb
+    dur_pred = dur_pred * dur_disturb_r * dur_disturb_choice + \
+        dur_pred / dur_disturb_r * (1 - dur_disturb_choice)
+    dur_pred = torch.round(dur_pred * dur_alpha).clamp(0, 127)
+    if is_first:
+        dur_pred[:, 0] = 8
+
+    dur_sum = dur_pred.sum()
+    npad = self.fm - dur_sum % self.fm
+    if npad < self.fm:
+        dur_pred[:, -1] += npad
+    mel2ph_pred = self.length_regulator(dur_pred).to(self.device)
+    return mel2ph_pred
+
+def prepare_inputs_for_dit(self, mel2ph_ref, mel2ph_pred, ph_ref, tone_ref, ph_pred, tone_pred, vae_latent):
+    # Prepare duration token
+    mel2ph_pred = torch.cat((mel2ph_ref, mel2ph_pred + ph_ref.size(1)), dim=1)
+    mel2ph_pred = mel2ph_pred[:, :mel2ph_pred.size(1) // self.fm * self.fm].repeat(3, 1)
+    # Prepare phone and tone token
+    ph_pred = torch.cat((ph_ref, ph_pred), dim=1)
+    tone_pred = torch.cat((tone_ref, tone_pred), dim=1)
+    # Disable the English tone (set them to 3)
+    en_tone_idx = ~((tone_pred == 4) | ((11 <= tone_pred) & (tone_pred <= 15)) | (tone_pred == 0))
+    tone_pred[en_tone_idx] = 3
+
+    # Prepare cfg inputs
+    ph_seq = torch.cat([ph_pred, ph_pred, torch.full(ph_pred.size(), self.cfg_mask_token_phone, device=self.device)], 0)
+    tone_seq = torch.cat([tone_pred, tone_pred, torch.full(tone_pred.size(), self.cfg_mask_token_tone, device=self.device)], 0)
+    target_size = mel2ph_pred.size(1) // self.vae_stride
+    vae_latent_ = vae_latent.repeat(3, 1, 1)
+    ctx_mask = torch.ones_like(vae_latent_[:, :, 0:1])
+    vae_latent_ = F.pad(vae_latent_, (0, 0, 0, target_size - vae_latent.size(1)), mode='constant', value=0)
+    vae_latent_[1:] = 0.0
+    ctx_mask = F.pad(ctx_mask, (0, 0, 0, target_size - vae_latent.size(1)), mode='constant', value=0)
+
+    return {
+        'phone': ph_seq,
+        'tone': tone_seq,
+        "lat_ctx": vae_latent_ * ctx_mask,
+        "ctx_mask": ctx_mask,
+        "dur": mel2ph_pred,
+    }
```
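One step worth unpacking in `dur_pred` is the duration-disturbance block: with probability 0.5 each predicted duration is multiplied by a random factor r in [1, 1 + dur_disturb], and otherwise divided by it, which jitters phoneme timing roughly symmetrically around the prediction. A standalone toy illustration of just that arithmetic:

```python
import torch

dur_pred = torch.tensor([[10.0, 20.0, 30.0]])
dur_disturb = 0.1  # up to ~10% jitter per phoneme

choice = (torch.rand_like(dur_pred) > 0.5).float()   # 1 = stretch, 0 = shrink
r = 1 + torch.rand_like(dur_pred) * dur_disturb      # factor in [1, 1.1]
disturbed = dur_pred * r * choice + dur_pred / r * (1 - choice)
print(disturbed)  # e.g. tensor([[10.6, 18.4, 31.9]])
```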
xinference/thirdparty/megatts3/tts/gradio_api.py
ADDED

```diff
@@ -0,0 +1,93 @@
+# Copyright 2025 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import multiprocessing as mp
+import torch
+import os
+from functools import partial
+import gradio as gr
+import traceback
+from tts.infer_cli import MegaTTS3DiTInfer, convert_to_wav, cut_wav
+
+
+def model_worker(input_queue, output_queue, device_id):
+    device = None
+    if device_id is not None:
+        device = torch.device(f'cuda:{device_id}')
+    infer_pipe = MegaTTS3DiTInfer(device=device)
+
+    while True:
+        task = input_queue.get()
+        inp_audio_path, inp_npy_path, inp_text, infer_timestep, p_w, t_w = task
+        try:
+            convert_to_wav(inp_audio_path)
+            wav_path = os.path.splitext(inp_audio_path)[0] + '.wav'
+            cut_wav(wav_path, max_len=28)
+            with open(wav_path, 'rb') as file:
+                file_content = file.read()
+            resource_context = infer_pipe.preprocess(file_content, latent_file=inp_npy_path)
+            wav_bytes = infer_pipe.forward(resource_context, inp_text, time_step=infer_timestep, p_w=p_w, t_w=t_w)
+            output_queue.put(wav_bytes)
+        except Exception as e:
+            traceback.print_exc()
+            print(task, str(e))
+            output_queue.put(None)
+
+
+def main(inp_audio, inp_npy, inp_text, infer_timestep, p_w, t_w, processes, input_queue, output_queue):
+    print("Push task to the inp queue |", inp_audio, inp_npy, inp_text, infer_timestep, p_w, t_w)
+    input_queue.put((inp_audio, inp_npy, inp_text, infer_timestep, p_w, t_w))
+    res = output_queue.get()
+    if res is not None:
+        return res
+    else:
+        print("")
+        return None
+
+
+if __name__ == '__main__':
+    mp.set_start_method('spawn', force=True)
+    mp_manager = mp.Manager()
+
+    devices = os.environ.get('CUDA_VISIBLE_DEVICES', '')
+    if devices != '':
+        devices = os.environ.get('CUDA_VISIBLE_DEVICES', '').split(",")
+    else:
+        devices = None
+
+    num_workers = 1
+    input_queue = mp_manager.Queue()
+    output_queue = mp_manager.Queue()
+    processes = []
+
+    print("Start open workers")
+    for i in range(num_workers):
+        p = mp.Process(target=model_worker, args=(input_queue, output_queue, i % len(devices) if devices is not None else None))
+        p.start()
+        processes.append(p)
+
+    api_interface = gr.Interface(
+        fn=partial(main, processes=processes, input_queue=input_queue,
+                   output_queue=output_queue),
+        inputs=[gr.Audio(type="filepath", label="Upload .wav"), gr.File(type="filepath", label="Upload .npy"), "text",
+                gr.Number(label="infer timestep", value=32),
+                gr.Number(label="Intelligibility Weight", value=1.4),
+                gr.Number(label="Similarity Weight", value=3.0)],
+        outputs=[gr.Audio(label="Synthesized Audio")],
+        title="MegaTTS3",
+        description="Upload a speech clip as a reference for timbre, " +
+                    "upload the pre-extracted latent file, " +
+                    "input the target text, and receive the cloned voice.", concurrency_limit=1)
+    api_interface.launch(server_name='0.0.0.0', server_port=7929, debug=True)
+    for p in processes:
+        p.join()
```