xinference 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_compat.py +22 -2
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +91 -6
- xinference/client/restful/restful_client.py +39 -0
- xinference/core/model.py +41 -13
- xinference/deploy/cmdline.py +3 -1
- xinference/deploy/test/test_cmdline.py +56 -0
- xinference/isolation.py +24 -0
- xinference/model/audio/__init__.py +12 -0
- xinference/model/audio/core.py +26 -4
- xinference/model/audio/f5tts.py +195 -0
- xinference/model/audio/fish_speech.py +71 -35
- xinference/model/audio/model_spec.json +88 -0
- xinference/model/audio/model_spec_modelscope.json +9 -0
- xinference/model/audio/whisper_mlx.py +208 -0
- xinference/model/embedding/core.py +322 -6
- xinference/model/embedding/model_spec.json +8 -1
- xinference/model/embedding/model_spec_modelscope.json +9 -1
- xinference/model/llm/__init__.py +4 -2
- xinference/model/llm/llm_family.json +479 -53
- xinference/model/llm/llm_family_modelscope.json +423 -17
- xinference/model/llm/mlx/core.py +230 -50
- xinference/model/llm/sglang/core.py +2 -0
- xinference/model/llm/transformers/chatglm.py +9 -5
- xinference/model/llm/transformers/core.py +1 -0
- xinference/model/llm/transformers/glm_edge_v.py +230 -0
- xinference/model/llm/transformers/utils.py +16 -8
- xinference/model/llm/utils.py +23 -1
- xinference/model/llm/vllm/core.py +89 -2
- xinference/thirdparty/f5_tts/__init__.py +0 -0
- xinference/thirdparty/f5_tts/api.py +166 -0
- xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
- xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
- xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
- xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
- xinference/thirdparty/f5_tts/eval/README.md +49 -0
- xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
- xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
- xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
- xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
- xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
- xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
- xinference/thirdparty/f5_tts/infer/README.md +191 -0
- xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
- xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
- xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
- xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
- xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
- xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
- xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
- xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
- xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
- xinference/thirdparty/f5_tts/model/__init__.py +10 -0
- xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
- xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
- xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
- xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
- xinference/thirdparty/f5_tts/model/cfm.py +285 -0
- xinference/thirdparty/f5_tts/model/dataset.py +319 -0
- xinference/thirdparty/f5_tts/model/modules.py +658 -0
- xinference/thirdparty/f5_tts/model/trainer.py +366 -0
- xinference/thirdparty/f5_tts/model/utils.py +185 -0
- xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
- xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
- xinference/thirdparty/f5_tts/socket_server.py +159 -0
- xinference/thirdparty/f5_tts/train/README.md +77 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
- xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
- xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
- xinference/thirdparty/f5_tts/train/train.py +75 -0
- xinference/types.py +2 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.2f269bb3.js → main.4eb4ee80.js} +3 -3
- xinference/web/ui/build/static/js/main.4eb4ee80.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8c5eeb02f772d02cbe8b89c05428d0dd41a97866f75f7dc1c2164a67f5a1cf98.json +1 -0
- {xinference-1.0.0.dist-info → xinference-1.1.0.dist-info}/METADATA +39 -18
- {xinference-1.0.0.dist-info → xinference-1.1.0.dist-info}/RECORD +92 -39
- {xinference-1.0.0.dist-info → xinference-1.1.0.dist-info}/WHEEL +1 -1
- xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
- /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.4eb4ee80.js.LICENSE.txt} +0 -0
- {xinference-1.0.0.dist-info → xinference-1.1.0.dist-info}/LICENSE +0 -0
- {xinference-1.0.0.dist-info → xinference-1.1.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.0.0.dist-info → xinference-1.1.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/utils.py CHANGED

@@ -156,6 +156,7 @@ def _get_completion(
    finish_reason: Optional[str],
    model_uid: str,
    r: InferenceRequest,
+   completion_tokens: int,
):
    completion_choice = CompletionChoice(
        text=output, index=0, logprobs=None, finish_reason=finish_reason
@@ -170,8 +171,8 @@ def _get_completion(
    )
    completion_usage = CompletionUsage(
        prompt_tokens=len(r.prompt_tokens),
-       completion_tokens=
-       total_tokens=len(r.prompt_tokens) +
+       completion_tokens=completion_tokens,
+       total_tokens=len(r.prompt_tokens) + completion_tokens,
    )
    completion = Completion(
        id=completion_chunk["id"],
@@ -371,7 +372,7 @@ def _batch_inference_one_step_internal(
        r.stopped = stopped
        r.finish_reason = finish_reason

-       if r.stopped and r not in stop_token_mapping
+       if r.stopped and r not in stop_token_mapping:
            stop_token_mapping[r] = _i + 1

        if r.stream:
@@ -446,12 +447,14 @@ def _batch_inference_one_step_internal(
        else:
            # last round, handle non-stream result
            if r.stopped and _i == decode_round - 1:
-               invalid_token_num =
+               invalid_token_num = (
+                   (decode_round - stop_token_mapping[r] + 1)
+                   if r.finish_reason == "stop"
+                   else (decode_round - stop_token_mapping[r])
+               )
                outputs = (
                    tokenizer.decode(
-                       r.new_tokens[
-                       if r.finish_reason == "stop"
-                       else r.new_tokens[:-invalid_token_num],
+                       r.new_tokens[:-invalid_token_num],
                        skip_special_tokens=True,
                        spaces_between_special_tokens=False,
                        clean_up_tokenization_spaces=True,
@@ -460,7 +463,12 @@ def _batch_inference_one_step_internal(
                    else output_mapping[r]
                )
                completion = _get_completion(
-                   outputs,
+                   outputs,
+                   r.chunk_id,
+                   r.finish_reason,
+                   model_uid,
+                   r,
+                   len(r.new_tokens) - invalid_token_num,
                )
                r.completion = [completion]

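The hunks above change how the batched transformers path reports token usage: `_get_completion` now takes an explicit `completion_tokens` value, and the caller subtracts the tokens decoded after a request already stopped (`invalid_token_num`) instead of counting all of `r.new_tokens`. A minimal sketch of that arithmetic with hypothetical numbers (not taken from the diff):

```python
# Hypothetical illustration of the token accounting introduced above.
decode_round = 16          # decode steps executed in this batched round
stop_step = 12             # value recorded in stop_token_mapping[r] when the request stopped
finish_reason = "stop"     # "stop" -> the stop token itself is also discarded
new_tokens = list(range(40))  # stand-in for r.new_tokens

invalid_token_num = (
    (decode_round - stop_step + 1)
    if finish_reason == "stop"
    else (decode_round - stop_step)
)
completion_tokens = len(new_tokens) - invalid_token_num

print(invalid_token_num)   # 5  -> trailing tokens dropped from the decoded output
print(completion_tokens)   # 35 -> reported as CompletionUsage.completion_tokens
```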
xinference/model/llm/utils.py CHANGED

@@ -324,7 +324,10 @@ class ChatModelMixin:
        """
        try:
            if isinstance(c, dict):
-
+               try:
+                   return [(None, c["name"], json.loads(c["arguments"]))]
+               except Exception:
+                   return [(None, c["name"], c["arguments"])]
        except KeyError:
            logger.error("Can't parse glm output: %s", c)
            return [(str(c), None, None)]
@@ -569,6 +572,25 @@ def _decode_image(_url):
            return Image.open(BytesIO(response.content)).convert("RGB")


+def _decode_image_without_rgb(_url):
+    if _url.startswith("data:"):
+        logging.info("Parse url by base64 decoder.")
+        # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
+        # e.g. f"data:image/jpeg;base64,{base64_image}"
+        _type, data = _url.split(";")
+        _, ext = _type.split("/")
+        data = data[len("base64,") :]
+        data = base64.b64decode(data.encode("utf-8"))
+        return Image.open(BytesIO(data))
+    else:
+        try:
+            response = requests.get(_url)
+        except requests.exceptions.MissingSchema:
+            return Image.open(_url)
+        else:
+            return Image.open(BytesIO(response.content))
+
+
@typing.no_type_check
def generate_completion_chunk(
    chunk_text: Optional[str],

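The new `_decode_image_without_rgb` helper mirrors the existing `_decode_image` but skips the `.convert("RGB")` call, so alpha channels and palette modes are preserved. A standalone sketch of the same data-URL branch, assuming Pillow is installed (the round-trip image here is synthetic, not from xinference):

```python
import base64
from io import BytesIO

from PIL import Image

# Build a small RGBA test image and wrap it in an OpenAI-style data URL.
buf = BytesIO()
Image.new("RGBA", (4, 4), (255, 0, 0, 128)).save(buf, format="PNG")
b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
url = f"data:image/png;base64,{b64}"

# Same parsing steps as the helper above: split off the media type,
# strip the "base64," prefix, decode, and open without converting to RGB.
_type, data = url.split(";")
_, ext = _type.split("/")
img = Image.open(BytesIO(base64.b64decode(data[len("base64,"):].encode("utf-8"))))
print(img.mode, ext)  # "RGBA png" -- the alpha channel survives
```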
xinference/model/llm/vllm/core.py CHANGED

@@ -69,6 +69,7 @@ class VLLMModelConfig(TypedDict, total=False):
    quantization: Optional[str]
    max_model_len: Optional[int]
    limit_mm_per_prompt: Optional[Dict[str, int]]
+   guided_decoding_backend: Optional[str]


class VLLMGenerateConfig(TypedDict, total=False):
@@ -85,6 +86,15 @@ class VLLMGenerateConfig(TypedDict, total=False):
    stop: Optional[Union[str, List[str]]]
    stream: bool  # non-sampling param, should not be passed to the engine.
    stream_options: Optional[Union[dict, None]]
+   skip_special_tokens: Optional[bool]
+   response_format: Optional[dict]
+   guided_json: Optional[Union[str, dict]]
+   guided_regex: Optional[str]
+   guided_choice: Optional[List[str]]
+   guided_grammar: Optional[str]
+   guided_json_object: Optional[bool]
+   guided_decoding_backend: Optional[str]
+   guided_whitespace_pattern: Optional[str]


try:
@@ -144,6 +154,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
    VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct")
    VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
    VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
+   VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")


if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
@@ -171,6 +182,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.5.3":
if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
    VLLM_SUPPORTED_MODELS.append("llama-3.1")
    VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")
+   VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.3-instruct")

if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
    VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
@@ -314,6 +326,7 @@ class VLLMModel(LLM):
        model_config.setdefault("max_num_seqs", 256)
        model_config.setdefault("quantization", None)
        model_config.setdefault("max_model_len", None)
+       model_config.setdefault("guided_decoding_backend", "outlines")

        return model_config

@@ -325,6 +338,22 @@ class VLLMModel(LLM):
            generate_config = {}

        sanitized = VLLMGenerateConfig()
+
+       response_format = generate_config.pop("response_format", None)
+       guided_decoding_backend = generate_config.get("guided_decoding_backend", None)
+       guided_json_object = None
+       guided_json = None
+
+       if response_format is not None:
+           if response_format.get("type") == "json_object":
+               guided_json_object = True
+           elif response_format.get("type") == "json_schema":
+               json_schema = response_format.get("json_schema")
+               assert json_schema is not None
+               guided_json = json_schema.get("json_schema")
+               if guided_decoding_backend is None:
+                   guided_decoding_backend = "outlines"
+
        sanitized.setdefault("lora_name", generate_config.get("lora_name", None))
        sanitized.setdefault("n", generate_config.get("n", 1))
        sanitized.setdefault("best_of", generate_config.get("best_of", None))
@@ -346,6 +375,31 @@ class VLLMModel(LLM):
        sanitized.setdefault(
            "stream_options", generate_config.get("stream_options", None)
        )
+       sanitized.setdefault(
+           "skip_special_tokens", generate_config.get("skip_special_tokens", True)
+       )
+       sanitized.setdefault(
+           "guided_json", generate_config.get("guided_json", guided_json)
+       )
+       sanitized.setdefault("guided_regex", generate_config.get("guided_regex", None))
+       sanitized.setdefault(
+           "guided_choice", generate_config.get("guided_choice", None)
+       )
+       sanitized.setdefault(
+           "guided_grammar", generate_config.get("guided_grammar", None)
+       )
+       sanitized.setdefault(
+           "guided_whitespace_pattern",
+           generate_config.get("guided_whitespace_pattern", None),
+       )
+       sanitized.setdefault(
+           "guided_json_object",
+           generate_config.get("guided_json_object", guided_json_object),
+       )
+       sanitized.setdefault(
+           "guided_decoding_backend",
+           generate_config.get("guided_decoding_backend", guided_decoding_backend),
+       )

        return sanitized

@@ -483,13 +537,46 @@ class VLLMModel(LLM):
            if isinstance(stream_options, dict)
            else False
        )
-
+
+       if VLLM_INSTALLED and vllm.__version__ >= "0.6.3":
+           # guided decoding only available for vllm >= 0.6.3
+           from vllm.sampling_params import GuidedDecodingParams
+
+           guided_options = GuidedDecodingParams.from_optional(
+               json=sanitized_generate_config.pop("guided_json", None),
+               regex=sanitized_generate_config.pop("guided_regex", None),
+               choice=sanitized_generate_config.pop("guided_choice", None),
+               grammar=sanitized_generate_config.pop("guided_grammar", None),
+               json_object=sanitized_generate_config.pop("guided_json_object", None),
+               backend=sanitized_generate_config.pop("guided_decoding_backend", None),
+               whitespace_pattern=sanitized_generate_config.pop(
+                   "guided_whitespace_pattern", None
+               ),
+           )
+
+           sampling_params = SamplingParams(
+               guided_decoding=guided_options, **sanitized_generate_config
+           )
+       else:
+           # ignore generate configs
+           sanitized_generate_config.pop("guided_json", None)
+           sanitized_generate_config.pop("guided_regex", None)
+           sanitized_generate_config.pop("guided_choice", None)
+           sanitized_generate_config.pop("guided_grammar", None)
+           sanitized_generate_config.pop("guided_json_object", None)
+           sanitized_generate_config.pop("guided_decoding_backend", None)
+           sanitized_generate_config.pop("guided_whitespace_pattern", None)
+           sampling_params = SamplingParams(**sanitized_generate_config)
+
        if not request_id:
            request_id = str(uuid.uuid1())

        assert self._engine is not None
        results_generator = self._engine.generate(
-           prompt,
+           prompt,
+           sampling_params,
+           request_id,
+           lora_request,
        )

        async def stream_results() -> AsyncGenerator[CompletionChunk, None]:

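With these changes the vLLM backend accepts an OpenAI-style `response_format` as well as explicit `guided_*` options in `generate_config`, and forwards them to vLLM's `GuidedDecodingParams` when vLLM >= 0.6.3 is installed. A simplified, standalone sketch of the `response_format` handling shown in the sanitize hunk (function name and sample schema are illustrative, not from the diff):

```python
# Simplified re-statement of the response_format branch in the sanitize hunk above.
def resolve_guided_options(generate_config: dict) -> dict:
    response_format = generate_config.pop("response_format", None)
    backend = generate_config.get("guided_decoding_backend")
    guided_json = None
    guided_json_object = None

    if response_format is not None:
        if response_format.get("type") == "json_object":
            guided_json_object = True  # free-form JSON output
        elif response_format.get("type") == "json_schema":
            json_schema = response_format["json_schema"]
            guided_json = json_schema.get("json_schema")  # schema-constrained output
            backend = backend or "outlines"

    return {
        "guided_json": guided_json,
        "guided_json_object": guided_json_object,
        "guided_decoding_backend": backend,
    }


sample = {
    "response_format": {
        "type": "json_schema",
        "json_schema": {"name": "person", "json_schema": {"type": "object"}},
    }
}
print(resolve_guided_options(sample))
# {'guided_json': {'type': 'object'}, 'guided_json_object': None,
#  'guided_decoding_backend': 'outlines'}
```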
xinference/thirdparty/f5_tts/__init__.py (file without changes)

xinference/thirdparty/f5_tts/api.py ADDED

import random
import sys
from importlib.resources import files

import soundfile as sf
import tqdm
from cached_path import cached_path

from f5_tts.infer.utils_infer import (
    hop_length,
    infer_process,
    load_model,
    load_vocoder,
    preprocess_ref_audio_text,
    remove_silence_for_generated_wav,
    save_spectrogram,
    transcribe,
    target_sample_rate,
)
from f5_tts.model import DiT, UNetT
from f5_tts.model.utils import seed_everything


class F5TTS:
    def __init__(
        self,
        model_type="F5-TTS",
        ckpt_file="",
        vocab_file="",
        ode_method="euler",
        use_ema=True,
        vocoder_name="vocos",
        local_path=None,
        device=None,
        hf_cache_dir=None,
    ):
        # Initialize parameters
        self.final_wave = None
        self.target_sample_rate = target_sample_rate
        self.hop_length = hop_length
        self.seed = -1
        self.mel_spec_type = vocoder_name

        # Set device
        if device is not None:
            self.device = device
        else:
            import torch

            self.device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

        # Load models
        self.load_vocoder_model(vocoder_name, local_path=local_path, hf_cache_dir=hf_cache_dir)
        self.load_ema_model(
            model_type, ckpt_file, vocoder_name, vocab_file, ode_method, use_ema, hf_cache_dir=hf_cache_dir
        )

    def load_vocoder_model(self, vocoder_name, local_path=None, hf_cache_dir=None):
        self.vocoder = load_vocoder(vocoder_name, local_path is not None, local_path, self.device, hf_cache_dir)

    def load_ema_model(self, model_type, ckpt_file, mel_spec_type, vocab_file, ode_method, use_ema, hf_cache_dir=None):
        if model_type == "F5-TTS":
            if not ckpt_file:
                if mel_spec_type == "vocos":
                    ckpt_file = str(
                        cached_path("hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors", cache_dir=hf_cache_dir)
                    )
                elif mel_spec_type == "bigvgan":
                    ckpt_file = str(
                        cached_path("hf://SWivid/F5-TTS/F5TTS_Base_bigvgan/model_1250000.pt", cache_dir=hf_cache_dir)
                    )
            model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
            model_cls = DiT
        elif model_type == "E2-TTS":
            if not ckpt_file:
                ckpt_file = str(
                    cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors", cache_dir=hf_cache_dir)
                )
            model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
            model_cls = UNetT
        else:
            raise ValueError(f"Unknown model type: {model_type}")

        self.ema_model = load_model(
            model_cls, model_cfg, ckpt_file, mel_spec_type, vocab_file, ode_method, use_ema, self.device
        )

    def transcribe(self, ref_audio, language=None):
        return transcribe(ref_audio, language)

    def export_wav(self, wav, file_wave, remove_silence=False):
        sf.write(file_wave, wav, self.target_sample_rate)

        if remove_silence:
            remove_silence_for_generated_wav(file_wave)

    def export_spectrogram(self, spect, file_spect):
        save_spectrogram(spect, file_spect)

    def infer(
        self,
        ref_file,
        ref_text,
        gen_text,
        show_info=print,
        progress=tqdm,
        target_rms=0.1,
        cross_fade_duration=0.15,
        sway_sampling_coef=-1,
        cfg_strength=2,
        nfe_step=32,
        speed=1.0,
        fix_duration=None,
        remove_silence=False,
        file_wave=None,
        file_spect=None,
        seed=-1,
    ):
        if seed == -1:
            seed = random.randint(0, sys.maxsize)
        seed_everything(seed)
        self.seed = seed

        ref_file, ref_text = preprocess_ref_audio_text(ref_file, ref_text, device=self.device)

        wav, sr, spect = infer_process(
            ref_file,
            ref_text,
            gen_text,
            self.ema_model,
            self.vocoder,
            self.mel_spec_type,
            show_info=show_info,
            progress=progress,
            target_rms=target_rms,
            cross_fade_duration=cross_fade_duration,
            nfe_step=nfe_step,
            cfg_strength=cfg_strength,
            sway_sampling_coef=sway_sampling_coef,
            speed=speed,
            fix_duration=fix_duration,
            device=self.device,
        )

        if file_wave is not None:
            self.export_wav(wav, file_wave, remove_silence)

        if file_spect is not None:
            self.export_spectrogram(spect, file_spect)

        return wav, sr, spect


if __name__ == "__main__":
    f5tts = F5TTS()

    wav, sr, spect = f5tts.infer(
        ref_file=str(files("f5_tts").joinpath("infer/examples/basic/basic_ref_en.wav")),
        ref_text="some call me nature, others call me mother nature.",
        gen_text="""I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences.""",
        file_wave=str(files("f5_tts").joinpath("../../tests/api_out.wav")),
        file_spect=str(files("f5_tts").joinpath("../../tests/api_out.png")),
        seed=-1,  # random seed = -1
    )

    print("seed :", f5tts.seed)

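The vendored `api.py` above is the upstream F5-TTS programmatic interface; xinference's own wrapper is added separately in `xinference/model/audio/f5tts.py`. A hedged sketch of serving it through the RESTful client, assuming the new spec in `model_spec.json` registers the model under a name like "F5-TTS" and that a server is already running on port 9997:

```python
from xinference.client import RESTfulClient

# Assumes a running server (e.g. `xinference-local --host 0.0.0.0 --port 9997`)
# and that the new audio spec registers this model as "F5-TTS" -- check
# xinference/model/audio/model_spec.json for the exact name.
client = RESTfulClient("http://127.0.0.1:9997")
model_uid = client.launch_model(model_name="F5-TTS", model_type="audio")
model = client.get_model(model_uid)

# OpenAI-style speech endpoint; returns raw audio bytes (container format
# depends on the backend).
audio_bytes = model.speech("Hello from the 1.1.0 release.")
with open("f5tts_sample.audio", "wb") as f:
    f.write(audio_bytes)
```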
xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml ADDED

hydra:
  run:
    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}

datasets:
  name: Emilia_ZH_EN  # dataset name
  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
  batch_size_type: frame  # "frame" or "sample"
  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
  num_workers: 16

optim:
  epochs: 15
  learning_rate: 7.5e-5
  num_warmup_updates: 20000  # warmup steps
  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
  max_grad_norm: 1.0  # gradient clipping
  bnb_optimizer: False  # use bnb 8bit AdamW optimizer or not

model:
  name: E2TTS_Base
  tokenizer: pinyin
  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
  arch:
    dim: 1024
    depth: 24
    heads: 16
    ff_mult: 4
  mel_spec:
    target_sample_rate: 24000
    n_mel_channels: 100
    hop_length: 256
    win_length: 1024
    n_fft: 1024
    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
  vocoder:
    is_local: False  # use local offline ckpt or not
    local_path: None  # local vocoder path

ckpts:
  logger: wandb  # wandb | tensorboard | None
  save_per_updates: 50000  # save checkpoint per steps
  last_per_steps: 5000  # save last checkpoint per steps
  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}

xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml ADDED

hydra:
  run:
    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}

datasets:
  name: Emilia_ZH_EN
  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
  batch_size_type: frame  # "frame" or "sample"
  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
  num_workers: 16

optim:
  epochs: 15
  learning_rate: 7.5e-5
  num_warmup_updates: 20000  # warmup steps
  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
  max_grad_norm: 1.0
  bnb_optimizer: False

model:
  name: E2TTS_Small
  tokenizer: pinyin
  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
  arch:
    dim: 768
    depth: 20
    heads: 12
    ff_mult: 4
  mel_spec:
    target_sample_rate: 24000
    n_mel_channels: 100
    hop_length: 256
    win_length: 1024
    n_fft: 1024
    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
  vocoder:
    is_local: False  # use local offline ckpt or not
    local_path: None  # local vocoder path

ckpts:
  logger: wandb  # wandb | tensorboard | None
  save_per_updates: 50000  # save checkpoint per steps
  last_per_steps: 5000  # save last checkpoint per steps
  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}

xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml ADDED

hydra:
  run:
    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}

datasets:
  name: Emilia_ZH_EN  # dataset name
  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
  batch_size_type: frame  # "frame" or "sample"
  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
  num_workers: 16

optim:
  epochs: 15
  learning_rate: 7.5e-5
  num_warmup_updates: 20000  # warmup steps
  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
  max_grad_norm: 1.0  # gradient clipping
  bnb_optimizer: False  # use bnb 8bit AdamW optimizer or not

model:
  name: F5TTS_Base  # model name
  tokenizer: pinyin  # tokenizer type
  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
  arch:
    dim: 1024
    depth: 22
    heads: 16
    ff_mult: 2
    text_dim: 512
    conv_layers: 4
  mel_spec:
    target_sample_rate: 24000
    n_mel_channels: 100
    hop_length: 256
    win_length: 1024
    n_fft: 1024
    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
  vocoder:
    is_local: False  # use local offline ckpt or not
    local_path: None  # local vocoder path

ckpts:
  logger: wandb  # wandb | tensorboard | None
  save_per_updates: 50000  # save checkpoint per steps
  last_per_steps: 5000  # save last checkpoint per steps
  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}

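The `model.arch` block in `F5TTS_Base_train.yaml` above carries the same hyperparameters that `api.py` hard-codes for its "F5-TTS" branch (`dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4`). A small sketch that parses the relevant fragment and compares the two, assuming PyYAML is available (the embedded YAML is trimmed to the `arch` section):

```python
import yaml  # PyYAML

# Trimmed fragment of F5TTS_Base_train.yaml (see the full file above).
fragment = """
model:
  name: F5TTS_Base
  arch:
    dim: 1024
    depth: 22
    heads: 16
    ff_mult: 2
    text_dim: 512
    conv_layers: 4
"""

arch = yaml.safe_load(fragment)["model"]["arch"]

# The DiT config used by F5TTS.load_ema_model() in api.py for model_type="F5-TTS".
api_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)

assert arch == api_model_cfg
print("training-config arch matches the inference-time model_cfg:", arch)
```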
xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml ADDED

hydra:
  run:
    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}

datasets:
  name: Emilia_ZH_EN
  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
  batch_size_type: frame  # "frame" or "sample"
  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
  num_workers: 16

optim:
  epochs: 15
  learning_rate: 7.5e-5
  num_warmup_updates: 20000  # warmup steps
  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
  max_grad_norm: 1.0  # gradient clipping
  bnb_optimizer: False  # use bnb 8bit AdamW optimizer or not

model:
  name: F5TTS_Small
  tokenizer: pinyin
  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
  arch:
    dim: 768
    depth: 18
    heads: 12
    ff_mult: 2
    text_dim: 512
    conv_layers: 4
  mel_spec:
    target_sample_rate: 24000
    n_mel_channels: 100
    hop_length: 256
    win_length: 1024
    n_fft: 1024
    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
  vocoder:
    is_local: False  # use local offline ckpt or not
    local_path: None  # local vocoder path

ckpts:
  logger: wandb  # wandb | tensorboard | None
  save_per_updates: 50000  # save checkpoint per steps
  last_per_steps: 5000  # save last checkpoint per steps
  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}

xinference/thirdparty/f5_tts/eval/README.md ADDED

# Evaluation

Install packages for evaluation:

```bash
pip install -e .[eval]
```

## Generating Samples for Evaluation

### Prepare Test Datasets

1. *Seed-TTS testset*: Download from [seed-tts-eval](https://github.com/BytedanceSpeech/seed-tts-eval).
2. *LibriSpeech test-clean*: Download from [OpenSLR](http://www.openslr.org/12/).
3. Unzip the downloaded datasets and place them in the `data/` directory.
4. Update the path for *LibriSpeech test-clean* data in `src/f5_tts/eval/eval_infer_batch.py`
5. Our filtered LibriSpeech-PC 4-10s subset: `data/librispeech_pc_test_clean_cross_sentence.lst`

### Batch Inference for Test Set

To run batch inference for evaluations, execute the following commands:

```bash
# batch inference for evaluations
accelerate config  # if not set before
bash src/f5_tts/eval/eval_infer_batch.sh
```

## Objective Evaluation on Generated Results

### Download Evaluation Model Checkpoints

1. Chinese ASR Model: [Paraformer-zh](https://huggingface.co/funasr/paraformer-zh)
2. English ASR Model: [Faster-Whisper](https://huggingface.co/Systran/faster-whisper-large-v3)
3. WavLM Model: Download from [Google Drive](https://drive.google.com/file/d/1-aE1NfzpRCLxA4GUxX9ITI3F9LlbtEGP/view).

Then update in the following scripts with the paths you put evaluation model ckpts to.

### Objective Evaluation

Update the path with your batch-inferenced results, and carry out WER / SIM evaluations:
```bash
# Evaluation for Seed-TTS test set
python src/f5_tts/eval/eval_seedtts_testset.py --gen_wav_dir <GEN_WAVE_DIR>

# Evaluation for LibriSpeech-PC test-clean (cross-sentence)
python src/f5_tts/eval/eval_librispeech_test_clean.py --gen_wav_dir <GEN_WAVE_DIR> --librispeech_test_clean_path <TEST_CLEAN_PATH>
```