xinference 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (104)
  1. xinference/_compat.py +2 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +23 -1
  4. xinference/core/model.py +1 -6
  5. xinference/core/utils.py +10 -6
  6. xinference/model/audio/core.py +5 -0
  7. xinference/model/audio/cosyvoice.py +25 -3
  8. xinference/model/audio/f5tts.py +15 -10
  9. xinference/model/audio/f5tts_mlx.py +260 -0
  10. xinference/model/audio/fish_speech.py +35 -111
  11. xinference/model/audio/model_spec.json +19 -3
  12. xinference/model/audio/model_spec_modelscope.json +9 -0
  13. xinference/model/audio/utils.py +32 -0
  14. xinference/model/image/core.py +69 -1
  15. xinference/model/image/model_spec.json +127 -4
  16. xinference/model/image/model_spec_modelscope.json +130 -4
  17. xinference/model/image/stable_diffusion/core.py +45 -13
  18. xinference/model/llm/llm_family.json +47 -0
  19. xinference/model/llm/llm_family.py +15 -36
  20. xinference/model/llm/llm_family_modelscope.json +49 -0
  21. xinference/model/llm/mlx/core.py +68 -13
  22. xinference/model/llm/transformers/core.py +1 -0
  23. xinference/model/llm/transformers/qwen2_vl.py +2 -0
  24. xinference/model/llm/utils.py +1 -0
  25. xinference/model/llm/vllm/core.py +11 -2
  26. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  27. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  28. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  29. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  30. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  31. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  32. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  33. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  34. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  35. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  36. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  37. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  38. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  39. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  40. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  41. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  42. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  43. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  44. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  45. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  46. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  47. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  48. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  49. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  50. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  51. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  52. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  53. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  54. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  55. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  56. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  57. xinference/thirdparty/fish_speech/fish_speech/conversation.py +94 -83
  58. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +63 -20
  59. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +1 -26
  60. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  61. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  62. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  63. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
  64. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +7 -13
  65. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  66. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  67. xinference/thirdparty/fish_speech/tools/fish_e2e.py +2 -2
  68. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  69. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  70. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  71. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  72. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  73. xinference/thirdparty/fish_speech/tools/llama/generate.py +117 -89
  74. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  75. xinference/thirdparty/fish_speech/tools/schema.py +11 -28
  76. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  77. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  78. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  79. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  80. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  81. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  82. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  83. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  84. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  85. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  86. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  87. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  88. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  89. xinference/thirdparty/matcha/utils/utils.py +2 -2
  90. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/METADATA +11 -6
  91. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/RECORD +95 -74
  92. xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
  93. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  94. xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
  95. xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
  96. xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
  97. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  98. xinference/thirdparty/fish_speech/tools/api.py +0 -943
  99. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -95
  100. xinference/thirdparty/fish_speech/tools/webui.py +0 -548
  101. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/LICENSE +0 -0
  102. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/WHEEL +0 -0
  103. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/entry_points.txt +0 -0
  104. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/top_level.txt +0 -0

xinference/thirdparty/fish_speech/tools/llama/generate.py

@@ -17,9 +17,16 @@ from loguru import logger
 from tqdm import tqdm
 from transformers import AutoTokenizer

-from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
+from fish_speech.conversation import (
+    CODEBOOK_PAD_TOKEN_ID,
+    Conversation,
+    Message,
+    TextPart,
+    VQPart,
+)
 from fish_speech.models.text2semantic.llama import BaseModelArgs
 from fish_speech.text import clean_text, split_text
+from fish_speech.tokenizer import IM_END_TOKEN, FishTokenizer

 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 torch._inductor.config.coordinate_descent_tuning = True
@@ -145,8 +152,8 @@ def decode_one_token_ar_agent(
     model: DualARTransformer,
     x: torch.Tensor,
     input_pos: torch.Tensor,
+    semantic_ids: list,
     previous_tokens: torch.Tensor = None,
-    semantic_id: int = 32003,
     **sampling_kwargs,
 ) -> torch.Tensor:
     # print(x, input_pos)
@@ -190,19 +197,13 @@ def decode_one_token_ar_agent(
         codebooks.append(a)

     codebooks = torch.stack(codebooks, dim=1)
+    semantic_ids_tensor = torch.tensor(semantic_ids, device=codebooks.device)
     codebooks[:, 1:, :] = torch.masked_fill(
-        codebooks[:, 1:, :], codebooks[:, :1, :] != semantic_id, CODEBOOK_PAD_TOKEN_ID
+        codebooks[:, 1:, :],
+        ~torch.isin(codebooks[:, :1, :], semantic_ids_tensor),
+        CODEBOOK_PAD_TOKEN_ID,
     )

-    # for i in range(codebooks.size(1) - 1):
-    #     codebooks[:, i + 1, :] = torch.masked_fill(
-    #         codebooks[:, i + 1, :],
-    #         codebooks[:, :1, :] != semantic_id,
-    #         CODEBOOK_PAD_TOKEN_ID + i * 1024,
-    #     )
-
-    # print(codebooks)
-
     return codebooks

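The functional change in this file is that codebook masking no longer compares against a single hard-coded semantic_id but against the full set of <|semantic:i|> token ids via torch.isin. Below is a minimal, self-contained sketch of that masking pattern; it is illustrative only, with invented ids and pad value (the real values come from the Fish Speech tokenizer).

import torch

# Invented values for illustration; in the diff these come from the tokenizer.
CODEBOOK_PAD_TOKEN_ID = 0
semantic_ids = [101, 102, 103]  # stand-ins for the <|semantic:i|> token ids

# codebooks: (batch, 1 + num_codebooks, seq_len); row 0 is the text-token stream.
codebooks = torch.tensor([[[101, 7, 103],
                           [11, 12, 13],
                           [21, 22, 23]]])

semantic_ids_tensor = torch.tensor(semantic_ids)
# Wherever the first row is NOT a semantic token, pad every lower codebook,
# mirroring the masked_fill call in the hunk above.
mask = ~torch.isin(codebooks[:, :1, :], semantic_ids_tensor)
codebooks[:, 1:, :] = torch.masked_fill(codebooks[:, 1:, :], mask, CODEBOOK_PAD_TOKEN_ID)

print(codebooks)
# tensor([[[101,   7, 103],
#          [ 11,   0,  13],
#          [ 21,   0,  23]]])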
@@ -210,8 +211,8 @@ def decode_one_token_naive_agent(
     model: NaiveTransformer,
     x: torch.Tensor,
     input_pos: torch.Tensor,
+    semantic_ids: list,
     previous_tokens: torch.Tensor = None,
-    semantic_id: int = 32003,
     **sampling_kwargs,
 ) -> torch.Tensor:
     x = model.forward_generate(x, input_pos)
@@ -236,8 +237,11 @@ def decode_one_token_naive_agent(
         )

     codebooks = torch.stack(codebooks, dim=1)
+    semantic_ids_tensor = torch.tensor(semantic_ids, device=codebooks.device)
     codebooks[:, 1:, :] = torch.masked_fill(
-        codebooks[:, 1:, :], codebooks[:, :1, :] != semantic_id, CODEBOOK_PAD_TOKEN_ID
+        codebooks[:, 1:, :],
+        ~torch.isin(codebooks[:, :1, :], semantic_ids_tensor),
+        CODEBOOK_PAD_TOKEN_ID,
     )

     return codebooks
@@ -247,8 +251,8 @@ def decode_one_token_ar(
     model: DualARTransformer,
     x: torch.Tensor,
     input_pos: torch.Tensor,
+    semantic_ids: list,
     previous_tokens: torch.Tensor = None,
-    semantic_id: int = 0,
     **sampling_kwargs,
 ) -> torch.Tensor:
     x = model.forward_generate(x, input_pos)
@@ -261,21 +265,32 @@ def decode_one_token_ar(
     codebooks = [
         sample(
             x.logits,
-            previous_tokens=None,  # Disable repetition penalty for the token codebook
+            previous_tokens=(
+                previous_tokens[0] if previous_tokens is not None else None
+            ),  # Disable repetition penalty for the token codebook
             **sampling_kwargs_main,
         )[0]
     ]

-    x = x.hidden_states
+    hidden_states = x.hidden_states

     # Cleanup the cache
     for layer in model.fast_layers:
         layer.attention.kv_cache.k_cache.fill_(0)
         layer.attention.kv_cache.v_cache.fill_(0)

-    for codebook_idx in range(model.config.num_codebooks):
-        input_pos = torch.tensor([codebook_idx], device=x.device, dtype=torch.long)
-        logits = model.forward_generate_fast(x, input_pos)
+    input_pos = torch.tensor([0], device=hidden_states.device, dtype=torch.long)
+    model.forward_generate_fast(hidden_states, input_pos)
+    a = codebooks[0] - model.tokenizer.semantic_begin_id
+    a[a < 0] = 0
+    hidden_states = model.fast_embeddings(a)
+    codebooks.append(a)
+
+    for codebook_idx in range(1, model.config.num_codebooks):
+        input_pos = torch.tensor(
+            [codebook_idx], device=hidden_states.device, dtype=torch.long
+        )
+        logits = model.forward_generate_fast(hidden_states, input_pos)
         a = sample(
             logits,
             previous_tokens=(
@@ -285,14 +300,16 @@ def decode_one_token_ar(
             ),
             **sampling_kwargs,
         )[0]
-        x = model.fast_embeddings(a)
+        hidden_states = model.fast_embeddings(a)
         codebooks.append(a)

     codebooks = torch.stack(codebooks, dim=0)
-    codebooks[1:, :] = torch.masked_fill(
-        codebooks[1:, :], codebooks[:1, :] != semantic_id, CODEBOOK_PAD_TOKEN_ID
-    )
+    # semantic_ids_tensor = torch.tensor(semantic_ids, device=codebooks.device)
+    # codebooks[1:, :] = torch.masked_fill(
+    #     codebooks[1:, :], ~torch.isin(codebooks[:1, :], semantic_ids_tensor), CODEBOOK_PAD_TOKEN_ID
+    # )

+    # print(codebooks)
     return codebooks

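In the rewritten fast-codebook loop above, the first codebook is now derived directly from the sampled text token by subtracting model.tokenizer.semantic_begin_id and clamping negatives to zero. A toy illustration of that offset arithmetic follows; the concrete ids are invented, only the pattern matches the diff.

import torch

# Hypothetical id of <|semantic:0|> in the text vocabulary; stands in for
# model.tokenizer.semantic_begin_id from the diff.
semantic_begin_id = 100_000

sampled = torch.tensor([100_005, 100_123, 7])  # last entry: some non-semantic token

codes = sampled - semantic_begin_id
codes[codes < 0] = 0  # non-semantic tokens clamp to codebook index 0, as in the diff

print(codes)  # tensor([  5, 123,   0])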
@@ -337,9 +354,8 @@ def decode_n_tokens(
     cur_token: torch.Tensor,
     input_pos: torch.Tensor,
     num_new_tokens: int,
-    im_end_id: int = 4,
+    semantic_ids: list,
     decode_one_token=decode_one_token_naive,
-    semantic_id: int = 0,
     **sampling_kwargs,
 ):
     previous_tokens = torch.zeros(
@@ -368,7 +384,7 @@ def decode_n_tokens(
             x=cur_token,
             input_pos=input_pos,
             previous_tokens=window,
-            semantic_id=semantic_id,
+            semantic_ids=semantic_ids,
             **sampling_kwargs,
         )

@@ -378,7 +394,7 @@ def decode_n_tokens(
             model.config.num_codebooks + 1, -1
         )

-        if cur_token[0, 0, -1] == im_end_id:
+        if cur_token[0, 0, -1] == model.tokenizer.get_token_id(IM_END_TOKEN):
            break

     return previous_tokens[:, : i + 1]
@@ -391,7 +407,6 @@ def generate(
     model: NaiveTransformer,
     prompt: torch.Tensor,
     max_new_tokens: int,
-    im_end_id: int = 4,
     decode_one_token=decode_one_token_naive,
     **sampling_kwargs,
 ) -> torch.Tensor:
@@ -401,7 +416,10 @@ def generate(

     # create an empty tensor of the expected final shape and fill in the current tokens
     T = prompt.size(1)
-    semantic_id = model.tokenizer.convert_tokens_to_ids("<|semantic|>")
+    # semantic_id = model.tokenizer.convert_tokens_to_ids("<|semantic|>")
+    semantic_ids = [
+        model.tokenizer.get_token_id(f"<|semantic:{i}|>") for i in range(1024)
+    ]

     if max_new_tokens:
         if T + max_new_tokens > model.config.max_seq_len:
@@ -435,7 +453,7 @@ def generate(
         model,
         prompt.view(1, codebook_dim, -1),
         input_pos,
-        semantic_id=semantic_id,
+        semantic_ids=semantic_ids,
         **sampling_kwargs,
     )
     seq[:, T : T + 1] = next_token
@@ -446,9 +464,8 @@ def generate(
         next_token.view(1, codebook_dim, -1),
         input_pos,
         max_new_tokens - 1,
-        im_end_id=im_end_id,
         decode_one_token=decode_one_token,
-        semantic_id=semantic_id,
+        semantic_ids=semantic_ids,
         **sampling_kwargs,
     )
     # x = torch.cat(generated_tokens, dim=1)
@@ -463,8 +480,8 @@ def decode_n_tokens_agent(
     cur_token: torch.Tensor,
     input_pos: torch.Tensor,
     num_new_tokens: int,
+    semantic_ids: list,
     im_end_id: int = 4,
-    semantic_id: int = 32003,
     decode_one_token=decode_one_token_naive_agent,
     early_stop_threshold: float = 0.6,
     **sampling_kwargs,
@@ -495,7 +512,7 @@ def decode_n_tokens_agent(
             x=cur_token,
             input_pos=input_pos,
             previous_tokens=window,
-            semantic_id=semantic_id,
+            semantic_ids=semantic_ids,
             **sampling_kwargs,
         )

@@ -529,8 +546,8 @@ def generate_agent(
     model: BaseTransformer,
     prompt: torch.Tensor,
     max_new_tokens: int,
+    semantic_ids: list,
     im_end_id: int = 4,
-    semantic_id: int = 32003,
     decode_one_token=decode_one_token_naive_agent,
     num_samples: int = 1,
     early_stop_threshold: float = 0.6,
@@ -574,7 +591,7 @@ def generate_agent(
         model,
         prompt,
         input_pos,
-        semantic_id=semantic_id,
+        semantic_ids=semantic_ids,
         **sampling_kwargs,
     ).view(num_samples, codebook_dim, -1)
     yield next_token.cpu()
@@ -587,7 +604,7 @@ def generate_agent(
         input_pos,
         max_new_tokens - 1,
         im_end_id=im_end_id,
-        semantic_id=semantic_id,
+        semantic_ids=semantic_ids,
         decode_one_token=decode_one_token,
         early_stop_threshold=early_stop_threshold,
         **sampling_kwargs,
@@ -602,65 +619,63 @@ def encode_tokens(
     num_codebooks=4,
 ):
     string = clean_text(string)
-    string = f"<|im_start|>user\n{string}<|im_end|><|im_start|>assistant\n"

-    new_tokens = tokenizer.encode(
-        string,
-        add_special_tokens=False,
-        max_length=10**6,
-        truncation=False,
+    messages = []
+    messages.append(
+        Message(
+            role="user",
+            parts=[TextPart(text=string)],
+            cal_loss=False,
+        )
     )
-    tokens = torch.tensor([new_tokens], dtype=torch.int, device=device)

-    # Codebooks
-    zeros = (
-        torch.ones((num_codebooks, tokens.size(1)), dtype=torch.int, device=device)
-        * CODEBOOK_PAD_TOKEN_ID
-    )
-    prompt = torch.cat((tokens, zeros), dim=0)
+    if prompt_tokens is not None:
+        if prompt_tokens.ndim == 3:
+            assert (
+                prompt_tokens.shape[0] == 1
+            ), "3D prompt tokens should have shape (1, num_codebooks, seq_len)"
+            prompt_tokens = prompt_tokens[0]

-    if prompt_tokens is None:
-        return prompt
+        assert prompt_tokens.ndim == 2, "Prompt tokens should be 2D tensor"

-    # Get prompt tokens
-    if prompt_tokens.ndim == 3:
-        assert (
-            prompt_tokens.shape[0] == 1
-        ), f"3 dim prompt tokens should have shape (1, num_codebooks, seq_len)"
-        prompt_tokens = prompt_tokens[0]
+        if prompt_tokens.shape[0] > num_codebooks:
+            logger.warning(
+                f"Prompt tokens shape {prompt_tokens.shape} is larger than num_codebooks {num_codebooks}, getting first {num_codebooks} codebooks"
+            )
+            prompt_tokens = prompt_tokens[:num_codebooks]

-    assert prompt_tokens.ndim == 2
-    data = prompt_tokens + 1
+        vq_part = VQPart(codes=prompt_tokens.to(device))

-    if prompt_tokens.shape[0] > num_codebooks:
-        logger.warning(
-            f"Prompt tokens shape {prompt_tokens.shape} is larger than num_codebooks {num_codebooks}, getting first {num_codebooks} codebooks"
+        messages.append(
+            Message(
+                role="assistant",
+                parts=[TextPart(text="<|voice|>"), vq_part],
+                cal_loss=False,
+            )
+        )
+    else:
+        messages.append(
+            Message(
+                role="assistant",
+                parts=[TextPart(text="<|voice|>")],
+                cal_loss=False,
+                add_im_end=False,
+            )
         )
-        data = data[:num_codebooks]
-
-    # Add pad token for each codebook
-    data = torch.cat(
-        (data, torch.zeros((data.size(0), 1), dtype=torch.int, device=device)),
-        dim=1,
-    )

-    # Since 1.0, we use <|semantic|>
-    s0_token_id = tokenizer.convert_tokens_to_ids("<|semantic|>")
-    end_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
-    main_token_ids = (
-        torch.ones((1, data.size(1)), dtype=torch.int, device=device) * s0_token_id
+    conversation = Conversation(messages=messages)
+    # conversation.visualize(tokenizer)
+    encoded = conversation.encode_for_inference(
+        tokenizer=tokenizer,
+        num_codebooks=num_codebooks,
     )
-    main_token_ids[0, -1] = end_token_id
-
-    data = torch.cat((main_token_ids, data), dim=0)
-    prompt = torch.cat((prompt, data), dim=1)

-    return prompt
+    return encoded.to(device)


 def load_model(checkpoint_path, device, precision, compile=False, is_agent=False):
     model: Union[NaiveTransformer, DualARTransformer] = BaseTransformer.from_pretrained(
-        checkpoint_path, load_weights=True
+        checkpoint_path, load_weights=True, is_agent=is_agent
     )

     model = model.to(device=device, dtype=precision)
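encode_tokens() no longer builds the prompt tensor by hand (token ids plus a padded codebook block); it now assembles Message/TextPart/VQPart objects and lets Conversation.encode_for_inference() produce the prompt. The following is a condensed sketch of that flow, assuming a loaded FishTokenizer-compatible tokenizer; it only rearranges calls that appear verbatim in the hunk above, and build_prompt itself is a hypothetical helper, not part of the package.

from typing import Optional

import torch

from fish_speech.conversation import Conversation, Message, TextPart, VQPart


def build_prompt(
    tokenizer,
    text: str,
    codes: Optional[torch.Tensor] = None,
    num_codebooks: int = 4,
    device: str = "cpu",
) -> torch.Tensor:
    # User turn carries the text; an optional assistant turn carries reference
    # VQ codes behind a <|voice|> marker, exactly as in the new encode_tokens().
    messages = [Message(role="user", parts=[TextPart(text=text)], cal_loss=False)]

    if codes is not None:
        messages.append(
            Message(
                role="assistant",
                parts=[TextPart(text="<|voice|>"), VQPart(codes=codes.to(device))],
                cal_loss=False,
            )
        )
    else:
        # No reference audio: open the assistant turn but do not close it with <|im_end|>.
        messages.append(
            Message(
                role="assistant",
                parts=[TextPart(text="<|voice|>")],
                cal_loss=False,
                add_im_end=False,
            )
        )

    encoded = Conversation(messages=messages).encode_for_inference(
        tokenizer=tokenizer, num_codebooks=num_codebooks
    )
    return encoded.to(device)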
@@ -729,11 +744,26 @@ def generate_long(

    model_size = sum(p.numel() for p in model.parameters() if p.requires_grad)
    tokenizer = model.tokenizer
-    im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
+    im_end_id = tokenizer.get_token_id("<|im_end|>")

    encoded = []
    texts = split_text(text, chunk_length) if iterative_prompt else [text]
-    encoded_prompts = []
+    encoded_prompts = [
+        Conversation(
+            messages=[
+                Message(
+                    role="system",
+                    parts=[TextPart(text="Speak out the provided text.")],
+                    cal_loss=False,
+                )
+            ]
+        )
+        .encode_for_inference(
+            tokenizer=tokenizer,
+            num_codebooks=model.config.num_codebooks,
+        )
+        .to(device)
+    ]

    if use_prompt:
        for idx, (t, c) in enumerate(zip(prompt_text, prompt_tokens)):
@@ -812,7 +842,6 @@ def generate_long(
            model=model,
            prompt=cat_encoded,
            max_new_tokens=max_new_tokens,
-            im_end_id=im_end_id,
            decode_one_token=decode_one_token,
            temperature=temperature,
            top_p=top_p,
@@ -842,12 +871,11 @@ def generate_long(
        )

        # Put the generated tokens
-        # since there is <im_end> and <eos> tokens, we remove last 2 tokens
-        codes = y[1:, prompt_length:-1].clone()
-        codes = codes - 1
+        # since there is <im_end>, we remove last token
+        codes = y[1:, prompt_length + 1 :].clone()
        assert (codes >= 0).all(), f"Negative code found"

-        decoded = y[:, prompt_length:-1].clone()
+        decoded = y[:, prompt_length:].clone()
        # But for global encoding, we should keep the <im_end> token


        global_encoded.append(decoded)
xinference/thirdparty/fish_speech/tools/run_webui.py (new file)

@@ -0,0 +1,104 @@
+import os
+from argparse import ArgumentParser
+from pathlib import Path
+
+import pyrootutils
+import torch
+from loguru import logger
+
+pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
+
+from tools.inference_engine import TTSInferenceEngine
+from tools.llama.generate import launch_thread_safe_queue
+from tools.schema import ServeTTSRequest
+from tools.vqgan.inference import load_model as load_decoder_model
+from tools.webui import build_app
+from tools.webui.inference import get_inference_wrapper
+
+# Make einx happy
+os.environ["EINX_FILTER_TRACEBACK"] = "false"
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--llama-checkpoint-path",
+        type=Path,
+        default="checkpoints/fish-speech-1.5",
+    )
+    parser.add_argument(
+        "--decoder-checkpoint-path",
+        type=Path,
+        default="checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
+    )
+    parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
+    parser.add_argument("--device", type=str, default="cuda")
+    parser.add_argument("--half", action="store_true")
+    parser.add_argument("--compile", action="store_true")
+    parser.add_argument("--max-gradio-length", type=int, default=0)
+    parser.add_argument("--theme", type=str, default="light")
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    args.precision = torch.half if args.half else torch.bfloat16
+
+    # Check if MPS or CUDA is available
+    if torch.backends.mps.is_available():
+        args.device = "mps"
+        logger.info("mps is available, running on mps.")
+    elif not torch.cuda.is_available():
+        logger.info("CUDA is not available, running on CPU.")
+        args.device = "cpu"
+
+    logger.info("Loading Llama model...")
+    llama_queue = launch_thread_safe_queue(
+        checkpoint_path=args.llama_checkpoint_path,
+        device=args.device,
+        precision=args.precision,
+        compile=args.compile,
+    )
+
+    logger.info("Loading VQ-GAN model...")
+    decoder_model = load_decoder_model(
+        config_name=args.decoder_config_name,
+        checkpoint_path=args.decoder_checkpoint_path,
+        device=args.device,
+    )
+
+    logger.info("Decoder model loaded, warming up...")
+
+    # Create the inference engine
+    inference_engine = TTSInferenceEngine(
+        llama_queue=llama_queue,
+        decoder_model=decoder_model,
+        compile=args.compile,
+        precision=args.precision,
+    )
+
+    # Dry run to check if the model is loaded correctly and avoid the first-time latency
+    list(
+        inference_engine.inference(
+            ServeTTSRequest(
+                text="Hello world.",
+                references=[],
+                reference_id=None,
+                max_new_tokens=1024,
+                chunk_length=200,
+                top_p=0.7,
+                repetition_penalty=1.5,
+                temperature=0.7,
+                format="wav",
+            )
+        )
+    )
+
+    logger.info("Warming up done, launching the web UI...")
+
+    # Get the inference function with the immutable arguments
+    inference_fct = get_inference_wrapper(inference_engine)
+
+    app = build_app(inference_fct, args.theme)
+    app.launch(show_api=True)
xinference/thirdparty/fish_speech/tools/schema.py

@@ -1,16 +1,14 @@
 import os
 import queue
 from dataclasses import dataclass
-from typing import Annotated, Literal, Optional
+from typing import Annotated, Literal

 import torch
-from pydantic import AfterValidator, BaseModel, Field, confloat, conint, conlist
+from pydantic import BaseModel, Field, conint, conlist
 from pydantic.functional_validators import SkipValidation

 from fish_speech.conversation import Message, TextPart, VQPart

-GLOBAL_NUM_SAMPLES = int(os.getenv("GLOBAL_NUM_SAMPLES", 1))
-

 class ServeVQPart(BaseModel):
     type: Literal["vq"] = "vq"
@@ -69,6 +67,9 @@ class ServeMessage(BaseModel):

     def to_conversation_message(self):
         new_message = Message(role=self.role, parts=[])
+        if self.role == "assistant":
+            new_message.modality = "voice"
+
         for part in self.parts:
             if isinstance(part, ServeTextPart):
                 new_message.parts.append(TextPart(text=part.text))
@@ -82,7 +83,7 @@ class ServeMessage(BaseModel):
         return new_message


-class ServeRequest(BaseModel):
+class ServeChatRequest(BaseModel):
     messages: Annotated[list[ServeMessage], conlist(ServeMessage, min_length=1)]
     max_new_tokens: int = 1024
     top_p: float = 0.7
@@ -111,11 +112,6 @@ class ServeVQGANDecodeResponse(BaseModel):
     audios: list[bytes]


-class ServeReferenceAudio(BaseModel):
-    audio: bytes
-    text: str
-
-
 class ServeForwardMessage(BaseModel):
     role: str
     content: str
@@ -147,24 +143,11 @@ class ServeReferenceAudio(BaseModel):
         return f"ServeReferenceAudio(text={self.text!r}, audio_size={len(self.audio)})"


-class ServeChatRequestV1(BaseModel):
-    model: str = "llama3-8b"
-    messages: list[ServeForwardMessage] = []
-    audio: bytes | None = None
-    temperature: float = 1.0
-    top_p: float = 1.0
-    max_tokens: int = 256
-    voice: str = "jessica"
-    tts_audio_format: Literal["mp3", "pcm", "opus"] = "mp3"
-    tts_audio_bitrate: Literal[16, 24, 32, 48, 64, 96, 128, 192] = 128
-
-
 class ServeTTSRequest(BaseModel):
     text: str
     chunk_length: Annotated[int, conint(ge=100, le=300, strict=True)] = 200
     # Audio format
     format: Literal["wav", "pcm", "mp3"] = "wav"
-    mp3_bitrate: Literal[64, 128, 192] = 128
     # References audios for in-context learning
     references: list[ServeReferenceAudio] = []
     # Reference id
@@ -172,16 +155,16 @@ class ServeTTSRequest(BaseModel):
     # Just pass 7f92f8afb8ec43bf81429cc1c9199cb1
     reference_id: str | None = None
     seed: int | None = None
-    use_memory_cache: Literal["on-demand", "never"] = "never"
+    use_memory_cache: Literal["on", "off"] = "off"
     # Normalize text for en & zh, this increase stability for numbers
     normalize: bool = True
-    mp3_bitrate: Optional[int] = 64
-    opus_bitrate: Optional[int] = -1000
-    # Balance mode will reduce latency to 300ms, but may decrease stability
-    latency: Literal["normal", "balanced"] = "normal"
     # not usually used below
     streaming: bool = False
     max_new_tokens: int = 1024
     top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
     repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2
     temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
+
+    class Config:
+        # Allow arbitrary types for pytorch related types
+        arbitrary_types_allowed = True
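For reference, a request against the trimmed-down schema above might be constructed as follows. The values are arbitrary; the field set is taken from the class as it appears in the hunk, with mp3_bitrate, opus_bitrate and latency removed and use_memory_cache now accepting "on"/"off".

from tools.schema import ServeTTSRequest

req = ServeTTSRequest(
    text="Hello from xinference 1.1.1.",
    chunk_length=200,
    format="wav",
    references=[],           # optional in-context reference audios
    reference_id=None,
    seed=None,
    use_memory_cache="off",  # was "on-demand" / "never" in 1.1.0
    normalize=True,
    streaming=False,
    max_new_tokens=1024,
    top_p=0.7,
    repetition_penalty=1.2,
    temperature=0.7,
)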
xinference/thirdparty/fish_speech/tools/server/agent/__init__.py (new file)

@@ -0,0 +1,57 @@
+import struct
+from functools import partial
+
+import ormsgpack
+
+from tools.server.agent.generate import generate_responses
+from tools.server.agent.pre_generation_utils import prepare_messages
+
+
+def execute_request(input_queue, tokenizer, config, request, device):
+    """
+    This function prepares the conversation, encodes the request,
+    sends the generation request, and handles decoding/streaming.
+    It returns a response generator (ServeResponse or ServeStreamResponse).
+    """
+    prompt, im_end_id = prepare_messages(request, tokenizer, config)
+    yield from generate_responses(
+        input_queue, tokenizer, config, request, prompt, im_end_id, device
+    )
+
+
+def response_generator(req, llama_queue, tokenizer, config, device):
+    """
+    Non-streaming response wrapper for the chat endpoint.
+    Only returns the final result.
+    """
+    generator = execute_request(llama_queue, tokenizer, config, req, device)
+    return next(generator)
+
+
+async def streaming_generator(req, llama_queue, tokenizer, config, device, json_mode):
+    """
+    Streaming response wrapper for the chat endpoint.
+    Returns the response in chunks.
+    """
+    generator = execute_request(llama_queue, tokenizer, config, req, device)
+    for i in generator:
+        if json_mode:
+            body = i.model_dump_json().encode("utf-8")
+            yield b"data: " + body + b"\n\n"
+        else:
+            body = ormsgpack.packb(i, option=ormsgpack.OPT_SERIALIZE_PYDANTIC)
+            yield struct.pack("I", len(body)) + body
+
+
+def get_response_generator(
+    llama_queue, tokenizer, config, req, device, json_mode
+) -> partial:
+    """
+    Get the correct response generator based on the request.
+    """
+    if not req.streaming:
+        return partial(response_generator, req, llama_queue, tokenizer, config, device)
+    else:
+        return partial(
+            streaming_generator, req, llama_queue, tokenizer, config, device, json_mode
+        )
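The msgpack branch of streaming_generator() frames every chunk with a 4-byte native-endian length prefix before the packed body. A hypothetical client-side helper could reassemble those frames as sketched below; it is not part of the package and assumes the consumer iterates over raw byte chunks and has ormsgpack installed.

import struct

import ormsgpack


def iter_msgpack_frames(stream):
    # `stream` is any iterable of bytes chunks, e.g. an HTTP response body iterator.
    buffer = b""
    for chunk in stream:
        buffer += chunk
        while len(buffer) >= 4:
            (length,) = struct.unpack("I", buffer[:4])  # same "I" format as the server side
            if len(buffer) < 4 + length:
                break  # incomplete frame, wait for more data
            frame, buffer = buffer[4 : 4 + length], buffer[4 + length :]
            yield ormsgpack.unpackb(frame)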