xinference 0.14.4.post1__py3-none-any.whl → 0.15.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- xinference/_compat.py +51 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +209 -40
- xinference/client/restful/restful_client.py +7 -26
- xinference/conftest.py +1 -1
- xinference/constants.py +5 -0
- xinference/core/cache_tracker.py +1 -1
- xinference/core/chat_interface.py +8 -14
- xinference/core/event.py +1 -1
- xinference/core/image_interface.py +28 -0
- xinference/core/model.py +110 -31
- xinference/core/scheduler.py +37 -37
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +17 -10
- xinference/core/utils.py +80 -22
- xinference/core/worker.py +17 -16
- xinference/deploy/cmdline.py +8 -16
- xinference/deploy/local.py +1 -1
- xinference/deploy/supervisor.py +1 -1
- xinference/deploy/utils.py +1 -1
- xinference/deploy/worker.py +1 -1
- xinference/model/audio/cosyvoice.py +86 -41
- xinference/model/audio/fish_speech.py +9 -9
- xinference/model/audio/model_spec.json +9 -9
- xinference/model/audio/whisper.py +4 -1
- xinference/model/embedding/core.py +52 -31
- xinference/model/image/core.py +2 -1
- xinference/model/image/model_spec.json +16 -4
- xinference/model/image/model_spec_modelscope.json +16 -4
- xinference/model/image/sdapi.py +136 -0
- xinference/model/image/stable_diffusion/core.py +164 -19
- xinference/model/llm/__init__.py +29 -11
- xinference/model/llm/llama_cpp/core.py +16 -33
- xinference/model/llm/llm_family.json +1011 -1296
- xinference/model/llm/llm_family.py +34 -53
- xinference/model/llm/llm_family_csghub.json +18 -35
- xinference/model/llm/llm_family_modelscope.json +981 -1122
- xinference/model/llm/lmdeploy/core.py +56 -88
- xinference/model/llm/mlx/core.py +46 -69
- xinference/model/llm/sglang/core.py +36 -18
- xinference/model/llm/transformers/chatglm.py +168 -306
- xinference/model/llm/transformers/cogvlm2.py +36 -63
- xinference/model/llm/transformers/cogvlm2_video.py +33 -223
- xinference/model/llm/transformers/core.py +55 -50
- xinference/model/llm/transformers/deepseek_v2.py +340 -0
- xinference/model/llm/transformers/deepseek_vl.py +53 -96
- xinference/model/llm/transformers/glm4v.py +55 -111
- xinference/model/llm/transformers/intern_vl.py +39 -70
- xinference/model/llm/transformers/internlm2.py +32 -54
- xinference/model/llm/transformers/minicpmv25.py +22 -55
- xinference/model/llm/transformers/minicpmv26.py +158 -68
- xinference/model/llm/transformers/omnilmm.py +5 -28
- xinference/model/llm/transformers/qwen2_audio.py +168 -0
- xinference/model/llm/transformers/qwen2_vl.py +234 -0
- xinference/model/llm/transformers/qwen_vl.py +34 -86
- xinference/model/llm/transformers/utils.py +32 -38
- xinference/model/llm/transformers/yi_vl.py +32 -72
- xinference/model/llm/utils.py +280 -554
- xinference/model/llm/vllm/core.py +161 -100
- xinference/model/rerank/core.py +41 -8
- xinference/model/rerank/model_spec.json +7 -0
- xinference/model/rerank/model_spec_modelscope.json +7 -1
- xinference/model/utils.py +1 -31
- xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
- xinference/thirdparty/cosyvoice/cli/model.py +139 -26
- xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
- xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
- xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
- xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
- xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
- xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
- xinference/thirdparty/cosyvoice/utils/common.py +36 -0
- xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
- xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
- xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
- xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +33 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +2 -2
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +0 -3
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +169 -198
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +4 -27
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +9 -47
- xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +2 -2
- xinference/thirdparty/fish_speech/fish_speech/train.py +2 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +12 -10
- xinference/thirdparty/fish_speech/tools/api.py +79 -134
- xinference/thirdparty/fish_speech/tools/commons.py +35 -0
- xinference/thirdparty/fish_speech/tools/download_models.py +3 -3
- xinference/thirdparty/fish_speech/tools/file.py +17 -0
- xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +1 -1
- xinference/thirdparty/fish_speech/tools/llama/generate.py +29 -24
- xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +1 -1
- xinference/thirdparty/fish_speech/tools/llama/quantize.py +2 -2
- xinference/thirdparty/fish_speech/tools/msgpack_api.py +34 -0
- xinference/thirdparty/fish_speech/tools/post_api.py +85 -44
- xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +1 -1
- xinference/thirdparty/fish_speech/tools/smart_pad.py +16 -3
- xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +2 -2
- xinference/thirdparty/fish_speech/tools/vqgan/inference.py +4 -2
- xinference/thirdparty/fish_speech/tools/webui.py +12 -146
- xinference/thirdparty/matcha/VERSION +1 -0
- xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
- xinference/thirdparty/matcha/hifigan/README.md +101 -0
- xinference/thirdparty/omnilmm/LICENSE +201 -0
- xinference/thirdparty/whisper/__init__.py +156 -0
- xinference/thirdparty/whisper/__main__.py +3 -0
- xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
- xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
- xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
- xinference/thirdparty/whisper/audio.py +157 -0
- xinference/thirdparty/whisper/decoding.py +826 -0
- xinference/thirdparty/whisper/model.py +314 -0
- xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
- xinference/thirdparty/whisper/normalizers/basic.py +76 -0
- xinference/thirdparty/whisper/normalizers/english.json +1741 -0
- xinference/thirdparty/whisper/normalizers/english.py +550 -0
- xinference/thirdparty/whisper/timing.py +386 -0
- xinference/thirdparty/whisper/tokenizer.py +395 -0
- xinference/thirdparty/whisper/transcribe.py +605 -0
- xinference/thirdparty/whisper/triton_ops.py +109 -0
- xinference/thirdparty/whisper/utils.py +316 -0
- xinference/thirdparty/whisper/version.py +1 -0
- xinference/types.py +14 -53
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/{main.4bafd904.css → main.5061c4c3.css} +2 -2
- xinference/web/ui/build/static/css/main.5061c4c3.css.map +1 -0
- xinference/web/ui/build/static/js/main.754740c0.js +3 -0
- xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.754740c0.js.LICENSE.txt} +2 -0
- xinference/web/ui/build/static/js/main.754740c0.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/cd90b08d177025dfe84209596fc51878f8a86bcaa6a240848a3d2e5fd4c7ff24.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +37 -0
- xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
- xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
- xinference/web/ui/node_modules/nunjucks/package.json +112 -0
- xinference/web/ui/package-lock.json +38 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/METADATA +16 -10
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/RECORD +179 -127
- xinference/model/llm/transformers/llama_2.py +0 -108
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +0 -442
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +0 -44
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +0 -115
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +0 -225
- xinference/thirdparty/fish_speech/tools/auto_rerank.py +0 -159
- xinference/thirdparty/fish_speech/tools/gen_ref.py +0 -36
- xinference/thirdparty/fish_speech/tools/merge_asr_files.py +0 -55
- xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
- xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
- xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/LICENSE +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/WHEEL +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/top_level.txt +0 -0
--- a/xinference/thirdparty/fish_speech/tools/api.py
+++ b/xinference/thirdparty/fish_speech/tools/api.py
@@ -9,16 +9,20 @@ import wave
 from argparse import ArgumentParser
 from http import HTTPStatus
 from pathlib import Path
-from typing import Annotated, Literal, Optional
+from typing import Annotated, Any, Literal, Optional
 
 import numpy as np
+import ormsgpack
 # import pyrootutils
 import soundfile as sf
 import torch
 import torchaudio
+# from baize.datastructures import ContentType
 # from kui.asgi import (
 #     Body,
+#     FactoryClass,
 #     HTTPException,
+#     HttpRequest,
 #     HttpView,
 #     JSONResponse,
 #     Kui,
@@ -27,14 +31,16 @@ import torchaudio
 # )
 # from kui.asgi.routing import MultimethodRoutes
 from loguru import logger
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, conint
 
 # pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
 
 # from fish_speech.models.vqgan.lit_module import VQGAN
 from fish_speech.models.vqgan.modules.firefly import FireflyArchitecture
+from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
 from fish_speech.utils import autocast_exclude_mps
-
+from tools.commons import ServeReferenceAudio, ServeTTSRequest
+from tools.file import AUDIO_EXTENSIONS, audio_to_bytes, list_files, read_ref_text
 from tools.llama.generate import (
     GenerateRequest,
     GenerateResponse,
@@ -82,11 +88,8 @@ async def other_exception_handler(exc: "Exception"):
 
 def load_audio(reference_audio, sr):
     if len(reference_audio) > 255 or not Path(reference_audio).exists():
-        try:
-            audio_data = base64.b64decode(reference_audio)
-            reference_audio = io.BytesIO(audio_data)
-        except base64.binascii.Error:
-            raise ValueError("Invalid path or base64 string")
+        audio_data = reference_audio
+        reference_audio = io.BytesIO(audio_data)
 
     waveform, original_sr = torchaudio.load(
         reference_audio, backend="sox" if sys.platform == "linux" else "soundfile"
@@ -145,7 +148,7 @@ def decode_vq_tokens(
         return decoder_model.decode(
             indices=codes[None],
             feature_lengths=feature_lengths,
-        ).squeeze()
+        )[0].squeeze()
 
     raise ValueError(f"Unknown model type: {type(decoder_model)}")
 
@@ -153,58 +156,6 @@ def decode_vq_tokens(
 # routes = MultimethodRoutes(base_class=HttpView)
 
 
-def get_random_paths(base_path, data, speaker, emotion):
-    if base_path and data and speaker and emotion and (Path(base_path).exists()):
-        if speaker in data and emotion in data[speaker]:
-            files = data[speaker][emotion]
-            lab_files = [f for f in files if f.endswith(".lab")]
-            wav_files = [f for f in files if f.endswith(".wav")]
-
-            if lab_files and wav_files:
-                selected_lab = random.choice(lab_files)
-                selected_wav = random.choice(wav_files)
-
-                lab_path = Path(base_path) / speaker / emotion / selected_lab
-                wav_path = Path(base_path) / speaker / emotion / selected_wav
-                if lab_path.exists() and wav_path.exists():
-                    return lab_path, wav_path
-
-    return None, None
-
-
-def load_json(json_file):
-    if not json_file:
-        logger.info("Not using a json file")
-        return None
-    try:
-        with open(json_file, "r", encoding="utf-8") as file:
-            data = json.load(file)
-    except FileNotFoundError:
-        logger.warning(f"ref json not found: {json_file}")
-        data = None
-    except Exception as e:
-        logger.warning(f"Loading json failed: {e}")
-        data = None
-    return data
-
-
-class InvokeRequest(BaseModel):
-    text: str = "你说的对, 但是原神是一款由米哈游自主研发的开放世界手游."
-    reference_text: Optional[str] = None
-    reference_audio: Optional[str] = None
-    max_new_tokens: int = 1024
-    chunk_length: Annotated[int, Field(ge=0, le=500, strict=True)] = 100
-    top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
-    repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2
-    temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
-    emotion: Optional[str] = None
-    format: Literal["wav", "mp3", "flac"] = "wav"
-    streaming: bool = False
-    ref_json: Optional[str] = "ref_data.json"
-    ref_base: Optional[str] = "ref_data"
-    speaker: Optional[str] = None
-
-
 def get_content_type(audio_format):
     if audio_format == "wav":
         return "audio/wav"
@@ -217,35 +168,52 @@ def get_content_type(audio_format):
 
 
 @torch.inference_mode()
-def inference(req: InvokeRequest):
-    # ... 23 lines of the old implementation, elided in the source diff view ...
+def inference(req: ServeTTSRequest):
+
+    idstr: str | None = req.reference_id
+    if idstr is not None:
+        ref_folder = Path("references") / idstr
+        ref_folder.mkdir(parents=True, exist_ok=True)
+        ref_audios = list_files(
+            ref_folder, AUDIO_EXTENSIONS, recursive=True, sort=False
+        )
+        prompt_tokens = [
+            encode_reference(
+                decoder_model=decoder_model,
+                reference_audio=audio_to_bytes(str(ref_audio)),
+                enable_reference_audio=True,
+            )
+            for ref_audio in ref_audios
+        ]
+        prompt_texts = [
+            read_ref_text(str(ref_audio.with_suffix(".lab")))
+            for ref_audio in ref_audios
+        ]
+
+    else:
+        # Parse reference audio aka prompt
+        refs = req.references
+        if refs is None:
+            refs = []
+        prompt_tokens = [
+            encode_reference(
+                decoder_model=decoder_model,
+                reference_audio=ref.audio,
+                enable_reference_audio=True,
+            )
+            for ref in refs
+        ]
+        prompt_texts = [ref.text for ref in refs]
+
     # LLAMA Inference
     request = dict(
         device=decoder_model.device,
        max_new_tokens=req.max_new_tokens,
-        text=
+        text=(
+            req.text
+            if not req.normalize
+            else ChnNormedText(raw_text=req.text).normalize()
+        ),
         top_p=req.top_p,
         repetition_penalty=req.repetition_penalty,
         temperature=req.temperature,
@@ -254,7 +222,7 @@ def inference(req: InvokeRequest):
         chunk_length=req.chunk_length,
         max_length=2048,
         prompt_tokens=prompt_tokens,
-        prompt_text=
+        prompt_text=prompt_texts,
     )
 
     response_queue = queue.Queue()
@@ -307,40 +275,7 @@ def inference(req: InvokeRequest):
         yield fake_audios
 
 
-def
-    if not use_auto_rerank:
-        # If auto_rerank is disabled, call the original inference function directly
-        return inference(req)
-
-    zh_model, en_model = load_model()
-    max_attempts = 5
-    best_wer = float("inf")
-    best_audio = None
-
-    for attempt in range(max_attempts):
-        # Call the original inference function
-        audio_generator = inference(req)
-        fake_audios = next(audio_generator)
-
-        asr_result = batch_asr(
-            zh_model if is_chinese(req.text) else en_model, [fake_audios], 44100
-        )[0]
-        wer = calculate_wer(req.text, asr_result["text"])
-
-        if wer <= 0.1 and not asr_result["huge_gap"]:
-            return fake_audios
-
-        if wer < best_wer:
-            best_wer = wer
-            best_audio = fake_audios
-
-        if attempt == max_attempts - 1:
-            break
-
-    return best_audio
-
-
-async def inference_async(req: InvokeRequest):
+async def inference_async(req: ServeTTSRequest):
     for chunk in inference(req):
         yield chunk
 
@@ -349,9 +284,9 @@ async def buffer_to_async_generator(buffer):
     yield buffer
 
 
-# @routes.http.post("/v1/
+# @routes.http.post("/v1/tts")
 # async def api_invoke_model(
-#     req: Annotated[
+#     req: Annotated[ServeTTSRequest, Body(exclusive=True)],
 # ):
 #     """
 #     Invoke model and generate audio
@@ -410,21 +345,20 @@ def parse_args():
     parser.add_argument(
         "--llama-checkpoint-path",
         type=str,
-        default="checkpoints/fish-speech-1.
+        default="checkpoints/fish-speech-1.4",
     )
     parser.add_argument(
         "--decoder-checkpoint-path",
         type=str,
-        default="checkpoints/fish-speech-1.
+        default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
     )
     parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
     parser.add_argument("--device", type=str, default="cuda")
     parser.add_argument("--half", action="store_true")
     parser.add_argument("--compile", action="store_true")
     parser.add_argument("--max-text-length", type=int, default=0)
-    parser.add_argument("--listen", type=str, default="127.0.0.1:
+    parser.add_argument("--listen", type=str, default="127.0.0.1:8080")
    parser.add_argument("--workers", type=int, default=1)
-    parser.add_argument("--use-auto-rerank", type=bool, default=True)
 
     return parser.parse_args()
 
@@ -436,18 +370,30 @@ def parse_args():
 #     },
 # ).routes
 #
+#
+# class MsgPackRequest(HttpRequest):
+#     async def data(self) -> Annotated[Any, ContentType("application/msgpack")]:
+#         if self.content_type == "application/msgpack":
+#             return ormsgpack.unpackb(await self.body)
+#
+#         raise HTTPException(
+#             HTTPStatus.UNSUPPORTED_MEDIA_TYPE,
+#             headers={"Accept": "application/msgpack"},
+#         )
+#
+#
 # app = Kui(
 #     routes=routes + openapi[1:],  # Remove the default route
 #     exception_handlers={
 #         HTTPException: http_execption_handler,
 #         Exception: other_exception_handler,
 #     },
+#     factory_class=FactoryClass(http=MsgPackRequest),
 #     cors_config={},
 # )
 
 
 if __name__ == "__main__":
-    import threading
 
     import uvicorn
 
@@ -474,18 +420,17 @@ if __name__ == "__main__":
     # Dry run to check if the model is loaded correctly and avoid the first-time latency
     list(
         inference(
-            InvokeRequest(
+            ServeTTSRequest(
                 text="Hello world.",
-
-
-                max_new_tokens=
+                references=[],
+                reference_id=None,
+                max_new_tokens=1024,
+                chunk_length=200,
                 top_p=0.7,
                 repetition_penalty=1.2,
                 temperature=0.7,
                 emotion=None,
                 format="wav",
-                ref_base=None,
-                ref_json=None,
             )
         )
     )
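For orientation, the reworked inference() above resolves references in one of two ways: a reference_id naming a folder of reference clips, or inline references carried in the request. With a reference_id, the server scans references/<id>/ for audio files and pairs each with a same-named .lab transcript via with_suffix(".lab"), so a layout like the following is expected (hypothetical id, borrowed from the ServeTTSRequest comment below):

references/
└── 7f92f8afb8ec43bf81429cc1c9199cb1/
    ├── sample1.wav
    └── sample1.lab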
--- /dev/null
+++ b/xinference/thirdparty/fish_speech/tools/commons.py
@@ -0,0 +1,35 @@
+from typing import Annotated, Literal, Optional
+
+from pydantic import BaseModel, Field, conint
+
+
+class ServeReferenceAudio(BaseModel):
+    audio: bytes
+    text: str
+
+
+class ServeTTSRequest(BaseModel):
+    text: str
+    chunk_length: Annotated[int, conint(ge=100, le=300, strict=True)] = 200
+    # Audio format
+    format: Literal["wav", "pcm", "mp3"] = "wav"
+    mp3_bitrate: Literal[64, 128, 192] = 128
+    # References audios for in-context learning
+    references: list[ServeReferenceAudio] = []
+    # Reference id
+    # For example, if you want use https://fish.audio/m/7f92f8afb8ec43bf81429cc1c9199cb1/
+    # Just pass 7f92f8afb8ec43bf81429cc1c9199cb1
+    reference_id: str | None = None
+    # Normalize text for en & zh, this increase stability for numbers
+    normalize: bool = True
+    mp3_bitrate: Optional[int] = 64
+    opus_bitrate: Optional[int] = -1000
+    # Balance mode will reduce latency to 300ms, but may decrease stability
+    latency: Literal["normal", "balanced"] = "normal"
+    # not usually used below
+    streaming: bool = False
+    emotion: Optional[str] = None
+    max_new_tokens: int = 1024
+    top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
+    repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2
+    temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
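A minimal sketch of how the new request model behaves, assuming only pydantic and the tools/commons.py module above (the values are illustrative):

from tools.commons import ServeReferenceAudio, ServeTTSRequest

req = ServeTTSRequest(
    text="Hello world.",
    references=[ServeReferenceAudio(audio=b"<wav bytes>", text="reference transcript")],
)
print(req.format)  # "wav" by default

# chunk_length is checked by conint(ge=100, le=300, strict=True), so an
# out-of-range value raises a pydantic ValidationError:
# ServeTTSRequest(text="hi", chunk_length=50)  # -> ValidationError

Note that mp3_bitrate is declared twice in the class body; under normal Python class semantics the second declaration (Optional[int] = 64) is the one pydantic ends up using.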
--- a/xinference/thirdparty/fish_speech/tools/download_models.py
+++ b/xinference/thirdparty/fish_speech/tools/download_models.py
@@ -22,8 +22,8 @@ def check_and_download_files(repo_id, file_list, local_dir):
 
 
 # 1st
-repo_id_1 = "fishaudio/fish-speech-1.
-local_dir_1 = "./checkpoints/fish-speech-1.
+repo_id_1 = "fishaudio/fish-speech-1.4"
+local_dir_1 = "./checkpoints/fish-speech-1.4"
 files_1 = [
     "model.pth",
     "README.md",
@@ -31,7 +31,7 @@ files_1 = [
     "tokenizer_config.json",
     "tokenizer.json",
     "config.json",
-    "firefly-gan-vq-fsq-
+    "firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
 ]
 
 # 3rd
--- a/xinference/thirdparty/fish_speech/tools/file.py
+++ b/xinference/thirdparty/fish_speech/tools/file.py
@@ -1,3 +1,4 @@
+import base64
 from pathlib import Path
 from typing import Union
 
@@ -23,6 +24,22 @@ VIDEO_EXTENSIONS = {
 }
 
 
+def audio_to_bytes(file_path):
+    if not file_path or not Path(file_path).exists():
+        return None
+    with open(file_path, "rb") as wav_file:
+        wav = wav_file.read()
+    return wav
+
+
+def read_ref_text(ref_text):
+    path = Path(ref_text)
+    if path.exists() and path.is_file():
+        with path.open("r", encoding="utf-8") as file:
+            return file.read()
+    return ref_text
+
+
 def list_files(
     path: Union[Path, str],
     extensions: set[str] = None,
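A quick illustration of the two new helpers, with hypothetical paths; read_ref_text falls back to returning its argument verbatim, so it accepts either a transcript file path or the transcript itself:

from tools.file import audio_to_bytes, read_ref_text

wav = audio_to_bytes("references/demo/sample.wav")  # bytes, or None if the file is missing
txt = read_ref_text("references/demo/sample.lab")   # file contents when the path exists
lit = read_ref_text("a literal transcript")         # returned unchanged otherwise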
--- a/xinference/thirdparty/fish_speech/tools/llama/build_dataset.py
+++ b/xinference/thirdparty/fish_speech/tools/llama/build_dataset.py
@@ -13,7 +13,7 @@ from tqdm import tqdm
 
 from fish_speech.datasets.protos.text_data_pb2 import Semantics, Sentence, TextData
 from fish_speech.datasets.protos.text_data_stream import pack_pb_stream
-from
+from tools.file import load_filelist
 
 # To avoid CPU overload
 os.environ["MKL_NUM_THREADS"] = "1"
--- a/xinference/thirdparty/fish_speech/tools/llama/generate.py
+++ b/xinference/thirdparty/fish_speech/tools/llama/generate.py
@@ -2,6 +2,7 @@ import os
 import queue
 import threading
 import time
+from contextlib import nullcontext
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Literal, Optional, Tuple, Union
@@ -93,15 +94,20 @@ def decode_one_token_ar(
     **sampling_kwargs,
 ) -> torch.Tensor:
     x = model.forward_generate(x, input_pos)
+
+    sampling_kwargs_main = sampling_kwargs.copy()
+    sampling_kwargs_main["temperature"] = 0.1
+    sampling_kwargs_main["top_p"] = 0.1
+    sampling_kwargs_main["repetition_penalty"] = 1.0
+
     codebooks = [
         sample(
             x.logits,
-            previous_tokens=
-
-            ),  # Disable repetition penalty for the token codebook
-            **sampling_kwargs,
+            previous_tokens=None,  # Disable repetition penalty for the token codebook
+            **sampling_kwargs_main,
         )[0]
     ]
+
     x = x.hidden_states
 
     # Cleanup the cache
@@ -136,11 +142,16 @@ def decode_one_token_naive(
 ) -> torch.Tensor:
     x = model.forward_generate(x, input_pos)
 
+    sampling_kwargs_main = sampling_kwargs.copy()
+    sampling_kwargs_main["temperature"] = 0.1
+    sampling_kwargs_main["top_p"] = 0.1
+    sampling_kwargs_main["repetition_penalty"] = 1.0
+
     codebooks = [
         sample(
-            x.
+            x.logits,
             previous_tokens=None,  # Disable repetition penalty for the token codebook
-            **
+            **sampling_kwargs_main,
         )[0]
     ]
 
@@ -181,8 +192,12 @@ def decode_n_tokens(
         else:
             window = previous_tokens[:, i - win_size : i]
 
-        with torch.backends.cuda.sdp_kernel(
-            enable_flash=False, enable_mem_efficient=False, enable_math=True
+        with (
+            torch.backends.cuda.sdp_kernel(
+                enable_flash=False, enable_mem_efficient=False, enable_math=True
+            )
+            if torch.cuda.is_available()
+            else nullcontext()
         ):  # Actually better for Inductor to codegen attention here
             next_token = decode_one_token(
                 model=model,
@@ -222,25 +237,11 @@ def generate(
     # create an empty tensor of the expected final shape and fill in the current tokens
     T = prompt.size(1)
 
-    if max_new_tokens:
-        if T + max_new_tokens > model.config.max_seq_len:
-            max_new_tokens = model.config.max_seq_len - T
-            logger.info(f"Truncating max_new_tokens to {max_new_tokens}")
-
-        T_new = T + max_new_tokens
-    else:
-        T_new = model.config.max_seq_len
-        max_new_tokens = T_new - T
-
     device, dtype = prompt.device, prompt.dtype
-    with torch.device(device):
-        model.setup_caches(
-            max_batch_size=1, max_seq_len=T_new, dtype=next(model.parameters()).dtype
-        )
 
     codebook_dim = 1 + model.config.num_codebooks
     # create an empty tensor of the expected final shape and fill in the current tokens
-    empty = torch.empty((codebook_dim,
+    empty = torch.empty((codebook_dim, max_new_tokens), dtype=dtype, device=device)
     empty[:, :T] = prompt
     seq = empty
     input_pos = torch.arange(0, T, device=device)
@@ -560,6 +561,10 @@ def launch_thread_safe_queue(
     model, decode_one_token = load_model(
         checkpoint_path, device, precision, compile=compile
     )
+    with torch.device(device):
+        model.setup_caches(
+            max_batch_size=1, max_seq_len=2048, dtype=next(model.parameters()).dtype
+        )
     init_event.set()
 
     while True:
@@ -607,7 +612,7 @@ def launch_thread_safe_queue(
 @click.option(
     "--checkpoint-path",
     type=click.Path(path_type=Path, exists=True),
-    default="checkpoints/fish-speech-1.
+    default="checkpoints/fish-speech-1.4",
 )
 @click.option("--device", type=str, default="cuda")
 @click.option("--compile/--no-compile", default=False)
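The decode_n_tokens change above picks its context manager at runtime so the same with-block also runs on machines without CUDA. The pattern in isolation (a sketch assuming torch is installed; the sdp_kernel arguments are copied from the diff):

from contextlib import nullcontext

import torch

# Use the math SDPA kernel when CUDA is present; otherwise degrade to a
# no-op context instead of touching the CUDA backend on CPU-only machines.
ctx = (
    torch.backends.cuda.sdp_kernel(
        enable_flash=False, enable_mem_efficient=False, enable_math=True
    )
    if torch.cuda.is_available()
    else nullcontext()
)
with ctx:
    ...  # attention/decoding work goes here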
--- a/xinference/thirdparty/fish_speech/tools/llama/merge_lora.py
+++ b/xinference/thirdparty/fish_speech/tools/llama/merge_lora.py
@@ -15,7 +15,7 @@ from fish_speech.models.text2semantic.lora import get_merged_state_dict
 
 @click.command()
 @click.option("--lora-config", type=str, default="r_8_alpha_16")
-@click.option("--base-weight", type=str, default="checkpoints/fish-speech-1.
+@click.option("--base-weight", type=str, default="checkpoints/fish-speech-1.4")
 @click.option("--lora-weight", type=str, required=True)
 @click.option("--output", type=str, required=True)
 def merge(lora_config, base_weight, lora_weight, output):
--- a/xinference/thirdparty/fish_speech/tools/llama/quantize.py
+++ b/xinference/thirdparty/fish_speech/tools/llama/quantize.py
@@ -428,7 +428,7 @@ def generate_folder_name():
 @click.option(
     "--checkpoint-path",
     type=click.Path(path_type=Path, exists=True),
-    default="checkpoints/fish-speech-1.
+    default="checkpoints/fish-speech-1.4",
 )
 @click.option(
     "--mode", type=str, default="int8", help="type of quantization to perform"
@@ -451,7 +451,7 @@ def quantize(checkpoint_path: Path, mode: str, groupsize: int, timestamp: str) -
         precision=precision,
         compile=False,
     )
-    vq_model = "firefly-gan-vq-fsq-
+    vq_model = "firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
     now = timestamp if timestamp != "None" else generate_folder_name()
 
     if mode == "int8":
--- /dev/null
+++ b/xinference/thirdparty/fish_speech/tools/msgpack_api.py
@@ -0,0 +1,34 @@
+import httpx
+import ormsgpack
+
+from tools.commons import ServeReferenceAudio, ServeTTSRequest
+
+# priority: ref_id > references
+request = ServeTTSRequest(
+    text="你说的对, 但是原神是一款由米哈游自主研发的开放世界手游.",
+    # reference_id="114514",
+    references=[
+        ServeReferenceAudio(
+            audio=open("lengyue.wav", "rb").read(),
+            text=open("lengyue.lab", "r", encoding="utf-8").read(),
+        )
+    ],
+    streaming=True,
+)
+
+with (
+    httpx.Client() as client,
+    open("hello.wav", "wb") as f,
+):
+    with client.stream(
+        "POST",
+        "http://127.0.0.1:8080/v1/tts",
+        content=ormsgpack.packb(request, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
+        headers={
+            "authorization": "Bearer YOUR_API_KEY",
+            "content-type": "application/msgpack",
+        },
+        timeout=None,
+    ) as response:
+        for chunk in response.iter_bytes():
+            f.write(chunk)