xinference 0.16.3__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (69)
  1. xinference/_compat.py +22 -2
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +148 -12
  4. xinference/client/restful/restful_client.py +47 -2
  5. xinference/constants.py +1 -0
  6. xinference/core/model.py +45 -15
  7. xinference/core/supervisor.py +8 -2
  8. xinference/core/utils.py +67 -2
  9. xinference/model/audio/__init__.py +12 -0
  10. xinference/model/audio/core.py +21 -4
  11. xinference/model/audio/fish_speech.py +70 -35
  12. xinference/model/audio/model_spec.json +81 -1
  13. xinference/model/audio/whisper_mlx.py +208 -0
  14. xinference/model/embedding/core.py +259 -4
  15. xinference/model/embedding/model_spec.json +1 -1
  16. xinference/model/embedding/model_spec_modelscope.json +1 -1
  17. xinference/model/image/stable_diffusion/core.py +5 -2
  18. xinference/model/llm/__init__.py +2 -0
  19. xinference/model/llm/llm_family.json +485 -6
  20. xinference/model/llm/llm_family_modelscope.json +519 -0
  21. xinference/model/llm/mlx/core.py +45 -3
  22. xinference/model/llm/sglang/core.py +1 -0
  23. xinference/model/llm/transformers/core.py +1 -0
  24. xinference/model/llm/transformers/glm_edge_v.py +230 -0
  25. xinference/model/llm/utils.py +19 -0
  26. xinference/model/llm/vllm/core.py +84 -2
  27. xinference/model/rerank/core.py +11 -4
  28. xinference/thirdparty/fish_speech/fish_speech/conversation.py +254 -0
  29. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
  30. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
  31. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
  32. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
  33. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
  34. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +76 -11
  35. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
  36. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
  37. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +32 -1
  38. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
  39. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
  40. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
  41. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
  42. xinference/thirdparty/fish_speech/tools/api.py +578 -75
  43. xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
  44. xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
  45. xinference/thirdparty/fish_speech/tools/llama/generate.py +393 -9
  46. xinference/thirdparty/fish_speech/tools/msgpack_api.py +90 -29
  47. xinference/thirdparty/fish_speech/tools/post_api.py +37 -15
  48. xinference/thirdparty/fish_speech/tools/schema.py +187 -0
  49. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
  50. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
  51. xinference/thirdparty/fish_speech/tools/webui.py +138 -75
  52. xinference/types.py +2 -1
  53. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/METADATA +30 -6
  54. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/RECORD +58 -63
  55. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/WHEEL +1 -1
  56. xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
  57. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  58. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  59. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  60. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  61. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  62. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  63. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  64. xinference/thirdparty/fish_speech/tools/commons.py +0 -35
  65. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  66. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  67. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/LICENSE +0 -0
  68. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/entry_points.txt +0 -0
  69. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/top_level.txt +0 -0
xinference/thirdparty/fish_speech/tools/post_api.py CHANGED
@@ -8,14 +8,15 @@ import requests
  from pydub import AudioSegment
  from pydub.playback import play
 
- from tools.commons import ServeReferenceAudio, ServeTTSRequest
  from tools.file import audio_to_bytes, read_ref_text
+ from tools.schema import ServeReferenceAudio, ServeTTSRequest
 
 
  def parse_args():
 
  parser = argparse.ArgumentParser(
- description="Send a WAV file and text to a server and receive synthesized audio."
+ description="Send a WAV file and text to a server and receive synthesized audio.",
+ formatter_class=argparse.RawTextHelpFormatter,
  )
 
  parser.add_argument(
@@ -33,7 +34,7 @@ def parse_args():
  "-id",
  type=str,
  default=None,
- help="ID of the reference model o be used for the speech",
+ help="ID of the reference model to be used for the speech\n(Local: name of folder containing audios and files)",
  )
  parser.add_argument(
  "--reference_audio",
@@ -41,7 +42,7 @@
  type=str,
  nargs="+",
  default=None,
- help="Path to the WAV file",
+ help="Path to the audio file",
  )
  parser.add_argument(
  "--reference_text",
@@ -68,17 +69,25 @@ def parse_args():
  parser.add_argument(
  "--format", type=str, choices=["wav", "mp3", "flac"], default="wav"
  )
- parser.add_argument("--mp3_bitrate", type=int, default=64)
+ parser.add_argument(
+ "--mp3_bitrate", type=int, choices=[64, 128, 192], default=64, help="kHz"
+ )
  parser.add_argument("--opus_bitrate", type=int, default=-1000)
- parser.add_argument("--latency", type=str, default="normal", help="延迟选项")
+ parser.add_argument(
+ "--latency",
+ type=str,
+ default="normal",
+ choices=["normal", "balanced"],
+ help="Used in api.fish.audio/v1/tts",
+ )
  parser.add_argument(
  "--max_new_tokens",
  type=int,
- default=1024,
- help="Maximum new tokens to generate",
+ default=0,
+ help="Maximum new tokens to generate. \n0 means no limit.",
  )
  parser.add_argument(
- "--chunk_length", type=int, default=100, help="Chunk length for synthesis"
+ "--chunk_length", type=int, default=200, help="Chunk length for synthesis"
  )
  parser.add_argument(
  "--top_p", type=float, default=0.7, help="Top-p sampling for synthesis"
@@ -92,10 +101,7 @@ def parse_args():
  parser.add_argument(
  "--temperature", type=float, default=0.7, help="Temperature for sampling"
  )
- parser.add_argument(
- "--speaker", type=str, default=None, help="Speaker ID for voice synthesis"
- )
- parser.add_argument("--emotion", type=str, default=None, help="Speaker's Emotion")
+
  parser.add_argument(
  "--streaming", type=bool, default=False, help="Enable streaming response"
  )
@@ -103,6 +109,22 @@ def parse_args():
  "--channels", type=int, default=1, help="Number of audio channels"
  )
  parser.add_argument("--rate", type=int, default=44100, help="Sample rate for audio")
+ parser.add_argument(
+ "--use_memory_cache",
+ type=str,
+ default="never",
+ choices=["on-demand", "never"],
+ help="Cache encoded references codes in memory.\n"
+ "If `on-demand`, the server will use cached encodings\n "
+ "instead of encoding reference audio again.",
+ )
+ parser.add_argument(
+ "--seed",
+ type=int,
+ default=None,
+ help="`None` means randomized inference, otherwise deterministic.\n"
+ "It can't be used for fixing a timbre.",
+ )
 
  return parser.parse_args()
 
@@ -145,9 +167,9 @@ if __name__ == "__main__":
  "top_p": args.top_p,
  "repetition_penalty": args.repetition_penalty,
  "temperature": args.temperature,
- "speaker": args.speaker,
- "emotion": args.emotion,
  "streaming": args.streaming,
+ "use_memory_cache": args.use_memory_cache,
+ "seed": args.seed,
  }
 
  pydantic_data = ServeTTSRequest(**data)
xinference/thirdparty/fish_speech/tools/schema.py ADDED
@@ -0,0 +1,187 @@
+ import os
+ import queue
+ from dataclasses import dataclass
+ from typing import Annotated, Literal, Optional
+
+ import torch
+ from pydantic import AfterValidator, BaseModel, Field, confloat, conint, conlist
+ from pydantic.functional_validators import SkipValidation
+
+ from fish_speech.conversation import Message, TextPart, VQPart
+
+ GLOBAL_NUM_SAMPLES = int(os.getenv("GLOBAL_NUM_SAMPLES", 1))
+
+
+ class ServeVQPart(BaseModel):
+ type: Literal["vq"] = "vq"
+ codes: SkipValidation[list[list[int]]]
+
+
+ class ServeTextPart(BaseModel):
+ type: Literal["text"] = "text"
+ text: str
+
+
+ class ServeAudioPart(BaseModel):
+ type: Literal["audio"] = "audio"
+ audio: bytes
+
+
+ @dataclass
+ class ASRPackRequest:
+ audio: torch.Tensor
+ result_queue: queue.Queue
+ language: str
+
+
+ class ServeASRRequest(BaseModel):
+ # The audio should be an uncompressed PCM float16 audio
+ audios: list[bytes]
+ sample_rate: int = 44100
+ language: Literal["zh", "en", "ja", "auto"] = "auto"
+
+
+ class ServeASRTranscription(BaseModel):
+ text: str
+ duration: float
+ huge_gap: bool
+
+
+ class ServeASRSegment(BaseModel):
+ text: str
+ start: float
+ end: float
+
+
+ class ServeTimedASRResponse(BaseModel):
+ text: str
+ segments: list[ServeASRSegment]
+ duration: float
+
+
+ class ServeASRResponse(BaseModel):
+ transcriptions: list[ServeASRTranscription]
+
+
+ class ServeMessage(BaseModel):
+ role: Literal["system", "assistant", "user"]
+ parts: list[ServeVQPart | ServeTextPart]
+
+ def to_conversation_message(self):
+ new_message = Message(role=self.role, parts=[])
+ for part in self.parts:
+ if isinstance(part, ServeTextPart):
+ new_message.parts.append(TextPart(text=part.text))
+ elif isinstance(part, ServeVQPart):
+ new_message.parts.append(
+ VQPart(codes=torch.tensor(part.codes, dtype=torch.int))
+ )
+ else:
+ raise ValueError(f"Unsupported part type: {part}")
+
+ return new_message
+
+
+ class ServeRequest(BaseModel):
+ messages: Annotated[list[ServeMessage], conlist(ServeMessage, min_length=1)]
+ max_new_tokens: int = 1024
+ top_p: float = 0.7
+ repetition_penalty: float = 1.2
+ temperature: float = 0.7
+ streaming: bool = False
+ num_samples: int = 1
+ early_stop_threshold: float = 1.0
+
+
+ class ServeVQGANEncodeRequest(BaseModel):
+ # The audio here should be in wav, mp3, etc
+ audios: list[bytes]
+
+
+ class ServeVQGANEncodeResponse(BaseModel):
+ tokens: SkipValidation[list[list[list[int]]]]
+
+
+ class ServeVQGANDecodeRequest(BaseModel):
+ tokens: SkipValidation[list[list[list[int]]]]
+
+
+ class ServeVQGANDecodeResponse(BaseModel):
+ # The audio here should be in PCM float16 format
+ audios: list[bytes]
+
+
+ class ServeReferenceAudio(BaseModel):
+ audio: bytes
+ text: str
+
+
+ class ServeForwardMessage(BaseModel):
+ role: str
+ content: str
+
+
+ class ServeResponse(BaseModel):
+ messages: list[ServeMessage]
+ finish_reason: Literal["stop", "error"] | None = None
+ stats: dict[str, int | float | str] = {}
+
+
+ class ServeStreamDelta(BaseModel):
+ role: Literal["system", "assistant", "user"] | None = None
+ part: ServeVQPart | ServeTextPart | None = None
+
+
+ class ServeStreamResponse(BaseModel):
+ sample_id: int = 0
+ delta: ServeStreamDelta | None = None
+ finish_reason: Literal["stop", "error"] | None = None
+ stats: dict[str, int | float | str] | None = None
+
+
+ class ServeReferenceAudio(BaseModel):
+ audio: bytes
+ text: str
+
+ def __repr__(self) -> str:
+ return f"ServeReferenceAudio(text={self.text!r}, audio_size={len(self.audio)})"
+
+
+ class ServeChatRequestV1(BaseModel):
+ model: str = "llama3-8b"
+ messages: list[ServeForwardMessage] = []
+ audio: bytes | None = None
+ temperature: float = 1.0
+ top_p: float = 1.0
+ max_tokens: int = 256
+ voice: str = "jessica"
+ tts_audio_format: Literal["mp3", "pcm", "opus"] = "mp3"
+ tts_audio_bitrate: Literal[16, 24, 32, 48, 64, 96, 128, 192] = 128
+
+
+ class ServeTTSRequest(BaseModel):
+ text: str
+ chunk_length: Annotated[int, conint(ge=100, le=300, strict=True)] = 200
+ # Audio format
+ format: Literal["wav", "pcm", "mp3"] = "wav"
+ mp3_bitrate: Literal[64, 128, 192] = 128
+ # References audios for in-context learning
+ references: list[ServeReferenceAudio] = []
+ # Reference id
+ # For example, if you want use https://fish.audio/m/7f92f8afb8ec43bf81429cc1c9199cb1/
+ # Just pass 7f92f8afb8ec43bf81429cc1c9199cb1
+ reference_id: str | None = None
+ seed: int | None = None
+ use_memory_cache: Literal["on-demand", "never"] = "never"
+ # Normalize text for en & zh, this increase stability for numbers
+ normalize: bool = True
+ mp3_bitrate: Optional[int] = 64
+ opus_bitrate: Optional[int] = -1000
+ # Balance mode will reduce latency to 300ms, but may decrease stability
+ latency: Literal["normal", "balanced"] = "normal"
+ # not usually used below
+ streaming: bool = False
+ max_new_tokens: int = 1024
+ top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
+ repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2
+ temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
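For orientation, a minimal client-side sketch of building a request against the new schema above; the text, transcript, and seed values are placeholders, and ServeTTSRequest enforces the declared constraints (for example chunk_length must fall in 100-300):

from tools.schema import ServeReferenceAudio, ServeTTSRequest

# Placeholder reference clip: raw bytes of an audio file plus its transcript.
ref = ServeReferenceAudio(audio=b"...", text="transcript of the reference clip")

request = ServeTTSRequest(
    text="Hello from fish-speech",   # placeholder text
    references=[ref],
    chunk_length=200,                 # validated: conint(ge=100, le=300)
    use_memory_cache="never",         # or "on-demand" to reuse encoded references
    seed=42,                          # None means randomized inference
)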
xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py CHANGED
@@ -24,6 +24,12 @@ OmegaConf.register_new_resolver("eval", eval)
  # This file is used to convert the audio files to text files using the Whisper model.
  # It's mainly used to generate the training data for the VQ model.
 
+ backends = torchaudio.list_audio_backends()
+
+ if "ffmpeg" in backends:
+ backend = "ffmpeg"
+ else:
+ backend = "soundfile"
 
  RANK = int(os.environ.get("SLURM_PROCID", 0))
  WORLD_SIZE = int(os.environ.get("SLURM_NTASKS", 1))
@@ -81,7 +87,7 @@ def process_batch(files: list[Path], model) -> float:
  for file in files:
  try:
  wav, sr = torchaudio.load(
- str(file), backend="sox" if sys.platform == "linux" else "soundfile"
+ str(file), backend=backend
  ) # Need to install libsox-dev
  except Exception as e:
  logger.error(f"Error reading {file}: {e}")
xinference/thirdparty/fish_speech/tools/vqgan/inference.py CHANGED
@@ -24,8 +24,7 @@ def load_model(config_name, checkpoint_path, device="cuda"):
 
  model = instantiate(cfg)
  state_dict = torch.load(
- checkpoint_path,
- map_location=device,
+ checkpoint_path, map_location=device, mmap=True, weights_only=True
  )
  if "state_dict" in state_dict:
  state_dict = state_dict["state_dict"]
@@ -37,7 +36,7 @@ def load_model(config_name, checkpoint_path, device="cuda"):
  if "generator." in k
  }
 
- result = model.load_state_dict(state_dict, strict=False)
+ result = model.load_state_dict(state_dict, strict=False, assign=True)
  model.eval()
  model.to(device)
 
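The checkpoint-loading change above combines torch.load(mmap=True, weights_only=True) with load_state_dict(assign=True). A minimal standalone sketch of the same pattern, with a placeholder module and path (not code from the package):

import torch
from torch import nn

model = nn.Linear(4, 4)  # placeholder module
# mmap=True memory-maps the checkpoint instead of copying it fully into RAM,
# weights_only=True restricts unpickling to plain tensors and containers,
# and assign=True lets the module adopt the loaded tensors directly.
state_dict = torch.load("checkpoint.pth", map_location="cpu", mmap=True, weights_only=True)
model.load_state_dict(state_dict, strict=False, assign=True)
model.eval()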
xinference/thirdparty/fish_speech/tools/webui.py CHANGED
@@ -21,8 +21,9 @@ from transformers import AutoTokenizer
 
  from fish_speech.i18n import i18n
  from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
- from fish_speech.utils import autocast_exclude_mps
+ from fish_speech.utils import autocast_exclude_mps, set_seed
  from tools.api import decode_vq_tokens, encode_reference
+ from tools.file import AUDIO_EXTENSIONS, list_files
  from tools.llama.generate import (
  GenerateRequest,
  GenerateResponse,
@@ -70,6 +71,7 @@ def inference(
  top_p,
  repetition_penalty,
  temperature,
+ seed="0",
  streaming=False,
  ):
  if args.max_gradio_length > 0 and len(text) > args.max_gradio_length:
@@ -81,6 +83,11 @@ def inference(
  ),
  )
 
+ seed = int(seed)
+ if seed != 0:
+ set_seed(seed)
+ logger.warning(f"set seed: {seed}")
+
  # Parse reference audio aka prompt
  prompt_tokens = encode_reference(
  decoder_model=decoder_model,
@@ -139,7 +146,9 @@ def inference(
  segments.append(fake_audios)
 
  if streaming:
- yield (fake_audios * 32768).astype(np.int16).tobytes(), None, None
+ wav_header = wav_chunk_header()
+ audio_data = (fake_audios * 32768).astype(np.int16).tobytes()
+ yield wav_header + audio_data, None, None
 
  if len(segments) == 0:
  return (
@@ -177,6 +186,7 @@ def inference_wrapper(
  top_p,
  repetition_penalty,
  temperature,
+ seed,
  batch_infer_num,
  ):
  audios = []
@@ -193,6 +203,7 @@
  top_p,
  repetition_penalty,
  temperature,
+ seed,
  )
 
  _, audio_data, error_message = next(result)
@@ -235,7 +246,11 @@ def normalize_text(user_input, use_normalization):
  return user_input
 
 
- asr_model = None
+ def update_examples():
+ examples_dir = Path("references")
+ examples_dir.mkdir(parents=True, exist_ok=True)
+ example_audios = list_files(examples_dir, AUDIO_EXTENSIONS, recursive=True)
+ return gr.Dropdown(choices=example_audios + [""])
 
 
  def build_app():
@@ -273,76 +288,100 @@ def build_app():
  )
 
  with gr.Row():
- with gr.Tab(label=i18n("Advanced Config")):
- chunk_length = gr.Slider(
- label=i18n("Iterative Prompt Length, 0 means off"),
- minimum=50,
- maximum=300,
- value=200,
- step=8,
- )
-
- max_new_tokens = gr.Slider(
- label=i18n("Maximum tokens per batch, 0 means no limit"),
- minimum=0,
- maximum=2048,
- value=1024, # 0 means no limit
- step=8,
- )
-
- top_p = gr.Slider(
- label="Top-P",
- minimum=0.6,
- maximum=0.9,
- value=0.7,
- step=0.01,
- )
-
- repetition_penalty = gr.Slider(
- label=i18n("Repetition Penalty"),
- minimum=1,
- maximum=1.5,
- value=1.2,
- step=0.01,
- )
-
- temperature = gr.Slider(
- label="Temperature",
- minimum=0.6,
- maximum=0.9,
- value=0.7,
- step=0.01,
- )
-
- with gr.Tab(label=i18n("Reference Audio")):
- gr.Markdown(
- i18n(
- "5 to 10 seconds of reference audio, useful for specifying speaker."
- )
- )
-
- enable_reference_audio = gr.Checkbox(
- label=i18n("Enable Reference Audio"),
- )
- reference_audio = gr.Audio(
- label=i18n("Reference Audio"),
- type="filepath",
- )
- with gr.Row():
- reference_text = gr.Textbox(
- label=i18n("Reference Text"),
- lines=1,
- placeholder="在一无所知中,梦里的一天结束了,一个新的「轮回」便会开始。",
- value="",
- )
- with gr.Tab(label=i18n("Batch Inference")):
- batch_infer_num = gr.Slider(
- label="Batch infer nums",
- minimum=1,
- maximum=n_audios,
- step=1,
- value=1,
- )
+ with gr.Column():
+ with gr.Tab(label=i18n("Advanced Config")):
+ with gr.Row():
+ chunk_length = gr.Slider(
+ label=i18n("Iterative Prompt Length, 0 means off"),
+ minimum=50,
+ maximum=300,
+ value=200,
+ step=8,
+ )
+
+ max_new_tokens = gr.Slider(
+ label=i18n(
+ "Maximum tokens per batch, 0 means no limit"
+ ),
+ minimum=0,
+ maximum=2048,
+ value=0, # 0 means no limit
+ step=8,
+ )
+
+ with gr.Row():
+ top_p = gr.Slider(
+ label="Top-P",
+ minimum=0.6,
+ maximum=0.9,
+ value=0.7,
+ step=0.01,
+ )
+
+ repetition_penalty = gr.Slider(
+ label=i18n("Repetition Penalty"),
+ minimum=1,
+ maximum=1.5,
+ value=1.2,
+ step=0.01,
+ )
+
+ with gr.Row():
+ temperature = gr.Slider(
+ label="Temperature",
+ minimum=0.6,
+ maximum=0.9,
+ value=0.7,
+ step=0.01,
+ )
+ seed = gr.Textbox(
+ label="Seed",
+ info="0 means randomized inference, otherwise deterministic",
+ placeholder="any 32-bit-integer",
+ value="0",
+ )
+
+ with gr.Tab(label=i18n("Reference Audio")):
+ with gr.Row():
+ gr.Markdown(
+ i18n(
+ "5 to 10 seconds of reference audio, useful for specifying speaker."
+ )
+ )
+ with gr.Row():
+ enable_reference_audio = gr.Checkbox(
+ label=i18n("Enable Reference Audio"),
+ )
+
+ with gr.Row():
+ example_audio_dropdown = gr.Dropdown(
+ label=i18n("Select Example Audio"),
+ choices=[""],
+ value="",
+ interactive=True,
+ allow_custom_value=True,
+ )
+ with gr.Row():
+ reference_audio = gr.Audio(
+ label=i18n("Reference Audio"),
+ type="filepath",
+ )
+ with gr.Row():
+ reference_text = gr.Textbox(
+ label=i18n("Reference Text"),
+ lines=1,
+ placeholder="在一无所知中,梦里的一天结束了,一个新的「轮回」便会开始。",
+ value="",
+ )
+ with gr.Tab(label=i18n("Batch Inference")):
+ with gr.Row():
+ batch_infer_num = gr.Slider(
+ label="Batch infer nums",
+ minimum=1,
+ maximum=n_audios,
+ step=1,
+ value=1,
+ )
 
  with gr.Column(scale=3):
  for _ in range(n_audios):
@@ -383,6 +422,28 @@ def build_app():
  fn=normalize_text, inputs=[text, if_refine_text], outputs=[refined_text]
  )
 
+ def select_example_audio(audio_path):
+ audio_path = Path(audio_path)
+ if audio_path.is_file():
+ lab_file = Path(audio_path.with_suffix(".lab"))
+
+ if lab_file.exists():
+ lab_content = lab_file.read_text(encoding="utf-8").strip()
+ else:
+ lab_content = ""
+
+ return str(audio_path), lab_content, True
+ return None, "", False
+
+ # Connect the dropdown to update reference audio and text
+ example_audio_dropdown.change(
+ fn=update_examples, inputs=[], outputs=[example_audio_dropdown]
+ ).then(
+ fn=select_example_audio,
+ inputs=[example_audio_dropdown],
+ outputs=[reference_audio, reference_text, enable_reference_audio],
+ )
+
 
  # # Submit
  generate.click(
@@ -396,6 +457,7 @@ def build_app():
  top_p,
  repetition_penalty,
  temperature,
+ seed,
  batch_infer_num,
  ],
  [stream_audio, *global_audio_list, *global_error_list],
@@ -414,9 +476,10 @@ def build_app():
  top_p,
  repetition_penalty,
  temperature,
+ seed,
  ],
  [stream_audio, global_audio_list[0], global_error_list[0]],
- concurrency_limit=10,
+ concurrency_limit=1,
  )
  return app
 
@@ -471,7 +534,7 @@ if __name__ == "__main__":
  enable_reference_audio=False,
  reference_audio=None,
  reference_text="",
- max_new_tokens=1024,
+ max_new_tokens=0,
  chunk_length=200,
  top_p=0.7,
  repetition_penalty=1.2,
xinference/types.py CHANGED
@@ -71,7 +71,8 @@ class EmbeddingUsage(TypedDict):
  class EmbeddingData(TypedDict):
  index: int
  object: str
- embedding: List[float]
+ # support sparse embedding
+ embedding: Union[List[float], Dict[str, float]]
 
 
  class Embedding(TypedDict):
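A small illustration of what the widened EmbeddingData type now admits; the field values are hypothetical, with a dense embedding remaining a list of floats and a sparse embedding mapping token ids to weights:

from xinference.types import EmbeddingData

# Hypothetical payloads: the same TypedDict now covers both shapes.
dense: EmbeddingData = {"index": 0, "object": "embedding", "embedding": [0.12, -0.03, 0.57]}
sparse: EmbeddingData = {"index": 1, "object": "embedding", "embedding": {"1045": 0.82, "2310": 0.11}}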