xinference 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_compat.py +24 -2
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +219 -77
- xinference/client/restful/restful_client.py +47 -2
- xinference/constants.py +1 -0
- xinference/core/chat_interface.py +6 -1
- xinference/core/model.py +124 -34
- xinference/core/supervisor.py +180 -12
- xinference/core/utils.py +73 -4
- xinference/core/worker.py +102 -4
- xinference/deploy/cmdline.py +3 -1
- xinference/deploy/test/test_cmdline.py +56 -0
- xinference/isolation.py +24 -0
- xinference/model/audio/__init__.py +12 -0
- xinference/model/audio/core.py +37 -4
- xinference/model/audio/cosyvoice.py +39 -6
- xinference/model/audio/f5tts.py +200 -0
- xinference/model/audio/f5tts_mlx.py +260 -0
- xinference/model/audio/fish_speech.py +70 -110
- xinference/model/audio/melotts.py +110 -0
- xinference/model/audio/model_spec.json +179 -3
- xinference/model/audio/model_spec_modelscope.json +27 -0
- xinference/model/audio/utils.py +32 -0
- xinference/model/audio/whisper.py +35 -10
- xinference/model/audio/whisper_mlx.py +208 -0
- xinference/model/embedding/core.py +322 -6
- xinference/model/embedding/model_spec.json +8 -1
- xinference/model/embedding/model_spec_modelscope.json +9 -1
- xinference/model/image/core.py +69 -1
- xinference/model/image/model_spec.json +145 -4
- xinference/model/image/model_spec_modelscope.json +150 -4
- xinference/model/image/stable_diffusion/core.py +50 -15
- xinference/model/llm/__init__.py +6 -2
- xinference/model/llm/llm_family.json +1055 -93
- xinference/model/llm/llm_family.py +15 -36
- xinference/model/llm/llm_family_modelscope.json +1031 -78
- xinference/model/llm/memory.py +1 -1
- xinference/model/llm/mlx/core.py +285 -47
- xinference/model/llm/sglang/core.py +2 -0
- xinference/model/llm/transformers/chatglm.py +9 -5
- xinference/model/llm/transformers/cogagent.py +272 -0
- xinference/model/llm/transformers/core.py +3 -0
- xinference/model/llm/transformers/glm_edge_v.py +230 -0
- xinference/model/llm/transformers/qwen2_vl.py +12 -1
- xinference/model/llm/transformers/utils.py +16 -8
- xinference/model/llm/utils.py +55 -4
- xinference/model/llm/vllm/core.py +137 -12
- xinference/model/llm/vllm/xavier/__init__.py +13 -0
- xinference/model/llm/vllm/xavier/allocator.py +74 -0
- xinference/model/llm/vllm/xavier/block.py +111 -0
- xinference/model/llm/vllm/xavier/block_manager.py +71 -0
- xinference/model/llm/vllm/xavier/block_tracker.py +129 -0
- xinference/model/llm/vllm/xavier/collective.py +74 -0
- xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
- xinference/model/llm/vllm/xavier/engine.py +247 -0
- xinference/model/llm/vllm/xavier/executor.py +134 -0
- xinference/model/llm/vllm/xavier/scheduler.py +438 -0
- xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
- xinference/model/llm/vllm/xavier/test/test_xavier.py +147 -0
- xinference/model/llm/vllm/xavier/transfer.py +319 -0
- xinference/model/rerank/core.py +11 -4
- xinference/model/video/diffusers.py +14 -0
- xinference/model/video/model_spec.json +15 -0
- xinference/model/video/model_spec_modelscope.json +16 -0
- xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
- xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
- xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
- xinference/thirdparty/cosyvoice/bin/train.py +42 -8
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
- xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
- xinference/thirdparty/cosyvoice/cli/model.py +330 -80
- xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
- xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
- xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
- xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
- xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
- xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
- xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
- xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
- xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
- xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
- xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
- xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
- xinference/thirdparty/cosyvoice/utils/common.py +28 -1
- xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
- xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
- xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
- xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
- xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
- xinference/thirdparty/f5_tts/api.py +166 -0
- xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
- xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
- xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
- xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
- xinference/thirdparty/f5_tts/eval/README.md +49 -0
- xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
- xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
- xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
- xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
- xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
- xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
- xinference/thirdparty/f5_tts/infer/README.md +191 -0
- xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
- xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
- xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
- xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
- xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
- xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
- xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
- xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
- xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
- xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
- xinference/thirdparty/f5_tts/model/__init__.py +10 -0
- xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
- xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
- xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
- xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
- xinference/thirdparty/f5_tts/model/cfm.py +285 -0
- xinference/thirdparty/f5_tts/model/dataset.py +319 -0
- xinference/thirdparty/f5_tts/model/modules.py +658 -0
- xinference/thirdparty/f5_tts/model/trainer.py +366 -0
- xinference/thirdparty/f5_tts/model/utils.py +185 -0
- xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
- xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
- xinference/thirdparty/f5_tts/socket_server.py +159 -0
- xinference/thirdparty/f5_tts/train/README.md +77 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
- xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
- xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
- xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
- xinference/thirdparty/f5_tts/train/train.py +75 -0
- xinference/thirdparty/fish_speech/fish_speech/conversation.py +266 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +137 -29
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +17 -11
- xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
- xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
- xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
- xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
- xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +2 -2
- xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +34 -18
- xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
- xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
- xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
- xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
- xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
- xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
- xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
- xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
- xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
- xinference/thirdparty/fish_speech/tools/llama/generate.py +484 -72
- xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
- xinference/thirdparty/fish_speech/tools/schema.py +170 -0
- xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
- xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
- xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
- xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
- xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
- xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
- xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
- xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
- xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
- xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
- xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
- xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
- xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
- xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
- xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
- xinference/thirdparty/matcha/utils/utils.py +2 -2
- xinference/thirdparty/melo/api.py +135 -0
- xinference/thirdparty/melo/app.py +61 -0
- xinference/thirdparty/melo/attentions.py +459 -0
- xinference/thirdparty/melo/commons.py +160 -0
- xinference/thirdparty/melo/configs/config.json +94 -0
- xinference/thirdparty/melo/data/example/metadata.list +20 -0
- xinference/thirdparty/melo/data_utils.py +413 -0
- xinference/thirdparty/melo/download_utils.py +67 -0
- xinference/thirdparty/melo/infer.py +25 -0
- xinference/thirdparty/melo/init_downloads.py +14 -0
- xinference/thirdparty/melo/losses.py +58 -0
- xinference/thirdparty/melo/main.py +36 -0
- xinference/thirdparty/melo/mel_processing.py +174 -0
- xinference/thirdparty/melo/models.py +1030 -0
- xinference/thirdparty/melo/modules.py +598 -0
- xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
- xinference/thirdparty/melo/monotonic_align/core.py +46 -0
- xinference/thirdparty/melo/preprocess_text.py +135 -0
- xinference/thirdparty/melo/split_utils.py +174 -0
- xinference/thirdparty/melo/text/__init__.py +35 -0
- xinference/thirdparty/melo/text/chinese.py +199 -0
- xinference/thirdparty/melo/text/chinese_bert.py +107 -0
- xinference/thirdparty/melo/text/chinese_mix.py +253 -0
- xinference/thirdparty/melo/text/cleaner.py +36 -0
- xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
- xinference/thirdparty/melo/text/cmudict.rep +129530 -0
- xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
- xinference/thirdparty/melo/text/english.py +284 -0
- xinference/thirdparty/melo/text/english_bert.py +39 -0
- xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
- xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
- xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
- xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
- xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
- xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
- xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
- xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
- xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
- xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
- xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
- xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
- xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
- xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
- xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
- xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
- xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
- xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
- xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
- xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
- xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
- xinference/thirdparty/melo/text/french.py +94 -0
- xinference/thirdparty/melo/text/french_bert.py +39 -0
- xinference/thirdparty/melo/text/japanese.py +647 -0
- xinference/thirdparty/melo/text/japanese_bert.py +49 -0
- xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
- xinference/thirdparty/melo/text/korean.py +192 -0
- xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
- xinference/thirdparty/melo/text/spanish.py +122 -0
- xinference/thirdparty/melo/text/spanish_bert.py +39 -0
- xinference/thirdparty/melo/text/symbols.py +290 -0
- xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
- xinference/thirdparty/melo/train.py +635 -0
- xinference/thirdparty/melo/train.sh +19 -0
- xinference/thirdparty/melo/transforms.py +209 -0
- xinference/thirdparty/melo/utils.py +424 -0
- xinference/types.py +17 -1
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
- xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
- xinference/web/ui/build/static/js/main.b0936c54.js +3 -0
- xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +67 -3
- xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
- xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
- xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
- xinference/web/ui/node_modules/i18next/package.json +129 -0
- xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
- xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
- xinference/web/ui/node_modules/react-i18next/package.json +162 -0
- xinference/web/ui/node_modules/void-elements/package.json +34 -0
- xinference/web/ui/package-lock.json +69 -3
- xinference/web/ui/package.json +2 -0
- xinference/web/ui/src/locales/en.json +186 -0
- xinference/web/ui/src/locales/zh.json +186 -0
- {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/METADATA +96 -36
- {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/RECORD +335 -146
- {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/WHEEL +1 -1
- xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/api.py +0 -440
- xinference/thirdparty/fish_speech/tools/commons.py +0 -35
- xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -34
- xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/webui.py +0 -485
- xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
- xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
- xinference/web/ui/build/static/js/main.2f269bb3.js +0 -3
- xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
- /xinference/thirdparty/{cosyvoice/bin → f5_tts}/__init__.py +0 -0
- /xinference/thirdparty/{cosyvoice/flow → melo}/__init__.py +0 -0
- /xinference/thirdparty/{cosyvoice/hifigan → melo/text/english_utils}/__init__.py +0 -0
- /xinference/thirdparty/{cosyvoice/llm → melo/text/es_phonemizer}/__init__.py +0 -0
- /xinference/thirdparty/{fish_speech/fish_speech/configs → melo/text/fr_phonemizer}/__init__.py +0 -0
- /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
- {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/LICENSE +0 -0
- {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.16.3.dist-info → xinference-1.2.1.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/xinference/thirdparty/fish_speech/tools/fish_e2e.py
@@ -0,0 +1,298 @@
+import base64
+import ctypes
+import io
+import json
+import os
+import struct
+from dataclasses import dataclass
+from enum import Enum
+from typing import AsyncGenerator, Union
+
+import httpx
+import numpy as np
+import ormsgpack
+import soundfile as sf
+
+from .schema import (
+    ServeChatRequest,
+    ServeMessage,
+    ServeTextPart,
+    ServeVQGANDecodeRequest,
+    ServeVQGANEncodeRequest,
+    ServeVQPart,
+)
+
+
+class CustomAudioFrame:
+    def __init__(self, data, sample_rate, num_channels, samples_per_channel):
+        if len(data) < num_channels * samples_per_channel * ctypes.sizeof(
+            ctypes.c_int16
+        ):
+            raise ValueError(
+                "data length must be >= num_channels * samples_per_channel * sizeof(int16)"
+            )
+
+        self._data = bytearray(data)
+        self._sample_rate = sample_rate
+        self._num_channels = num_channels
+        self._samples_per_channel = samples_per_channel
+
+    @property
+    def data(self):
+        return memoryview(self._data).cast("h")
+
+    @property
+    def sample_rate(self):
+        return self._sample_rate
+
+    @property
+    def num_channels(self):
+        return self._num_channels
+
+    @property
+    def samples_per_channel(self):
+        return self._samples_per_channel
+
+    @property
+    def duration(self):
+        return self.samples_per_channel / self.sample_rate
+
+    def __repr__(self):
+        return (
+            f"CustomAudioFrame(sample_rate={self.sample_rate}, "
+            f"num_channels={self.num_channels}, "
+            f"samples_per_channel={self.samples_per_channel}, "
+            f"duration={self.duration:.3f})"
+        )
+
+
+class FishE2EEventType(Enum):
+    SPEECH_SEGMENT = 1
+    TEXT_SEGMENT = 2
+    END_OF_TEXT = 3
+    END_OF_SPEECH = 4
+    ASR_RESULT = 5
+    USER_CODES = 6
+
+
+@dataclass
+class FishE2EEvent:
+    type: FishE2EEventType
+    frame: np.ndarray = None
+    text: str = None
+    vq_codes: list[list[int]] = None
+
+
+client = httpx.AsyncClient(
+    timeout=None,
+    limits=httpx.Limits(
+        max_connections=None,
+        max_keepalive_connections=None,
+        keepalive_expiry=None,
+    ),
+)
+
+
+class FishE2EAgent:
+    def __init__(self):
+        self.llm_url = "http://localhost:8080/v1/chat"
+        self.vqgan_url = "http://localhost:8080"
+        self.client = httpx.AsyncClient(timeout=None)
+
+    async def get_codes(self, audio_data, sample_rate):
+        audio_buffer = io.BytesIO()
+        sf.write(audio_buffer, audio_data, sample_rate, format="WAV")
+        audio_buffer.seek(0)
+        # Step 1: Encode audio using VQGAN
+        encode_request = ServeVQGANEncodeRequest(audios=[audio_buffer.read()])
+        encode_request_bytes = ormsgpack.packb(
+            encode_request, option=ormsgpack.OPT_SERIALIZE_PYDANTIC
+        )
+        encode_response = await self.client.post(
+            f"{self.vqgan_url}/v1/vqgan/encode",
+            data=encode_request_bytes,
+            headers={"Content-Type": "application/msgpack"},
+        )
+        encode_response_data = ormsgpack.unpackb(encode_response.content)
+        codes = encode_response_data["tokens"][0]
+        return codes
+
+    async def stream(
+        self,
+        system_audio_data: np.ndarray | None,
+        user_audio_data: np.ndarray | None,
+        sample_rate: int,
+        num_channels: int,
+        chat_ctx: dict | None = None,
+    ) -> AsyncGenerator[bytes, None]:
+
+        if system_audio_data is not None:
+            sys_codes = await self.get_codes(system_audio_data, sample_rate)
+        else:
+            sys_codes = None
+        if user_audio_data is not None:
+            user_codes = await self.get_codes(user_audio_data, sample_rate)
+        # Step 2: Prepare LLM request
+        if chat_ctx is None:
+            sys_parts = [
+                ServeTextPart(
+                    text='您是由 Fish Audio 设计的语音助手,提供端到端的语音交互,实现无缝用户体验。首先转录用户的语音,然后使用以下格式回答:"Question: [用户语音]\n\nAnswer: [你的回答]\n"。'
+                ),
+            ]
+            if system_audio_data is not None:
+                sys_parts.append(ServeVQPart(codes=sys_codes))
+            chat_ctx = {
+                "messages": [
+                    ServeMessage(
+                        role="system",
+                        parts=sys_parts,
+                    ),
+                ],
+            }
+        else:
+            if chat_ctx["added_sysaudio"] is False and sys_codes:
+                chat_ctx["added_sysaudio"] = True
+                chat_ctx["messages"][0].parts.append(ServeVQPart(codes=sys_codes))
+
+        prev_messages = chat_ctx["messages"].copy()
+        if user_audio_data is not None:
+            yield FishE2EEvent(
+                type=FishE2EEventType.USER_CODES,
+                vq_codes=user_codes,
+            )
+        else:
+            user_codes = None
+
+        request = ServeChatRequest(
+            messages=prev_messages
+            + (
+                [
+                    ServeMessage(
+                        role="user",
+                        parts=[ServeVQPart(codes=user_codes)],
+                    )
+                ]
+                if user_codes
+                else []
+            ),
+            streaming=True,
+            num_samples=1,
+        )
+
+        # Step 3: Stream LLM response and decode audio
+        buffer = b""
+        vq_codes = []
+        current_vq = False
+
+        async def decode_send():
+            nonlocal current_vq
+            nonlocal vq_codes
+
+            data = np.concatenate(vq_codes, axis=1).tolist()
+            # Decode VQ codes to audio
+            decode_request = ServeVQGANDecodeRequest(tokens=[data])
+            decode_response = await self.client.post(
+                f"{self.vqgan_url}/v1/vqgan/decode",
+                data=ormsgpack.packb(
+                    decode_request,
+                    option=ormsgpack.OPT_SERIALIZE_PYDANTIC,
+                ),
+                headers={"Content-Type": "application/msgpack"},
+            )
+            decode_data = ormsgpack.unpackb(decode_response.content)
+
+            # Convert float16 audio data to int16
+            audio_data = np.frombuffer(decode_data["audios"][0], dtype=np.float16)
+            audio_data = (audio_data * 32768).astype(np.int16).tobytes()
+
+            audio_frame = CustomAudioFrame(
+                data=audio_data,
+                samples_per_channel=len(audio_data) // 2,
+                sample_rate=44100,
+                num_channels=1,
+            )
+            yield FishE2EEvent(
+                type=FishE2EEventType.SPEECH_SEGMENT,
+                frame=audio_frame,
+                vq_codes=data,
+            )
+
+            current_vq = False
+            vq_codes = []
+
+        async with self.client.stream(
+            "POST",
+            self.llm_url,
+            data=ormsgpack.packb(request, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
+            headers={"Content-Type": "application/msgpack"},
+        ) as response:
+
+            async for chunk in response.aiter_bytes():
+                buffer += chunk
+
+                while len(buffer) >= 4:
+                    read_length = struct.unpack("I", buffer[:4])[0]
+                    if len(buffer) < 4 + read_length:
+                        break
+
+                    body = buffer[4 : 4 + read_length]
+                    buffer = buffer[4 + read_length :]
+                    data = ormsgpack.unpackb(body)
+
+                    if data["delta"] and data["delta"]["part"]:
+                        if current_vq and data["delta"]["part"]["type"] == "text":
+                            async for event in decode_send():
+                                yield event
+                        if data["delta"]["part"]["type"] == "text":
+                            yield FishE2EEvent(
+                                type=FishE2EEventType.TEXT_SEGMENT,
+                                text=data["delta"]["part"]["text"],
+                            )
+                        elif data["delta"]["part"]["type"] == "vq":
+                            vq_codes.append(np.array(data["delta"]["part"]["codes"]))
+                            current_vq = True
+
+        if current_vq and vq_codes:
+            async for event in decode_send():
+                yield event
+
+        yield FishE2EEvent(type=FishE2EEventType.END_OF_TEXT)
+        yield FishE2EEvent(type=FishE2EEventType.END_OF_SPEECH)
+
+
+# Example usage:
+async def main():
+    import torchaudio
+
+    agent = FishE2EAgent()
+
+    # Replace this with actual audio data loading
+    with open("uz_story_en.m4a", "rb") as f:
+        audio_data = f.read()
+
+    audio_data, sample_rate = torchaudio.load("uz_story_en.m4a")
+    audio_data = (audio_data.numpy() * 32768).astype(np.int16)
+
+    stream = agent.stream(audio_data, sample_rate, 1)
+    if os.path.exists("audio_segment.wav"):
+        os.remove("audio_segment.wav")
+
+    async for event in stream:
+        if event.type == FishE2EEventType.SPEECH_SEGMENT:
+            # Handle speech segment (e.g., play audio or save to file)
+            with open("audio_segment.wav", "ab+") as f:
+                f.write(event.frame.data)
+        elif event.type == FishE2EEventType.ASR_RESULT:
+            print(event.text, flush=True)
+        elif event.type == FishE2EEventType.TEXT_SEGMENT:
+            print(event.text, flush=True, end="")
+        elif event.type == FishE2EEventType.END_OF_TEXT:
+            print("\nEnd of text reached.")
+        elif event.type == FishE2EEventType.END_OF_SPEECH:
+            print("End of speech reached.")
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    asyncio.run(main())
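The `stream()` loop above speaks a simple length-prefixed protocol: each chat event arrives as a 4-byte length prefix (struct format `"I"`) followed by an ormsgpack-encoded body. Below is a minimal, self-contained sketch of that framing; the sample payloads are illustrative stand-ins, not actual fish-speech server output.

```python
import struct

import ormsgpack


def pack_frame(payload: dict) -> bytes:
    """Frame one event the way the client loop above reads it back."""
    body = ormsgpack.packb(payload)
    return struct.pack("I", len(body)) + body


def iter_frames(buffer: bytes):
    """Yield decoded events from a byte buffer; mirrors the while-loop above."""
    while len(buffer) >= 4:
        (length,) = struct.unpack("I", buffer[:4])
        if len(buffer) < 4 + length:
            break  # incomplete frame: wait for more bytes
        yield ormsgpack.unpackb(buffer[4 : 4 + length])
        buffer = buffer[4 + length :]


frames = pack_frame({"delta": {"part": {"type": "text", "text": "Hello"}}})
frames += pack_frame({"delta": {"part": {"type": "vq", "codes": [[1, 2, 3]]}}})
for event in iter_frames(frames):
    print(event["delta"]["part"]["type"])  # -> text, vq
```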
--- /dev/null
+++ b/xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py
@@ -0,0 +1,192 @@
+import gc
+import queue
+from typing import Generator
+
+import numpy as np
+import torch
+from loguru import logger
+
+from fish_speech.models.vqgan.modules.firefly import FireflyArchitecture
+from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
+from fish_speech.utils import autocast_exclude_mps, set_seed
+from tools.inference_engine.reference_loader import ReferenceLoader
+from tools.inference_engine.utils import InferenceResult, wav_chunk_header
+from tools.inference_engine.vq_manager import VQManager
+from tools.llama.generate import (
+    GenerateRequest,
+    GenerateResponse,
+    WrappedGenerateResponse,
+)
+from tools.schema import ServeTTSRequest
+
+
+class TTSInferenceEngine(ReferenceLoader, VQManager):
+
+    def __init__(
+        self,
+        llama_queue: queue.Queue,
+        decoder_model: FireflyArchitecture,
+        precision: torch.dtype,
+        compile: bool,
+    ) -> None:
+
+        super().__init__()
+
+        self.llama_queue = llama_queue
+        self.decoder_model = decoder_model
+        self.precision = precision
+        self.compile = compile
+
+    @torch.inference_mode()
+    def inference(self, req: ServeTTSRequest) -> Generator[InferenceResult, None, None]:
+        """
+        Main inference function:
+        - Loads the reference audio and text.
+        - Calls the LLAMA model for inference.
+        - Decodes the VQ tokens to audio.
+        """
+
+        ref_id: str | None = req.reference_id
+        prompt_tokens, prompt_texts = [], []
+        # Load the reference audio and text based on id or hash
+        if ref_id is not None:
+            prompt_tokens, prompt_texts = self.load_by_id(ref_id, req.use_memory_cache)
+
+        elif req.references:
+            prompt_tokens, prompt_texts = self.load_by_hash(
+                req.references, req.use_memory_cache
+            )
+
+        # Set the random seed if provided
+        if req.seed is not None:
+            set_seed(req.seed)
+            logger.warning(f"set seed: {req.seed}")
+
+        # Get the symbolic tokens from the LLAMA model
+        response_queue = self.send_Llama_request(req, prompt_tokens, prompt_texts)
+
+        # Get the sample rate from the decoder model
+        sample_rate = self.decoder_model.spec_transform.sample_rate
+
+        # If streaming, send the header
+        # if req.streaming:
+        #     yield InferenceResult(
+        #         code="header",
+        #         audio=(sample_rate, wav_chunk_header(sample_rate=sample_rate)),
+        #         error=None,
+        #     )
+
+        segments = []
+
+        while True:
+            # Get the response from the LLAMA model
+            wrapped_result: WrappedGenerateResponse = response_queue.get()
+            if wrapped_result.status == "error":
+                yield InferenceResult(
+                    code="error",
+                    audio=None,
+                    error=(
+                        wrapped_result.response
+                        if isinstance(wrapped_result.response, Exception)
+                        else Exception("Unknown error")
+                    ),
+                )
+                break
+
+            # Check the response type
+            if not isinstance(wrapped_result.response, GenerateResponse):
+                raise TypeError(
+                    "Expected GenerateResponse, got {type(wrapped_result.response).__name__}"
+                )
+
+            result: GenerateResponse = wrapped_result.response
+            if result.action != "next":
+                segment = self.get_audio_segment(result)
+
+                if req.streaming:  # Used only by the API server
+                    yield InferenceResult(
+                        code="segment",
+                        audio=(sample_rate, segment),
+                        error=None,
+                    )
+                segments.append(segment)
+            else:
+                break
+
+        # Clean up the memory
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            gc.collect()
+
+        # Edge case: no audio generated
+        if len(segments) == 0:
+            yield InferenceResult(
+                code="error",
+                audio=None,
+                error=RuntimeError("No audio generated, please check the input text."),
+            )
+        else:
+            # Streaming or not, return the final audio
+            audio = np.concatenate(segments, axis=0)
+            yield InferenceResult(
+                code="final",
+                audio=(sample_rate, audio),
+                error=None,
+            )
+
+        return None
+
+    def send_Llama_request(
+        self, req: ServeTTSRequest, prompt_tokens: list, prompt_texts: list
+    ) -> queue.Queue:
+        """
+        Send a request to the LLAMA model to generate the symbolic tokens.
+        """
+
+        # Prepare the request
+        request = dict(
+            device=self.decoder_model.device,
+            max_new_tokens=req.max_new_tokens,
+            text=(
+                req.text
+                if not req.normalize
+                else ChnNormedText(raw_text=req.text).normalize()
+            ),
+            top_p=req.top_p,
+            repetition_penalty=req.repetition_penalty,
+            temperature=req.temperature,
+            compile=self.compile,
+            iterative_prompt=req.chunk_length > 0,
+            chunk_length=req.chunk_length,
+            max_length=4096,
+            prompt_tokens=prompt_tokens,
+            prompt_text=prompt_texts,
+        )
+
+        # Create a queue to get the response
+        response_queue = queue.Queue()
+
+        # Send the request to the LLAMA model
+        self.llama_queue.put(
+            GenerateRequest(
+                request=request,
+                response_queue=response_queue,
+            )
+        )
+
+        return response_queue
+
+    def get_audio_segment(self, result: GenerateResponse) -> np.ndarray:
+        """
+        Decode the VQ tokens to audio.
+        """
+
+        # Don't use autocast on MPS devices
+        with autocast_exclude_mps(
+            device_type=self.decoder_model.device.type, dtype=self.precision
+        ):
+            # Decode the symbolic tokens to audio
+            segment = self.decode_vq_tokens(codes=result.codes)
+
+        # Convert the audio to numpy
+        return segment.float().cpu().numpy()
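Callers drain `inference()` as a generator and branch on `InferenceResult.code`. A hedged sketch of the consuming side follows; how `engine` and `req` are constructed is assumed to happen elsewhere (the fish-speech API server wires those up), so only the result handling below follows directly from the code above.

```python
import soundfile as sf


def synthesize_to_file(engine, req, out_path: str = "out.wav") -> None:
    """Drain TTSInferenceEngine.inference() and write the final audio."""
    for res in engine.inference(req):
        if res.code == "error":
            raise res.error  # an Exception instance, per InferenceResult
        elif res.code == "segment":
            # Only emitted when req.streaming is True; res.audio is
            # (sample_rate, np.ndarray) for one chunk of the utterance.
            pass
        elif res.code == "final":
            sample_rate, audio = res.audio
            sf.write(out_path, audio, sample_rate)
```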
--- /dev/null
+++ b/xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py
@@ -0,0 +1,125 @@
+import io
+from hashlib import sha256
+from pathlib import Path
+from typing import Callable, Literal, Tuple
+
+import torch
+import torchaudio
+from loguru import logger
+
+from fish_speech.models.vqgan.modules.firefly import FireflyArchitecture
+from tools.file import AUDIO_EXTENSIONS, audio_to_bytes, list_files, read_ref_text
+from tools.schema import ServeReferenceAudio
+
+
+class ReferenceLoader:
+
+    def __init__(self) -> None:
+        """
+        Component of the TTSInferenceEngine class.
+        Loads and manages the cache for the reference audio and text.
+        """
+        self.ref_by_id: dict = {}
+        self.ref_by_hash: dict = {}
+
+        # Make Pylance happy (attribut/method not defined...)
+        self.decoder_model: FireflyArchitecture
+        self.encode_reference: Callable
+
+        # Define the torchaudio backend
+        backends = torchaudio.list_audio_backends()
+        if "ffmpeg" in backends:
+            self.backend = "ffmpeg"
+        else:
+            self.backend = "soundfile"
+
+    def load_by_id(
+        self,
+        id: str,
+        use_cache: Literal["on", "off"],
+    ) -> Tuple:
+
+        # Load the references audio and text by id
+        ref_folder = Path("references") / id
+        ref_folder.mkdir(parents=True, exist_ok=True)
+        ref_audios = list_files(
+            ref_folder, AUDIO_EXTENSIONS, recursive=True, sort=False
+        )
+
+        if use_cache == "off" or id not in self.ref_by_id:
+            # If the references are not already loaded, encode them
+            prompt_tokens = [
+                self.encode_reference(
+                    # decoder_model=self.decoder_model,
+                    reference_audio=audio_to_bytes(str(ref_audio)),
+                    enable_reference_audio=True,
+                )
+                for ref_audio in ref_audios
+            ]
+            prompt_texts = [
+                read_ref_text(str(ref_audio.with_suffix(".lab")))
+                for ref_audio in ref_audios
+            ]
+            self.ref_by_id[id] = (prompt_tokens, prompt_texts)
+
+        else:
+            # Reuse already encoded references
+            logger.info("Use same references")
+            prompt_tokens, prompt_texts = self.ref_by_id[id]
+
+        return prompt_tokens, prompt_texts
+
+    def load_by_hash(
+        self,
+        references: list[ServeReferenceAudio],
+        use_cache: Literal["on", "off"],
+    ) -> Tuple:
+
+        # Load the references audio and text by hash
+        audio_hashes = [sha256(ref.audio).hexdigest() for ref in references]
+
+        cache_used = False
+        prompt_tokens, prompt_texts = [], []
+        for i, ref in enumerate(references):
+            if use_cache == "off" or audio_hashes[i] not in self.ref_by_hash:
+                # If the references are not already loaded, encode them
+                prompt_tokens.append(
+                    self.encode_reference(
+                        reference_audio=ref.audio,
+                        enable_reference_audio=True,
+                    )
+                )
+                prompt_texts.append(ref.text)
+                self.ref_by_hash[audio_hashes[i]] = (prompt_tokens, prompt_texts)
+
+            else:
+                # Reuse already encoded references
+                prompt_tokens, prompt_texts = self.ref_by_hash[audio_hashes[i]]
+                cache_used = True
+
+        if cache_used:
+            logger.info("Use same references")
+
+        return prompt_tokens, prompt_texts
+
+    def load_audio(self, reference_audio, sr):
+        """
+        Load the audio data from a file or bytes.
+        """
+        if len(reference_audio) > 255 or not Path(reference_audio).exists():
+            audio_data = reference_audio
+            reference_audio = io.BytesIO(audio_data)
+
+        waveform, original_sr = torchaudio.load(reference_audio, backend=self.backend)
+
+        if waveform.shape[0] > 1:
+            waveform = torch.mean(waveform, dim=0, keepdim=True)
+
+        if original_sr != sr:
+            resampler = torchaudio.transforms.Resample(
+                orig_freq=original_sr, new_freq=sr
+            )
+            waveform = resampler(waveform)
+
+        audio = waveform.squeeze().numpy()
+        return audio
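The hash-keyed path caches encoded prompts under `sha256(ref.audio)`, so repeated requests with the same clip skip the VQGAN encode. An illustrative call, assuming only what the code above reads from `ServeReferenceAudio` (an `audio: bytes` and a `text: str` field):

```python
from pathlib import Path

from tools.schema import ServeReferenceAudio


def encode_reference_once(loader, wav_path: str, transcript: str):
    """First call encodes and caches under sha256(audio); later calls
    with use_cache="on" are served from loader.ref_by_hash.

    `loader` is any instance mixing in ReferenceLoader, e.g. a
    TTSInferenceEngine.
    """
    ref = ServeReferenceAudio(
        audio=Path(wav_path).read_bytes(),
        text=transcript,
    )
    return loader.load_by_hash([ref], use_cache="on")
```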
--- /dev/null
+++ b/xinference/thirdparty/fish_speech/tools/inference_engine/utils.py
@@ -0,0 +1,39 @@
+import io
+import wave
+from dataclasses import dataclass
+from typing import Literal, Optional, Tuple
+
+import numpy as np
+
+from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
+
+
+@dataclass
+class InferenceResult:
+    code: Literal["header", "segment", "error", "final"]
+    audio: Optional[Tuple[int, np.ndarray | bytes]]
+    error: Optional[Exception]
+
+
+def normalize_text(user_input: str, use_normalization: bool) -> str:
+    """Normalize user input text if needed."""
+    if use_normalization:
+        return ChnNormedText(raw_text=user_input).normalize()
+    else:
+        return user_input
+
+
+def wav_chunk_header(
+    sample_rate: int = 44100, bit_depth: int = 16, channels: int = 1
+) -> bytes:
+    buffer = io.BytesIO()
+
+    with wave.open(buffer, "wb") as wav_file:
+        wav_file.setnchannels(channels)
+        wav_file.setsampwidth(bit_depth // 8)
+        wav_file.setframerate(sample_rate)
+
+    wav_header_bytes = buffer.getvalue()
+    buffer.close()
+
+    return wav_header_bytes
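`wav_chunk_header()` supports chunked responses (compare the commented-out `"header"` branch in `TTSInferenceEngine.inference()`): emit the RIFF/WAVE header once, then append raw 16-bit PCM for each segment. Since no frames are written, the header carries the size fields of an empty file, which streaming decoders commonly tolerate. A small sketch of that pattern; the stand-in segments below are placeholders for real model output:

```python
import numpy as np

from tools.inference_engine.utils import wav_chunk_header


def pcm16(segment: np.ndarray) -> bytes:
    """Convert a float waveform in [-1.0, 1.0] to 16-bit little-endian PCM."""
    return (np.clip(segment, -1.0, 1.0) * 32767).astype("<i2").tobytes()


with open("stream.wav", "wb") as f:
    f.write(wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1))
    for segment in (np.zeros(4410), 0.1 * np.ones(4410)):  # stand-in segments
        f.write(pcm16(segment))
```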