PyPI - xinference - Versions diffs - 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl - Mend

xinference 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (373) hide show

xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} RENAMED Viewed

@@ -8,14 +8,15 @@ import requests
 from pydub import AudioSegment
 from pydub.playback import play
-from tools.commons import ServeReferenceAudio, ServeTTSRequest
 from tools.file import audio_to_bytes, read_ref_text
+from tools.schema import ServeReferenceAudio, ServeTTSRequest
 def parse_args():
     parser = argparse.ArgumentParser(
-        description="Send a WAV file and text to a server and receive synthesized audio."
+        description="Send a WAV file and text to a server and receive synthesized audio.",
+        formatter_class=argparse.RawTextHelpFormatter,
     )
     parser.add_argument(
@@ -33,7 +34,7 @@ def parse_args():
         "-id",
         type=str,
         default=None,
-        help="ID of the reference model o be used for the speech",
+        help="ID of the reference model to be used for the speech\n(Local: name of folder containing audios and files)",
     )
     parser.add_argument(
         "--reference_audio",
@@ -41,7 +42,7 @@ def parse_args():
         type=str,
         nargs="+",
         default=None,
-        help="Path to the WAV file",
+        help="Path to the audio file",
     )
     parser.add_argument(
         "--reference_text",
@@ -68,17 +69,21 @@ def parse_args():
     parser.add_argument(
         "--format", type=str, choices=["wav", "mp3", "flac"], default="wav"
     )
-    parser.add_argument("--mp3_bitrate", type=int, default=64)
-    parser.add_argument("--opus_bitrate", type=int, default=-1000)
-    parser.add_argument("--latency", type=str, default="normal", help="延迟选项")
+    parser.add_argument(
+        "--latency",
+        type=str,
+        default="normal",
+        choices=["normal", "balanced"],
+        help="Used in api.fish.audio/v1/tts",
+    )
     parser.add_argument(
         "--max_new_tokens",
         type=int,
         default=1024,
-        help="Maximum new tokens to generate",
+        help="Maximum new tokens to generate. \n0 means no limit.",
     )
     parser.add_argument(
-        "--chunk_length", type=int, default=100, help="Chunk length for synthesis"
+        "--chunk_length", type=int, default=200, help="Chunk length for synthesis"
     )
     parser.add_argument(
         "--top_p", type=float, default=0.7, help="Top-p sampling for synthesis"
@@ -92,10 +97,7 @@ def parse_args():
     parser.add_argument(
         "--temperature", type=float, default=0.7, help="Temperature for sampling"
     )
-    parser.add_argument(
-        "--speaker", type=str, default=None, help="Speaker ID for voice synthesis"
-    )
-    parser.add_argument("--emotion", type=str, default=None, help="Speaker's Emotion")
     parser.add_argument(
         "--streaming", type=bool, default=False, help="Enable streaming response"
     )
@@ -103,6 +105,20 @@ def parse_args():
         "--channels", type=int, default=1, help="Number of audio channels"
     )
     parser.add_argument("--rate", type=int, default=44100, help="Sample rate for audio")
+    parser.add_argument(
+        "--use_memory_cache",
+        type=str,
+        default="off",
+        choices=["on", "off"],
+        help="Cache encoded references codes in memory.\n",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="`None` means randomized inference, otherwise deterministic.\n"
+        "It can't be used for fixing a timbre.",
+    )
     return parser.parse_args()
@@ -132,22 +148,22 @@ if __name__ == "__main__":
     data = {
         "text": args.text,
         "references": [
-            ServeReferenceAudio(audio=ref_audio, text=ref_text)
+            ServeReferenceAudio(
+                audio=ref_audio if ref_audio is not None else b"", text=ref_text
+            )
             for ref_text, ref_audio in zip(ref_texts, byte_audios)
         ],
         "reference_id": idstr,
         "normalize": args.normalize,
         "format": args.format,
-        "mp3_bitrate": args.mp3_bitrate,
-        "opus_bitrate": args.opus_bitrate,
         "max_new_tokens": args.max_new_tokens,
         "chunk_length": args.chunk_length,
         "top_p": args.top_p,
         "repetition_penalty": args.repetition_penalty,
         "temperature": args.temperature,
-        "speaker": args.speaker,
-        "emotion": args.emotion,
         "streaming": args.streaming,
+        "use_memory_cache": args.use_memory_cache,
+        "seed": args.seed,
     }
     pydantic_data = ServeTTSRequest(**data)

xinference/thirdparty/fish_speech/tools/api_server.py ADDED Viewed

@@ -0,0 +1,98 @@
+from threading import Lock
+import pyrootutils
+import uvicorn
+from kui.asgi import FactoryClass, HTTPException, HttpRoute, Kui, OpenAPI, Routes
+from loguru import logger
+pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
+from tools.server.api_utils import MsgPackRequest, parse_args
+from tools.server.exception_handler import ExceptionHandler
+from tools.server.model_manager import ModelManager
+from tools.server.views import (
+    ASRView,
+    ChatView,
+    HealthView,
+    TTSView,
+    VQGANDecodeView,
+    VQGANEncodeView,
+)
+class API(ExceptionHandler):
+    def __init__(self):
+        self.args = parse_args()
+        self.routes = [
+            ("/v1/health", HealthView),
+            ("/v1/vqgan/encode", VQGANEncodeView),
+            ("/v1/vqgan/decode", VQGANDecodeView),
+            ("/v1/asr", ASRView),
+            ("/v1/tts", TTSView),
+            ("/v1/chat", ChatView),
+        ]
+        self.routes = Routes([HttpRoute(path, view) for path, view in self.routes])
+        self.openapi = OpenAPI(
+            {
+                "title": "Fish Speech API",
+                "version": "1.5.0",
+            },
+        ).routes
+        # Initialize the app
+        self.app = Kui(
+            routes=self.routes + self.openapi[1:],  # Remove the default route
+            exception_handlers={
+                HTTPException: self.http_exception_handler,
+                Exception: self.other_exception_handler,
+            },
+            factory_class=FactoryClass(http=MsgPackRequest),
+            cors_config={},
+        )
+        # Add the state variables
+        self.app.state.lock = Lock()
+        self.app.state.device = self.args.device
+        self.app.state.max_text_length = self.args.max_text_length
+        # Associate the app with the model manager
+        self.app.on_startup(self.initialize_app)
+    async def initialize_app(self, app: Kui):
+        # Make the ModelManager available to the views
+        app.state.model_manager = ModelManager(
+            mode=self.args.mode,
+            device=self.args.device,
+            half=self.args.half,
+            compile=self.args.compile,
+            asr_enabled=self.args.load_asr_model,
+            llama_checkpoint_path=self.args.llama_checkpoint_path,
+            decoder_checkpoint_path=self.args.decoder_checkpoint_path,
+            decoder_config_name=self.args.decoder_config_name,
+        )
+        logger.info(f"Startup done, listening server at http://{self.args.listen}")
+# Each worker process created by Uvicorn has its own memory space,
+# meaning that models and variables are not shared between processes.
+# Therefore, any variables (like `llama_queue` or `decoder_model`)
+# will not be shared across workers.
+# Multi-threading for deep learning can cause issues, such as inconsistent
+# outputs if multiple threads access the same buffers simultaneously.
+# Instead, it's better to use multiprocessing or independent models per thread.
+if __name__ == "__main__":
+    api = API()
+    host, port = api.args.listen.split(":")
+    uvicorn.run(
+        api.app,
+        host=host,
+        port=int(port),
+        workers=api.args.workers,
+        log_level="info",
+    )

xinference/thirdparty/fish_speech/tools/download_models.py CHANGED Viewed

@@ -22,14 +22,14 @@ def check_and_download_files(repo_id, file_list, local_dir):
 # 1st
-repo_id_1 = "fishaudio/fish-speech-1.4"
-local_dir_1 = "./checkpoints/fish-speech-1.4"
+repo_id_1 = "fishaudio/fish-speech-1.5"
+local_dir_1 = "./checkpoints/fish-speech-1.5"
 files_1 = [
+    "gitattributes",
     "model.pth",
     "README.md",
-    "special_tokens_map.json",
-    "tokenizer_config.json",
-    "tokenizer.json",
+    "special_tokens.json",
+    "tokenizer.tiktoken",
     "config.json",
     "firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
 ]

xinference/thirdparty/fish_speech/tools/e2e_webui.py ADDED Viewed

@@ -0,0 +1,232 @@
+import io
+import re
+import wave
+import gradio as gr
+import numpy as np
+from .fish_e2e import FishE2EAgent, FishE2EEventType
+from .schema import ServeMessage, ServeTextPart, ServeVQPart
+def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
+    buffer = io.BytesIO()
+    with wave.open(buffer, "wb") as wav_file:
+        wav_file.setnchannels(channels)
+        wav_file.setsampwidth(bit_depth // 8)
+        wav_file.setframerate(sample_rate)
+    wav_header_bytes = buffer.getvalue()
+    buffer.close()
+    return wav_header_bytes
+class ChatState:
+    def __init__(self):
+        self.conversation = []
+        self.added_systext = False
+        self.added_sysaudio = False
+    def get_history(self):
+        results = []
+        for msg in self.conversation:
+            results.append({"role": msg.role, "content": self.repr_message(msg)})
+        # Process assistant messages to extract questions and update user messages
+        for i, msg in enumerate(results):
+            if msg["role"] == "assistant":
+                match = re.search(r"Question: (.*?)\n\nResponse:", msg["content"])
+                if match and i > 0 and results[i - 1]["role"] == "user":
+                    # Update previous user message with extracted question
+                    results[i - 1]["content"] += "\n" + match.group(1)
+                    # Remove the Question/Answer format from assistant message
+                    msg["content"] = msg["content"].split("\n\nResponse: ", 1)[1]
+        return results
+    def repr_message(self, msg: ServeMessage):
+        response = ""
+        for part in msg.parts:
+            if isinstance(part, ServeTextPart):
+                response += part.text
+            elif isinstance(part, ServeVQPart):
+                response += f"<audio {len(part.codes[0]) / 21:.2f}s>"
+        return response
+def clear_fn():
+    return [], ChatState(), None, None, None
+async def process_audio_input(
+    sys_audio_input, sys_text_input, audio_input, state: ChatState, text_input: str
+):
+    if audio_input is None and not text_input:
+        raise gr.Error("No input provided")
+    agent = FishE2EAgent()  # Create new agent instance for each request
+    # Convert audio input to numpy array
+    if isinstance(audio_input, tuple):
+        sr, audio_data = audio_input
+    elif text_input:
+        sr = 44100
+        audio_data = None
+    else:
+        raise gr.Error("Invalid audio format")
+    if isinstance(sys_audio_input, tuple):
+        sr, sys_audio_data = sys_audio_input
+    else:
+        sr = 44100
+        sys_audio_data = None
+    def append_to_chat_ctx(
+        part: ServeTextPart | ServeVQPart, role: str = "assistant"
+    ) -> None:
+        if not state.conversation or state.conversation[-1].role != role:
+            state.conversation.append(ServeMessage(role=role, parts=[part]))
+        else:
+            state.conversation[-1].parts.append(part)
+    if state.added_systext is False and sys_text_input:
+        state.added_systext = True
+        append_to_chat_ctx(ServeTextPart(text=sys_text_input), role="system")
+    if text_input:
+        append_to_chat_ctx(ServeTextPart(text=text_input), role="user")
+        audio_data = None
+    result_audio = b""
+    async for event in agent.stream(
+        sys_audio_data,
+        audio_data,
+        sr,
+        1,
+        chat_ctx={
+            "messages": state.conversation,
+            "added_sysaudio": state.added_sysaudio,
+        },
+    ):
+        if event.type == FishE2EEventType.USER_CODES:
+            append_to_chat_ctx(ServeVQPart(codes=event.vq_codes), role="user")
+        elif event.type == FishE2EEventType.SPEECH_SEGMENT:
+            append_to_chat_ctx(ServeVQPart(codes=event.vq_codes))
+            yield state.get_history(), wav_chunk_header() + event.frame.data, None, None
+        elif event.type == FishE2EEventType.TEXT_SEGMENT:
+            append_to_chat_ctx(ServeTextPart(text=event.text))
+            yield state.get_history(), None, None, None
+    yield state.get_history(), None, None, None
+async def process_text_input(
+    sys_audio_input, sys_text_input, state: ChatState, text_input: str
+):
+    async for event in process_audio_input(
+        sys_audio_input, sys_text_input, None, state, text_input
+    ):
+        yield event
+def create_demo():
+    with gr.Blocks() as demo:
+        state = gr.State(ChatState())
+        with gr.Row():
+            # Left column (70%) for chatbot and notes
+            with gr.Column(scale=7):
+                chatbot = gr.Chatbot(
+                    [],
+                    elem_id="chatbot",
+                    bubble_full_width=False,
+                    height=600,
+                    type="messages",
+                )
+                # notes = gr.Markdown(
+                #     """
+                # # Fish Agent
+                # 1. 此Demo为Fish Audio自研端到端语言模型Fish Agent 3B版本.
+                # 2. 你可以在我们的官方仓库找到代码以及权重，但是相关内容全部基于 CC BY-NC-SA 4.0 许可证发布.
+                # 3. Demo为早期灰度测试版本，推理速度尚待优化.
+                # # 特色
+                # 1. 该模型自动集成ASR与TTS部分，不需要外挂其它模型，即真正的端到端，而非三段式(ASR+LLM+TTS).
+                # 2. 模型可以使用reference audio控制说话音色.
+                # 3. 可以生成具有较强情感与韵律的音频.
+                # """
+                # )
+                notes = gr.Markdown(
+                    """
+                    # Fish Agent
+                    1. This demo is Fish Audio's self-researh end-to-end language model, Fish Agent version 3B.
+                    2. You can find the code and weights in our official repo in [gitub](https://github.com/fishaudio/fish-speech) and [hugging face](https://huggingface.co/fishaudio/fish-agent-v0.1-3b), but the content is released under a CC BY-NC-SA 4.0 licence.
+                    3. The demo is an early alpha test version, the inference speed needs to be optimised.
+                    # Features
+                    1. The model automatically integrates ASR and TTS parts, no need to plug-in other models, i.e., true end-to-end, not three-stage (ASR+LLM+TTS).
+                    2. The model can use reference audio to control the speech timbre.
+                    3. The model can generate speech with strong emotion.
+                """
+                )
+            # Right column (30%) for controls
+            with gr.Column(scale=3):
+                sys_audio_input = gr.Audio(
+                    sources=["upload"],
+                    type="numpy",
+                    label="Give a timbre for your assistant",
+                )
+                sys_text_input = gr.Textbox(
+                    label="What is your assistant's role?",
+                    value="You are a voice assistant created by Fish Audio, offering end-to-end voice interaction for a seamless user experience. You are required to first transcribe the user's speech, then answer it in the following format: 'Question: [USER_SPEECH]\n\nAnswer: [YOUR_RESPONSE]\n'. You are required to use the following voice in this conversation.",
+                    type="text",
+                )
+                audio_input = gr.Audio(
+                    sources=["microphone"], type="numpy", label="Speak your message"
+                )
+                text_input = gr.Textbox(label="Or type your message", type="text")
+                output_audio = gr.Audio(
+                    label="Assistant's Voice",
+                    streaming=True,
+                    autoplay=True,
+                    interactive=False,
+                )
+                send_button = gr.Button("Send", variant="primary")
+                clear_button = gr.Button("Clear")
+        # Event handlers
+        audio_input.stop_recording(
+            process_audio_input,
+            inputs=[sys_audio_input, sys_text_input, audio_input, state, text_input],
+            outputs=[chatbot, output_audio, audio_input, text_input],
+            show_progress=True,
+        )
+        send_button.click(
+            process_text_input,
+            inputs=[sys_audio_input, sys_text_input, state, text_input],
+            outputs=[chatbot, output_audio, audio_input, text_input],
+            show_progress=True,
+        )
+        text_input.submit(
+            process_text_input,
+            inputs=[sys_audio_input, sys_text_input, state, text_input],
+            outputs=[chatbot, output_audio, audio_input, text_input],
+            show_progress=True,
+        )
+        clear_button.click(
+            clear_fn,
+            inputs=[],
+            outputs=[chatbot, state, audio_input, output_audio, text_input],
+        )
+    return demo
+if __name__ == "__main__":
+    demo = create_demo()
+    demo.launch(server_name="127.0.0.1", server_port=7860, share=True)

xinference 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl

Potentially problematic release.

xinference 0.16.3__py3-none-any.whl → 1.2.1__py3-none-any.whl