xinference 0.14.4.post1__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (194)
  1. xinference/_compat.py +51 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +209 -40
  4. xinference/client/restful/restful_client.py +7 -26
  5. xinference/conftest.py +1 -1
  6. xinference/constants.py +5 -0
  7. xinference/core/cache_tracker.py +1 -1
  8. xinference/core/chat_interface.py +8 -14
  9. xinference/core/event.py +1 -1
  10. xinference/core/image_interface.py +28 -0
  11. xinference/core/model.py +110 -31
  12. xinference/core/scheduler.py +37 -37
  13. xinference/core/status_guard.py +1 -1
  14. xinference/core/supervisor.py +17 -10
  15. xinference/core/utils.py +80 -22
  16. xinference/core/worker.py +17 -16
  17. xinference/deploy/cmdline.py +8 -16
  18. xinference/deploy/local.py +1 -1
  19. xinference/deploy/supervisor.py +1 -1
  20. xinference/deploy/utils.py +1 -1
  21. xinference/deploy/worker.py +1 -1
  22. xinference/model/audio/cosyvoice.py +86 -41
  23. xinference/model/audio/fish_speech.py +9 -9
  24. xinference/model/audio/model_spec.json +9 -9
  25. xinference/model/audio/whisper.py +4 -1
  26. xinference/model/embedding/core.py +52 -31
  27. xinference/model/image/core.py +2 -1
  28. xinference/model/image/model_spec.json +16 -4
  29. xinference/model/image/model_spec_modelscope.json +16 -4
  30. xinference/model/image/sdapi.py +136 -0
  31. xinference/model/image/stable_diffusion/core.py +164 -19
  32. xinference/model/llm/__init__.py +29 -11
  33. xinference/model/llm/llama_cpp/core.py +16 -33
  34. xinference/model/llm/llm_family.json +1011 -1296
  35. xinference/model/llm/llm_family.py +34 -53
  36. xinference/model/llm/llm_family_csghub.json +18 -35
  37. xinference/model/llm/llm_family_modelscope.json +981 -1122
  38. xinference/model/llm/lmdeploy/core.py +56 -88
  39. xinference/model/llm/mlx/core.py +46 -69
  40. xinference/model/llm/sglang/core.py +36 -18
  41. xinference/model/llm/transformers/chatglm.py +168 -306
  42. xinference/model/llm/transformers/cogvlm2.py +36 -63
  43. xinference/model/llm/transformers/cogvlm2_video.py +33 -223
  44. xinference/model/llm/transformers/core.py +55 -50
  45. xinference/model/llm/transformers/deepseek_v2.py +340 -0
  46. xinference/model/llm/transformers/deepseek_vl.py +53 -96
  47. xinference/model/llm/transformers/glm4v.py +55 -111
  48. xinference/model/llm/transformers/intern_vl.py +39 -70
  49. xinference/model/llm/transformers/internlm2.py +32 -54
  50. xinference/model/llm/transformers/minicpmv25.py +22 -55
  51. xinference/model/llm/transformers/minicpmv26.py +158 -68
  52. xinference/model/llm/transformers/omnilmm.py +5 -28
  53. xinference/model/llm/transformers/qwen2_audio.py +168 -0
  54. xinference/model/llm/transformers/qwen2_vl.py +234 -0
  55. xinference/model/llm/transformers/qwen_vl.py +34 -86
  56. xinference/model/llm/transformers/utils.py +32 -38
  57. xinference/model/llm/transformers/yi_vl.py +32 -72
  58. xinference/model/llm/utils.py +280 -554
  59. xinference/model/llm/vllm/core.py +161 -100
  60. xinference/model/rerank/core.py +41 -8
  61. xinference/model/rerank/model_spec.json +7 -0
  62. xinference/model/rerank/model_spec_modelscope.json +7 -1
  63. xinference/model/utils.py +1 -31
  64. xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
  65. xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
  66. xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
  67. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
  68. xinference/thirdparty/cosyvoice/cli/model.py +139 -26
  69. xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
  70. xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
  71. xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
  72. xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
  73. xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
  74. xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
  75. xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
  76. xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
  77. xinference/thirdparty/cosyvoice/utils/common.py +36 -0
  78. xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
  79. xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
  80. xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
  81. xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
  82. xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
  83. xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
  84. xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
  85. xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
  86. xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
  87. xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
  88. xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
  89. xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
  90. xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
  91. xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +33 -0
  92. xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
  93. xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
  94. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
  95. xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
  96. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +1 -1
  97. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +1 -1
  98. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +1 -1
  99. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +1 -1
  100. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +1 -1
  101. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +2 -2
  102. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +0 -3
  103. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +169 -198
  104. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +4 -27
  105. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
  106. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
  107. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +9 -47
  108. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +2 -2
  109. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -0
  110. xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
  111. xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
  112. xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
  113. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +12 -10
  114. xinference/thirdparty/fish_speech/tools/api.py +79 -134
  115. xinference/thirdparty/fish_speech/tools/commons.py +35 -0
  116. xinference/thirdparty/fish_speech/tools/download_models.py +3 -3
  117. xinference/thirdparty/fish_speech/tools/file.py +17 -0
  118. xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +1 -1
  119. xinference/thirdparty/fish_speech/tools/llama/generate.py +29 -24
  120. xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +1 -1
  121. xinference/thirdparty/fish_speech/tools/llama/quantize.py +2 -2
  122. xinference/thirdparty/fish_speech/tools/msgpack_api.py +34 -0
  123. xinference/thirdparty/fish_speech/tools/post_api.py +85 -44
  124. xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
  125. xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +1 -1
  126. xinference/thirdparty/fish_speech/tools/smart_pad.py +16 -3
  127. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +2 -2
  128. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +4 -2
  129. xinference/thirdparty/fish_speech/tools/webui.py +12 -146
  130. xinference/thirdparty/matcha/VERSION +1 -0
  131. xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
  132. xinference/thirdparty/matcha/hifigan/README.md +101 -0
  133. xinference/thirdparty/omnilmm/LICENSE +201 -0
  134. xinference/thirdparty/whisper/__init__.py +156 -0
  135. xinference/thirdparty/whisper/__main__.py +3 -0
  136. xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
  137. xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
  138. xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
  139. xinference/thirdparty/whisper/audio.py +157 -0
  140. xinference/thirdparty/whisper/decoding.py +826 -0
  141. xinference/thirdparty/whisper/model.py +314 -0
  142. xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
  143. xinference/thirdparty/whisper/normalizers/basic.py +76 -0
  144. xinference/thirdparty/whisper/normalizers/english.json +1741 -0
  145. xinference/thirdparty/whisper/normalizers/english.py +550 -0
  146. xinference/thirdparty/whisper/timing.py +386 -0
  147. xinference/thirdparty/whisper/tokenizer.py +395 -0
  148. xinference/thirdparty/whisper/transcribe.py +605 -0
  149. xinference/thirdparty/whisper/triton_ops.py +109 -0
  150. xinference/thirdparty/whisper/utils.py +316 -0
  151. xinference/thirdparty/whisper/version.py +1 -0
  152. xinference/types.py +14 -53
  153. xinference/web/ui/build/asset-manifest.json +6 -6
  154. xinference/web/ui/build/index.html +1 -1
  155. xinference/web/ui/build/static/css/{main.4bafd904.css → main.5061c4c3.css} +2 -2
  156. xinference/web/ui/build/static/css/main.5061c4c3.css.map +1 -0
  157. xinference/web/ui/build/static/js/main.754740c0.js +3 -0
  158. xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.754740c0.js.LICENSE.txt} +2 -0
  159. xinference/web/ui/build/static/js/main.754740c0.js.map +1 -0
  160. xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
  161. xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
  162. xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
  163. xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
  164. xinference/web/ui/node_modules/.cache/babel-loader/cd90b08d177025dfe84209596fc51878f8a86bcaa6a240848a3d2e5fd4c7ff24.json +1 -0
  165. xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
  166. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +1 -0
  167. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
  168. xinference/web/ui/node_modules/.package-lock.json +37 -0
  169. xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
  170. xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
  171. xinference/web/ui/node_modules/nunjucks/package.json +112 -0
  172. xinference/web/ui/package-lock.json +38 -0
  173. xinference/web/ui/package.json +1 -0
  174. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/METADATA +16 -10
  175. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/RECORD +179 -127
  176. xinference/model/llm/transformers/llama_2.py +0 -108
  177. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +0 -442
  178. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +0 -44
  179. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +0 -115
  180. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +0 -225
  181. xinference/thirdparty/fish_speech/tools/auto_rerank.py +0 -159
  182. xinference/thirdparty/fish_speech/tools/gen_ref.py +0 -36
  183. xinference/thirdparty/fish_speech/tools/merge_asr_files.py +0 -55
  184. xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
  185. xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
  186. xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
  187. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
  188. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
  189. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
  190. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
  191. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/LICENSE +0 -0
  192. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/WHEEL +0 -0
  193. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/entry_points.txt +0 -0
  194. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/top_level.txt +0 -0
xinference/thirdparty/fish_speech/tools/post_api.py
@@ -1,40 +1,19 @@
 import argparse
 import base64
-import json
 import wave
-from pathlib import Path
 
+import ormsgpack
 import pyaudio
 import requests
+from pydub import AudioSegment
+from pydub.playback import play
 
+from tools.commons import ServeReferenceAudio, ServeTTSRequest
+from tools.file import audio_to_bytes, read_ref_text
 
 
-def wav_to_base64(file_path):
-    if not file_path or not Path(file_path).exists():
-        return None
-    with open(file_path, "rb") as wav_file:
-        wav_content = wav_file.read()
-    base64_encoded = base64.b64encode(wav_content)
-    return base64_encoded.decode("utf-8")
 
+def parse_args():
 
-def read_ref_text(ref_text):
-    path = Path(ref_text)
-    if path.exists() and path.is_file():
-        with path.open("r", encoding="utf-8") as file:
-            return file.read()
-    return ref_text
-
-
-def play_audio(audio_content, format, channels, rate):
-    p = pyaudio.PyAudio()
-    stream = p.open(format=format, channels=channels, rate=rate, output=True)
-    stream.write(audio_content)
-    stream.stop_stream()
-    stream.close()
-    p.terminate()
-
-
-if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Send a WAV file and text to a server and receive synthesized audio."
     )
@@ -43,16 +22,24 @@ if __name__ == "__main__":
         "--url",
         "-u",
         type=str,
-        default="http://127.0.0.1:8080/v1/invoke",
+        default="http://127.0.0.1:8080/v1/tts",
         help="URL of the server",
     )
     parser.add_argument(
         "--text", "-t", type=str, required=True, help="Text to be synthesized"
     )
+    parser.add_argument(
+        "--reference_id",
+        "-id",
+        type=str,
+        default=None,
+        help="ID of the reference model o be used for the speech",
+    )
     parser.add_argument(
         "--reference_audio",
        "-ra",
         type=str,
+        nargs="+",
         default=None,
         help="Path to the WAV file",
     )
@@ -60,9 +47,30 @@
         "--reference_text",
         "-rt",
         type=str,
+        nargs="+",
         default=None,
         help="Reference text for voice synthesis",
     )
+    parser.add_argument(
+        "--output",
+        "-o",
+        type=str,
+        default="generated_audio",
+        help="Output audio file name",
+    )
+    parser.add_argument(
+        "--play",
+        type=bool,
+        default=True,
+        help="Whether to play audio after receiving data",
+    )
+    parser.add_argument("--normalize", type=bool, default=True)
+    parser.add_argument(
+        "--format", type=str, choices=["wav", "mp3", "flac"], default="wav"
+    )
+    parser.add_argument("--mp3_bitrate", type=int, default=64)
+    parser.add_argument("--opus_bitrate", type=int, default=-1000)
+    parser.add_argument("--latency", type=str, default="normal", help="延迟选项")
     parser.add_argument(
         "--max_new_tokens",
         type=int,
@@ -88,7 +96,6 @@ if __name__ == "__main__":
         "--speaker", type=str, default=None, help="Speaker ID for voice synthesis"
     )
     parser.add_argument("--emotion", type=str, default=None, help="Speaker's Emotion")
-    parser.add_argument("--format", type=str, default="wav", help="Audio format")
     parser.add_argument(
         "--streaming", type=bool, default=False, help="Enable streaming response"
     )
@@ -97,18 +104,42 @@ if __name__ == "__main__":
     )
     parser.add_argument("--rate", type=int, default=44100, help="Sample rate for audio")
 
-    args = parser.parse_args()
+    return parser.parse_args()
 
-    base64_audio = wav_to_base64(args.reference_audio)
 
-    ref_text = args.reference_text
-    if ref_text:
-        ref_text = read_ref_text(ref_text)
+if __name__ == "__main__":
+
+    args = parse_args()
+
+    idstr: str | None = args.reference_id
+    # priority: ref_id > [{text, audio},...]
+    if idstr is None:
+        ref_audios = args.reference_audio
+        ref_texts = args.reference_text
+        if ref_audios is None:
+            byte_audios = []
+        else:
+            byte_audios = [audio_to_bytes(ref_audio) for ref_audio in ref_audios]
+        if ref_texts is None:
+            ref_texts = []
+        else:
+            ref_texts = [read_ref_text(ref_text) for ref_text in ref_texts]
+    else:
+        byte_audios = []
+        ref_texts = []
+        pass # in api.py
 
     data = {
         "text": args.text,
-        "reference_text": ref_text,
-        "reference_audio": base64_audio,
+        "references": [
+            ServeReferenceAudio(audio=ref_audio, text=ref_text)
+            for ref_text, ref_audio in zip(ref_texts, byte_audios)
+        ],
+        "reference_id": idstr,
+        "normalize": args.normalize,
+        "format": args.format,
+        "mp3_bitrate": args.mp3_bitrate,
+        "opus_bitrate": args.opus_bitrate,
        "max_new_tokens": args.max_new_tokens,
         "chunk_length": args.chunk_length,
         "top_p": args.top_p,
@@ -116,22 +147,30 @@ if __name__ == "__main__":
         "temperature": args.temperature,
         "speaker": args.speaker,
         "emotion": args.emotion,
-        "format": args.format,
         "streaming": args.streaming,
     }
 
-    response = requests.post(args.url, json=data, stream=args.streaming)
+    pydantic_data = ServeTTSRequest(**data)
 
-    audio_format = pyaudio.paInt16 # Assuming 16-bit PCM format
+    response = requests.post(
+        args.url,
+        data=ormsgpack.packb(pydantic_data, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
+        stream=args.streaming,
+        headers={
+            "authorization": "Bearer YOUR_API_KEY",
+            "content-type": "application/msgpack",
+        },
+    )
 
     if response.status_code == 200:
         if args.streaming:
             p = pyaudio.PyAudio()
+            audio_format = pyaudio.paInt16 # Assuming 16-bit PCM format
             stream = p.open(
                 format=audio_format, channels=args.channels, rate=args.rate, output=True
             )
 
-            wf = wave.open("generated_audio.wav", "wb")
+            wf = wave.open(f"{args.output}.wav", "wb")
             wf.setnchannels(args.channels)
             wf.setsampwidth(p.get_sample_size(audio_format))
             wf.setframerate(args.rate)
@@ -153,12 +192,14 @@ if __name__ == "__main__":
             wf.close()
         else:
             audio_content = response.content
-
-            with open("generated_audio.wav", "wb") as audio_file:
+            audio_path = f"{args.output}.{args.format}"
+            with open(audio_path, "wb") as audio_file:
                 audio_file.write(audio_content)
 
-            play_audio(audio_content, audio_format, args.channels, args.rate)
-            print("Audio has been saved to 'generated_audio.wav'.")
+            audio = AudioSegment.from_file(audio_path, format=args.format)
+            if args.play:
+                play(audio)
+            print(f"Audio has been saved to '{audio_path}'.")
     else:
         print(f"Request failed with status code {response.status_code}")
         print(response.json())
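The old JSON body with a single base64-encoded reference clip is gone: the script above now builds a `ServeTTSRequest` and posts it as msgpack to `/v1/tts`. Purely as an illustration of that request flow (this snippet is not part of the package diff; it assumes the fish-speech repo's `tools` package is importable, a compatible server on localhost, and a placeholder API key):

```python
# Illustrative sketch of the msgpack-based request used by the reworked post_api.py.
# Assumes the fish-speech `tools` package is on PYTHONPATH and a server is listening
# on the default URL; the bearer token below is a placeholder.
import ormsgpack
import requests

from tools.commons import ServeTTSRequest

req = ServeTTSRequest(
    text="Hello from the reworked client.",
    references=[],   # optionally ServeReferenceAudio(audio=..., text=...) entries from tools.commons
    reference_id=None,
    normalize=True,
    format="wav",
    mp3_bitrate=64,
    opus_bitrate=-1000,
    max_new_tokens=1024,
    chunk_length=200,
    top_p=0.7,
    repetition_penalty=1.2,
    temperature=0.7,
    speaker=None,
    emotion=None,
    streaming=False,
)

response = requests.post(
    "http://127.0.0.1:8080/v1/tts",
    data=ormsgpack.packb(req, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
    headers={
        "authorization": "Bearer YOUR_API_KEY",
        "content-type": "application/msgpack",
    },
)
response.raise_for_status()
with open("generated_audio.wav", "wb") as f:
    f.write(response.content)  # raw audio bytes in the requested format
```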
xinference/thirdparty/fish_speech/tools/sensevoice/README.md
@@ -0,0 +1,59 @@
+# FunASR Command Line Interface
+
+This tool provides a command-line interface for separating vocals from instrumental tracks, converting videos to audio, and performing speech-to-text transcription on the resulting audio files.
+
+## Requirements
+
+- Python >= 3.10
+- PyTorch <= 2.3.1
+- ffmpeg, pydub, audio-separator[gpu].
+
+## Installation
+
+Install the required packages:
+
+```bash
+pip install -e .[stable]
+```
+
+Make sure you have `ffmpeg` installed and available in your `PATH`.
+
+## Usage
+
+### Basic Usage
+
+To run the tool with default settings:
+
+```bash
+python tools/sensevoice/fun_asr.py --audio-dir <audio_directory> --save-dir <output_directory>
+```
+
+## Options
+
+| Option | Description |
+| :-----------------------: | :---------------------------------------------------------------------------: |
+| --audio-dir | Directory containing audio or video files. |
+| --save-dir | Directory to save processed audio files. |
+| --device | Device to use for processing. Options: cuda (default) or cpu. |
+| --language | Language of the transcription. Default is auto. |
+| --max_single_segment_time | Maximum duration of a single audio segment in milliseconds. Default is 20000. |
+| --punc | Enable punctuation prediction. |
+| --denoise | Enable noise reduction (vocal separation). |
+
+## Example
+
+To process audio files in the directory `path/to/audio` and save the output to `path/to/output`, with punctuation and noise reduction enabled:
+
+```bash
+python tools/sensevoice/fun_asr.py --audio-dir path/to/audio --save-dir path/to/output --punc --denoise
+```
+
+## Additional Notes
+
+- The tool supports `both audio and video files`. Videos will be converted to audio automatically.
+- If the `--denoise` option is used, the tool will perform vocal separation to isolate the vocals from the instrumental tracks.
+- The script will automatically create necessary directories in the `--save-dir`.
+
+## Troubleshooting
+
+If you encounter any issues, make sure all dependencies are correctly installed and configured. For more detailed troubleshooting, refer to the documentation of each dependency.
xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py
@@ -26,7 +26,7 @@ def uvr5_cli(
     output_folder: Path,
     audio_files: list[Path] | None = None,
     output_format: str = "flac",
-    model: str = "BS-Roformer-Viperx-1296.ckpt",
+    model: str = "BS-Roformer-Viperx-1297.ckpt",
 ):
     # ["BS-Roformer-Viperx-1297.ckpt", "BS-Roformer-Viperx-1296.ckpt", "BS-Roformer-Viperx-1053.ckpt", "Mel-Roformer-Viperx-1143.ckpt"]
     sepr = Separator(
xinference/thirdparty/fish_speech/tools/smart_pad.py
@@ -15,21 +15,34 @@ threshold = 10 ** (-50 / 20.0)
 
 def process(file):
     waveform, sample_rate = torchaudio.load(str(file), backend="sox")
+    if waveform.size(0) > 1:
+        waveform = waveform.mean(dim=0, keepdim=True)
+
     loudness = librosa.feature.rms(
         y=waveform.numpy().squeeze(), frame_length=2048, hop_length=512, center=True
     )[0]
+
     for i in range(len(loudness) - 1, 0, -1):
         if loudness[i] > threshold:
             break
 
-    silent_time = (len(loudness) - i) * 512 / sample_rate
+    end_silent_time = (len(loudness) - i) * 512 / sample_rate
 
-    if silent_time <= 0.3:
-        random_time = random.uniform(0.3, 0.7)
+    if end_silent_time <= 0.3:
+        random_time = random.uniform(0.3, 0.7) - end_silent_time
         waveform = F.pad(
             waveform, (0, int(random_time * sample_rate)), mode="constant", value=0
         )
 
+    for i in range(len(loudness)):
+        if loudness[i] > threshold:
+            break
+
+    start_silent_time = i * 512 / sample_rate
+
+    if start_silent_time > 0.02:
+        waveform = waveform[:, int((start_silent_time - 0.02) * sample_rate) :]
+
     torchaudio.save(uri=str(file), src=waveform, sample_rate=sample_rate)
 
 
xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py
@@ -42,7 +42,7 @@ logger.add(sys.stderr, format=logger_format)
 @lru_cache(maxsize=1)
 def get_model(
     config_name: str = "firefly_gan_vq",
-    checkpoint_path: str = "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+    checkpoint_path: str = "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
     device: str | torch.device = "cuda",
 ):
     with initialize(version_base="1.3", config_path="../../fish_speech/configs"):
@@ -133,7 +133,7 @@ def process_batch(files: list[Path], model) -> float:
 @click.option("--config-name", default="firefly_gan_vq")
 @click.option(
     "--checkpoint-path",
-    default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+    default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
 )
 @click.option("--batch-size", default=64)
 @click.option("--filelist", default=None, type=Path)
xinference/thirdparty/fish_speech/tools/vqgan/inference.py
@@ -59,7 +59,7 @@ def load_model(config_name, checkpoint_path, device="cuda"):
 @click.option("--config-name", default="firefly_gan_vq")
 @click.option(
     "--checkpoint-path",
-    default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+    default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
 )
 @click.option(
     "--device",
@@ -103,7 +103,9 @@ def main(input_path, output_path, config_name, checkpoint_path, device):
 
     # Restore
     feature_lengths = torch.tensor([indices.shape[1]], device=device)
-    fake_audios = model.decode(indices=indices[None], feature_lengths=feature_lengths)
+    fake_audios, _ = model.decode(
+        indices=indices[None], feature_lengths=feature_lengths
+    )
     audio_time = fake_audios.shape[-1] / model.spec_transform.sample_rate
 
     logger.info(
xinference/thirdparty/fish_speech/tools/webui.py
@@ -23,7 +23,6 @@ from fish_speech.i18n import i18n
 from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
 from fish_speech.utils import autocast_exclude_mps
 from tools.api import decode_vq_tokens, encode_reference
-from tools.auto_rerank import batch_asr, calculate_wer, is_chinese, load_model
 from tools.llama.generate import (
     GenerateRequest,
     GenerateResponse,
@@ -40,9 +39,9 @@ HEADER_MD = f"""# Fish Speech
 
 {i18n("A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).")}
 
-{i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).")}
+{i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.4).")}
 
-{i18n("Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.")}
+{i18n("Related code and weights are released under CC BY-NC-SA 4.0 License.")}
 
 {i18n("We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.")}
 """
@@ -160,66 +159,6 @@ def inference(
         gc.collect()
 
 
-def inference_with_auto_rerank(
-    text,
-    enable_reference_audio,
-    reference_audio,
-    reference_text,
-    max_new_tokens,
-    chunk_length,
-    top_p,
-    repetition_penalty,
-    temperature,
-    use_auto_rerank,
-    streaming=False,
-):
-
-    max_attempts = 2 if use_auto_rerank else 1
-    best_wer = float("inf")
-    best_audio = None
-    best_sample_rate = None
-
-    for attempt in range(max_attempts):
-        audio_generator = inference(
-            text,
-            enable_reference_audio,
-            reference_audio,
-            reference_text,
-            max_new_tokens,
-            chunk_length,
-            top_p,
-            repetition_penalty,
-            temperature,
-            streaming=False,
-        )
-
-        # 获取音频数据
-        for _ in audio_generator:
-            pass
-        _, (sample_rate, audio), message = _
-
-        if audio is None:
-            return None, None, message
-
-        if not use_auto_rerank:
-            return None, (sample_rate, audio), None
-
-        asr_result = batch_asr(asr_model, [audio], sample_rate)[0]
-        wer = calculate_wer(text, asr_result["text"])
-        if wer <= 0.3 and not asr_result["huge_gap"]:
-            return None, (sample_rate, audio), None
-
-        if wer < best_wer:
-            best_wer = wer
-            best_audio = audio
-            best_sample_rate = sample_rate
-
-        if attempt == max_attempts - 1:
-            break
-
-    return None, (best_sample_rate, best_audio), None
-
-
 inference_stream = partial(inference, streaming=True)
 
 n_audios = 4
@@ -239,13 +178,12 @@ def inference_wrapper(
     repetition_penalty,
     temperature,
     batch_infer_num,
-    if_load_asr_model,
 ):
     audios = []
     errors = []
 
     for _ in range(batch_infer_num):
-        result = inference_with_auto_rerank(
+        result = inference(
             text,
             enable_reference_audio,
             reference_audio,
@@ -255,10 +193,9 @@
             top_p,
             repetition_penalty,
             temperature,
-            if_load_asr_model,
         )
 
-        _, audio_data, error_message = result
+        _, audio_data, error_message = next(result)
 
         audios.append(
             gr.Audio(value=audio_data if audio_data else None, visible=True),
@@ -301,42 +238,6 @@ def normalize_text(user_input, use_normalization):
 asr_model = None
 
 
-def change_if_load_asr_model(if_load):
-    global asr_model
-
-    if if_load:
-        gr.Warning("Loading faster whisper model...")
-        if asr_model is None:
-            asr_model = load_model()
-        return gr.Checkbox(label="Unload faster whisper model", value=if_load)
-
-    if if_load is False:
-        gr.Warning("Unloading faster whisper model...")
-        del asr_model
-        asr_model = None
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        gc.collect()
-        return gr.Checkbox(label="Load faster whisper model", value=if_load)
-
-
-def change_if_auto_label(if_load, if_auto_label, enable_ref, ref_audio, ref_text):
-    if if_load and asr_model is not None:
-        if (
-            if_auto_label
-            and enable_ref
-            and ref_audio is not None
-            and ref_text.strip() == ""
-        ):
-            data, sample_rate = librosa.load(ref_audio)
-            res = batch_asr(asr_model, [data], sample_rate)[0]
-            ref_text = res["text"]
-    else:
-        gr.Warning("Whisper model not loaded!")
-
-    return gr.Textbox(value=ref_text)
-
-
 def build_app():
     with gr.Blocks(theme=gr.themes.Base()) as app:
         gr.Markdown(HEADER_MD)
@@ -367,23 +268,17 @@
                with gr.Row():
                    if_refine_text = gr.Checkbox(
                        label=i18n("Text Normalization"),
-                        value=True,
-                        scale=1,
-                    )
-
-                    if_load_asr_model = gr.Checkbox(
-                        label=i18n("Load / Unload ASR model for auto-reranking"),
                        value=False,
-                        scale=3,
+                        scale=1,
                    )
 
                with gr.Row():
                    with gr.Tab(label=i18n("Advanced Config")):
                        chunk_length = gr.Slider(
                            label=i18n("Iterative Prompt Length, 0 means off"),
-                            minimum=0,
-                            maximum=500,
-                            value=100,
+                            minimum=50,
+                            maximum=300,
+                            value=200,
                            step=8,
                        )
 
@@ -434,12 +329,6 @@
                            type="filepath",
                        )
                        with gr.Row():
-                            if_auto_label = gr.Checkbox(
-                                label=i18n("Auto Labeling"),
-                                min_width=100,
-                                scale=0,
-                                value=False,
-                            )
                            reference_text = gr.Textbox(
                                label=i18n("Reference Text"),
                                lines=1,
@@ -494,28 +383,6 @@
            fn=normalize_text, inputs=[text, if_refine_text], outputs=[refined_text]
        )
 
-        if_load_asr_model.change(
-            fn=change_if_load_asr_model,
-            inputs=[if_load_asr_model],
-            outputs=[if_load_asr_model],
-        )
-
-        if_auto_label.change(
-            fn=lambda: gr.Textbox(value=""),
-            inputs=[],
-            outputs=[reference_text],
-        ).then(
-            fn=change_if_auto_label,
-            inputs=[
-                if_load_asr_model,
-                if_auto_label,
-                enable_reference_audio,
-                reference_audio,
-                reference_text,
-            ],
-            outputs=[reference_text],
-        )
-
        # # Submit
        generate.click(
            inference_wrapper,
@@ -530,7 +397,6 @@
                repetition_penalty,
                temperature,
                batch_infer_num,
-                if_load_asr_model,
            ],
            [stream_audio, *global_audio_list, *global_error_list],
            concurrency_limit=1,
@@ -560,12 +426,12 @@ def parse_args():
    parser.add_argument(
        "--llama-checkpoint-path",
        type=Path,
-        default="checkpoints/fish-speech-1.2-sft",
+        default="checkpoints/fish-speech-1.4",
    )
    parser.add_argument(
        "--decoder-checkpoint-path",
        type=Path,
-        default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+        default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
    )
    parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
    parser.add_argument("--device", type=str, default="cuda")
@@ -605,8 +471,8 @@ if __name__ == "__main__":
            enable_reference_audio=False,
            reference_audio=None,
            reference_text="",
-            max_new_tokens=0,
-            chunk_length=100,
+            max_new_tokens=1024,
+            chunk_length=200,
            top_p=0.7,
            repetition_penalty=1.2,
            temperature=0.7,
xinference/thirdparty/matcha/VERSION
@@ -0,0 +1 @@
+0.0.7.0
xinference/thirdparty/matcha/hifigan/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 Jungil Kong
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.