PyPI - xinference - Versions diffs - 0.14.1.post1__py3-none-any.whl → 0.14.3__py3-none-any.whl - Mend

xinference 0.14.1.post1__py3-none-any.whl → 0.14.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (194) hide show

xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py ADDED Viewed

@@ -0,0 +1,332 @@
+import gc
+import os
+import re
+from audio_separator.separator import Separator
+os.environ["MODELSCOPE_CACHE"] = "./.cache/funasr"
+os.environ["UVR5_CACHE"] = "./.cache/uvr5-models"
+import json
+import subprocess
+from pathlib import Path
+import click
+import torch
+from loguru import logger
+from pydub import AudioSegment
+from silero_vad import get_speech_timestamps, load_silero_vad, read_audio
+from tqdm import tqdm
+from tools.file import AUDIO_EXTENSIONS, VIDEO_EXTENSIONS, list_files
+from tools.sensevoice.auto_model import AutoModel
+def _resolve_audio_path(audio_dir, audio) -> str:
+    """Return the path string to hand to the separator for one input entry.
+
+    Entries that are absolute, or that already start with ``audio_dir`` (as the
+    results of ``Path(audio_dir).glob(...)`` do), are used unchanged; any other
+    entry is interpreted relative to ``audio_dir``.
+    """
+    base = Path(audio_dir)
+    candidate = Path(audio)
+    if candidate.is_absolute() or candidate.is_relative_to(base):
+        return str(candidate)
+    return str(base / candidate)
+def uvr5_cli(
+    audio_dir: Path,
+    output_folder: Path,
+    audio_files: list[Path] | None = None,
+    output_format: str = "flac",
+    model: str = "BS-Roformer-Viperx-1296.ckpt",
+):
+    """Run an audio-separator model over a set of audio files.
+
+    Args:
+        audio_dir: Directory the inputs live in; also searched recursively
+            for audio files when ``audio_files`` is None.
+        output_folder: Directory passed to the separator as its output dir.
+        audio_files: Files to process. Entries may be absolute, already
+            prefixed with ``audio_dir``, or relative to ``audio_dir``.
+        output_format: Output format passed to the separator.
+        model: Friendly model name; must be one of the keys of the lookup
+            table below.
+
+    Returns:
+        A tuple ``(outputs, checkpoint_name)``: the collected results of every
+        ``separate`` call and the checkpoint file name that was loaded.
+
+    Raises:
+        KeyError: If ``model`` is not a known friendly name.
+    """
+    # ["BS-Roformer-Viperx-1297.ckpt", "BS-Roformer-Viperx-1296.ckpt", "BS-Roformer-Viperx-1053.ckpt", "Mel-Roformer-Viperx-1143.ckpt"]
+    sepr = Separator(
+        model_file_dir=os.environ["UVR5_CACHE"],
+        output_dir=output_folder,
+        output_format=output_format,
+    )
+    # Friendly model name -> checkpoint file name understood by the separator.
+    dictmodel = {
+        "BS-Roformer-Viperx-1297.ckpt": "model_bs_roformer_ep_317_sdr_12.9755.ckpt",
+        "BS-Roformer-Viperx-1296.ckpt": "model_bs_roformer_ep_368_sdr_12.9628.ckpt",
+        "BS-Roformer-Viperx-1053.ckpt": "model_bs_roformer_ep_937_sdr_10.5309.ckpt",
+        "Mel-Roformer-Viperx-1143.ckpt": "model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt",
+    }
+    roformer_model = dictmodel[model]
+    sepr.load_model(roformer_model)
+    if audio_files is None:
+        audio_files = list_files(
+            path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
+        )
+    total_files = len(audio_files)
+    print(f"{total_files} audio files found")
+    res = []
+    for audio in tqdm(audio_files, desc="Denoising: "):
+        # Blindly joining audio_dir with an entry that already contains it
+        # (e.g. "data" / "data/a.wav") would point at a non-existent file.
+        file_path = _resolve_audio_path(audio_dir, audio)
+        sep_out = sepr.separate(file_path)
+        if isinstance(sep_out, str):
+            res.append(sep_out)
+        elif isinstance(sep_out, list):
+            res.extend(sep_out)
+    # Drop the separator before asking torch to release cached CUDA memory.
+    del sepr
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    return res, roformer_model
+def get_sample_rate(media_path: Path) -> str:
+    """Return the sample rate of the first usable audio stream in a media file.
+
+    Runs ``ffprobe`` on ``media_path`` and parses its JSON stream listing.
+
+    Args:
+        media_path: File to probe.
+
+    Returns:
+        The sample rate as a string (it is later passed as a command-line
+        argument), or ``"44100"`` when no audio stream reports one.
+
+    Raises:
+        subprocess.CalledProcessError: If ``ffprobe`` exits with an error.
+    """
+    probe = subprocess.run(
+        [
+            "ffprobe",
+            "-v",
+            "quiet",
+            "-print_format",
+            "json",
+            "-show_streams",
+            str(media_path),
+        ],
+        capture_output=True,
+        text=True,
+        check=True,
+    )
+    media_info = json.loads(probe.stdout)
+    for stream in media_info.get("streams", []):
+        if stream.get("codec_type") != "audio":
+            continue
+        sample_rate = stream.get("sample_rate")
+        # An audio stream without the field must not leak None to callers,
+        # which would break the ffmpeg argument list built from this value.
+        if sample_rate:
+            return str(sample_rate)
+    return "44100"  # Default sample rate if not found
+def convert_to_mono(src_path: Path, out_path: Path, out_fmt: str = "wav"):
+    """Re-encode a media file as single-channel audio with ffmpeg.
+
+    The source sample rate (as reported by ``get_sample_rate``) is kept. The
+    codec is ``pcm_s16le`` when ``out_fmt`` is ``"wav"`` and ``flac`` for any
+    other value.
+
+    Args:
+        src_path: Input media file.
+        out_path: Desired output file; its parent directory is created.
+        out_fmt: ``"wav"`` or anything else (treated as FLAC).
+
+    Returns:
+        The path that was actually written. This is ``out_path`` unless it
+        refers to the same file as ``src_path``; in that case the output goes
+        to a sibling named ``<stem>_<sample_rate><suffix>`` so the input is
+        not overwritten while it is being read.
+
+    Raises:
+        subprocess.CalledProcessError: If ffprobe or ffmpeg fails.
+    """
+    sr = get_sample_rate(src_path)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    if src_path.resolve() == out_path.resolve():
+        target = out_path.with_stem(out_path.stem + f"_{sr}")
+    else:
+        target = out_path
+    subprocess.run(
+        [
+            "ffmpeg",
+            "-loglevel",
+            "error",
+            "-i",
+            str(src_path),
+            "-acodec",
+            "pcm_s16le" if out_fmt == "wav" else "flac",
+            "-ar",
+            sr,
+            "-ac",
+            "1",
+            "-y",
+            str(target),
+        ],
+        check=True,
+    )
+    # Report the file that was really produced, not merely the requested one.
+    return target
+def convert_video_to_audio(video_path: Path, audio_dir: Path):
+    """Return an audio file for ``video_path``, extracting one if needed.
+
+    If a ``<stem>_(Vocals)*`` audio file already sits next to the video, the
+    first such match is returned and nothing is converted. Otherwise the video
+    is converted to ``<stem>.wav`` in the same directory and that path is
+    returned.
+    """
+    target_dir = audio_dir / video_path.relative_to(audio_dir).parent
+    stem = video_path.stem
+    candidates = target_dir.glob(f"{stem}_(Vocals)*.*")
+    existing_vocals = [c for c in candidates if c.suffix in AUDIO_EXTENSIONS]
+    if existing_vocals:
+        return existing_vocals[0]
+    wav_path = target_dir / f"{stem}.wav"
+    convert_to_mono(video_path, wav_path)
+    return wav_path
+# CLI entry point. Steps, in order:
+#   1. extract audio from every video file found under --audio-dir;
+#   2. with --denoise, run uvr5_cli over the audio files and keep only the
+#      "_(Vocals)" outputs, re-encoded to mono;
+#   3. split every audio file into VAD segments, run the SenseVoice model on
+#      them, and write each slice plus a .lab transcript (and optionally a
+#      .emo file) under --save-dir.
+# Documented with comments rather than a docstring so the click --help text
+# stays unchanged.
+@click.command()
+@click.option("--audio-dir", required=True, help="Directory containing audio files")
+@click.option(
+    "--save-dir", required=True, help="Directory to save processed audio files"
+)
+@click.option("--device", default="cuda", help="Device to use [cuda / cpu]")
+@click.option("--language", default="auto", help="Language of the transcription")
+@click.option(
+    "--max_single_segment_time",
+    default=20000,
+    type=int,
+    help="Maximum of Output single audio duration(ms)",
+)
+@click.option("--fsmn-vad/--silero-vad", default=False)
+@click.option("--punc/--no-punc", default=False)
+@click.option("--denoise/--no-denoise", default=False)
+@click.option("--save_emo/--no_save_emo", default=False)
+def main(
+    audio_dir: str,
+    save_dir: str,
+    device: str,
+    language: str,
+    max_single_segment_time: int,
+    fsmn_vad: bool,
+    punc: bool,
+    denoise: bool,
+    save_emo: bool,
+):
+    # NOTE(review): audio_dir stays a str and is passed to helpers whose
+    # parameters are annotated as Path; audios_path is the Path version.
+    audios_path = Path(audio_dir)
+    save_path = Path(save_dir)
+    save_path.mkdir(parents=True, exist_ok=True)
+    video_files = list_files(
+        path=audio_dir, extensions=VIDEO_EXTENSIONS, recursive=True
+    )
+    # Called for its side effect (audio files appear next to the videos);
+    # v2a_files itself is not used afterwards.
+    v2a_files = [convert_video_to_audio(p, audio_dir) for p in video_files]
+    if denoise:
+        VOCAL = "_(Vocals)"
+        # Inputs for separation: every audio file that is not already a vocal stem.
+        original_files = [
+            p
+            for p in audios_path.glob("**/*")
+            if p.suffix in AUDIO_EXTENSIONS and VOCAL not in p.stem
+        ]
+        # Separation output is written back into the input directory.
+        _, cur_model = uvr5_cli(
+            audio_dir=audio_dir, output_folder=audio_dir, audio_files=original_files
+        )
+        # Delete the instrumental stems and the un-separated originals,
+        # leaving only the vocal stems behind.
+        need_remove = [p for p in audios_path.glob("**/*(Instrumental)*")]
+        need_remove.extend(original_files)
+        for _ in need_remove:
+            _.unlink()
+        vocal_files = [
+            p
+            for p in audios_path.glob("**/*")
+            if p.suffix in AUDIO_EXTENSIONS and VOCAL in p.stem
+        ]
+        for f in vocal_files:
+            fn, ext = f.stem, f.suffix
+            # Look for "_(Vocals)_<checkpoint name without extension>" in the stem.
+            v_pos = fn.find(VOCAL + "_" + cur_model.split(".")[0])
+            if v_pos != -1:
+                # Trim everything after "_(Vocals)" from the file name.
+                new_fn = fn[: v_pos + len(VOCAL)]
+                new_f = f.with_name(new_fn + ext)
+                f = f.rename(new_f)
+                # Source and destination are the same path here, so
+                # convert_to_mono writes a "<stem>_<sample rate>" sibling;
+                # the renamed stem is then removed and the mono copy remains.
+                convert_to_mono(f, f, "flac")
+                f.unlink()
+    # Re-scan: the set of audio files may have changed in the steps above.
+    audio_files = list_files(
+        path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
+    )
+    logger.info("Loading / Downloading Funasr model...")
+    model_dir = "iic/SenseVoiceSmall"
+    vad_model = "fsmn-vad" if fsmn_vad else None
+    vad_kwargs = {"max_single_segment_time": max_single_segment_time}
+    punc_model = "ct-punc" if punc else None
+    manager = AutoModel(
+        model=model_dir,
+        trust_remote_code=False,
+        vad_model=vad_model,
+        vad_kwargs=vad_kwargs,
+        punc_model=punc_model,
+        device=device,
+    )
+    # Without --fsmn-vad the AutoModel was built with vad_model=None; the name
+    # is rebound to a Silero VAD model that is applied manually in the loop.
+    if not fsmn_vad and vad_model is None:
+        vad_model = load_silero_vad()
+    logger.info("Model loaded.")
+    # Matches names ending in "_NNN.<ext>", the naming used for exported
+    # slices below, so earlier outputs are not processed again.
+    pattern = re.compile(r"_\d{3}\.")
+    for file_path in tqdm(audio_files, desc="Processing audio file"):
+        if pattern.search(file_path.name):
+            # logger.info(f"Skipping {file_path} as it has already been processed.")
+            continue
+        file_stem = file_path.stem
+        file_suffix = file_path.suffix
+        # Mirror the input directory layout under save_path.
+        rel_path = Path(file_path).relative_to(audio_dir)
+        (save_path / rel_path.parent).mkdir(parents=True, exist_ok=True)
+        audio = AudioSegment.from_file(file_path)
+        cfg = dict(
+            cache={},
+            language=language,  # "zh", "en", "yue", "ja", "ko", "nospeech"
+            use_itn=False,
+            batch_size_s=60,
+        )
+        if fsmn_vad:
+            # elapsed is returned by manager.vad but not used here.
+            elapsed, vad_res = manager.vad(input=str(file_path), **cfg)
+        else:
+            wav = read_audio(
+                str(file_path)
+            )  # backend (sox, soundfile, or ffmpeg) required!
+            audio_key = file_path.stem
+            audio_val = []
+            speech_timestamps = get_speech_timestamps(
+                wav,
+                vad_model,
+                # The CLI option is in milliseconds; this argument is in seconds.
+                max_speech_duration_s=max_single_segment_time // 1000,
+                return_seconds=True,
+            )
+            # Convert the second-based timestamps to integer milliseconds.
+            audio_val = [
+                [int(timestamp["start"] * 1000), int(timestamp["end"] * 1000)]
+                for timestamp in speech_timestamps
+            ]
+            # Wrap in the list-of-{key, value} structure passed as vad_res below.
+            vad_res = []
+            vad_res.append(dict(key=audio_key, value=audio_val))
+        res = manager.inference_with_vadres(
+            input=str(file_path), vad_res=vad_res, **cfg
+        )
+        # One result per segment: its interval, text and emotion label.
+        for i, info in enumerate(res):
+            [start_ms, end_ms] = info["interval"]
+            text = info["text"]
+            emo = info["emo"]
+            sliced_audio = audio[start_ms:end_ms]
+            audio_save_path = (
+                save_path / rel_path.parent / f"{file_stem}_{i:03d}{file_suffix}"
+            )
+            # Export format is the source file's extension without the dot.
+            sliced_audio.export(audio_save_path, format=file_suffix[1:])
+            print(f"Exported {audio_save_path}: {text}")
+            transcript_save_path = (
+                save_path / rel_path.parent / f"{file_stem}_{i:03d}.lab"
+            )
+            with open(
+                transcript_save_path,
+                "w",
+                encoding="utf-8",
+            ) as f:
+                f.write(text)
+            if save_emo:
+                emo_save_path = save_path / rel_path.parent / f"{file_stem}_{i:03d}.emo"
+                with open(
+                    emo_save_path,
+                    "w",
+                    encoding="utf-8",
+                ) as f:
+                    f.write(emo)
+        # When slicing in place, remove the source file once its slices exist.
+        if audios_path.resolve() == save_path.resolve():
+            file_path.unlink()
+if __name__ == "__main__":
+    # main is a click command; invoked this way it is expected to terminate
+    # the process itself, so nothing placed after this call would run. The
+    # former trailing debug snippet (which referenced an undefined
+    # SenseVoiceSmall and a machine-specific path) was unreachable and is gone.
+    main()

xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py ADDED Viewed

@@ -0,0 +1,61 @@
+import torch
+from torch.nn.utils.rnn import pad_sequence
+def slice_padding_fbank(speech, speech_lengths, vad_segments):
+    """Cut the first batch item of ``speech`` into VAD segments and pad them.
+
+    Each entry of ``vad_segments`` supplies its bounds as ``segment[0][0]`` and
+    ``segment[0][1]``. Both are multiplied by 16 to obtain indices (presumably
+    milliseconds to samples at 16 kHz; confirm with the caller), and the end
+    index is capped at ``speech_lengths[0]``.
+
+    Returns:
+        ``(padded, lengths)``: the slices stacked batch-first and zero-padded
+        to the longest one, and an int tensor holding each slice's length.
+    """
+    chunks = []
+    chunk_lengths = []
+    for segment in vad_segments:
+        first = int(segment[0][0] * 16)
+        last = min(int(segment[0][1] * 16), speech_lengths[0])
+        chunks.append(speech[0, first:last])
+        chunk_lengths.append(last - first)
+    padded = pad_sequence(chunks, batch_first=True, padding_value=0.0)
+    lengths = torch.Tensor(chunk_lengths).int()
+    return padded, lengths
+def slice_padding_audio_samples(speech, speech_lengths, vad_segments):
+    """Cut ``speech`` into one slice per VAD segment, without padding.
+
+    Segment bounds are read from ``segment[0][0]`` and ``segment[0][1]`` and
+    multiplied by 16 to obtain indices (presumably milliseconds to samples at
+    16 kHz; confirm with the caller). The end index is capped at
+    ``speech_lengths``.
+
+    Returns:
+        ``(slices, lengths, intervals)``: the slices, their lengths in index
+        units, and ``[start, end]`` pairs converted back by integer division
+        by 16.
+    """
+    slices, lengths, intervals = [], [], []
+    for segment in vad_segments:
+        first = int(segment[0][0] * 16)
+        last = min(int(segment[0][1] * 16), speech_lengths)
+        slices.append(speech[first:last])
+        lengths.append(last - first)
+        intervals.append([first // 16, last // 16])
+    return slices, lengths, intervals
+def merge_vad(vad_result, max_length=15000, min_length=0):
+    """Regroup VAD intervals into consecutive chunks bounded by ``max_length``.
+
+    All distinct start and end values are collected as sorted cut points. The
+    cut points are walked in order from a running start of 0: a chunk is closed
+    at the current point whenever extending it to the next point would reach
+    ``max_length``. A closed chunk is kept only if it is longer than
+    ``min_length``. The remainder up to the last cut point is always appended.
+
+    Inputs with zero or one interval are returned unchanged.
+    """
+    if len(vad_result) <= 1:
+        return vad_result
+    starts = [seg[0] for seg in vad_result]
+    ends = [seg[1] for seg in vad_result]
+    cut_points = sorted(set(starts + ends))
+    merged = []
+    chunk_start = 0
+    for current, upcoming in zip(cut_points, cut_points[1:]):
+        if upcoming - chunk_start >= max_length:
+            if current - chunk_start > min_length:
+                merged.append([chunk_start, current])
+            chunk_start = current
+    merged.append([chunk_start, cut_points[-1]])
+    return merged

xinference/thirdparty/fish_speech/tools/smart_pad.py ADDED Viewed

@@ -0,0 +1,47 @@
+import random
+from multiprocessing import Pool
+from pathlib import Path
+import click
+import librosa
+import torch.nn.functional as F
+import torchaudio
+from tqdm import tqdm
+from tools.file import AUDIO_EXTENSIONS, list_files
+# -50 dB expressed as a linear amplitude (10 ** (dB / 20)); RMS frames above
+# this value are treated as non-silent by process().
+threshold = 10 ** (-50 / 20.0)
+def process(file):
+    """Append random trailing silence to an audio file that ends too abruptly.
+
+    The file is loaded, its frame-wise RMS is computed (hop of 512 samples),
+    and the frames are scanned backwards for the last one above ``threshold``.
+    If the trailing quiet stretch lasts 0.3 s or less, between 0.3 s and 0.7 s
+    of zeros is appended. The file is then saved back to the same path, whether
+    or not it was padded.
+
+    Args:
+        file: Path of the audio file to rewrite in place.
+    """
+    waveform, sample_rate = torchaudio.load(str(file), backend="sox")
+    loudness = librosa.feature.rms(
+        y=waveform.numpy().squeeze(), frame_length=2048, hop_length=512, center=True
+    )[0]
+    # Default for clips with at most one RMS frame: the loop below does not
+    # run for them, which previously left ``i`` unbound and raised.
+    i = 0
+    for i in range(len(loudness) - 1, 0, -1):
+        if loudness[i] > threshold:
+            break
+    silent_time = (len(loudness) - i) * 512 / sample_rate
+    if silent_time <= 0.3:
+        random_time = random.uniform(0.3, 0.7)
+        waveform = F.pad(
+            waveform, (0, int(random_time * sample_rate)), mode="constant", value=0
+        )
+    torchaudio.save(uri=str(file), src=waveform, sample_rate=sample_rate)
+# CLI entry point: run process() on every audio file found recursively under
+# SOURCE, spread over a pool of worker processes, with a progress bar.
+# Documented with comments so the click --help text stays unchanged.
+@click.command()
+@click.argument("source", type=Path)
+@click.option("--num-workers", type=int, default=12)
+def main(source, num_workers):
+    audio_paths = list(list_files(source, AUDIO_EXTENSIONS, recursive=True))
+    with Pool(num_workers) as pool:
+        results = pool.imap_unordered(process, audio_paths)
+        # Drain the iterator so every file is processed before the pool closes.
+        for _ in tqdm(results, total=len(audio_paths)):
+            pass
+if __name__ == "__main__":
+    main()

xinference/thirdparty/fish_speech/tools/vqgan/__init__.py ADDED Viewed

File without changes

xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py ADDED Viewed

@@ -0,0 +1,83 @@
+import math
+from pathlib import Path
+from random import Random
+import click
+from loguru import logger
+from pydub import AudioSegment
+from tqdm import tqdm
+from tools.file import AUDIO_EXTENSIONS, list_files, load_filelist
+def _resolve_val_size(total, val_ratio, val_count):
+    """Return the validation-set size, or None when the options are invalid.
+
+    The reason for a None result is logged as an error before returning.
+    """
+    if val_count is None and val_ratio is None:
+        logger.info("Validation ratio and count not specified, using min(20%, 100)")
+        return min(100, math.ceil(total * 0.2))
+    if val_count is not None and val_ratio is not None:
+        logger.error("Cannot specify both val_count and val_ratio")
+        return None
+    if val_count is not None:
+        if val_count < 1 or val_count > total:
+            logger.error("val_count must be between 1 and number of files")
+            return None
+        return val_count
+    # A negative ratio would produce a negative slice index (swapping the two
+    # lists) and a ratio above 1 would leave the training list empty.
+    if not 0 <= val_ratio <= 1:
+        logger.error("val_ratio must be between 0 and 1")
+        return None
+    return math.ceil(total * val_ratio)
+# CLI entry point: collect audio files under ROOT (or from --filelist),
+# optionally drop those outside [--min-duration, --max-duration] seconds,
+# shuffle with a fixed seed, and write vq_train_filelist.txt and
+# vq_val_filelist.txt (paths relative to ROOT) into ROOT.
+# Documented with comments so the click --help text stays unchanged.
+@click.command()
+@click.argument("root", type=click.Path(exists=True, path_type=Path))
+@click.option("--val-ratio", type=float, default=None)
+@click.option("--val-count", type=int, default=None)
+@click.option("--filelist", default=None, type=Path)
+@click.option("--min-duration", default=None, type=float)
+@click.option("--max-duration", default=None, type=float)
+def main(root, val_ratio, val_count, filelist, min_duration, max_duration):
+    if filelist:
+        files = [i[0] for i in load_filelist(filelist)]
+    else:
+        files = list_files(root, AUDIO_EXTENSIONS, recursive=True, sort=True)
+    if min_duration is None and max_duration is None:
+        # No duration limits: skip decoding the audio entirely.
+        filtered_files = [str(file.relative_to(root)) for file in files]
+    else:
+        filtered_files = []
+        for file in tqdm(files):
+            try:
+                audio = AudioSegment.from_file(str(file))
+                # len() of the segment is divided by 1000 to get seconds.
+                duration = len(audio) / 1000.0
+                if min_duration is not None and duration < min_duration:
+                    logger.info(
+                        f"Skipping {file} due to duration {duration:.2f} < {min_duration:.2f}"
+                    )
+                    continue
+                if max_duration is not None and duration > max_duration:
+                    logger.info(
+                        f"Skipping {file} due to duration {duration:.2f} > {max_duration:.2f}"
+                    )
+                    continue
+                filtered_files.append(str(file.relative_to(root)))
+            except Exception as e:
+                # Best effort: a file that cannot be read is reported and left out.
+                logger.info(f"Error processing {file}: {e}")
+    logger.info(
+        f"Found {len(files)} files, remaining {len(filtered_files)} files after filtering"
+    )
+    # Fixed seed keeps the split reproducible between runs.
+    Random(42).shuffle(filtered_files)
+    val_size = _resolve_val_size(len(filtered_files), val_ratio, val_count)
+    if val_size is None:
+        return
+    logger.info(f"Using {val_size} files for validation")
+    with open(root / "vq_train_filelist.txt", "w", encoding="utf-8") as f:
+        f.write("\n".join(filtered_files[val_size:]))
+    with open(root / "vq_val_filelist.txt", "w", encoding="utf-8") as f:
+        f.write("\n".join(filtered_files[:val_size]))
+    logger.info("Done")
+if __name__ == "__main__":
+    main()

xinference 0.14.1.post1__py3-none-any.whl → 0.14.3__py3-none-any.whl

Potentially problematic release.

xinference 0.14.1.post1__py3-none-any.whl → 0.14.3__py3-none-any.whl