minicpmo-utils 0.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. minicpmo_utils-0.0.5/PKG-INFO +116 -0
  2. minicpmo_utils-0.0.5/README.md +53 -0
  3. minicpmo_utils-0.0.5/pyproject.toml +108 -0
  4. minicpmo_utils-0.0.5/setup.cfg +4 -0
  5. minicpmo_utils-0.0.5/src/cosyvoice/__init__.py +17 -0
  6. minicpmo_utils-0.0.5/src/cosyvoice/bin/average_model.py +93 -0
  7. minicpmo_utils-0.0.5/src/cosyvoice/bin/export_jit.py +103 -0
  8. minicpmo_utils-0.0.5/src/cosyvoice/bin/export_onnx.py +120 -0
  9. minicpmo_utils-0.0.5/src/cosyvoice/bin/inference_deprecated.py +126 -0
  10. minicpmo_utils-0.0.5/src/cosyvoice/bin/train.py +195 -0
  11. minicpmo_utils-0.0.5/src/cosyvoice/cli/__init__.py +0 -0
  12. minicpmo_utils-0.0.5/src/cosyvoice/cli/cosyvoice.py +204 -0
  13. minicpmo_utils-0.0.5/src/cosyvoice/cli/frontend.py +238 -0
  14. minicpmo_utils-0.0.5/src/cosyvoice/cli/model.py +386 -0
  15. minicpmo_utils-0.0.5/src/cosyvoice/dataset/__init__.py +0 -0
  16. minicpmo_utils-0.0.5/src/cosyvoice/dataset/dataset.py +151 -0
  17. minicpmo_utils-0.0.5/src/cosyvoice/dataset/processor.py +434 -0
  18. minicpmo_utils-0.0.5/src/cosyvoice/flow/decoder.py +494 -0
  19. minicpmo_utils-0.0.5/src/cosyvoice/flow/flow.py +281 -0
  20. minicpmo_utils-0.0.5/src/cosyvoice/flow/flow_matching.py +227 -0
  21. minicpmo_utils-0.0.5/src/cosyvoice/flow/length_regulator.py +70 -0
  22. minicpmo_utils-0.0.5/src/cosyvoice/hifigan/discriminator.py +230 -0
  23. minicpmo_utils-0.0.5/src/cosyvoice/hifigan/f0_predictor.py +58 -0
  24. minicpmo_utils-0.0.5/src/cosyvoice/hifigan/generator.py +582 -0
  25. minicpmo_utils-0.0.5/src/cosyvoice/hifigan/hifigan.py +67 -0
  26. minicpmo_utils-0.0.5/src/cosyvoice/llm/llm.py +610 -0
  27. minicpmo_utils-0.0.5/src/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  28. minicpmo_utils-0.0.5/src/cosyvoice/tokenizer/tokenizer.py +279 -0
  29. minicpmo_utils-0.0.5/src/cosyvoice/transformer/__init__.py +0 -0
  30. minicpmo_utils-0.0.5/src/cosyvoice/transformer/activation.py +84 -0
  31. minicpmo_utils-0.0.5/src/cosyvoice/transformer/attention.py +330 -0
  32. minicpmo_utils-0.0.5/src/cosyvoice/transformer/convolution.py +145 -0
  33. minicpmo_utils-0.0.5/src/cosyvoice/transformer/decoder.py +396 -0
  34. minicpmo_utils-0.0.5/src/cosyvoice/transformer/decoder_layer.py +132 -0
  35. minicpmo_utils-0.0.5/src/cosyvoice/transformer/embedding.py +302 -0
  36. minicpmo_utils-0.0.5/src/cosyvoice/transformer/encoder.py +474 -0
  37. minicpmo_utils-0.0.5/src/cosyvoice/transformer/encoder_layer.py +236 -0
  38. minicpmo_utils-0.0.5/src/cosyvoice/transformer/label_smoothing_loss.py +96 -0
  39. minicpmo_utils-0.0.5/src/cosyvoice/transformer/positionwise_feed_forward.py +115 -0
  40. minicpmo_utils-0.0.5/src/cosyvoice/transformer/subsampling.py +383 -0
  41. minicpmo_utils-0.0.5/src/cosyvoice/transformer/upsample_encoder.py +320 -0
  42. minicpmo_utils-0.0.5/src/cosyvoice/utils/__init__.py +0 -0
  43. minicpmo_utils-0.0.5/src/cosyvoice/utils/class_utils.py +83 -0
  44. minicpmo_utils-0.0.5/src/cosyvoice/utils/common.py +186 -0
  45. minicpmo_utils-0.0.5/src/cosyvoice/utils/executor.py +176 -0
  46. minicpmo_utils-0.0.5/src/cosyvoice/utils/file_utils.py +130 -0
  47. minicpmo_utils-0.0.5/src/cosyvoice/utils/frontend_utils.py +136 -0
  48. minicpmo_utils-0.0.5/src/cosyvoice/utils/losses.py +57 -0
  49. minicpmo_utils-0.0.5/src/cosyvoice/utils/mask.py +265 -0
  50. minicpmo_utils-0.0.5/src/cosyvoice/utils/scheduler.py +738 -0
  51. minicpmo_utils-0.0.5/src/cosyvoice/utils/train_utils.py +367 -0
  52. minicpmo_utils-0.0.5/src/cosyvoice/vllm/cosyvoice2.py +103 -0
  53. minicpmo_utils-0.0.5/src/matcha/__init__.py +0 -0
  54. minicpmo_utils-0.0.5/src/matcha/app.py +357 -0
  55. minicpmo_utils-0.0.5/src/matcha/cli.py +418 -0
  56. minicpmo_utils-0.0.5/src/matcha/hifigan/__init__.py +0 -0
  57. minicpmo_utils-0.0.5/src/matcha/hifigan/config.py +28 -0
  58. minicpmo_utils-0.0.5/src/matcha/hifigan/denoiser.py +64 -0
  59. minicpmo_utils-0.0.5/src/matcha/hifigan/env.py +17 -0
  60. minicpmo_utils-0.0.5/src/matcha/hifigan/meldataset.py +217 -0
  61. minicpmo_utils-0.0.5/src/matcha/hifigan/models.py +368 -0
  62. minicpmo_utils-0.0.5/src/matcha/hifigan/xutils.py +60 -0
  63. minicpmo_utils-0.0.5/src/matcha/models/__init__.py +0 -0
  64. minicpmo_utils-0.0.5/src/matcha/models/baselightningmodule.py +209 -0
  65. minicpmo_utils-0.0.5/src/matcha/models/components/__init__.py +0 -0
  66. minicpmo_utils-0.0.5/src/matcha/models/components/decoder.py +443 -0
  67. minicpmo_utils-0.0.5/src/matcha/models/components/flow_matching.py +132 -0
  68. minicpmo_utils-0.0.5/src/matcha/models/components/text_encoder.py +410 -0
  69. minicpmo_utils-0.0.5/src/matcha/models/components/transformer.py +316 -0
  70. minicpmo_utils-0.0.5/src/matcha/models/matcha_tts.py +239 -0
  71. minicpmo_utils-0.0.5/src/matcha/onnx/__init__.py +0 -0
  72. minicpmo_utils-0.0.5/src/matcha/onnx/export.py +181 -0
  73. minicpmo_utils-0.0.5/src/matcha/onnx/infer.py +168 -0
  74. minicpmo_utils-0.0.5/src/matcha/text/__init__.py +53 -0
  75. minicpmo_utils-0.0.5/src/matcha/text/cleaners.py +116 -0
  76. minicpmo_utils-0.0.5/src/matcha/text/numbers.py +71 -0
  77. minicpmo_utils-0.0.5/src/matcha/text/symbols.py +17 -0
  78. minicpmo_utils-0.0.5/src/matcha/train.py +122 -0
  79. minicpmo_utils-0.0.5/src/matcha/utils/__init__.py +5 -0
  80. minicpmo_utils-0.0.5/src/matcha/utils/audio.py +82 -0
  81. minicpmo_utils-0.0.5/src/matcha/utils/generate_data_statistics.py +111 -0
  82. minicpmo_utils-0.0.5/src/matcha/utils/instantiators.py +56 -0
  83. minicpmo_utils-0.0.5/src/matcha/utils/logging_utils.py +53 -0
  84. minicpmo_utils-0.0.5/src/matcha/utils/model.py +90 -0
  85. minicpmo_utils-0.0.5/src/matcha/utils/monotonic_align/__init__.py +22 -0
  86. minicpmo_utils-0.0.5/src/matcha/utils/monotonic_align/setup.py +7 -0
  87. minicpmo_utils-0.0.5/src/matcha/utils/pylogger.py +21 -0
  88. minicpmo_utils-0.0.5/src/matcha/utils/rich_utils.py +101 -0
  89. minicpmo_utils-0.0.5/src/matcha/utils/utils.py +219 -0
  90. minicpmo_utils-0.0.5/src/minicpmo/__init__.py +14 -0
  91. minicpmo_utils-0.0.5/src/minicpmo/utils.py +723 -0
  92. minicpmo_utils-0.0.5/src/minicpmo/version.py +2 -0
  93. minicpmo_utils-0.0.5/src/minicpmo_utils.egg-info/PKG-INFO +116 -0
  94. minicpmo_utils-0.0.5/src/minicpmo_utils.egg-info/SOURCES.txt +151 -0
  95. minicpmo_utils-0.0.5/src/minicpmo_utils.egg-info/dependency_links.txt +1 -0
  96. minicpmo_utils-0.0.5/src/minicpmo_utils.egg-info/requires.txt +55 -0
  97. minicpmo_utils-0.0.5/src/minicpmo_utils.egg-info/top_level.txt +5 -0
  98. minicpmo_utils-0.0.5/src/s3tokenizer/__init__.py +153 -0
  99. minicpmo_utils-0.0.5/src/s3tokenizer/assets/BAC009S0764W0121.wav +0 -0
  100. minicpmo_utils-0.0.5/src/s3tokenizer/assets/BAC009S0764W0122.wav +0 -0
  101. minicpmo_utils-0.0.5/src/s3tokenizer/assets/mel_filters.npz +0 -0
  102. minicpmo_utils-0.0.5/src/s3tokenizer/cli.py +183 -0
  103. minicpmo_utils-0.0.5/src/s3tokenizer/model.py +546 -0
  104. minicpmo_utils-0.0.5/src/s3tokenizer/model_v2.py +605 -0
  105. minicpmo_utils-0.0.5/src/s3tokenizer/utils.py +390 -0
  106. minicpmo_utils-0.0.5/src/stepaudio2/__init__.py +40 -0
  107. minicpmo_utils-0.0.5/src/stepaudio2/cosyvoice2/__init__.py +1 -0
  108. minicpmo_utils-0.0.5/src/stepaudio2/cosyvoice2/flow/__init__.py +0 -0
  109. minicpmo_utils-0.0.5/src/stepaudio2/cosyvoice2/flow/decoder_dit.py +585 -0
  110. minicpmo_utils-0.0.5/src/stepaudio2/cosyvoice2/flow/flow.py +230 -0
  111. minicpmo_utils-0.0.5/src/stepaudio2/cosyvoice2/flow/flow_matching.py +205 -0
  112. minicpmo_utils-0.0.5/src/stepaudio2/cosyvoice2/transformer/__init__.py +0 -0
  113. minicpmo_utils-0.0.5/src/stepaudio2/cosyvoice2/transformer/attention.py +328 -0
  114. minicpmo_utils-0.0.5/src/stepaudio2/cosyvoice2/transformer/embedding.py +119 -0
  115. minicpmo_utils-0.0.5/src/stepaudio2/cosyvoice2/transformer/encoder_layer.py +163 -0
  116. minicpmo_utils-0.0.5/src/stepaudio2/cosyvoice2/transformer/positionwise_feed_forward.py +56 -0
  117. minicpmo_utils-0.0.5/src/stepaudio2/cosyvoice2/transformer/subsampling.py +79 -0
  118. minicpmo_utils-0.0.5/src/stepaudio2/cosyvoice2/transformer/upsample_encoder_v2.py +483 -0
  119. minicpmo_utils-0.0.5/src/stepaudio2/cosyvoice2/utils/__init__.py +1 -0
  120. minicpmo_utils-0.0.5/src/stepaudio2/cosyvoice2/utils/class_utils.py +41 -0
  121. minicpmo_utils-0.0.5/src/stepaudio2/cosyvoice2/utils/common.py +101 -0
  122. minicpmo_utils-0.0.5/src/stepaudio2/cosyvoice2/utils/mask.py +49 -0
  123. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/__init__.py +0 -0
  124. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/cli.py +424 -0
  125. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/config.py +80 -0
  126. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/cosyvoice2.py +160 -0
  127. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/cosyvoice3.py +1 -0
  128. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/engine/__init__.py +0 -0
  129. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/engine/block_manager.py +114 -0
  130. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/engine/llm_engine.py +125 -0
  131. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/engine/model_runner.py +310 -0
  132. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/engine/scheduler.py +77 -0
  133. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/engine/sequence.py +90 -0
  134. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/modules/__init__.py +0 -0
  135. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/modules/flow.py +198 -0
  136. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/modules/flow_components/__init__.py +0 -0
  137. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/modules/flow_components/estimator.py +974 -0
  138. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/modules/flow_components/upsample_encoder.py +998 -0
  139. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/modules/hifigan.py +249 -0
  140. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/modules/hifigan_components/__init__.py +0 -0
  141. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/modules/hifigan_components/layers.py +433 -0
  142. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/modules/qwen2.py +92 -0
  143. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/modules/qwen2_components/__init__.py +0 -0
  144. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/modules/qwen2_components/layers.py +616 -0
  145. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/modules/sampler.py +231 -0
  146. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/utils/__init__.py +0 -0
  147. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/utils/audio.py +77 -0
  148. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/utils/context.py +28 -0
  149. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/utils/loader.py +116 -0
  150. minicpmo_utils-0.0.5/src/stepaudio2/flashcosyvoice/utils/memory.py +19 -0
  151. minicpmo_utils-0.0.5/src/stepaudio2/stepaudio2.py +204 -0
  152. minicpmo_utils-0.0.5/src/stepaudio2/token2wav.py +247 -0
  153. minicpmo_utils-0.0.5/src/stepaudio2/utils.py +91 -0
@@ -0,0 +1,116 @@
1
+ Metadata-Version: 2.4
2
+ Name: minicpmo-utils
3
+ Version: 0.0.5
4
+ Summary: Unified utilities package for MiniCPM-o: includes cosyvoice + stepaudio2 and extensible utils.
5
+ Author: MiniCPM-o Utils Maintainers
6
+ License: Apache-2.0
7
+ Keywords: minicpmo,audio,tts,utils,cosyvoice,stepaudio2
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: Apache Software License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Requires-Python: >=3.10
16
+ Description-Content-Type: text/markdown
17
+ Requires-Dist: numpy
18
+ Requires-Dist: pillow==10.4.0
19
+ Requires-Dist: librosa==0.9.0
20
+ Requires-Dist: decord==0.6.0
21
+ Requires-Dist: moviepy==2.1.2
22
+ Requires-Dist: numba==0.61.2
23
+ Provides-Extra: tts
24
+ Requires-Dist: torch>=2.3.0; extra == "tts"
25
+ Requires-Dist: torchaudio>=2.3.0; extra == "tts"
26
+ Requires-Dist: transformers<4.53.0,>=4.51.0; extra == "tts"
27
+ Requires-Dist: onnxruntime<=1.21.0,>=1.18.0; extra == "tts"
28
+ Requires-Dist: onnx; extra == "tts"
29
+ Requires-Dist: hyperpyyaml; extra == "tts"
30
+ Requires-Dist: openai-whisper==20231117; extra == "tts"
31
+ Requires-Dist: tqdm; extra == "tts"
32
+ Requires-Dist: tiktoken; extra == "tts"
33
+ Requires-Dist: inflect; extra == "tts"
34
+ Requires-Dist: omegaconf>=2.0.6; extra == "tts"
35
+ Requires-Dist: conformer==0.3.2; extra == "tts"
36
+ Requires-Dist: einops==0.8.1; extra == "tts"
37
+ Requires-Dist: hydra-core; extra == "tts"
38
+ Requires-Dist: lightning==2.2.4; extra == "tts"
39
+ Requires-Dist: rich; extra == "tts"
40
+ Requires-Dist: gdown==5.2.0; extra == "tts"
41
+ Requires-Dist: matplotlib; extra == "tts"
42
+ Requires-Dist: wget; extra == "tts"
43
+ Requires-Dist: pyarrow; extra == "tts"
44
+ Requires-Dist: pyworld; extra == "tts"
45
+ Requires-Dist: scipy; extra == "tts"
46
+ Requires-Dist: pyyaml; extra == "tts"
47
+ Requires-Dist: regex; extra == "tts"
48
+ Requires-Dist: soundfile==0.12.1; extra == "tts"
49
+ Requires-Dist: diffusers==0.29.0; extra == "tts"
50
+ Provides-Extra: streaming
51
+ Requires-Dist: minicpmo-utils[tts]; extra == "streaming"
52
+ Provides-Extra: streaming-flash
53
+ Requires-Dist: minicpmo-utils[streaming]; extra == "streaming-flash"
54
+ Requires-Dist: flash-attn>=2.6.0; sys_platform == "linux" and extra == "streaming-flash"
55
+ Requires-Dist: triton>=2.3.0; sys_platform == "linux" and extra == "streaming-flash"
56
+ Requires-Dist: safetensors; extra == "streaming-flash"
57
+ Requires-Dist: pynvml; extra == "streaming-flash"
58
+ Requires-Dist: xxhash; extra == "streaming-flash"
59
+ Provides-Extra: gpu
60
+ Requires-Dist: onnxruntime-gpu<=1.23.2,>=1.18.0; sys_platform == "linux" and extra == "gpu"
61
+ Provides-Extra: all
62
+ Requires-Dist: minicpmo-utils[gpu,streaming,tts]; extra == "all"
63
+
64
+ ## minicpmo-utils
65
+
66
+ 一个统一安装的工具包(一个 PyPI 分发包),把仓库里的 `cosyvoice` 与 `stepaudio2` 一起打进同一个 wheel,并预留 `minicpmo` 作为后续扩展 utils 的统一入口。
67
+
68
+ ### 安装方式
69
+
70
+ - 从源码本地安装(开发态,可编辑,默认只装公共依赖):
71
+ ```bash
72
+ cd minicpmo-utils
73
+ pip install -e .
74
+ ```
75
+
76
+ - 如果只想安装 cosyvoice 相关依赖(TTS):
77
+ ```bash
78
+ pip install -e .[tts]
79
+ ```
80
+
81
+ - 如果只想安装 stepaudio2 / streaming 相关依赖:
82
+ ```bash
83
+ pip install -e .[streaming]
84
+ ```
85
+
86
+ - 同时安装 cosyvoice + stepaudio2 相关依赖:
87
+ ```bash
88
+ pip install -e .[tts,streaming]
89
+ ```
90
+
91
+ - 构建并安装 wheel(推荐分发):
92
+ ```bash
93
+ cd minicpmo-utils
94
+ python -m build # 生成 dist/*.whl
95
+ pip install "dist/minicpmo_utils-0.0.5-py3-none-any.whl[tts,streaming]"
96
+ ```
97
+
98
+ ### 导入方式
99
+
100
+ 包会暴露以下顶层模块,安装后可直接使用:
101
+ - `import cosyvoice`
102
+ - `import stepaudio2`
103
+ - `import matcha`
104
+ - `import minicpmo`
105
+
106
+ 也支持通过统一入口导入子包:
107
+ ```python
108
+ from minicpmo import cosyvoice, stepaudio2, matcha
109
+ ```
110
+
111
+ 以及通过统一的 utils 入口使用通用工具函数,例如:
112
+
113
+ ```python
114
+ from minicpmo.utils import get_video_frame_audio_segments
115
+ ```
116
+
@@ -0,0 +1,53 @@
1
+ ## minicpmo-utils
2
+
3
+ 一个统一安装的工具包(一个 PyPI 分发包),把仓库里的 `cosyvoice` 与 `stepaudio2` 一起打进同一个 wheel,并预留 `minicpmo` 作为后续扩展 utils 的统一入口。
4
+
5
+ ### 安装方式
6
+
7
+ - 从源码本地安装(开发态,可编辑,默认只装公共依赖):
8
+ ```bash
9
+ cd minicpmo-utils
10
+ pip install -e .
11
+ ```
12
+
13
+ - 如果只想安装 cosyvoice 相关依赖(TTS):
14
+ ```bash
15
+ pip install -e .[tts]
16
+ ```
17
+
18
+ - 如果只想安装 stepaudio2 / streaming 相关依赖:
19
+ ```bash
20
+ pip install -e .[streaming]
21
+ ```
22
+
23
+ - 同时安装 cosyvoice + stepaudio2 相关依赖:
24
+ ```bash
25
+ pip install -e .[tts,streaming]
26
+ ```
27
+
28
+ - 构建并安装 wheel(推荐分发):
29
+ ```bash
30
+ cd minicpmo-utils
31
+ python -m build # 生成 dist/*.whl
32
+ pip install "dist/minicpmo_utils-0.0.5-py3-none-any.whl[tts,streaming]"
33
+ ```
34
+
35
+ ### 导入方式
36
+
37
+ 包会暴露以下顶层模块,安装后可直接使用:
38
+ - `import cosyvoice`
39
+ - `import stepaudio2`
40
+ - `import matcha`
41
+ - `import minicpmo`
42
+
43
+ 也支持通过统一入口导入子包:
44
+ ```python
45
+ from minicpmo import cosyvoice, stepaudio2, matcha
46
+ ```
47
+
48
+ 以及通过统一的 utils 入口使用通用工具函数,例如:
49
+
50
+ ```python
51
+ from minicpmo.utils import get_video_frame_audio_segments
52
+ ```
53
+
@@ -0,0 +1,108 @@
1
+ [build-system]
2
+ requires = ["setuptools>=69", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "minicpmo-utils"
7
+ version = "0.0.5"
8
+ description = "Unified utilities package for MiniCPM-o: includes cosyvoice + stepaudio2 and extensible utils."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = {text = "Apache-2.0"}
12
+ authors = [
13
+ {name = "MiniCPM-o Utils Maintainers"},
14
+ ]
15
+ keywords = ["minicpmo", "audio", "tts", "utils", "cosyvoice", "stepaudio2"]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: Developers",
19
+ "License :: OSI Approved :: Apache Software License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ ]
25
+
26
+ # NOTE:
27
+ # - 这是“一个分发包(minicpmo-utils)”同时提供多个顶层 import 包:
28
+ # - cosyvoice (来自 ../cosyvoice/src)
29
+ # - matcha (来自 ../cosyvoice/src)
30
+ # - stepaudio2(来自 ../stepaudio2/src)
31
+ # - s3tokenizer (来自 S3Tokenizer-main)
32
+ # - minicpmo (本项目扩展 utils 的统一入口:from minicpmo.utils import ...)
33
+ dependencies = [
34
+ "numpy",
35
+ "pillow==10.4.0",
36
+ "librosa==0.9.0",
37
+ "decord==0.6.0",
38
+ "moviepy==2.1.2",
39
+ "numba==0.61.2",
40
+ ]
41
+
42
+ [project.optional-dependencies]
43
+ # cosyvoice TTS 相关依赖
44
+ tts = [
45
+ "torch>=2.3.0",
46
+ "torchaudio>=2.3.0",
47
+ "transformers>=4.51.0,<4.53.0", # 4.52+ 有兼容性问题
48
+ "onnxruntime>=1.18.0,<=1.21.0",
49
+ "onnx",
50
+ "hyperpyyaml",
51
+ "openai-whisper==20231117",
52
+ "tqdm",
53
+ "tiktoken",
54
+ "inflect",
55
+ "omegaconf>=2.0.6",
56
+ "conformer==0.3.2",
57
+ "einops==0.8.1",
58
+ "hydra-core",
59
+ "lightning==2.2.4",
60
+ "rich",
61
+ "gdown==5.2.0",
62
+ "matplotlib",
63
+ "wget",
64
+ "pyarrow",
65
+ "pyworld",
66
+ # 新增依赖
67
+ "scipy",
68
+ "pyyaml",
69
+ "regex",
70
+ "soundfile==0.12.1",
71
+ "diffusers==0.29.0"
72
+ ]
73
+
74
+ # stepaudio2 基础依赖(token2wav 等)
75
+ streaming = [
76
+ "minicpmo-utils[tts]", # streaming 依赖 tts
77
+ ]
78
+
79
+ # stepaudio2 Flash 推理引擎依赖(flashcosyvoice.engine 模块需要)
80
+ streaming-flash = [
81
+ "minicpmo-utils[streaming]",
82
+ "flash-attn>=2.6.0; sys_platform == 'linux'",
83
+ "triton>=2.3.0; sys_platform == 'linux'",
84
+ "safetensors",
85
+ "pynvml",
86
+ "xxhash",
87
+ ]
88
+
89
+ # Linux GPU onnxruntime 可以很重,且与环境强相关,保留为可选 extra
90
+ gpu = [
91
+ "onnxruntime-gpu>=1.18.0,<=1.23.2; sys_platform == 'linux'",
92
+ ]
93
+
94
+ all = [
95
+ "minicpmo-utils[tts,streaming,gpu]",
96
+ ]
97
+
98
+ [tool.setuptools]
99
+ include-package-data = true
100
+
101
+ [tool.setuptools.packages.find]
102
+ # 现在所有代码都在本项目的 src/ 下
103
+ where = ["src"]
104
+
105
+ [tool.setuptools.package-data]
106
+ "cosyvoice.tokenizer.assets" = ["*.tiktoken"]
107
+ "s3tokenizer.assets" = ["*.wav", "*.npz"]
108
+
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,17 @@
1
+ """
2
+ CosyVoice: Text-to-Speech with Large Language Model
3
+ """
4
+
5
+ __version__ = "0.1.0"
6
+
7
+ # Lazy import to avoid requiring all dependencies at package import time
8
+ def __getattr__(name):
9
+ if name in ('CosyVoice', 'CosyVoice2'):
10
+ from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
11
+ if name == 'CosyVoice':
12
+ return CosyVoice
13
+ elif name == 'CosyVoice2':
14
+ return CosyVoice2
15
+ raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
16
+
17
+ __all__ = ['CosyVoice', 'CosyVoice2']
@@ -0,0 +1,93 @@
1
+ # Copyright (c) 2020 Mobvoi Inc (Di Wu)
2
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ import argparse
18
+ import glob
19
+
20
+ import yaml
21
+ import torch
22
+
23
+
24
def get_args():
    """Parse command-line options for checkpoint averaging.

    Returns:
        argparse.Namespace with ``dst_model``, ``src_path``, ``val_best``
        and ``num`` attributes.
    """
    parser = argparse.ArgumentParser(description='average model')
    parser.add_argument('--dst_model', required=True, help='averaged model')
    parser.add_argument(
        '--src_path', required=True, help='src model path for average')
    parser.add_argument(
        '--val_best', action="store_true", help='averaged model')
    parser.add_argument(
        '--num', default=5, type=int, help='nums for averaged model')
    parsed = parser.parse_args()
    print(parsed)
    return parsed
41
+
42
+
43
def main():
    """Average the N best checkpoints (by validation loss) into one model.

    Reads per-checkpoint ``*.yaml`` metadata from ``--src_path``, keeps the
    ``--num`` entries with the lowest loss, element-wise averages their
    ``epoch_{n}_whole.pt`` state dicts and saves the result to ``--dst_model``.
    """
    args = get_args()
    val_scores = []
    if args.val_best:
        yamls = glob.glob('{}/*.yaml'.format(args.src_path))
        # Skip train/init config yamls; keep only per-checkpoint metric files.
        yamls = [
            f for f in yamls
            if not (os.path.basename(f).startswith('train')
                    or os.path.basename(f).startswith('init'))
        ]
        for y in yamls:
            with open(y, 'r') as f:
                # BaseLoader reads everything as strings, hence the explicit
                # float()/int() conversions below.
                dic_yaml = yaml.load(f, Loader=yaml.BaseLoader)
                loss = float(dic_yaml['loss_dict']['loss'])
                epoch = int(dic_yaml['epoch'])
                step = int(dic_yaml['step'])
                tag = dic_yaml['tag']
                val_scores += [[epoch, step, loss, tag]]
        # Ascending loss: the best checkpoints come first.
        sorted_val_scores = sorted(val_scores,
                                   key=lambda x: x[2],
                                   reverse=False)
        print("best val (epoch, step, loss, tag) = " +
              str(sorted_val_scores[:args.num]))
        path_list = [
            args.src_path + '/epoch_{}_whole.pt'.format(score[0])
            for score in sorted_val_scores[:args.num]
        ]
    # NOTE(review): if --val_best is not given, path_list is never assigned
    # and the code below raises NameError — confirm whether a non-val_best
    # mode was ever intended.
    print(path_list)
    avg = {}
    num = args.num
    # Fails when fewer than --num checkpoints were found.
    assert num == len(path_list)
    for path in path_list:
        print('Processing {}'.format(path))
        states = torch.load(path, map_location=torch.device('cpu'))
        for k in states.keys():
            # 'step'/'epoch' are scalar bookkeeping entries, not weights.
            if k not in ['step', 'epoch']:
                if k not in avg.keys():
                    avg[k] = states[k].clone()
                else:
                    avg[k] += states[k]
    # average
    for k in avg.keys():
        if avg[k] is not None:
            # pytorch 1.6 use true_divide instead of /=
            avg[k] = torch.true_divide(avg[k], num)
    print('Saving to {}'.format(args.dst_model))
    torch.save(avg, args.dst_model)


if __name__ == '__main__':
    main()
@@ -0,0 +1,103 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import print_function
16
+
17
+ import argparse
18
+ import logging
19
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
20
+ import os
21
+ import sys
22
+ import torch
23
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
24
+ sys.path.append('{}/../..'.format(ROOT_DIR))
25
+ sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
26
+ from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
27
+ from cosyvoice.utils.file_utils import logging
28
+
29
+
30
def get_args():
    """Parse CLI arguments: ``--model_dir`` pointing at a local pretrained model."""
    parser = argparse.ArgumentParser(description='export your model for deployment')
    parser.add_argument(
        '--model_dir',
        type=str,
        default='pretrained_models/CosyVoice-300M',
        help='local path')
    parsed = parser.parse_args()
    print(parsed)
    return parsed
39
+
40
+
41
def get_optimized_script(model, preserved_attrs=None):
    """TorchScript-compile ``model``, freeze it and optimize for inference.

    Args:
        model: an eval-mode ``torch.nn.Module`` to script.
        preserved_attrs: optional sequence of method/attribute names that
            ``torch.jit.freeze`` must keep callable (e.g. ``['forward_chunk']``).
            Default ``None`` replaces the previous mutable-default ``[]``
            (a Python anti-pattern); an empty/None value behaves the same.

    Returns:
        A frozen, inference-optimized ``torch.jit.ScriptModule``.
    """
    script = torch.jit.script(model)
    if preserved_attrs:
        script = torch.jit.freeze(script, preserved_attrs=list(preserved_attrs))
    else:
        script = torch.jit.freeze(script)
    script = torch.jit.optimize_for_inference(script)
    return script
49
+
50
+
51
def _export_scripted(module, path_prefix, preserved_attrs=None):
    """Export ``module`` as fp32 then fp16 TorchScript archives.

    Saves ``{path_prefix}.fp32.zip`` and ``{path_prefix}.fp16.zip``.
    NOTE: ``.half()`` converts the module in place, so the fp32 export must
    happen first (same ordering as the original inline code).
    """
    script = get_optimized_script(module, preserved_attrs or [])
    script.save('{}.fp32.zip'.format(path_prefix))
    script = get_optimized_script(module.half(), preserved_attrs or [])
    script.save('{}.fp16.zip'.format(path_prefix))


def main():
    """Export CosyVoice TorchScript modules for deployment.

    CosyVoice: exports llm.text_encoder, llm.llm and flow.encoder.
    CosyVoice2: exports flow.encoder only.
    (The duplicated flow-encoder branch of the original is hoisted out of
    the if/else — both paths ran identical code.)
    """
    args = get_args()
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')

    # Pin the JIT to a single static fusion strategy and disable profiling
    # so the saved archives do not depend on runtime profiling state.
    torch._C._jit_set_fusion_strategy([('STATIC', 1)])
    torch._C._jit_set_profiling_mode(False)
    torch._C._jit_set_profiling_executor(False)

    # Try CosyVoice first, then CosyVoice2; the constructors raise on a
    # mismatched model directory.
    try:
        model = CosyVoice(args.model_dir)
    except Exception:
        try:
            model = CosyVoice2(args.model_dir)
        except Exception:
            raise TypeError('no valid model_type!')

    if not isinstance(model, CosyVoice2):
        # 1. export llm text_encoder
        _export_scripted(model.model.llm.text_encoder,
                         '{}/llm.text_encoder'.format(args.model_dir))
        logging.info('successfully export llm_text_encoder')

        # 2. export llm llm ('forward_chunk' must survive freezing)
        _export_scripted(model.model.llm.llm,
                         '{}/llm.llm'.format(args.model_dir),
                         preserved_attrs=['forward_chunk'])
        logging.info('successfully export llm_llm')

    # 3. export flow encoder (both model types)
    _export_scripted(model.model.flow.encoder,
                     '{}/flow.encoder'.format(args.model_dir))
    logging.info('successfully export flow_encoder')


if __name__ == '__main__':
    main()
@@ -0,0 +1,120 @@
1
+ # Copyright (c) 2024 Antgroup Inc (authors: Zhoubofan, hexisyztem@icloud.com)
2
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from __future__ import print_function
17
+
18
+ import argparse
19
+ import logging
20
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
21
+ import os
22
+ import sys
23
+ import onnxruntime
24
+ import random
25
+ import torch
26
+ from tqdm import tqdm
27
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
28
+ sys.path.append('{}/../..'.format(ROOT_DIR))
29
+ sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
30
+ from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
31
+ from cosyvoice.utils.file_utils import logging
32
+
33
+
34
def get_dummy_input(batch_size, seq_len, out_channels, device):
    """Build random fp32 tensors shaped like the flow-decoder estimator inputs.

    Returns:
        Tuple ``(x, mask, mu, t, spks, cond)`` where x/mu/cond are
        (batch, channels, seq), mask is an all-ones (batch, 1, seq),
        t is (batch,) and spks is (batch, channels).
    """
    def _rand(*shape):
        return torch.rand(shape, dtype=torch.float32, device=device)

    x = _rand(batch_size, out_channels, seq_len)
    mask = torch.ones((batch_size, 1, seq_len), dtype=torch.float32, device=device)
    mu = _rand(batch_size, out_channels, seq_len)
    t = _rand(batch_size)
    spks = _rand(batch_size, out_channels)
    cond = _rand(batch_size, out_channels, seq_len)
    return x, mask, mu, t, spks, cond
42
+
43
+
44
def get_args():
    """Command-line options: ``--model_dir`` (local pretrained-model path)."""
    parser = argparse.ArgumentParser(description='export your model for deployment')
    parser.add_argument('--model_dir', type=str,
                        default='pretrained_models/CosyVoice-300M',
                        help='local path')
    args = parser.parse_args()
    print(args)
    return args
53
+
54
+
55
@torch.no_grad()
def main():
    """Export the flow decoder estimator to ONNX and verify numerical parity.

    Loads a CosyVoice/CosyVoice2 model from ``--model_dir``, exports
    ``flow.decoder.estimator`` with a dynamic sequence-length axis, then
    compares PyTorch vs onnxruntime outputs on random inputs of varying
    length.
    """
    args = get_args()
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')

    # Try CosyVoice first, then CosyVoice2; constructors raise on mismatch.
    try:
        model = CosyVoice(args.model_dir)
    except Exception:
        try:
            model = CosyVoice2(args.model_dir)
        except Exception:
            raise TypeError('no valid model_type!')

    # 1. export flow decoder estimator
    estimator = model.model.flow.decoder.estimator
    estimator.eval()

    device = model.model.device
    batch_size, seq_len = 2, 256
    out_channels = model.model.flow.decoder.estimator.out_channels
    x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
    torch.onnx.export(
        estimator,
        (x, mask, mu, t, spks, cond),
        '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
        export_params=True,
        opset_version=18,
        do_constant_folding=True,
        input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
        output_names=['estimator_out'],
        # Only the time axis is dynamic; batch/channel dims stay as exported.
        dynamic_axes={
            'x': {2: 'seq_len'},
            'mask': {2: 'seq_len'},
            'mu': {2: 'seq_len'},
            'cond': {2: 'seq_len'},
            'estimator_out': {2: 'seq_len'},
        }
    )

    # 2. test computation consistency
    option = onnxruntime.SessionOptions()
    option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    option.intra_op_num_threads = 1
    providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
    estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
                                                  sess_options=option, providers=providers)

    for _ in tqdm(range(10)):
        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
        output_pytorch = estimator(x, mask, mu, t, spks, cond)
        ort_inputs = {
            'x': x.cpu().numpy(),
            'mask': mask.cpu().numpy(),
            'mu': mu.cpu().numpy(),
            't': t.cpu().numpy(),
            'spks': spks.cpu().numpy(),
            'cond': cond.cpu().numpy()
        }
        output_onnx = estimator_onnx.run(None, ort_inputs)[0]
        # torch.testing.assert_allclose is deprecated (removed in modern
        # torch, which this package pins to >=2.3); assert_close is the
        # supported equivalent with the same rtol/atol semantics.
        torch.testing.assert_close(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
    logging.info('successfully export estimator')


if __name__ == "__main__":
    main()