minicpmo-utils 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cosyvoice/__init__.py +17 -0
- cosyvoice/bin/average_model.py +93 -0
- cosyvoice/bin/export_jit.py +103 -0
- cosyvoice/bin/export_onnx.py +120 -0
- cosyvoice/bin/inference_deprecated.py +126 -0
- cosyvoice/bin/train.py +195 -0
- cosyvoice/cli/__init__.py +0 -0
- cosyvoice/cli/cosyvoice.py +209 -0
- cosyvoice/cli/frontend.py +238 -0
- cosyvoice/cli/model.py +386 -0
- cosyvoice/dataset/__init__.py +0 -0
- cosyvoice/dataset/dataset.py +151 -0
- cosyvoice/dataset/processor.py +434 -0
- cosyvoice/flow/decoder.py +494 -0
- cosyvoice/flow/flow.py +281 -0
- cosyvoice/flow/flow_matching.py +227 -0
- cosyvoice/flow/length_regulator.py +70 -0
- cosyvoice/hifigan/discriminator.py +230 -0
- cosyvoice/hifigan/f0_predictor.py +58 -0
- cosyvoice/hifigan/generator.py +582 -0
- cosyvoice/hifigan/hifigan.py +67 -0
- cosyvoice/llm/llm.py +610 -0
- cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
- cosyvoice/tokenizer/tokenizer.py +279 -0
- cosyvoice/transformer/__init__.py +0 -0
- cosyvoice/transformer/activation.py +84 -0
- cosyvoice/transformer/attention.py +330 -0
- cosyvoice/transformer/convolution.py +145 -0
- cosyvoice/transformer/decoder.py +396 -0
- cosyvoice/transformer/decoder_layer.py +132 -0
- cosyvoice/transformer/embedding.py +302 -0
- cosyvoice/transformer/encoder.py +474 -0
- cosyvoice/transformer/encoder_layer.py +236 -0
- cosyvoice/transformer/label_smoothing_loss.py +96 -0
- cosyvoice/transformer/positionwise_feed_forward.py +115 -0
- cosyvoice/transformer/subsampling.py +383 -0
- cosyvoice/transformer/upsample_encoder.py +320 -0
- cosyvoice/utils/__init__.py +0 -0
- cosyvoice/utils/class_utils.py +83 -0
- cosyvoice/utils/common.py +186 -0
- cosyvoice/utils/executor.py +176 -0
- cosyvoice/utils/file_utils.py +129 -0
- cosyvoice/utils/frontend_utils.py +136 -0
- cosyvoice/utils/losses.py +57 -0
- cosyvoice/utils/mask.py +265 -0
- cosyvoice/utils/scheduler.py +738 -0
- cosyvoice/utils/train_utils.py +367 -0
- cosyvoice/vllm/cosyvoice2.py +103 -0
- matcha/__init__.py +0 -0
- matcha/app.py +357 -0
- matcha/cli.py +418 -0
- matcha/hifigan/__init__.py +0 -0
- matcha/hifigan/config.py +28 -0
- matcha/hifigan/denoiser.py +64 -0
- matcha/hifigan/env.py +17 -0
- matcha/hifigan/meldataset.py +217 -0
- matcha/hifigan/models.py +368 -0
- matcha/hifigan/xutils.py +60 -0
- matcha/models/__init__.py +0 -0
- matcha/models/baselightningmodule.py +209 -0
- matcha/models/components/__init__.py +0 -0
- matcha/models/components/decoder.py +443 -0
- matcha/models/components/flow_matching.py +132 -0
- matcha/models/components/text_encoder.py +410 -0
- matcha/models/components/transformer.py +316 -0
- matcha/models/matcha_tts.py +239 -0
- matcha/onnx/__init__.py +0 -0
- matcha/onnx/export.py +181 -0
- matcha/onnx/infer.py +168 -0
- matcha/text/__init__.py +53 -0
- matcha/text/cleaners.py +116 -0
- matcha/text/numbers.py +71 -0
- matcha/text/symbols.py +17 -0
- matcha/train.py +122 -0
- matcha/utils/__init__.py +5 -0
- matcha/utils/audio.py +82 -0
- matcha/utils/generate_data_statistics.py +111 -0
- matcha/utils/instantiators.py +56 -0
- matcha/utils/logging_utils.py +53 -0
- matcha/utils/model.py +90 -0
- matcha/utils/monotonic_align/__init__.py +22 -0
- matcha/utils/monotonic_align/setup.py +7 -0
- matcha/utils/pylogger.py +21 -0
- matcha/utils/rich_utils.py +101 -0
- matcha/utils/utils.py +219 -0
- minicpmo/__init__.py +24 -0
- minicpmo/utils.py +636 -0
- minicpmo/version.py +2 -0
- minicpmo_utils-0.1.0.dist-info/METADATA +72 -0
- minicpmo_utils-0.1.0.dist-info/RECORD +148 -0
- minicpmo_utils-0.1.0.dist-info/WHEEL +5 -0
- minicpmo_utils-0.1.0.dist-info/top_level.txt +5 -0
- s3tokenizer/__init__.py +153 -0
- s3tokenizer/assets/BAC009S0764W0121.wav +0 -0
- s3tokenizer/assets/BAC009S0764W0122.wav +0 -0
- s3tokenizer/assets/mel_filters.npz +0 -0
- s3tokenizer/cli.py +183 -0
- s3tokenizer/model.py +546 -0
- s3tokenizer/model_v2.py +605 -0
- s3tokenizer/utils.py +390 -0
- stepaudio2/__init__.py +40 -0
- stepaudio2/cosyvoice2/__init__.py +1 -0
- stepaudio2/cosyvoice2/flow/__init__.py +0 -0
- stepaudio2/cosyvoice2/flow/decoder_dit.py +585 -0
- stepaudio2/cosyvoice2/flow/flow.py +230 -0
- stepaudio2/cosyvoice2/flow/flow_matching.py +205 -0
- stepaudio2/cosyvoice2/transformer/__init__.py +0 -0
- stepaudio2/cosyvoice2/transformer/attention.py +328 -0
- stepaudio2/cosyvoice2/transformer/embedding.py +119 -0
- stepaudio2/cosyvoice2/transformer/encoder_layer.py +163 -0
- stepaudio2/cosyvoice2/transformer/positionwise_feed_forward.py +56 -0
- stepaudio2/cosyvoice2/transformer/subsampling.py +79 -0
- stepaudio2/cosyvoice2/transformer/upsample_encoder_v2.py +483 -0
- stepaudio2/cosyvoice2/utils/__init__.py +1 -0
- stepaudio2/cosyvoice2/utils/class_utils.py +41 -0
- stepaudio2/cosyvoice2/utils/common.py +101 -0
- stepaudio2/cosyvoice2/utils/mask.py +49 -0
- stepaudio2/flashcosyvoice/__init__.py +0 -0
- stepaudio2/flashcosyvoice/cli.py +424 -0
- stepaudio2/flashcosyvoice/config.py +80 -0
- stepaudio2/flashcosyvoice/cosyvoice2.py +160 -0
- stepaudio2/flashcosyvoice/cosyvoice3.py +1 -0
- stepaudio2/flashcosyvoice/engine/__init__.py +0 -0
- stepaudio2/flashcosyvoice/engine/block_manager.py +114 -0
- stepaudio2/flashcosyvoice/engine/llm_engine.py +125 -0
- stepaudio2/flashcosyvoice/engine/model_runner.py +310 -0
- stepaudio2/flashcosyvoice/engine/scheduler.py +77 -0
- stepaudio2/flashcosyvoice/engine/sequence.py +90 -0
- stepaudio2/flashcosyvoice/modules/__init__.py +0 -0
- stepaudio2/flashcosyvoice/modules/flow.py +198 -0
- stepaudio2/flashcosyvoice/modules/flow_components/__init__.py +0 -0
- stepaudio2/flashcosyvoice/modules/flow_components/estimator.py +974 -0
- stepaudio2/flashcosyvoice/modules/flow_components/upsample_encoder.py +998 -0
- stepaudio2/flashcosyvoice/modules/hifigan.py +249 -0
- stepaudio2/flashcosyvoice/modules/hifigan_components/__init__.py +0 -0
- stepaudio2/flashcosyvoice/modules/hifigan_components/layers.py +433 -0
- stepaudio2/flashcosyvoice/modules/qwen2.py +92 -0
- stepaudio2/flashcosyvoice/modules/qwen2_components/__init__.py +0 -0
- stepaudio2/flashcosyvoice/modules/qwen2_components/layers.py +616 -0
- stepaudio2/flashcosyvoice/modules/sampler.py +231 -0
- stepaudio2/flashcosyvoice/utils/__init__.py +0 -0
- stepaudio2/flashcosyvoice/utils/audio.py +77 -0
- stepaudio2/flashcosyvoice/utils/context.py +28 -0
- stepaudio2/flashcosyvoice/utils/loader.py +116 -0
- stepaudio2/flashcosyvoice/utils/memory.py +19 -0
- stepaudio2/stepaudio2.py +204 -0
- stepaudio2/token2wav.py +248 -0
- stepaudio2/utils.py +91 -0
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: minicpmo-utils
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Unified utilities package for MiniCPM-o: includes cosyvoice + stepaudio2 and extensible utils.
|
|
5
|
+
Author: MiniCPM-o Utils Maintainers
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Keywords: minicpmo,audio,tts,utils,cosyvoice,stepaudio2
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: torch>=2.3.0
|
|
18
|
+
Requires-Dist: torchaudio>=2.3.0
|
|
19
|
+
Requires-Dist: transformers>=4.49.0
|
|
20
|
+
Requires-Dist: numpy
|
|
21
|
+
Requires-Dist: hyperpyyaml
|
|
22
|
+
Requires-Dist: modelscope
|
|
23
|
+
Requires-Dist: openai-whisper
|
|
24
|
+
Requires-Dist: tqdm
|
|
25
|
+
Requires-Dist: tiktoken
|
|
26
|
+
Requires-Dist: inflect
|
|
27
|
+
Requires-Dist: omegaconf
|
|
28
|
+
Requires-Dist: einops
|
|
29
|
+
Requires-Dist: librosa
|
|
30
|
+
Requires-Dist: onnxruntime>=1.18.0
|
|
31
|
+
Requires-Dist: diffusers
|
|
32
|
+
Provides-Extra: gpu
|
|
33
|
+
Requires-Dist: onnxruntime-gpu>=1.18.0; sys_platform == "linux" and extra == "gpu"
|
|
34
|
+
|
|
35
|
+
## minicpmo-utils
|
|
36
|
+
|
|
37
|
+
一个统一安装的工具包(一个 PyPI 分发包),把仓库里的 `cosyvoice` 与 `stepaudio2` 一起打进同一个 wheel,并预留 `minicpmo` 作为后续扩展 utils 的统一入口。
|
|
38
|
+
|
|
39
|
+
### 安装方式
|
|
40
|
+
|
|
41
|
+
- 从源码本地安装(开发态,可编辑):
|
|
42
|
+
```bash
|
|
43
|
+
cd minicpmo-utils
|
|
44
|
+
pip install -e .
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
- 构建并安装 wheel(推荐分发):
|
|
48
|
+
```bash
|
|
49
|
+
cd minicpmo-utils
|
|
50
|
+
python -m build # 生成 dist/*.whl
|
|
51
|
+
pip install dist/minicpmo_utils-0.1.0-py3-none-any.whl
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### 导入方式
|
|
55
|
+
|
|
56
|
+
包会暴露以下顶层模块,安装后可直接使用:
|
|
57
|
+
- `import cosyvoice`
|
|
58
|
+
- `import stepaudio2`
|
|
59
|
+
- `import matcha`
|
|
60
|
+
- `import minicpmo`
|
|
61
|
+
|
|
62
|
+
也支持通过统一入口导入子包:
|
|
63
|
+
```python
|
|
64
|
+
from minicpmo import cosyvoice, stepaudio2, matcha
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
以及通过统一的 utils 入口使用通用工具函数,例如:
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
from minicpmo.utils import get_video_frame_audio_segments
|
|
71
|
+
```
|
|
72
|
+
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
cosyvoice/__init__.py,sha256=yXNTKqiEBedjE7e_Qav__IUpv9XfG9-em4yJxmoH5Zs,530
|
|
2
|
+
cosyvoice/bin/average_model.py,sha256=82LeGlvQh8xFHA_T9fJJDBTtDhJ_UzukJJcFRjyjc9Y,3202
|
|
3
|
+
cosyvoice/bin/export_jit.py,sha256=36EvvRkOE621pMqDL3Il69hXdLXsJtSGtDO3_r13nmA,3906
|
|
4
|
+
cosyvoice/bin/export_onnx.py,sha256=nUHWmNTkGMowvHuXI92OR7R6v1doVsDxvxEzP5GWiQk,4661
|
|
5
|
+
cosyvoice/bin/inference_deprecated.py,sha256=d4x-8eER00VwuOEHnObd9kd7cOriv_xbXZOAEXG60fk,6126
|
|
6
|
+
cosyvoice/bin/train.py,sha256=1bpuU0d8fwc5ygrY65FbT9kfWpdZkfsXX83NBkNAMLc,8071
|
|
7
|
+
cosyvoice/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
cosyvoice/cli/cosyvoice.py,sha256=WCNOWg-_CJ-y9HDxL1tFrCirnB2a030jyN5EaELke6s,12808
|
|
9
|
+
cosyvoice/cli/frontend.py,sha256=8z3yYsmjTpi8olTD2DNooPuW0nFnwDKSJIzPETvqv4c,13827
|
|
10
|
+
cosyvoice/cli/model.py,sha256=X3Za8Ak5LEeRUldLalztoSnu6UsCP9WcYQFvu9MDacQ,24008
|
|
11
|
+
cosyvoice/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
cosyvoice/dataset/dataset.py,sha256=QlINnr5R2kyDQQudr9jqdK_z63VQME2WJo8UBX8sVTo,4804
|
|
13
|
+
cosyvoice/dataset/processor.py,sha256=RBpJ8QlP00_e8qkjEo86h2nBvYcuttNVtp_HIixx-wY,15876
|
|
14
|
+
cosyvoice/flow/decoder.py,sha256=717Oudt_Y93aHVvKa_prKLjqEWVsSx-cEJ0o9lbLvyk,19866
|
|
15
|
+
cosyvoice/flow/flow.py,sha256=qEl_61gzbnVmsfCF0RrP-ctPGiSUmr0sJE-_l8dvm20,12156
|
|
16
|
+
cosyvoice/flow/flow_matching.py,sha256=sa1nH-N_hywDS96PdcwZwbiHWNVON1-itU4UoIit3-Y,10499
|
|
17
|
+
cosyvoice/flow/length_regulator.py,sha256=srvavaBIUN8Mk0Vi35WyN8og-n6P6J0E2bgnqZ1nQRs,3137
|
|
18
|
+
cosyvoice/hifigan/discriminator.py,sha256=PbWxtVhMYAyZbujFPqAhNwqK2cYuP1oo1c8l8Dq5-c8,8617
|
|
19
|
+
cosyvoice/hifigan/f0_predictor.py,sha256=uoymCIodAtYIb-oh0E1p7pgXvdjqICiZgJnLCXGhhmk,2065
|
|
20
|
+
cosyvoice/hifigan/generator.py,sha256=90YB5v6-tBCpYejtiTG0QHTThd7X9vd-6RigKbPUJiY,22754
|
|
21
|
+
cosyvoice/hifigan/hifigan.py,sha256=6QDaL15-wwLbmm6m0rcZkOq9iCPgfllBm32MUt99wNY,3240
|
|
22
|
+
cosyvoice/llm/llm.py,sha256=aYR8ZUgeYrXba-zoS7hONqbCIMsWVlHqt65Ug1uw6uQ,30566
|
|
23
|
+
cosyvoice/tokenizer/tokenizer.py,sha256=lDQPx83ycMaaOutjKQxSQQROIHFOAf6nNvNh-eWlbfI,7456
|
|
24
|
+
cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken,sha256=dHl5Yx6BMZNDaqvP98HCNdN96Al7ccVj7Itjt6UVxxg,907395
|
|
25
|
+
cosyvoice/transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
|
+
cosyvoice/transformer/activation.py,sha256=pKlsrqn3sFERKG3l6nYL39-cTlNEj1NCCFfcBKUEQMI,3089
|
|
27
|
+
cosyvoice/transformer/attention.py,sha256=QdJpstXjo5UsClOPRkgK_4Vwdn64joBFLzZ0Ns72KLE,14389
|
|
28
|
+
cosyvoice/transformer/convolution.py,sha256=619B8ySpciXHO5xDCvi7IxvXc4bvGEULsP0yn0aatOE,5230
|
|
29
|
+
cosyvoice/transformer/decoder.py,sha256=2wQscn4OZTrJJHM7H7FeaXkv_YDJ089iJIN0VV1Yocw,16580
|
|
30
|
+
cosyvoice/transformer/decoder_layer.py,sha256=uVZiq3LsawsPUMOhX77PFvrLeG0yO0rKHQY7nCHA1k4,4807
|
|
31
|
+
cosyvoice/transformer/embedding.py,sha256=tQwwvlxmw5yV4NsQFYFMVF_NBiqTbA6nrUwBUIslldw,11777
|
|
32
|
+
cosyvoice/transformer/encoder.py,sha256=J_nXSZcgNy--Z3TQkLif8GPH7PiPk6TXWye7GtspGKU,21434
|
|
33
|
+
cosyvoice/transformer/encoder_layer.py,sha256=GSBYK-LJt894Nee1ORGOweudqPLHEcYlf4WYs3kpUbk,9602
|
|
34
|
+
cosyvoice/transformer/label_smoothing_loss.py,sha256=24gEzxwg4a-_bDPeSDZYmxlH2IF5fQLVB8KoqNT0D90,3459
|
|
35
|
+
cosyvoice/transformer/positionwise_feed_forward.py,sha256=boA447zIyght3KUI-5udQL86uYvrq89clJNdAyMp0Pg,4219
|
|
36
|
+
cosyvoice/transformer/subsampling.py,sha256=MfwDR6hRq8EgXf1M9oCZwMQWWJw-maB7JQ6GMM7OGdA,12666
|
|
37
|
+
cosyvoice/transformer/upsample_encoder.py,sha256=qAA8ISzmRpfOQwAfd2kC7mBpajt-Nzk1R5Ap3Mr31Wk,14149
|
|
38
|
+
cosyvoice/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
39
|
+
cosyvoice/utils/class_utils.py,sha256=ddaXf3V0ME-EZM21DIiVZMpVepmhEkGZGLKYHw6Nz8I,3321
|
|
40
|
+
cosyvoice/utils/common.py,sha256=YWGo2Q176wdm9tLeZ8zONadUr0OTDCivN8TRm2Xu-Jc,6792
|
|
41
|
+
cosyvoice/utils/executor.py,sha256=LqeifOT3xpwno9oy97jBBi1nuyCMZVsJsVeJh6n5vp4,8835
|
|
42
|
+
cosyvoice/utils/file_utils.py,sha256=mV_0mRdhZSTODqVBgopxrjeLGbCWf-VLGVtHfgNcN_8,5461
|
|
43
|
+
cosyvoice/utils/frontend_utils.py,sha256=DQypTgz1GeLRf3LNHcq2yweuoN5I5-eSgmNiKE6hjTA,4273
|
|
44
|
+
cosyvoice/utils/losses.py,sha256=XND3_XjOViLBm7kRZRa3BWmeSMCPIXubiqESfyv5XBA,2121
|
|
45
|
+
cosyvoice/utils/mask.py,sha256=hSxuSxQgGiOKsHY5bbWZwVey7lpaKqzw8nfdzoxkiXY,9728
|
|
46
|
+
cosyvoice/utils/scheduler.py,sha256=lEfquE_Lcer2VG2zUVa0n-UxgvJEdEodyT66so-h6jQ,24920
|
|
47
|
+
cosyvoice/utils/train_utils.py,sha256=yUFlYHUZRtrqQBx7kBJiYPbx2ArtAPKWzAxHQk4vsoo,16581
|
|
48
|
+
cosyvoice/vllm/cosyvoice2.py,sha256=csgta5DvFgOjo_D6lrBPBdc66b5pBAwAfk_6SPTyb4k,4056
|
|
49
|
+
matcha/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
50
|
+
matcha/app.py,sha256=bBOG1VM5L8X64aF8rH4iddu45dje9lmkyr5ubAq_4-4,13992
|
|
51
|
+
matcha/cli.py,sha256=Iw_SITi4QrYGb_HZnxBNZ_ivnZ_zvXSJJEbyQ4clad8,15519
|
|
52
|
+
matcha/train.py,sha256=eKFZFkaSvmdfMNhOAcvp63kp-Wvj_9IE47mAp4sHGxg,4613
|
|
53
|
+
matcha/hifigan/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
54
|
+
matcha/hifigan/config.py,sha256=cxS_YTK_UoO2uma_Ue8QRFtNA6ldSEt1EL36u2-NgCc,779
|
|
55
|
+
matcha/hifigan/denoiser.py,sha256=Q2juI2a3QTqce6fuvqWQnShoV671kVW2JsQyoqwwvLI,2644
|
|
56
|
+
matcha/hifigan/env.py,sha256=QthDmqTWWAIo2tSI-arVteRVxnKUnm9mkjlQVnXbmDc,429
|
|
57
|
+
matcha/hifigan/meldataset.py,sha256=4Ps0NXA3Yv0oV2PyCdHcptc4lZ43tp978m3BM2NYWxc,6786
|
|
58
|
+
matcha/hifigan/models.py,sha256=Ln7J7YzeN4h33tBKQHhlPcoNfmX-_-kwBkTZIYJY5q8,11668
|
|
59
|
+
matcha/hifigan/xutils.py,sha256=aNtYu1SyQaGtbN_NnSfLGhTi-r_y3vYKMIf7EYGaDOA,1396
|
|
60
|
+
matcha/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
61
|
+
matcha/models/baselightningmodule.py,sha256=45aD1xV6AunZ7_O6RU6-B0OaUMMNO_gFKUWprs3EfCc,7003
|
|
62
|
+
matcha/models/matcha_tts.py,sha256=zbpw9noGwR_fG9yBKxxtMhmHgnxS7PPE1_ft9KK-jPo,10059
|
|
63
|
+
matcha/models/components/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
64
|
+
matcha/models/components/decoder.py,sha256=fqlKYc08lnA6oQf9zRuC3i8vat4go8X0NLYAUxgXams,14459
|
|
65
|
+
matcha/models/components/flow_matching.py,sha256=pvKy7somAHSdV13ahiTYZNkEIn4qn3ZKbLK4kpBEXmw,4657
|
|
66
|
+
matcha/models/components/text_encoder.py,sha256=QEQynUB7PaBsdlJ-e1TUBAe9rL2LBnmk5bAkr_vCTek,14845
|
|
67
|
+
matcha/models/components/transformer.py,sha256=Zv8gktl0qZslG52OXAi7zszq7fr0HivYrytLx6Vty-o,13237
|
|
68
|
+
matcha/onnx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
69
|
+
matcha/onnx/export.py,sha256=VUAviRbyh9WhXaxLPQjdQLwaVK_ZZNgaWPMFA2UKh4c,5386
|
|
70
|
+
matcha/onnx/infer.py,sha256=ACgjtfMdvzuU4XmalNLrebmkBD2W8VLcLi8blXnjUmU,6293
|
|
71
|
+
matcha/text/__init__.py,sha256=6dxZsGZ25Fvk4_b72yfA7NVcNInCJLSjpvN9Mc66uBI,1696
|
|
72
|
+
matcha/text/cleaners.py,sha256=t4-wsMBQdJT6J12mHiXME23tmpOi8pmV-GkejDVcNf8,3560
|
|
73
|
+
matcha/text/numbers.py,sha256=Nwp-HmqaqstAELVqqlMu_7bj4qiWQzMXFmw56nBq1h0,2249
|
|
74
|
+
matcha/text/symbols.py,sha256=hw78I8JtBmkri5HiDUQC05i2N9MFkxWJoHujIxJFP1c,635
|
|
75
|
+
matcha/utils/__init__.py,sha256=YNs91rjzeUcVSs3aZjv9thma42I81Jksy9RF3nquvOo,326
|
|
76
|
+
matcha/utils/audio.py,sha256=L3QQZL_MlIXRmkI1ZJbYriTeyAw9bvXKf06Kv7-cYco,2282
|
|
77
|
+
matcha/utils/generate_data_statistics.py,sha256=E5Z9Tpa6ZFXfSLLQcAwhg7YYqBgLZDkIqBoGoOZhEjA,3269
|
|
78
|
+
matcha/utils/instantiators.py,sha256=QxqOnRVabCSeurpPODXqh3JSZ-E7m8jKSsxy1WvTX-I,1828
|
|
79
|
+
matcha/utils/logging_utils.py,sha256=glOI_JG8_YBKHWwD5RRKKkCez3N7DJlH27Vz48yAEtU,1711
|
|
80
|
+
matcha/utils/model.py,sha256=UViKHaV89_IeaKJFww1xHV_RTXqv0YvfQWqwOtnzQ-I,2935
|
|
81
|
+
matcha/utils/pylogger.py,sha256=YbC8Ym5HZrJcDBIsQO6jSnuyY5CLZQR13E_oAS9SYZQ,720
|
|
82
|
+
matcha/utils/rich_utils.py,sha256=Oj5jrkz5s1b3RJL6m_8EXj85LY079FWClMIzf_Gwvcc,3279
|
|
83
|
+
matcha/utils/utils.py,sha256=-XaKibvqf6mXCoGPkNcFKB8Ynq_bK6_S8-ntUhSR1WE,7159
|
|
84
|
+
matcha/utils/monotonic_align/__init__.py,sha256=_s_INV7vL_N9mhYtZADAk11CsSGP8kykn0fEyyM73ts,646
|
|
85
|
+
matcha/utils/monotonic_align/setup.py,sha256=bf0d0cvGRaACC22qq0sqgZEBBhz4qzDlMqBxOyTKC2g,207
|
|
86
|
+
minicpmo/__init__.py,sha256=wyo8jLG2ATqFcTUrcTcvIzzaf37AYcbHdOAK15PLTjE,556
|
|
87
|
+
minicpmo/utils.py,sha256=XJ717gRjAto7KytP8LFiwBfyFaom6fYtVG9oOMOKkiw,22445
|
|
88
|
+
minicpmo/version.py,sha256=p4MxVO8ih2-hnh7INVBkrqMKBDGTkMso1LEbM6mx3jg,23
|
|
89
|
+
s3tokenizer/__init__.py,sha256=8fg7P72mJjjvkyieA6Y2thBj2eNV_9hBxJp8aW3gcVg,5048
|
|
90
|
+
s3tokenizer/cli.py,sha256=qUUA-5Ro-ILeQd-6YXVuItpYikEdaphIUfabpUTlIh4,5942
|
|
91
|
+
s3tokenizer/model.py,sha256=v-DfJg11h9ztX1gHlUOLcpx9xq4ZZT5aqHOKNtG7Hzw,19542
|
|
92
|
+
s3tokenizer/model_v2.py,sha256=x3hNCIT4UaHb715HojZ3AbEBy-wcPDWR-EohJiY4Aw0,21806
|
|
93
|
+
s3tokenizer/utils.py,sha256=baAg5C7yi0AZx7caF0hJBU-pkj3coMPKM8QrLqgk-9k,14140
|
|
94
|
+
s3tokenizer/assets/BAC009S0764W0121.wav,sha256=RtvJmMnR1IERJnxAdB3TIA8uW89AdfjEyX9EURYNzlA,134570
|
|
95
|
+
s3tokenizer/assets/BAC009S0764W0122.wav,sha256=V2DHrOCSOkmfYF03MBHHqvZY4wSgmZpJPGNmnHVjhOE,131724
|
|
96
|
+
s3tokenizer/assets/mel_filters.npz,sha256=dFCucHI6XvnTQePO5ijHywF382zkLES37SvzMl8PbUw,4271
|
|
97
|
+
stepaudio2/__init__.py,sha256=ahfLETm7fp_A320ERK6j2xT4nTyecy8OA8Z-syVLuQQ,1547
|
|
98
|
+
stepaudio2/stepaudio2.py,sha256=hf1EZl5odF8wFqs0VqC-B_DBioHrfpzWgZN501o1-YA,9265
|
|
99
|
+
stepaudio2/token2wav.py,sha256=eAhDCCXjf9R7F5TsNfbNP5Se256Kqq9gKIrKFh0I5qE,11642
|
|
100
|
+
stepaudio2/utils.py,sha256=p83TqxnLAu2OyAq3eCnESlkbU9kkNdH5Tnh1Da-J7Fg,3384
|
|
101
|
+
stepaudio2/cosyvoice2/__init__.py,sha256=VFv7gYSAXPuP6d6arx-EqTy8W1WbPM4lGmdmPENMWT8,44
|
|
102
|
+
stepaudio2/cosyvoice2/flow/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
103
|
+
stepaudio2/cosyvoice2/flow/decoder_dit.py,sha256=eQxV2JjZLI2HSWHltHtfnU3VwGfsIMpepmgzsyzisV0,22418
|
|
104
|
+
stepaudio2/cosyvoice2/flow/flow.py,sha256=-zF1KSmD8PouUvJ6Bg_V_2RwwotBXbqFjE_oALoXJ4Q,8284
|
|
105
|
+
stepaudio2/cosyvoice2/flow/flow_matching.py,sha256=CdXiwC78e2ViEre35wP2pVxlz6Xvtmv8bEcHrfuRYOM,8385
|
|
106
|
+
stepaudio2/cosyvoice2/transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
107
|
+
stepaudio2/cosyvoice2/transformer/attention.py,sha256=Yy3xmqfnEjE9DxsGeJ7NKKh5YLu7nsLSLcCgmieUo90,14314
|
|
108
|
+
stepaudio2/cosyvoice2/transformer/embedding.py,sha256=2xDZKyM0P26YG2ZBevr-G3ZkyVukZEKjetho0yobSo4,4766
|
|
109
|
+
stepaudio2/cosyvoice2/transformer/encoder_layer.py,sha256=3lRTqu3m7KhJJhqc6B4M5Oqgvakz3junmX2TTYtVXEE,6838
|
|
110
|
+
stepaudio2/cosyvoice2/transformer/positionwise_feed_forward.py,sha256=r49V1dFDxbnGocDqCO-B8OXhb3Z4oykhRD5DmBMghAo,1887
|
|
111
|
+
stepaudio2/cosyvoice2/transformer/subsampling.py,sha256=uFjanKdf855iAlq3sv4YhpbOQoInRI2Q7xADLIfIFqc,2529
|
|
112
|
+
stepaudio2/cosyvoice2/transformer/upsample_encoder_v2.py,sha256=YVqzOMmNj1-GqJCbDXzzX6XS_WgZD2laYF8LIuNgbAQ,18854
|
|
113
|
+
stepaudio2/cosyvoice2/utils/__init__.py,sha256=J3q6oPwKcQzvgZAdW8pZ58wpCRgx73ZDa12YdsggeTw,35
|
|
114
|
+
stepaudio2/cosyvoice2/utils/class_utils.py,sha256=dGLa5anz6qeHKMAcPMh_z2gNGp2SuFLbjT30YN5gGp0,1377
|
|
115
|
+
stepaudio2/cosyvoice2/utils/common.py,sha256=WIuAd4n0cGwwk1NyD0oeU1oK-5csW76kXL-xz1graII,3334
|
|
116
|
+
stepaudio2/cosyvoice2/utils/mask.py,sha256=CkCYG5pGBHLH-Fdt5fEeQqnc3ytXFYA5g3dhsD5RtpY,1703
|
|
117
|
+
stepaudio2/flashcosyvoice/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
118
|
+
stepaudio2/flashcosyvoice/cli.py,sha256=eSDEAFtGbgUv91vTzx4KXFRfAtx6eIgXg_5Ut3EtxUs,19863
|
|
119
|
+
stepaudio2/flashcosyvoice/config.py,sha256=4VoEScKra4qgHxbcFuTycYAPMYYijt-Jjf6GEeKzCdU,2619
|
|
120
|
+
stepaudio2/flashcosyvoice/cosyvoice2.py,sha256=7t2pRvJRxxVHrLRAOMMXwQjsOTHnxnHGGG3ciBk6xmk,7729
|
|
121
|
+
stepaudio2/flashcosyvoice/cosyvoice3.py,sha256=UMxsD-RBudOyiUc4zLNwveluKfwwP4AcrrDOWRrSXLk,57
|
|
122
|
+
stepaudio2/flashcosyvoice/engine/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
123
|
+
stepaudio2/flashcosyvoice/engine/block_manager.py,sha256=Q6Fd3GbUwYlXUJlFdjVh_t682C0N0yCAjUVSrAeQb3w,4027
|
|
124
|
+
stepaudio2/flashcosyvoice/engine/llm_engine.py,sha256=j16UKgoPVxmO-f4oMQ4cR6CxmR4OqRQvlHjm2GFnhHM,5430
|
|
125
|
+
stepaudio2/flashcosyvoice/engine/model_runner.py,sha256=fzYoPoUkcr1epAtaDeKVTTypWTcpkDj87MD4-f88opU,14036
|
|
126
|
+
stepaudio2/flashcosyvoice/engine/scheduler.py,sha256=4o8M0XhFCrkaFtoGQHgz0BitYA_YpZP_mvTrfo0PwM4,3134
|
|
127
|
+
stepaudio2/flashcosyvoice/engine/sequence.py,sha256=pwYdtCST9ZHqxPK62JCPRxt-G885nAbYh26oOuLHuao,2835
|
|
128
|
+
stepaudio2/flashcosyvoice/modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
129
|
+
stepaudio2/flashcosyvoice/modules/flow.py,sha256=INAXZTIkGCxdosq7LBSNxb6BNQmWqDGyQV2Xshh228s,8292
|
|
130
|
+
stepaudio2/flashcosyvoice/modules/hifigan.py,sha256=5wrEedjKzE8D6m1JNcaKxyLkvH2cgI0tBbU_1OWFzoo,9991
|
|
131
|
+
stepaudio2/flashcosyvoice/modules/qwen2.py,sha256=L4Kx5s0SdqUmOYv6X6UCkcbRIctG3Dl8IPCNmiuB0Y0,3346
|
|
132
|
+
stepaudio2/flashcosyvoice/modules/sampler.py,sha256=xhRQW6yP9e5mHjY8npypwZY1oe9mZ5CLH_IG5Oz1EAw,10914
|
|
133
|
+
stepaudio2/flashcosyvoice/modules/flow_components/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
134
|
+
stepaudio2/flashcosyvoice/modules/flow_components/estimator.py,sha256=N7DmynOzkItXY2jTZl6eqaSH5IsXlkumpDxyOP-__5g,38148
|
|
135
|
+
stepaudio2/flashcosyvoice/modules/flow_components/upsample_encoder.py,sha256=B2G2NM9xPeIx6r3v8n6Y49qh07PGBo55YZajL_m1Xjw,41265
|
|
136
|
+
stepaudio2/flashcosyvoice/modules/hifigan_components/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
137
|
+
stepaudio2/flashcosyvoice/modules/hifigan_components/layers.py,sha256=jEb8ZuPZZ6Q2Nx5CKBrmCkAeTLR-Hlw7k1qxuUJ4kD0,16353
|
|
138
|
+
stepaudio2/flashcosyvoice/modules/qwen2_components/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
139
|
+
stepaudio2/flashcosyvoice/modules/qwen2_components/layers.py,sha256=34Z0U3Qus11b6iE8I-k42lGq0lHz0KYDnVjUt6f0f84,21366
|
|
140
|
+
stepaudio2/flashcosyvoice/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
141
|
+
stepaudio2/flashcosyvoice/utils/audio.py,sha256=afxVAIkV_QliIzAwyOW1QygyjZxBUXjLwz4rSCbBs-c,2162
|
|
142
|
+
stepaudio2/flashcosyvoice/utils/context.py,sha256=y_u92r100Wp7xjs4tQd70eB00iaqUOa92qkb5sw-8ho,821
|
|
143
|
+
stepaudio2/flashcosyvoice/utils/loader.py,sha256=ETQwg7E6HxndH-hWSwTcSg2Q2MWrPDTEZtb2ebZaxS0,6030
|
|
144
|
+
stepaudio2/flashcosyvoice/utils/memory.py,sha256=ciip3CEUfiSGe6qOFOfsYsGi6yYFcNcZVoAjnABVp28,597
|
|
145
|
+
minicpmo_utils-0.1.0.dist-info/METADATA,sha256=dVIxQpW56luyHDeDXMrGMnyKNgUo9VBEo61Zi1a5YCA,2120
|
|
146
|
+
minicpmo_utils-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
147
|
+
minicpmo_utils-0.1.0.dist-info/top_level.txt,sha256=yIeAnGCB_ihapVKakC0wqRtAwCjezcDPCC-z6Vjlt0s,49
|
|
148
|
+
minicpmo_utils-0.1.0.dist-info/RECORD,,
|
s3tokenizer/__init__.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# Copyright (c) 2023 OpenAI. (authors: Whisper Team)
|
|
2
|
+
# 2024 Tsinghua Univ. (authors: Xingchen Song)
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
"""Modified from
|
|
16
|
+
https://github.com/openai/whisper/blob/main/whisper/__init__.py
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import hashlib
|
|
20
|
+
import os
|
|
21
|
+
import urllib
|
|
22
|
+
import warnings
|
|
23
|
+
from typing import List, Union
|
|
24
|
+
|
|
25
|
+
from tqdm import tqdm
|
|
26
|
+
|
|
27
|
+
from s3tokenizer.model_v2 import S3TokenizerV2
|
|
28
|
+
|
|
29
|
+
from .model import S3Tokenizer
|
|
30
|
+
from .utils import (load_audio, log_mel_spectrogram, make_non_pad_mask,
|
|
31
|
+
mask_to_bias, onnx2torch, padding, merge_tokenized_segments)
|
|
32
|
+
|
|
33
|
+
__all__ = [
|
|
34
|
+
'load_audio', 'log_mel_spectrogram', 'make_non_pad_mask', 'mask_to_bias',
|
|
35
|
+
'onnx2torch', 'padding', 'merge_tokenized_segments'
|
|
36
|
+
]
|
|
37
|
+
# Download URL of the ONNX checkpoint for every supported model name.
_MODELS = {
    "speech_tokenizer_v1": (
        "https://www.modelscope.cn/models/iic/cosyvoice-300m/"
        "resolve/master/speech_tokenizer_v1.onnx"
    ),
    "speech_tokenizer_v1_25hz": (
        "https://www.modelscope.cn/models/iic/CosyVoice-300M-25Hz/"
        "resolve/master/speech_tokenizer_v1.onnx"
    ),
    "speech_tokenizer_v2_25hz": (
        "https://www.modelscope.cn/models/iic/CosyVoice2-0.5B/"
        "resolve/master/speech_tokenizer_v2.onnx"
    ),
}

# Expected SHA256 hex digest of each checkpoint, keyed like ``_MODELS``.
_SHA256S = {
    "speech_tokenizer_v1": (
        "23b5a723ed9143aebfd9ffda14ac4c21231f31c35ef837b6a13bb9e5488abb1e"
    ),
    "speech_tokenizer_v1_25hz": (
        "56285ddd4a83e883ee0cb9f8d69c1089b53a94b1f78ff7e4a0224a27eb4cb486"
    ),
    "speech_tokenizer_v2_25hz": (
        "d43342aa12163a80bf07bffb94c9de2e120a8df2f9917cd2f642e7f4219c6f71"
    ),
}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _sha256_of_file(path: str) -> str:
    """Return the SHA256 hex digest of the file at ``path``, read in chunks."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _download(name: str, root: str) -> Union[bytes, str]:
    """Ensure the ONNX checkpoint for ``name`` exists in ``root``.

    A file already present at ``<root>/<name>.onnx`` is reused when its
    SHA256 digest matches ``_SHA256S[name]``; otherwise a warning is issued
    and the file is downloaded again from ``_MODELS[name]``.

    Parameters
    ----------
    name : str
        key into ``_MODELS`` / ``_SHA256S``.
    root : str
        directory to store the checkpoint in; created if missing.

    Returns
    -------
    str
        path of the verified checkpoint file.

    Raises
    ------
    RuntimeError
        if the target path exists but is not a regular file, or if the
        downloaded file does not match the expected checksum.
    """
    # ``import urllib`` does not by itself load the ``request`` submodule,
    # so import it explicitly instead of relying on another module having
    # done so already.
    import urllib.request

    os.makedirs(root, exist_ok=True)

    expected_sha256 = _SHA256S[name]
    url = _MODELS[name]
    download_target = os.path.join(root, f"{name}.onnx")

    if os.path.exists(download_target) and not os.path.isfile(download_target):
        raise RuntimeError(
            f"{download_target} exists and is not a regular file")

    if os.path.isfile(download_target):
        if _sha256_of_file(download_target) == expected_sha256:
            return download_target
        warnings.warn(
            f"{download_target} exists, but the SHA256 checksum does not"
            " match; re-downloading the file")

    with urllib.request.urlopen(url) as source, open(download_target,
                                                     "wb") as output:
        # The server may omit Content-Length; tqdm then shows a progress
        # counter without a known total instead of crashing on int(None).
        content_length = source.info().get("Content-Length")
        total = int(content_length) if content_length is not None else None
        with tqdm(
                total=total,
                ncols=80,
                unit="iB",
                unit_scale=True,
                unit_divisor=1024,
                desc="Downloading onnx checkpoint",
        ) as loop:
            while True:
                buffer = source.read(8192)
                if not buffer:
                    break

                output.write(buffer)
                loop.update(len(buffer))

    # Verify only after the output file has been closed (and flushed).
    if _sha256_of_file(download_target) != expected_sha256:
        raise RuntimeError(
            "Model has been downloaded but the SHA256 checksum does not"
            " match. Please retry loading the model.")

    return download_target
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def available_models() -> List[str]:
    """List the model names that have a registered download URL."""
    return [model_name for model_name in _MODELS]
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def load_model(
    name: str,
    download_root: str = None,
) -> S3Tokenizer:
    """
    Build a tokenizer model and initialise it from an ONNX checkpoint.

    Parameters
    ----------
    name : str
        either one of the names returned by
        `s3tokenizer.available_models()` (the checkpoint is then fetched
        into `download_root`), or the path of an existing checkpoint file.
        When `name` contains the substring 'v2' an `S3TokenizerV2` is
        created, otherwise an `S3Tokenizer`.
    download_root: str
        directory for downloaded checkpoints; when omitted it is
        "$XDG_CACHE_HOME/s3tokenizer", falling back to
        "~/.cache/s3tokenizer" if XDG_CACHE_HOME is unset.

    Returns
    -------
    model : S3Tokenizer
        the model instance after `init_from_onnx` has been called on it

    Raises
    ------
    RuntimeError
        if `name` is neither a known model name nor an existing file.
    """
    if download_root is None:
        home_cache = os.path.join(os.path.expanduser("~"), ".cache")
        cache_base = os.getenv("XDG_CACHE_HOME", home_cache)
        download_root = os.path.join(cache_base, "s3tokenizer")

    is_official = name in _MODELS
    if not is_official and not os.path.isfile(name):
        raise RuntimeError(
            f"Model {name} not found; available models = {available_models()}")

    # Official names are downloaded (or reused from cache); anything else
    # has been confirmed above to be a local checkpoint file.
    checkpoint_file = _download(name, download_root) if is_official else name

    tokenizer_cls = S3TokenizerV2 if 'v2' in name else S3Tokenizer
    model = tokenizer_cls(name)
    model.init_from_onnx(checkpoint_file)
    return model
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
s3tokenizer/cli.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
# Copyright (c) 2024 Tsinghua Univ. (authors: Xingchen Song)
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
""" Example Usage
|
|
15
|
+
cpu:
|
|
16
|
+
|
|
17
|
+
s3tokenizer --wav_scp xxx.scp \
|
|
18
|
+
--device "cpu" \
|
|
19
|
+
--output_dir "./" \
|
|
20
|
+
--batch_size 32
|
|
21
|
+
|
|
22
|
+
gpu:
|
|
23
|
+
|
|
24
|
+
torchrun --nproc_per_node=8 --nnodes=1 \
|
|
25
|
+
--rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
|
|
26
|
+
`which s3tokenizer` --wav_scp xxx.scp \
|
|
27
|
+
--device "cuda" \
|
|
28
|
+
--output_dir "./" \
|
|
29
|
+
--batch_size 32
|
|
30
|
+
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
import argparse
|
|
34
|
+
import json
|
|
35
|
+
import os
|
|
36
|
+
|
|
37
|
+
import torch
|
|
38
|
+
import torch.distributed as dist
|
|
39
|
+
from torch.utils.data import DataLoader, Dataset, DistributedSampler
|
|
40
|
+
from tqdm import tqdm
|
|
41
|
+
|
|
42
|
+
import s3tokenizer
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class AudioDataset(Dataset):
    """Dataset yielding ``(key, mel)`` pairs for the entries of a wav.scp file.

    Each non-empty line of the scp file has the form ``<key> <path>``: the
    first whitespace-delimited token is the utterance key and the rest of the
    line is the audio file path.
    """

    def __init__(self, wav_scp):
        """Read utterance keys and audio paths from ``wav_scp``.

        Args:
            wav_scp: Path to a UTF-8 text file with one ``key path`` entry
                per line. Blank lines are ignored.

        Raises:
            ValueError: If a non-empty line does not contain both a key and
                a path.
        """
        self.data = []
        self.keys = []

        with open(wav_scp, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                entry = line.strip()
                if not entry:
                    # Blank lines (e.g. a trailing newline) carry no entry.
                    continue
                # Split only once so that paths containing spaces stay intact.
                fields = entry.split(maxsplit=1)
                if len(fields) != 2:
                    raise ValueError(
                        f"{wav_scp}:{line_no}: expected `wav_name wav_path`, "
                        f"got {entry!r}")
                key, file_path = fields
                self.data.append(file_path)
                self.keys.append(key)

    def __len__(self):
        """Return the number of entries read from the scp file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load the audio at position ``idx`` and compute its log-mel features.

        Returns:
            Tuple ``(key, mel)`` where ``mel`` is the result of
            ``s3tokenizer.log_mel_spectrogram`` applied to the loaded audio.
        """
        file_path = self.data[idx]
        key = self.keys[idx]
        audio = s3tokenizer.load_audio(file_path)
        mel = s3tokenizer.log_mel_spectrogram(audio)
        return key, mel
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def collate_fn(batch):
    """Collate dataset samples into one padded batch.

    Args:
        batch: Sequence of samples; element 0 of each sample is its key and
            element 1 its mel feature.

    Returns:
        Tuple ``(keys, mels, mels_lens)``: the list of keys followed by the
        two values returned by ``s3tokenizer.padding`` for the list of mels.
    """
    keys = []
    features = []
    for sample in batch:
        keys.append(sample[0])
        features.append(sample[1])
    padded, lengths = s3tokenizer.padding(features)
    return keys, padded, lengths
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def init_distributed():
    """Initialize the NCCL process group for multi-GPU inference.

    Reads ``WORLD_SIZE``, ``LOCAL_RANK`` and ``RANK`` from the environment
    (falling back to 1, 0 and 0 respectively), makes the GPU with index
    ``local_rank`` the current CUDA device and initializes the default
    process group with the "nccl" backend.

    Returns:
        Tuple ``(world_size, local_rank, rank)`` of ints.
    """
    env = os.environ
    world_size = int(env.get('WORLD_SIZE', 1))
    local_rank = int(env.get('LOCAL_RANK', 0))
    rank = int(env.get('RANK', 0))

    message = ('Inference on multiple gpus, this gpu {}, rank {}, '
               'world_size {}').format(local_rank, rank, world_size)
    print(message)

    torch.cuda.set_device(local_rank)
    dist.init_process_group("nccl")
    return world_size, local_rank, rank
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def get_args():
    """Parse the command-line options of the speech-code extraction tool.

    Required options: ``--model``, ``--wav_scp``, ``--device``,
    ``--output_dir`` and ``--batch_size``. Optional: ``--num_workers``
    (default 4) and ``--prefetch`` (default 5).

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    model_choices = [
        "speech_tokenizer_v1", "speech_tokenizer_v1_25hz",
        "speech_tokenizer_v2_25hz"
    ]
    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = (
        ('--model', dict(required=True, type=str, choices=model_choices,
                         help='model version')),
        ('--wav_scp', dict(required=True, type=str,
                           help='each line contains `wav_name wav_path`')),
        ('--device', dict(required=True, type=str, choices=["cuda", "cpu"],
                          help='device for inference')),
        ('--output_dir', dict(required=True, type=str,
                              help='dir to save result')),
        ('--batch_size', dict(required=True, type=int,
                              help='batch size (per-device) for inference')),
        ('--num_workers', dict(type=int, default=4,
                               help='workers for dataloader')),
        ('--prefetch', dict(type=int, default=5,
                            help='prefetch for dataloader')),
    )

    parser = argparse.ArgumentParser(description='extract speech code')
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    return parser.parse_args()
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def main():
    """Command-line entry point: extract speech codes for every wav in the scp.

    Parses the arguments, loads the requested tokenizer model, runs it over
    the ``--wav_scp`` entries in batches and writes one JSON object per
    utterance (``{"key": ..., "code": [...]}``) to
    ``<output_dir>/part_<rank + 1>_of_<world_size>``.

    With ``--device cuda`` the process joins an NCCL process group, wraps the
    model in ``DistributedDataParallel`` and shards the dataset with a
    ``DistributedSampler``; with ``--device cpu`` it runs as a single process.

    Raises:
        RuntimeError: If ``--device cuda`` is requested but CUDA is not
            available.
    """
    args = get_args()
    os.makedirs(args.output_dir, exist_ok=True)

    use_cuda = args.device == "cuda"
    if use_cuda:
        # Explicit check rather than ``assert`` so it survives ``python -O``.
        if not torch.cuda.is_available():
            raise RuntimeError(
                "--device cuda was requested but CUDA is not available")
        world_size, local_rank, rank = init_distributed()
    else:
        world_size, local_rank, rank = 1, 0, 0

    device = torch.device(args.device)
    model = s3tokenizer.load_model(args.model).to(device)
    dataset = AudioDataset(args.wav_scp)

    if use_cuda:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank])
        sampler = DistributedSampler(dataset,
                                     num_replicas=world_size,
                                     rank=rank)
    else:
        sampler = None

    loader_kwargs = {}
    if args.num_workers > 0:
        # DataLoader rejects an explicit prefetch_factor when data is loaded
        # in the main process (num_workers == 0), so only pass it with workers.
        loader_kwargs["prefetch_factor"] = args.prefetch
    dataloader = DataLoader(dataset,
                            batch_size=args.batch_size,
                            sampler=sampler,
                            shuffle=False,
                            num_workers=args.num_workers,
                            collate_fn=collate_fn,
                            **loader_kwargs)

    # Only rank 0 shows progress; it extrapolates from its own shard by
    # advancing world_size * batch_len per step.
    progress_bar = None
    if rank == 0:
        progress_bar = tqdm(total=len(dataset), desc="Processing",
                            unit="wavs")

    output_path = os.path.join(args.output_dir,
                               f"part_{rank + 1}_of_{world_size}")
    try:
        # UTF-8 is required because keys are dumped with ensure_ascii=False;
        # the context manager closes the file even if inference fails.
        with open(output_path, "w", encoding="utf-8") as writer:
            for keys, mels, mels_lens in dataloader:
                codes, codes_lens = model(mels.to(device),
                                          mels_lens.to(device))
                for i, k in enumerate(keys):
                    # Drop the padded tail of each code sequence.
                    code = codes[i, :codes_lens[i].item()].tolist()
                    writer.write(
                        json.dumps({
                            "key": k,
                            "code": code
                        }, ensure_ascii=False) + "\n")
                if progress_bar is not None:
                    progress_bar.update(world_size * len(keys))
    finally:
        if progress_bar is not None:
            progress_bar.close()

    if use_cuda:
        # Wait for all ranks to finish writing before tearing down the group.
        dist.barrier()
        dist.destroy_process_group()
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# Run the command-line tool only when this file is executed as a script.
if __name__ == "__main__":
    main()
|