minicpmo_utils-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. cosyvoice/__init__.py +17 -0
  2. cosyvoice/bin/average_model.py +93 -0
  3. cosyvoice/bin/export_jit.py +103 -0
  4. cosyvoice/bin/export_onnx.py +120 -0
  5. cosyvoice/bin/inference_deprecated.py +126 -0
  6. cosyvoice/bin/train.py +195 -0
  7. cosyvoice/cli/__init__.py +0 -0
  8. cosyvoice/cli/cosyvoice.py +209 -0
  9. cosyvoice/cli/frontend.py +238 -0
  10. cosyvoice/cli/model.py +386 -0
  11. cosyvoice/dataset/__init__.py +0 -0
  12. cosyvoice/dataset/dataset.py +151 -0
  13. cosyvoice/dataset/processor.py +434 -0
  14. cosyvoice/flow/decoder.py +494 -0
  15. cosyvoice/flow/flow.py +281 -0
  16. cosyvoice/flow/flow_matching.py +227 -0
  17. cosyvoice/flow/length_regulator.py +70 -0
  18. cosyvoice/hifigan/discriminator.py +230 -0
  19. cosyvoice/hifigan/f0_predictor.py +58 -0
  20. cosyvoice/hifigan/generator.py +582 -0
  21. cosyvoice/hifigan/hifigan.py +67 -0
  22. cosyvoice/llm/llm.py +610 -0
  23. cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  24. cosyvoice/tokenizer/tokenizer.py +279 -0
  25. cosyvoice/transformer/__init__.py +0 -0
  26. cosyvoice/transformer/activation.py +84 -0
  27. cosyvoice/transformer/attention.py +330 -0
  28. cosyvoice/transformer/convolution.py +145 -0
  29. cosyvoice/transformer/decoder.py +396 -0
  30. cosyvoice/transformer/decoder_layer.py +132 -0
  31. cosyvoice/transformer/embedding.py +302 -0
  32. cosyvoice/transformer/encoder.py +474 -0
  33. cosyvoice/transformer/encoder_layer.py +236 -0
  34. cosyvoice/transformer/label_smoothing_loss.py +96 -0
  35. cosyvoice/transformer/positionwise_feed_forward.py +115 -0
  36. cosyvoice/transformer/subsampling.py +383 -0
  37. cosyvoice/transformer/upsample_encoder.py +320 -0
  38. cosyvoice/utils/__init__.py +0 -0
  39. cosyvoice/utils/class_utils.py +83 -0
  40. cosyvoice/utils/common.py +186 -0
  41. cosyvoice/utils/executor.py +176 -0
  42. cosyvoice/utils/file_utils.py +129 -0
  43. cosyvoice/utils/frontend_utils.py +136 -0
  44. cosyvoice/utils/losses.py +57 -0
  45. cosyvoice/utils/mask.py +265 -0
  46. cosyvoice/utils/scheduler.py +738 -0
  47. cosyvoice/utils/train_utils.py +367 -0
  48. cosyvoice/vllm/cosyvoice2.py +103 -0
  49. matcha/__init__.py +0 -0
  50. matcha/app.py +357 -0
  51. matcha/cli.py +418 -0
  52. matcha/hifigan/__init__.py +0 -0
  53. matcha/hifigan/config.py +28 -0
  54. matcha/hifigan/denoiser.py +64 -0
  55. matcha/hifigan/env.py +17 -0
  56. matcha/hifigan/meldataset.py +217 -0
  57. matcha/hifigan/models.py +368 -0
  58. matcha/hifigan/xutils.py +60 -0
  59. matcha/models/__init__.py +0 -0
  60. matcha/models/baselightningmodule.py +209 -0
  61. matcha/models/components/__init__.py +0 -0
  62. matcha/models/components/decoder.py +443 -0
  63. matcha/models/components/flow_matching.py +132 -0
  64. matcha/models/components/text_encoder.py +410 -0
  65. matcha/models/components/transformer.py +316 -0
  66. matcha/models/matcha_tts.py +239 -0
  67. matcha/onnx/__init__.py +0 -0
  68. matcha/onnx/export.py +181 -0
  69. matcha/onnx/infer.py +168 -0
  70. matcha/text/__init__.py +53 -0
  71. matcha/text/cleaners.py +116 -0
  72. matcha/text/numbers.py +71 -0
  73. matcha/text/symbols.py +17 -0
  74. matcha/train.py +122 -0
  75. matcha/utils/__init__.py +5 -0
  76. matcha/utils/audio.py +82 -0
  77. matcha/utils/generate_data_statistics.py +111 -0
  78. matcha/utils/instantiators.py +56 -0
  79. matcha/utils/logging_utils.py +53 -0
  80. matcha/utils/model.py +90 -0
  81. matcha/utils/monotonic_align/__init__.py +22 -0
  82. matcha/utils/monotonic_align/setup.py +7 -0
  83. matcha/utils/pylogger.py +21 -0
  84. matcha/utils/rich_utils.py +101 -0
  85. matcha/utils/utils.py +219 -0
  86. minicpmo/__init__.py +24 -0
  87. minicpmo/utils.py +636 -0
  88. minicpmo/version.py +2 -0
  89. minicpmo_utils-0.1.0.dist-info/METADATA +72 -0
  90. minicpmo_utils-0.1.0.dist-info/RECORD +148 -0
  91. minicpmo_utils-0.1.0.dist-info/WHEEL +5 -0
  92. minicpmo_utils-0.1.0.dist-info/top_level.txt +5 -0
  93. s3tokenizer/__init__.py +153 -0
  94. s3tokenizer/assets/BAC009S0764W0121.wav +0 -0
  95. s3tokenizer/assets/BAC009S0764W0122.wav +0 -0
  96. s3tokenizer/assets/mel_filters.npz +0 -0
  97. s3tokenizer/cli.py +183 -0
  98. s3tokenizer/model.py +546 -0
  99. s3tokenizer/model_v2.py +605 -0
  100. s3tokenizer/utils.py +390 -0
  101. stepaudio2/__init__.py +40 -0
  102. stepaudio2/cosyvoice2/__init__.py +1 -0
  103. stepaudio2/cosyvoice2/flow/__init__.py +0 -0
  104. stepaudio2/cosyvoice2/flow/decoder_dit.py +585 -0
  105. stepaudio2/cosyvoice2/flow/flow.py +230 -0
  106. stepaudio2/cosyvoice2/flow/flow_matching.py +205 -0
  107. stepaudio2/cosyvoice2/transformer/__init__.py +0 -0
  108. stepaudio2/cosyvoice2/transformer/attention.py +328 -0
  109. stepaudio2/cosyvoice2/transformer/embedding.py +119 -0
  110. stepaudio2/cosyvoice2/transformer/encoder_layer.py +163 -0
  111. stepaudio2/cosyvoice2/transformer/positionwise_feed_forward.py +56 -0
  112. stepaudio2/cosyvoice2/transformer/subsampling.py +79 -0
  113. stepaudio2/cosyvoice2/transformer/upsample_encoder_v2.py +483 -0
  114. stepaudio2/cosyvoice2/utils/__init__.py +1 -0
  115. stepaudio2/cosyvoice2/utils/class_utils.py +41 -0
  116. stepaudio2/cosyvoice2/utils/common.py +101 -0
  117. stepaudio2/cosyvoice2/utils/mask.py +49 -0
  118. stepaudio2/flashcosyvoice/__init__.py +0 -0
  119. stepaudio2/flashcosyvoice/cli.py +424 -0
  120. stepaudio2/flashcosyvoice/config.py +80 -0
  121. stepaudio2/flashcosyvoice/cosyvoice2.py +160 -0
  122. stepaudio2/flashcosyvoice/cosyvoice3.py +1 -0
  123. stepaudio2/flashcosyvoice/engine/__init__.py +0 -0
  124. stepaudio2/flashcosyvoice/engine/block_manager.py +114 -0
  125. stepaudio2/flashcosyvoice/engine/llm_engine.py +125 -0
  126. stepaudio2/flashcosyvoice/engine/model_runner.py +310 -0
  127. stepaudio2/flashcosyvoice/engine/scheduler.py +77 -0
  128. stepaudio2/flashcosyvoice/engine/sequence.py +90 -0
  129. stepaudio2/flashcosyvoice/modules/__init__.py +0 -0
  130. stepaudio2/flashcosyvoice/modules/flow.py +198 -0
  131. stepaudio2/flashcosyvoice/modules/flow_components/__init__.py +0 -0
  132. stepaudio2/flashcosyvoice/modules/flow_components/estimator.py +974 -0
  133. stepaudio2/flashcosyvoice/modules/flow_components/upsample_encoder.py +998 -0
  134. stepaudio2/flashcosyvoice/modules/hifigan.py +249 -0
  135. stepaudio2/flashcosyvoice/modules/hifigan_components/__init__.py +0 -0
  136. stepaudio2/flashcosyvoice/modules/hifigan_components/layers.py +433 -0
  137. stepaudio2/flashcosyvoice/modules/qwen2.py +92 -0
  138. stepaudio2/flashcosyvoice/modules/qwen2_components/__init__.py +0 -0
  139. stepaudio2/flashcosyvoice/modules/qwen2_components/layers.py +616 -0
  140. stepaudio2/flashcosyvoice/modules/sampler.py +231 -0
  141. stepaudio2/flashcosyvoice/utils/__init__.py +0 -0
  142. stepaudio2/flashcosyvoice/utils/audio.py +77 -0
  143. stepaudio2/flashcosyvoice/utils/context.py +28 -0
  144. stepaudio2/flashcosyvoice/utils/loader.py +116 -0
  145. stepaudio2/flashcosyvoice/utils/memory.py +19 -0
  146. stepaudio2/stepaudio2.py +204 -0
  147. stepaudio2/token2wav.py +248 -0
  148. stepaudio2/utils.py +91 -0
minicpmo_utils-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,72 @@
+ Metadata-Version: 2.4
+ Name: minicpmo-utils
+ Version: 0.1.0
+ Summary: Unified utilities package for MiniCPM-o: includes cosyvoice + stepaudio2 and extensible utils.
+ Author: MiniCPM-o Utils Maintainers
+ License: Apache-2.0
+ Keywords: minicpmo,audio,tts,utils,cosyvoice,stepaudio2
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ Requires-Dist: torch>=2.3.0
+ Requires-Dist: torchaudio>=2.3.0
+ Requires-Dist: transformers>=4.49.0
+ Requires-Dist: numpy
+ Requires-Dist: hyperpyyaml
+ Requires-Dist: modelscope
+ Requires-Dist: openai-whisper
+ Requires-Dist: tqdm
+ Requires-Dist: tiktoken
+ Requires-Dist: inflect
+ Requires-Dist: omegaconf
+ Requires-Dist: einops
+ Requires-Dist: librosa
+ Requires-Dist: onnxruntime>=1.18.0
+ Requires-Dist: diffusers
+ Provides-Extra: gpu
+ Requires-Dist: onnxruntime-gpu>=1.18.0; sys_platform == "linux" and extra == "gpu"
+
+ ## minicpmo-utils
+
+ A single-install utilities package (one PyPI distribution) that bundles the repository's `cosyvoice` and `stepaudio2` into the same wheel, with `minicpmo` reserved as the unified entry point for future utility extensions.
+
+ ### Installation
+
+ - Install locally from source (editable, for development):
+ ```bash
+ cd minicpmo-utils
+ pip install -e .
+ ```
+
+ - Build and install the wheel (recommended for distribution):
+ ```bash
+ cd minicpmo-utils
+ python -m build  # produces dist/*.whl
+ pip install dist/minicpmo_utils-0.1.0-py3-none-any.whl
+ ```
+
+ ### Imports
+
+ After installation the package exposes the following top-level modules, which can be imported directly:
+ - `import cosyvoice`
+ - `import stepaudio2`
+ - `import matcha`
+ - `import minicpmo`
+
+ The subpackages can also be imported through the unified entry point:
+ ```python
+ from minicpmo import cosyvoice, stepaudio2, matcha
+ ```
+
+ Common helper functions are available through the unified utils entry point, for example:
+
+ ```python
+ from minicpmo.utils import get_video_frame_audio_segments
+ ```
+
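The imports documented in the README above can be exercised as a quick post-install smoke test. The sketch below only touches names the README itself lists; the contents of `minicpmo/utils.py` are not shown in this diff, so the advertised helper is checked for presence rather than called.

```python
# Post-install smoke test: a minimal sketch using only imports documented in
# the README above. minicpmo.utils internals are not shown in this diff, so
# the helper is only checked for presence, not called.
import minicpmo.utils
from minicpmo import cosyvoice, matcha, stepaudio2

for mod in (cosyvoice, matcha, stepaudio2):
    print("imported:", mod.__name__)

print("has get_video_frame_audio_segments:",
      hasattr(minicpmo.utils, "get_video_frame_audio_segments"))
```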
minicpmo_utils-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,148 @@
+ cosyvoice/__init__.py,sha256=yXNTKqiEBedjE7e_Qav__IUpv9XfG9-em4yJxmoH5Zs,530
+ cosyvoice/bin/average_model.py,sha256=82LeGlvQh8xFHA_T9fJJDBTtDhJ_UzukJJcFRjyjc9Y,3202
+ cosyvoice/bin/export_jit.py,sha256=36EvvRkOE621pMqDL3Il69hXdLXsJtSGtDO3_r13nmA,3906
+ cosyvoice/bin/export_onnx.py,sha256=nUHWmNTkGMowvHuXI92OR7R6v1doVsDxvxEzP5GWiQk,4661
+ cosyvoice/bin/inference_deprecated.py,sha256=d4x-8eER00VwuOEHnObd9kd7cOriv_xbXZOAEXG60fk,6126
+ cosyvoice/bin/train.py,sha256=1bpuU0d8fwc5ygrY65FbT9kfWpdZkfsXX83NBkNAMLc,8071
+ cosyvoice/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ cosyvoice/cli/cosyvoice.py,sha256=WCNOWg-_CJ-y9HDxL1tFrCirnB2a030jyN5EaELke6s,12808
+ cosyvoice/cli/frontend.py,sha256=8z3yYsmjTpi8olTD2DNooPuW0nFnwDKSJIzPETvqv4c,13827
+ cosyvoice/cli/model.py,sha256=X3Za8Ak5LEeRUldLalztoSnu6UsCP9WcYQFvu9MDacQ,24008
+ cosyvoice/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ cosyvoice/dataset/dataset.py,sha256=QlINnr5R2kyDQQudr9jqdK_z63VQME2WJo8UBX8sVTo,4804
+ cosyvoice/dataset/processor.py,sha256=RBpJ8QlP00_e8qkjEo86h2nBvYcuttNVtp_HIixx-wY,15876
+ cosyvoice/flow/decoder.py,sha256=717Oudt_Y93aHVvKa_prKLjqEWVsSx-cEJ0o9lbLvyk,19866
+ cosyvoice/flow/flow.py,sha256=qEl_61gzbnVmsfCF0RrP-ctPGiSUmr0sJE-_l8dvm20,12156
+ cosyvoice/flow/flow_matching.py,sha256=sa1nH-N_hywDS96PdcwZwbiHWNVON1-itU4UoIit3-Y,10499
+ cosyvoice/flow/length_regulator.py,sha256=srvavaBIUN8Mk0Vi35WyN8og-n6P6J0E2bgnqZ1nQRs,3137
+ cosyvoice/hifigan/discriminator.py,sha256=PbWxtVhMYAyZbujFPqAhNwqK2cYuP1oo1c8l8Dq5-c8,8617
+ cosyvoice/hifigan/f0_predictor.py,sha256=uoymCIodAtYIb-oh0E1p7pgXvdjqICiZgJnLCXGhhmk,2065
+ cosyvoice/hifigan/generator.py,sha256=90YB5v6-tBCpYejtiTG0QHTThd7X9vd-6RigKbPUJiY,22754
+ cosyvoice/hifigan/hifigan.py,sha256=6QDaL15-wwLbmm6m0rcZkOq9iCPgfllBm32MUt99wNY,3240
+ cosyvoice/llm/llm.py,sha256=aYR8ZUgeYrXba-zoS7hONqbCIMsWVlHqt65Ug1uw6uQ,30566
+ cosyvoice/tokenizer/tokenizer.py,sha256=lDQPx83ycMaaOutjKQxSQQROIHFOAf6nNvNh-eWlbfI,7456
+ cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken,sha256=dHl5Yx6BMZNDaqvP98HCNdN96Al7ccVj7Itjt6UVxxg,907395
+ cosyvoice/transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ cosyvoice/transformer/activation.py,sha256=pKlsrqn3sFERKG3l6nYL39-cTlNEj1NCCFfcBKUEQMI,3089
+ cosyvoice/transformer/attention.py,sha256=QdJpstXjo5UsClOPRkgK_4Vwdn64joBFLzZ0Ns72KLE,14389
+ cosyvoice/transformer/convolution.py,sha256=619B8ySpciXHO5xDCvi7IxvXc4bvGEULsP0yn0aatOE,5230
+ cosyvoice/transformer/decoder.py,sha256=2wQscn4OZTrJJHM7H7FeaXkv_YDJ089iJIN0VV1Yocw,16580
+ cosyvoice/transformer/decoder_layer.py,sha256=uVZiq3LsawsPUMOhX77PFvrLeG0yO0rKHQY7nCHA1k4,4807
+ cosyvoice/transformer/embedding.py,sha256=tQwwvlxmw5yV4NsQFYFMVF_NBiqTbA6nrUwBUIslldw,11777
+ cosyvoice/transformer/encoder.py,sha256=J_nXSZcgNy--Z3TQkLif8GPH7PiPk6TXWye7GtspGKU,21434
+ cosyvoice/transformer/encoder_layer.py,sha256=GSBYK-LJt894Nee1ORGOweudqPLHEcYlf4WYs3kpUbk,9602
+ cosyvoice/transformer/label_smoothing_loss.py,sha256=24gEzxwg4a-_bDPeSDZYmxlH2IF5fQLVB8KoqNT0D90,3459
+ cosyvoice/transformer/positionwise_feed_forward.py,sha256=boA447zIyght3KUI-5udQL86uYvrq89clJNdAyMp0Pg,4219
+ cosyvoice/transformer/subsampling.py,sha256=MfwDR6hRq8EgXf1M9oCZwMQWWJw-maB7JQ6GMM7OGdA,12666
+ cosyvoice/transformer/upsample_encoder.py,sha256=qAA8ISzmRpfOQwAfd2kC7mBpajt-Nzk1R5Ap3Mr31Wk,14149
+ cosyvoice/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ cosyvoice/utils/class_utils.py,sha256=ddaXf3V0ME-EZM21DIiVZMpVepmhEkGZGLKYHw6Nz8I,3321
+ cosyvoice/utils/common.py,sha256=YWGo2Q176wdm9tLeZ8zONadUr0OTDCivN8TRm2Xu-Jc,6792
+ cosyvoice/utils/executor.py,sha256=LqeifOT3xpwno9oy97jBBi1nuyCMZVsJsVeJh6n5vp4,8835
+ cosyvoice/utils/file_utils.py,sha256=mV_0mRdhZSTODqVBgopxrjeLGbCWf-VLGVtHfgNcN_8,5461
+ cosyvoice/utils/frontend_utils.py,sha256=DQypTgz1GeLRf3LNHcq2yweuoN5I5-eSgmNiKE6hjTA,4273
+ cosyvoice/utils/losses.py,sha256=XND3_XjOViLBm7kRZRa3BWmeSMCPIXubiqESfyv5XBA,2121
+ cosyvoice/utils/mask.py,sha256=hSxuSxQgGiOKsHY5bbWZwVey7lpaKqzw8nfdzoxkiXY,9728
+ cosyvoice/utils/scheduler.py,sha256=lEfquE_Lcer2VG2zUVa0n-UxgvJEdEodyT66so-h6jQ,24920
+ cosyvoice/utils/train_utils.py,sha256=yUFlYHUZRtrqQBx7kBJiYPbx2ArtAPKWzAxHQk4vsoo,16581
+ cosyvoice/vllm/cosyvoice2.py,sha256=csgta5DvFgOjo_D6lrBPBdc66b5pBAwAfk_6SPTyb4k,4056
+ matcha/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ matcha/app.py,sha256=bBOG1VM5L8X64aF8rH4iddu45dje9lmkyr5ubAq_4-4,13992
+ matcha/cli.py,sha256=Iw_SITi4QrYGb_HZnxBNZ_ivnZ_zvXSJJEbyQ4clad8,15519
+ matcha/train.py,sha256=eKFZFkaSvmdfMNhOAcvp63kp-Wvj_9IE47mAp4sHGxg,4613
+ matcha/hifigan/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ matcha/hifigan/config.py,sha256=cxS_YTK_UoO2uma_Ue8QRFtNA6ldSEt1EL36u2-NgCc,779
+ matcha/hifigan/denoiser.py,sha256=Q2juI2a3QTqce6fuvqWQnShoV671kVW2JsQyoqwwvLI,2644
+ matcha/hifigan/env.py,sha256=QthDmqTWWAIo2tSI-arVteRVxnKUnm9mkjlQVnXbmDc,429
+ matcha/hifigan/meldataset.py,sha256=4Ps0NXA3Yv0oV2PyCdHcptc4lZ43tp978m3BM2NYWxc,6786
+ matcha/hifigan/models.py,sha256=Ln7J7YzeN4h33tBKQHhlPcoNfmX-_-kwBkTZIYJY5q8,11668
+ matcha/hifigan/xutils.py,sha256=aNtYu1SyQaGtbN_NnSfLGhTi-r_y3vYKMIf7EYGaDOA,1396
+ matcha/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ matcha/models/baselightningmodule.py,sha256=45aD1xV6AunZ7_O6RU6-B0OaUMMNO_gFKUWprs3EfCc,7003
+ matcha/models/matcha_tts.py,sha256=zbpw9noGwR_fG9yBKxxtMhmHgnxS7PPE1_ft9KK-jPo,10059
+ matcha/models/components/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ matcha/models/components/decoder.py,sha256=fqlKYc08lnA6oQf9zRuC3i8vat4go8X0NLYAUxgXams,14459
+ matcha/models/components/flow_matching.py,sha256=pvKy7somAHSdV13ahiTYZNkEIn4qn3ZKbLK4kpBEXmw,4657
+ matcha/models/components/text_encoder.py,sha256=QEQynUB7PaBsdlJ-e1TUBAe9rL2LBnmk5bAkr_vCTek,14845
+ matcha/models/components/transformer.py,sha256=Zv8gktl0qZslG52OXAi7zszq7fr0HivYrytLx6Vty-o,13237
+ matcha/onnx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ matcha/onnx/export.py,sha256=VUAviRbyh9WhXaxLPQjdQLwaVK_ZZNgaWPMFA2UKh4c,5386
+ matcha/onnx/infer.py,sha256=ACgjtfMdvzuU4XmalNLrebmkBD2W8VLcLi8blXnjUmU,6293
+ matcha/text/__init__.py,sha256=6dxZsGZ25Fvk4_b72yfA7NVcNInCJLSjpvN9Mc66uBI,1696
+ matcha/text/cleaners.py,sha256=t4-wsMBQdJT6J12mHiXME23tmpOi8pmV-GkejDVcNf8,3560
+ matcha/text/numbers.py,sha256=Nwp-HmqaqstAELVqqlMu_7bj4qiWQzMXFmw56nBq1h0,2249
+ matcha/text/symbols.py,sha256=hw78I8JtBmkri5HiDUQC05i2N9MFkxWJoHujIxJFP1c,635
+ matcha/utils/__init__.py,sha256=YNs91rjzeUcVSs3aZjv9thma42I81Jksy9RF3nquvOo,326
+ matcha/utils/audio.py,sha256=L3QQZL_MlIXRmkI1ZJbYriTeyAw9bvXKf06Kv7-cYco,2282
+ matcha/utils/generate_data_statistics.py,sha256=E5Z9Tpa6ZFXfSLLQcAwhg7YYqBgLZDkIqBoGoOZhEjA,3269
+ matcha/utils/instantiators.py,sha256=QxqOnRVabCSeurpPODXqh3JSZ-E7m8jKSsxy1WvTX-I,1828
+ matcha/utils/logging_utils.py,sha256=glOI_JG8_YBKHWwD5RRKKkCez3N7DJlH27Vz48yAEtU,1711
+ matcha/utils/model.py,sha256=UViKHaV89_IeaKJFww1xHV_RTXqv0YvfQWqwOtnzQ-I,2935
+ matcha/utils/pylogger.py,sha256=YbC8Ym5HZrJcDBIsQO6jSnuyY5CLZQR13E_oAS9SYZQ,720
+ matcha/utils/rich_utils.py,sha256=Oj5jrkz5s1b3RJL6m_8EXj85LY079FWClMIzf_Gwvcc,3279
+ matcha/utils/utils.py,sha256=-XaKibvqf6mXCoGPkNcFKB8Ynq_bK6_S8-ntUhSR1WE,7159
+ matcha/utils/monotonic_align/__init__.py,sha256=_s_INV7vL_N9mhYtZADAk11CsSGP8kykn0fEyyM73ts,646
+ matcha/utils/monotonic_align/setup.py,sha256=bf0d0cvGRaACC22qq0sqgZEBBhz4qzDlMqBxOyTKC2g,207
+ minicpmo/__init__.py,sha256=wyo8jLG2ATqFcTUrcTcvIzzaf37AYcbHdOAK15PLTjE,556
+ minicpmo/utils.py,sha256=XJ717gRjAto7KytP8LFiwBfyFaom6fYtVG9oOMOKkiw,22445
+ minicpmo/version.py,sha256=p4MxVO8ih2-hnh7INVBkrqMKBDGTkMso1LEbM6mx3jg,23
+ s3tokenizer/__init__.py,sha256=8fg7P72mJjjvkyieA6Y2thBj2eNV_9hBxJp8aW3gcVg,5048
+ s3tokenizer/cli.py,sha256=qUUA-5Ro-ILeQd-6YXVuItpYikEdaphIUfabpUTlIh4,5942
+ s3tokenizer/model.py,sha256=v-DfJg11h9ztX1gHlUOLcpx9xq4ZZT5aqHOKNtG7Hzw,19542
+ s3tokenizer/model_v2.py,sha256=x3hNCIT4UaHb715HojZ3AbEBy-wcPDWR-EohJiY4Aw0,21806
+ s3tokenizer/utils.py,sha256=baAg5C7yi0AZx7caF0hJBU-pkj3coMPKM8QrLqgk-9k,14140
+ s3tokenizer/assets/BAC009S0764W0121.wav,sha256=RtvJmMnR1IERJnxAdB3TIA8uW89AdfjEyX9EURYNzlA,134570
+ s3tokenizer/assets/BAC009S0764W0122.wav,sha256=V2DHrOCSOkmfYF03MBHHqvZY4wSgmZpJPGNmnHVjhOE,131724
+ s3tokenizer/assets/mel_filters.npz,sha256=dFCucHI6XvnTQePO5ijHywF382zkLES37SvzMl8PbUw,4271
+ stepaudio2/__init__.py,sha256=ahfLETm7fp_A320ERK6j2xT4nTyecy8OA8Z-syVLuQQ,1547
+ stepaudio2/stepaudio2.py,sha256=hf1EZl5odF8wFqs0VqC-B_DBioHrfpzWgZN501o1-YA,9265
+ stepaudio2/token2wav.py,sha256=eAhDCCXjf9R7F5TsNfbNP5Se256Kqq9gKIrKFh0I5qE,11642
+ stepaudio2/utils.py,sha256=p83TqxnLAu2OyAq3eCnESlkbU9kkNdH5Tnh1Da-J7Fg,3384
+ stepaudio2/cosyvoice2/__init__.py,sha256=VFv7gYSAXPuP6d6arx-EqTy8W1WbPM4lGmdmPENMWT8,44
+ stepaudio2/cosyvoice2/flow/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ stepaudio2/cosyvoice2/flow/decoder_dit.py,sha256=eQxV2JjZLI2HSWHltHtfnU3VwGfsIMpepmgzsyzisV0,22418
+ stepaudio2/cosyvoice2/flow/flow.py,sha256=-zF1KSmD8PouUvJ6Bg_V_2RwwotBXbqFjE_oALoXJ4Q,8284
+ stepaudio2/cosyvoice2/flow/flow_matching.py,sha256=CdXiwC78e2ViEre35wP2pVxlz6Xvtmv8bEcHrfuRYOM,8385
+ stepaudio2/cosyvoice2/transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ stepaudio2/cosyvoice2/transformer/attention.py,sha256=Yy3xmqfnEjE9DxsGeJ7NKKh5YLu7nsLSLcCgmieUo90,14314
+ stepaudio2/cosyvoice2/transformer/embedding.py,sha256=2xDZKyM0P26YG2ZBevr-G3ZkyVukZEKjetho0yobSo4,4766
+ stepaudio2/cosyvoice2/transformer/encoder_layer.py,sha256=3lRTqu3m7KhJJhqc6B4M5Oqgvakz3junmX2TTYtVXEE,6838
+ stepaudio2/cosyvoice2/transformer/positionwise_feed_forward.py,sha256=r49V1dFDxbnGocDqCO-B8OXhb3Z4oykhRD5DmBMghAo,1887
+ stepaudio2/cosyvoice2/transformer/subsampling.py,sha256=uFjanKdf855iAlq3sv4YhpbOQoInRI2Q7xADLIfIFqc,2529
+ stepaudio2/cosyvoice2/transformer/upsample_encoder_v2.py,sha256=YVqzOMmNj1-GqJCbDXzzX6XS_WgZD2laYF8LIuNgbAQ,18854
+ stepaudio2/cosyvoice2/utils/__init__.py,sha256=J3q6oPwKcQzvgZAdW8pZ58wpCRgx73ZDa12YdsggeTw,35
+ stepaudio2/cosyvoice2/utils/class_utils.py,sha256=dGLa5anz6qeHKMAcPMh_z2gNGp2SuFLbjT30YN5gGp0,1377
+ stepaudio2/cosyvoice2/utils/common.py,sha256=WIuAd4n0cGwwk1NyD0oeU1oK-5csW76kXL-xz1graII,3334
+ stepaudio2/cosyvoice2/utils/mask.py,sha256=CkCYG5pGBHLH-Fdt5fEeQqnc3ytXFYA5g3dhsD5RtpY,1703
+ stepaudio2/flashcosyvoice/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ stepaudio2/flashcosyvoice/cli.py,sha256=eSDEAFtGbgUv91vTzx4KXFRfAtx6eIgXg_5Ut3EtxUs,19863
+ stepaudio2/flashcosyvoice/config.py,sha256=4VoEScKra4qgHxbcFuTycYAPMYYijt-Jjf6GEeKzCdU,2619
+ stepaudio2/flashcosyvoice/cosyvoice2.py,sha256=7t2pRvJRxxVHrLRAOMMXwQjsOTHnxnHGGG3ciBk6xmk,7729
+ stepaudio2/flashcosyvoice/cosyvoice3.py,sha256=UMxsD-RBudOyiUc4zLNwveluKfwwP4AcrrDOWRrSXLk,57
+ stepaudio2/flashcosyvoice/engine/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ stepaudio2/flashcosyvoice/engine/block_manager.py,sha256=Q6Fd3GbUwYlXUJlFdjVh_t682C0N0yCAjUVSrAeQb3w,4027
+ stepaudio2/flashcosyvoice/engine/llm_engine.py,sha256=j16UKgoPVxmO-f4oMQ4cR6CxmR4OqRQvlHjm2GFnhHM,5430
+ stepaudio2/flashcosyvoice/engine/model_runner.py,sha256=fzYoPoUkcr1epAtaDeKVTTypWTcpkDj87MD4-f88opU,14036
+ stepaudio2/flashcosyvoice/engine/scheduler.py,sha256=4o8M0XhFCrkaFtoGQHgz0BitYA_YpZP_mvTrfo0PwM4,3134
+ stepaudio2/flashcosyvoice/engine/sequence.py,sha256=pwYdtCST9ZHqxPK62JCPRxt-G885nAbYh26oOuLHuao,2835
+ stepaudio2/flashcosyvoice/modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ stepaudio2/flashcosyvoice/modules/flow.py,sha256=INAXZTIkGCxdosq7LBSNxb6BNQmWqDGyQV2Xshh228s,8292
+ stepaudio2/flashcosyvoice/modules/hifigan.py,sha256=5wrEedjKzE8D6m1JNcaKxyLkvH2cgI0tBbU_1OWFzoo,9991
+ stepaudio2/flashcosyvoice/modules/qwen2.py,sha256=L4Kx5s0SdqUmOYv6X6UCkcbRIctG3Dl8IPCNmiuB0Y0,3346
+ stepaudio2/flashcosyvoice/modules/sampler.py,sha256=xhRQW6yP9e5mHjY8npypwZY1oe9mZ5CLH_IG5Oz1EAw,10914
+ stepaudio2/flashcosyvoice/modules/flow_components/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ stepaudio2/flashcosyvoice/modules/flow_components/estimator.py,sha256=N7DmynOzkItXY2jTZl6eqaSH5IsXlkumpDxyOP-__5g,38148
+ stepaudio2/flashcosyvoice/modules/flow_components/upsample_encoder.py,sha256=B2G2NM9xPeIx6r3v8n6Y49qh07PGBo55YZajL_m1Xjw,41265
+ stepaudio2/flashcosyvoice/modules/hifigan_components/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ stepaudio2/flashcosyvoice/modules/hifigan_components/layers.py,sha256=jEb8ZuPZZ6Q2Nx5CKBrmCkAeTLR-Hlw7k1qxuUJ4kD0,16353
+ stepaudio2/flashcosyvoice/modules/qwen2_components/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ stepaudio2/flashcosyvoice/modules/qwen2_components/layers.py,sha256=34Z0U3Qus11b6iE8I-k42lGq0lHz0KYDnVjUt6f0f84,21366
+ stepaudio2/flashcosyvoice/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ stepaudio2/flashcosyvoice/utils/audio.py,sha256=afxVAIkV_QliIzAwyOW1QygyjZxBUXjLwz4rSCbBs-c,2162
+ stepaudio2/flashcosyvoice/utils/context.py,sha256=y_u92r100Wp7xjs4tQd70eB00iaqUOa92qkb5sw-8ho,821
+ stepaudio2/flashcosyvoice/utils/loader.py,sha256=ETQwg7E6HxndH-hWSwTcSg2Q2MWrPDTEZtb2ebZaxS0,6030
+ stepaudio2/flashcosyvoice/utils/memory.py,sha256=ciip3CEUfiSGe6qOFOfsYsGi6yYFcNcZVoAjnABVp28,597
+ minicpmo_utils-0.1.0.dist-info/METADATA,sha256=dVIxQpW56luyHDeDXMrGMnyKNgUo9VBEo61Zi1a5YCA,2120
+ minicpmo_utils-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ minicpmo_utils-0.1.0.dist-info/top_level.txt,sha256=yIeAnGCB_ihapVKakC0wqRtAwCjezcDPCC-z6Vjlt0s,49
+ minicpmo_utils-0.1.0.dist-info/RECORD,,
minicpmo_utils-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.10.2)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
minicpmo_utils-0.1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1,5 @@
+ cosyvoice
+ matcha
+ minicpmo
+ s3tokenizer
+ stepaudio2
s3tokenizer/__init__.py ADDED
@@ -0,0 +1,153 @@
+ # Copyright (c) 2023 OpenAI. (authors: Whisper Team)
+ #               2024 Tsinghua Univ. (authors: Xingchen Song)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Modified from
+ https://github.com/openai/whisper/blob/main/whisper/__init__.py
+ """
+
+ import hashlib
+ import os
+ import urllib.request
+ import warnings
+ from typing import List, Union
+
+ from tqdm import tqdm
+
+ from s3tokenizer.model_v2 import S3TokenizerV2
+
+ from .model import S3Tokenizer
+ from .utils import (load_audio, log_mel_spectrogram, make_non_pad_mask,
+                     mask_to_bias, onnx2torch, padding, merge_tokenized_segments)
+
+ __all__ = [
+     'load_audio', 'log_mel_spectrogram', 'make_non_pad_mask', 'mask_to_bias',
+     'onnx2torch', 'padding', 'merge_tokenized_segments'
+ ]
+ _MODELS = {
+     "speech_tokenizer_v1":
+     "https://www.modelscope.cn/models/iic/cosyvoice-300m/"
+     "resolve/master/speech_tokenizer_v1.onnx",
+     "speech_tokenizer_v1_25hz":
+     "https://www.modelscope.cn/models/iic/CosyVoice-300M-25Hz/"
+     "resolve/master/speech_tokenizer_v1.onnx",
+     "speech_tokenizer_v2_25hz":
+     "https://www.modelscope.cn/models/iic/CosyVoice2-0.5B/"
+     "resolve/master/speech_tokenizer_v2.onnx",
+ }
+
+ _SHA256S = {
+     "speech_tokenizer_v1":
+     "23b5a723ed9143aebfd9ffda14ac4c21231f31c35ef837b6a13bb9e5488abb1e",
+     "speech_tokenizer_v1_25hz":
+     "56285ddd4a83e883ee0cb9f8d69c1089b53a94b1f78ff7e4a0224a27eb4cb486",
+     "speech_tokenizer_v2_25hz":
+     "d43342aa12163a80bf07bffb94c9de2e120a8df2f9917cd2f642e7f4219c6f71",
+ }
+
+
+ def _download(name: str, root: str) -> Union[bytes, str]:
+     os.makedirs(root, exist_ok=True)
+
+     expected_sha256 = _SHA256S[name]
+     url = _MODELS[name]
+     download_target = os.path.join(root, f"{name}.onnx")
+
+     if os.path.exists(download_target) and not os.path.isfile(download_target):
+         raise RuntimeError(
+             f"{download_target} exists and is not a regular file")
+
+     if os.path.isfile(download_target):
+         with open(download_target, "rb") as f:
+             model_bytes = f.read()
+         if hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
+             return download_target
+         else:
+             warnings.warn(
+                 f"{download_target} exists, but the SHA256 checksum does not"
+                 " match; re-downloading the file")
+
+     with urllib.request.urlopen(url) as source, open(download_target,
+                                                      "wb") as output:
+         with tqdm(
+                 total=int(source.info().get("Content-Length")),
+                 ncols=80,
+                 unit="iB",
+                 unit_scale=True,
+                 unit_divisor=1024,
+                 desc="Downloading onnx checkpoint",
+         ) as loop:
+             while True:
+                 buffer = source.read(8192)
+                 if not buffer:
+                     break
+
+                 output.write(buffer)
+                 loop.update(len(buffer))
+
+     model_bytes = open(download_target, "rb").read()
+     if hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
+         raise RuntimeError(
+             "Model has been downloaded but the SHA256 checksum does not"
+             " match. Please retry loading the model.")
+
+     return download_target
+
+
+ def available_models() -> List[str]:
+     """Returns the names of available models"""
+     return list(_MODELS.keys())
+
+
+ def load_model(
+     name: str,
+     download_root: str = None,
+ ) -> S3Tokenizer:
+     """
+     Load an S3Tokenizer ASR model
+
+     Parameters
+     ----------
+     name : str
+         one of the official model names listed by
+         `s3tokenizer.available_models()`, or path to a model checkpoint
+         containing the model dimensions and the model state_dict.
+     download_root: str
+         path to download the model files; by default,
+         it uses "~/.cache/s3tokenizer"
+
+     Returns
+     -------
+     model : S3Tokenizer
+         The S3Tokenizer model instance
+     """
+
+     if download_root is None:
+         default = os.path.join(os.path.expanduser("~"), ".cache")
+         download_root = os.path.join(os.getenv("XDG_CACHE_HOME", default),
+                                      "s3tokenizer")
+
+     if name in _MODELS:
+         checkpoint_file = _download(name, download_root)
+     elif os.path.isfile(name):
+         checkpoint_file = name
+     else:
+         raise RuntimeError(
+             f"Model {name} not found; available models = {available_models()}")
+     if 'v2' in name:
+         model = S3TokenizerV2(name)
+     else:
+         model = S3Tokenizer(name)
+     model.init_from_onnx(checkpoint_file)
+
+     return model
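Outside of the CLI, the exports above are enough to tokenize a single utterance. The snippet below is a minimal sketch, assuming the ONNX checkpoint can be downloaded (or a local path is passed to `load_model`) and that `audio.wav` exists; the call pattern mirrors `s3tokenizer/cli.py` below.

```python
# Single-utterance sketch using only the names exported by this __init__.py.
# Assumes network access for the checkpoint download and a local "audio.wav".
import s3tokenizer

model = s3tokenizer.load_model("speech_tokenizer_v2_25hz")

audio = s3tokenizer.load_audio("audio.wav")
mels, mels_lens = s3tokenizer.padding([s3tokenizer.log_mel_spectrogram(audio)])
codes, codes_lens = model(mels, mels_lens)  # same call shape as in cli.py
print(codes[0, :codes_lens[0].item()].tolist())
```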
s3tokenizer/assets/BAC009S0764W0121.wav ADDED
Binary file
s3tokenizer/assets/BAC009S0764W0122.wav ADDED
Binary file
s3tokenizer/assets/mel_filters.npz ADDED
Binary file
s3tokenizer/cli.py ADDED
@@ -0,0 +1,183 @@
+ # Copyright (c) 2024 Tsinghua Univ. (authors: Xingchen Song)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ Example Usage
+ cpu:
+
+ s3tokenizer --model "speech_tokenizer_v1_25hz" --wav_scp xxx.scp \
+             --device "cpu" \
+             --output_dir "./" \
+             --batch_size 32
+
+ gpu:
+
+ torchrun --nproc_per_node=8 --nnodes=1 \
+     --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
+     `which s3tokenizer` --model "speech_tokenizer_v1_25hz" --wav_scp xxx.scp \
+                         --device "cuda" \
+                         --output_dir "./" \
+                         --batch_size 32
+
+ """
+
+ import argparse
+ import json
+ import os
+
+ import torch
+ import torch.distributed as dist
+ from torch.utils.data import DataLoader, Dataset, DistributedSampler
+ from tqdm import tqdm
+
+ import s3tokenizer
+
+
+ class AudioDataset(Dataset):
+
+     def __init__(self, wav_scp):
+         self.data = []
+         self.keys = []
+
+         with open(wav_scp, 'r', encoding='utf-8') as f:
+             for line in f:
+                 key, file_path = line.strip().split()
+                 self.data.append(file_path)
+                 self.keys.append(key)
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         file_path = self.data[idx]
+         key = self.keys[idx]
+         audio = s3tokenizer.load_audio(file_path)
+         mel = s3tokenizer.log_mel_spectrogram(audio)
+         return key, mel
+
+
+ def collate_fn(batch):
+     keys = [item[0] for item in batch]
+     mels = [item[1] for item in batch]
+     mels, mels_lens = s3tokenizer.padding(mels)
+     return keys, mels, mels_lens
+
+
+ def init_distributed():
+     world_size = int(os.environ.get('WORLD_SIZE', 1))
+     local_rank = int(os.environ.get('LOCAL_RANK', 0))
+     rank = int(os.environ.get('RANK', 0))
+     print('Inference on multiple gpus, this gpu {}'.format(local_rank) +
+           ', rank {}, world_size {}'.format(rank, world_size))
+     torch.cuda.set_device(local_rank)
+     dist.init_process_group("nccl")
+     return world_size, local_rank, rank
+
+
+ def get_args():
+     parser = argparse.ArgumentParser(description='extract speech code')
+     parser.add_argument('--model',
+                         required=True,
+                         type=str,
+                         choices=[
+                             "speech_tokenizer_v1", "speech_tokenizer_v1_25hz",
+                             "speech_tokenizer_v2_25hz"
+                         ],
+                         help='model version')
+     parser.add_argument('--wav_scp',
+                         required=True,
+                         type=str,
+                         help='each line contains `wav_name wav_path`')
+     parser.add_argument('--device',
+                         required=True,
+                         type=str,
+                         choices=["cuda", "cpu"],
+                         help='device for inference')
+     parser.add_argument('--output_dir',
+                         required=True,
+                         type=str,
+                         help='dir to save result')
+     parser.add_argument('--batch_size',
+                         required=True,
+                         type=int,
+                         help='batch size (per-device) for inference')
+     parser.add_argument('--num_workers',
+                         type=int,
+                         default=4,
+                         help='workers for dataloader')
+     parser.add_argument('--prefetch',
+                         type=int,
+                         default=5,
+                         help='prefetch for dataloader')
+     args = parser.parse_args()
+     return args
+
+
+ def main():
+     args = get_args()
+     os.makedirs(args.output_dir, exist_ok=True)
+
+     if args.device == "cuda":
+         assert (torch.cuda.is_available())
+         world_size, local_rank, rank = init_distributed()
+     else:
+         world_size, local_rank, rank = 1, 0, 0
+
+     device = torch.device(args.device)
+     model = s3tokenizer.load_model(args.model).to(device)
+     dataset = AudioDataset(args.wav_scp)
+
+     if args.device == "cuda":
+         model = torch.nn.parallel.DistributedDataParallel(
+             model, device_ids=[local_rank])
+         sampler = DistributedSampler(dataset,
+                                      num_replicas=world_size,
+                                      rank=rank)
+     else:
+         sampler = None
+
+     dataloader = DataLoader(dataset,
+                             batch_size=args.batch_size,
+                             sampler=sampler,
+                             shuffle=False,
+                             num_workers=args.num_workers,
+                             prefetch_factor=args.prefetch,
+                             collate_fn=collate_fn)
+
+     total_steps = len(dataset)
+
+     if rank == 0:
+         progress_bar = tqdm(total=total_steps, desc="Processing", unit="wavs")
+
+     writer = open(f"{args.output_dir}/part_{rank + 1}_of_{world_size}", "w")
+     for keys, mels, mels_lens in dataloader:
+         codes, codes_lens = model(mels.to(device), mels_lens.to(device))
+         for i, k in enumerate(keys):
+             code = codes[i, :codes_lens[i].item()].tolist()
+             writer.write(
+                 json.dumps({
+                     "key": k,
+                     "code": code
+                 }, ensure_ascii=False) + "\n")
+         if rank == 0:
+             progress_bar.update(world_size * len(keys))
+
+     if rank == 0:
+         progress_bar.close()
+     writer.close()
+     if args.device == "cuda":
+         dist.barrier()
+         dist.destroy_process_group()
+
+
+ if __name__ == "__main__":
+     main()
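For reference, a hypothetical end-to-end driver for the CLI above (the audio paths are placeholders): it writes a `wav.scp` in the `key path` format `AudioDataset` expects, runs the `s3tokenizer` console script on CPU (a single process, so rank 0 of world size 1), and reads back the JSON-lines output file `part_1_of_1`.

```python
# Hypothetical driver for the CLI above; audio paths are placeholders.
import json
import subprocess

with open("wav.scp", "w", encoding="utf-8") as f:
    f.write("utt1 /path/to/utt1.wav\n")
    f.write("utt2 /path/to/utt2.wav\n")

subprocess.run(
    [
        "s3tokenizer", "--model", "speech_tokenizer_v1_25hz",
        "--wav_scp", "wav.scp", "--device", "cpu",
        "--output_dir", "./", "--batch_size", "2",
    ],
    check=True,
)

# Each output line is {"key": ..., "code": [...]} as written by main().
with open("part_1_of_1", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        print(item["key"], "->", len(item["code"]), "speech tokens")
```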