xinference 0.14.1.post1__py3-none-any.whl → 0.14.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (194)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +15 -34
  3. xinference/client/restful/restful_client.py +2 -2
  4. xinference/core/chat_interface.py +45 -10
  5. xinference/core/image_interface.py +9 -0
  6. xinference/core/model.py +8 -5
  7. xinference/core/scheduler.py +1 -2
  8. xinference/core/worker.py +49 -42
  9. xinference/deploy/cmdline.py +2 -2
  10. xinference/deploy/test/test_cmdline.py +7 -7
  11. xinference/model/audio/chattts.py +24 -9
  12. xinference/model/audio/core.py +8 -2
  13. xinference/model/audio/fish_speech.py +228 -0
  14. xinference/model/audio/model_spec.json +8 -0
  15. xinference/model/embedding/core.py +23 -1
  16. xinference/model/image/model_spec.json +2 -1
  17. xinference/model/image/model_spec_modelscope.json +2 -1
  18. xinference/model/image/stable_diffusion/core.py +49 -1
  19. xinference/model/llm/__init__.py +26 -27
  20. xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
  21. xinference/model/llm/llm_family.json +606 -1266
  22. xinference/model/llm/llm_family.py +16 -139
  23. xinference/model/llm/llm_family_modelscope.json +276 -313
  24. xinference/model/llm/lmdeploy/__init__.py +0 -0
  25. xinference/model/llm/lmdeploy/core.py +557 -0
  26. xinference/model/llm/memory.py +9 -9
  27. xinference/model/llm/sglang/core.py +2 -2
  28. xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
  29. xinference/model/llm/{pytorch → transformers}/cogvlm2.py +4 -45
  30. xinference/model/llm/transformers/cogvlm2_video.py +524 -0
  31. xinference/model/llm/{pytorch → transformers}/core.py +3 -10
  32. xinference/model/llm/{pytorch → transformers}/glm4v.py +2 -23
  33. xinference/model/llm/transformers/intern_vl.py +540 -0
  34. xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
  35. xinference/model/llm/{pytorch → transformers}/minicpmv25.py +2 -23
  36. xinference/model/llm/{pytorch → transformers}/minicpmv26.py +66 -41
  37. xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
  38. xinference/model/llm/{pytorch → transformers}/yi_vl.py +2 -24
  39. xinference/model/llm/utils.py +85 -70
  40. xinference/model/llm/vllm/core.py +110 -11
  41. xinference/model/utils.py +1 -95
  42. xinference/thirdparty/fish_speech/__init__.py +0 -0
  43. xinference/thirdparty/fish_speech/fish_speech/__init__.py +0 -0
  44. xinference/thirdparty/fish_speech/fish_speech/callbacks/__init__.py +3 -0
  45. xinference/thirdparty/fish_speech/fish_speech/callbacks/grad_norm.py +113 -0
  46. xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
  47. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  48. xinference/thirdparty/fish_speech/fish_speech/conversation.py +2 -0
  49. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  50. xinference/thirdparty/fish_speech/fish_speech/datasets/concat_repeat.py +53 -0
  51. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  52. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_pb2.py +33 -0
  53. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_stream.py +36 -0
  54. xinference/thirdparty/fish_speech/fish_speech/datasets/semantic.py +496 -0
  55. xinference/thirdparty/fish_speech/fish_speech/datasets/vqgan.py +147 -0
  56. xinference/thirdparty/fish_speech/fish_speech/i18n/__init__.py +3 -0
  57. xinference/thirdparty/fish_speech/fish_speech/i18n/core.py +40 -0
  58. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  59. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +122 -0
  60. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +122 -0
  61. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +123 -0
  62. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +133 -0
  63. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +122 -0
  64. xinference/thirdparty/fish_speech/fish_speech/i18n/scan.py +122 -0
  65. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  66. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/__init__.py +0 -0
  67. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lit_module.py +202 -0
  68. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +779 -0
  69. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lora.py +92 -0
  70. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +3 -0
  71. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +442 -0
  72. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  73. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +44 -0
  74. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +625 -0
  75. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +139 -0
  76. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +115 -0
  77. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +225 -0
  78. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/utils.py +94 -0
  79. xinference/thirdparty/fish_speech/fish_speech/scheduler.py +40 -0
  80. xinference/thirdparty/fish_speech/fish_speech/text/__init__.py +4 -0
  81. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/__init__.py +0 -0
  82. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_class.py +172 -0
  83. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_constant.py +30 -0
  84. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_util.py +342 -0
  85. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/cardinal.py +32 -0
  86. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/date.py +75 -0
  87. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/digit.py +32 -0
  88. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/fraction.py +35 -0
  89. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/money.py +43 -0
  90. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/percentage.py +33 -0
  91. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/telephone.py +51 -0
  92. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +177 -0
  93. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +69 -0
  94. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +130 -0
  95. xinference/thirdparty/fish_speech/fish_speech/train.py +139 -0
  96. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +23 -0
  97. xinference/thirdparty/fish_speech/fish_speech/utils/braceexpand.py +217 -0
  98. xinference/thirdparty/fish_speech/fish_speech/utils/context.py +13 -0
  99. xinference/thirdparty/fish_speech/fish_speech/utils/file.py +16 -0
  100. xinference/thirdparty/fish_speech/fish_speech/utils/instantiators.py +50 -0
  101. xinference/thirdparty/fish_speech/fish_speech/utils/logger.py +55 -0
  102. xinference/thirdparty/fish_speech/fish_speech/utils/logging_utils.py +48 -0
  103. xinference/thirdparty/fish_speech/fish_speech/utils/rich_utils.py +100 -0
  104. xinference/thirdparty/fish_speech/fish_speech/utils/spectrogram.py +122 -0
  105. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +114 -0
  106. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  107. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +120 -0
  108. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1237 -0
  109. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  110. xinference/thirdparty/fish_speech/tools/api.py +495 -0
  111. xinference/thirdparty/fish_speech/tools/auto_rerank.py +159 -0
  112. xinference/thirdparty/fish_speech/tools/download_models.py +55 -0
  113. xinference/thirdparty/fish_speech/tools/extract_model.py +21 -0
  114. xinference/thirdparty/fish_speech/tools/file.py +108 -0
  115. xinference/thirdparty/fish_speech/tools/gen_ref.py +36 -0
  116. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  117. xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +169 -0
  118. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +171 -0
  119. xinference/thirdparty/fish_speech/tools/llama/generate.py +698 -0
  120. xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +95 -0
  121. xinference/thirdparty/fish_speech/tools/llama/quantize.py +497 -0
  122. xinference/thirdparty/fish_speech/tools/llama/rebuild_tokenizer.py +57 -0
  123. xinference/thirdparty/fish_speech/tools/merge_asr_files.py +55 -0
  124. xinference/thirdparty/fish_speech/tools/post_api.py +164 -0
  125. xinference/thirdparty/fish_speech/tools/sensevoice/__init__.py +0 -0
  126. xinference/thirdparty/fish_speech/tools/sensevoice/auto_model.py +573 -0
  127. xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +332 -0
  128. xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py +61 -0
  129. xinference/thirdparty/fish_speech/tools/smart_pad.py +47 -0
  130. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  131. xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py +83 -0
  132. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +227 -0
  133. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +120 -0
  134. xinference/thirdparty/fish_speech/tools/webui.py +619 -0
  135. xinference/thirdparty/fish_speech/tools/whisper_asr.py +176 -0
  136. xinference/thirdparty/internvl/__init__.py +0 -0
  137. xinference/thirdparty/internvl/conversation.py +393 -0
  138. xinference/thirdparty/omnilmm/model/utils.py +16 -1
  139. xinference/web/ui/build/asset-manifest.json +3 -3
  140. xinference/web/ui/build/index.html +1 -1
  141. xinference/web/ui/build/static/js/main.661c7b0a.js +3 -0
  142. xinference/web/ui/build/static/js/{main.17ca0398.js.map → main.661c7b0a.js.map} +1 -1
  143. xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +1 -0
  144. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
  145. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
  146. xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
  147. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
  148. xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
  149. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
  150. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
  151. xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
  152. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
  153. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
  154. xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
  155. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
  156. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/METADATA +22 -13
  157. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/RECORD +170 -79
  158. xinference/locale/utils.py +0 -39
  159. xinference/locale/zh_CN.json +0 -26
  160. xinference/model/llm/ggml/tools/__init__.py +0 -15
  161. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
  162. xinference/model/llm/ggml/tools/gguf.py +0 -884
  163. xinference/model/llm/pytorch/__init__.py +0 -13
  164. xinference/model/llm/pytorch/baichuan.py +0 -81
  165. xinference/model/llm/pytorch/falcon.py +0 -138
  166. xinference/model/llm/pytorch/intern_vl.py +0 -352
  167. xinference/model/llm/pytorch/vicuna.py +0 -69
  168. xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
  169. xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
  170. xinference/web/ui/node_modules/.cache/babel-loader/2f40209b32e7e46a2eab6b8c8a355eb42c3caa8bc3228dd929f32fd2b3940294.json +0 -1
  171. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
  172. xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
  173. xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
  174. xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
  175. xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
  176. xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
  177. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
  178. xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
  179. xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
  180. xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
  181. xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
  182. /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
  183. /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
  184. /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
  185. /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
  186. /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
  187. /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
  188. /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
  189. /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
  190. /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.661c7b0a.js.LICENSE.txt} +0 -0
  191. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/LICENSE +0 -0
  192. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/WHEEL +0 -0
  193. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/entry_points.txt +0 -0
  194. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/top_level.txt +0 -0
xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py
@@ -0,0 +1,332 @@
+ import gc
+ import os
+ import re
+
+ from audio_separator.separator import Separator
+
+ os.environ["MODELSCOPE_CACHE"] = "./.cache/funasr"
+ os.environ["UVR5_CACHE"] = "./.cache/uvr5-models"
+ import json
+ import subprocess
+ from pathlib import Path
+
+ import click
+ import torch
+ from loguru import logger
+ from pydub import AudioSegment
+ from silero_vad import get_speech_timestamps, load_silero_vad, read_audio
+ from tqdm import tqdm
+
+ from tools.file import AUDIO_EXTENSIONS, VIDEO_EXTENSIONS, list_files
+ from tools.sensevoice.auto_model import AutoModel
+
+
+ def uvr5_cli(
+     audio_dir: Path,
+     output_folder: Path,
+     audio_files: list[Path] | None = None,
+     output_format: str = "flac",
+     model: str = "BS-Roformer-Viperx-1296.ckpt",
+ ):
+     # ["BS-Roformer-Viperx-1297.ckpt", "BS-Roformer-Viperx-1296.ckpt", "BS-Roformer-Viperx-1053.ckpt", "Mel-Roformer-Viperx-1143.ckpt"]
+     sepr = Separator(
+         model_file_dir=os.environ["UVR5_CACHE"],
+         output_dir=output_folder,
+         output_format=output_format,
+     )
+     dictmodel = {
+         "BS-Roformer-Viperx-1297.ckpt": "model_bs_roformer_ep_317_sdr_12.9755.ckpt",
+         "BS-Roformer-Viperx-1296.ckpt": "model_bs_roformer_ep_368_sdr_12.9628.ckpt",
+         "BS-Roformer-Viperx-1053.ckpt": "model_bs_roformer_ep_937_sdr_10.5309.ckpt",
+         "Mel-Roformer-Viperx-1143.ckpt": "model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt",
+     }
+     roformer_model = dictmodel[model]
+     sepr.load_model(roformer_model)
+     if audio_files is None:
+         audio_files = list_files(
+             path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
+         )
+     total_files = len(audio_files)
+
+     print(f"{total_files} audio files found")
+
+     res = []
+     for audio in tqdm(audio_files, desc="Denoising: "):
+         file_path = str(audio_dir / audio)
+         sep_out = sepr.separate(file_path)
+         if isinstance(sep_out, str):
+             res.append(sep_out)
+         elif isinstance(sep_out, list):
+             res.extend(sep_out)
+     del sepr
+     gc.collect()
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+
+     return res, roformer_model
+
+
+ def get_sample_rate(media_path: Path):
+     result = subprocess.run(
+         [
+             "ffprobe",
+             "-v",
+             "quiet",
+             "-print_format",
+             "json",
+             "-show_streams",
+             str(media_path),
+         ],
+         capture_output=True,
+         text=True,
+         check=True,
+     )
+     media_info = json.loads(result.stdout)
+     for stream in media_info.get("streams", []):
+         if stream.get("codec_type") == "audio":
+             return stream.get("sample_rate")
+     return "44100"  # Default sample rate if not found
+
+
+ def convert_to_mono(src_path: Path, out_path: Path, out_fmt: str = "wav"):
+     sr = get_sample_rate(src_path)
+     out_path.parent.mkdir(parents=True, exist_ok=True)
+     if src_path.resolve() == out_path.resolve():
+         output = str(out_path.with_stem(out_path.stem + f"_{sr}"))
+     else:
+         output = str(out_path)
+     subprocess.run(
+         [
+             "ffmpeg",
+             "-loglevel",
+             "error",
+             "-i",
+             str(src_path),
+             "-acodec",
+             "pcm_s16le" if out_fmt == "wav" else "flac",
+             "-ar",
+             sr,
+             "-ac",
+             "1",
+             "-y",
+             output,
+         ],
+         check=True,
+     )
+     return out_path
+
+
+ def convert_video_to_audio(video_path: Path, audio_dir: Path):
+     cur_dir = audio_dir / video_path.relative_to(audio_dir).parent
+     vocals = [
+         p
+         for p in cur_dir.glob(f"{video_path.stem}_(Vocals)*.*")
+         if p.suffix in AUDIO_EXTENSIONS
+     ]
+     if len(vocals) > 0:
+         return vocals[0]
+     audio_path = cur_dir / f"{video_path.stem}.wav"
+     convert_to_mono(video_path, audio_path)
+     return audio_path
+
+
+ @click.command()
+ @click.option("--audio-dir", required=True, help="Directory containing audio files")
+ @click.option(
+     "--save-dir", required=True, help="Directory to save processed audio files"
+ )
+ @click.option("--device", default="cuda", help="Device to use [cuda / cpu]")
+ @click.option("--language", default="auto", help="Language of the transcription")
+ @click.option(
+     "--max_single_segment_time",
+     default=20000,
+     type=int,
+     help="Maximum of Output single audio duration(ms)",
+ )
+ @click.option("--fsmn-vad/--silero-vad", default=False)
+ @click.option("--punc/--no-punc", default=False)
+ @click.option("--denoise/--no-denoise", default=False)
+ @click.option("--save_emo/--no_save_emo", default=False)
+ def main(
+     audio_dir: str,
+     save_dir: str,
+     device: str,
+     language: str,
+     max_single_segment_time: int,
+     fsmn_vad: bool,
+     punc: bool,
+     denoise: bool,
+     save_emo: bool,
+ ):
+
+     audios_path = Path(audio_dir)
+     save_path = Path(save_dir)
+     save_path.mkdir(parents=True, exist_ok=True)
+
+     video_files = list_files(
+         path=audio_dir, extensions=VIDEO_EXTENSIONS, recursive=True
+     )
+     v2a_files = [convert_video_to_audio(p, audio_dir) for p in video_files]
+
+     if denoise:
+         VOCAL = "_(Vocals)"
+         original_files = [
+             p
+             for p in audios_path.glob("**/*")
+             if p.suffix in AUDIO_EXTENSIONS and VOCAL not in p.stem
+         ]
+
+         _, cur_model = uvr5_cli(
+             audio_dir=audio_dir, output_folder=audio_dir, audio_files=original_files
+         )
+         need_remove = [p for p in audios_path.glob("**/*(Instrumental)*")]
+         need_remove.extend(original_files)
+         for _ in need_remove:
+             _.unlink()
+         vocal_files = [
+             p
+             for p in audios_path.glob("**/*")
+             if p.suffix in AUDIO_EXTENSIONS and VOCAL in p.stem
+         ]
+         for f in vocal_files:
+             fn, ext = f.stem, f.suffix
+
+             v_pos = fn.find(VOCAL + "_" + cur_model.split(".")[0])
+             if v_pos != -1:
+                 new_fn = fn[: v_pos + len(VOCAL)]
+                 new_f = f.with_name(new_fn + ext)
+                 f = f.rename(new_f)
+                 convert_to_mono(f, f, "flac")
+                 f.unlink()
+
+     audio_files = list_files(
+         path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
+     )
+
+     logger.info("Loading / Downloading Funasr model...")
+
+     model_dir = "iic/SenseVoiceSmall"
+
+     vad_model = "fsmn-vad" if fsmn_vad else None
+     vad_kwargs = {"max_single_segment_time": max_single_segment_time}
+     punc_model = "ct-punc" if punc else None
+
+     manager = AutoModel(
+         model=model_dir,
+         trust_remote_code=False,
+         vad_model=vad_model,
+         vad_kwargs=vad_kwargs,
+         punc_model=punc_model,
+         device=device,
+     )
+
+     if not fsmn_vad and vad_model is None:
+         vad_model = load_silero_vad()
+
+     logger.info("Model loaded.")
+
+     pattern = re.compile(r"_\d{3}\.")
+
+     for file_path in tqdm(audio_files, desc="Processing audio file"):
+
+         if pattern.search(file_path.name):
+             # logger.info(f"Skipping {file_path} as it has already been processed.")
+             continue
+
+         file_stem = file_path.stem
+         file_suffix = file_path.suffix
+
+         rel_path = Path(file_path).relative_to(audio_dir)
+         (save_path / rel_path.parent).mkdir(parents=True, exist_ok=True)
+
+         audio = AudioSegment.from_file(file_path)
+
+         cfg = dict(
+             cache={},
+             language=language,  # "zh", "en", "yue", "ja", "ko", "nospeech"
+             use_itn=False,
+             batch_size_s=60,
+         )
+
+         if fsmn_vad:
+             elapsed, vad_res = manager.vad(input=str(file_path), **cfg)
+         else:
+             wav = read_audio(
+                 str(file_path)
+             )  # backend (sox, soundfile, or ffmpeg) required!
+             audio_key = file_path.stem
+             audio_val = []
+             speech_timestamps = get_speech_timestamps(
+                 wav,
+                 vad_model,
+                 max_speech_duration_s=max_single_segment_time // 1000,
+                 return_seconds=True,
+             )
+
+             audio_val = [
+                 [int(timestamp["start"] * 1000), int(timestamp["end"] * 1000)]
+                 for timestamp in speech_timestamps
+             ]
+             vad_res = []
+             vad_res.append(dict(key=audio_key, value=audio_val))
+
+         res = manager.inference_with_vadres(
+             input=str(file_path), vad_res=vad_res, **cfg
+         )
+
+         for i, info in enumerate(res):
+             [start_ms, end_ms] = info["interval"]
+             text = info["text"]
+             emo = info["emo"]
+             sliced_audio = audio[start_ms:end_ms]
+             audio_save_path = (
+                 save_path / rel_path.parent / f"{file_stem}_{i:03d}{file_suffix}"
+             )
+             sliced_audio.export(audio_save_path, format=file_suffix[1:])
+             print(f"Exported {audio_save_path}: {text}")
+
+             transcript_save_path = (
+                 save_path / rel_path.parent / f"{file_stem}_{i:03d}.lab"
+             )
+             with open(
+                 transcript_save_path,
+                 "w",
+                 encoding="utf-8",
+             ) as f:
+                 f.write(text)
+
+             if save_emo:
+                 emo_save_path = save_path / rel_path.parent / f"{file_stem}_{i:03d}.emo"
+                 with open(
+                     emo_save_path,
+                     "w",
+                     encoding="utf-8",
+                 ) as f:
+                     f.write(emo)
+
+         if audios_path.resolve() == save_path.resolve():
+             file_path.unlink()
+
+
+ if __name__ == "__main__":
+     main()
+     exit(0)
+     from funasr.utils.postprocess_utils import rich_transcription_postprocess
+
+     # Load the audio file
+     audio_path = Path(r"D:\PythonProject\ok\1_output_(Vocals).wav")
+     model_dir = "iic/SenseVoiceSmall"
+     m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir, device="cuda:0")
+     m.eval()
+
+     res = m.inference(
+         data_in=f"{kwargs['model_path']}/example/zh.mp3",
+         language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+         use_itn=False,
+         ban_emo_unk=False,
+         **kwargs,
+     )
+
+     print(res)
+     text = rich_transcription_postprocess(res[0][0]["text"])
+     print(text)
xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py
@@ -0,0 +1,61 @@
+ import torch
+ from torch.nn.utils.rnn import pad_sequence
+
+
+ def slice_padding_fbank(speech, speech_lengths, vad_segments):
+     speech_list = []
+     speech_lengths_list = []
+     for i, segment in enumerate(vad_segments):
+
+         bed_idx = int(segment[0][0] * 16)
+         end_idx = min(int(segment[0][1] * 16), speech_lengths[0])
+         speech_i = speech[0, bed_idx:end_idx]
+         speech_lengths_i = end_idx - bed_idx
+         speech_list.append(speech_i)
+         speech_lengths_list.append(speech_lengths_i)
+     feats_pad = pad_sequence(speech_list, batch_first=True, padding_value=0.0)
+     speech_lengths_pad = torch.Tensor(speech_lengths_list).int()
+     return feats_pad, speech_lengths_pad
+
+
+ def slice_padding_audio_samples(speech, speech_lengths, vad_segments):
+     speech_list = []
+     speech_lengths_list = []
+     intervals = []
+     for i, segment in enumerate(vad_segments):
+         bed_idx = int(segment[0][0] * 16)
+         end_idx = min(int(segment[0][1] * 16), speech_lengths)
+         speech_i = speech[bed_idx:end_idx]
+         speech_lengths_i = end_idx - bed_idx
+         speech_list.append(speech_i)
+         speech_lengths_list.append(speech_lengths_i)
+         intervals.append([bed_idx // 16, end_idx // 16])
+
+     return speech_list, speech_lengths_list, intervals
+
+
+ def merge_vad(vad_result, max_length=15000, min_length=0):
+     new_result = []
+     if len(vad_result) <= 1:
+         return vad_result
+     time_step = [t[0] for t in vad_result] + [t[1] for t in vad_result]
+     time_step = sorted(list(set(time_step)))
+     if len(time_step) == 0:
+         return []
+     bg = 0
+     for i in range(len(time_step) - 1):
+         time = time_step[i]
+         if time_step[i + 1] - bg < max_length:
+             continue
+         if time - bg > min_length:
+             new_result.append([bg, time])
+         # if time - bg < max_length * 1.5:
+         #     new_result.append([bg, time])
+         # else:
+         #     split_num = int(time - bg) // max_length + 1
+         #     spl_l = int(time - bg) // split_num
+         #     for j in range(split_num):
+         #         new_result.append([bg + j * spl_l, bg + (j + 1) * spl_l])
+         bg = time
+     new_result.append([bg, time_step[-1]])
+     return new_result
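
The merge_vad helper above packs neighbouring VAD segments into windows bounded by max_length milliseconds. A minimal sanity-check sketch, assuming the vendored module resolves at the import path below and torch is installed:

from xinference.thirdparty.fish_speech.tools.sensevoice.vad_utils import merge_vad

# Three VAD segments, times in milliseconds.
segments = [[0, 5000], [6000, 12000], [13000, 20000]]

# A window boundary is emitted at 13000 because extending the window to 20000
# would reach max_length=15000; the remainder becomes its own window.
print(merge_vad(segments, max_length=15000))  # [[0, 13000], [13000, 20000]]
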
xinference/thirdparty/fish_speech/tools/smart_pad.py
@@ -0,0 +1,47 @@
+ import random
+ from multiprocessing import Pool
+ from pathlib import Path
+
+ import click
+ import librosa
+ import torch.nn.functional as F
+ import torchaudio
+ from tqdm import tqdm
+
+ from tools.file import AUDIO_EXTENSIONS, list_files
+
+ threshold = 10 ** (-50 / 20.0)
+
+
+ def process(file):
+     waveform, sample_rate = torchaudio.load(str(file), backend="sox")
+     loudness = librosa.feature.rms(
+         y=waveform.numpy().squeeze(), frame_length=2048, hop_length=512, center=True
+     )[0]
+     for i in range(len(loudness) - 1, 0, -1):
+         if loudness[i] > threshold:
+             break
+
+     silent_time = (len(loudness) - i) * 512 / sample_rate
+
+     if silent_time <= 0.3:
+         random_time = random.uniform(0.3, 0.7)
+         waveform = F.pad(
+             waveform, (0, int(random_time * sample_rate)), mode="constant", value=0
+         )
+
+     torchaudio.save(uri=str(file), src=waveform, sample_rate=sample_rate)
+
+
+ @click.command()
+ @click.argument("source", type=Path)
+ @click.option("--num-workers", type=int, default=12)
+ def main(source, num_workers):
+     files = list(list_files(source, AUDIO_EXTENSIONS, recursive=True))
+
+     with Pool(num_workers) as p:
+         list(tqdm(p.imap_unordered(process, files), total=len(files)))
+
+
+ if __name__ == "__main__":
+     main()
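
The threshold in smart_pad.py is simply -50 dB (relative to full scale) converted to a linear RMS amplitude, and the trailing-silence test follows from the RMS hop of 512 samples it passes to librosa. A small sketch of that arithmetic; the frame count and sample rate here are made-up example values:

# -50 dBFS expressed as a linear RMS amplitude, as in smart_pad.py above.
threshold = 10 ** (-50 / 20.0)
print(f"{threshold:.6f}")  # ~0.003162

# 20 trailing frames below the threshold at 44.1 kHz with hop_length=512:
sample_rate = 44100
silent_time = 20 * 512 / sample_rate
print(f"{silent_time:.3f} s")  # ~0.232 s <= 0.3 s, so random padding would be appended
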
xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py
@@ -0,0 +1,83 @@
+ import math
+ from pathlib import Path
+ from random import Random
+
+ import click
+ from loguru import logger
+ from pydub import AudioSegment
+ from tqdm import tqdm
+
+ from tools.file import AUDIO_EXTENSIONS, list_files, load_filelist
+
+
+ @click.command()
+ @click.argument("root", type=click.Path(exists=True, path_type=Path))
+ @click.option("--val-ratio", type=float, default=None)
+ @click.option("--val-count", type=int, default=None)
+ @click.option("--filelist", default=None, type=Path)
+ @click.option("--min-duration", default=None, type=float)
+ @click.option("--max-duration", default=None, type=float)
+ def main(root, val_ratio, val_count, filelist, min_duration, max_duration):
+     if filelist:
+         files = [i[0] for i in load_filelist(filelist)]
+     else:
+         files = list_files(root, AUDIO_EXTENSIONS, recursive=True, sort=True)
+
+     if min_duration is None and max_duration is None:
+         filtered_files = list(map(str, [file.relative_to(root) for file in files]))
+     else:
+         filtered_files = []
+         for file in tqdm(files):
+             try:
+                 audio = AudioSegment.from_file(str(file))
+                 duration = len(audio) / 1000.0
+
+                 if min_duration is not None and duration < min_duration:
+                     logger.info(
+                         f"Skipping {file} due to duration {duration:.2f} < {min_duration:.2f}"
+                     )
+                     continue
+
+                 if max_duration is not None and duration > max_duration:
+                     logger.info(
+                         f"Skipping {file} due to duration {duration:.2f} > {max_duration:.2f}"
+                     )
+                     continue
+
+                 filtered_files.append(str(file.relative_to(root)))
+             except Exception as e:
+                 logger.info(f"Error processing {file}: {e}")
+
+     logger.info(
+         f"Found {len(files)} files, remaining {len(filtered_files)} files after filtering"
+     )
+
+     Random(42).shuffle(filtered_files)
+
+     if val_count is None and val_ratio is None:
+         logger.info("Validation ratio and count not specified, using min(20%, 100)")
+         val_size = min(100, math.ceil(len(filtered_files) * 0.2))
+     elif val_count is not None and val_ratio is not None:
+         logger.error("Cannot specify both val_count and val_ratio")
+         return
+     elif val_count is not None:
+         if val_count < 1 or val_count > len(filtered_files):
+             logger.error("val_count must be between 1 and number of files")
+             return
+         val_size = val_count
+     else:
+         val_size = math.ceil(len(filtered_files) * val_ratio)
+
+     logger.info(f"Using {val_size} files for validation")
+
+     with open(root / "vq_train_filelist.txt", "w", encoding="utf-8") as f:
+         f.write("\n".join(filtered_files[val_size:]))
+
+     with open(root / "vq_val_filelist.txt", "w", encoding="utf-8") as f:
+         f.write("\n".join(filtered_files[:val_size]))
+
+     logger.info("Done")
+
+
+ if __name__ == "__main__":
+     main()
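
create_train_split.py is a click command. A minimal sketch of driving it programmatically; the sys.path shim and the dataset directory are assumptions, since these vendored scripts import tools.* relative to the fish_speech root:

import sys
from pathlib import Path

import xinference.thirdparty.fish_speech as fish_speech_root

# The vendored scripts resolve "tools.*" against the fish_speech directory,
# so put it on sys.path before importing them (assumption about the layout).
sys.path.insert(0, str(Path(fish_speech_root.__file__).parent))

from click.testing import CliRunner
from tools.vqgan.create_train_split import main

# "dataset/" is a placeholder directory of audio files; the script writes
# vq_train_filelist.txt and vq_val_filelist.txt into it.
result = CliRunner().invoke(main, ["dataset/", "--val-ratio", "0.1"])
print(result.output)
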