xinference 0.14.4.post1__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (194)
  1. xinference/_compat.py +51 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +209 -40
  4. xinference/client/restful/restful_client.py +7 -26
  5. xinference/conftest.py +1 -1
  6. xinference/constants.py +5 -0
  7. xinference/core/cache_tracker.py +1 -1
  8. xinference/core/chat_interface.py +8 -14
  9. xinference/core/event.py +1 -1
  10. xinference/core/image_interface.py +28 -0
  11. xinference/core/model.py +110 -31
  12. xinference/core/scheduler.py +37 -37
  13. xinference/core/status_guard.py +1 -1
  14. xinference/core/supervisor.py +17 -10
  15. xinference/core/utils.py +80 -22
  16. xinference/core/worker.py +17 -16
  17. xinference/deploy/cmdline.py +8 -16
  18. xinference/deploy/local.py +1 -1
  19. xinference/deploy/supervisor.py +1 -1
  20. xinference/deploy/utils.py +1 -1
  21. xinference/deploy/worker.py +1 -1
  22. xinference/model/audio/cosyvoice.py +86 -41
  23. xinference/model/audio/fish_speech.py +9 -9
  24. xinference/model/audio/model_spec.json +9 -9
  25. xinference/model/audio/whisper.py +4 -1
  26. xinference/model/embedding/core.py +52 -31
  27. xinference/model/image/core.py +2 -1
  28. xinference/model/image/model_spec.json +16 -4
  29. xinference/model/image/model_spec_modelscope.json +16 -4
  30. xinference/model/image/sdapi.py +136 -0
  31. xinference/model/image/stable_diffusion/core.py +164 -19
  32. xinference/model/llm/__init__.py +29 -11
  33. xinference/model/llm/llama_cpp/core.py +16 -33
  34. xinference/model/llm/llm_family.json +1011 -1296
  35. xinference/model/llm/llm_family.py +34 -53
  36. xinference/model/llm/llm_family_csghub.json +18 -35
  37. xinference/model/llm/llm_family_modelscope.json +981 -1122
  38. xinference/model/llm/lmdeploy/core.py +56 -88
  39. xinference/model/llm/mlx/core.py +46 -69
  40. xinference/model/llm/sglang/core.py +36 -18
  41. xinference/model/llm/transformers/chatglm.py +168 -306
  42. xinference/model/llm/transformers/cogvlm2.py +36 -63
  43. xinference/model/llm/transformers/cogvlm2_video.py +33 -223
  44. xinference/model/llm/transformers/core.py +55 -50
  45. xinference/model/llm/transformers/deepseek_v2.py +340 -0
  46. xinference/model/llm/transformers/deepseek_vl.py +53 -96
  47. xinference/model/llm/transformers/glm4v.py +55 -111
  48. xinference/model/llm/transformers/intern_vl.py +39 -70
  49. xinference/model/llm/transformers/internlm2.py +32 -54
  50. xinference/model/llm/transformers/minicpmv25.py +22 -55
  51. xinference/model/llm/transformers/minicpmv26.py +158 -68
  52. xinference/model/llm/transformers/omnilmm.py +5 -28
  53. xinference/model/llm/transformers/qwen2_audio.py +168 -0
  54. xinference/model/llm/transformers/qwen2_vl.py +234 -0
  55. xinference/model/llm/transformers/qwen_vl.py +34 -86
  56. xinference/model/llm/transformers/utils.py +32 -38
  57. xinference/model/llm/transformers/yi_vl.py +32 -72
  58. xinference/model/llm/utils.py +280 -554
  59. xinference/model/llm/vllm/core.py +161 -100
  60. xinference/model/rerank/core.py +41 -8
  61. xinference/model/rerank/model_spec.json +7 -0
  62. xinference/model/rerank/model_spec_modelscope.json +7 -1
  63. xinference/model/utils.py +1 -31
  64. xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
  65. xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
  66. xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
  67. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
  68. xinference/thirdparty/cosyvoice/cli/model.py +139 -26
  69. xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
  70. xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
  71. xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
  72. xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
  73. xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
  74. xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
  75. xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
  76. xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
  77. xinference/thirdparty/cosyvoice/utils/common.py +36 -0
  78. xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
  79. xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
  80. xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
  81. xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
  82. xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
  83. xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
  84. xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
  85. xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
  86. xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
  87. xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
  88. xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
  89. xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
  90. xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
  91. xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +33 -0
  92. xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
  93. xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
  94. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
  95. xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
  96. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +1 -1
  97. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +1 -1
  98. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +1 -1
  99. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +1 -1
  100. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +1 -1
  101. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +2 -2
  102. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +0 -3
  103. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +169 -198
  104. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +4 -27
  105. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
  106. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
  107. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +9 -47
  108. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +2 -2
  109. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -0
  110. xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
  111. xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
  112. xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
  113. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +12 -10
  114. xinference/thirdparty/fish_speech/tools/api.py +79 -134
  115. xinference/thirdparty/fish_speech/tools/commons.py +35 -0
  116. xinference/thirdparty/fish_speech/tools/download_models.py +3 -3
  117. xinference/thirdparty/fish_speech/tools/file.py +17 -0
  118. xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +1 -1
  119. xinference/thirdparty/fish_speech/tools/llama/generate.py +29 -24
  120. xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +1 -1
  121. xinference/thirdparty/fish_speech/tools/llama/quantize.py +2 -2
  122. xinference/thirdparty/fish_speech/tools/msgpack_api.py +34 -0
  123. xinference/thirdparty/fish_speech/tools/post_api.py +85 -44
  124. xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
  125. xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +1 -1
  126. xinference/thirdparty/fish_speech/tools/smart_pad.py +16 -3
  127. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +2 -2
  128. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +4 -2
  129. xinference/thirdparty/fish_speech/tools/webui.py +12 -146
  130. xinference/thirdparty/matcha/VERSION +1 -0
  131. xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
  132. xinference/thirdparty/matcha/hifigan/README.md +101 -0
  133. xinference/thirdparty/omnilmm/LICENSE +201 -0
  134. xinference/thirdparty/whisper/__init__.py +156 -0
  135. xinference/thirdparty/whisper/__main__.py +3 -0
  136. xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
  137. xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
  138. xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
  139. xinference/thirdparty/whisper/audio.py +157 -0
  140. xinference/thirdparty/whisper/decoding.py +826 -0
  141. xinference/thirdparty/whisper/model.py +314 -0
  142. xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
  143. xinference/thirdparty/whisper/normalizers/basic.py +76 -0
  144. xinference/thirdparty/whisper/normalizers/english.json +1741 -0
  145. xinference/thirdparty/whisper/normalizers/english.py +550 -0
  146. xinference/thirdparty/whisper/timing.py +386 -0
  147. xinference/thirdparty/whisper/tokenizer.py +395 -0
  148. xinference/thirdparty/whisper/transcribe.py +605 -0
  149. xinference/thirdparty/whisper/triton_ops.py +109 -0
  150. xinference/thirdparty/whisper/utils.py +316 -0
  151. xinference/thirdparty/whisper/version.py +1 -0
  152. xinference/types.py +14 -53
  153. xinference/web/ui/build/asset-manifest.json +6 -6
  154. xinference/web/ui/build/index.html +1 -1
  155. xinference/web/ui/build/static/css/{main.4bafd904.css → main.5061c4c3.css} +2 -2
  156. xinference/web/ui/build/static/css/main.5061c4c3.css.map +1 -0
  157. xinference/web/ui/build/static/js/main.754740c0.js +3 -0
  158. xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.754740c0.js.LICENSE.txt} +2 -0
  159. xinference/web/ui/build/static/js/main.754740c0.js.map +1 -0
  160. xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
  161. xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
  162. xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
  163. xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
  164. xinference/web/ui/node_modules/.cache/babel-loader/cd90b08d177025dfe84209596fc51878f8a86bcaa6a240848a3d2e5fd4c7ff24.json +1 -0
  165. xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
  166. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +1 -0
  167. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
  168. xinference/web/ui/node_modules/.package-lock.json +37 -0
  169. xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
  170. xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
  171. xinference/web/ui/node_modules/nunjucks/package.json +112 -0
  172. xinference/web/ui/package-lock.json +38 -0
  173. xinference/web/ui/package.json +1 -0
  174. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/METADATA +16 -10
  175. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/RECORD +179 -127
  176. xinference/model/llm/transformers/llama_2.py +0 -108
  177. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +0 -442
  178. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +0 -44
  179. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +0 -115
  180. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +0 -225
  181. xinference/thirdparty/fish_speech/tools/auto_rerank.py +0 -159
  182. xinference/thirdparty/fish_speech/tools/gen_ref.py +0 -36
  183. xinference/thirdparty/fish_speech/tools/merge_asr_files.py +0 -55
  184. xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
  185. xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
  186. xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
  187. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
  188. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
  189. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
  190. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
  191. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/LICENSE +0 -0
  192. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/WHEEL +0 -0
  193. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/entry_points.txt +0 -0
  194. {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/top_level.txt +0 -0
xinference/thirdparty/fish_speech/tools/post_api.py
@@ -1,40 +1,19 @@
 import argparse
 import base64
-import json
 import wave
-from pathlib import Path
 
+import ormsgpack
 import pyaudio
 import requests
+from pydub import AudioSegment
+from pydub.playback import play
 
+from tools.commons import ServeReferenceAudio, ServeTTSRequest
+from tools.file import audio_to_bytes, read_ref_text
 
 
-def wav_to_base64(file_path):
-    if not file_path or not Path(file_path).exists():
-        return None
-    with open(file_path, "rb") as wav_file:
-        wav_content = wav_file.read()
-    base64_encoded = base64.b64encode(wav_content)
-    return base64_encoded.decode("utf-8")
 
+def parse_args():
 
-def read_ref_text(ref_text):
-    path = Path(ref_text)
-    if path.exists() and path.is_file():
-        with path.open("r", encoding="utf-8") as file:
-            return file.read()
-    return ref_text
-
-
-def play_audio(audio_content, format, channels, rate):
-    p = pyaudio.PyAudio()
-    stream = p.open(format=format, channels=channels, rate=rate, output=True)
-    stream.write(audio_content)
-    stream.stop_stream()
-    stream.close()
-    p.terminate()
-
-
-if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Send a WAV file and text to a server and receive synthesized audio."
     )
@@ -43,16 +22,24 @@ if __name__ == "__main__":
         "--url",
         "-u",
         type=str,
-        default="http://127.0.0.1:8080/v1/invoke",
+        default="http://127.0.0.1:8080/v1/tts",
         help="URL of the server",
     )
     parser.add_argument(
         "--text", "-t", type=str, required=True, help="Text to be synthesized"
     )
+    parser.add_argument(
+        "--reference_id",
+        "-id",
+        type=str,
+        default=None,
+        help="ID of the reference model o be used for the speech",
+    )
     parser.add_argument(
         "--reference_audio",
        "-ra",
         type=str,
+        nargs="+",
         default=None,
         help="Path to the WAV file",
     )
@@ -60,9 +47,30 @@
         "--reference_text",
         "-rt",
         type=str,
+        nargs="+",
         default=None,
         help="Reference text for voice synthesis",
     )
+    parser.add_argument(
+        "--output",
+        "-o",
+        type=str,
+        default="generated_audio",
+        help="Output audio file name",
+    )
+    parser.add_argument(
+        "--play",
+        type=bool,
+        default=True,
+        help="Whether to play audio after receiving data",
+    )
+    parser.add_argument("--normalize", type=bool, default=True)
+    parser.add_argument(
+        "--format", type=str, choices=["wav", "mp3", "flac"], default="wav"
+    )
+    parser.add_argument("--mp3_bitrate", type=int, default=64)
+    parser.add_argument("--opus_bitrate", type=int, default=-1000)
+    parser.add_argument("--latency", type=str, default="normal", help="延迟选项")
     parser.add_argument(
         "--max_new_tokens",
         type=int,
@@ -88,7 +96,6 @@ if __name__ == "__main__":
         "--speaker", type=str, default=None, help="Speaker ID for voice synthesis"
     )
     parser.add_argument("--emotion", type=str, default=None, help="Speaker's Emotion")
-    parser.add_argument("--format", type=str, default="wav", help="Audio format")
     parser.add_argument(
         "--streaming", type=bool, default=False, help="Enable streaming response"
     )
@@ -97,18 +104,42 @@ if __name__ == "__main__":
     )
     parser.add_argument("--rate", type=int, default=44100, help="Sample rate for audio")
 
-    args = parser.parse_args()
+    return parser.parse_args()
 
-    base64_audio = wav_to_base64(args.reference_audio)
 
-    ref_text = args.reference_text
-    if ref_text:
-        ref_text = read_ref_text(ref_text)
+if __name__ == "__main__":
+
+    args = parse_args()
+
+    idstr: str | None = args.reference_id
+    # priority: ref_id > [{text, audio},...]
+    if idstr is None:
+        ref_audios = args.reference_audio
+        ref_texts = args.reference_text
+        if ref_audios is None:
+            byte_audios = []
+        else:
+            byte_audios = [audio_to_bytes(ref_audio) for ref_audio in ref_audios]
+        if ref_texts is None:
+            ref_texts = []
+        else:
+            ref_texts = [read_ref_text(ref_text) for ref_text in ref_texts]
+    else:
+        byte_audios = []
+        ref_texts = []
+        pass # in api.py
 
     data = {
         "text": args.text,
-        "reference_text": ref_text,
-        "reference_audio": base64_audio,
+        "references": [
+            ServeReferenceAudio(audio=ref_audio, text=ref_text)
+            for ref_text, ref_audio in zip(ref_texts, byte_audios)
+        ],
+        "reference_id": idstr,
+        "normalize": args.normalize,
+        "format": args.format,
+        "mp3_bitrate": args.mp3_bitrate,
+        "opus_bitrate": args.opus_bitrate,
        "max_new_tokens": args.max_new_tokens,
         "chunk_length": args.chunk_length,
         "top_p": args.top_p,
@@ -116,22 +147,30 @@ if __name__ == "__main__":
         "temperature": args.temperature,
         "speaker": args.speaker,
         "emotion": args.emotion,
-        "format": args.format,
         "streaming": args.streaming,
     }
 
-    response = requests.post(args.url, json=data, stream=args.streaming)
+    pydantic_data = ServeTTSRequest(**data)
 
-    audio_format = pyaudio.paInt16 # Assuming 16-bit PCM format
+    response = requests.post(
+        args.url,
+        data=ormsgpack.packb(pydantic_data, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
+        stream=args.streaming,
+        headers={
+            "authorization": "Bearer YOUR_API_KEY",
+            "content-type": "application/msgpack",
+        },
+    )
 
     if response.status_code == 200:
         if args.streaming:
             p = pyaudio.PyAudio()
+            audio_format = pyaudio.paInt16 # Assuming 16-bit PCM format
             stream = p.open(
                 format=audio_format, channels=args.channels, rate=args.rate, output=True
             )
 
-            wf = wave.open("generated_audio.wav", "wb")
+            wf = wave.open(f"{args.output}.wav", "wb")
             wf.setnchannels(args.channels)
             wf.setsampwidth(p.get_sample_size(audio_format))
             wf.setframerate(args.rate)
@@ -153,12 +192,14 @@ if __name__ == "__main__":
             wf.close()
         else:
             audio_content = response.content
-
-            with open("generated_audio.wav", "wb") as audio_file:
+            audio_path = f"{args.output}.{args.format}"
+            with open(audio_path, "wb") as audio_file:
                 audio_file.write(audio_content)
 
-            play_audio(audio_content, audio_format, args.channels, args.rate)
-            print("Audio has been saved to 'generated_audio.wav'.")
+            audio = AudioSegment.from_file(audio_path, format=args.format)
+            if args.play:
+                play(audio)
+            print(f"Audio has been saved to '{audio_path}'.")
     else:
         print(f"Request failed with status code {response.status_code}")
         print(response.json())
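The old JSON body with a single base64-encoded reference clip is gone: the script above now builds a `ServeTTSRequest` and posts it as msgpack to `/v1/tts`. Purely as an illustration of that request flow (this snippet is not part of the package diff; it assumes the fish-speech repo's `tools` package is importable, a compatible server on localhost, and a placeholder API key):

```python
# Illustrative sketch of the msgpack-based request used by the reworked post_api.py.
# Assumes the fish-speech `tools` package is on PYTHONPATH and a server is listening
# on the default URL; the bearer token below is a placeholder.
import ormsgpack
import requests

from tools.commons import ServeTTSRequest

req = ServeTTSRequest(
    text="Hello from the reworked client.",
    references=[],   # optionally ServeReferenceAudio(audio=..., text=...) entries from tools.commons
    reference_id=None,
    normalize=True,
    format="wav",
    mp3_bitrate=64,
    opus_bitrate=-1000,
    max_new_tokens=1024,
    chunk_length=200,
    top_p=0.7,
    repetition_penalty=1.2,
    temperature=0.7,
    speaker=None,
    emotion=None,
    streaming=False,
)

response = requests.post(
    "http://127.0.0.1:8080/v1/tts",
    data=ormsgpack.packb(req, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
    headers={
        "authorization": "Bearer YOUR_API_KEY",
        "content-type": "application/msgpack",
    },
)
response.raise_for_status()
with open("generated_audio.wav", "wb") as f:
    f.write(response.content)  # raw audio bytes in the requested format
```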
xinference/thirdparty/fish_speech/tools/sensevoice/README.md
@@ -0,0 +1,59 @@
+# FunASR Command Line Interface
+
+This tool provides a command-line interface for separating vocals from instrumental tracks, converting videos to audio, and performing speech-to-text transcription on the resulting audio files.
+
+## Requirements
+
+- Python >= 3.10
+- PyTorch <= 2.3.1
+- ffmpeg, pydub, audio-separator[gpu].
+
+## Installation
+
+Install the required packages:
+
+```bash
+pip install -e .[stable]
+```
+
+Make sure you have `ffmpeg` installed and available in your `PATH`.
+
+## Usage
+
+### Basic Usage
+
+To run the tool with default settings:
+
+```bash
+python tools/sensevoice/fun_asr.py --audio-dir <audio_directory> --save-dir <output_directory>
+```
+
+## Options
+
+| Option | Description |
+| :-----------------------: | :---------------------------------------------------------------------------: |
+| --audio-dir | Directory containing audio or video files. |
+| --save-dir | Directory to save processed audio files. |
+| --device | Device to use for processing. Options: cuda (default) or cpu. |
+| --language | Language of the transcription. Default is auto. |
+| --max_single_segment_time | Maximum duration of a single audio segment in milliseconds. Default is 20000. |
+| --punc | Enable punctuation prediction. |
+| --denoise | Enable noise reduction (vocal separation). |
+
+## Example
+
+To process audio files in the directory `path/to/audio` and save the output to `path/to/output`, with punctuation and noise reduction enabled:
+
+```bash
+python tools/sensevoice/fun_asr.py --audio-dir path/to/audio --save-dir path/to/output --punc --denoise
+```
+
+## Additional Notes
+
+- The tool supports `both audio and video files`. Videos will be converted to audio automatically.
+- If the `--denoise` option is used, the tool will perform vocal separation to isolate the vocals from the instrumental tracks.
+- The script will automatically create necessary directories in the `--save-dir`.
+
+## Troubleshooting
+
+If you encounter any issues, make sure all dependencies are correctly installed and configured. For more detailed troubleshooting, refer to the documentation of each dependency.
xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py
@@ -26,7 +26,7 @@ def uvr5_cli(
     output_folder: Path,
     audio_files: list[Path] | None = None,
     output_format: str = "flac",
-    model: str = "BS-Roformer-Viperx-1296.ckpt",
+    model: str = "BS-Roformer-Viperx-1297.ckpt",
 ):
     # ["BS-Roformer-Viperx-1297.ckpt", "BS-Roformer-Viperx-1296.ckpt", "BS-Roformer-Viperx-1053.ckpt", "Mel-Roformer-Viperx-1143.ckpt"]
     sepr = Separator(
xinference/thirdparty/fish_speech/tools/smart_pad.py
@@ -15,21 +15,34 @@ threshold = 10 ** (-50 / 20.0)
 
 def process(file):
     waveform, sample_rate = torchaudio.load(str(file), backend="sox")
+    if waveform.size(0) > 1:
+        waveform = waveform.mean(dim=0, keepdim=True)
+
     loudness = librosa.feature.rms(
         y=waveform.numpy().squeeze(), frame_length=2048, hop_length=512, center=True
     )[0]
+
     for i in range(len(loudness) - 1, 0, -1):
         if loudness[i] > threshold:
             break
 
-    silent_time = (len(loudness) - i) * 512 / sample_rate
+    end_silent_time = (len(loudness) - i) * 512 / sample_rate
 
-    if silent_time <= 0.3:
-        random_time = random.uniform(0.3, 0.7)
+    if end_silent_time <= 0.3:
+        random_time = random.uniform(0.3, 0.7) - end_silent_time
         waveform = F.pad(
             waveform, (0, int(random_time * sample_rate)), mode="constant", value=0
         )
 
+    for i in range(len(loudness)):
+        if loudness[i] > threshold:
+            break
+
+    start_silent_time = i * 512 / sample_rate
+
+    if start_silent_time > 0.02:
+        waveform = waveform[:, int((start_silent_time - 0.02) * sample_rate) :]
+
     torchaudio.save(uri=str(file), src=waveform, sample_rate=sample_rate)
 
 
xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py
@@ -42,7 +42,7 @@ logger.add(sys.stderr, format=logger_format)
 @lru_cache(maxsize=1)
 def get_model(
     config_name: str = "firefly_gan_vq",
-    checkpoint_path: str = "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+    checkpoint_path: str = "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
     device: str | torch.device = "cuda",
 ):
     with initialize(version_base="1.3", config_path="../../fish_speech/configs"):
@@ -133,7 +133,7 @@ def process_batch(files: list[Path], model) -> float:
 @click.option("--config-name", default="firefly_gan_vq")
 @click.option(
     "--checkpoint-path",
-    default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+    default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
 )
 @click.option("--batch-size", default=64)
 @click.option("--filelist", default=None, type=Path)
xinference/thirdparty/fish_speech/tools/vqgan/inference.py
@@ -59,7 +59,7 @@ def load_model(config_name, checkpoint_path, device="cuda"):
 @click.option("--config-name", default="firefly_gan_vq")
 @click.option(
     "--checkpoint-path",
-    default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+    default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
 )
 @click.option(
     "--device",
@@ -103,7 +103,9 @@ def main(input_path, output_path, config_name, checkpoint_path, device):
 
     # Restore
     feature_lengths = torch.tensor([indices.shape[1]], device=device)
-    fake_audios = model.decode(indices=indices[None], feature_lengths=feature_lengths)
+    fake_audios, _ = model.decode(
+        indices=indices[None], feature_lengths=feature_lengths
+    )
     audio_time = fake_audios.shape[-1] / model.spec_transform.sample_rate
 
     logger.info(
xinference/thirdparty/fish_speech/tools/webui.py
@@ -23,7 +23,6 @@ from fish_speech.i18n import i18n
 from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
 from fish_speech.utils import autocast_exclude_mps
 from tools.api import decode_vq_tokens, encode_reference
-from tools.auto_rerank import batch_asr, calculate_wer, is_chinese, load_model
 from tools.llama.generate import (
     GenerateRequest,
     GenerateResponse,
@@ -40,9 +39,9 @@ HEADER_MD = f"""# Fish Speech
 
 {i18n("A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).")}
 
-{i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).")}
+{i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.4).")}
 
-{i18n("Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.")}
+{i18n("Related code and weights are released under CC BY-NC-SA 4.0 License.")}
 
 {i18n("We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.")}
 """
@@ -160,66 +159,6 @@ def inference(
         gc.collect()
 
 
-def inference_with_auto_rerank(
-    text,
-    enable_reference_audio,
-    reference_audio,
-    reference_text,
-    max_new_tokens,
-    chunk_length,
-    top_p,
-    repetition_penalty,
-    temperature,
-    use_auto_rerank,
-    streaming=False,
-):
-
-    max_attempts = 2 if use_auto_rerank else 1
-    best_wer = float("inf")
-    best_audio = None
-    best_sample_rate = None
-
-    for attempt in range(max_attempts):
-        audio_generator = inference(
-            text,
-            enable_reference_audio,
-            reference_audio,
-            reference_text,
-            max_new_tokens,
-            chunk_length,
-            top_p,
-            repetition_penalty,
-            temperature,
-            streaming=False,
-        )
-
-        # 获取音频数据
-        for _ in audio_generator:
-            pass
-        _, (sample_rate, audio), message = _
-
-        if audio is None:
-            return None, None, message
-
-        if not use_auto_rerank:
-            return None, (sample_rate, audio), None
-
-        asr_result = batch_asr(asr_model, [audio], sample_rate)[0]
-        wer = calculate_wer(text, asr_result["text"])
-        if wer <= 0.3 and not asr_result["huge_gap"]:
-            return None, (sample_rate, audio), None
-
-        if wer < best_wer:
-            best_wer = wer
-            best_audio = audio
-            best_sample_rate = sample_rate
-
-        if attempt == max_attempts - 1:
-            break
-
-    return None, (best_sample_rate, best_audio), None
-
-
 inference_stream = partial(inference, streaming=True)
 
 n_audios = 4
@@ -239,13 +178,12 @@ def inference_wrapper(
     repetition_penalty,
     temperature,
     batch_infer_num,
-    if_load_asr_model,
 ):
     audios = []
     errors = []
 
     for _ in range(batch_infer_num):
-        result = inference_with_auto_rerank(
+        result = inference(
             text,
             enable_reference_audio,
             reference_audio,
@@ -255,10 +193,9 @@
             top_p,
             repetition_penalty,
             temperature,
-            if_load_asr_model,
         )
 
-        _, audio_data, error_message = result
+        _, audio_data, error_message = next(result)
 
         audios.append(
             gr.Audio(value=audio_data if audio_data else None, visible=True),
@@ -301,42 +238,6 @@ def normalize_text(user_input, use_normalization):
 asr_model = None
 
 
-def change_if_load_asr_model(if_load):
-    global asr_model
-
-    if if_load:
-        gr.Warning("Loading faster whisper model...")
-        if asr_model is None:
-            asr_model = load_model()
-        return gr.Checkbox(label="Unload faster whisper model", value=if_load)
-
-    if if_load is False:
-        gr.Warning("Unloading faster whisper model...")
-        del asr_model
-        asr_model = None
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        gc.collect()
-        return gr.Checkbox(label="Load faster whisper model", value=if_load)
-
-
-def change_if_auto_label(if_load, if_auto_label, enable_ref, ref_audio, ref_text):
-    if if_load and asr_model is not None:
-        if (
-            if_auto_label
-            and enable_ref
-            and ref_audio is not None
-            and ref_text.strip() == ""
-        ):
-            data, sample_rate = librosa.load(ref_audio)
-            res = batch_asr(asr_model, [data], sample_rate)[0]
-            ref_text = res["text"]
-    else:
-        gr.Warning("Whisper model not loaded!")
-
-    return gr.Textbox(value=ref_text)
-
-
 def build_app():
     with gr.Blocks(theme=gr.themes.Base()) as app:
         gr.Markdown(HEADER_MD)
@@ -367,23 +268,17 @@
                with gr.Row():
                    if_refine_text = gr.Checkbox(
                        label=i18n("Text Normalization"),
-                        value=True,
-                        scale=1,
-                    )
-
-                    if_load_asr_model = gr.Checkbox(
-                        label=i18n("Load / Unload ASR model for auto-reranking"),
                        value=False,
-                        scale=3,
+                        scale=1,
                    )
 
                with gr.Row():
                    with gr.Tab(label=i18n("Advanced Config")):
                        chunk_length = gr.Slider(
                            label=i18n("Iterative Prompt Length, 0 means off"),
-                            minimum=0,
-                            maximum=500,
-                            value=100,
+                            minimum=50,
+                            maximum=300,
+                            value=200,
                            step=8,
                        )
 
@@ -434,12 +329,6 @@
                            type="filepath",
                        )
                        with gr.Row():
-                            if_auto_label = gr.Checkbox(
-                                label=i18n("Auto Labeling"),
-                                min_width=100,
-                                scale=0,
-                                value=False,
-                            )
                            reference_text = gr.Textbox(
                                label=i18n("Reference Text"),
                                lines=1,
@@ -494,28 +383,6 @@
            fn=normalize_text, inputs=[text, if_refine_text], outputs=[refined_text]
        )
 
-        if_load_asr_model.change(
-            fn=change_if_load_asr_model,
-            inputs=[if_load_asr_model],
-            outputs=[if_load_asr_model],
-        )
-
-        if_auto_label.change(
-            fn=lambda: gr.Textbox(value=""),
-            inputs=[],
-            outputs=[reference_text],
-        ).then(
-            fn=change_if_auto_label,
-            inputs=[
-                if_load_asr_model,
-                if_auto_label,
-                enable_reference_audio,
-                reference_audio,
-                reference_text,
-            ],
-            outputs=[reference_text],
-        )
-
        # # Submit
        generate.click(
            inference_wrapper,
@@ -530,7 +397,6 @@
                repetition_penalty,
                temperature,
                batch_infer_num,
-                if_load_asr_model,
            ],
            [stream_audio, *global_audio_list, *global_error_list],
            concurrency_limit=1,
@@ -560,12 +426,12 @@ def parse_args():
    parser.add_argument(
        "--llama-checkpoint-path",
        type=Path,
-        default="checkpoints/fish-speech-1.2-sft",
+        default="checkpoints/fish-speech-1.4",
    )
    parser.add_argument(
        "--decoder-checkpoint-path",
        type=Path,
-        default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+        default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
    )
    parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
    parser.add_argument("--device", type=str, default="cuda")
@@ -605,8 +471,8 @@ if __name__ == "__main__":
            enable_reference_audio=False,
            reference_audio=None,
            reference_text="",
-            max_new_tokens=0,
-            chunk_length=100,
+            max_new_tokens=1024,
+            chunk_length=200,
            top_p=0.7,
            repetition_penalty=1.2,
            temperature=0.7,
xinference/thirdparty/matcha/VERSION
@@ -0,0 +1 @@
+0.0.7.0
xinference/thirdparty/matcha/hifigan/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 Jungil Kong
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.