xinference 0.14.2__py3-none-any.whl → 0.14.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (191) hide show
  1. xinference/_version.py +3 -3
  2. xinference/core/chat_interface.py +1 -1
  3. xinference/core/image_interface.py +9 -0
  4. xinference/core/model.py +4 -1
  5. xinference/core/worker.py +60 -44
  6. xinference/model/audio/chattts.py +25 -9
  7. xinference/model/audio/core.py +8 -2
  8. xinference/model/audio/cosyvoice.py +4 -3
  9. xinference/model/audio/custom.py +4 -5
  10. xinference/model/audio/fish_speech.py +228 -0
  11. xinference/model/audio/model_spec.json +8 -0
  12. xinference/model/embedding/core.py +25 -1
  13. xinference/model/embedding/custom.py +4 -5
  14. xinference/model/flexible/core.py +5 -1
  15. xinference/model/image/custom.py +4 -5
  16. xinference/model/image/model_spec.json +2 -1
  17. xinference/model/image/model_spec_modelscope.json +2 -1
  18. xinference/model/image/stable_diffusion/core.py +66 -3
  19. xinference/model/llm/__init__.py +6 -0
  20. xinference/model/llm/llm_family.json +54 -9
  21. xinference/model/llm/llm_family.py +7 -6
  22. xinference/model/llm/llm_family_modelscope.json +56 -10
  23. xinference/model/llm/lmdeploy/__init__.py +0 -0
  24. xinference/model/llm/lmdeploy/core.py +557 -0
  25. xinference/model/llm/sglang/core.py +7 -1
  26. xinference/model/llm/transformers/cogvlm2.py +4 -45
  27. xinference/model/llm/transformers/cogvlm2_video.py +524 -0
  28. xinference/model/llm/transformers/core.py +3 -0
  29. xinference/model/llm/transformers/glm4v.py +2 -23
  30. xinference/model/llm/transformers/intern_vl.py +94 -11
  31. xinference/model/llm/transformers/minicpmv25.py +2 -23
  32. xinference/model/llm/transformers/minicpmv26.py +2 -22
  33. xinference/model/llm/transformers/yi_vl.py +2 -24
  34. xinference/model/llm/utils.py +13 -1
  35. xinference/model/llm/vllm/core.py +1 -34
  36. xinference/model/rerank/custom.py +4 -5
  37. xinference/model/utils.py +41 -1
  38. xinference/model/video/core.py +3 -1
  39. xinference/model/video/diffusers.py +41 -38
  40. xinference/model/video/model_spec.json +24 -1
  41. xinference/model/video/model_spec_modelscope.json +25 -1
  42. xinference/thirdparty/fish_speech/__init__.py +0 -0
  43. xinference/thirdparty/fish_speech/fish_speech/__init__.py +0 -0
  44. xinference/thirdparty/fish_speech/fish_speech/callbacks/__init__.py +3 -0
  45. xinference/thirdparty/fish_speech/fish_speech/callbacks/grad_norm.py +113 -0
  46. xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
  47. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  48. xinference/thirdparty/fish_speech/fish_speech/conversation.py +2 -0
  49. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  50. xinference/thirdparty/fish_speech/fish_speech/datasets/concat_repeat.py +53 -0
  51. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  52. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_pb2.py +33 -0
  53. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_stream.py +36 -0
  54. xinference/thirdparty/fish_speech/fish_speech/datasets/semantic.py +496 -0
  55. xinference/thirdparty/fish_speech/fish_speech/datasets/vqgan.py +147 -0
  56. xinference/thirdparty/fish_speech/fish_speech/i18n/__init__.py +3 -0
  57. xinference/thirdparty/fish_speech/fish_speech/i18n/core.py +40 -0
  58. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  59. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +122 -0
  60. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +122 -0
  61. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +123 -0
  62. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +133 -0
  63. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +122 -0
  64. xinference/thirdparty/fish_speech/fish_speech/i18n/scan.py +122 -0
  65. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  66. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/__init__.py +0 -0
  67. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lit_module.py +202 -0
  68. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +779 -0
  69. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lora.py +92 -0
  70. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +3 -0
  71. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +442 -0
  72. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  73. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +44 -0
  74. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +625 -0
  75. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +139 -0
  76. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +115 -0
  77. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +225 -0
  78. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/utils.py +94 -0
  79. xinference/thirdparty/fish_speech/fish_speech/scheduler.py +40 -0
  80. xinference/thirdparty/fish_speech/fish_speech/text/__init__.py +4 -0
  81. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/__init__.py +0 -0
  82. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_class.py +172 -0
  83. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_constant.py +30 -0
  84. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_util.py +342 -0
  85. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/cardinal.py +32 -0
  86. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/date.py +75 -0
  87. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/digit.py +32 -0
  88. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/fraction.py +35 -0
  89. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/money.py +43 -0
  90. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/percentage.py +33 -0
  91. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/telephone.py +51 -0
  92. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +177 -0
  93. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +69 -0
  94. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +130 -0
  95. xinference/thirdparty/fish_speech/fish_speech/train.py +139 -0
  96. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +23 -0
  97. xinference/thirdparty/fish_speech/fish_speech/utils/braceexpand.py +217 -0
  98. xinference/thirdparty/fish_speech/fish_speech/utils/context.py +13 -0
  99. xinference/thirdparty/fish_speech/fish_speech/utils/file.py +16 -0
  100. xinference/thirdparty/fish_speech/fish_speech/utils/instantiators.py +50 -0
  101. xinference/thirdparty/fish_speech/fish_speech/utils/logger.py +55 -0
  102. xinference/thirdparty/fish_speech/fish_speech/utils/logging_utils.py +48 -0
  103. xinference/thirdparty/fish_speech/fish_speech/utils/rich_utils.py +100 -0
  104. xinference/thirdparty/fish_speech/fish_speech/utils/spectrogram.py +122 -0
  105. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +114 -0
  106. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  107. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +120 -0
  108. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1237 -0
  109. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  110. xinference/thirdparty/fish_speech/tools/api.py +495 -0
  111. xinference/thirdparty/fish_speech/tools/auto_rerank.py +159 -0
  112. xinference/thirdparty/fish_speech/tools/download_models.py +55 -0
  113. xinference/thirdparty/fish_speech/tools/extract_model.py +21 -0
  114. xinference/thirdparty/fish_speech/tools/file.py +108 -0
  115. xinference/thirdparty/fish_speech/tools/gen_ref.py +36 -0
  116. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  117. xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +169 -0
  118. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +171 -0
  119. xinference/thirdparty/fish_speech/tools/llama/generate.py +698 -0
  120. xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +95 -0
  121. xinference/thirdparty/fish_speech/tools/llama/quantize.py +497 -0
  122. xinference/thirdparty/fish_speech/tools/llama/rebuild_tokenizer.py +57 -0
  123. xinference/thirdparty/fish_speech/tools/merge_asr_files.py +55 -0
  124. xinference/thirdparty/fish_speech/tools/post_api.py +164 -0
  125. xinference/thirdparty/fish_speech/tools/sensevoice/__init__.py +0 -0
  126. xinference/thirdparty/fish_speech/tools/sensevoice/auto_model.py +573 -0
  127. xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +332 -0
  128. xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py +61 -0
  129. xinference/thirdparty/fish_speech/tools/smart_pad.py +47 -0
  130. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  131. xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py +83 -0
  132. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +227 -0
  133. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +120 -0
  134. xinference/thirdparty/fish_speech/tools/webui.py +619 -0
  135. xinference/thirdparty/fish_speech/tools/whisper_asr.py +176 -0
  136. xinference/thirdparty/matcha/__init__.py +0 -0
  137. xinference/thirdparty/matcha/app.py +357 -0
  138. xinference/thirdparty/matcha/cli.py +419 -0
  139. xinference/thirdparty/matcha/data/__init__.py +0 -0
  140. xinference/thirdparty/matcha/data/components/__init__.py +0 -0
  141. xinference/thirdparty/matcha/data/text_mel_datamodule.py +274 -0
  142. xinference/thirdparty/matcha/hifigan/__init__.py +0 -0
  143. xinference/thirdparty/matcha/hifigan/config.py +28 -0
  144. xinference/thirdparty/matcha/hifigan/denoiser.py +64 -0
  145. xinference/thirdparty/matcha/hifigan/env.py +17 -0
  146. xinference/thirdparty/matcha/hifigan/meldataset.py +217 -0
  147. xinference/thirdparty/matcha/hifigan/models.py +368 -0
  148. xinference/thirdparty/matcha/hifigan/xutils.py +60 -0
  149. xinference/thirdparty/matcha/models/__init__.py +0 -0
  150. xinference/thirdparty/matcha/models/baselightningmodule.py +210 -0
  151. xinference/thirdparty/matcha/models/components/__init__.py +0 -0
  152. xinference/thirdparty/matcha/models/components/decoder.py +443 -0
  153. xinference/thirdparty/matcha/models/components/flow_matching.py +132 -0
  154. xinference/thirdparty/matcha/models/components/text_encoder.py +410 -0
  155. xinference/thirdparty/matcha/models/components/transformer.py +316 -0
  156. xinference/thirdparty/matcha/models/matcha_tts.py +244 -0
  157. xinference/thirdparty/matcha/onnx/__init__.py +0 -0
  158. xinference/thirdparty/matcha/onnx/export.py +181 -0
  159. xinference/thirdparty/matcha/onnx/infer.py +168 -0
  160. xinference/thirdparty/matcha/text/__init__.py +53 -0
  161. xinference/thirdparty/matcha/text/cleaners.py +121 -0
  162. xinference/thirdparty/matcha/text/numbers.py +71 -0
  163. xinference/thirdparty/matcha/text/symbols.py +17 -0
  164. xinference/thirdparty/matcha/train.py +122 -0
  165. xinference/thirdparty/matcha/utils/__init__.py +5 -0
  166. xinference/thirdparty/matcha/utils/audio.py +82 -0
  167. xinference/thirdparty/matcha/utils/generate_data_statistics.py +112 -0
  168. xinference/thirdparty/matcha/utils/get_durations_from_trained_model.py +195 -0
  169. xinference/thirdparty/matcha/utils/instantiators.py +56 -0
  170. xinference/thirdparty/matcha/utils/logging_utils.py +53 -0
  171. xinference/thirdparty/matcha/utils/model.py +90 -0
  172. xinference/thirdparty/matcha/utils/monotonic_align/__init__.py +22 -0
  173. xinference/thirdparty/matcha/utils/monotonic_align/core.pyx +47 -0
  174. xinference/thirdparty/matcha/utils/monotonic_align/setup.py +7 -0
  175. xinference/thirdparty/matcha/utils/pylogger.py +21 -0
  176. xinference/thirdparty/matcha/utils/rich_utils.py +101 -0
  177. xinference/thirdparty/matcha/utils/utils.py +259 -0
  178. xinference/web/ui/build/asset-manifest.json +3 -3
  179. xinference/web/ui/build/index.html +1 -1
  180. xinference/web/ui/build/static/js/{main.ffc26121.js → main.661c7b0a.js} +3 -3
  181. xinference/web/ui/build/static/js/main.661c7b0a.js.map +1 -0
  182. xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +1 -0
  183. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/METADATA +31 -11
  184. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/RECORD +189 -49
  185. xinference/web/ui/build/static/js/main.ffc26121.js.map +0 -1
  186. xinference/web/ui/node_modules/.cache/babel-loader/2f40209b32e7e46a2eab6b8c8a355eb42c3caa8bc3228dd929f32fd2b3940294.json +0 -1
  187. /xinference/web/ui/build/static/js/{main.ffc26121.js.LICENSE.txt → main.661c7b0a.js.LICENSE.txt} +0 -0
  188. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/LICENSE +0 -0
  189. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/WHEEL +0 -0
  190. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/entry_points.txt +0 -0
  191. {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,176 @@
1
+ """
2
+ Used to transcribe all audio files in one folder into another folder.
3
+ e.g.
4
+ Directory structure:
5
+ --pre_data_root
6
+ ----SP_1
7
+ ------01.wav
8
+ ------02.wav
9
+ ------......
10
+ ----SP_2
11
+ ------01.wav
12
+ ------02.wav
13
+ ------......
14
+ Use
15
+ python tools/whisper_asr.py --audio-dir pre_data_root/SP_1 --save-dir data/SP_1
16
+ to transcribe the first speaker.
17
+
18
+ Use
19
+ python tools/whisper_asr.py --audio-dir pre_data_root/SP_2 --save-dir data/SP_2
20
+ to transcribe the second speaker.
21
+
22
+ Note: Be aware of your audio sample rate, which defaults to 44.1kHz.
23
+ """
24
+
25
+ import re
26
+ from pathlib import Path
27
+
28
+ import click
29
+ import soundfile as sf
30
+ from faster_whisper import WhisperModel
31
+ from loguru import logger
32
+ from pydub import AudioSegment
33
+ from tqdm import tqdm
34
+
35
+ from tools.file import AUDIO_EXTENSIONS, list_files
36
+
37
+
38
+ @click.command()
39
+ @click.option("--model-size", default="large-v3", help="Size of the Whisper model")
40
+ @click.option(
41
+ "--compute-type",
42
+ default="float16",
43
+ help="Computation Precision of the Whisper model [float16 / int8_float16 / int8]",
44
+ )
45
+ @click.option("--audio-dir", required=True, help="Directory containing audio files")
46
+ @click.option(
47
+ "--save-dir", required=True, help="Directory to save processed audio files"
48
+ )
49
+ @click.option(
50
+ "--sample-rate",
51
+ default=44100,
52
+ type=int,
53
+ help="Output sample rate, default to input sample rate",
54
+ )
55
+ @click.option("--device", default="cuda", help="Device to use [cuda / cpu]")
56
+ @click.option("--language", default="auto", help="Language of the transcription")
57
+ @click.option("--initial-prompt", default=None, help="Initial prompt for transcribing")
58
+ def main(
59
+ model_size,
60
+ compute_type,
61
+ audio_dir,
62
+ save_dir,
63
+ sample_rate,
64
+ device,
65
+ language,
66
+ initial_prompt,
67
+ ):
68
+ logger.info("Loading / Downloading Faster Whisper model...")
69
+
70
+ model = WhisperModel(
71
+ model_size,
72
+ device=device,
73
+ compute_type=compute_type,
74
+ download_root="faster_whisper",
75
+ )
76
+
77
+ logger.info("Model loaded.")
78
+
79
+ save_path = Path(save_dir)
80
+ save_path.mkdir(parents=True, exist_ok=True)
81
+
82
+ audio_files = list_files(
83
+ path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
84
+ )
85
+
86
+ for file_path in tqdm(audio_files, desc="Processing audio file"):
87
+ file_stem = file_path.stem
88
+ file_suffix = file_path.suffix
89
+
90
+ rel_path = Path(file_path).relative_to(audio_dir)
91
+ (save_path / rel_path.parent).mkdir(parents=True, exist_ok=True)
92
+
93
+ audio = AudioSegment.from_file(file_path)
94
+
95
+ segments, info = model.transcribe(
96
+ file_path,
97
+ beam_size=5,
98
+ language=None if language == "auto" else language,
99
+ initial_prompt=initial_prompt,
100
+ )
101
+
102
+ print(
103
+ "Detected language '%s' with probability %f"
104
+ % (info.language, info.language_probability)
105
+ )
106
+ print("Total len(ms): ", len(audio))
107
+
108
+ whole_text = None
109
+ for segment in segments:
110
+ id, start, end, text = (
111
+ segment.id,
112
+ segment.start,
113
+ segment.end,
114
+ segment.text,
115
+ )
116
+ print("Segment %03d [%.2fs -> %.2fs] %s" % (id, start, end, text))
117
+ if not whole_text:
118
+ whole_text = text
119
+ else:
120
+ whole_text += ", " + text
121
+
122
+ whole_text += "."
123
+
124
+ audio_save_path = save_path / rel_path.parent / f"{file_stem}{file_suffix}"
125
+ audio.export(audio_save_path, format=file_suffix[1:])
126
+ print(f"Exported {audio_save_path}")
127
+
128
+ transcript_save_path = save_path / rel_path.parent / f"{file_stem}.lab"
129
+ with open(
130
+ transcript_save_path,
131
+ "w",
132
+ encoding="utf-8",
133
+ ) as f:
134
+ f.write(whole_text)
135
+
136
+
137
+ if __name__ == "__main__":
138
+ main()
139
+ exit(0)
140
+
141
+ audio = AudioSegment.from_wav(
142
+ r"D:\PythonProject\原神语音中文\胡桃\vo_hutao_draw_appear.wav"
143
+ )
144
+
145
+ model_size = "large-v3"
146
+
147
+ model = WhisperModel(
148
+ model_size,
149
+ device="cuda",
150
+ compute_type="float16",
151
+ download_root="faster_whisper",
152
+ )
153
+
154
+ segments, info = model.transcribe(
155
+ r"D:\PythonProject\原神语音中文\胡桃\vo_hutao_draw_appear.wav",
156
+ beam_size=5,
157
+ )
158
+
159
+ print(
160
+ "Detected language '%s' with probability %f"
161
+ % (info.language, info.language_probability)
162
+ )
163
+ print("Total len(ms): ", len(audio))
164
+
165
+ for i, segment in enumerate(segments):
166
+ print(
167
+ "Segment %03d [%.2fs -> %.2fs] %s"
168
+ % (i, segment.start, segment.end, segment.text)
169
+ )
170
+ start_ms = int(segment.start * 1000)
171
+ end_ms = int(segment.end * 1000)
172
+ segment_audio = audio[start_ms:end_ms]
173
+ segment_audio.export(f"segment_{i:03d}.wav", format="wav")
174
+ print(f"Exported segment_{i:03d}.wav")
175
+
176
+ print("All segments have been exported.")
File without changes
@@ -0,0 +1,357 @@
1
+ import tempfile
2
+ from argparse import Namespace
3
+ from pathlib import Path
4
+
5
+ import gradio as gr
6
+ import soundfile as sf
7
+ import torch
8
+
9
+ from matcha.cli import (
10
+ MATCHA_URLS,
11
+ VOCODER_URLS,
12
+ assert_model_downloaded,
13
+ get_device,
14
+ load_matcha,
15
+ load_vocoder,
16
+ process_text,
17
+ to_waveform,
18
+ )
19
+ from matcha.utils.utils import get_user_data_dir, plot_tensor
20
+
21
+ LOCATION = Path(get_user_data_dir())
22
+
23
+ args = Namespace(
24
+ cpu=False,
25
+ model="matcha_vctk",
26
+ vocoder="hifigan_univ_v1",
27
+ spk=0,
28
+ )
29
+
30
+ CURRENTLY_LOADED_MODEL = args.model
31
+
32
+
33
+ def MATCHA_TTS_LOC(x):
34
+ return LOCATION / f"{x}.ckpt"
35
+
36
+
37
+ def VOCODER_LOC(x):
38
+ return LOCATION / f"{x}"
39
+
40
+
41
+ LOGO_URL = "https://shivammehta25.github.io/Matcha-TTS/images/logo.png"
42
+ RADIO_OPTIONS = {
43
+ "Multi Speaker (VCTK)": {
44
+ "model": "matcha_vctk",
45
+ "vocoder": "hifigan_univ_v1",
46
+ },
47
+ "Single Speaker (LJ Speech)": {
48
+ "model": "matcha_ljspeech",
49
+ "vocoder": "hifigan_T2_v1",
50
+ },
51
+ }
52
+
53
+ # Ensure all the required models are downloaded
54
+ assert_model_downloaded(MATCHA_TTS_LOC("matcha_ljspeech"), MATCHA_URLS["matcha_ljspeech"])
55
+ assert_model_downloaded(VOCODER_LOC("hifigan_T2_v1"), VOCODER_URLS["hifigan_T2_v1"])
56
+ assert_model_downloaded(MATCHA_TTS_LOC("matcha_vctk"), MATCHA_URLS["matcha_vctk"])
57
+ assert_model_downloaded(VOCODER_LOC("hifigan_univ_v1"), VOCODER_URLS["hifigan_univ_v1"])
58
+
59
+ device = get_device(args)
60
+
61
+ # Load default model
62
+ model = load_matcha(args.model, MATCHA_TTS_LOC(args.model), device)
63
+ vocoder, denoiser = load_vocoder(args.vocoder, VOCODER_LOC(args.vocoder), device)
64
+
65
+
66
+ def load_model(model_name, vocoder_name):
67
+ model = load_matcha(model_name, MATCHA_TTS_LOC(model_name), device)
68
+ vocoder, denoiser = load_vocoder(vocoder_name, VOCODER_LOC(vocoder_name), device)
69
+ return model, vocoder, denoiser
70
+
71
+
72
+ def load_model_ui(model_type, textbox):
73
+ model_name, vocoder_name = RADIO_OPTIONS[model_type]["model"], RADIO_OPTIONS[model_type]["vocoder"]
74
+
75
+ global model, vocoder, denoiser, CURRENTLY_LOADED_MODEL # pylint: disable=global-statement
76
+ if CURRENTLY_LOADED_MODEL != model_name:
77
+ model, vocoder, denoiser = load_model(model_name, vocoder_name)
78
+ CURRENTLY_LOADED_MODEL = model_name
79
+
80
+ if model_name == "matcha_ljspeech":
81
+ spk_slider = gr.update(visible=False, value=-1)
82
+ single_speaker_examples = gr.update(visible=True)
83
+ multi_speaker_examples = gr.update(visible=False)
84
+ length_scale = gr.update(value=0.95)
85
+ else:
86
+ spk_slider = gr.update(visible=True, value=0)
87
+ single_speaker_examples = gr.update(visible=False)
88
+ multi_speaker_examples = gr.update(visible=True)
89
+ length_scale = gr.update(value=0.85)
90
+
91
+ return (
92
+ textbox,
93
+ gr.update(interactive=True),
94
+ spk_slider,
95
+ single_speaker_examples,
96
+ multi_speaker_examples,
97
+ length_scale,
98
+ )
99
+
100
+
101
+ @torch.inference_mode()
102
+ def process_text_gradio(text):
103
+ output = process_text(1, text, device)
104
+ return output["x_phones"][1::2], output["x"], output["x_lengths"]
105
+
106
+
107
+ @torch.inference_mode()
108
+ def synthesise_mel(text, text_length, n_timesteps, temperature, length_scale, spk):
109
+ spk = torch.tensor([spk], device=device, dtype=torch.long) if spk >= 0 else None
110
+ output = model.synthesise(
111
+ text,
112
+ text_length,
113
+ n_timesteps=n_timesteps,
114
+ temperature=temperature,
115
+ spks=spk,
116
+ length_scale=length_scale,
117
+ )
118
+ output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)
119
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
120
+ sf.write(fp.name, output["waveform"], 22050, "PCM_24")
121
+
122
+ return fp.name, plot_tensor(output["mel"].squeeze().cpu().numpy())
123
+
124
+
125
+ def multispeaker_example_cacher(text, n_timesteps, mel_temp, length_scale, spk):
126
+ global CURRENTLY_LOADED_MODEL # pylint: disable=global-statement
127
+ if CURRENTLY_LOADED_MODEL != "matcha_vctk":
128
+ global model, vocoder, denoiser # pylint: disable=global-statement
129
+ model, vocoder, denoiser = load_model("matcha_vctk", "hifigan_univ_v1")
130
+ CURRENTLY_LOADED_MODEL = "matcha_vctk"
131
+
132
+ phones, text, text_lengths = process_text_gradio(text)
133
+ audio, mel_spectrogram = synthesise_mel(text, text_lengths, n_timesteps, mel_temp, length_scale, spk)
134
+ return phones, audio, mel_spectrogram
135
+
136
+
137
+ def ljspeech_example_cacher(text, n_timesteps, mel_temp, length_scale, spk=-1):
138
+ global CURRENTLY_LOADED_MODEL # pylint: disable=global-statement
139
+ if CURRENTLY_LOADED_MODEL != "matcha_ljspeech":
140
+ global model, vocoder, denoiser # pylint: disable=global-statement
141
+ model, vocoder, denoiser = load_model("matcha_ljspeech", "hifigan_T2_v1")
142
+ CURRENTLY_LOADED_MODEL = "matcha_ljspeech"
143
+
144
+ phones, text, text_lengths = process_text_gradio(text)
145
+ audio, mel_spectrogram = synthesise_mel(text, text_lengths, n_timesteps, mel_temp, length_scale, spk)
146
+ return phones, audio, mel_spectrogram
147
+
148
+
149
+ def main():
150
+ description = """# 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching
151
+ ### [Shivam Mehta](https://www.kth.se/profile/smehta), [Ruibo Tu](https://www.kth.se/profile/ruibo), [Jonas Beskow](https://www.kth.se/profile/beskow), [Éva Székely](https://www.kth.se/profile/szekely), and [Gustav Eje Henter](https://people.kth.se/~ghe/)
152
+ We propose 🍵 Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up ODE-based speech synthesis. Our method:
153
+
154
+
155
+ * Is probabilistic
156
+ * Has compact memory footprint
157
+ * Sounds highly natural
158
+ * Is very fast to synthesise from
159
+
160
+
161
+ Check out our [demo page](https://shivammehta25.github.io/Matcha-TTS). Read our [arXiv preprint for more details](https://arxiv.org/abs/2309.03199).
162
+ Code is available in our [GitHub repository](https://github.com/shivammehta25/Matcha-TTS), along with pre-trained models.
163
+
164
+ Cached examples are available at the bottom of the page.
165
+ """
166
+
167
+ with gr.Blocks(title="🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching") as demo:
168
+ processed_text = gr.State(value=None)
169
+ processed_text_len = gr.State(value=None)
170
+
171
+ with gr.Box():
172
+ with gr.Row():
173
+ gr.Markdown(description, scale=3)
174
+ with gr.Column():
175
+ gr.Image(LOGO_URL, label="Matcha-TTS logo", height=50, width=50, scale=1, show_label=False)
176
+ html = '<br><iframe width="560" height="315" src="https://www.youtube.com/embed/xmvJkz3bqw0?si=jN7ILyDsbPwJCGoa" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>'
177
+ gr.HTML(html)
178
+
179
+ with gr.Box():
180
+ radio_options = list(RADIO_OPTIONS.keys())
181
+ model_type = gr.Radio(
182
+ radio_options, value=radio_options[0], label="Choose a Model", interactive=True, container=False
183
+ )
184
+
185
+ with gr.Row():
186
+ gr.Markdown("# Text Input")
187
+ with gr.Row():
188
+ text = gr.Textbox(value="", lines=2, label="Text to synthesise", scale=3)
189
+ spk_slider = gr.Slider(
190
+ minimum=0, maximum=107, step=1, value=args.spk, label="Speaker ID", interactive=True, scale=1
191
+ )
192
+
193
+ with gr.Row():
194
+ gr.Markdown("### Hyper parameters")
195
+ with gr.Row():
196
+ n_timesteps = gr.Slider(
197
+ label="Number of ODE steps",
198
+ minimum=1,
199
+ maximum=100,
200
+ step=1,
201
+ value=10,
202
+ interactive=True,
203
+ )
204
+ length_scale = gr.Slider(
205
+ label="Length scale (Speaking rate)",
206
+ minimum=0.5,
207
+ maximum=1.5,
208
+ step=0.05,
209
+ value=1.0,
210
+ interactive=True,
211
+ )
212
+ mel_temp = gr.Slider(
213
+ label="Sampling temperature",
214
+ minimum=0.00,
215
+ maximum=2.001,
216
+ step=0.16675,
217
+ value=0.667,
218
+ interactive=True,
219
+ )
220
+
221
+ synth_btn = gr.Button("Synthesise")
222
+
223
+ with gr.Box():
224
+ with gr.Row():
225
+ gr.Markdown("### Phonetised text")
226
+ phonetised_text = gr.Textbox(interactive=False, scale=10, label="Phonetised text")
227
+
228
+ with gr.Box():
229
+ with gr.Row():
230
+ mel_spectrogram = gr.Image(interactive=False, label="mel spectrogram")
231
+
232
+ # with gr.Row():
233
+ audio = gr.Audio(interactive=False, label="Audio")
234
+
235
+ with gr.Row(visible=False) as example_row_lj_speech:
236
+ examples = gr.Examples( # pylint: disable=unused-variable
237
+ examples=[
238
+ [
239
+ "We propose Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up O D E-based speech synthesis.",
240
+ 50,
241
+ 0.677,
242
+ 0.95,
243
+ ],
244
+ [
245
+ "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.",
246
+ 2,
247
+ 0.677,
248
+ 0.95,
249
+ ],
250
+ [
251
+ "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.",
252
+ 4,
253
+ 0.677,
254
+ 0.95,
255
+ ],
256
+ [
257
+ "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.",
258
+ 10,
259
+ 0.677,
260
+ 0.95,
261
+ ],
262
+ [
263
+ "The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.",
264
+ 50,
265
+ 0.677,
266
+ 0.95,
267
+ ],
268
+ [
269
+ "The narrative of these events is based largely on the recollections of the participants.",
270
+ 10,
271
+ 0.677,
272
+ 0.95,
273
+ ],
274
+ [
275
+ "The jury did not believe him, and the verdict was for the defendants.",
276
+ 10,
277
+ 0.677,
278
+ 0.95,
279
+ ],
280
+ ],
281
+ fn=ljspeech_example_cacher,
282
+ inputs=[text, n_timesteps, mel_temp, length_scale],
283
+ outputs=[phonetised_text, audio, mel_spectrogram],
284
+ cache_examples=True,
285
+ )
286
+
287
+ with gr.Row() as example_row_multispeaker:
288
+ multi_speaker_examples = gr.Examples( # pylint: disable=unused-variable
289
+ examples=[
290
+ [
291
+ "Hello everyone! I am speaker 0 and I am here to tell you that Matcha-TTS is amazing!",
292
+ 10,
293
+ 0.677,
294
+ 0.85,
295
+ 0,
296
+ ],
297
+ [
298
+ "Hello everyone! I am speaker 16 and I am here to tell you that Matcha-TTS is amazing!",
299
+ 10,
300
+ 0.677,
301
+ 0.85,
302
+ 16,
303
+ ],
304
+ [
305
+ "Hello everyone! I am speaker 44 and I am here to tell you that Matcha-TTS is amazing!",
306
+ 50,
307
+ 0.677,
308
+ 0.85,
309
+ 44,
310
+ ],
311
+ [
312
+ "Hello everyone! I am speaker 45 and I am here to tell you that Matcha-TTS is amazing!",
313
+ 50,
314
+ 0.677,
315
+ 0.85,
316
+ 45,
317
+ ],
318
+ [
319
+ "Hello everyone! I am speaker 58 and I am here to tell you that Matcha-TTS is amazing!",
320
+ 4,
321
+ 0.677,
322
+ 0.85,
323
+ 58,
324
+ ],
325
+ ],
326
+ fn=multispeaker_example_cacher,
327
+ inputs=[text, n_timesteps, mel_temp, length_scale, spk_slider],
328
+ outputs=[phonetised_text, audio, mel_spectrogram],
329
+ cache_examples=True,
330
+ label="Multi Speaker Examples",
331
+ )
332
+
333
+ model_type.change(lambda x: gr.update(interactive=False), inputs=[synth_btn], outputs=[synth_btn]).then(
334
+ load_model_ui,
335
+ inputs=[model_type, text],
336
+ outputs=[text, synth_btn, spk_slider, example_row_lj_speech, example_row_multispeaker, length_scale],
337
+ )
338
+
339
+ synth_btn.click(
340
+ fn=process_text_gradio,
341
+ inputs=[
342
+ text,
343
+ ],
344
+ outputs=[phonetised_text, processed_text, processed_text_len],
345
+ api_name="matcha_tts",
346
+ queue=True,
347
+ ).then(
348
+ fn=synthesise_mel,
349
+ inputs=[processed_text, processed_text_len, n_timesteps, mel_temp, length_scale, spk_slider],
350
+ outputs=[audio, mel_spectrogram],
351
+ )
352
+
353
+ demo.queue().launch(share=True)
354
+
355
+
356
+ if __name__ == "__main__":
357
+ main()