xinference 0.15.0__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (83)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +204 -1
  3. xinference/client/restful/restful_client.py +4 -2
  4. xinference/core/image_interface.py +28 -0
  5. xinference/core/model.py +28 -0
  6. xinference/core/supervisor.py +6 -0
  7. xinference/model/audio/fish_speech.py +9 -9
  8. xinference/model/audio/model_spec.json +9 -9
  9. xinference/model/audio/whisper.py +4 -1
  10. xinference/model/image/core.py +2 -1
  11. xinference/model/image/model_spec.json +16 -4
  12. xinference/model/image/model_spec_modelscope.json +16 -4
  13. xinference/model/image/sdapi.py +136 -0
  14. xinference/model/image/stable_diffusion/core.py +148 -20
  15. xinference/model/llm/__init__.py +8 -0
  16. xinference/model/llm/llm_family.json +393 -0
  17. xinference/model/llm/llm_family.py +3 -1
  18. xinference/model/llm/llm_family_modelscope.json +408 -3
  19. xinference/model/llm/sglang/core.py +3 -0
  20. xinference/model/llm/transformers/chatglm.py +1 -1
  21. xinference/model/llm/transformers/core.py +6 -0
  22. xinference/model/llm/transformers/deepseek_v2.py +340 -0
  23. xinference/model/llm/transformers/qwen2_audio.py +168 -0
  24. xinference/model/llm/transformers/qwen2_vl.py +31 -5
  25. xinference/model/llm/utils.py +104 -84
  26. xinference/model/llm/vllm/core.py +8 -0
  27. xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +2 -3
  28. xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +1 -1
  29. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +1 -1
  30. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +1 -1
  31. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +1 -1
  32. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +1 -1
  33. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +1 -1
  34. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +2 -2
  35. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +0 -3
  36. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +169 -198
  37. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +4 -27
  38. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +9 -47
  39. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +2 -2
  40. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -0
  41. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +12 -10
  42. xinference/thirdparty/fish_speech/tools/api.py +79 -134
  43. xinference/thirdparty/fish_speech/tools/commons.py +35 -0
  44. xinference/thirdparty/fish_speech/tools/download_models.py +3 -3
  45. xinference/thirdparty/fish_speech/tools/file.py +17 -0
  46. xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +1 -1
  47. xinference/thirdparty/fish_speech/tools/llama/generate.py +29 -24
  48. xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +1 -1
  49. xinference/thirdparty/fish_speech/tools/llama/quantize.py +2 -2
  50. xinference/thirdparty/fish_speech/tools/msgpack_api.py +34 -0
  51. xinference/thirdparty/fish_speech/tools/post_api.py +85 -44
  52. xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +1 -1
  53. xinference/thirdparty/fish_speech/tools/smart_pad.py +16 -3
  54. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +2 -2
  55. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +4 -2
  56. xinference/thirdparty/fish_speech/tools/webui.py +12 -146
  57. xinference/types.py +7 -4
  58. xinference/web/ui/build/asset-manifest.json +6 -6
  59. xinference/web/ui/build/index.html +1 -1
  60. xinference/web/ui/build/static/css/{main.632e9148.css → main.5061c4c3.css} +2 -2
  61. xinference/web/ui/build/static/css/main.5061c4c3.css.map +1 -0
  62. xinference/web/ui/build/static/js/{main.9cfafbd6.js → main.754740c0.js} +3 -3
  63. xinference/web/ui/build/static/js/main.754740c0.js.map +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/cd90b08d177025dfe84209596fc51878f8a86bcaa6a240848a3d2e5fd4c7ff24.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +1 -0
  66. {xinference-0.15.0.dist-info → xinference-0.15.1.dist-info}/METADATA +9 -3
  67. {xinference-0.15.0.dist-info → xinference-0.15.1.dist-info}/RECORD +72 -74
  68. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +0 -442
  69. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +0 -44
  70. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +0 -115
  71. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +0 -225
  72. xinference/thirdparty/fish_speech/tools/auto_rerank.py +0 -159
  73. xinference/thirdparty/fish_speech/tools/gen_ref.py +0 -36
  74. xinference/thirdparty/fish_speech/tools/merge_asr_files.py +0 -55
  75. xinference/web/ui/build/static/css/main.632e9148.css.map +0 -1
  76. xinference/web/ui/build/static/js/main.9cfafbd6.js.map +0 -1
  77. xinference/web/ui/node_modules/.cache/babel-loader/01d6d198156bacbd436c51435edbd4b2cacd47a79db929105eba30f74b67d48d.json +0 -1
  78. xinference/web/ui/node_modules/.cache/babel-loader/59eb25f514afcc4fefd1b309d192b2455f1e0aec68a9de598ca4b2333fe2c774.json +0 -1
  79. /xinference/web/ui/build/static/js/{main.9cfafbd6.js.LICENSE.txt → main.754740c0.js.LICENSE.txt} +0 -0
  80. {xinference-0.15.0.dist-info → xinference-0.15.1.dist-info}/LICENSE +0 -0
  81. {xinference-0.15.0.dist-info → xinference-0.15.1.dist-info}/WHEEL +0 -0
  82. {xinference-0.15.0.dist-info → xinference-0.15.1.dist-info}/entry_points.txt +0 -0
  83. {xinference-0.15.0.dist-info → xinference-0.15.1.dist-info}/top_level.txt +0 -0
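Most of the user-visible changes listed above are new model registrations (deepseek_v2.py, qwen2_audio.py, qwen2_vl.py, the expanded llm_family.json) plus a larger RESTful API surface. As rough orientation only, here is a hedged sketch of how a newly registered family would typically be launched through the RESTful client that this release also touches; the model name, host, and port are assumptions for illustration, not values taken from this diff.

```python
# Hypothetical usage sketch: "qwen2-vl-instruct" and the endpoint are assumed,
# not taken from this diff. The client class is defined in
# xinference/client/restful/restful_client.py, which is modified in this release.
from xinference.client.restful.restful_client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # assumed local supervisor endpoint
model_uid = client.launch_model(model_name="qwen2-vl-instruct", model_type="LLM")
model = client.get_model(model_uid)
# The exact chat/generate signature depends on the client version; see
# restful_client.py in this diff for the authoritative interface.
```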
@@ -0,0 +1,34 @@
+ import httpx
+ import ormsgpack
+
+ from tools.commons import ServeReferenceAudio, ServeTTSRequest
+
+ # priority: ref_id > references
+ request = ServeTTSRequest(
+     text="你说的对, 但是原神是一款由米哈游自主研发的开放世界手游.",
+     # reference_id="114514",
+     references=[
+         ServeReferenceAudio(
+             audio=open("lengyue.wav", "rb").read(),
+             text=open("lengyue.lab", "r", encoding="utf-8").read(),
+         )
+     ],
+     streaming=True,
+ )
+
+ with (
+     httpx.Client() as client,
+     open("hello.wav", "wb") as f,
+ ):
+     with client.stream(
+         "POST",
+         "http://127.0.0.1:8080/v1/tts",
+         content=ormsgpack.packb(request, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
+         headers={
+             "authorization": "Bearer YOUR_API_KEY",
+             "content-type": "application/msgpack",
+         },
+         timeout=None,
+     ) as response:
+         for chunk in response.iter_bytes():
+             f.write(chunk)
@@ -1,40 +1,19 @@
  import argparse
  import base64
- import json
  import wave
- from pathlib import Path

+ import ormsgpack
  import pyaudio
  import requests
+ from pydub import AudioSegment
+ from pydub.playback import play

+ from tools.commons import ServeReferenceAudio, ServeTTSRequest
+ from tools.file import audio_to_bytes, read_ref_text

- def wav_to_base64(file_path):
-     if not file_path or not Path(file_path).exists():
-         return None
-     with open(file_path, "rb") as wav_file:
-         wav_content = wav_file.read()
-     base64_encoded = base64.b64encode(wav_content)
-     return base64_encoded.decode("utf-8")

+ def parse_args():

- def read_ref_text(ref_text):
-     path = Path(ref_text)
-     if path.exists() and path.is_file():
-         with path.open("r", encoding="utf-8") as file:
-             return file.read()
-     return ref_text
-
-
- def play_audio(audio_content, format, channels, rate):
-     p = pyaudio.PyAudio()
-     stream = p.open(format=format, channels=channels, rate=rate, output=True)
-     stream.write(audio_content)
-     stream.stop_stream()
-     stream.close()
-     p.terminate()
-
-
- if __name__ == "__main__":
      parser = argparse.ArgumentParser(
          description="Send a WAV file and text to a server and receive synthesized audio."
      )
@@ -43,16 +22,24 @@ if __name__ == "__main__":
          "--url",
          "-u",
          type=str,
-         default="http://127.0.0.1:8080/v1/invoke",
+         default="http://127.0.0.1:8080/v1/tts",
          help="URL of the server",
      )
      parser.add_argument(
          "--text", "-t", type=str, required=True, help="Text to be synthesized"
      )
+     parser.add_argument(
+         "--reference_id",
+         "-id",
+         type=str,
+         default=None,
+         help="ID of the reference model o be used for the speech",
+     )
      parser.add_argument(
          "--reference_audio",
          "-ra",
          type=str,
+         nargs="+",
          default=None,
          help="Path to the WAV file",
      )
@@ -60,9 +47,30 @@ if __name__ == "__main__":
          "--reference_text",
          "-rt",
          type=str,
+         nargs="+",
          default=None,
          help="Reference text for voice synthesis",
      )
+     parser.add_argument(
+         "--output",
+         "-o",
+         type=str,
+         default="generated_audio",
+         help="Output audio file name",
+     )
+     parser.add_argument(
+         "--play",
+         type=bool,
+         default=True,
+         help="Whether to play audio after receiving data",
+     )
+     parser.add_argument("--normalize", type=bool, default=True)
+     parser.add_argument(
+         "--format", type=str, choices=["wav", "mp3", "flac"], default="wav"
+     )
+     parser.add_argument("--mp3_bitrate", type=int, default=64)
+     parser.add_argument("--opus_bitrate", type=int, default=-1000)
+     parser.add_argument("--latency", type=str, default="normal", help="延迟选项")
      parser.add_argument(
          "--max_new_tokens",
          type=int,
@@ -88,7 +96,6 @@ if __name__ == "__main__":
          "--speaker", type=str, default=None, help="Speaker ID for voice synthesis"
      )
      parser.add_argument("--emotion", type=str, default=None, help="Speaker's Emotion")
-     parser.add_argument("--format", type=str, default="wav", help="Audio format")
      parser.add_argument(
          "--streaming", type=bool, default=False, help="Enable streaming response"
      )
@@ -97,18 +104,42 @@ if __name__ == "__main__":
      )
      parser.add_argument("--rate", type=int, default=44100, help="Sample rate for audio")

-     args = parser.parse_args()
+     return parser.parse_args()

-     base64_audio = wav_to_base64(args.reference_audio)

-     ref_text = args.reference_text
-     if ref_text:
-         ref_text = read_ref_text(ref_text)
+ if __name__ == "__main__":
+
+     args = parse_args()
+
+     idstr: str | None = args.reference_id
+     # priority: ref_id > [{text, audio},...]
+     if idstr is None:
+         ref_audios = args.reference_audio
+         ref_texts = args.reference_text
+         if ref_audios is None:
+             byte_audios = []
+         else:
+             byte_audios = [audio_to_bytes(ref_audio) for ref_audio in ref_audios]
+         if ref_texts is None:
+             ref_texts = []
+         else:
+             ref_texts = [read_ref_text(ref_text) for ref_text in ref_texts]
+     else:
+         byte_audios = []
+         ref_texts = []
+         pass # in api.py

      data = {
          "text": args.text,
-         "reference_text": ref_text,
-         "reference_audio": base64_audio,
+         "references": [
+             ServeReferenceAudio(audio=ref_audio, text=ref_text)
+             for ref_text, ref_audio in zip(ref_texts, byte_audios)
+         ],
+         "reference_id": idstr,
+         "normalize": args.normalize,
+         "format": args.format,
+         "mp3_bitrate": args.mp3_bitrate,
+         "opus_bitrate": args.opus_bitrate,
          "max_new_tokens": args.max_new_tokens,
          "chunk_length": args.chunk_length,
          "top_p": args.top_p,
@@ -116,22 +147,30 @@ if __name__ == "__main__":
          "temperature": args.temperature,
          "speaker": args.speaker,
          "emotion": args.emotion,
-         "format": args.format,
          "streaming": args.streaming,
      }

-     response = requests.post(args.url, json=data, stream=args.streaming)
+     pydantic_data = ServeTTSRequest(**data)

-     audio_format = pyaudio.paInt16 # Assuming 16-bit PCM format
+     response = requests.post(
+         args.url,
+         data=ormsgpack.packb(pydantic_data, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
+         stream=args.streaming,
+         headers={
+             "authorization": "Bearer YOUR_API_KEY",
+             "content-type": "application/msgpack",
+         },
+     )

      if response.status_code == 200:
          if args.streaming:
              p = pyaudio.PyAudio()
+             audio_format = pyaudio.paInt16 # Assuming 16-bit PCM format
              stream = p.open(
                  format=audio_format, channels=args.channels, rate=args.rate, output=True
              )

-             wf = wave.open("generated_audio.wav", "wb")
+             wf = wave.open(f"{args.output}.wav", "wb")
              wf.setnchannels(args.channels)
              wf.setsampwidth(p.get_sample_size(audio_format))
              wf.setframerate(args.rate)
@@ -153,12 +192,14 @@ if __name__ == "__main__":
              wf.close()
          else:
              audio_content = response.content
-
-             with open("generated_audio.wav", "wb") as audio_file:
+             audio_path = f"{args.output}.{args.format}"
+             with open(audio_path, "wb") as audio_file:
                  audio_file.write(audio_content)

-             play_audio(audio_content, audio_format, args.channels, args.rate)
-             print("Audio has been saved to 'generated_audio.wav'.")
+             audio = AudioSegment.from_file(audio_path, format=args.format)
+             if args.play:
+                 play(audio)
+             print(f"Audio has been saved to '{audio_path}'.")
      else:
          print(f"Request failed with status code {response.status_code}")
          print(response.json())
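Both of the rewritten clients above serialize a ServeTTSRequest from the new tools/commons.py (listed in the file table with +35 lines but not shown in this diff). The following sketch is inferred from the fields the scripts actually pass; it is not the real tools/commons.py, and the field types and defaults are assumptions based on the argparse defaults above.

```python
# Inferred sketch of the models used above -- not copied from tools/commons.py;
# types and defaults are assumptions derived from the argparse defaults in post_api.py.
from typing import Literal, Optional

from pydantic import BaseModel


class ServeReferenceAudio(BaseModel):
    audio: bytes
    text: str


class ServeTTSRequest(BaseModel):
    text: str
    references: list[ServeReferenceAudio] = []
    reference_id: Optional[str] = None
    normalize: bool = True
    format: Literal["wav", "mp3", "flac"] = "wav"
    mp3_bitrate: int = 64
    opus_bitrate: int = -1000
    max_new_tokens: int = 1024
    chunk_length: int = 200
    top_p: float = 0.7
    repetition_penalty: float = 1.2
    temperature: float = 0.7
    speaker: Optional[str] = None
    emotion: Optional[str] = None
    streaming: bool = False
```

An instance of such a model is then turned into the msgpack request body with ormsgpack.packb(..., option=ormsgpack.OPT_SERIALIZE_PYDANTIC), as both scripts above do when calling the /v1/tts endpoint.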
@@ -26,7 +26,7 @@ def uvr5_cli(
      output_folder: Path,
      audio_files: list[Path] | None = None,
      output_format: str = "flac",
-     model: str = "BS-Roformer-Viperx-1296.ckpt",
+     model: str = "BS-Roformer-Viperx-1297.ckpt",
  ):
      # ["BS-Roformer-Viperx-1297.ckpt", "BS-Roformer-Viperx-1296.ckpt", "BS-Roformer-Viperx-1053.ckpt", "Mel-Roformer-Viperx-1143.ckpt"]
      sepr = Separator(
@@ -15,21 +15,34 @@ threshold = 10 ** (-50 / 20.0)

  def process(file):
      waveform, sample_rate = torchaudio.load(str(file), backend="sox")
+     if waveform.size(0) > 1:
+         waveform = waveform.mean(dim=0, keepdim=True)
+
      loudness = librosa.feature.rms(
          y=waveform.numpy().squeeze(), frame_length=2048, hop_length=512, center=True
      )[0]
+
      for i in range(len(loudness) - 1, 0, -1):
          if loudness[i] > threshold:
              break

-     silent_time = (len(loudness) - i) * 512 / sample_rate
+     end_silent_time = (len(loudness) - i) * 512 / sample_rate

-     if silent_time <= 0.3:
-         random_time = random.uniform(0.3, 0.7)
+     if end_silent_time <= 0.3:
+         random_time = random.uniform(0.3, 0.7) - end_silent_time
          waveform = F.pad(
              waveform, (0, int(random_time * sample_rate)), mode="constant", value=0
          )

+     for i in range(len(loudness)):
+         if loudness[i] > threshold:
+             break
+
+     start_silent_time = i * 512 / sample_rate
+
+     if start_silent_time > 0.02:
+         waveform = waveform[:, int((start_silent_time - 0.02) * sample_rate) :]
+
      torchaudio.save(uri=str(file), src=waveform, sample_rate=sample_rate)


@@ -42,7 +42,7 @@ logger.add(sys.stderr, format=logger_format)
  @lru_cache(maxsize=1)
  def get_model(
      config_name: str = "firefly_gan_vq",
-     checkpoint_path: str = "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+     checkpoint_path: str = "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
      device: str | torch.device = "cuda",
  ):
      with initialize(version_base="1.3", config_path="../../fish_speech/configs"):
@@ -133,7 +133,7 @@ def process_batch(files: list[Path], model) -> float:
  @click.option("--config-name", default="firefly_gan_vq")
  @click.option(
      "--checkpoint-path",
-     default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+     default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
  )
  @click.option("--batch-size", default=64)
  @click.option("--filelist", default=None, type=Path)
@@ -59,7 +59,7 @@ def load_model(config_name, checkpoint_path, device="cuda"):
  @click.option("--config-name", default="firefly_gan_vq")
  @click.option(
      "--checkpoint-path",
-     default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+     default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
  )
  @click.option(
      "--device",
@@ -103,7 +103,9 @@ def main(input_path, output_path, config_name, checkpoint_path, device):

      # Restore
      feature_lengths = torch.tensor([indices.shape[1]], device=device)
-     fake_audios = model.decode(indices=indices[None], feature_lengths=feature_lengths)
+     fake_audios, _ = model.decode(
+         indices=indices[None], feature_lengths=feature_lengths
+     )
      audio_time = fake_audios.shape[-1] / model.spec_transform.sample_rate

      logger.info(
@@ -23,7 +23,6 @@ from fish_speech.i18n import i18n
  from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
  from fish_speech.utils import autocast_exclude_mps
  from tools.api import decode_vq_tokens, encode_reference
- from tools.auto_rerank import batch_asr, calculate_wer, is_chinese, load_model
  from tools.llama.generate import (
      GenerateRequest,
      GenerateResponse,
@@ -40,9 +39,9 @@ HEADER_MD = f"""# Fish Speech

  {i18n("A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).")}

- {i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).")}
+ {i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.4).")}

- {i18n("Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.")}
+ {i18n("Related code and weights are released under CC BY-NC-SA 4.0 License.")}

  {i18n("We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.")}
  """
@@ -160,66 +159,6 @@ def inference(
      gc.collect()


- def inference_with_auto_rerank(
-     text,
-     enable_reference_audio,
-     reference_audio,
-     reference_text,
-     max_new_tokens,
-     chunk_length,
-     top_p,
-     repetition_penalty,
-     temperature,
-     use_auto_rerank,
-     streaming=False,
- ):
-
-     max_attempts = 2 if use_auto_rerank else 1
-     best_wer = float("inf")
-     best_audio = None
-     best_sample_rate = None
-
-     for attempt in range(max_attempts):
-         audio_generator = inference(
-             text,
-             enable_reference_audio,
-             reference_audio,
-             reference_text,
-             max_new_tokens,
-             chunk_length,
-             top_p,
-             repetition_penalty,
-             temperature,
-             streaming=False,
-         )
-
-         # 获取音频数据
-         for _ in audio_generator:
-             pass
-         _, (sample_rate, audio), message = _
-
-         if audio is None:
-             return None, None, message
-
-         if not use_auto_rerank:
-             return None, (sample_rate, audio), None
-
-         asr_result = batch_asr(asr_model, [audio], sample_rate)[0]
-         wer = calculate_wer(text, asr_result["text"])
-         if wer <= 0.3 and not asr_result["huge_gap"]:
-             return None, (sample_rate, audio), None
-
-         if wer < best_wer:
-             best_wer = wer
-             best_audio = audio
-             best_sample_rate = sample_rate
-
-         if attempt == max_attempts - 1:
-             break
-
-     return None, (best_sample_rate, best_audio), None
-
-
  inference_stream = partial(inference, streaming=True)

  n_audios = 4
@@ -239,13 +178,12 @@ def inference_wrapper(
      repetition_penalty,
      temperature,
      batch_infer_num,
-     if_load_asr_model,
  ):
      audios = []
      errors = []

      for _ in range(batch_infer_num):
-         result = inference_with_auto_rerank(
+         result = inference(
              text,
              enable_reference_audio,
              reference_audio,
@@ -255,10 +193,9 @@ def inference_wrapper(
              top_p,
              repetition_penalty,
              temperature,
-             if_load_asr_model,
          )

-         _, audio_data, error_message = result
+         _, audio_data, error_message = next(result)

          audios.append(
              gr.Audio(value=audio_data if audio_data else None, visible=True),
@@ -301,42 +238,6 @@ def normalize_text(user_input, use_normalization):
  asr_model = None


- def change_if_load_asr_model(if_load):
-     global asr_model
-
-     if if_load:
-         gr.Warning("Loading faster whisper model...")
-         if asr_model is None:
-             asr_model = load_model()
-         return gr.Checkbox(label="Unload faster whisper model", value=if_load)
-
-     if if_load is False:
-         gr.Warning("Unloading faster whisper model...")
-         del asr_model
-         asr_model = None
-         if torch.cuda.is_available():
-             torch.cuda.empty_cache()
-         gc.collect()
-         return gr.Checkbox(label="Load faster whisper model", value=if_load)
-
-
- def change_if_auto_label(if_load, if_auto_label, enable_ref, ref_audio, ref_text):
-     if if_load and asr_model is not None:
-         if (
-             if_auto_label
-             and enable_ref
-             and ref_audio is not None
-             and ref_text.strip() == ""
-         ):
-             data, sample_rate = librosa.load(ref_audio)
-             res = batch_asr(asr_model, [data], sample_rate)[0]
-             ref_text = res["text"]
-     else:
-         gr.Warning("Whisper model not loaded!")
-
-     return gr.Textbox(value=ref_text)
-
-
  def build_app():
      with gr.Blocks(theme=gr.themes.Base()) as app:
          gr.Markdown(HEADER_MD)
@@ -367,23 +268,17 @@ def build_app():
                  with gr.Row():
                      if_refine_text = gr.Checkbox(
                          label=i18n("Text Normalization"),
-                         value=True,
-                         scale=1,
-                     )
-
-                     if_load_asr_model = gr.Checkbox(
-                         label=i18n("Load / Unload ASR model for auto-reranking"),
                          value=False,
-                         scale=3,
+                         scale=1,
                      )

                  with gr.Row():
                      with gr.Tab(label=i18n("Advanced Config")):
                          chunk_length = gr.Slider(
                              label=i18n("Iterative Prompt Length, 0 means off"),
-                             minimum=0,
-                             maximum=500,
-                             value=100,
+                             minimum=50,
+                             maximum=300,
+                             value=200,
                              step=8,
                          )

@@ -434,12 +329,6 @@ def build_app():
                          type="filepath",
                      )
                      with gr.Row():
-                         if_auto_label = gr.Checkbox(
-                             label=i18n("Auto Labeling"),
-                             min_width=100,
-                             scale=0,
-                             value=False,
-                         )
                          reference_text = gr.Textbox(
                              label=i18n("Reference Text"),
                              lines=1,
@@ -494,28 +383,6 @@ def build_app():
              fn=normalize_text, inputs=[text, if_refine_text], outputs=[refined_text]
          )

-         if_load_asr_model.change(
-             fn=change_if_load_asr_model,
-             inputs=[if_load_asr_model],
-             outputs=[if_load_asr_model],
-         )
-
-         if_auto_label.change(
-             fn=lambda: gr.Textbox(value=""),
-             inputs=[],
-             outputs=[reference_text],
-         ).then(
-             fn=change_if_auto_label,
-             inputs=[
-                 if_load_asr_model,
-                 if_auto_label,
-                 enable_reference_audio,
-                 reference_audio,
-                 reference_text,
-             ],
-             outputs=[reference_text],
-         )
-
          # # Submit
          generate.click(
              inference_wrapper,
@@ -530,7 +397,6 @@ def build_app():
                  repetition_penalty,
                  temperature,
                  batch_infer_num,
-                 if_load_asr_model,
              ],
              [stream_audio, *global_audio_list, *global_error_list],
              concurrency_limit=1,
@@ -560,12 +426,12 @@ def parse_args():
      parser.add_argument(
          "--llama-checkpoint-path",
          type=Path,
-         default="checkpoints/fish-speech-1.2-sft",
+         default="checkpoints/fish-speech-1.4",
      )
      parser.add_argument(
          "--decoder-checkpoint-path",
          type=Path,
-         default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+         default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
      )
      parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
      parser.add_argument("--device", type=str, default="cuda")
@@ -605,8 +471,8 @@ if __name__ == "__main__":
          enable_reference_audio=False,
          reference_audio=None,
          reference_text="",
-         max_new_tokens=0,
-         chunk_length=100,
+         max_new_tokens=1024,
+         chunk_length=200,
          top_p=0.7,
          repetition_penalty=1.2,
          temperature=0.7,
xinference/types.py CHANGED
@@ -47,6 +47,12 @@ class ImageList(TypedDict):
      data: List[Image]


+ class SDAPIResult(TypedDict):
+     images: List[str]
+     parameters: dict
+     info: dict
+
+
  class Video(TypedDict):
      url: Optional[str]
      b64_json: Optional[str]
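The new SDAPIResult type pairs with the SD-Web-UI-compatible layer added in xinference/model/image/sdapi.py (+136 lines, not shown in this diff). A hypothetical sketch of a call against such a layer follows; the route, port, and payload fields follow the SD Web UI convention and are assumptions, not values confirmed by this diff, while the response shape mirrors the SDAPIResult TypedDict above.

```python
# Hypothetical sketch only: the /sdapi/v1/txt2img route and payload are assumed
# from the SD Web UI convention that sdapi.py appears to emulate.
import base64

import httpx

resp = httpx.post(
    "http://127.0.0.1:9997/sdapi/v1/txt2img",  # assumed endpoint
    json={"prompt": "a lighthouse at dawn", "steps": 20},
    timeout=None,
)
result = resp.json()  # shaped like SDAPIResult: {"images": [...], "parameters": {...}, "info": {...}}
with open("out.png", "wb") as f:
    f.write(base64.b64decode(result["images"][0]))  # images are base64-encoded strings
```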
@@ -422,14 +428,11 @@ class CreateChatModel(BaseModel):
  CreateChatCompletionTorch = CreateCompletionTorch
  CreateChatCompletionLlamaCpp: BaseModel = CreateCompletionLlamaCpp

- # This type is for openai API compatibility
- CreateChatCompletionOpenAI: BaseModel
-

  from ._compat import CreateChatCompletionOpenAI


- class CreateChatCompletion(
+ class CreateChatCompletion(  # type: ignore
      CreateChatModel,
      CreateChatCompletionTorch,
      CreateChatCompletionLlamaCpp,
@@ -1,14 +1,14 @@
  {
    "files": {
-     "main.css": "./static/css/main.632e9148.css",
-     "main.js": "./static/js/main.9cfafbd6.js",
+     "main.css": "./static/css/main.5061c4c3.css",
+     "main.js": "./static/js/main.754740c0.js",
      "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
      "index.html": "./index.html",
-     "main.632e9148.css.map": "./static/css/main.632e9148.css.map",
-     "main.9cfafbd6.js.map": "./static/js/main.9cfafbd6.js.map"
+     "main.5061c4c3.css.map": "./static/css/main.5061c4c3.css.map",
+     "main.754740c0.js.map": "./static/js/main.754740c0.js.map"
    },
    "entrypoints": [
-     "static/css/main.632e9148.css",
-     "static/js/main.9cfafbd6.js"
+     "static/css/main.5061c4c3.css",
+     "static/js/main.754740c0.js"
    ]
  }
@@ -1 +1 @@
- <!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.9cfafbd6.js"></script><link href="./static/css/main.632e9148.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+ <!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.754740c0.js"></script><link href="./static/css/main.5061c4c3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
@@ -1,2 +1,2 @@
- .container{cursor:pointer;display:block}.container,.descriptionCard{border-radius:20px;height:300px;position:relative;width:300px}.descriptionCard{left:-1px;padding:20px;top:-1px}.cardTitle{display:flex;justify-content:space-between}.iconButtonBox{align-items:center;display:flex}.drawerCard{min-height:100%;padding:20px 80px 0;position:relative;width:60vw}.p{-webkit-line-clamp:4;-webkit-box-orient:vertical;display:-webkit-box;font-size:14px;overflow:hidden;padding:0 10px;text-overflow:ellipsis;word-break:break-word}.formContainer{height:80%;overflow:scroll;padding:0 10px}.buttonsContainer{align-items:center;bottom:50px;display:flex;justify-content:space-between;left:100px;position:absolute;right:100px}.buttonContainer{background-color:initial;border-width:0;width:45%}.buttonItem{border:1px solid #e5e7eb;border-radius:4px;padding:5px;width:100%}.instructionText{color:#666;font-size:12px;font-style:italic;margin:30px 0;text-align:center}.iconRow{bottom:20px;justify-content:space-between;left:20px;position:absolute;right:20px}.iconItem,.iconRow{align-items:center;display:flex}.iconItem{flex-direction:column;margin:20px}.boldIconText{font-size:1.2em;font-weight:700}.muiIcon{font-size:1.5em}.smallText{font-size:.8em}.dialogBox{background-color:#fff;height:607px;margin:32px;overflow-x:scroll;width:1241px}.dialogTitle{color:#000;display:flex;justify-content:space-between;padding:20px 20px 7px}.dialogTitle-model_name{font-size:18px;font-weight:700}.pathBox{cursor:pointer;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;width:160px}.pathBox2{width:300px}.empty{color:#555;font-size:20px;left:50%;position:absolute;top:30%;-webkit-transform:translate(-50%);transform:translate(-50%)}.deleteDialog{align-items:center;display:flex}.warningIcon{color:#ed6c02;margin-right:10px}.jsonDialog{background-color:#fff;border-radius:8px;color:#000;display:flex;flex-direction:column;padding:10px 30px}.jsonDialog-title{align-items:center;display:flex;justify-content:space-between;margin:10px 0 20px}.title-name{font-size:16px;font-weight:700}.main-box{height:500px;width:700px}.textarea-box{border:1px solid #ddd;border-radius:5px;color:#444;height:100%;padding:5px 10px;resize:none;width:100%}.but-box{display:flex;justify-content:end;margin-top:20px}.copyText{color:#666;cursor:pointer;font-size:14px!important}.copyText:hover{color:#1976d2}.formBox{max-height:80vh;max-width:50vw;min-width:50vw;overflow:auto;padding:40px 20px 0 0;position:relative;transition:all .4s ease-in-out}.broaden{max-width:100%;min-width:100%;padding-right:0}.show-json{align-items:center;color:#444;display:flex;position:fixed;right:60px;top:90px}.icon{cursor:pointer;margin-left:20px;position:absolute;right:-40px}.icon:hover{color:#1976d2}.arrow{font-size:24px!important}.jsonBox{min-height:80vh;position:relative;transition:all .4s ease-in-out;width:100%}.hide{overflow:hidden;-webkit-transform:translate(30vw);transform:translate(30vw);width:0}.checkboxWrapper{align-items:center;display:flex;flex-wrap:wrap;width:100%}.jsonBox-header{align-items:center;display:flex;justify-content:space-between}.jsonBox-title{font-weight:700;line-height:40px}.textarea{border:1px solid #ddd;border-radius:5px;color:#444;height:calc(100% - 40px);padding:5px 10px;resize:none;width:100%}.addBtn{margin-left:20px!important}.item{background-color:#eee;border-radius:10px;margin:10px 50px 0;overflow:hidden;padding:20px;position:relative}.item:hover 
.deleteBtn{-webkit-transform:translateX(-50px);transform:translateX(-50px)}.deleteBtn{background-color:#1976d2;border-radius:25px;height:50px;line-height:70px;position:absolute;right:20px;text-align:center;top:calc(50% - 25px);-webkit-transform:translateX(80px);transform:translateX(80px);transition:all .3s ease-in-out;width:50px}.deleteBtn:hover{box-shadow:0 0 10px #aaa;cursor:pointer}.deleteIcon{color:#fff;font-size:28px!important}.chat_template_box{align-items:start;display:flex;gap:10px}.chat_template_test{width:30%}.chat_template_test_mainBox{border:1px solid #ccc;border-radius:4px;height:137px;overflow:scroll;padding:10px}.chat_template_test_tip{color:rgba(0,0,0,.6);font-size:10px;margin:4px 14px 0}.test_res_box{background-color:#eee;border-radius:4px;margin-top:5px;min-height:55px;padding:10px}
- /*# sourceMappingURL=main.632e9148.css.map*/
+ .container{cursor:pointer;display:block}.container,.descriptionCard{border-radius:20px;height:300px;position:relative;width:300px}.descriptionCard{left:-1px;padding:20px;top:-1px}.cardTitle{display:flex;justify-content:space-between}.iconButtonBox{align-items:center;display:flex}.drawerCard{min-height:100%;padding:20px 80px 0;position:relative;width:60vw}.p{-webkit-line-clamp:4;-webkit-box-orient:vertical;display:-webkit-box;font-size:14px;overflow:hidden;padding:0 10px;text-overflow:ellipsis;word-break:break-word}.formContainer{height:80%;overflow:scroll;padding:0 10px}.buttonsContainer{align-items:center;bottom:50px;display:flex;justify-content:space-between;left:100px;position:absolute;right:100px}.buttonContainer{background-color:initial;border-width:0;width:45%}.buttonItem{border:1px solid #e5e7eb;border-radius:4px;padding:5px;width:100%}.instructionText{color:#666;font-size:12px;font-style:italic;margin:30px 0;text-align:center}.iconRow{bottom:20px;justify-content:space-between;left:20px;position:absolute;right:20px}.iconItem,.iconRow{align-items:center;display:flex}.iconItem{flex-direction:column;margin:20px}.boldIconText{font-size:1.2em;font-weight:700}.muiIcon{font-size:1.5em}.smallText{font-size:.8em}.dialogBox{background-color:#fff;height:607px;margin:32px;overflow-x:scroll;width:1241px}.dialogTitle{color:#000;display:flex;justify-content:space-between;padding:20px 20px 7px}.dialogTitle-model_name{font-size:18px;font-weight:700}.pathBox{cursor:pointer;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;width:160px}.pathBox2{width:300px}.empty{color:#555;font-size:20px;left:50%;position:absolute;top:30%;-webkit-transform:translate(-50%);transform:translate(-50%)}.deleteDialog{align-items:center;display:flex}.warningIcon{color:#ed6c02;margin-right:10px}.jsonDialog{background-color:#fff;border-radius:8px;color:#000;display:flex;flex-direction:column;padding:10px 30px}.jsonDialog-title{align-items:center;display:flex;justify-content:space-between;margin:10px 0 20px}.title-name{font-size:16px;font-weight:700}.main-box{height:500px;width:700px}.textarea-box{border:1px solid #ddd;border-radius:5px;color:#444;height:100%;padding:5px 10px;resize:none;width:100%}.but-box{display:flex;justify-content:end;margin-top:20px}.copyText{color:#666;cursor:pointer;font-size:14px!important}.copyText:hover{color:#1976d2}.formBox{max-height:80vh;max-width:50vw;min-width:50vw;overflow:auto;padding:40px 20px 0 0;position:relative;transition:all .4s ease-in-out}.broaden{max-width:100%;min-width:100%;padding-right:0}.show-json{align-items:center;color:#444;display:flex;position:fixed;right:60px;top:90px}.icon{cursor:pointer;margin-left:20px;position:absolute;right:-40px}.icon:hover{color:#1976d2}.arrow{font-size:24px!important}.jsonBox{min-height:80vh;position:relative;transition:all .4s ease-in-out;width:100%}.hide{overflow:hidden;-webkit-transform:translate(30vw);transform:translate(30vw);width:0}.checkboxWrapper{align-items:center;display:flex;flex-wrap:wrap;width:100%}.jsonBox-header{align-items:center;display:flex;justify-content:space-between}.jsonBox-title{font-weight:700;line-height:40px}.textarea{border:1px solid #ddd;border-radius:5px;color:#444;height:calc(100% - 40px);padding:5px 10px;resize:none;width:100%}.addBtn{margin-left:20px!important}.item{background-color:#eee;border-radius:10px;margin:10px 50px 0;overflow:hidden;padding:20px;position:relative}.item:hover 
.deleteBtn{-webkit-transform:translateX(-50px);transform:translateX(-50px)}.deleteBtn{background-color:#1976d2;border-radius:25px;height:50px;line-height:70px;position:absolute;right:20px;text-align:center;top:calc(50% - 25px);-webkit-transform:translateX(80px);transform:translateX(80px);transition:all .3s ease-in-out;width:50px}.deleteBtn:hover{box-shadow:0 0 10px #aaa;cursor:pointer}.deleteIcon{color:#fff;font-size:28px!important}.chat_template_box{align-items:start;display:flex;gap:10px}.chat_template_test{width:30%}.chat_template_test_mainBox{border:1px solid #ccc;border-radius:4px;height:137px;overflow:scroll;padding:10px}.chat_template_test_tip{color:rgba(0,0,0,.6);font-size:10px;margin:4px 14px 0}.test_res_box{background-color:#eee;border-radius:4px;margin-top:5px;min-height:55px;padding:10px}.css-19qh8xo-MuiInputBase-input-MuiOutlinedInput-input.Mui-disabled{-webkit-text-fill-color:#000!important}
+ /*# sourceMappingURL=main.5061c4c3.css.map*/