xinference 1.6.0.post1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.

Files changed (124)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +79 -2
  3. xinference/client/restful/restful_client.py +65 -3
  4. xinference/conftest.py +0 -7
  5. xinference/core/media_interface.py +132 -8
  6. xinference/core/model.py +44 -6
  7. xinference/core/scheduler.py +1 -10
  8. xinference/core/supervisor.py +8 -17
  9. xinference/core/worker.py +5 -27
  10. xinference/deploy/cmdline.py +6 -2
  11. xinference/model/audio/chattts.py +24 -39
  12. xinference/model/audio/cosyvoice.py +18 -30
  13. xinference/model/audio/funasr.py +42 -0
  14. xinference/model/audio/model_spec.json +71 -1
  15. xinference/model/audio/model_spec_modelscope.json +76 -2
  16. xinference/model/audio/utils.py +75 -0
  17. xinference/model/core.py +1 -0
  18. xinference/model/embedding/__init__.py +74 -18
  19. xinference/model/embedding/core.py +98 -589
  20. xinference/model/embedding/embed_family.py +133 -0
  21. xinference/{thirdparty/omnilmm/train → model/embedding/flag}/__init__.py +1 -1
  22. xinference/model/embedding/flag/core.py +282 -0
  23. xinference/model/embedding/model_spec.json +24 -0
  24. xinference/model/embedding/model_spec_modelscope.json +24 -0
  25. xinference/model/embedding/sentence_transformers/__init__.py +13 -0
  26. xinference/model/embedding/sentence_transformers/core.py +399 -0
  27. xinference/model/embedding/vllm/core.py +95 -0
  28. xinference/model/image/model_spec.json +30 -3
  29. xinference/model/image/model_spec_modelscope.json +41 -2
  30. xinference/model/image/stable_diffusion/core.py +144 -53
  31. xinference/model/llm/__init__.py +6 -54
  32. xinference/model/llm/core.py +19 -5
  33. xinference/model/llm/llama_cpp/core.py +59 -3
  34. xinference/model/llm/llama_cpp/memory.py +457 -0
  35. xinference/model/llm/llm_family.json +247 -402
  36. xinference/model/llm/llm_family.py +88 -16
  37. xinference/model/llm/llm_family_modelscope.json +260 -421
  38. xinference/model/llm/llm_family_openmind_hub.json +0 -34
  39. xinference/model/llm/sglang/core.py +8 -0
  40. xinference/model/llm/transformers/__init__.py +27 -6
  41. xinference/model/llm/transformers/chatglm.py +4 -2
  42. xinference/model/llm/transformers/core.py +49 -28
  43. xinference/model/llm/transformers/deepseek_v2.py +6 -49
  44. xinference/model/llm/transformers/gemma3.py +119 -164
  45. xinference/model/llm/transformers/multimodal/__init__.py +13 -0
  46. xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
  47. xinference/model/llm/transformers/multimodal/core.py +205 -0
  48. xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
  49. xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
  50. xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
  51. xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
  52. xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
  53. xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
  54. xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
  55. xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
  56. xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
  57. xinference/model/llm/transformers/opt.py +4 -2
  58. xinference/model/llm/transformers/utils.py +6 -37
  59. xinference/model/llm/utils.py +11 -0
  60. xinference/model/llm/vllm/core.py +7 -0
  61. xinference/model/rerank/core.py +91 -3
  62. xinference/model/rerank/model_spec.json +24 -0
  63. xinference/model/rerank/model_spec_modelscope.json +24 -0
  64. xinference/model/rerank/utils.py +20 -2
  65. xinference/model/utils.py +38 -1
  66. xinference/model/video/diffusers.py +65 -3
  67. xinference/model/video/model_spec.json +31 -4
  68. xinference/model/video/model_spec_modelscope.json +32 -4
  69. xinference/web/ui/build/asset-manifest.json +6 -6
  70. xinference/web/ui/build/index.html +1 -1
  71. xinference/web/ui/build/static/css/main.013f296b.css +2 -0
  72. xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
  73. xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
  74. xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
  79. xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
  80. xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
  81. xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
  82. xinference/web/ui/src/locales/en.json +21 -8
  83. xinference/web/ui/src/locales/ja.json +224 -0
  84. xinference/web/ui/src/locales/ko.json +224 -0
  85. xinference/web/ui/src/locales/zh.json +21 -8
  86. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/METADATA +14 -11
  87. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/RECORD +93 -100
  88. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/WHEEL +1 -1
  89. xinference/model/llm/transformers/cogvlm2.py +0 -442
  90. xinference/model/llm/transformers/cogvlm2_video.py +0 -333
  91. xinference/model/llm/transformers/deepseek_vl.py +0 -280
  92. xinference/model/llm/transformers/glm_edge_v.py +0 -213
  93. xinference/model/llm/transformers/intern_vl.py +0 -526
  94. xinference/model/llm/transformers/internlm2.py +0 -94
  95. xinference/model/llm/transformers/minicpmv25.py +0 -193
  96. xinference/model/llm/transformers/omnilmm.py +0 -132
  97. xinference/model/llm/transformers/qwen2_audio.py +0 -179
  98. xinference/model/llm/transformers/qwen_vl.py +0 -360
  99. xinference/thirdparty/omnilmm/LICENSE +0 -201
  100. xinference/thirdparty/omnilmm/chat.py +0 -218
  101. xinference/thirdparty/omnilmm/constants.py +0 -4
  102. xinference/thirdparty/omnilmm/conversation.py +0 -332
  103. xinference/thirdparty/omnilmm/model/__init__.py +0 -1
  104. xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
  105. xinference/thirdparty/omnilmm/model/resampler.py +0 -166
  106. xinference/thirdparty/omnilmm/model/utils.py +0 -578
  107. xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
  108. xinference/thirdparty/omnilmm/utils.py +0 -134
  109. xinference/web/ui/build/static/css/main.337afe76.css +0 -2
  110. xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
  111. xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
  112. xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
  113. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
  114. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
  115. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
  116. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
  117. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
  118. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
  119. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
  120. /xinference/{thirdparty/omnilmm → model/embedding/vllm}/__init__.py +0 -0
  121. /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
  122. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/entry_points.txt +0 -0
  123. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/licenses/LICENSE +0 -0
  124. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/top_level.txt +0 -0
xinference/model/audio/chattts.py CHANGED
@@ -71,9 +71,10 @@ class ChatTTSModel:
         import ChatTTS
         import numpy as np
         import torch
-        import torchaudio
         import xxhash
 
+        from .utils import audio_stream_generator, audio_to_bytes
+
         rnd_spk_emb = None
 
         if len(voice) > 400:
@@ -105,44 +106,28 @@ class ChatTTSModel:
         )
 
         assert self._model is not None
+
+        output = self._model.infer(
+            [input], params_infer_code=params_infer_code, stream=stream
+        )
         if stream:
-            iter = self._model.infer(
-                [input], params_infer_code=params_infer_code, stream=True
-            )
 
-            def _generator():
-                with BytesIO() as out:
-                    writer = torchaudio.io.StreamWriter(out, format=response_format)
-                    writer.add_audio_stream(sample_rate=24000, num_channels=1)
-                    i = 0
-                    last_pos = 0
-                    with writer.open():
-                        for it in iter:
-                            for chunk in it:
-                                chunk = np.array([chunk]).transpose()
-                                writer.write_audio_chunk(i, torch.from_numpy(chunk))
-                                new_last_pos = out.tell()
-                                if new_last_pos != last_pos:
-                                    out.seek(last_pos)
-                                    encoded_bytes = out.read()
-                                    yield encoded_bytes
-                                    last_pos = new_last_pos
-
-            return _generator()
+            def _gen_chunk():
+                for it in output:
+                    for chunk in it:
+                        yield chunk
+
+            return audio_stream_generator(
+                response_format=response_format,
+                sample_rate=24000,
+                output_generator=_gen_chunk(),
+                output_chunk_transformer=lambda c: torch.from_numpy(
+                    np.array([c]).transpose()
+                ),
+            )
         else:
-            wavs = self._model.infer([input], params_infer_code=params_infer_code)
-
-            # Save the generated audio
-            with BytesIO() as out:
-                try:
-                    torchaudio.save(
-                        out,
-                        torch.from_numpy(wavs[0]).unsqueeze(0),
-                        24000,
-                        format=response_format,
-                    )
-                except:
-                    torchaudio.save(
-                        out, torch.from_numpy(wavs[0]), 24000, format=response_format
-                    )
-                return out.getvalue()
+            return audio_to_bytes(
+                response_format=response_format,
+                sample_rate=24000,
+                tensor=torch.from_numpy(output[0]).unsqueeze(0),
+            )
xinference/model/audio/cosyvoice.py CHANGED
@@ -13,7 +13,6 @@
 # limitations under the License.
 import io
 import logging
-from io import BytesIO
 from typing import TYPE_CHECKING, Optional
 
 from ..utils import set_all_random_seed
@@ -132,36 +131,25 @@ class CosyVoiceModel:
         output = self._model.inference_sft(input, voice, stream=stream)
 
         import torch
-        import torchaudio
 
-        def _generator_stream():
-            with BytesIO() as out:
-                writer = torchaudio.io.StreamWriter(out, format=response_format)
-                writer.add_audio_stream(
-                    sample_rate=self._model.sample_rate, num_channels=1
-                )
-                i = 0
-                last_pos = 0
-                with writer.open():
-                    for chunk in output:
-                        chunk = chunk["tts_speech"]
-                        trans_chunk = torch.transpose(chunk, 0, 1)
-                        writer.write_audio_chunk(i, trans_chunk)
-                        new_last_pos = out.tell()
-                        if new_last_pos != last_pos:
-                            out.seek(last_pos)
-                            encoded_bytes = out.read()
-                            yield encoded_bytes
-                            last_pos = new_last_pos
-
-        def _generator_block():
-            chunks = [o["tts_speech"] for o in output]
-            t = torch.cat(chunks, dim=1)
-            with BytesIO() as out:
-                torchaudio.save(out, t, self._model.sample_rate, format=response_format)
-                return out.getvalue()
-
-        return _generator_stream() if stream else _generator_block()
+        from .utils import audio_stream_generator, audio_to_bytes
+
+        return (
+            audio_stream_generator(
+                response_format=response_format,
+                sample_rate=self._model.sample_rate,
+                output_generator=output,
+                output_chunk_transformer=lambda c: torch.transpose(
+                    c["tts_speech"], 0, 1
+                ),
+            )
+            if stream
+            else audio_to_bytes(
+                response_format=response_format,
+                sample_rate=self._model.sample_rate,
+                tensor=torch.cat([o["tts_speech"] for o in output], dim=1),
+            )
+        )
 
     def speech(
         self,
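Both ChatTTS and CosyVoice now delegate encoding to the shared audio_stream_generator/audio_to_bytes helpers (see xinference/model/audio/utils.py below), so either backend can be consumed the same way from a client. A minimal client-side sketch follows; the host, port, and model uid are placeholders, and the assumption that the streaming call yields byte chunks is ours, not guaranteed by this diff:

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
model = client.get_model("my-tts")

# Blocking request: the server encodes the whole clip via audio_to_bytes().
audio = model.speech("Hello from xinference", response_format="mp3")
with open("hello.mp3", "wb") as f:
    f.write(audio)

# Streaming request: the server drives audio_stream_generator() and the client
# writes chunks as they arrive (assumed here to be an iterable of bytes).
with open("hello_stream.mp3", "wb") as f:
    for chunk in model.speech("Hello from xinference", response_format="mp3", stream=True):
        f.write(chunk)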
xinference/model/audio/funasr.py CHANGED
@@ -44,6 +44,44 @@ class FunASRModel:
     def model_ability(self):
         return self._model_spec.model_ability
 
+    def convert_to_openai_format(self, input_data):
+        if "timestamp" not in input_data:
+            return {"task": "transcribe", "text": input_data["text"]}
+        start_time = input_data["timestamp"][0][0] / 1000
+        end_time = input_data["timestamp"][-1][1] / 1000
+        duration = end_time - start_time
+        word_timestamps = []
+        for ts in input_data["timestamp"]:
+            word_timestamps.append({"start": ts[0] / 1000, "end": ts[1] / 1000})
+        if "sentence_info" not in input_data:
+            return {
+                "task": "transcribe",
+                "text": input_data["text"],
+                "words": word_timestamps,
+                "duration": duration,
+            }
+        output = {
+            "task": "transcribe",
+            "duration": duration,
+            "text": input_data["text"],
+            "words": word_timestamps,
+            "segments": [],
+        }
+        for sentence in input_data["sentence_info"]:
+            seg_start = sentence["start"] / 1000
+            seg_end = sentence["end"] / 1000
+            output["segments"].append(
+                {
+                    "id": len(output["segments"]),
+                    "start": seg_start,
+                    "end": seg_end,
+                    "text": sentence["text"],
+                    "speaker": sentence["spk"],
+                }
+            )
+
+        return output
+
     def load(self):
         try:
             from funasr import AutoModel
@@ -103,6 +141,10 @@ class FunASRModel:
 
         if response_format == "json":
             return {"text": text}
+        elif response_format == "verbose_json":
+            verbose = result[0]
+            verbose["text"] = text
+            return self.convert_to_openai_format(verbose)
         else:
            raise ValueError(f"Unsupported response format: {response_format}")
 
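To illustrate the conversion above, here is a hypothetical FunASR-style result and the rough shape convert_to_openai_format would return for it; the field values are invented, only the structure follows the code:

sample = {
    "text": "hello world",
    "timestamp": [[0, 480], [520, 1000]],  # per-word [start_ms, end_ms]
    "sentence_info": [
        {"start": 0, "end": 1000, "text": "hello world", "spk": 0},
    ],
}

# convert_to_openai_format(sample) would yield approximately:
# {
#     "task": "transcribe",
#     "duration": 1.0,  # (1000 - 0) / 1000 seconds
#     "text": "hello world",
#     "words": [{"start": 0.0, "end": 0.48}, {"start": 0.52, "end": 1.0}],
#     "segments": [
#         {"id": 0, "start": 0.0, "end": 1.0, "text": "hello world", "speaker": 0}
#     ],
# }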
xinference/model/audio/model_spec.json CHANGED
@@ -218,13 +218,83 @@
       "batch_size_s": 300
     }
   },
+  {
+    "model_name": "paraformer-zh-hotword",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
+    "model_revision": "26d622993683d7b0c517ee5ec9c1c8bdde76e324",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "hotword": "",
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-long",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "b6d8cb81645e34056cd3dda41e5624a740587de3",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-spk",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn",
+    "model_revision": "36abd64af4392fe02bf76453bc86c081cf1ca6da",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc",
+      "spk_model":"cam++"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "seaco-paraformer-zh",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "42e6be00854cf8de0f40002794f99df2a444fa97",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "hotword": "",
+      "batch_size_s": 300
+    }
+  },
   {
     "model_name": "ChatTTS",
     "model_family": "ChatTTS",
     "model_id": "2Noise/ChatTTS",
     "model_revision": "1a3c04a8b0651689bd9242fbb55b1f4b5a9aef84",
     "model_ability": ["text2audio"],
-    "multilingual": true
+    "multilingual": true,
+    "virtualenv": {
+      "packages": [
+        "ChatTTS>=0.2.1",
+        "#system_torch#",
+        "#system_numpy#"
+      ]
+    }
   },
   {
     "model_name": "CosyVoice-300M",
xinference/model/audio/model_spec_modelscope.json CHANGED
@@ -51,7 +51,7 @@
     "model_name": "paraformer-zh",
     "model_family": "funasr",
     "model_hub": "modelscope",
-    "model_id": "iic/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn",
+    "model_id": "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
     "model_revision": "master",
     "model_ability": ["audio2text"],
     "multilingual": false,
@@ -63,6 +63,73 @@
       "batch_size_s": 300
     }
   },
+  {
+    "model_name": "paraformer-zh-hotword",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "hotword": "",
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-long",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-spk",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc",
+      "spk_model":"cam++"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "seaco-paraformer-zh",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "hotword": "",
+      "batch_size_s": 300
+    }
+  },
   {
     "model_name": "ChatTTS",
     "model_family": "ChatTTS",
@@ -70,7 +137,14 @@
     "model_id": "AI-ModelScope/ChatTTS",
     "model_revision": "master",
     "model_ability": ["text2audio"],
-    "multilingual": true
+    "multilingual": true,
+    "virtualenv": {
+      "packages": [
+        "ChatTTS>=0.2.1",
+        "#system_torch#",
+        "#system_numpy#"
+      ]
+    }
   },
   {
     "model_name": "CosyVoice-300M",
xinference/model/audio/utils.py CHANGED
@@ -13,16 +13,30 @@
 # limitations under the License.
 
 import io
+import logging
+import types
+import wave
+from collections.abc import Callable
 
 import numpy as np
+import torch
 
 from .core import AudioModelFamilyV1
 
+logger = logging.getLogger(__name__)
+
 
 def get_model_version(audio_model: AudioModelFamilyV1) -> str:
     return audio_model.model_name
 
 
+def _extract_pcm_from_wav_bytes(wav_bytes):
+    with io.BytesIO(wav_bytes) as wav_io:
+        with wave.open(wav_io, "rb") as wav_file:
+            num_frames = wav_file.getnframes()
+            return wav_file.readframes(num_frames)
+
+
 def ensure_sample_rate(
     audio: np.ndarray, old_sample_rate: int, sample_rate: int
 ) -> np.ndarray:
@@ -48,3 +62,64 @@ def ensure_sample_rate
         audio, sr = sf.read(buffer, dtype="float32")
 
     return audio
+
+
+def audio_stream_generator(
+    response_format: str,
+    sample_rate: int,
+    output_generator: types.GeneratorType,
+    output_chunk_transformer: Callable,
+):
+    import torch
+    import torchaudio
+
+    response_pcm = response_format.lower() == "pcm"
+    with io.BytesIO() as out:
+        if response_pcm:
+            logger.info(
+                f"PCM stream output, num_channels: 1, sample_rate: {sample_rate}"
+            )
+            writer = torchaudio.io.StreamWriter(out, format="wav")
+            writer.add_audio_stream(
+                sample_rate=sample_rate, num_channels=1, format="s16"
+            )
+        else:
+            writer = torchaudio.io.StreamWriter(out, format=response_format)
+            writer.add_audio_stream(sample_rate=sample_rate, num_channels=1)
+        strip_header = True
+        last_pos = 0
+        with writer.open():
+            for chunk in output_generator:
+                trans_chunk = output_chunk_transformer(chunk)
+                if response_pcm:
+                    trans_chunk = trans_chunk.to(torch.float32)
+                    trans_chunk = (
+                        (trans_chunk * 32767).clamp(-32768, 32767).to(torch.int16)
+                    )
+                writer.write_audio_chunk(0, trans_chunk)
+                new_last_pos = out.tell()
+                if new_last_pos != last_pos:
+                    out.seek(last_pos)
+                    encoded_bytes = out.read()
+                    if response_pcm and strip_header:
+                        # http://soundfile.sapp.org/doc/WaveFormat
+                        yield _extract_pcm_from_wav_bytes(encoded_bytes)
+                        strip_header = False
+                    else:
+                        yield encoded_bytes
+                    last_pos = new_last_pos
+
+
+def audio_to_bytes(response_format: str, sample_rate: int, tensor: "torch.Tensor"):
+    import torchaudio
+
+    response_pcm = response_format.lower() == "pcm"
+    with io.BytesIO() as out:
+        if response_pcm:
+            logger.info(f"PCM output, num_channels: 1, sample_rate: {sample_rate}")
+            torchaudio.save(out, tensor, sample_rate, format="wav", encoding="PCM_S")
+            # http://soundfile.sapp.org/doc/WaveFormat
+            return _extract_pcm_from_wav_bytes(out.getvalue())
+        else:
+            torchaudio.save(out, tensor, sample_rate, format=response_format)
+            return out.getvalue()
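Since the new "pcm" response format strips the WAV header and returns raw mono 16-bit samples (s16 in the streaming path, PCM_S in the blocking one), a consumer only needs the sample rate to re-wrap the data. A small sketch using the standard-library wave module; the function name is ours, not part of xinference:

import wave

def pcm_to_wav(pcm_bytes: bytes, path: str, sample_rate: int) -> None:
    """Wrap raw PCM returned by response_format="pcm" back into a WAV container."""
    with wave.open(path, "wb") as wav_file:
        wav_file.setnchannels(1)   # the helpers above always write one channel
        wav_file.setsampwidth(2)   # 16-bit signed samples
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm_bytes)

# e.g. pcm_to_wav(joined_chunks, "out.wav", 24000) for ChatTTS output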
xinference/model/core.py CHANGED
@@ -97,6 +97,7 @@ def create_model_instance(
         devices,
         model_uid,
         model_name,
+        model_engine,
         download_hub,
         model_path,
         **kwargs,
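The extra model_engine argument threads the engine choice through create_model_instance for non-LLM model types as well. A hedged sketch of what that enables from the client side, assuming launch_model forwards model_engine for embedding models the same way it already does for LLMs:

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
uid = client.launch_model(
    model_name="bge-m3",
    model_type="embedding",
    model_engine="sentence_transformers",  # or "flag" / "vllm"; see the engine registry below
)
embedding = client.get_model(uid).create_embedding("hello world")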
xinference/model/embedding/__init__.py CHANGED
@@ -16,7 +16,7 @@ import codecs
 import json
 import os
 import warnings
-from typing import Any, Dict
+from typing import Any, Dict, List
 
 from .core import (
     EMBEDDING_MODEL_DESCRIPTIONS,
@@ -32,9 +32,15 @@ from .custom import (
     register_embedding,
     unregister_embedding,
 )
-
-BUILTIN_EMBEDDING_MODELS: Dict[str, Any] = {}
-MODELSCOPE_EMBEDDING_MODELS: Dict[str, Any] = {}
+from .embed_family import (
+    BUILTIN_EMBEDDING_MODELS,
+    EMBEDDING_ENGINES,
+    FLAG_EMBEDDER_CLASSES,
+    MODELSCOPE_EMBEDDING_MODELS,
+    SENTENCE_TRANSFORMER_CLASSES,
+    SUPPORTED_ENGINES,
+    VLLM_CLASSES,
+)
 
 
 def register_custom_model():
@@ -55,12 +61,56 @@ def register_custom_model():
                warnings.warn(f"{user_defined_embedding_dir}/{f} has error, {e}")
 
 
+def generate_engine_config_by_model_name(model_spec: "EmbeddingModelSpec"):
+    model_name = model_spec.model_name
+    engines: Dict[str, List[Dict[str, Any]]] = EMBEDDING_ENGINES.get(
+        model_name, {}
+    )  # structure for engine query
+    for engine in SUPPORTED_ENGINES:
+        CLASSES = SUPPORTED_ENGINES[engine]
+        for cls in CLASSES:
+            # Every engine needs to implement match method
+            if cls.match(model_spec):
+                # we only match the first class for an engine
+                engines[engine] = [
+                    {
+                        "model_name": model_name,
+                        "embedding_class": cls,
+                    }
+                ]
+                break
+    EMBEDDING_ENGINES[model_name] = engines
+
+
+# will be called in xinference/model/__init__.py
 def _install():
-    load_model_family_from_json("model_spec.json", BUILTIN_EMBEDDING_MODELS)
-    load_model_family_from_json(
-        "model_spec_modelscope.json", MODELSCOPE_EMBEDDING_MODELS
+    _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
+    _model_spec_modelscope_json = os.path.join(
+        os.path.dirname(__file__), "model_spec_modelscope.json"
+    )
+    ################### HuggingFace Model List Info Init ###################
+    BUILTIN_EMBEDDING_MODELS.update(
+        dict(
+            (spec["model_name"], EmbeddingModelSpec(**spec))
+            for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
+        )
+    )
+    for model_name, model_spec in BUILTIN_EMBEDDING_MODELS.items():
+        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+
+    ################### ModelScope Model List Info Init ###################
+    MODELSCOPE_EMBEDDING_MODELS.update(
+        dict(
+            (spec["model_name"], EmbeddingModelSpec(**spec))
+            for spec in json.load(
+                codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
+            )
+        )
     )
+    for model_name, model_spec in MODELSCOPE_EMBEDDING_MODELS.items():
+        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
 
+    # TODO: consider support more download hub in future...
     # register model description after recording model revision
     for model_spec_info in [BUILTIN_EMBEDDING_MODELS, MODELSCOPE_EMBEDDING_MODELS]:
         for model_name, model_spec in model_spec_info.items():
@@ -77,16 +127,22 @@ def _install():
             generate_embedding_description(ud_embedding)
         )
 
+    from .flag.core import FlagEmbeddingModel
+    from .sentence_transformers.core import SentenceTransformerEmbeddingModel
+    from .vllm.core import VLLMEmbeddingModel
 
-def load_model_family_from_json(json_filename, target_families):
-    json_path = os.path.join(os.path.dirname(__file__), json_filename)
-    target_families.update(
-        dict(
-            (spec["model_name"], EmbeddingModelSpec(**spec))
-            for spec in json.load(codecs.open(json_path, "r", encoding="utf-8"))
-        )
-    )
-    for model_name, model_spec in target_families.items():
-        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+    SENTENCE_TRANSFORMER_CLASSES.extend([SentenceTransformerEmbeddingModel])
+    FLAG_EMBEDDER_CLASSES.extend([FlagEmbeddingModel])
+    VLLM_CLASSES.extend([VLLMEmbeddingModel])
+
+    SUPPORTED_ENGINES["sentence_transformers"] = SENTENCE_TRANSFORMER_CLASSES
+    SUPPORTED_ENGINES["flag"] = FLAG_EMBEDDER_CLASSES
+    SUPPORTED_ENGINES["vllm"] = VLLM_CLASSES
+
+    # Init embedding engine
+    for model_infos in [BUILTIN_EMBEDDING_MODELS, MODELSCOPE_EMBEDDING_MODELS]:
+        for model_spec in model_infos.values():
+            generate_engine_config_by_model_name(model_spec)
 
-    del json_path
+    del _model_spec_json
+    del _model_spec_modelscope_json
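Once _install() has run, EMBEDDING_ENGINES maps each model name to the engines whose match() accepted it, with one implementation class recorded per engine. A small lookup sketch; the helper function is ours, only the registry structure comes from the code above:

from xinference.model.embedding.embed_family import EMBEDDING_ENGINES

def resolve_embedding_class(model_name: str, engine: str):
    engines = EMBEDDING_ENGINES.get(model_name, {})
    if engine not in engines:
        raise ValueError(f"model {model_name} is not supported by engine {engine}")
    # only the first matching class is kept per engine (note the break above)
    return engines[engine][0]["embedding_class"]

# e.g. resolve_embedding_class("bge-m3", "sentence_transformers")
# -> SentenceTransformerEmbeddingModel, once _install() has populated the registry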