xinference 1.6.0.post1__py3-none-any.whl → 1.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- xinference/_version.py +3 -3
- xinference/client/restful/restful_client.py +1 -1
- xinference/conftest.py +0 -7
- xinference/core/media_interface.py +9 -8
- xinference/core/model.py +13 -6
- xinference/core/scheduler.py +1 -10
- xinference/core/worker.py +0 -10
- xinference/model/audio/model_spec.json +53 -1
- xinference/model/audio/model_spec_modelscope.json +57 -1
- xinference/model/embedding/core.py +19 -11
- xinference/model/image/model_spec.json +10 -1
- xinference/model/image/model_spec_modelscope.json +20 -0
- xinference/model/llm/__init__.py +6 -54
- xinference/model/llm/core.py +19 -5
- xinference/model/llm/llama_cpp/core.py +59 -3
- xinference/model/llm/llama_cpp/memory.py +455 -0
- xinference/model/llm/llm_family.json +185 -397
- xinference/model/llm/llm_family.py +88 -16
- xinference/model/llm/llm_family_modelscope.json +199 -421
- xinference/model/llm/llm_family_openmind_hub.json +0 -34
- xinference/model/llm/sglang/core.py +4 -0
- xinference/model/llm/transformers/__init__.py +27 -6
- xinference/model/llm/transformers/chatglm.py +4 -2
- xinference/model/llm/transformers/core.py +49 -28
- xinference/model/llm/transformers/deepseek_v2.py +6 -49
- xinference/model/llm/transformers/gemma3.py +119 -164
- xinference/{thirdparty/omnilmm/train → model/llm/transformers/multimodal}/__init__.py +1 -1
- xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
- xinference/model/llm/transformers/multimodal/core.py +205 -0
- xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
- xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
- xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
- xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
- xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
- xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
- xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
- xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
- xinference/model/llm/transformers/opt.py +4 -2
- xinference/model/llm/transformers/utils.py +6 -37
- xinference/model/llm/vllm/core.py +4 -0
- xinference/model/rerank/core.py +7 -1
- xinference/model/rerank/utils.py +17 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.ddf9eaee.js +3 -0
- xinference/web/ui/build/static/js/main.ddf9eaee.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/12e637ed5fa9ca6491b03892b6949c03afd4960fe36ac25744488e7e1982aa19.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/77ac2665a784e99501ae95d32ef5937837a0439a47e965d291b38e99cb619f5b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d4ed4e82bfe69915999ec83f5feaa4301c75ecc6bdf1c78f2d03e4671ecbefc8.json +1 -0
- xinference/web/ui/src/locales/en.json +3 -1
- xinference/web/ui/src/locales/zh.json +3 -1
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/METADATA +6 -4
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/RECORD +60 -76
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/WHEEL +1 -1
- xinference/model/llm/transformers/cogvlm2.py +0 -442
- xinference/model/llm/transformers/cogvlm2_video.py +0 -333
- xinference/model/llm/transformers/deepseek_vl.py +0 -280
- xinference/model/llm/transformers/glm_edge_v.py +0 -213
- xinference/model/llm/transformers/intern_vl.py +0 -526
- xinference/model/llm/transformers/internlm2.py +0 -94
- xinference/model/llm/transformers/minicpmv25.py +0 -193
- xinference/model/llm/transformers/omnilmm.py +0 -132
- xinference/model/llm/transformers/qwen2_audio.py +0 -179
- xinference/model/llm/transformers/qwen_vl.py +0 -360
- xinference/thirdparty/omnilmm/LICENSE +0 -201
- xinference/thirdparty/omnilmm/__init__.py +0 -0
- xinference/thirdparty/omnilmm/chat.py +0 -218
- xinference/thirdparty/omnilmm/constants.py +0 -4
- xinference/thirdparty/omnilmm/conversation.py +0 -332
- xinference/thirdparty/omnilmm/model/__init__.py +0 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
- xinference/thirdparty/omnilmm/model/resampler.py +0 -166
- xinference/thirdparty/omnilmm/model/utils.py +0 -578
- xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
- xinference/thirdparty/omnilmm/utils.py +0 -134
- xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
- xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
- /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.ddf9eaee.js.LICENSE.txt} +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED

@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2025-05-
+ "date": "2025-05-30T19:36:43+0800",
 "dirty": false,
 "error": null,
- "full-revisionid": "
- "version": "1.6.
+ "full-revisionid": "72cc5e39040bdc49981b240c2b59af998554a75f",
+ "version": "1.6.1"
 }
 ''' # END VERSION_JSON
 
xinference/client/restful/restful_client.py
CHANGED

@@ -1017,7 +1017,7 @@ class Client:
         model_path: Optional[str]
             Model path, if gguf format, should be the file path, otherwise, should be directory of the model.
         **kwargs:
-            Any other parameters been specified.
+            Any other parameters been specified. e.g. multimodal_projector for multimodal inference with the llama.cpp backend.
 
         Returns
         -------
xinference/conftest.py
CHANGED

xinference/core/media_interface.py
CHANGED

@@ -19,7 +19,7 @@ import os
 import threading
 import time
 import uuid
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import gradio as gr
 import PIL.Image
@@ -463,7 +463,7 @@ class MediaInterface:
 
     def image2video_interface(self) -> "gr.Blocks":
        def image_generate_video(
-            image: "PIL.Image",
+            image: "PIL.Image.Image",
            prompt: str,
            negative_prompt: str,
            num_frames: int,
@@ -653,13 +653,14 @@ class MediaInterface:
            with open(prompt_speech_file, "rb") as f:
                prompt_speech_bytes = f.read()
 
+            kw: Dict[str, Any] = {}
+            if prompt_speech_bytes:
+                kw["prompt_speech"] = prompt_speech_bytes
+            if prompt_text:
+                kw["prompt_text"] = prompt_text
+
            response = model.speech(
-                input=input_text,
-                voice=voice,
-                speed=speed,
-                response_format="mp3",
-                prompt_speech=prompt_speech_bytes,
-                prompt_text=prompt_text,
+                input=input_text, voice=voice, speed=speed, response_format="mp3", **kw
            )
 
            # Write to a temp .mp3 file and return its path
xinference/core/model.py
CHANGED

@@ -71,12 +71,8 @@ except ImportError:
     OutOfMemoryError = _OutOfMemoryError
 
 
-
-
-    "cogvlm2",
-    "glm-4v",
-    "MiniCPM-V-2.6",
-]
+# !!!!! DO NOT add model_name to this list, using `register_batching_multimodal_models` below instead.
+XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = []
 
 XINFERENCE_TEXT_TO_IMAGE_BATCHING_ALLOWED_MODELS = ["FLUX.1-dev", "FLUX.1-schnell"]
 XINFERENCE_TEST_OUT_OF_MEMORY_ERROR = bool(
@@ -84,6 +80,16 @@ XINFERENCE_TEST_OUT_OF_MEMORY_ERROR = bool(
 )
 
 
+def register_batching_multimodal_models(*model_names: str):
+    def decorator(cls):
+        for name in model_names:
+            if name not in XINFERENCE_BATCHING_ALLOWED_VISION_MODELS:
+                XINFERENCE_BATCHING_ALLOWED_VISION_MODELS.append(name)
+        return cls
+
+    return decorator
+
+
 def request_limit(fn):
     """
     Used by ModelActor.
@@ -977,6 +983,7 @@ class ModelActor(xo.StatelessActor, CancelMixin):
                response_format,
                temperature,
                timestamp_granularities,
+                **kwargs,
            )
        raise AttributeError(
            f"Model {self._model.model_spec} is not for creating transcriptions."
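For context, a minimal sketch of how a model class is expected to opt in to continuous batching with the decorator introduced above; the class and model names here are illustrative placeholders, not classes shipped in 1.6.1:

from xinference.core.model import register_batching_multimodal_models


@register_batching_multimodal_models("my-vision-model")
class MyVisionChatModel:
    """Illustrative stand-in for a transformers-based multimodal chat model."""
    ...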
xinference/core/scheduler.py
CHANGED

@@ -272,15 +272,6 @@ class InferenceRequest:
        )
 
 
-def _get_valid_batch_kv_cache(cache, skipped_indexes: Set[int]):
-    batch_size = cache.key_cache[0].shape[0]
-    batch_slices = [num for num in range(batch_size) if num not in skipped_indexes]
-    for idx in range(len(cache)):
-        cache.key_cache[idx] = cache.key_cache[idx][batch_slices, ::].contiguous()
-        cache.value_cache[idx] = cache.value_cache[idx][batch_slices, ::].contiguous()
-    return cache
-
-
 class SchedulerActor(xo.StatelessActor):
     @classmethod
     def gen_uid(cls, model_uid: str, replica_id: str):
@@ -409,7 +400,7 @@ class SchedulerActor(xo.StatelessActor):
        # Some requests have been completed. Batch size needs to be reduced for kv cache.
        if stopped_batch_indexes and len(self._running_queue) > 0:
            kv_cache = self._running_queue[0].kv_cache
-            reduced_kv_cache = _get_valid_batch_kv_cache(
+            reduced_kv_cache = self._model.build_reduced_kv_cache(
                kv_cache, stopped_batch_indexes
            )
            for r in self._running_queue:
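The KV-cache slicing itself has not disappeared; the scheduler now delegates it to the model object. A rough sketch of what build_reduced_kv_cache is expected to do, reconstructed from the helper removed above (its actual body in 1.6.1 lives on the model class and may differ):

from typing import Set


def build_reduced_kv_cache(cache, skipped_indexes: Set[int]):
    # Keep only the batch rows whose requests are still running.
    batch_size = cache.key_cache[0].shape[0]
    keep = [num for num in range(batch_size) if num not in skipped_indexes]
    for idx in range(len(cache)):
        cache.key_cache[idx] = cache.key_cache[idx][keep, ::].contiguous()
        cache.value_cache[idx] = cache.value_cache[idx][keep, ::].contiguous()
    return cache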
xinference/core/worker.py
CHANGED

@@ -533,16 +533,6 @@ class WorkerActor(xo.StatelessActor):
                existing_model_uids.append(rep_uid)
            if idx in self._gpu_to_embedding_model_uids:
                existing_model_uids.extend(self._gpu_to_embedding_model_uids[idx])
-            # If user has run the vLLM model on the GPU that was forced to be specified,
-            # it is not possible to force this GPU to be allocated again
-            if idx in self._user_specified_gpu_to_model_uids:
-                for rep_uid, _ in self._user_specified_gpu_to_model_uids[idx]:
-                    is_vllm_model = await self.is_model_vllm_backend(rep_uid)
-                    if is_vllm_model:
-                        raise RuntimeError(
-                            f"User specified GPU index {idx} has been occupied with a vLLM model: {rep_uid}, "
-                            f"therefore cannot allocate GPU memory for a new model."
-                        )
 
            if existing_model_uids:
                logger.warning(
xinference/model/audio/model_spec.json
CHANGED

@@ -218,13 +218,65 @@
       "batch_size_s": 300
     }
   },
+  {
+    "model_name": "paraformer-zh-hotword",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
+    "model_revision": "26d622993683d7b0c517ee5ec9c1c8bdde76e324",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-long",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "b6d8cb81645e34056cd3dda41e5624a740587de3",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-spk",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn",
+    "model_revision": "36abd64af4392fe02bf76453bc86c081cf1ca6da",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
   {
     "model_name": "ChatTTS",
     "model_family": "ChatTTS",
     "model_id": "2Noise/ChatTTS",
     "model_revision": "1a3c04a8b0651689bd9242fbb55b1f4b5a9aef84",
     "model_ability": ["text2audio"],
-    "multilingual": true
+    "multilingual": true,
+    "virtualenv": {
+      "packages": [
+        "ChatTTS>=0.2.1",
+        "#system_torch#",
+        "#system_numpy#"
+      ]
+    }
   },
   {
     "model_name": "CosyVoice-300M",
xinference/model/audio/model_spec_modelscope.json
CHANGED

@@ -51,6 +51,55 @@
     "model_name": "paraformer-zh",
     "model_family": "funasr",
     "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-hotword",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "hotword": "",
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-long",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-spk",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
     "model_id": "iic/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn",
     "model_revision": "master",
     "model_ability": ["audio2text"],
@@ -70,7 +119,14 @@
     "model_id": "AI-ModelScope/ChatTTS",
     "model_revision": "master",
     "model_ability": ["text2audio"],
-    "multilingual": true
+    "multilingual": true,
+    "virtualenv": {
+      "packages": [
+        "ChatTTS>=0.2.1",
+        "#system_torch#",
+        "#system_numpy#"
+      ]
+    }
   },
   {
     "model_name": "CosyVoice-300M",
xinference/model/embedding/core.py
CHANGED

@@ -651,19 +651,27 @@ class EmbeddingModel:
            img = Image.open(image_data)
            return img
 
-        objs: list[
-
-
-
-
-
-        if
-
-
+        objs: list[str] = []
+        if isinstance(sentences, str):
+            objs.append(sentences)
+        else:
+            for item in sentences:
+                if isinstance(item, dict):
+                    if item.get("text") is not None:
+                        objs.append(item["text"])
+                    elif item.get("image") is not None:
+                        if re.match(r"^data:image/.+;base64,", item["image"]):
+                            image = base64_to_image(item["image"])
+                            objs.append(image)
+                        else:
+                            objs.append(item["image"])
                    else:
-
+                        raise ValueError("Please check the input data.")
+                elif isinstance(item, str):
+                    objs.append(item)
                else:
-
+                    raise ValueError("Please check the input data.")
+
        all_embeddings, all_token_nums = encode(
            self._model,
            objs,
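As the rewritten parsing above shows, the input may be a single string, a list of strings, or a list of dicts carrying either a "text" or an "image" entry, with images given as URLs/paths or as base64 data URIs. A hypothetical input illustrating the accepted shapes; the values are placeholders:

sentences = [
    "a plain text sentence",
    {"text": "another text item"},
    {"image": "https://example.com/cat.png"},
    {"image": "data:image/png;base64,iVBORw0KGgo..."},  # decoded via base64_to_image
]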
xinference/model/image/model_spec.json
CHANGED

@@ -303,7 +303,16 @@
     "model_ability": [
       "text2image",
       "image2image"
-    ]
+    ],
+    "default_model_config": {
+      "variant": "fp16"
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.30.0",
+        "#system_numpy#"
+      ]
+    }
   },
   {
     "model_name": "stable-diffusion-inpainting",
xinference/model/image/model_spec_modelscope.json
CHANGED

@@ -307,6 +307,26 @@
       }
     ]
   },
+  {
+    "model_name": "kolors",
+    "model_family": "stable_diffusion",
+    "model_hub": "modelscope",
+    "model_id": "JunHowie/Kolors-diffusers",
+    "model_revision": "master",
+    "model_ability": [
+      "text2image",
+      "image2image"
+    ],
+    "default_model_config": {
+      "variant": "fp16"
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.30.0",
+        "#system_numpy#"
+      ]
+    }
+  },
   {
     "model_name": "GOT-OCR2_0",
     "model_family": "ocr",
xinference/model/llm/__init__.py
CHANGED

@@ -73,7 +73,7 @@ def generate_engine_config_by_model_family(model_family):
        model_size_in_billions = spec.model_size_in_billions
        quantizations = spec.quantizations
        for quantization in quantizations:
-            # traverse all supported engines to match the name, format, size in billions and
+            # traverse all supported engines to match the name, format, size in billions and quantization of model
            for engine in SUPPORTED_ENGINES:
                if not check_format_with_engine(
                    model_format, engine
@@ -107,6 +107,10 @@ def generate_engine_config_by_model_family(model_family):
                            "llm_class": cls,
                        }
                    )
+                    if hasattr(spec, "multimodal_projectors"):
+                        engine_params[-1][
+                            "multimodal_projectors"
+                        ] = spec.multimodal_projectors
                    engines[engine] = engine_params
                    break
    LLM_ENGINES[model_name] = engines
@@ -163,36 +167,9 @@ def _install():
     from .lmdeploy.core import LMDeployChatModel, LMDeployModel
     from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
     from .sglang.core import SGLANGChatModel, SGLANGModel, SGLANGVisionModel
-    from .transformers.chatglm import ChatglmPytorchChatModel
-    from .transformers.cogagent import CogAgentChatModel
-    from .transformers.cogvlm2 import CogVLM2Model
-    from .transformers.cogvlm2_video import CogVLM2VideoModel
     from .transformers.core import PytorchChatModel, PytorchModel
-    from .transformers.deepseek_v2 import (
-        DeepSeekV2PytorchChatModel,
-        DeepSeekV2PytorchModel,
-    )
-    from .transformers.deepseek_vl import DeepSeekVLChatModel
-    from .transformers.deepseek_vl2 import DeepSeekVL2ChatModel
-    from .transformers.gemma3 import Gemma3ChatModel, Gemma3TextChatModel
-    from .transformers.glm4v import Glm4VModel
-    from .transformers.glm_edge_v import GlmEdgeVModel
-    from .transformers.minicpmv25 import MiniCPMV25Model
-    from .transformers.minicpmv26 import MiniCPMV26Model
-    from .transformers.opt import OptPytorchModel
-    from .transformers.ovis2 import Ovis2ChatModel
-    from .transformers.qwen2_audio import Qwen2AudioChatModel
-    from .transformers.qwen_vl import QwenVLChatModel
     from .vllm.core import VLLMChatModel, VLLMModel, VLLMVisionModel
 
-    try:
-        from .transformers.omnilmm import OmniLMMModel
-    except ImportError as e:
-        # For quite old transformers version,
-        # import will generate error
-        OmniLMMModel = None
-        warnings.warn(f"Cannot import OmniLLMModel due to reason: {e}")
-
     # register llm classes.
     LLAMA_CLASSES.extend(
         [
@@ -203,32 +180,7 @@ def _install():
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
     MLX_CLASSES.extend([MLXModel, MLXChatModel, MLXVisionModel])
     LMDEPLOY_CLASSES.extend([LMDeployModel, LMDeployChatModel])
-    TRANSFORMERS_CLASSES.extend(
-        [
-            ChatglmPytorchChatModel,
-            PytorchChatModel,
-            QwenVLChatModel,
-            Qwen2AudioChatModel,
-            DeepSeekVLChatModel,
-            DeepSeekVL2ChatModel,
-            PytorchModel,
-            CogVLM2Model,
-            CogVLM2VideoModel,
-            MiniCPMV25Model,
-            MiniCPMV26Model,
-            Glm4VModel,
-            DeepSeekV2PytorchModel,
-            DeepSeekV2PytorchChatModel,
-            OptPytorchModel,
-            GlmEdgeVModel,
-            CogAgentChatModel,
-            Gemma3TextChatModel,
-            Gemma3ChatModel,
-            Ovis2ChatModel,
-        ]
-    )
-    if OmniLMMModel:  # type: ignore
-        TRANSFORMERS_CLASSES.append(OmniLMMModel)
+    TRANSFORMERS_CLASSES.extend([PytorchChatModel, PytorchModel])
 
     # support 4 engines for now
     SUPPORTED_ENGINES["vLLM"] = VLLM_CLASSES
xinference/model/llm/core.py
CHANGED

@@ -160,12 +160,14 @@ class LLMDescription(ModelDescription):
         llm_family: "LLMFamilyV1",
         llm_spec: "LLMSpecV1",
         quantization: Optional[str],
+        multimodal_projector: Optional[str] = None,
         model_path: Optional[str] = None,
     ):
         super().__init__(address, devices, model_path=model_path)
         self._llm_family = llm_family
         self._llm_spec = llm_spec
         self._quantization = quantization
+        self._multimodal_projector = multimodal_projector
 
     @property
     def spec(self):
@@ -185,6 +187,7 @@ class LLMDescription(ModelDescription):
            "model_family": self._llm_family.model_family
            or self._llm_family.model_name,
            "quantization": self._quantization,
+            "multimodal_projector": self._multimodal_projector,
            "model_hub": self._llm_spec.model_hub,
            "revision": self._llm_spec.model_revision,
            "context_length": self._llm_family.context_length,
@@ -204,6 +207,7 @@ class LLMDescription(ModelDescription):
            "model_file_location": model_file_location,
            "cache_status": cache_status,
            "quantization": self._quantization,
+            "multimodal_projector": self._multimodal_projector,
            "model_format": self._llm_spec.model_format,
            "model_size_in_billions": self._llm_spec.model_size_in_billions,
        }
@@ -212,10 +216,19 @@ class LLMDescription(ModelDescription):
 def generate_llm_description(llm_family: "LLMFamilyV1") -> Dict[str, List[Dict]]:
     res = defaultdict(list)
     for spec in llm_family.model_specs:
+        multimodal_projectors = getattr(spec, "multimodal_projectors", None)
         for q in spec.quantizations:
-            res[llm_family.model_name].append(
-                LLMDescription(None, None, llm_family, spec, q).to_version_info()
-            )
+            if multimodal_projectors:
+                for mmproj in multimodal_projectors:
+                    res[llm_family.model_name].append(
+                        LLMDescription(
+                            None, None, llm_family, spec, q, mmproj
+                        ).to_version_info()
+                    )
+            else:
+                res[llm_family.model_name].append(
+                    LLMDescription(None, None, llm_family, spec, q).to_version_info()
+                )
     return res
 
 
@@ -260,8 +273,9 @@ def create_llm_model_instance(
     )
     logger.debug(f"Launching {model_uid} with {llm_cls.__name__}")
 
+    multimodal_projector = kwargs.get("multimodal_projector")
     if not model_path:
-        model_path = cache(llm_family, llm_spec, quantization)
+        model_path = cache(llm_family, llm_spec, quantization, multimodal_projector)
 
     peft_model = peft_model_config.peft_model if peft_model_config else None
     if peft_model is not None:
@@ -288,5 +302,5 @@ def create_llm_model_instance(
         model_uid, llm_family, llm_spec, quantization, model_path, kwargs
     )
     return model, LLMDescription(
-        subpool_addr, devices, llm_family, llm_spec, quantization
+        subpool_addr, devices, llm_family, llm_spec, quantization, multimodal_projector
     )