xinference 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic. Click here for more details.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +49 -65
- xinference/core/model.py +77 -19
- xinference/core/supervisor.py +81 -10
- xinference/core/utils.py +2 -2
- xinference/core/worker.py +32 -0
- xinference/model/image/model_spec.json +18 -0
- xinference/model/image/model_spec_modelscope.json +20 -0
- xinference/model/llm/__init__.py +2 -0
- xinference/model/llm/llm_family.json +96 -0
- xinference/model/llm/llm_family_modelscope.json +99 -0
- xinference/model/llm/mlx/core.py +23 -73
- xinference/model/llm/transformers/cogagent.py +272 -0
- xinference/model/llm/transformers/core.py +1 -0
- xinference/model/llm/transformers/qwen2_vl.py +10 -1
- xinference/model/llm/utils.py +27 -3
- xinference/model/llm/vllm/core.py +37 -7
- xinference/model/llm/vllm/xavier/__init__.py +13 -0
- xinference/model/llm/vllm/xavier/allocator.py +74 -0
- xinference/model/llm/vllm/xavier/block.py +112 -0
- xinference/model/llm/vllm/xavier/block_manager.py +71 -0
- xinference/model/llm/vllm/xavier/block_tracker.py +116 -0
- xinference/model/llm/vllm/xavier/engine.py +247 -0
- xinference/model/llm/vllm/xavier/executor.py +132 -0
- xinference/model/llm/vllm/xavier/scheduler.py +422 -0
- xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
- xinference/model/llm/vllm/xavier/test/test_xavier.py +122 -0
- xinference/model/llm/vllm/xavier/transfer.py +298 -0
- xinference/model/video/diffusers.py +14 -0
- xinference/model/video/model_spec.json +15 -0
- xinference/model/video/model_spec_modelscope.json +16 -0
- xinference/types.py +13 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
- xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
- xinference/web/ui/build/static/js/main.1eb206d1.js +3 -0
- xinference/web/ui/build/static/js/main.1eb206d1.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2213d49de260e1f67c888081b18f120f5225462b829ae57c9e05a05cec83689d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +67 -3
- xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
- xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
- xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
- xinference/web/ui/node_modules/i18next/package.json +129 -0
- xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
- xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
- xinference/web/ui/node_modules/react-i18next/package.json +162 -0
- xinference/web/ui/node_modules/void-elements/package.json +34 -0
- xinference/web/ui/package-lock.json +69 -3
- xinference/web/ui/package.json +2 -0
- xinference/web/ui/src/locales/en.json +186 -0
- xinference/web/ui/src/locales/zh.json +186 -0
- {xinference-1.1.1.dist-info → xinference-1.2.0.dist-info}/METADATA +9 -6
- {xinference-1.1.1.dist-info → xinference-1.2.0.dist-info}/RECORD +102 -56
- xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
- xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
- xinference/web/ui/build/static/js/main.4eb4ee80.js +0 -3
- xinference/web/ui/build/static/js/main.4eb4ee80.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8c5eeb02f772d02cbe8b89c05428d0dd41a97866f75f7dc1c2164a67f5a1cf98.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
- /xinference/web/ui/build/static/js/{main.4eb4ee80.js.LICENSE.txt → main.1eb206d1.js.LICENSE.txt} +0 -0
- {xinference-1.1.1.dist-info → xinference-1.2.0.dist-info}/LICENSE +0 -0
- {xinference-1.1.1.dist-info → xinference-1.2.0.dist-info}/WHEEL +0 -0
- {xinference-1.1.1.dist-info → xinference-1.2.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.1.1.dist-info → xinference-1.2.0.dist-info}/top_level.txt +0 -0
xinference/core/worker.py
CHANGED
|
@@ -22,6 +22,7 @@ import signal
|
|
|
22
22
|
import threading
|
|
23
23
|
import time
|
|
24
24
|
from collections import defaultdict
|
|
25
|
+
from dataclasses import dataclass
|
|
25
26
|
from logging import getLogger
|
|
26
27
|
from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
|
|
27
28
|
|
|
@@ -58,6 +59,11 @@ else:
|
|
|
58
59
|
MODEL_ACTOR_AUTO_RECOVER_LIMIT = None
|
|
59
60
|
|
|
60
61
|
|
|
62
|
+
@dataclass
|
|
63
|
+
class ModelStatus:
|
|
64
|
+
last_error: str = ""
|
|
65
|
+
|
|
66
|
+
|
|
61
67
|
class WorkerActor(xo.StatelessActor):
|
|
62
68
|
def __init__(
|
|
63
69
|
self,
|
|
@@ -90,6 +96,7 @@ class WorkerActor(xo.StatelessActor):
|
|
|
90
96
|
# attributes maintained after model launched:
|
|
91
97
|
self._model_uid_to_model: Dict[str, xo.ActorRefType["ModelActor"]] = {}
|
|
92
98
|
self._model_uid_to_model_spec: Dict[str, ModelDescription] = {}
|
|
99
|
+
self._model_uid_to_model_status: Dict[str, ModelStatus] = {}
|
|
93
100
|
self._gpu_to_model_uid: Dict[int, str] = {}
|
|
94
101
|
self._gpu_to_embedding_model_uids: Dict[int, Set[str]] = defaultdict(set)
|
|
95
102
|
# Dict structure: gpu_index: {(replica_model_uid, model_type)}
|
|
@@ -866,6 +873,9 @@ class WorkerActor(xo.StatelessActor):
|
|
|
866
873
|
)
|
|
867
874
|
|
|
868
875
|
try:
|
|
876
|
+
xavier_config: Optional[Dict] = kwargs.pop("xavier_config", None)
|
|
877
|
+
if xavier_config is not None:
|
|
878
|
+
xavier_config["rank_address"] = subpool_address
|
|
869
879
|
model, model_description = await asyncio.to_thread(
|
|
870
880
|
create_model_instance,
|
|
871
881
|
subpool_address,
|
|
@@ -893,6 +903,7 @@ class WorkerActor(xo.StatelessActor):
|
|
|
893
903
|
model=model,
|
|
894
904
|
model_description=model_description,
|
|
895
905
|
request_limits=request_limits,
|
|
906
|
+
xavier_config=xavier_config,
|
|
896
907
|
)
|
|
897
908
|
await model_ref.load()
|
|
898
909
|
except:
|
|
@@ -902,6 +913,7 @@ class WorkerActor(xo.StatelessActor):
|
|
|
902
913
|
raise
|
|
903
914
|
self._model_uid_to_model[model_uid] = model_ref
|
|
904
915
|
self._model_uid_to_model_spec[model_uid] = model_description
|
|
916
|
+
self._model_uid_to_model_status[model_uid] = ModelStatus()
|
|
905
917
|
self._model_uid_to_addr[model_uid] = subpool_address
|
|
906
918
|
self._model_uid_to_recover_count.setdefault(
|
|
907
919
|
model_uid, MODEL_ACTOR_AUTO_RECOVER_LIMIT
|
|
@@ -921,6 +933,7 @@ class WorkerActor(xo.StatelessActor):
|
|
|
921
933
|
origin_uid,
|
|
922
934
|
{"model_ability": abilities, "status": LaunchStatus.READY.name},
|
|
923
935
|
)
|
|
936
|
+
return subpool_address
|
|
924
937
|
|
|
925
938
|
@log_async(logger=logger, level=logging.INFO)
|
|
926
939
|
async def terminate_model(self, model_uid: str, is_model_die=False):
|
|
@@ -976,6 +989,7 @@ class WorkerActor(xo.StatelessActor):
|
|
|
976
989
|
status = LaunchStatus.ERROR.name
|
|
977
990
|
else:
|
|
978
991
|
status = LaunchStatus.TERMINATED.name
|
|
992
|
+
self._model_uid_to_model_status.pop(model_uid, None)
|
|
979
993
|
|
|
980
994
|
if self._status_guard_ref is None:
|
|
981
995
|
_ = await self.get_supervisor_ref()
|
|
@@ -1010,6 +1024,9 @@ class WorkerActor(xo.StatelessActor):
|
|
|
1010
1024
|
|
|
1011
1025
|
@log_sync(logger=logger)
|
|
1012
1026
|
def get_model(self, model_uid: str) -> xo.ActorRefType["ModelActor"]:
|
|
1027
|
+
model_status = self._model_uid_to_model_status.get(model_uid)
|
|
1028
|
+
if model_status and model_status.last_error:
|
|
1029
|
+
raise Exception(model_status.last_error)
|
|
1013
1030
|
model_ref = self._model_uid_to_model.get(model_uid, None)
|
|
1014
1031
|
if model_ref is None:
|
|
1015
1032
|
raise ValueError(f"Model not found, uid: {model_uid}")
|
|
@@ -1138,6 +1155,21 @@ class WorkerActor(xo.StatelessActor):
|
|
|
1138
1155
|
}
|
|
1139
1156
|
return ret
|
|
1140
1157
|
|
|
1158
|
+
def update_model_status(self, model_uid: str, **kwargs):
|
|
1159
|
+
model_status = self._model_uid_to_model_status.get(model_uid)
|
|
1160
|
+
if model_status is not None:
|
|
1161
|
+
for k, v in kwargs.items():
|
|
1162
|
+
setattr(model_status, k, v)
|
|
1163
|
+
|
|
1164
|
+
def get_model_status(self, model_uid: str):
|
|
1165
|
+
return self._model_uid_to_model_status.get(model_uid)
|
|
1166
|
+
|
|
1141
1167
|
@staticmethod
|
|
1142
1168
|
def record_metrics(name, op, kwargs):
|
|
1143
1169
|
record_metrics(name, op, kwargs)
|
|
1170
|
+
|
|
1171
|
+
async def start_transfer_for_vllm(
|
|
1172
|
+
self, rep_model_uid: str, rank_addresses: List[str]
|
|
1173
|
+
):
|
|
1174
|
+
model_ref = self._model_uid_to_model[rep_model_uid]
|
|
1175
|
+
await model_ref.start_transfer_for_vllm(rank_addresses)
|
|
@@ -167,6 +167,24 @@
|
|
|
167
167
|
],
|
|
168
168
|
"gguf_model_file_name_template": "sd3.5_large_turbo-{quantization}.gguf"
|
|
169
169
|
},
|
|
170
|
+
{
|
|
171
|
+
"model_name": "HunyuanDiT-v1.2",
|
|
172
|
+
"model_family": "stable_diffusion",
|
|
173
|
+
"model_id": "Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers",
|
|
174
|
+
"model_revision": "5e96094e0ad19e7f475de8711f03634ca0ccc40c",
|
|
175
|
+
"model_ability": [
|
|
176
|
+
"text2image"
|
|
177
|
+
]
|
|
178
|
+
},
|
|
179
|
+
{
|
|
180
|
+
"model_name": "HunyuanDiT-v1.2-Distilled",
|
|
181
|
+
"model_family": "stable_diffusion",
|
|
182
|
+
"model_id": "Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers-Distilled",
|
|
183
|
+
"model_revision": "ba991d1546d8c50936c4c16398ed0a87b9b99fb1",
|
|
184
|
+
"model_ability": [
|
|
185
|
+
"text2image"
|
|
186
|
+
]
|
|
187
|
+
},
|
|
170
188
|
{
|
|
171
189
|
"model_name": "sd-turbo",
|
|
172
190
|
"model_family": "stable_diffusion",
|
|
@@ -173,6 +173,26 @@
|
|
|
173
173
|
],
|
|
174
174
|
"gguf_model_file_name_template": "sd3.5_large_turbo-{quantization}.gguf"
|
|
175
175
|
},
|
|
176
|
+
{
|
|
177
|
+
"model_name": "HunyuanDiT-v1.2",
|
|
178
|
+
"model_family": "stable_diffusion",
|
|
179
|
+
"model_hub": "modelscope",
|
|
180
|
+
"model_id": "Xorbits/HunyuanDiT-v1.2-Diffusers",
|
|
181
|
+
"model_revision": "master",
|
|
182
|
+
"model_ability": [
|
|
183
|
+
"text2image"
|
|
184
|
+
]
|
|
185
|
+
},
|
|
186
|
+
{
|
|
187
|
+
"model_name": "HunyuanDiT-v1.2-Distilled",
|
|
188
|
+
"model_family": "stable_diffusion",
|
|
189
|
+
"model_hub": "modelscope",
|
|
190
|
+
"model_id": "Xorbits/HunyuanDiT-v1.2-Diffusers-Distilled",
|
|
191
|
+
"model_revision": "master",
|
|
192
|
+
"model_ability": [
|
|
193
|
+
"text2image"
|
|
194
|
+
]
|
|
195
|
+
},
|
|
176
196
|
{
|
|
177
197
|
"model_name": "sd-turbo",
|
|
178
198
|
"model_family": "stable_diffusion",
|
xinference/model/llm/__init__.py
CHANGED
|
@@ -134,6 +134,7 @@ def _install():
|
|
|
134
134
|
from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
|
|
135
135
|
from .sglang.core import SGLANGChatModel, SGLANGModel
|
|
136
136
|
from .transformers.chatglm import ChatglmPytorchChatModel
|
|
137
|
+
from .transformers.cogagent import CogAgentChatModel
|
|
137
138
|
from .transformers.cogvlm2 import CogVLM2Model
|
|
138
139
|
from .transformers.cogvlm2_video import CogVLM2VideoModel
|
|
139
140
|
from .transformers.core import PytorchChatModel, PytorchModel
|
|
@@ -195,6 +196,7 @@ def _install():
|
|
|
195
196
|
DeepSeekV2PytorchChatModel,
|
|
196
197
|
OptPytorchModel,
|
|
197
198
|
GlmEdgeVModel,
|
|
199
|
+
CogAgentChatModel,
|
|
198
200
|
]
|
|
199
201
|
)
|
|
200
202
|
if OmniLMMModel: # type: ignore
|
|
@@ -8989,5 +8989,101 @@
|
|
|
8989
8989
|
"<|im_end|>",
|
|
8990
8990
|
"<|endoftext|>"
|
|
8991
8991
|
]
|
|
8992
|
+
},
|
|
8993
|
+
{
|
|
8994
|
+
"version": 1,
|
|
8995
|
+
"context_length": 32768,
|
|
8996
|
+
"model_name": "marco-o1",
|
|
8997
|
+
"model_lang": [
|
|
8998
|
+
"en",
|
|
8999
|
+
"zh"
|
|
9000
|
+
],
|
|
9001
|
+
"model_ability": [
|
|
9002
|
+
"chat",
|
|
9003
|
+
"tools"
|
|
9004
|
+
],
|
|
9005
|
+
"model_description": "Marco-o1: Towards Open Reasoning Models for Open-Ended Solutions",
|
|
9006
|
+
"model_specs": [
|
|
9007
|
+
{
|
|
9008
|
+
"model_format": "pytorch",
|
|
9009
|
+
"model_size_in_billions": 7,
|
|
9010
|
+
"quantizations": [
|
|
9011
|
+
"4-bit",
|
|
9012
|
+
"8-bit",
|
|
9013
|
+
"none"
|
|
9014
|
+
],
|
|
9015
|
+
"model_id": "AIDC-AI/Marco-o1"
|
|
9016
|
+
},
|
|
9017
|
+
{
|
|
9018
|
+
"model_format": "ggufv2",
|
|
9019
|
+
"model_size_in_billions": 7,
|
|
9020
|
+
"quantizations": [
|
|
9021
|
+
"Q2_K",
|
|
9022
|
+
"Q3_K_L",
|
|
9023
|
+
"Q3_K_M",
|
|
9024
|
+
"Q3_K_S",
|
|
9025
|
+
"Q4_0",
|
|
9026
|
+
"Q4_1",
|
|
9027
|
+
"Q4_K_M",
|
|
9028
|
+
"Q4_K_S",
|
|
9029
|
+
"Q5_0",
|
|
9030
|
+
"Q5_1",
|
|
9031
|
+
"Q5_K_M",
|
|
9032
|
+
"Q5_K_S",
|
|
9033
|
+
"Q6_K",
|
|
9034
|
+
"Q8_0"
|
|
9035
|
+
],
|
|
9036
|
+
"model_id": "QuantFactory/Marco-o1-GGUF",
|
|
9037
|
+
"model_file_name_template": "Marco-o1.{quantization}.gguf"
|
|
9038
|
+
}
|
|
9039
|
+
],
|
|
9040
|
+
"chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\n\n你是一个经过良好训练的AI助手,你的名字是Marco-o1.由阿里国际数字商业集团的AI Business创造.\n \n## 重要!!!!!\n当你回答问题时,你的思考应该在<Thought>内完成,<Output>内输出你的结果。\n<Thought>应该尽可能是英文,但是有2个特例,一个是对原文中的引用,另一个是是数学应该使用markdown格式,<Output>内的输出需要遵循用户输入的语言。\n <|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
|
|
9041
|
+
"stop_token_ids": [
|
|
9042
|
+
151643,
|
|
9043
|
+
151644,
|
|
9044
|
+
151645
|
|
9045
|
+
],
|
|
9046
|
+
"stop": [
|
|
9047
|
+
"<|endoftext|>",
|
|
9048
|
+
"<|im_start|>",
|
|
9049
|
+
"<|im_end|>"
|
|
9050
|
+
]
|
|
9051
|
+
},
|
|
9052
|
+
{
|
|
9053
|
+
"version": 1,
|
|
9054
|
+
"context_length": 4096,
|
|
9055
|
+
"model_name": "cogagent",
|
|
9056
|
+
"model_lang": [
|
|
9057
|
+
"en",
|
|
9058
|
+
"zh"
|
|
9059
|
+
],
|
|
9060
|
+
"model_ability": [
|
|
9061
|
+
"chat",
|
|
9062
|
+
"vision"
|
|
9063
|
+
],
|
|
9064
|
+
"model_description": "The CogAgent-9B-20241220 model is based on GLM-4V-9B, a bilingual open-source VLM base model. Through data collection and optimization, multi-stage training, and strategy improvements, CogAgent-9B-20241220 achieves significant advancements in GUI perception, inference prediction accuracy, action space completeness, and task generalizability. ",
|
|
9065
|
+
"model_specs": [
|
|
9066
|
+
{
|
|
9067
|
+
"model_format": "pytorch",
|
|
9068
|
+
"model_size_in_billions": "9",
|
|
9069
|
+
"quantizations": [
|
|
9070
|
+
"4-bit",
|
|
9071
|
+
"8-bit",
|
|
9072
|
+
"none"
|
|
9073
|
+
],
|
|
9074
|
+
"model_id": "THUDM/cogagent-9b-20241220"
|
|
9075
|
+
}
|
|
9076
|
+
],
|
|
9077
|
+
"chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
|
|
9078
|
+
"stop_token_ids": [
|
|
9079
|
+
151329,
|
|
9080
|
+
151336,
|
|
9081
|
+
151338
|
|
9082
|
+
],
|
|
9083
|
+
"stop": [
|
|
9084
|
+
"<|endoftext|>",
|
|
9085
|
+
"<|user|>",
|
|
9086
|
+
"<|observation|>"
|
|
9087
|
+
]
|
|
8992
9088
|
}
|
|
8993
9089
|
]
|
|
@@ -6722,5 +6722,104 @@
|
|
|
6722
6722
|
"<|im_end|>",
|
|
6723
6723
|
"<|endoftext|>"
|
|
6724
6724
|
]
|
|
6725
|
+
},
|
|
6726
|
+
{
|
|
6727
|
+
"version": 1,
|
|
6728
|
+
"context_length": 32768,
|
|
6729
|
+
"model_name": "marco-o1",
|
|
6730
|
+
"model_lang": [
|
|
6731
|
+
"en",
|
|
6732
|
+
"zh"
|
|
6733
|
+
],
|
|
6734
|
+
"model_ability": [
|
|
6735
|
+
"chat",
|
|
6736
|
+
"tools"
|
|
6737
|
+
],
|
|
6738
|
+
"model_description": "Marco-o1: Towards Open Reasoning Models for Open-Ended Solutions",
|
|
6739
|
+
"model_specs": [
|
|
6740
|
+
{
|
|
6741
|
+
"model_format": "pytorch",
|
|
6742
|
+
"model_size_in_billions": 7,
|
|
6743
|
+
"quantizations": [
|
|
6744
|
+
"4-bit",
|
|
6745
|
+
"8-bit",
|
|
6746
|
+
"none"
|
|
6747
|
+
],
|
|
6748
|
+
"model_id": "AIDC-AI/Marco-o1",
|
|
6749
|
+
"model_hub": "modelscope"
|
|
6750
|
+
},
|
|
6751
|
+
{
|
|
6752
|
+
"model_format": "ggufv2",
|
|
6753
|
+
"model_size_in_billions": 7,
|
|
6754
|
+
"quantizations": [
|
|
6755
|
+
"Q2_K",
|
|
6756
|
+
"Q3_K_L",
|
|
6757
|
+
"Q3_K_M",
|
|
6758
|
+
"Q3_K_S",
|
|
6759
|
+
"Q4_0",
|
|
6760
|
+
"Q4_1",
|
|
6761
|
+
"Q4_K_M",
|
|
6762
|
+
"Q4_K_S",
|
|
6763
|
+
"Q5_0",
|
|
6764
|
+
"Q5_1",
|
|
6765
|
+
"Q5_K_M",
|
|
6766
|
+
"Q5_K_S",
|
|
6767
|
+
"Q6_K",
|
|
6768
|
+
"Q8_0"
|
|
6769
|
+
],
|
|
6770
|
+
"model_file_name_template": "Marco-o1.{quantization}.gguf",
|
|
6771
|
+
"model_hub": "modelscope",
|
|
6772
|
+
"model_id": "QuantFactory/Marco-o1-GGUF"
|
|
6773
|
+
}
|
|
6774
|
+
],
|
|
6775
|
+
"chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\n\n你是一个经过良好训练的AI助手,你的名字是Marco-o1.由阿里国际数字商业集团的AI Business创造.\n \n## 重要!!!!!\n当你回答问题时,你的思考应该在<Thought>内完成,<Output>内输出你的结果。\n<Thought>应该尽可能是英文,但是有2个特例,一个是对原文中的引用,另一个是是数学应该使用markdown格式,<Output>内的输出需要遵循用户输入的语言。\n <|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
|
|
6776
|
+
"stop_token_ids": [
|
|
6777
|
+
151643,
|
|
6778
|
+
151644,
|
|
6779
|
+
151645
|
|
6780
|
+
],
|
|
6781
|
+
"stop": [
|
|
6782
|
+
"<|endoftext|>",
|
|
6783
|
+
"<|im_start|>",
|
|
6784
|
+
"<|im_end|>"
|
|
6785
|
+
]
|
|
6786
|
+
},
|
|
6787
|
+
{
|
|
6788
|
+
"version": 1,
|
|
6789
|
+
"context_length": 4096,
|
|
6790
|
+
"model_name": "cogagent",
|
|
6791
|
+
"model_lang": [
|
|
6792
|
+
"en",
|
|
6793
|
+
"zh"
|
|
6794
|
+
],
|
|
6795
|
+
"model_ability": [
|
|
6796
|
+
"chat",
|
|
6797
|
+
"vision"
|
|
6798
|
+
],
|
|
6799
|
+
"model_description": "The CogAgent-9B-20241220 model is based on GLM-4V-9B, a bilingual open-source VLM base model. Through data collection and optimization, multi-stage training, and strategy improvements, CogAgent-9B-20241220 achieves significant advancements in GUI perception, inference prediction accuracy, action space completeness, and task generalizability. ",
|
|
6800
|
+
"model_specs": [
|
|
6801
|
+
{
|
|
6802
|
+
"model_format": "pytorch",
|
|
6803
|
+
"model_size_in_billions": "9",
|
|
6804
|
+
"quantizations": [
|
|
6805
|
+
"4-bit",
|
|
6806
|
+
"8-bit",
|
|
6807
|
+
"none"
|
|
6808
|
+
],
|
|
6809
|
+
"model_id": "ZhipuAI/cogagent-9b-20241220",
|
|
6810
|
+
"model_hub": "modelscope"
|
|
6811
|
+
}
|
|
6812
|
+
],
|
|
6813
|
+
"chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
|
|
6814
|
+
"stop_token_ids": [
|
|
6815
|
+
151329,
|
|
6816
|
+
151336,
|
|
6817
|
+
151338
|
|
6818
|
+
],
|
|
6819
|
+
"stop": [
|
|
6820
|
+
"<|endoftext|>",
|
|
6821
|
+
"<|user|>",
|
|
6822
|
+
"<|observation|>"
|
|
6823
|
+
]
|
|
6725
6824
|
}
|
|
6726
6825
|
]
|
xinference/model/llm/mlx/core.py
CHANGED
|
@@ -477,39 +477,6 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
|
|
|
477
477
|
self._model, self._processor = self._load_model(**kwargs)
|
|
478
478
|
self._tokenizer = self._processor.tokenizer
|
|
479
479
|
|
|
480
|
-
def _generate_stream_inner_no_image(self, **kwargs):
|
|
481
|
-
import mlx.nn as nn
|
|
482
|
-
from mlx_lm.utils import make_sampler, stream_generate
|
|
483
|
-
|
|
484
|
-
# For mlx-lm, the model(inputs) will return logits,
|
|
485
|
-
# but the language model in mlx-vlm will return an object
|
|
486
|
-
# https://github.com/Blaizzy/mlx-vlm/blob/3f5e1620072440afb7496940f67ac1c7fc64056f/mlx_vlm/models/base.py#L260
|
|
487
|
-
# so we cannot pass the language model to stream_generate directly
|
|
488
|
-
# we wrap here to just let model(inputs) return logits to pass stream_generate
|
|
489
|
-
class ModelWrapper(nn.Module):
|
|
490
|
-
def __init__(self, model):
|
|
491
|
-
super().__init__()
|
|
492
|
-
self._model = model.language_model
|
|
493
|
-
|
|
494
|
-
@property
|
|
495
|
-
def layers(self):
|
|
496
|
-
return self._model.layers
|
|
497
|
-
|
|
498
|
-
def __call__(self, *args, **kwargs):
|
|
499
|
-
return self._model(*args, **kwargs).logits
|
|
500
|
-
|
|
501
|
-
sampler = make_sampler(
|
|
502
|
-
temp=kwargs.pop("temperature"), top_p=kwargs.pop("top_p")
|
|
503
|
-
)
|
|
504
|
-
prompt_token_ids = kwargs.pop("prompt_token_ids")
|
|
505
|
-
yield from stream_generate(
|
|
506
|
-
ModelWrapper(self._model),
|
|
507
|
-
self._tokenizer,
|
|
508
|
-
prompt_token_ids,
|
|
509
|
-
sampler=sampler,
|
|
510
|
-
**kwargs,
|
|
511
|
-
)
|
|
512
|
-
|
|
513
480
|
def _generate_stream_inner(self, **kwargs):
|
|
514
481
|
import mlx.core as mx
|
|
515
482
|
from mlx_lm.utils import GenerationResponse
|
|
@@ -517,27 +484,8 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
|
|
|
517
484
|
|
|
518
485
|
inputs = kwargs["prompt_token_ids"]
|
|
519
486
|
|
|
520
|
-
if not isinstance(inputs, tuple):
|
|
521
|
-
# no images
|
|
522
|
-
yield from self._generate_stream_inner_no_image(**kwargs)
|
|
523
|
-
return
|
|
524
|
-
|
|
525
487
|
max_tokens = kwargs.pop("max_tokens")
|
|
526
|
-
input_ids, pixel_values, mask = inputs
|
|
527
|
-
|
|
528
|
-
kwargs = {
|
|
529
|
-
k: v
|
|
530
|
-
for k, v in zip(
|
|
531
|
-
[
|
|
532
|
-
"image_grid_thw",
|
|
533
|
-
"image_sizes",
|
|
534
|
-
"aspect_ratio_ids",
|
|
535
|
-
"aspect_ratio_mask",
|
|
536
|
-
"cross_attention_mask",
|
|
537
|
-
],
|
|
538
|
-
inputs[3:],
|
|
539
|
-
)
|
|
540
|
-
}
|
|
488
|
+
input_ids, pixel_values, mask, kwargs = inputs
|
|
541
489
|
|
|
542
490
|
tokenizer = self._processor.tokenizer
|
|
543
491
|
detokenizer = self._processor.detokenizer
|
|
@@ -583,37 +531,39 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
|
|
|
583
531
|
def _prepare_inputs(
|
|
584
532
|
self, prompt: Union[str, Dict[str, Any]], kwargs
|
|
585
533
|
) -> Tuple[Any, int]:
|
|
534
|
+
import mlx.core as mx
|
|
586
535
|
from mlx_vlm import prepare_inputs
|
|
587
536
|
|
|
588
537
|
prompt_str = prompt.get("prompt") # type: ignore
|
|
589
538
|
images = prompt.get("multi_modal_data", {}).get("image") # type: ignore
|
|
590
539
|
if images and not isinstance(images, list):
|
|
591
540
|
images = [images]
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
541
|
+
resize_shape = kwargs.pop("resize_shape", None)
|
|
542
|
+
image_token_index = getattr(self._model.config, "image_token_index", None)
|
|
543
|
+
|
|
544
|
+
processor = self._processor
|
|
545
|
+
tokenizer = processor if hasattr(processor, "encode") else processor.tokenizer
|
|
546
|
+
prompt_tokens = mx.array(tokenizer.encode(prompt_str))
|
|
596
547
|
|
|
597
548
|
if not images:
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
kwargs.get("lora_name"),
|
|
603
|
-
model=self._model.language_model,
|
|
604
|
-
)
|
|
605
|
-
return prompt_token_ids, len(prompt_token_ids)
|
|
549
|
+
input_ids = prompt_tokens[None, :]
|
|
550
|
+
pixel_values = mask = None
|
|
551
|
+
kwargs = {}
|
|
552
|
+
input_token_len = input_ids.size
|
|
606
553
|
else:
|
|
607
554
|
inputs = prepare_inputs(
|
|
608
|
-
|
|
609
|
-
self._processor,
|
|
610
|
-
images,
|
|
611
|
-
prompt_str,
|
|
612
|
-
image_token_index,
|
|
613
|
-
kwargs.get("resize_shape"),
|
|
555
|
+
processor, images, prompt_str, image_token_index, resize_shape
|
|
614
556
|
)
|
|
615
|
-
input_ids = inputs[
|
|
616
|
-
|
|
557
|
+
input_ids = inputs["input_ids"]
|
|
558
|
+
pixel_values = inputs["pixel_values"]
|
|
559
|
+
mask = inputs["attention_mask"]
|
|
560
|
+
kwargs = {
|
|
561
|
+
k: v
|
|
562
|
+
for k, v in inputs.items()
|
|
563
|
+
if k not in ["input_ids", "pixel_values", "attention_mask"]
|
|
564
|
+
}
|
|
565
|
+
input_token_len = int(mask.sum())
|
|
566
|
+
return (input_ids, pixel_values, mask, kwargs), input_token_len
|
|
617
567
|
|
|
618
568
|
def chat(
|
|
619
569
|
self,
|