xinference 0.12.3__py3-none-any.whl → 0.13.0__py3-none-any.whl
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +6 -6
- xinference/client/restful/restful_client.py +0 -2
- xinference/core/model.py +21 -4
- xinference/core/scheduler.py +2 -0
- xinference/core/worker.py +74 -45
- xinference/deploy/utils.py +33 -2
- xinference/model/llm/__init__.py +5 -0
- xinference/model/llm/llm_family.json +240 -1
- xinference/model/llm/llm_family.py +32 -8
- xinference/model/llm/llm_family_modelscope.json +192 -0
- xinference/model/llm/mlx/__init__.py +13 -0
- xinference/model/llm/mlx/core.py +408 -0
- xinference/model/llm/pytorch/chatglm.py +2 -9
- xinference/model/llm/pytorch/cogvlm2.py +206 -21
- xinference/model/llm/pytorch/core.py +213 -40
- xinference/model/llm/pytorch/glm4v.py +171 -15
- xinference/model/llm/pytorch/qwen_vl.py +168 -7
- xinference/model/llm/pytorch/utils.py +53 -62
- xinference/model/llm/utils.py +24 -5
- xinference/model/rerank/core.py +5 -0
- xinference/thirdparty/deepseek_vl/serve/__init__.py +13 -0
- xinference/thirdparty/deepseek_vl/serve/app_deepseek.py +510 -0
- xinference/thirdparty/deepseek_vl/serve/app_modules/__init__.py +13 -0
- xinference/thirdparty/deepseek_vl/serve/app_modules/gradio_utils.py +94 -0
- xinference/thirdparty/deepseek_vl/serve/app_modules/overwrites.py +81 -0
- xinference/thirdparty/deepseek_vl/serve/app_modules/presets.py +96 -0
- xinference/thirdparty/deepseek_vl/serve/app_modules/utils.py +229 -0
- xinference/thirdparty/deepseek_vl/serve/inference.py +170 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.0fb6f3ab.js +3 -0
- xinference/web/ui/build/static/js/main.0fb6f3ab.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0f6b391abec76271137faad13a3793fe7acc1024e8cd2269c147b653ecd3a73b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2c63090c842376cdd368c3ded88a333ef40d94785747651343040a6f7872a223.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/30a0c79d8025d6441eb75b2df5bc2750a14f30119c869ef02570d294dff65c2f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/40486e655c3c5801f087e2cf206c0b5511aaa0dfdba78046b7181bf9c17e54c5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b5507cd57f16a3a230aa0128e39fe103e928de139ea29e2679e4c64dcbba3b3a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d779b915f83f9c7b5a72515b6932fdd114f1822cef90ae01cc0d12bca59abc2d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d87824cb266194447a9c0c69ebab2d507bfc3e3148976173760d18c035e9dd26.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +1 -0
- {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/METADATA +4 -1
- {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/RECORD +55 -44
- xinference/web/ui/build/static/js/main.77dd47c3.js +0 -3
- xinference/web/ui/build/static/js/main.77dd47c3.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0cd591866aa345566e0b63fb51ff2043e163a770af6fdc2f3bad395d046353e2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/37c1476717199863bbba1530e3513a9368f8f73001b75b4a85c2075956308027.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3da7d55e87882a4af923e187b1351160e34ca102f589086439c15131a227fb6e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/46edc1fe657dfedb2e673148332bb442c6eb98f09f2592c389209e376510afa5.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/72bcecc71c5267250edeb89608859d449b586f13ff9923a5e70e7172976ec403.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/82db357f3fd5b32215d747ee593f69ff06c95ad6cde37f71a96c8290aaab64c0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/935efd2867664c58230378fdf2ff1ea85e58d853b7214014e20dfbca8dab7b05.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bc6da27195ec4607bb472bf61f97c928ad4966fa64e4c2247661bedb7400abba.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c2abe75f04ad82fba68f35ed9cbe2e287762c876684fddccccfa73f739489b65.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f118f99c22b713c678c1209c4e1dd43fe86e3f6e801a4c0c35d3bbf41fd05fe6.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +0 -1
- /xinference/web/ui/build/static/js/{main.77dd47c3.js.LICENSE.txt → main.0fb6f3ab.js.LICENSE.txt} +0 -0
- {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/LICENSE +0 -0
- {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/WHEEL +0 -0
- {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED

@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-…
+ "date": "2024-07-05T18:19:09+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "…
- "version": "0.12.3"
+ "full-revisionid": "007408c55272bc343821dd152df780de5dc9c037",
+ "version": "0.13.0"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py
CHANGED

@@ -1477,14 +1477,14 @@ class RESTfulAPI:
             await self._report_error_event(model_uid, str(e))
             raise HTTPException(status_code=500, detail=str(e))
 
-        from ..model.llm.utils import QWEN_TOOL_CALL_FAMILY
+        from ..model.llm.utils import GLM4_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY
 
         model_family = desc.get("model_family", "")
-        function_call_models = …
-            "chatglm3",
-            …
+        function_call_models = (
+            ["chatglm3", "gorilla-openfunctions-v1"]
+            + QWEN_TOOL_CALL_FAMILY
+            + GLM4_TOOL_CALL_FAMILY
+        )
 
         is_qwen = desc.get("model_format") == "ggmlv3" and "qwen-chat" == model_family
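The practical effect of this hunk is that GLM-4 models join chatglm3, gorilla-openfunctions-v1, and the Qwen families as tool-call capable. A minimal sketch of the resulting gate, assuming illustrative contents for the two family lists (the real values live in xinference/model/llm/utils.py):

    # Sketch of the tool-call gate; the family lists below are assumed
    # stand-ins, not the actual constants from xinference.
    QWEN_TOOL_CALL_FAMILY = ["qwen-chat", "qwen1.5-chat"]   # assumption
    GLM4_TOOL_CALL_FAMILY = ["glm4-chat", "glm4-chat-1m"]   # assumption

    function_call_models = (
        ["chatglm3", "gorilla-openfunctions-v1"]
        + QWEN_TOOL_CALL_FAMILY
        + GLM4_TOOL_CALL_FAMILY
    )

    def supports_tool_calls(desc: dict) -> bool:
        # Only these model families may receive a `tools` payload.
        return desc.get("model_family", "") in function_call_models

    assert supports_tool_calls({"model_family": "glm4-chat"})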
xinference/client/restful/restful_client.py
CHANGED

@@ -182,8 +182,6 @@ class RESTfulRerankModelHandle(RESTfulModelHandle):
                 f"Failed to rerank documents, detail: {response.json()['detail']}"
             )
         response_data = response.json()
-        for r in response_data["results"]:
-            r["document"] = documents[r["index"]]
         return response_data
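The client no longer copies the original documents into each rerank result, so the response comes back exactly as the server sent it. Callers that relied on `r["document"]` can rebuild it from `index`; a sketch against an assumed example payload:

    # Re-attach documents caller-side, mirroring the removed loop.
    documents = ["doc a", "doc b", "doc c"]
    response_data = {
        "results": [
            {"index": 2, "relevance_score": 0.91},
            {"index": 0, "relevance_score": 0.35},
        ]
    }  # assumed example payload shape

    for r in response_data["results"]:
        r["document"] = documents[r["index"]]

    print([r["document"] for r in response_data["results"]])  # ['doc c', 'doc a']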
xinference/core/model.py
CHANGED

@@ -65,6 +65,9 @@ except ImportError:
     OutOfMemoryError = _OutOfMemoryError
 
 
+XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = ["qwen-vl-chat", "cogvlm2", "glm-4v"]
+
+
 def request_limit(fn):
     """
     Used by ModelActor.

@@ -268,11 +271,25 @@ class ModelActor(xo.StatelessActor):
 
         model_ability = self._model_description.get("model_ability", [])
 
-        …
-            and isinstance(self._model, PytorchModel)
-            and "vision" not in model_ability
+        condition = XINFERENCE_TRANSFORMERS_ENABLE_BATCHING and isinstance(
+            self._model, PytorchModel
         )
+        if condition and "vision" in model_ability:
+            if (
+                self._model.model_family.model_name
+                in XINFERENCE_BATCHING_ALLOWED_VISION_MODELS
+                or self._model.model_family.model_family
+                in XINFERENCE_BATCHING_ALLOWED_VISION_MODELS
+            ):
+                return True
+            else:
+                logger.warning(
+                    f"Currently for multimodal models, "
+                    f"xinference only supports {', '.join(XINFERENCE_BATCHING_ALLOWED_VISION_MODELS)} for batching. "
+                    f"Your model {self._model.model_family.model_name} with model family {self._model.model_family.model_family} is disqualified."
+                )
+                return False
+        return condition
 
     async def load(self):
         self._model.load()
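Continuous batching for Transformers models now extends to an allowlist of vision models; any other model with the "vision" ability falls back to the non-batched path with a warning. A standalone sketch of the decision, with plain values in place of the actor's state:

    XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = ["qwen-vl-chat", "cogvlm2", "glm-4v"]

    def allow_batching(batching_enabled: bool, is_pytorch: bool,
                       model_ability: list, model_name: str, model_family: str) -> bool:
        # Mirrors the hunk above, with actor attributes replaced by parameters.
        condition = batching_enabled and is_pytorch
        if condition and "vision" in model_ability:
            return (model_name in XINFERENCE_BATCHING_ALLOWED_VISION_MODELS
                    or model_family in XINFERENCE_BATCHING_ALLOWED_VISION_MODELS)
        return condition

    assert allow_batching(True, True, ["chat", "vision"], "glm-4v", "glm-4v")
    assert not allow_batching(True, True, ["chat", "vision"], "some-vlm", "some-vlm")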
xinference/core/scheduler.py
CHANGED

@@ -82,6 +82,8 @@ class InferenceRequest:
         # Record error message when this request has error.
         # Must set stopped=True when this field is set.
         self.error_msg: Optional[str] = None
+        # For compatibility. Record some extra parameters for some special cases.
+        self.extra_kwargs = {}
 
         # check the integrity of args passed upstream
         self._check_args()
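`extra_kwargs` gives a batched `InferenceRequest` a place to carry parameters that do not fit the sanitized generate config, for instance raw multimodal inputs a particular model class needs to keep across decode steps. A tiny illustration, with a hypothetical key name:

    class InferenceRequest:                 # reduced to the relevant fields
        def __init__(self):
            self.error_msg = None           # set together with stopped=True
            self.extra_kwargs = {}          # compatibility escape hatch

    req = InferenceRequest()
    req.extra_kwargs["raw_params"] = {"image": "<bytes>"}   # hypothetical key
    print(req.extra_kwargs)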
xinference/core/worker.py
CHANGED

@@ -73,6 +73,9 @@ class WorkerActor(xo.StatelessActor):
         self._main_pool.recover_sub_pool = self.recover_sub_pool
 
         # internal states.
+        # temporary placeholder during model launch process:
+        self._model_uid_launching_guard: Dict[str, bool] = {}
+        # attributes maintained after model launched:
         self._model_uid_to_model: Dict[str, xo.ActorRefType["ModelActor"]] = {}
         self._model_uid_to_model_spec: Dict[str, ModelDescription] = {}
         self._gpu_to_model_uid: Dict[int, str] = {}

@@ -594,10 +597,14 @@ class WorkerActor(xo.StatelessActor):
         launch_args.pop("kwargs")
         launch_args.update(kwargs)
 
-        …
+        try:
+            origin_uid, _, _ = parse_replica_model_uid(model_uid)
+        except Exception as e:
+            logger.exception(e)
+            raise
         try:
             await self._event_collector_ref.report_event(
-                …
+                origin_uid,
                 Event(
                     event_type=EventType.INFO,
                     event_ts=int(time.time()),

@@ -640,50 +647,55 @@ class WorkerActor(xo.StatelessActor):
         assert model_uid not in self._model_uid_to_model
         self._check_model_is_valid(model_name, model_format)
 
-        …
-            model_uid
-        )
+        if self.get_model_launch_status(model_uid) is not None:
+            raise ValueError(f"{model_uid} is running")
 
         try:
-            …
-            subpool_address,
-            devices,
-            model_uid,
-            model_type,
-            model_name,
-            model_engine,
-            model_format,
-            model_size_in_billions,
-            quantization,
-            peft_model_config,
-            **kwargs,
-            )
-            await self.update_cache_status(model_name, model_description)
-            model_ref = await xo.create_actor(
-                ModelActor,
-                address=subpool_address,
-                uid=model_uid,
-                worker_address=self.address,
-                model=model,
-                model_description=model_description,
-                request_limits=request_limits,
+            self._model_uid_launching_guard[model_uid] = True
+            subpool_address, devices = await self._create_subpool(
+                model_uid, model_type, n_gpu=n_gpu, gpu_idx=gpu_idx
             )
-            await model_ref.load()
-        except:
-            logger.error(f"Failed to load model {model_uid}", exc_info=True)
-            self.release_devices(model_uid=model_uid)
-            await self._main_pool.remove_sub_pool(subpool_address)
-            raise
 
-        …
+            try:
+                model, model_description = await asyncio.to_thread(
+                    create_model_instance,
+                    subpool_address,
+                    devices,
+                    model_uid,
+                    model_type,
+                    model_name,
+                    model_engine,
+                    model_format,
+                    model_size_in_billions,
+                    quantization,
+                    peft_model_config,
+                    **kwargs,
+                )
+                await self.update_cache_status(model_name, model_description)
+                model_ref = await xo.create_actor(
+                    ModelActor,
+                    address=subpool_address,
+                    uid=model_uid,
+                    worker_address=self.address,
+                    model=model,
+                    model_description=model_description,
+                    request_limits=request_limits,
+                )
+                await model_ref.load()
+            except:
+                logger.error(f"Failed to load model {model_uid}", exc_info=True)
+                self.release_devices(model_uid=model_uid)
+                await self._main_pool.remove_sub_pool(subpool_address)
+                raise
+            self._model_uid_to_model[model_uid] = model_ref
+            self._model_uid_to_model_spec[model_uid] = model_description
+            self._model_uid_to_addr[model_uid] = subpool_address
+            self._model_uid_to_recover_count.setdefault(
+                model_uid, MODEL_ACTOR_AUTO_RECOVER_LIMIT
+            )
+            self._model_uid_to_launch_args[model_uid] = launch_args
+        finally:
+            del self._model_uid_launching_guard[model_uid]
 
         # update status to READY
         abilities = await self._get_model_ability(model, model_type)

@@ -694,10 +706,13 @@ class WorkerActor(xo.StatelessActor):
 
     @log_async(logger=logger)
     async def terminate_model(self, model_uid: str):
-        …
+        # Terminate model while its launching is not allow
+        if model_uid in self._model_uid_launching_guard:
+            raise ValueError(f"{model_uid} is launching")
+        origin_uid, _, __ = parse_replica_model_uid(model_uid)
         try:
             await self._event_collector_ref.report_event(
-                …
+                origin_uid,
                 Event(
                     event_type=EventType.INFO,
                     event_ts=int(time.time()),

@@ -708,7 +723,6 @@ class WorkerActor(xo.StatelessActor):
             # Report callback error can be log and ignore, should not interrupt the Process
             logger.error("report_event error: %s" % (e))
 
-        origin_uid, _, _ = parse_replica_model_uid(model_uid)
         await self._status_guard_ref.update_instance_info(
             origin_uid, {"status": LaunchStatus.TERMINATING.name}
         )

@@ -740,6 +754,21 @@ class WorkerActor(xo.StatelessActor):
             origin_uid, {"status": LaunchStatus.TERMINATED.name}
         )
 
+    # Provide an interface for future version of supervisor to call
+    def get_model_launch_status(self, model_uid: str) -> Optional[str]:
+        """
+        returns:
+            CREATING: model is launching
+            RREADY: model is running
+            None: model is not running (launch error might have happened)
+        """
+
+        if model_uid in self._model_uid_launching_guard:
+            return LaunchStatus.CREATING.name
+        if model_uid in self._model_uid_to_model:
+            return LaunchStatus.READY.name
+        return None
+
     @log_async(logger=logger)
     async def list_models(self) -> Dict[str, Dict[str, Any]]:
         ret = {}
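The recurring theme of this file's changes is a launching guard that closes a race window: a model that was still mid-launch used to be invisible both to duplicate-launch checks and to `terminate_model`. The placeholder is held for the whole launch and always cleared in `finally`, and `get_model_launch_status` exposes CREATING/READY/None on top of it. A condensed, synchronous sketch of the pattern (the real code is async and also tracks subpools, devices, and launch args):

    # Condensed sketch of WorkerActor's launch-guard pattern.
    class Worker:
        def __init__(self):
            self._launching = {}          # _model_uid_launching_guard
            self._running = {}            # _model_uid_to_model

        def launch_status(self, uid):
            if uid in self._launching:
                return "CREATING"
            if uid in self._running:
                return "READY"
            return None                   # not running, or launch failed

        def launch(self, uid, load_fn):
            if self.launch_status(uid) is not None:
                raise ValueError(f"{uid} is running")
            self._launching[uid] = True
            try:
                self._running[uid] = load_fn()   # may raise
            finally:
                del self._launching[uid]         # guard cleared either way

        def terminate(self, uid):
            if uid in self._launching:
                raise ValueError(f"{uid} is launching")
            self._running.pop(uid, None)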
xinference/deploy/utils.py
CHANGED

@@ -79,6 +79,12 @@ def get_config_dict(
             "stream": "ext://sys.stderr",
             "filters": ["logger_name_filter"],
         },
+        "console_handler": {
+            "class": "logging.StreamHandler",
+            "formatter": "formatter",
+            "level": log_level,
+            "stream": "ext://sys.stderr",
+        },
         "file_handler": {
             "class": "logging.handlers.RotatingFileHandler",
             "formatter": "formatter",

@@ -95,7 +101,32 @@ def get_config_dict(
             "handlers": ["stream_handler", "file_handler"],
             "level": log_level,
             "propagate": False,
-        }
+        },
+        "uvicorn": {
+            "handlers": ["stream_handler", "file_handler"],
+            "level": log_level,
+            "propagate": False,
+        },
+        "uvicorn.error": {
+            "handlers": ["stream_handler", "file_handler"],
+            "level": log_level,
+            "propagate": False,
+        },
+        "uvicorn.access": {
+            "handlers": ["stream_handler", "file_handler"],
+            "level": log_level,
+            "propagate": False,
+        },
+        "transformers": {
+            "handlers": ["console_handler", "file_handler"],
+            "level": log_level,
+            "propagate": False,
+        },
+        "vllm": {
+            "handlers": ["console_handler", "file_handler"],
+            "level": log_level,
+            "propagate": False,
+        },
     },
     "root": {
         "level": "WARN",

@@ -127,7 +158,7 @@ def health_check(address: str, max_attempts: int, sleep_interval: int = 3) -> bool:
     while attempts < max_attempts:
         time.sleep(sleep_interval)
         try:
-            from …
+            from ..core.supervisor import SupervisorActor
 
             supervisor_ref: xo.ActorRefType[SupervisorActor] = await xo.actor_ref(  # type: ignore
                 address=address, uid=SupervisorActor.uid()
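The new logger entries claim `uvicorn*`, `transformers`, and `vllm` with `propagate: False`, so third-party output flows through xinference's own handlers (and into the rotating log file) instead of whatever those libraries configure. A minimal, self-contained analogue of how such a dict is consumed; the real dict built by `get_config_dict` also defines filters, a `RotatingFileHandler`, and more loggers:

    import logging
    import logging.config

    # Reduced analogue of the config dict; assumptions noted above.
    LOG_CONFIG = {
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": {
            "formatter": {"format": "%(asctime)s %(name)s %(levelname)s %(message)s"}
        },
        "handlers": {
            "console_handler": {
                "class": "logging.StreamHandler",
                "formatter": "formatter",
                "level": "INFO",
                "stream": "ext://sys.stderr",
            }
        },
        "loggers": {
            # Claimed loggers stop propagating to the libraries' defaults.
            "uvicorn": {"handlers": ["console_handler"], "level": "INFO", "propagate": False},
            "transformers": {"handlers": ["console_handler"], "level": "INFO", "propagate": False},
            "vllm": {"handlers": ["console_handler"], "level": "INFO", "propagate": False},
        },
    }

    logging.config.dictConfig(LOG_CONFIG)
    logging.getLogger("uvicorn").info("routed through the configured handler")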
xinference/model/llm/__init__.py
CHANGED

@@ -34,6 +34,7 @@ from .llm_family import (
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
     LLAMA_CLASSES,
     LLM_ENGINES,
+    MLX_CLASSES,
     SGLANG_CLASSES,
     SUPPORTED_ENGINES,
     TRANSFORMERS_CLASSES,

@@ -42,6 +43,7 @@ from .llm_family import (
     GgmlLLMSpecV1,
     LLMFamilyV1,
     LLMSpecV1,
+    MLXLLMSpecV1,
     PromptStyleV1,
     PytorchLLMSpecV1,
     get_cache_status,

@@ -112,6 +114,7 @@ def generate_engine_config_by_model_family(model_family):
 def _install():
     from .ggml.chatglm import ChatglmCppChatModel
     from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
+    from .mlx.core import MLXChatModel, MLXModel
     from .pytorch.baichuan import BaichuanPytorchChatModel
     from .pytorch.chatglm import ChatglmPytorchChatModel
     from .pytorch.cogvlm2 import CogVLM2Model

@@ -147,6 +150,7 @@ def _install():
     )
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel])
+    MLX_CLASSES.extend([MLXModel, MLXChatModel])
     TRANSFORMERS_CLASSES.extend(
         [
             BaichuanPytorchChatModel,

@@ -176,6 +180,7 @@ def _install():
     SUPPORTED_ENGINES["SGLang"] = SGLANG_CLASSES
     SUPPORTED_ENGINES["Transformers"] = TRANSFORMERS_CLASSES
     SUPPORTED_ENGINES["llama.cpp"] = LLAMA_CLASSES
+    SUPPORTED_ENGINES["MLX"] = MLX_CLASSES
 
     json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
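The new MLX engine is wired in exactly like the existing ones: a class list registered under an engine name. A sketch of the registry shape after `_install()` runs, with placeholder classes standing in for the real implementations in xinference/model/llm/mlx/core.py:

    # Placeholder classes; the real ones implement generate/chat on MLX.
    class MLXModel: ...
    class MLXChatModel: ...

    MLX_CLASSES: list = []
    SUPPORTED_ENGINES: dict = {}

    MLX_CLASSES.extend([MLXModel, MLXChatModel])
    SUPPORTED_ENGINES["MLX"] = MLX_CLASSES

    # A launch request with model_engine="MLX" can now resolve its classes:
    print([cls.__name__ for cls in SUPPORTED_ENGINES["MLX"]])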
xinference/model/llm/llm_family.json
CHANGED

@@ -944,7 +944,7 @@
           "none"
         ],
         "model_id": "THUDM/glm-4v-9b",
-        "model_revision": "…
+        "model_revision": "6c2e4732db8443f64a48d5af04b74425a7d169c4"
       }
     ],
     "prompt_style": {

@@ -2549,6 +2549,38 @@
         ],
         "model_id": "Qwen/Qwen2-72B-Instruct-AWQ"
       },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "Qwen/Qwen2-0.5B-Instruct-MLX"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct-MLX"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct-MLX"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "mlx-community/Qwen2-72B-Instruct-4bit"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": "0_5",

@@ -2565,6 +2597,82 @@
         ],
         "model_id": "Qwen/Qwen2-0.5B-Instruct-GGUF",
         "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-1_5b-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-7b-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-72B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-72b-instruct-{quantization}.gguf",
+        "model_file_name_split_template": "qwen2-72b-instruct-{quantization}-{part}.gguf",
+        "quantization_parts": {
+          "q5_0": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "q5_k_m": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "q6_k": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "q8_0": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "fp16": [
+            "00001-of-00004",
+            "00002-of-00004",
+            "00003-of-00004",
+            "00004-of-00004"
+          ]
+        }
       }
     ],
     "prompt_style": {
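`model_file_name_split_template` together with `quantization_parts` describes GGUF weights shipped as multiple shards. A sketch of how the template plausibly expands into concrete shard names for one quantization; the helper is illustrative, not xinference's actual downloader:

    # Values copied from the Qwen2-72B-Instruct-GGUF spec above.
    split_template = "qwen2-72b-instruct-{quantization}-{part}.gguf"
    quantization_parts = {
        "q5_0": ["00001-of-00002", "00002-of-00002"],
        "fp16": ["00001-of-00004", "00002-of-00004",
                 "00003-of-00004", "00004-of-00004"],
    }

    def shard_names(quantization: str) -> list:
        # Hypothetical helper: quantizations without listed parts use
        # model_file_name_template as a single file instead.
        parts = quantization_parts.get(quantization)
        if parts is None:
            return [f"qwen2-72b-instruct-{quantization}.gguf"]
        return [split_template.format(quantization=quantization, part=p) for p in parts]

    print(shard_names("q5_0"))
    # ['qwen2-72b-instruct-q5_0-00001-of-00002.gguf',
    #  'qwen2-72b-instruct-q5_0-00002-of-00002.gguf']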
@@ -2618,6 +2726,34 @@
         "Int4"
         ],
         "model_id": "Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-57B-A14B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-57b-a14b-instruct-{quantization}.gguf",
+        "model_file_name_split_template": "qwen2-57b-a14b-instruct-{quantization}-{part}.gguf",
+        "quantization_parts": {
+          "q8_0": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "fp16": [
+            "00001-of-00003",
+            "00002-of-00003",
+            "00003-of-00003"
+          ]
+        }
       }
     ],
     "prompt_style": {

@@ -5809,6 +5945,16 @@
       "roles": [
         "user",
         "assistant"
+      ],
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
       ]
     }
   },
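Declaring both `stop_token_ids` and textual `stop` strings lets engines that operate on token ids and engines that scan decoded text both cut generation at Qwen2's special tokens. A text-side sketch of the latter:

    # Truncate a completion at the earliest stop string, if any appears.
    STOP = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]

    def truncate_at_stop(text: str) -> str:
        cut = len(text)
        for s in STOP:
            i = text.find(s)
            if i != -1:
                cut = min(cut, i)
        return text[:cut]

    print(truncate_at_stop("Hello!<|im_end|>\n<|im_start|>user"))  # 'Hello!'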
@@ -5997,6 +6143,99 @@
|
|
|
5997
6143
|
]
|
|
5998
6144
|
}
|
|
5999
6145
|
},
|
|
6146
|
+
{
|
|
6147
|
+
"version": 1,
|
|
6148
|
+
"context_length": 8192,
|
|
6149
|
+
"model_name": "gemma-2-it",
|
|
6150
|
+
"model_lang": [
|
|
6151
|
+
"en"
|
|
6152
|
+
],
|
|
6153
|
+
"model_ability": [
|
|
6154
|
+
"chat"
|
|
6155
|
+
],
|
|
6156
|
+
"model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
|
|
6157
|
+
"model_specs": [
|
|
6158
|
+
{
|
|
6159
|
+
"model_format": "pytorch",
|
|
6160
|
+
"model_size_in_billions": 9,
|
|
6161
|
+
"quantizations": [
|
|
6162
|
+
"none",
|
|
6163
|
+
"4-bit",
|
|
6164
|
+
"8-bit"
|
|
6165
|
+
],
|
|
6166
|
+
"model_id": "google/gemma-2-9b-it"
|
|
6167
|
+
},
|
|
6168
|
+
{
|
|
6169
|
+
"model_format": "pytorch",
|
|
6170
|
+
"model_size_in_billions": 27,
|
|
6171
|
+
"quantizations": [
|
|
6172
|
+
"none",
|
|
6173
|
+
"4-bit",
|
|
6174
|
+
"8-bit"
|
|
6175
|
+
],
|
|
6176
|
+
"model_id": "google/gemma-2-27b-it"
|
|
6177
|
+
},
|
|
6178
|
+
{
|
|
6179
|
+
"model_format": "mlx",
|
|
6180
|
+
"model_size_in_billions": 9,
|
|
6181
|
+
"quantizations": [
|
|
6182
|
+
"4-bit"
|
|
6183
|
+
],
|
|
6184
|
+
"model_id": "mlx-community/gemma-2-9b-it-4bit"
|
|
6185
|
+
},
|
|
6186
|
+
{
|
|
6187
|
+
"model_format": "mlx",
|
|
6188
|
+
"model_size_in_billions": 9,
|
|
6189
|
+
"quantizations": [
|
|
6190
|
+
"8-bit"
|
|
6191
|
+
],
|
|
6192
|
+
"model_id": "mlx-community/gemma-2-9b-it-8bit"
|
|
6193
|
+
},
|
|
6194
|
+
{
|
|
6195
|
+
"model_format": "mlx",
|
|
6196
|
+
"model_size_in_billions": 9,
|
|
6197
|
+
"quantizations": [
|
|
6198
|
+
"None"
|
|
6199
|
+
],
|
|
6200
|
+
"model_id": "mlx-community/gemma-2-9b-it-fp16"
|
|
6201
|
+
},
|
|
6202
|
+
{
|
|
6203
|
+
"model_format": "mlx",
|
|
6204
|
+
"model_size_in_billions": 27,
|
|
6205
|
+
"quantizations": [
|
|
6206
|
+
"4-bit"
|
|
6207
|
+
],
|
|
6208
|
+
"model_id": "mlx-community/gemma-2-27b-it-4bit"
|
|
6209
|
+
},
|
|
6210
|
+
{
|
|
6211
|
+
"model_format": "mlx",
|
|
6212
|
+
"model_size_in_billions": 27,
|
|
6213
|
+
"quantizations": [
|
|
6214
|
+
"8-bit"
|
|
6215
|
+
],
|
|
6216
|
+
"model_id": "mlx-community/gemma-2-27b-it-8bit"
|
|
6217
|
+
},
|
|
6218
|
+
{
|
|
6219
|
+
"model_format": "mlx",
|
|
6220
|
+
"model_size_in_billions": 27,
|
|
6221
|
+
"quantizations": [
|
|
6222
|
+
"None"
|
|
6223
|
+
],
|
|
6224
|
+
"model_id": "mlx-community/gemma-2-27b-it-fp16"
|
|
6225
|
+
}
|
|
6226
|
+
],
|
|
6227
|
+
"prompt_style": {
|
|
6228
|
+
"style_name": "gemma",
|
|
6229
|
+
"roles": [
|
|
6230
|
+
"user",
|
|
6231
|
+
"model"
|
|
6232
|
+
],
|
|
6233
|
+
"stop": [
|
|
6234
|
+
"<end_of_turn>",
|
|
6235
|
+
"<start_of_turn>"
|
|
6236
|
+
]
|
|
6237
|
+
}
|
|
6238
|
+
},
|
|
6000
6239
|
{
|
|
6001
6240
|
"version": 1,
|
|
6002
6241
|
"context_length": 4096,
|