xinference 0.12.3__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference has been flagged as possibly problematic.

Files changed (71)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +6 -6
  3. xinference/client/restful/restful_client.py +0 -2
  4. xinference/core/model.py +21 -4
  5. xinference/core/scheduler.py +2 -0
  6. xinference/core/worker.py +74 -45
  7. xinference/deploy/utils.py +33 -2
  8. xinference/model/llm/__init__.py +5 -0
  9. xinference/model/llm/llm_family.json +240 -1
  10. xinference/model/llm/llm_family.py +32 -8
  11. xinference/model/llm/llm_family_modelscope.json +192 -0
  12. xinference/model/llm/mlx/__init__.py +13 -0
  13. xinference/model/llm/mlx/core.py +408 -0
  14. xinference/model/llm/pytorch/chatglm.py +2 -9
  15. xinference/model/llm/pytorch/cogvlm2.py +206 -21
  16. xinference/model/llm/pytorch/core.py +213 -40
  17. xinference/model/llm/pytorch/glm4v.py +171 -15
  18. xinference/model/llm/pytorch/qwen_vl.py +168 -7
  19. xinference/model/llm/pytorch/utils.py +53 -62
  20. xinference/model/llm/utils.py +24 -5
  21. xinference/model/rerank/core.py +5 -0
  22. xinference/thirdparty/deepseek_vl/serve/__init__.py +13 -0
  23. xinference/thirdparty/deepseek_vl/serve/app_deepseek.py +510 -0
  24. xinference/thirdparty/deepseek_vl/serve/app_modules/__init__.py +13 -0
  25. xinference/thirdparty/deepseek_vl/serve/app_modules/gradio_utils.py +94 -0
  26. xinference/thirdparty/deepseek_vl/serve/app_modules/overwrites.py +81 -0
  27. xinference/thirdparty/deepseek_vl/serve/app_modules/presets.py +96 -0
  28. xinference/thirdparty/deepseek_vl/serve/app_modules/utils.py +229 -0
  29. xinference/thirdparty/deepseek_vl/serve/inference.py +170 -0
  30. xinference/web/ui/build/asset-manifest.json +3 -3
  31. xinference/web/ui/build/index.html +1 -1
  32. xinference/web/ui/build/static/js/main.0fb6f3ab.js +3 -0
  33. xinference/web/ui/build/static/js/main.0fb6f3ab.js.map +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/0f6b391abec76271137faad13a3793fe7acc1024e8cd2269c147b653ecd3a73b.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/2c63090c842376cdd368c3ded88a333ef40d94785747651343040a6f7872a223.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/30a0c79d8025d6441eb75b2df5bc2750a14f30119c869ef02570d294dff65c2f.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/40486e655c3c5801f087e2cf206c0b5511aaa0dfdba78046b7181bf9c17e54c5.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/b5507cd57f16a3a230aa0128e39fe103e928de139ea29e2679e4c64dcbba3b3a.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/d779b915f83f9c7b5a72515b6932fdd114f1822cef90ae01cc0d12bca59abc2d.json +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/d87824cb266194447a9c0c69ebab2d507bfc3e3148976173760d18c035e9dd26.json +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +1 -0
  49. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/METADATA +4 -1
  50. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/RECORD +55 -44
  51. xinference/web/ui/build/static/js/main.77dd47c3.js +0 -3
  52. xinference/web/ui/build/static/js/main.77dd47c3.js.map +0 -1
  53. xinference/web/ui/node_modules/.cache/babel-loader/0cd591866aa345566e0b63fb51ff2043e163a770af6fdc2f3bad395d046353e2.json +0 -1
  54. xinference/web/ui/node_modules/.cache/babel-loader/37c1476717199863bbba1530e3513a9368f8f73001b75b4a85c2075956308027.json +0 -1
  55. xinference/web/ui/node_modules/.cache/babel-loader/3da7d55e87882a4af923e187b1351160e34ca102f589086439c15131a227fb6e.json +0 -1
  56. xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +0 -1
  57. xinference/web/ui/node_modules/.cache/babel-loader/46edc1fe657dfedb2e673148332bb442c6eb98f09f2592c389209e376510afa5.json +0 -1
  58. xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/72bcecc71c5267250edeb89608859d449b586f13ff9923a5e70e7172976ec403.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/82db357f3fd5b32215d747ee593f69ff06c95ad6cde37f71a96c8290aaab64c0.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/935efd2867664c58230378fdf2ff1ea85e58d853b7214014e20dfbca8dab7b05.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/bc6da27195ec4607bb472bf61f97c928ad4966fa64e4c2247661bedb7400abba.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/c2abe75f04ad82fba68f35ed9cbe2e287762c876684fddccccfa73f739489b65.json +0 -1
  64. xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/f118f99c22b713c678c1209c4e1dd43fe86e3f6e801a4c0c35d3bbf41fd05fe6.json +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +0 -1
  67. /xinference/web/ui/build/static/js/{main.77dd47c3.js.LICENSE.txt → main.0fb6f3ab.js.LICENSE.txt} +0 -0
  68. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/LICENSE +0 -0
  69. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/WHEEL +0 -0
  70. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/entry_points.txt +0 -0
  71. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json

 version_json = '''
 {
- "date": "2024-06-28T15:25:07+0800",
+ "date": "2024-07-05T18:19:09+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "3d9c261a7d5c4941091d1711cb732ce17b34e7f1",
- "version": "0.12.3"
+ "full-revisionid": "007408c55272bc343821dd152df780de5dc9c037",
+ "version": "0.13.0"
 }
 ''' # END VERSION_JSON

xinference/api/restful_api.py CHANGED
@@ -1477,14 +1477,14 @@ class RESTfulAPI:
             await self._report_error_event(model_uid, str(e))
             raise HTTPException(status_code=500, detail=str(e))

-        from ..model.llm.utils import QWEN_TOOL_CALL_FAMILY
+        from ..model.llm.utils import GLM4_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY

         model_family = desc.get("model_family", "")
-        function_call_models = [
-            "chatglm3",
-            "glm4-chat",
-            "gorilla-openfunctions-v1",
-        ] + QWEN_TOOL_CALL_FAMILY
+        function_call_models = (
+            ["chatglm3", "gorilla-openfunctions-v1"]
+            + QWEN_TOOL_CALL_FAMILY
+            + GLM4_TOOL_CALL_FAMILY
+        )

         is_qwen = desc.get("model_format") == "ggmlv3" and "qwen-chat" == model_family

xinference/client/restful/restful_client.py CHANGED
@@ -182,8 +182,6 @@ class RESTfulRerankModelHandle(RESTfulModelHandle):
                 f"Failed to rerank documents, detail: {response.json()['detail']}"
             )
         response_data = response.json()
-        for r in response_data["results"]:
-            r["document"] = documents[r["index"]]
         return response_data

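Note for client users: with the loop above removed, rerank now returns the server payload as served, so results no longer carry a client-injected "document" field. A minimal caller-side sketch for restoring the mapping (the endpoint URL and model uid are illustrative assumptions):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # assumed local endpoint
    model = client.get_model("my-rerank-model")      # hypothetical model uid

    documents = ["Paris is in France.", "Berlin is in Germany."]
    result = model.rerank(documents=documents, query="Where is Paris?")
    for r in result["results"]:
        # each result still carries "index" into the original documents list
        print(r["relevance_score"], documents[r["index"]])
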
xinference/core/model.py CHANGED
@@ -65,6 +65,9 @@ except ImportError:
     OutOfMemoryError = _OutOfMemoryError


+XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = ["qwen-vl-chat", "cogvlm2", "glm-4v"]
+
+
 def request_limit(fn):
     """
     Used by ModelActor.
@@ -268,11 +271,25 @@ class ModelActor(xo.StatelessActor):

         model_ability = self._model_description.get("model_ability", [])

-        return (
-            XINFERENCE_TRANSFORMERS_ENABLE_BATCHING
-            and isinstance(self._model, PytorchModel)
-            and "vision" not in model_ability
+        condition = XINFERENCE_TRANSFORMERS_ENABLE_BATCHING and isinstance(
+            self._model, PytorchModel
         )
+        if condition and "vision" in model_ability:
+            if (
+                self._model.model_family.model_name
+                in XINFERENCE_BATCHING_ALLOWED_VISION_MODELS
+                or self._model.model_family.model_family
+                in XINFERENCE_BATCHING_ALLOWED_VISION_MODELS
+            ):
+                return True
+            else:
+                logger.warning(
+                    f"Currently for multimodal models, "
+                    f"xinference only supports {', '.join(XINFERENCE_BATCHING_ALLOWED_VISION_MODELS)} for batching. "
+                    f"Your model {self._model.model_family.model_name} with model family {self._model.model_family.model_family} is disqualified."
+                )
+                return False
+        return condition

     async def load(self):
         self._model.load()
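The effect is that continuous batching stays opt-in: it requires the XINFERENCE_TRANSFORMERS_ENABLE_BATCHING switch plus a Transformers (PytorchModel) backend, and vision models must additionally be allow-listed. A condensed sketch of the decision rule (simplified from the method above; not the ModelActor code itself):

    ALLOWED_VISION_MODELS = ["qwen-vl-chat", "cogvlm2", "glm-4v"]

    def allow_batching(batching_enabled: bool, is_pytorch_model: bool,
                       model_ability: list, name: str, family: str) -> bool:
        condition = batching_enabled and is_pytorch_model
        if condition and "vision" in model_ability:
            # multimodal models are batched only when explicitly allow-listed
            return name in ALLOWED_VISION_MODELS or family in ALLOWED_VISION_MODELS
        return condition
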
xinference/core/scheduler.py CHANGED
@@ -82,6 +82,8 @@ class InferenceRequest:
         # Record error message when this request has error.
         # Must set stopped=True when this field is set.
         self.error_msg: Optional[str] = None
+        # For compatibility. Record some extra parameters for some special cases.
+        self.extra_kwargs = {}

         # check the integrity of args passed upstream
         self._check_args()
xinference/core/worker.py CHANGED
@@ -73,6 +73,9 @@ class WorkerActor(xo.StatelessActor):
         self._main_pool.recover_sub_pool = self.recover_sub_pool

         # internal states.
+        # temporary placeholder during model launch process:
+        self._model_uid_launching_guard: Dict[str, bool] = {}
+        # attributes maintained after model launched:
         self._model_uid_to_model: Dict[str, xo.ActorRefType["ModelActor"]] = {}
         self._model_uid_to_model_spec: Dict[str, ModelDescription] = {}
         self._gpu_to_model_uid: Dict[int, str] = {}
@@ -594,10 +597,14 @@
         launch_args.pop("kwargs")
         launch_args.update(kwargs)

-        event_model_uid, _, __ = parse_replica_model_uid(model_uid)
+        try:
+            origin_uid, _, _ = parse_replica_model_uid(model_uid)
+        except Exception as e:
+            logger.exception(e)
+            raise
         try:
             await self._event_collector_ref.report_event(
-                event_model_uid,
+                origin_uid,
                 Event(
                     event_type=EventType.INFO,
                     event_ts=int(time.time()),
@@ -640,50 +647,55 @@
         assert model_uid not in self._model_uid_to_model
         self._check_model_is_valid(model_name, model_format)

-        subpool_address, devices = await self._create_subpool(
-            model_uid, model_type, n_gpu=n_gpu, gpu_idx=gpu_idx
-        )
+        if self.get_model_launch_status(model_uid) is not None:
+            raise ValueError(f"{model_uid} is running")

         try:
-            origin_uid, _, _ = parse_replica_model_uid(model_uid)
-            model, model_description = await asyncio.to_thread(
-                create_model_instance,
-                subpool_address,
-                devices,
-                model_uid,
-                model_type,
-                model_name,
-                model_engine,
-                model_format,
-                model_size_in_billions,
-                quantization,
-                peft_model_config,
-                **kwargs,
-            )
-            await self.update_cache_status(model_name, model_description)
-            model_ref = await xo.create_actor(
-                ModelActor,
-                address=subpool_address,
-                uid=model_uid,
-                worker_address=self.address,
-                model=model,
-                model_description=model_description,
-                request_limits=request_limits,
+            self._model_uid_launching_guard[model_uid] = True
+            subpool_address, devices = await self._create_subpool(
+                model_uid, model_type, n_gpu=n_gpu, gpu_idx=gpu_idx
             )
-            await model_ref.load()
-        except:
-            logger.error(f"Failed to load model {model_uid}", exc_info=True)
-            self.release_devices(model_uid=model_uid)
-            await self._main_pool.remove_sub_pool(subpool_address)
-            raise

-        self._model_uid_to_model[model_uid] = model_ref
-        self._model_uid_to_model_spec[model_uid] = model_description
-        self._model_uid_to_addr[model_uid] = subpool_address
-        self._model_uid_to_recover_count.setdefault(
-            model_uid, MODEL_ACTOR_AUTO_RECOVER_LIMIT
-        )
-        self._model_uid_to_launch_args[model_uid] = launch_args
+            try:
+                model, model_description = await asyncio.to_thread(
+                    create_model_instance,
+                    subpool_address,
+                    devices,
+                    model_uid,
+                    model_type,
+                    model_name,
+                    model_engine,
+                    model_format,
+                    model_size_in_billions,
+                    quantization,
+                    peft_model_config,
+                    **kwargs,
+                )
+                await self.update_cache_status(model_name, model_description)
+                model_ref = await xo.create_actor(
+                    ModelActor,
+                    address=subpool_address,
+                    uid=model_uid,
+                    worker_address=self.address,
+                    model=model,
+                    model_description=model_description,
+                    request_limits=request_limits,
+                )
+                await model_ref.load()
+            except:
+                logger.error(f"Failed to load model {model_uid}", exc_info=True)
+                self.release_devices(model_uid=model_uid)
+                await self._main_pool.remove_sub_pool(subpool_address)
+                raise
+            self._model_uid_to_model[model_uid] = model_ref
+            self._model_uid_to_model_spec[model_uid] = model_description
+            self._model_uid_to_addr[model_uid] = subpool_address
+            self._model_uid_to_recover_count.setdefault(
+                model_uid, MODEL_ACTOR_AUTO_RECOVER_LIMIT
+            )
+            self._model_uid_to_launch_args[model_uid] = launch_args
+        finally:
+            del self._model_uid_launching_guard[model_uid]

         # update status to READY
         abilities = await self._get_model_ability(model, model_type)
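The launch path above wraps the whole subpool/model/actor setup in a launching guard: duplicate launches are rejected up front, and the guard entry is removed in a finally block whether the launch succeeds or fails. A condensed sketch of the pattern (names simplified; not the WorkerActor code itself):

    from typing import Dict

    class LaunchGuard:
        """Reject concurrent launches of the same uid and always clean up."""

        def __init__(self) -> None:
            self._launching: Dict[str, bool] = {}

        async def launch(self, uid: str, do_launch) -> None:
            if uid in self._launching:
                raise ValueError(f"{uid} is running")
            self._launching[uid] = True
            try:
                await do_launch(uid)  # may raise; guard is cleared either way
            finally:
                del self._launching[uid]
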
@@ -694,10 +706,13 @@

     @log_async(logger=logger)
     async def terminate_model(self, model_uid: str):
-        event_model_uid, _, __ = parse_replica_model_uid(model_uid)
+        # Terminate model while its launching is not allow
+        if model_uid in self._model_uid_launching_guard:
+            raise ValueError(f"{model_uid} is launching")
+        origin_uid, _, __ = parse_replica_model_uid(model_uid)
         try:
             await self._event_collector_ref.report_event(
-                event_model_uid,
+                origin_uid,
                 Event(
                     event_type=EventType.INFO,
                     event_ts=int(time.time()),
@@ -708,7 +723,6 @@
             # Report callback error can be log and ignore, should not interrupt the Process
             logger.error("report_event error: %s" % (e))

-        origin_uid, _, _ = parse_replica_model_uid(model_uid)
         await self._status_guard_ref.update_instance_info(
             origin_uid, {"status": LaunchStatus.TERMINATING.name}
         )
@@ -740,6 +754,21 @@
             origin_uid, {"status": LaunchStatus.TERMINATED.name}
         )

+    # Provide an interface for future version of supervisor to call
+    def get_model_launch_status(self, model_uid: str) -> Optional[str]:
+        """
+        returns:
+            CREATING: model is launching
+            RREADY: model is running
+            None: model is not running (launch error might have happened)
+        """
+
+        if model_uid in self._model_uid_launching_guard:
+            return LaunchStatus.CREATING.name
+        if model_uid in self._model_uid_to_model:
+            return LaunchStatus.READY.name
+        return None
+
     @log_async(logger=logger)
     async def list_models(self) -> Dict[str, Dict[str, Any]]:
         ret = {}
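get_model_launch_status gives callers a three-state probe over the two maps above. A hedged sketch of how a supervisor-side caller might use it (the actor handle and uid are illustrative):

    # status values map directly onto the worker's internal state:
    #   "CREATING" -> uid is in _model_uid_launching_guard
    #   "READY"    -> uid is in _model_uid_to_model
    #   None       -> unknown, failed, or terminated
    status = await worker_ref.get_model_launch_status("my-model-0")  # hypothetical replica uid
    if status is None:
        print("not running; safe to launch")
    elif status == "CREATING":
        print("still launching; terminate_model would raise")
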
xinference/deploy/utils.py CHANGED
@@ -79,6 +79,12 @@ def get_config_dict(
             "stream": "ext://sys.stderr",
             "filters": ["logger_name_filter"],
         },
+        "console_handler": {
+            "class": "logging.StreamHandler",
+            "formatter": "formatter",
+            "level": log_level,
+            "stream": "ext://sys.stderr",
+        },
         "file_handler": {
             "class": "logging.handlers.RotatingFileHandler",
             "formatter": "formatter",
@@ -95,7 +101,32 @@
             "handlers": ["stream_handler", "file_handler"],
             "level": log_level,
             "propagate": False,
-        }
+        },
+        "uvicorn": {
+            "handlers": ["stream_handler", "file_handler"],
+            "level": log_level,
+            "propagate": False,
+        },
+        "uvicorn.error": {
+            "handlers": ["stream_handler", "file_handler"],
+            "level": log_level,
+            "propagate": False,
+        },
+        "uvicorn.access": {
+            "handlers": ["stream_handler", "file_handler"],
+            "level": log_level,
+            "propagate": False,
+        },
+        "transformers": {
+            "handlers": ["console_handler", "file_handler"],
+            "level": log_level,
+            "propagate": False,
+        },
+        "vllm": {
+            "handlers": ["console_handler", "file_handler"],
+            "level": log_level,
+            "propagate": False,
+        },
     },
     "root": {
         "level": "WARN",
@@ -127,7 +158,7 @@ def health_check(address: str, max_attempts: int, sleep_interval: int = 3) -> bool:
     while attempts < max_attempts:
         time.sleep(sleep_interval)
         try:
-            from xinference.core.supervisor import SupervisorActor
+            from ..core.supervisor import SupervisorActor

             supervisor_ref: xo.ActorRefType[SupervisorActor] = await xo.actor_ref(  # type: ignore
                 address=address, uid=SupervisorActor.uid()
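These logger entries matter because the dict returned by get_config_dict is standard logging.config input: routing "uvicorn", "transformers", and "vllm" through xinference's handlers with propagate set to False keeps third-party records in the same stream and rotating file as xinference's own. A minimal self-contained sketch of the mechanism (the formatter string is an assumption):

    import logging
    import logging.config

    config = {
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": {
            "formatter": {"format": "%(asctime)s %(name)s %(levelname)s %(message)s"},
        },
        "handlers": {
            "console_handler": {
                "class": "logging.StreamHandler",
                "formatter": "formatter",
                "level": "INFO",
                "stream": "ext://sys.stderr",
            },
        },
        "loggers": {
            # captured here instead of propagating to the root logger
            "vllm": {"handlers": ["console_handler"], "level": "INFO", "propagate": False},
        },
    }
    logging.config.dictConfig(config)
    logging.getLogger("vllm").info("routed through the shared handler")
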
xinference/model/llm/__init__.py CHANGED
@@ -34,6 +34,7 @@ from .llm_family import (
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
     LLAMA_CLASSES,
     LLM_ENGINES,
+    MLX_CLASSES,
     SGLANG_CLASSES,
     SUPPORTED_ENGINES,
     TRANSFORMERS_CLASSES,
@@ -42,6 +43,7 @@ from .llm_family import (
     GgmlLLMSpecV1,
     LLMFamilyV1,
     LLMSpecV1,
+    MLXLLMSpecV1,
     PromptStyleV1,
     PytorchLLMSpecV1,
     get_cache_status,
@@ -112,6 +114,7 @@ def generate_engine_config_by_model_family(model_family):
 def _install():
     from .ggml.chatglm import ChatglmCppChatModel
     from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
+    from .mlx.core import MLXChatModel, MLXModel
     from .pytorch.baichuan import BaichuanPytorchChatModel
     from .pytorch.chatglm import ChatglmPytorchChatModel
     from .pytorch.cogvlm2 import CogVLM2Model
@@ -147,6 +150,7 @@
     )
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel])
+    MLX_CLASSES.extend([MLXModel, MLXChatModel])
     TRANSFORMERS_CLASSES.extend(
         [
             BaichuanPytorchChatModel,
@@ -176,6 +180,7 @@
     SUPPORTED_ENGINES["SGLang"] = SGLANG_CLASSES
     SUPPORTED_ENGINES["Transformers"] = TRANSFORMERS_CLASSES
     SUPPORTED_ENGINES["llama.cpp"] = LLAMA_CLASSES
+    SUPPORTED_ENGINES["MLX"] = MLX_CLASSES

     json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
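Together with the new xinference/model/llm/mlx/core.py module and the mlx-format specs added to llm_family.json below, this registers MLX as a selectable engine. A hedged launch sketch via the REST client (endpoint and parameters are illustrative; MLX targets Apple-silicon hosts):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # assumed local endpoint
    uid = client.launch_model(
        model_name="qwen2-instruct",
        model_engine="MLX",            # the key registered in SUPPORTED_ENGINES
        model_format="mlx",
        model_size_in_billions="0_5",  # matches the "0_5" spec below
        quantization="4-bit",
    )
    model = client.get_model(uid)
    print(model.chat("Hello!"))
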
xinference/model/llm/llm_family.json CHANGED
@@ -944,7 +944,7 @@
           "none"
         ],
         "model_id": "THUDM/glm-4v-9b",
-        "model_revision": "e8b84fefc07e58a90c8489337675573fda95e289"
+        "model_revision": "6c2e4732db8443f64a48d5af04b74425a7d169c4"
       }
     ],
     "prompt_style": {
@@ -2549,6 +2549,38 @@
         ],
         "model_id": "Qwen/Qwen2-72B-Instruct-AWQ"
       },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "Qwen/Qwen2-0.5B-Instruct-MLX"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct-MLX"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct-MLX"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "mlx-community/Qwen2-72B-Instruct-4bit"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": "0_5",
@@ -2565,6 +2597,82 @@
         ],
         "model_id": "Qwen/Qwen2-0.5B-Instruct-GGUF",
         "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-1_5b-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-7b-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-72B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-72b-instruct-{quantization}.gguf",
+        "model_file_name_split_template": "qwen2-72b-instruct-{quantization}-{part}.gguf",
+        "quantization_parts": {
+          "q5_0": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "q5_k_m": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "q6_k": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "q8_0": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "fp16": [
+            "00001-of-00004",
+            "00002-of-00004",
+            "00003-of-00004",
+            "00004-of-00004"
+          ]
+        }
       }
     ],
     "prompt_style": {
@@ -2618,6 +2726,34 @@
           "Int4"
         ],
         "model_id": "Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-57B-A14B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-57b-a14b-instruct-{quantization}.gguf",
+        "model_file_name_split_template": "qwen2-57b-a14b-instruct-{quantization}-{part}.gguf",
+        "quantization_parts": {
+          "q8_0": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "fp16": [
+            "00001-of-00003",
+            "00002-of-00003",
+            "00003-of-00003"
+          ]
+        }
       }
     ],
     "prompt_style": {
@@ -5809,6 +5945,16 @@
       "roles": [
         "user",
         "assistant"
+      ],
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
       ]
     }
   },
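The stop_token_ids added above are the token-id counterparts of the string stops; for Qwen2-style vocabularies, 151643/151644/151645 correspond to <|endoftext|>, <|im_start|>, and <|im_end|>. A generic sketch of how a decode loop consumes both fields (illustrative; not the engine code):

    STOP_TOKEN_IDS = {151643, 151644, 151645}
    STOP_STRINGS = ("<|endoftext|>", "<|im_start|>", "<|im_end|>")

    def should_stop(next_token_id: int, decoded_tail: str) -> bool:
        # stop on a special token id, or when a stop string appears in the text
        return next_token_id in STOP_TOKEN_IDS or any(
            s in decoded_tail for s in STOP_STRINGS
        )
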
@@ -5997,6 +6143,99 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "gemma-2-it",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-2-9b-it"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-2-27b-it"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-9b-it-4bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "8-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-9b-it-8bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "None"
+        ],
+        "model_id": "mlx-community/gemma-2-9b-it-fp16"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-27b-it-4bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "8-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-27b-it-8bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "None"
+        ],
+        "model_id": "mlx-community/gemma-2-27b-it-fp16"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "gemma",
+      "roles": [
+        "user",
+        "model"
+      ],
+      "stop": [
+        "<end_of_turn>",
+        "<start_of_turn>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,