xinference 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.

Files changed (125)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +49 -65
  3. xinference/core/model.py +77 -19
  4. xinference/core/supervisor.py +81 -10
  5. xinference/core/utils.py +2 -2
  6. xinference/core/worker.py +32 -0
  7. xinference/model/image/model_spec.json +18 -0
  8. xinference/model/image/model_spec_modelscope.json +20 -0
  9. xinference/model/llm/__init__.py +2 -0
  10. xinference/model/llm/llm_family.json +96 -0
  11. xinference/model/llm/llm_family_modelscope.json +99 -0
  12. xinference/model/llm/mlx/core.py +23 -73
  13. xinference/model/llm/transformers/cogagent.py +272 -0
  14. xinference/model/llm/transformers/core.py +1 -0
  15. xinference/model/llm/transformers/qwen2_vl.py +10 -1
  16. xinference/model/llm/utils.py +27 -3
  17. xinference/model/llm/vllm/core.py +37 -7
  18. xinference/model/llm/vllm/xavier/__init__.py +13 -0
  19. xinference/model/llm/vllm/xavier/allocator.py +74 -0
  20. xinference/model/llm/vllm/xavier/block.py +112 -0
  21. xinference/model/llm/vllm/xavier/block_manager.py +71 -0
  22. xinference/model/llm/vllm/xavier/block_tracker.py +116 -0
  23. xinference/model/llm/vllm/xavier/engine.py +247 -0
  24. xinference/model/llm/vllm/xavier/executor.py +132 -0
  25. xinference/model/llm/vllm/xavier/scheduler.py +422 -0
  26. xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
  27. xinference/model/llm/vllm/xavier/test/test_xavier.py +122 -0
  28. xinference/model/llm/vllm/xavier/transfer.py +298 -0
  29. xinference/model/video/diffusers.py +14 -0
  30. xinference/model/video/model_spec.json +15 -0
  31. xinference/model/video/model_spec_modelscope.json +16 -0
  32. xinference/types.py +13 -0
  33. xinference/web/ui/build/asset-manifest.json +6 -6
  34. xinference/web/ui/build/index.html +1 -1
  35. xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
  36. xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
  37. xinference/web/ui/build/static/js/main.1eb206d1.js +3 -0
  38. xinference/web/ui/build/static/js/main.1eb206d1.js.map +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/2213d49de260e1f67c888081b18f120f5225462b829ae57c9e05a05cec83689d.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
  67. xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
  68. xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
  69. xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
  70. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
  71. xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
  72. xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
  73. xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
  74. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
  79. xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
  80. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
  81. xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
  82. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
  83. xinference/web/ui/node_modules/.package-lock.json +67 -3
  84. xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
  85. xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
  86. xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
  87. xinference/web/ui/node_modules/i18next/package.json +129 -0
  88. xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
  89. xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
  90. xinference/web/ui/node_modules/react-i18next/package.json +162 -0
  91. xinference/web/ui/node_modules/void-elements/package.json +34 -0
  92. xinference/web/ui/package-lock.json +69 -3
  93. xinference/web/ui/package.json +2 -0
  94. xinference/web/ui/src/locales/en.json +186 -0
  95. xinference/web/ui/src/locales/zh.json +186 -0
  96. {xinference-1.1.1.dist-info → xinference-1.2.0.dist-info}/METADATA +9 -6
  97. {xinference-1.1.1.dist-info → xinference-1.2.0.dist-info}/RECORD +102 -56
  98. xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
  99. xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
  100. xinference/web/ui/build/static/js/main.4eb4ee80.js +0 -3
  101. xinference/web/ui/build/static/js/main.4eb4ee80.js.map +0 -1
  102. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
  103. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
  104. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
  105. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
  106. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
  107. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
  108. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
  109. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
  110. xinference/web/ui/node_modules/.cache/babel-loader/8c5eeb02f772d02cbe8b89c05428d0dd41a97866f75f7dc1c2164a67f5a1cf98.json +0 -1
  111. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
  112. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
  113. xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
  114. xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
  115. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
  116. xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
  117. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
  118. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
  119. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
  120. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
  121. /xinference/web/ui/build/static/js/{main.4eb4ee80.js.LICENSE.txt → main.1eb206d1.js.LICENSE.txt} +0 -0
  122. {xinference-1.1.1.dist-info → xinference-1.2.0.dist-info}/LICENSE +0 -0
  123. {xinference-1.1.1.dist-info → xinference-1.2.0.dist-info}/WHEEL +0 -0
  124. {xinference-1.1.1.dist-info → xinference-1.2.0.dist-info}/entry_points.txt +0 -0
  125. {xinference-1.1.1.dist-info → xinference-1.2.0.dist-info}/top_level.txt +0 -0
xinference/core/worker.py CHANGED
@@ -22,6 +22,7 @@ import signal
 import threading
 import time
 from collections import defaultdict
+from dataclasses import dataclass
 from logging import getLogger
 from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
 
@@ -58,6 +59,11 @@ else:
     MODEL_ACTOR_AUTO_RECOVER_LIMIT = None
 
 
+@dataclass
+class ModelStatus:
+    last_error: str = ""
+
+
 class WorkerActor(xo.StatelessActor):
     def __init__(
         self,
@@ -90,6 +96,7 @@ class WorkerActor(xo.StatelessActor):
         # attributes maintained after model launched:
         self._model_uid_to_model: Dict[str, xo.ActorRefType["ModelActor"]] = {}
         self._model_uid_to_model_spec: Dict[str, ModelDescription] = {}
+        self._model_uid_to_model_status: Dict[str, ModelStatus] = {}
         self._gpu_to_model_uid: Dict[int, str] = {}
         self._gpu_to_embedding_model_uids: Dict[int, Set[str]] = defaultdict(set)
         # Dict structure: gpu_index: {(replica_model_uid, model_type)}
@@ -866,6 +873,9 @@ class WorkerActor(xo.StatelessActor):
         )
 
         try:
+            xavier_config: Optional[Dict] = kwargs.pop("xavier_config", None)
+            if xavier_config is not None:
+                xavier_config["rank_address"] = subpool_address
             model, model_description = await asyncio.to_thread(
                 create_model_instance,
                 subpool_address,
@@ -893,6 +903,7 @@ class WorkerActor(xo.StatelessActor):
                 model=model,
                 model_description=model_description,
                 request_limits=request_limits,
+                xavier_config=xavier_config,
             )
             await model_ref.load()
         except:
@@ -902,6 +913,7 @@ class WorkerActor(xo.StatelessActor):
             raise
         self._model_uid_to_model[model_uid] = model_ref
         self._model_uid_to_model_spec[model_uid] = model_description
+        self._model_uid_to_model_status[model_uid] = ModelStatus()
         self._model_uid_to_addr[model_uid] = subpool_address
         self._model_uid_to_recover_count.setdefault(
             model_uid, MODEL_ACTOR_AUTO_RECOVER_LIMIT
@@ -921,6 +933,7 @@ class WorkerActor(xo.StatelessActor):
             origin_uid,
             {"model_ability": abilities, "status": LaunchStatus.READY.name},
         )
+        return subpool_address
 
     @log_async(logger=logger, level=logging.INFO)
     async def terminate_model(self, model_uid: str, is_model_die=False):
@@ -976,6 +989,7 @@ class WorkerActor(xo.StatelessActor):
             status = LaunchStatus.ERROR.name
         else:
             status = LaunchStatus.TERMINATED.name
+        self._model_uid_to_model_status.pop(model_uid, None)
 
         if self._status_guard_ref is None:
             _ = await self.get_supervisor_ref()
@@ -1010,6 +1024,9 @@ class WorkerActor(xo.StatelessActor):
 
     @log_sync(logger=logger)
     def get_model(self, model_uid: str) -> xo.ActorRefType["ModelActor"]:
+        model_status = self._model_uid_to_model_status.get(model_uid)
+        if model_status and model_status.last_error:
+            raise Exception(model_status.last_error)
         model_ref = self._model_uid_to_model.get(model_uid, None)
         if model_ref is None:
             raise ValueError(f"Model not found, uid: {model_uid}")
@@ -1138,6 +1155,21 @@ class WorkerActor(xo.StatelessActor):
         }
         return ret
 
+    def update_model_status(self, model_uid: str, **kwargs):
+        model_status = self._model_uid_to_model_status.get(model_uid)
+        if model_status is not None:
+            for k, v in kwargs.items():
+                setattr(model_status, k, v)
+
+    def get_model_status(self, model_uid: str):
+        return self._model_uid_to_model_status.get(model_uid)
+
     @staticmethod
     def record_metrics(name, op, kwargs):
         record_metrics(name, op, kwargs)
+
+    async def start_transfer_for_vllm(
+        self, rep_model_uid: str, rank_addresses: List[str]
+    ):
+        model_ref = self._model_uid_to_model[rep_model_uid]
+        await model_ref.start_transfer_for_vllm(rank_addresses)
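
Taken together, the worker.py changes add per-model error bookkeeping: every launched model gets a ModelStatus record, terminate_model discards it, and get_model re-raises a recorded last_error instead of handing out a reference to a dead model actor. A minimal standalone sketch of this pattern (StatusRegistry is a hypothetical name for illustration, not an xinference class):

    from dataclasses import dataclass
    from typing import Dict

    @dataclass
    class ModelStatus:
        last_error: str = ""

    class StatusRegistry:
        """Tracks the last error seen per model uid, mirroring WorkerActor."""

        def __init__(self) -> None:
            self._statuses: Dict[str, ModelStatus] = {}

        def register(self, model_uid: str) -> None:
            # Called when a model launches successfully.
            self._statuses[model_uid] = ModelStatus()

        def update(self, model_uid: str, **kwargs) -> None:
            # Called when the model hits an unrecoverable error.
            status = self._statuses.get(model_uid)
            if status is not None:
                for k, v in kwargs.items():
                    setattr(status, k, v)

        def check(self, model_uid: str) -> None:
            # Called before returning a model ref; surfaces the stored error.
            status = self._statuses.get(model_uid)
            if status and status.last_error:
                raise Exception(status.last_error)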
xinference/model/image/model_spec.json CHANGED
@@ -167,6 +167,24 @@
     ],
     "gguf_model_file_name_template": "sd3.5_large_turbo-{quantization}.gguf"
   },
+  {
+    "model_name": "HunyuanDiT-v1.2",
+    "model_family": "stable_diffusion",
+    "model_id": "Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers",
+    "model_revision": "5e96094e0ad19e7f475de8711f03634ca0ccc40c",
+    "model_ability": [
+      "text2image"
+    ]
+  },
+  {
+    "model_name": "HunyuanDiT-v1.2-Distilled",
+    "model_family": "stable_diffusion",
+    "model_id": "Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers-Distilled",
+    "model_revision": "ba991d1546d8c50936c4c16398ed0a87b9b99fb1",
+    "model_ability": [
+      "text2image"
+    ]
+  },
   {
     "model_name": "sd-turbo",
     "model_family": "stable_diffusion",
xinference/model/image/model_spec_modelscope.json CHANGED
@@ -173,6 +173,26 @@
     ],
     "gguf_model_file_name_template": "sd3.5_large_turbo-{quantization}.gguf"
   },
+  {
+    "model_name": "HunyuanDiT-v1.2",
+    "model_family": "stable_diffusion",
+    "model_hub": "modelscope",
+    "model_id": "Xorbits/HunyuanDiT-v1.2-Diffusers",
+    "model_revision": "master",
+    "model_ability": [
+      "text2image"
+    ]
+  },
+  {
+    "model_name": "HunyuanDiT-v1.2-Distilled",
+    "model_family": "stable_diffusion",
+    "model_hub": "modelscope",
+    "model_id": "Xorbits/HunyuanDiT-v1.2-Diffusers-Distilled",
+    "model_revision": "master",
+    "model_ability": [
+      "text2image"
+    ]
+  },
   {
     "model_name": "sd-turbo",
     "model_family": "stable_diffusion",
xinference/model/llm/__init__.py CHANGED
@@ -134,6 +134,7 @@ def _install():
     from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
     from .sglang.core import SGLANGChatModel, SGLANGModel
     from .transformers.chatglm import ChatglmPytorchChatModel
+    from .transformers.cogagent import CogAgentChatModel
    from .transformers.cogvlm2 import CogVLM2Model
     from .transformers.cogvlm2_video import CogVLM2VideoModel
     from .transformers.core import PytorchChatModel, PytorchModel
@@ -195,6 +196,7 @@ def _install():
             DeepSeekV2PytorchChatModel,
             OptPytorchModel,
             GlmEdgeVModel,
+            CogAgentChatModel,
         ]
     )
     if OmniLMMModel:  # type: ignore
xinference/model/llm/llm_family.json CHANGED
@@ -8989,5 +8989,101 @@
       "<|im_end|>",
       "<|endoftext|>"
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "marco-o1",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "Marco-o1: Towards Open Reasoning Models for Open-Ended Solutions",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "AIDC-AI/Marco-o1"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "QuantFactory/Marco-o1-GGUF",
+        "model_file_name_template": "Marco-o1.{quantization}.gguf"
+      }
+    ],
+    "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\n\n你是一个经过良好训练的AI助手,你的名字是Marco-o1.由阿里国际数字商业集团的AI Business创造.\n \n## 重要!!!!!\n当你回答问题时,你的思考应该在<Thought>内完成,<Output>内输出你的结果。\n<Thought>应该尽可能是英文,但是有2个特例,一个是对原文中的引用,另一个是是数学应该使用markdown格式,<Output>内的输出需要遵循用户输入的语言。\n <|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      151643,
+      151644,
+      151645
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "cogagent",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "The CogAgent-9B-20241220 model is based on GLM-4V-9B, a bilingual open-source VLM base model. Through data collection and optimization, multi-stage training, and strategy improvements, CogAgent-9B-20241220 achieves significant advancements in GUI perception, inference prediction accuracy, action space completeness, and task generalizability. ",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "9",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "THUDM/cogagent-9b-20241220"
+      }
+    ],
+    "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      151329,
+      151336,
+      151338
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|user|>",
+      "<|observation|>"
+    ]
   }
 ]
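
With these family entries in place, marco-o1 and cogagent become launchable built-ins. A hedged sketch for the GGUF variant of marco-o1 (endpoint and quantization choice are assumptions; the chat call follows the current messages-based client API, whose exact signature may differ by version):

    from xinference.client import Client

    client = Client("http://localhost:9997")  # assumed local endpoint
    model_uid = client.launch_model(
        model_name="marco-o1",
        model_format="ggufv2",
        model_size_in_billions=7,
        quantization="Q4_K_M",
    )
    model = client.get_model(model_uid)
    completion = model.chat(
        messages=[{"role": "user", "content": "Reason step by step: 17 * 23 = ?"}]
    )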
xinference/model/llm/llm_family_modelscope.json CHANGED
@@ -6722,5 +6722,104 @@
       "<|im_end|>",
       "<|endoftext|>"
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "marco-o1",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "Marco-o1: Towards Open Reasoning Models for Open-Ended Solutions",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "AIDC-AI/Marco-o1",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_file_name_template": "Marco-o1.{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "QuantFactory/Marco-o1-GGUF"
+      }
+    ],
+    "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\n\n你是一个经过良好训练的AI助手,你的名字是Marco-o1.由阿里国际数字商业集团的AI Business创造.\n \n## 重要!!!!!\n当你回答问题时,你的思考应该在<Thought>内完成,<Output>内输出你的结果。\n<Thought>应该尽可能是英文,但是有2个特例,一个是对原文中的引用,另一个是是数学应该使用markdown格式,<Output>内的输出需要遵循用户输入的语言。\n <|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      151643,
+      151644,
+      151645
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "cogagent",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "The CogAgent-9B-20241220 model is based on GLM-4V-9B, a bilingual open-source VLM base model. Through data collection and optimization, multi-stage training, and strategy improvements, CogAgent-9B-20241220 achieves significant advancements in GUI perception, inference prediction accuracy, action space completeness, and task generalizability. ",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "9",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "ZhipuAI/cogagent-9b-20241220",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      151329,
+      151336,
+      151338
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|user|>",
+      "<|observation|>"
+    ]
   }
 ]
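
The ModelScope variants mirror the Hugging Face entries, differing only in model_hub and model_id. A hedged sketch of launching cogagent from ModelScope (assumption: XINFERENCE_MODEL_SRC=modelscope is set in the server's environment, so the ZhipuAI/cogagent-9b-20241220 repo above is used):

    from xinference.client import Client

    client = Client("http://localhost:9997")  # assumed local endpoint
    model_uid = client.launch_model(
        model_name="cogagent",
        model_format="pytorch",
        quantization="none",
    )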
xinference/model/llm/mlx/core.py CHANGED
@@ -477,39 +477,6 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
         self._model, self._processor = self._load_model(**kwargs)
         self._tokenizer = self._processor.tokenizer
 
-    def _generate_stream_inner_no_image(self, **kwargs):
-        import mlx.nn as nn
-        from mlx_lm.utils import make_sampler, stream_generate
-
-        # For mlx-lm, the model(inputs) will return logits,
-        # but the language model in mlx-vlm will return an object
-        # https://github.com/Blaizzy/mlx-vlm/blob/3f5e1620072440afb7496940f67ac1c7fc64056f/mlx_vlm/models/base.py#L260
-        # so we cannot pass the language model to stream_generate directly
-        # we wrap here to just let model(inputs) return logits to pass stream_generate
-        class ModelWrapper(nn.Module):
-            def __init__(self, model):
-                super().__init__()
-                self._model = model.language_model
-
-            @property
-            def layers(self):
-                return self._model.layers
-
-            def __call__(self, *args, **kwargs):
-                return self._model(*args, **kwargs).logits
-
-        sampler = make_sampler(
-            temp=kwargs.pop("temperature"), top_p=kwargs.pop("top_p")
-        )
-        prompt_token_ids = kwargs.pop("prompt_token_ids")
-        yield from stream_generate(
-            ModelWrapper(self._model),
-            self._tokenizer,
-            prompt_token_ids,
-            sampler=sampler,
-            **kwargs,
-        )
-
     def _generate_stream_inner(self, **kwargs):
         import mlx.core as mx
         from mlx_lm.utils import GenerationResponse
@@ -517,27 +484,8 @@
 
         inputs = kwargs["prompt_token_ids"]
 
-        if not isinstance(inputs, tuple):
-            # no images
-            yield from self._generate_stream_inner_no_image(**kwargs)
-            return
-
         max_tokens = kwargs.pop("max_tokens")
-        input_ids, pixel_values, mask = inputs[:3]
-
-        kwargs = {
-            k: v
-            for k, v in zip(
-                [
-                    "image_grid_thw",
-                    "image_sizes",
-                    "aspect_ratio_ids",
-                    "aspect_ratio_mask",
-                    "cross_attention_mask",
-                ],
-                inputs[3:],
-            )
-        }
+        input_ids, pixel_values, mask, kwargs = inputs
 
         tokenizer = self._processor.tokenizer
         detokenizer = self._processor.detokenizer
@@ -583,37 +531,39 @@
     def _prepare_inputs(
         self, prompt: Union[str, Dict[str, Any]], kwargs
     ) -> Tuple[Any, int]:
+        import mlx.core as mx
         from mlx_vlm import prepare_inputs
 
         prompt_str = prompt.get("prompt")  # type: ignore
         images = prompt.get("multi_modal_data", {}).get("image")  # type: ignore
         if images and not isinstance(images, list):
             images = [images]
-        if hasattr(self._model.config, "image_token_index"):
-            image_token_index = self._model.config.image_token_index
-        else:
-            image_token_index = None
+        resize_shape = kwargs.pop("resize_shape", None)
+        image_token_index = getattr(self._model.config, "image_token_index", None)
+
+        processor = self._processor
+        tokenizer = processor if hasattr(processor, "encode") else processor.tokenizer
+        prompt_tokens = mx.array(tokenizer.encode(prompt_str))
 
         if not images:
-            prompt = prompt["prompt"]  # type: ignore
-            prompt_token_ids = self._tokenizer.encode(prompt)
-            prompt_token_ids = self._get_prompt_cache(
-                prompt_token_ids,
-                kwargs.get("lora_name"),
-                model=self._model.language_model,
-            )
-            return prompt_token_ids, len(prompt_token_ids)
+            input_ids = prompt_tokens[None, :]
+            pixel_values = mask = None
+            kwargs = {}
+            input_token_len = input_ids.size
         else:
             inputs = prepare_inputs(
-                None,
-                self._processor,
-                images,
-                prompt_str,
-                image_token_index,
-                kwargs.get("resize_shape"),
+                processor, images, prompt_str, image_token_index, resize_shape
             )
-            input_ids = inputs[0]
-            return inputs, len(input_ids)
+            input_ids = inputs["input_ids"]
+            pixel_values = inputs["pixel_values"]
+            mask = inputs["attention_mask"]
+            kwargs = {
+                k: v
+                for k, v in inputs.items()
+                if k not in ["input_ids", "pixel_values", "attention_mask"]
+            }
+            input_token_len = int(mask.sum())
+        return (input_ids, pixel_values, mask, kwargs), input_token_len
 
     def chat(
         self,
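
This refactor collapses the separate text-only and vision paths: _prepare_inputs now always returns a four-tuple plus a token count, so _generate_stream_inner can unpack it unconditionally and the dedicated no-image generator is removed. A hedged sketch of the new contract (model is assumed to be a loaded MLXVisionModel; the extra kwargs keys come from whatever mlx_vlm.prepare_inputs returns):

    # ((input_ids, pixel_values, mask, extra_kwargs), input_token_len)
    inputs, n_prompt_tokens = model._prepare_inputs(
        {"prompt": "Describe the image.", "multi_modal_data": {"image": image}},
        {"resize_shape": None},
    )
    input_ids, pixel_values, mask, extra_kwargs = inputs
    # Text-only prompts yield pixel_values = mask = None and extra_kwargs == {}.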