xinference 1.7.1__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (136)
  1. xinference/_version.py +3 -3
  2. xinference/client/restful/async_restful_client.py +8 -13
  3. xinference/client/restful/restful_client.py +6 -2
  4. xinference/core/chat_interface.py +6 -4
  5. xinference/core/media_interface.py +5 -0
  6. xinference/core/model.py +1 -5
  7. xinference/core/supervisor.py +117 -68
  8. xinference/core/worker.py +49 -37
  9. xinference/deploy/test/test_cmdline.py +2 -6
  10. xinference/model/audio/__init__.py +26 -23
  11. xinference/model/audio/chattts.py +3 -2
  12. xinference/model/audio/core.py +49 -98
  13. xinference/model/audio/cosyvoice.py +3 -2
  14. xinference/model/audio/custom.py +28 -73
  15. xinference/model/audio/f5tts.py +3 -2
  16. xinference/model/audio/f5tts_mlx.py +3 -2
  17. xinference/model/audio/fish_speech.py +3 -2
  18. xinference/model/audio/funasr.py +17 -4
  19. xinference/model/audio/kokoro.py +3 -2
  20. xinference/model/audio/megatts.py +3 -2
  21. xinference/model/audio/melotts.py +3 -2
  22. xinference/model/audio/model_spec.json +572 -171
  23. xinference/model/audio/utils.py +0 -6
  24. xinference/model/audio/whisper.py +3 -2
  25. xinference/model/audio/whisper_mlx.py +3 -2
  26. xinference/model/cache_manager.py +141 -0
  27. xinference/model/core.py +6 -49
  28. xinference/model/custom.py +174 -0
  29. xinference/model/embedding/__init__.py +67 -56
  30. xinference/model/embedding/cache_manager.py +35 -0
  31. xinference/model/embedding/core.py +104 -84
  32. xinference/model/embedding/custom.py +55 -78
  33. xinference/model/embedding/embed_family.py +80 -31
  34. xinference/model/embedding/flag/core.py +21 -5
  35. xinference/model/embedding/llama_cpp/__init__.py +0 -0
  36. xinference/model/embedding/llama_cpp/core.py +234 -0
  37. xinference/model/embedding/model_spec.json +968 -103
  38. xinference/model/embedding/sentence_transformers/core.py +30 -20
  39. xinference/model/embedding/vllm/core.py +11 -5
  40. xinference/model/flexible/__init__.py +8 -2
  41. xinference/model/flexible/core.py +26 -119
  42. xinference/model/flexible/custom.py +69 -0
  43. xinference/model/flexible/launchers/image_process_launcher.py +1 -0
  44. xinference/model/flexible/launchers/modelscope_launcher.py +5 -1
  45. xinference/model/flexible/launchers/transformers_launcher.py +15 -3
  46. xinference/model/flexible/launchers/yolo_launcher.py +5 -1
  47. xinference/model/image/__init__.py +20 -20
  48. xinference/model/image/cache_manager.py +62 -0
  49. xinference/model/image/core.py +70 -182
  50. xinference/model/image/custom.py +28 -72
  51. xinference/model/image/model_spec.json +402 -119
  52. xinference/model/image/ocr/got_ocr2.py +3 -2
  53. xinference/model/image/stable_diffusion/core.py +22 -7
  54. xinference/model/image/stable_diffusion/mlx.py +6 -6
  55. xinference/model/image/utils.py +2 -2
  56. xinference/model/llm/__init__.py +71 -94
  57. xinference/model/llm/cache_manager.py +292 -0
  58. xinference/model/llm/core.py +37 -111
  59. xinference/model/llm/custom.py +88 -0
  60. xinference/model/llm/llama_cpp/core.py +5 -7
  61. xinference/model/llm/llm_family.json +16260 -8151
  62. xinference/model/llm/llm_family.py +138 -839
  63. xinference/model/llm/lmdeploy/core.py +5 -7
  64. xinference/model/llm/memory.py +3 -4
  65. xinference/model/llm/mlx/core.py +6 -8
  66. xinference/model/llm/reasoning_parser.py +3 -1
  67. xinference/model/llm/sglang/core.py +32 -14
  68. xinference/model/llm/transformers/chatglm.py +3 -7
  69. xinference/model/llm/transformers/core.py +49 -27
  70. xinference/model/llm/transformers/deepseek_v2.py +2 -2
  71. xinference/model/llm/transformers/gemma3.py +2 -2
  72. xinference/model/llm/transformers/multimodal/cogagent.py +2 -2
  73. xinference/model/llm/transformers/multimodal/deepseek_vl2.py +2 -2
  74. xinference/model/llm/transformers/multimodal/gemma3.py +2 -2
  75. xinference/model/llm/transformers/multimodal/glm4_1v.py +167 -0
  76. xinference/model/llm/transformers/multimodal/glm4v.py +2 -2
  77. xinference/model/llm/transformers/multimodal/intern_vl.py +2 -2
  78. xinference/model/llm/transformers/multimodal/minicpmv26.py +3 -3
  79. xinference/model/llm/transformers/multimodal/ovis2.py +2 -2
  80. xinference/model/llm/transformers/multimodal/qwen-omni.py +2 -2
  81. xinference/model/llm/transformers/multimodal/qwen2_audio.py +2 -2
  82. xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
  83. xinference/model/llm/transformers/opt.py +3 -7
  84. xinference/model/llm/utils.py +34 -49
  85. xinference/model/llm/vllm/core.py +77 -27
  86. xinference/model/llm/vllm/xavier/engine.py +5 -3
  87. xinference/model/llm/vllm/xavier/scheduler.py +10 -6
  88. xinference/model/llm/vllm/xavier/transfer.py +1 -1
  89. xinference/model/rerank/__init__.py +26 -25
  90. xinference/model/rerank/core.py +47 -87
  91. xinference/model/rerank/custom.py +25 -71
  92. xinference/model/rerank/model_spec.json +158 -33
  93. xinference/model/rerank/utils.py +2 -2
  94. xinference/model/utils.py +115 -54
  95. xinference/model/video/__init__.py +13 -17
  96. xinference/model/video/core.py +44 -102
  97. xinference/model/video/diffusers.py +4 -3
  98. xinference/model/video/model_spec.json +90 -21
  99. xinference/types.py +5 -3
  100. xinference/web/ui/build/asset-manifest.json +3 -3
  101. xinference/web/ui/build/index.html +1 -1
  102. xinference/web/ui/build/static/js/main.7d24df53.js +3 -0
  103. xinference/web/ui/build/static/js/main.7d24df53.js.map +1 -0
  104. xinference/web/ui/node_modules/.cache/babel-loader/2704ff66a5f73ca78b341eb3edec60154369df9d87fbc8c6dd60121abc5e1b0a.json +1 -0
  105. xinference/web/ui/node_modules/.cache/babel-loader/607dfef23d33e6b594518c0c6434567639f24f356b877c80c60575184ec50ed0.json +1 -0
  106. xinference/web/ui/node_modules/.cache/babel-loader/9be3d56173aacc3efd0b497bcb13c4f6365de30069176ee9403b40e717542326.json +1 -0
  107. xinference/web/ui/node_modules/.cache/babel-loader/9f9dd6c32c78a222d07da5987ae902effe16bcf20aac00774acdccc4de3c9ff2.json +1 -0
  108. xinference/web/ui/node_modules/.cache/babel-loader/b2ab5ee972c60d15eb9abf5845705f8ab7e1d125d324d9a9b1bcae5d6fd7ffb2.json +1 -0
  109. xinference/web/ui/src/locales/en.json +0 -1
  110. xinference/web/ui/src/locales/ja.json +0 -1
  111. xinference/web/ui/src/locales/ko.json +0 -1
  112. xinference/web/ui/src/locales/zh.json +0 -1
  113. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/METADATA +9 -11
  114. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/RECORD +119 -119
  115. xinference/model/audio/model_spec_modelscope.json +0 -231
  116. xinference/model/embedding/model_spec_modelscope.json +0 -293
  117. xinference/model/embedding/utils.py +0 -18
  118. xinference/model/image/model_spec_modelscope.json +0 -375
  119. xinference/model/llm/llama_cpp/memory.py +0 -457
  120. xinference/model/llm/llm_family_csghub.json +0 -56
  121. xinference/model/llm/llm_family_modelscope.json +0 -8700
  122. xinference/model/llm/llm_family_openmind_hub.json +0 -1019
  123. xinference/model/rerank/model_spec_modelscope.json +0 -85
  124. xinference/model/video/model_spec_modelscope.json +0 -184
  125. xinference/web/ui/build/static/js/main.9b12b7f9.js +0 -3
  126. xinference/web/ui/build/static/js/main.9b12b7f9.js.map +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +0 -1
  129. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +0 -1
  130. xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +0 -1
  131. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +0 -1
  132. /xinference/web/ui/build/static/js/{main.9b12b7f9.js.LICENSE.txt → main.7d24df53.js.LICENSE.txt} +0 -0
  133. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/WHEEL +0 -0
  134. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/entry_points.txt +0 -0
  135. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/licenses/LICENSE +0 -0
  136. {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/top_level.txt +0 -0

xinference/model/image/ocr/got_ocr2.py

@@ -18,7 +18,7 @@ from typing import TYPE_CHECKING, Optional
 import PIL.Image
 
 if TYPE_CHECKING:
-    from ..core import ImageModelFamilyV1
+    from ..core import ImageModelFamilyV2
 
 logger = logging.getLogger(__name__)
 
@@ -29,9 +29,10 @@ class GotOCR2Model:
         model_uid: str,
         model_path: Optional[str] = None,
         device: Optional[str] = None,
-        model_spec: Optional["ImageModelFamilyV1"] = None,
+        model_spec: Optional["ImageModelFamilyV2"] = None,
         **kwargs,
     ):
+        self.model_family = model_spec
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device

xinference/model/image/stable_diffusion/core.py

@@ -37,7 +37,7 @@ from ..utils import handle_image_result
 
 if TYPE_CHECKING:
     from ....core.progress_tracker import Progressor
-    from ..core import ImageModelFamilyV1
+    from ..core import ImageModelFamilyV2
 
 logger = logging.getLogger(__name__)
 
@@ -87,10 +87,11 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         lora_model: Optional[List[LoRA]] = None,
         lora_load_kwargs: Optional[Dict] = None,
         lora_fuse_kwargs: Optional[Dict] = None,
-        model_spec: Optional["ImageModelFamilyV1"] = None,
+        model_spec: Optional["ImageModelFamilyV2"] = None,
         gguf_model_path: Optional[str] = None,
         **kwargs,
     ):
+        self.model_family = model_spec
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
@@ -239,10 +240,22 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         logger.debug(
             "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
         )
-        self._model = AutoPipelineModel.from_pretrained(
-            self._model_path,
-            **self._kwargs,
-        )
+        try:
+            self._model = AutoPipelineModel.from_pretrained(
+                self._model_path,
+                **self._kwargs,
+            )
+        except ValueError:
+            if "kontext" in self._model_spec.model_name.lower():
+                # TODO: remove this branch when auto pipeline supports
+                # flux.1-kontext-dev
+                from diffusers import FluxKontextPipeline
+
+                self._model = FluxKontextPipeline.from_pretrained(
+                    self._model_path, **self._kwargs
+                )
+            else:
+                raise
         self._load_to_device(self._model)
         self._apply_lora()
 
@@ -657,7 +670,9 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         response_format: str = "url",
         **kwargs,
    ):
-        if self._kwargs.get("controlnet"):
+        if self._kwargs.get("controlnet") or self._model_spec.model_ability == [  # type: ignore
+            "image2image"
+        ]:
            model = self._model
         else:
             ability = "image2image"

xinference/model/image/stable_diffusion/mlx.py

@@ -20,7 +20,6 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
 
 import numpy as np
 from PIL import Image
-from xoscar.utils import classproperty
 
 from ....types import LoRA
 from ..sdapi import SDAPIDiffusionModelMixin
@@ -28,7 +27,7 @@ from ..utils import handle_image_result
 
 if TYPE_CHECKING:
     from ....core.progress_tracker import Progressor
-    from ..core import ImageModelFamilyV1
+    from ..core import ImageModelFamilyV2
 
 
 logger = logging.getLogger(__name__)
@@ -61,9 +60,10 @@ class MLXDiffusionModel(SDAPIDiffusionModelMixin):
         lora_model: Optional[List[LoRA]] = None,
         lora_load_kwargs: Optional[Dict] = None,
         lora_fuse_kwargs: Optional[Dict] = None,
-        model_spec: Optional["ImageModelFamilyV1"] = None,
+        model_spec: Optional["ImageModelFamilyV2"] = None,
         **kwargs,
     ):
+        self.model_family = model_spec
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
@@ -81,9 +81,9 @@ class MLXDiffusionModel(SDAPIDiffusionModelMixin):
     def model_ability(self):
         return self._abilities
 
-    @classproperty
-    def supported_models(self):
-        return ["FLUX.1-schnell", "FLUX.1-dev"]
+    @staticmethod
+    def support_model(model_name: str) -> bool:
+        return "flux" in model_name.lower()
 
     def load(self):
         try:
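
Note: the MLX image backend drops its hard-coded whitelist (`supported_models`) in favor of a name-based predicate, so any FLUX-family model name is accepted. A minimal sketch of the new check, assuming the module path shown in the file list above (the model names are illustrative only):

    # Illustrative only: support_model() does a case-insensitive substring match on "flux".
    from xinference.model.image.stable_diffusion.mlx import MLXDiffusionModel

    assert MLXDiffusionModel.support_model("FLUX.1-schnell") is True
    assert MLXDiffusionModel.support_model("FLUX.1-dev") is True
    assert MLXDiffusionModel.support_model("sd3.5-medium") is False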

xinference/model/image/utils.py

@@ -24,11 +24,11 @@ from ...constants import XINFERENCE_IMAGE_DIR
 from ...types import Image, ImageList
 
 if TYPE_CHECKING:
-    from .core import ImageModelFamilyV1
+    from .core import ImageModelFamilyV2
 
 
 def get_model_version(
-    image_model: "ImageModelFamilyV1", controlnet: Optional["ImageModelFamilyV1"]
+    image_model: "ImageModelFamilyV2", controlnet: Optional["ImageModelFamilyV2"]
 ) -> str:
     return (
         image_model.model_name

xinference/model/llm/__init__.py

@@ -11,28 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import codecs
 import json
 import os
 import warnings
 
+from ..utils import flatten_quantizations
 from .core import (
     LLM,
-    LLM_MODEL_DESCRIPTIONS,
-    LLMDescription,
-    generate_llm_description,
-    get_llm_model_descriptions,
+    LLM_VERSION_INFOS,
+    generate_llm_version_info,
+    get_llm_version_infos,
 )
+from .custom import get_user_defined_llm_families, register_llm, unregister_llm
 from .llm_family import (
-    BUILTIN_CSGHUB_LLM_FAMILIES,
     BUILTIN_LLM_FAMILIES,
     BUILTIN_LLM_MODEL_CHAT_FAMILIES,
     BUILTIN_LLM_MODEL_GENERATE_FAMILIES,
     BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES,
     BUILTIN_LLM_PROMPT_STYLE,
-    BUILTIN_MODELSCOPE_LLM_FAMILIES,
-    BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
     LLAMA_CLASSES,
     LLM_ENGINES,
     LMDEPLOY_CLASSES,
@@ -41,17 +38,13 @@ from .llm_family import (
     SUPPORTED_ENGINES,
     TRANSFORMERS_CLASSES,
     VLLM_CLASSES,
-    CustomLLMFamilyV1,
-    LlamaCppLLMSpecV1,
-    LLMFamilyV1,
+    CustomLLMFamilyV2,
+    LlamaCppLLMSpecV2,
+    LLMFamilyV2,
     LLMSpecV1,
-    MLXLLMSpecV1,
-    PytorchLLMSpecV1,
-    get_cache_status,
-    get_user_defined_llm_families,
+    MLXLLMSpecV2,
+    PytorchLLMSpecV2,
     match_llm,
-    register_llm,
-    unregister_llm,
 )
 
 
@@ -64,69 +57,72 @@ def check_format_with_engine(model_format, engine):
     return True
 
 
-def generate_engine_config_by_model_family(model_family):
+def generate_engine_config_by_model_family(model_family: "LLMFamilyV2"):
     model_name = model_family.model_name
     specs = model_family.model_specs
     engines = LLM_ENGINES.get(model_name, {})  # structure for engine query
     for spec in specs:
         model_format = spec.model_format
         model_size_in_billions = spec.model_size_in_billions
-        quantizations = spec.quantizations
-        for quantization in quantizations:
-            # traverse all supported engines to match the name, format, size in billions and quantization of model
-            for engine in SUPPORTED_ENGINES:
-                if not check_format_with_engine(
-                    model_format, engine
-                ):  # match the format of model with engine
-                    continue
-                CLASSES = SUPPORTED_ENGINES[engine]
-                for cls in CLASSES:
-                    if cls.match(model_family, spec, quantization):
-                        engine_params = engines.get(engine, [])
-                        already_exists = False
-                        # if the name, format and size in billions of model already exists in the structure, add the new quantization
-                        for param in engine_params:
-                            if (
-                                model_name == param["model_name"]
-                                and model_format == param["model_format"]
-                                and model_size_in_billions
-                                == param["model_size_in_billions"]
-                            ):
-                                if quantization not in param["quantizations"]:
-                                    param["quantizations"].append(quantization)
-                                already_exists = True
-                                break
-                        # successfully match the params for the first time, add to the structure
-                        if not already_exists:
-                            engine_params.append(
-                                {
-                                    "model_name": model_name,
-                                    "model_format": model_format,
-                                    "model_size_in_billions": model_size_in_billions,
-                                    "quantizations": [quantization],
-                                    "llm_class": cls,
-                                }
-                            )
-                            if hasattr(spec, "multimodal_projectors"):
-                                engine_params[-1][
-                                    "multimodal_projectors"
-                                ] = spec.multimodal_projectors
-                        engines[engine] = engine_params
-                        break
+        quantization = spec.quantization
+        # traverse all supported engines to match the name, format, size in billions and quantization of model
+        for engine in SUPPORTED_ENGINES:
+            if not check_format_with_engine(
+                model_format, engine
+            ):  # match the format of model with engine
+                continue
+            CLASSES = SUPPORTED_ENGINES[engine]
+            for cls in CLASSES:
+                if cls.match(model_family, spec, quantization):
+                    engine_params = engines.get(engine, [])
+                    already_exists = False
+                    # if the name, format and size in billions of model already exists in the structure, add the new quantization
+                    for param in engine_params:
+                        if (
+                            model_name == param["model_name"]
+                            and model_format == param["model_format"]
+                            and model_size_in_billions
+                            == param["model_size_in_billions"]
+                        ):
+                            if quantization not in param["quantizations"]:
+                                param["quantizations"].append(quantization)
+                            already_exists = True
+                            break
+                    # successfully match the params for the first time, add to the structure
+                    if not already_exists:
+                        engine_params.append(
+                            {
+                                "model_name": model_name,
+                                "model_format": model_format,
+                                "model_size_in_billions": model_size_in_billions,
+                                "quantizations": [quantization],
+                                "llm_class": cls,
+                            }
+                        )
+                        if hasattr(spec, "multimodal_projectors"):
+                            engine_params[-1][
+                                "multimodal_projectors"
+                            ] = spec.multimodal_projectors
+                    engines[engine] = engine_params
+                    break
     LLM_ENGINES[model_name] = engines
 
 
 def register_custom_model():
     from ...constants import XINFERENCE_MODEL_DIR
+    from ..custom import migrate_from_v1_to_v2
+
+    # migrate from v1 to v2 first
+    migrate_from_v1_to_v2("llm", CustomLLMFamilyV2)
 
-    user_defined_llm_dir = os.path.join(XINFERENCE_MODEL_DIR, "llm")
+    user_defined_llm_dir = os.path.join(XINFERENCE_MODEL_DIR, "v2", "llm")
     if os.path.isdir(user_defined_llm_dir):
         for f in os.listdir(user_defined_llm_dir):
             try:
                 with codecs.open(
                     os.path.join(user_defined_llm_dir, f), encoding="utf-8"
                 ) as fd:
-                    user_defined_llm_family = CustomLLMFamilyV1.parse_raw(fd.read())
+                    user_defined_llm_family = CustomLLMFamilyV2.parse_raw(fd.read())
                     register_llm(user_defined_llm_family, persist=False)
             except Exception as e:
                 warnings.warn(f"{user_defined_llm_dir}/{f} has error, {e}")
@@ -135,7 +131,11 @@ def register_custom_model():
 def load_model_family_from_json(json_filename, target_families):
     json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), json_filename)
     for json_obj in json.load(codecs.open(json_path, "r", encoding="utf-8")):
-        model_spec = LLMFamilyV1.parse_obj(json_obj)
+        flattened = []
+        for spec in json_obj["model_specs"]:
+            flattened.extend(flatten_quantizations(spec))
+        json_obj["model_specs"] = flattened
+        model_spec = LLMFamilyV2.parse_obj(json_obj)
         target_families.append(model_spec)
 
     # register chat_template
@@ -178,11 +178,7 @@ def _install():
     from .vllm.core import VLLMChatModel, VLLMModel, VLLMVisionModel
 
     # register llm classes.
-    LLAMA_CLASSES.extend(
-        [
-            XllamaCppModel,
-        ]
-    )
+    LLAMA_CLASSES.extend([XllamaCppModel])
    SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel, SGLANGVisionModel])
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
     MLX_CLASSES.extend([MLXModel, MLXChatModel, MLXVisionModel])
@@ -198,36 +194,17 @@ def _install():
     SUPPORTED_ENGINES["LMDEPLOY"] = LMDEPLOY_CLASSES
 
     load_model_family_from_json("llm_family.json", BUILTIN_LLM_FAMILIES)
-    load_model_family_from_json(
-        "llm_family_modelscope.json", BUILTIN_MODELSCOPE_LLM_FAMILIES
-    )
-    load_model_family_from_json(
-        "llm_family_openmind_hub.json", BUILTIN_OPENMIND_HUB_LLM_FAMILIES
-    )
-    load_model_family_from_json("llm_family_csghub.json", BUILTIN_CSGHUB_LLM_FAMILIES)
-
-    for llm_specs in [
-        BUILTIN_LLM_FAMILIES,
-        BUILTIN_MODELSCOPE_LLM_FAMILIES,
-        BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
-        BUILTIN_CSGHUB_LLM_FAMILIES,
-    ]:
-        for llm_spec in llm_specs:
-            if llm_spec.model_name not in LLM_MODEL_DESCRIPTIONS:
-                LLM_MODEL_DESCRIPTIONS.update(generate_llm_description(llm_spec))
+
+    for family in BUILTIN_LLM_FAMILIES:
+        if family.model_name not in LLM_VERSION_INFOS:
+            LLM_VERSION_INFOS.update(generate_llm_version_info(family))
 
     # traverse all families and add engine parameters corresponding to the model name
-    for families in [
-        BUILTIN_LLM_FAMILIES,
-        BUILTIN_MODELSCOPE_LLM_FAMILIES,
-        BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
-        BUILTIN_CSGHUB_LLM_FAMILIES,
-    ]:
-        for family in families:
-            generate_engine_config_by_model_family(family)
+    for family in BUILTIN_LLM_FAMILIES:
+        generate_engine_config_by_model_family(family)
 
     register_custom_model()
 
     # register model description
     for ud_llm in get_user_defined_llm_families():
-        LLM_MODEL_DESCRIPTIONS.update(generate_llm_description(ud_llm))
+        LLM_VERSION_INFOS.update(generate_llm_version_info(ud_llm))
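
In the v2 family schema each model spec carries a single `quantization`, so `load_model_family_from_json` above flattens the older list-valued `quantizations` field before parsing `LLMFamilyV2`. The helper comes from `xinference/model/utils.py` (also changed in this release, but its body is not shown in this diff); a hedged sketch of the behavior implied by the calling code:

    # Hypothetical sketch, not the actual implementation: expand a v1-style spec dict
    # carrying "quantizations": [...] into one v2-style spec per quantization.
    def flatten_quantizations(spec: dict) -> list[dict]:
        quantizations = spec.pop("quantizations", None)
        if not quantizations:
            return [spec]  # already v2-style: a single "quantization" value
        return [{**spec, "quantization": q} for q in quantizations]

This is consistent with `generate_engine_config_by_model_family` now reading `spec.quantization` (singular) instead of iterating over `spec.quantizations`.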

xinference/model/llm/cache_manager.py (new file)

@@ -0,0 +1,292 @@
+import logging
+import os
+from typing import TYPE_CHECKING, Optional
+
+from ..cache_manager import CacheManager
+
+if TYPE_CHECKING:
+    from .llm_family import LLMFamilyV2
+
+
+logger = logging.getLogger(__name__)
+
+
+class LLMCacheManager(CacheManager):
+    def __init__(
+        self, llm_family: "LLMFamilyV2", multimodal_projector: Optional[str] = None
+    ):
+        super().__init__(llm_family)
+        self._llm_family = llm_family
+        self._model_name = llm_family.model_name
+        self._model_format = llm_family.model_specs[0].model_format
+        self._model_size_in_billions = getattr(
+            llm_family.model_specs[0], "model_size_in_billions", None
+        )
+        self._quantization = llm_family.model_specs[0].quantization
+        self._model_uri = llm_family.model_specs[0].model_uri
+        self._multimodal_projector = multimodal_projector
+        self._model_id = llm_family.model_specs[0].model_id
+        self._model_hub = llm_family.model_specs[0].model_hub
+        self._model_revision = llm_family.model_specs[0].model_revision
+        self._cache_dir = os.path.join(
+            self._v2_cache_dir_prefix,
+            f"{self._model_name.replace('.', '_')}-{self._model_format}-"
+            f"{self._model_size_in_billions}b-{self._quantization}",
+        )
+
+    def cache_uri(self) -> str:
+        from ..utils import parse_uri
+
+        cache_dir = self.get_cache_dir()
+        assert self._model_uri is not None
+        src_scheme, src_root = parse_uri(self._model_uri)
+        if src_root.endswith("/"):
+            # remove trailing path separator.
+            src_root = src_root[:-1]
+
+        if src_scheme == "file":
+            if not os.path.isabs(src_root):
+                raise ValueError(
+                    f"Model URI cannot be a relative path: {self._model_uri}"
+                )
+            if os.path.exists(cache_dir):
+                logger.info(f"Cache {cache_dir} exists")
+                return cache_dir
+            else:
+                os.symlink(src_root, cache_dir, target_is_directory=True)
+            return cache_dir
+        else:
+            raise ValueError(f"Unsupported URL scheme: {src_scheme}")
+
+    def cache_from_huggingface(self) -> str:
+        """
+        Cache model from Hugging Face. Return the cache directory.
+        """
+        import huggingface_hub
+
+        from ..utils import (
+            IS_NEW_HUGGINGFACE_HUB,
+            create_symlink,
+            generate_model_file_names_with_quantization_parts,
+            merge_cached_files,
+            retry_download,
+            symlink_local_file,
+        )
+
+        cache_dir = self.get_cache_dir()
+        if self.get_cache_status():
+            return cache_dir
+
+        use_symlinks = {}
+        if not IS_NEW_HUGGINGFACE_HUB:
+            use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}
+
+        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
+            download_dir = retry_download(
+                huggingface_hub.snapshot_download,
+                self._model_name,
+                {
+                    "model_size": self._model_size_in_billions,
+                    "model_format": self._model_format,
+                },
+                self._model_id,
+                revision=self._model_revision,
+                **use_symlinks,
+            )
+            if IS_NEW_HUGGINGFACE_HUB:
+                create_symlink(download_dir, cache_dir)
+        elif self._model_format in ["ggufv2"]:
+            file_names, final_file_name, need_merge = (
+                generate_model_file_names_with_quantization_parts(
+                    self._llm_family.model_specs[0], self._multimodal_projector
+                )
+            )
+
+            for file_name in file_names:
+                download_file_path = retry_download(
+                    huggingface_hub.hf_hub_download,
+                    self._model_name,
+                    {
+                        "model_size": self._model_size_in_billions,
+                        "model_format": self._model_format,
+                    },
+                    self._model_id,
+                    revision=self._model_revision,
+                    filename=file_name,
+                    **use_symlinks,
+                )
+                if IS_NEW_HUGGINGFACE_HUB:
+                    symlink_local_file(download_file_path, cache_dir, file_name)
+
+            if need_merge:
+                merge_cached_files(cache_dir, file_names, final_file_name)
+        else:
+            raise ValueError(f"Unsupported model format: {self._model_format}")
+
+        return cache_dir
+
+    def cache_from_modelscope(self) -> str:
+        """
+        Cache model from Modelscope. Return the cache directory.
+        """
+        from modelscope.hub.file_download import model_file_download
+        from modelscope.hub.snapshot_download import snapshot_download
+
+        from ..utils import (
+            create_symlink,
+            generate_model_file_names_with_quantization_parts,
+            merge_cached_files,
+            retry_download,
+            symlink_local_file,
+        )
+
+        cache_dir = self.get_cache_dir()
+        if self.get_cache_status():
+            return cache_dir
+
+        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
+            download_dir = retry_download(
+                snapshot_download,
+                self._model_name,
+                {
+                    "model_size": self._model_size_in_billions,
+                    "model_format": self._model_format,
+                },
+                self._model_id,
+                revision=self._model_revision,
+            )
+            create_symlink(download_dir, cache_dir)
+
+        elif self._model_format in ["ggufv2"]:
+            file_names, final_file_name, need_merge = (
+                generate_model_file_names_with_quantization_parts(
+                    self._llm_family.model_specs[0], self._multimodal_projector
+                )
+            )
+
+            for filename in file_names:
+                download_path = retry_download(
+                    model_file_download,
+                    self._model_name,
+                    {
+                        "model_size": self._model_size_in_billions,
+                        "model_format": self._model_format,
+                    },
+                    self._model_id,
+                    filename,
+                    revision=self._model_revision,
+                )
+                symlink_local_file(download_path, cache_dir, filename)
+
+            if need_merge:
+                merge_cached_files(cache_dir, file_names, final_file_name)
+        else:
+            raise ValueError(f"Unsupported format: {self._model_format}")
+
+        return cache_dir
+
+    def cache_from_openmind_hub(self) -> str:
+        """
+        Cache model from openmind_hub. Return the cache directory.
+        """
+        from openmind_hub import snapshot_download
+
+        from ..utils import create_symlink, retry_download
+
+        cache_dir = self.get_cache_dir()
+        if self.get_cache_status():
+            return cache_dir
+
+        if self._model_format in ["pytorch", "mindspore"]:
+            download_dir = retry_download(
+                snapshot_download,
+                self._model_name,
+                {
+                    "model_size": self._model_size_in_billions,
+                    "model_format": self._model_format,
+                },
+                self._model_id,
+                revision=self._model_revision,
+            )
+            create_symlink(download_dir, cache_dir)
+
+        else:
+            raise ValueError(f"Unsupported format: {self._model_format}")
+        return cache_dir
+
+    def cache_from_csghub(self) -> str:
+        """
+        Cache model from CSGHub. Return the cache directory.
+        """
+        from pycsghub.file_download import file_download
+        from pycsghub.snapshot_download import snapshot_download
+
+        from ...constants import XINFERENCE_CSG_ENDPOINT, XINFERENCE_ENV_CSG_TOKEN
+        from ..utils import (
+            create_symlink,
+            generate_model_file_names_with_quantization_parts,
+            merge_cached_files,
+            retry_download,
+            symlink_local_file,
+        )
+
+        cache_dir = self.get_cache_dir()
+        if self.get_cache_status():
+            return cache_dir
+
+        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
+            download_dir = retry_download(
+                snapshot_download,
+                self._model_name,
+                {
+                    "model_size": self._model_size_in_billions,
+                    "model_format": self._model_format,
+                },
+                self._model_id,
+                endpoint=XINFERENCE_CSG_ENDPOINT,
+                token=os.environ.get(XINFERENCE_ENV_CSG_TOKEN),
+            )
+            create_symlink(download_dir, cache_dir)
+        elif self._model_format in ["ggufv2"]:
+            file_names, final_file_name, need_merge = (
+                generate_model_file_names_with_quantization_parts(
+                    self._llm_family.model_specs[0], self._multimodal_projector
+                )
+            )
+
+            for filename in file_names:
+                download_path = retry_download(
+                    file_download,
+                    self._model_name,
+                    {
+                        "model_size": self._model_size_in_billions,
+                        "model_format": self._model_format,
+                    },
+                    self._model_id,
+                    file_name=filename,
+                    endpoint=XINFERENCE_CSG_ENDPOINT,
+                    token=os.environ.get(XINFERENCE_ENV_CSG_TOKEN),
+                )
+                symlink_local_file(download_path, cache_dir, filename)
+
+            if need_merge:
+                merge_cached_files(cache_dir, file_names, final_file_name)
+        else:
+            raise ValueError(f"Unsupported format: {self._model_format}")
+
+        return cache_dir
+
+    def cache(self) -> str:
+        if self._model_uri is not None:
+            return self.cache_uri()
+        else:
+            if self._model_hub == "huggingface":
+                return self.cache_from_huggingface()
+            elif self._model_hub == "modelscope":
+                return self.cache_from_modelscope()
+            elif self._model_hub == "openmind_hub":
+                return self.cache_from_openmind_hub()
+            elif self._model_hub == "csghub":
+                return self.cache_from_csghub()
+            else:
+                raise ValueError(f"Unknown model hub: {self._model_hub}")
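
The new `LLMCacheManager` consolidates the per-hub download logic that previously lived as free functions in `llm_family.py`. A minimal usage sketch, assuming `family` is an `LLMFamilyV2` already narrowed to a single spec (format, size and quantization chosen, e.g. by `match_llm`):

    # Hypothetical usage; the constructor and cache() signature are taken from the file above.
    from xinference.model.llm.cache_manager import LLMCacheManager

    manager = LLMCacheManager(family)  # pass multimodal_projector=... for ggufv2 vision models
    local_dir = manager.cache()        # dispatches on model_uri, then model_hub, and returns the cache dir
    print("model cached at", local_dir)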