xinference 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (62)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +48 -0
  3. xinference/client/restful/restful_client.py +19 -0
  4. xinference/constants.py +1 -0
  5. xinference/core/chat_interface.py +5 -1
  6. xinference/core/image_interface.py +5 -1
  7. xinference/core/model.py +106 -16
  8. xinference/core/scheduler.py +1 -1
  9. xinference/core/worker.py +3 -1
  10. xinference/deploy/supervisor.py +0 -4
  11. xinference/model/audio/chattts.py +25 -14
  12. xinference/model/audio/core.py +6 -2
  13. xinference/model/audio/model_spec.json +1 -1
  14. xinference/model/audio/model_spec_modelscope.json +1 -1
  15. xinference/model/core.py +3 -1
  16. xinference/model/embedding/core.py +6 -2
  17. xinference/model/embedding/model_spec.json +1 -1
  18. xinference/model/image/core.py +65 -6
  19. xinference/model/image/model_spec.json +24 -3
  20. xinference/model/image/model_spec_modelscope.json +25 -3
  21. xinference/model/image/ocr/__init__.py +13 -0
  22. xinference/model/image/ocr/got_ocr2.py +79 -0
  23. xinference/model/image/scheduler/flux.py +1 -1
  24. xinference/model/image/stable_diffusion/core.py +2 -3
  25. xinference/model/image/stable_diffusion/mlx.py +221 -0
  26. xinference/model/llm/__init__.py +33 -0
  27. xinference/model/llm/core.py +3 -1
  28. xinference/model/llm/llm_family.json +9 -0
  29. xinference/model/llm/llm_family.py +68 -2
  30. xinference/model/llm/llm_family_modelscope.json +11 -0
  31. xinference/model/llm/llm_family_openmind_hub.json +1359 -0
  32. xinference/model/rerank/core.py +9 -1
  33. xinference/model/utils.py +7 -0
  34. xinference/model/video/core.py +6 -2
  35. xinference/thirdparty/mlx/__init__.py +13 -0
  36. xinference/thirdparty/mlx/flux/__init__.py +15 -0
  37. xinference/thirdparty/mlx/flux/autoencoder.py +357 -0
  38. xinference/thirdparty/mlx/flux/clip.py +154 -0
  39. xinference/thirdparty/mlx/flux/datasets.py +75 -0
  40. xinference/thirdparty/mlx/flux/flux.py +247 -0
  41. xinference/thirdparty/mlx/flux/layers.py +302 -0
  42. xinference/thirdparty/mlx/flux/lora.py +76 -0
  43. xinference/thirdparty/mlx/flux/model.py +134 -0
  44. xinference/thirdparty/mlx/flux/sampler.py +56 -0
  45. xinference/thirdparty/mlx/flux/t5.py +244 -0
  46. xinference/thirdparty/mlx/flux/tokenizers.py +185 -0
  47. xinference/thirdparty/mlx/flux/trainer.py +98 -0
  48. xinference/thirdparty/mlx/flux/utils.py +179 -0
  49. xinference/web/ui/build/asset-manifest.json +3 -3
  50. xinference/web/ui/build/index.html +1 -1
  51. xinference/web/ui/build/static/js/{main.f7da0140.js → main.2f269bb3.js} +3 -3
  52. xinference/web/ui/build/static/js/main.2f269bb3.js.map +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +1 -0
  54. {xinference-0.16.0.dist-info → xinference-0.16.2.dist-info}/METADATA +16 -9
  55. {xinference-0.16.0.dist-info → xinference-0.16.2.dist-info}/RECORD +60 -42
  56. xinference/web/ui/build/static/js/main.f7da0140.js.map +0 -1
  57. xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +0 -1
  58. /xinference/web/ui/build/static/js/{main.f7da0140.js.LICENSE.txt → main.2f269bb3.js.LICENSE.txt} +0 -0
  59. {xinference-0.16.0.dist-info → xinference-0.16.2.dist-info}/LICENSE +0 -0
  60. {xinference-0.16.0.dist-info → xinference-0.16.2.dist-info}/WHEEL +0 -0
  61. {xinference-0.16.0.dist-info → xinference-0.16.2.dist-info}/entry_points.txt +0 -0
  62. {xinference-0.16.0.dist-info → xinference-0.16.2.dist-info}/top_level.txt +0 -0
xinference/model/image/model_spec.json

@@ -8,7 +8,11 @@
       "text2image",
       "image2image",
       "inpainting"
-    ]
+    ],
+    "default_model_config": {
+      "quantize": true,
+      "quantize_text_encoder": "text_encoder_2"
+    }
   },
   {
     "model_name": "FLUX.1-dev",
@@ -19,7 +23,11 @@
       "text2image",
       "image2image",
       "inpainting"
-    ]
+    ],
+    "default_model_config": {
+      "quantize": true,
+      "quantize_text_encoder": "text_encoder_2"
+    }
   },
   {
     "model_name": "sd3-medium",
@@ -30,7 +38,11 @@
       "text2image",
       "image2image",
       "inpainting"
-    ]
+    ],
+    "default_model_config": {
+      "quantize": true,
+      "quantize_text_encoder": "text_encoder_3"
+    }
   },
   {
     "model_name": "sd-turbo",
@@ -178,5 +190,14 @@
     "model_ability": [
       "inpainting"
     ]
+  },
+  {
+    "model_name": "GOT-OCR2_0",
+    "model_family": "ocr",
+    "model_id": "stepfun-ai/GOT-OCR2_0",
+    "model_revision": "cf6b7386bc89a54f09785612ba74cb12de6fa17c",
+    "model_ability": [
+      "ocr"
+    ]
   }
 ]
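The new default_model_config blocks give FLUX.1-schnell, FLUX.1-dev and sd3-medium default load-time options (quantizing the transformer and one text encoder). A minimal sketch of overriding those defaults at launch time, assuming launch kwargs are forwarded to the image model as the config keys suggest; the endpoint URL and the ability to pass quantize_text_encoder as a launch kwarg are assumptions, not shown in this diff:

from xinference.client import Client

client = Client("http://localhost:9997")  # assumed local endpoint
model_uid = client.launch_model(
    model_name="sd3-medium",
    model_type="image",
    # assumed kwargs, mirroring the default_model_config keys above
    quantize=True,
    quantize_text_encoder="text_encoder_3",
)
model = client.get_model(model_uid)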
xinference/model/image/model_spec_modelscope.json

@@ -9,7 +9,11 @@
       "text2image",
       "image2image",
       "inpainting"
-    ]
+    ],
+    "default_model_config": {
+      "quantize": true,
+      "quantize_text_encoder": "text_encoder_2"
+    }
   },
   {
     "model_name": "FLUX.1-dev",
@@ -21,7 +25,11 @@
       "text2image",
       "image2image",
       "inpainting"
-    ]
+    ],
+    "default_model_config": {
+      "quantize": true,
+      "quantize_text_encoder": "text_encoder_2"
+    }
   },
   {
     "model_name": "sd3-medium",
@@ -33,7 +41,11 @@
       "text2image",
       "image2image",
       "inpainting"
-    ]
+    ],
+    "default_model_config": {
+      "quantize": true,
+      "quantize_text_encoder": "text_encoder_3"
+    }
   },
   {
     "model_name": "sd-turbo",
@@ -148,5 +160,15 @@
         "model_revision": "62134b9d8e703b5d6f74f1534457287a8bba77ef"
       }
     ]
+  },
+  {
+    "model_name": "GOT-OCR2_0",
+    "model_family": "ocr",
+    "model_id": "stepfun-ai/GOT-OCR2_0",
+    "model_revision": "master",
+    "model_hub": "modelscope",
+    "model_ability": [
+      "ocr"
+    ]
   }
 ]
xinference/model/image/ocr/__init__.py (new file)

@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
xinference/model/image/ocr/got_ocr2.py (new file)

@@ -0,0 +1,79 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import TYPE_CHECKING, Optional
+
+import PIL.Image
+
+if TYPE_CHECKING:
+    from ..core import ImageModelFamilyV1
+
+logger = logging.getLogger(__name__)
+
+
+class GotOCR2Model:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: Optional[str] = None,
+        device: Optional[str] = None,
+        model_spec: Optional["ImageModelFamilyV1"] = None,
+        **kwargs,
+    ):
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._device = device
+        # model info when loading
+        self._model = None
+        self._tokenizer = None
+        # info
+        self._model_spec = model_spec
+        self._abilities = model_spec.model_ability or []  # type: ignore
+        self._kwargs = kwargs
+
+    @property
+    def model_ability(self):
+        return self._abilities
+
+    def load(self):
+        from transformers import AutoModel, AutoTokenizer
+
+        self._tokenizer = AutoTokenizer.from_pretrained(
+            self._model_path, trust_remote_code=True
+        )
+        model = AutoModel.from_pretrained(
+            self._model_path,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            device_map="cuda",
+            use_safetensors=True,
+            pad_token_id=self._tokenizer.eos_token_id,
+        )
+        self._model = model.eval().cuda()
+
+    def ocr(
+        self,
+        image: PIL.Image,
+        **kwargs,
+    ):
+        logger.info("Got OCR 2.0 kwargs: %s", kwargs)
+        if "ocr_type" not in kwargs:
+            kwargs["ocr_type"] = "ocr"
+        if image.mode == "RGBA" or image.mode == "CMYK":
+            # convert to RGB
+            image = image.convert("RGB")
+        assert self._model is not None
+        # This chat API limits the max new tokens inside.
+        return self._model.chat(self._tokenizer, image, gradio_input=True, **kwargs)
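Outside of Xinference's actor machinery, the class above can be exercised directly. A minimal sketch, assuming a CUDA device, a locally downloaded copy of stepfun-ai/GOT-OCR2_0 at a placeholder path, a placeholder input image, and a SimpleNamespace stand-in for the ImageModelFamilyV1 spec (only model_ability is read in __init__):

from types import SimpleNamespace

from PIL import Image

from xinference.model.image.ocr.got_ocr2 import GotOCR2Model

# Stand-in for the registry-provided ImageModelFamilyV1 spec.
spec = SimpleNamespace(model_ability=["ocr"])

model = GotOCR2Model(
    model_uid="my-got-ocr2",
    model_path="/path/to/GOT-OCR2_0",  # placeholder: local model snapshot
    model_spec=spec,
)
model.load()                                 # loads tokenizer + model onto CUDA
text = model.ocr(Image.open("receipt.png"))  # placeholder image; ocr_type defaults to "ocr"
print(text)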
xinference/model/image/scheduler/flux.py

@@ -124,7 +124,7 @@ class FluxBatchSchedulerActor(xo.StatelessActor):
         self._running_queue: deque[Text2ImageRequest] = deque()  # type: ignore
         self._model = None
         self._available_device = get_available_device()
-        self._id_to_req: Dict[str, Text2ImageRequest] = {}
+        self._id_to_req: Dict[str, Text2ImageRequest] = {}  # type: ignore
 
     def set_model(self, model):
         """
xinference/model/image/stable_diffusion/core.py

@@ -283,9 +283,8 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             model.enable_sequential_cpu_offload()
         elif not self._kwargs.get("device_map"):
             logger.debug("Loading model to available device")
-            model = move_model_to_available_device(self._model)
-        # Recommended if your computer has < 64 GB of RAM
-        if self._kwargs.get("attention_slicing", True):
+            model = move_model_to_available_device(model)
+        if self._kwargs.get("attention_slicing", False):
             model.enable_attention_slicing()
         if self._kwargs.get("vae_tiling", False):
             model.enable_vae_tiling()
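Two behavioural notes on this hunk: the model moved to the available device is now the freshly created model rather than the stale self._model, and attention slicing is no longer enabled by default. A minimal sketch of opting back in on low-memory machines, assuming launch kwargs are forwarded into self._kwargs as the hunk reads them; the endpoint URL is an assumption:

from xinference.client import Client

client = Client("http://localhost:9997")  # assumed local endpoint
model_uid = client.launch_model(
    model_name="sd3-medium",
    model_type="image",
    attention_slicing=True,  # restores the pre-0.16.2 default
)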
xinference/model/image/stable_diffusion/mlx.py (new file)

@@ -0,0 +1,221 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import gc
+import logging
+import re
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+
+import numpy as np
+from PIL import Image
+from xoscar.utils import classproperty
+
+from ....types import LoRA
+from ..sdapi import SDAPIDiffusionModelMixin
+from ..utils import handle_image_result
+
+if TYPE_CHECKING:
+    from ....core.progress_tracker import Progressor
+    from ..core import ImageModelFamilyV1
+
+
+logger = logging.getLogger(__name__)
+
+
+def quantization_predicate(name: str, m) -> bool:
+    return hasattr(m, "to_quantized") and m.weight.shape[1] % 512 == 0
+
+
+def to_latent_size(image_size: Tuple[int, int]):
+    h, w = image_size
+    h = ((h + 15) // 16) * 16
+    w = ((w + 15) // 16) * 16
+
+    if (h, w) != image_size:
+        print(
+            "Warning: The image dimensions need to be divisible by 16px. "
+            f"Changing size to {h}x{w}."
+        )
+
+    return (h // 8, w // 8)
+
+
+class MLXDiffusionModel(SDAPIDiffusionModelMixin):
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: Optional[str] = None,
+        device: Optional[str] = None,
+        lora_model: Optional[List[LoRA]] = None,
+        lora_load_kwargs: Optional[Dict] = None,
+        lora_fuse_kwargs: Optional[Dict] = None,
+        model_spec: Optional["ImageModelFamilyV1"] = None,
+        **kwargs,
+    ):
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._device = device
+        # model info when loading
+        self._model = None
+        self._lora_model = lora_model
+        self._lora_load_kwargs = lora_load_kwargs or {}
+        self._lora_fuse_kwargs = lora_fuse_kwargs or {}
+        # info
+        self._model_spec = model_spec
+        self._abilities = model_spec.model_ability or []  # type: ignore
+        self._kwargs = kwargs
+
+    @property
+    def model_ability(self):
+        return self._abilities
+
+    @classproperty
+    def supported_models(self):
+        return ["FLUX.1-schnell", "FLUX.1-dev"]
+
+    def load(self):
+        try:
+            import mlx.nn as nn
+        except ImportError:
+            error_message = "Failed to import module 'mlx'"
+            installation_guide = [
+                "Please make sure 'mlx' is installed. ",
+                "You can install it by `pip install mlx`\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        from ....thirdparty.mlx.flux import FluxPipeline
+
+        logger.debug(
+            "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
+        )
+        flux = self._model = FluxPipeline(
+            "flux-" + self._model_spec.model_name.split("-")[1],
+            model_path=self._model_path,
+            t5_padding=self._kwargs.get("t5_padding", True),
+        )
+        self._apply_lora()
+
+        quantize = self._kwargs.get("quantize", True)
+        if quantize:
+            nn.quantize(flux.flow, class_predicate=quantization_predicate)
+            nn.quantize(flux.t5, class_predicate=quantization_predicate)
+            nn.quantize(flux.clip, class_predicate=quantization_predicate)
+
+    def _apply_lora(self):
+        if self._lora_model is not None:
+            import mlx.core as mx
+
+            for lora_model in self._lora_model:
+                weights, lora_config = mx.load(
+                    lora_model.local_path, return_metadata=True
+                )
+                rank = int(lora_config.get("lora_rank", 8))
+                num_blocks = int(lora_config.get("lora_blocks", -1))
+                flux = self._model
+                flux.linear_to_lora_layers(rank, num_blocks)
+                flux.flow.load_weights(list(weights.items()), strict=False)
+                flux.fuse_lora_layers()
+            logger.info(f"Successfully loaded the LoRA for model {self._model_uid}.")
+
+    @staticmethod
+    @contextlib.contextmanager
+    def _release_after():
+        import mlx.core as mx
+
+        try:
+            yield
+        finally:
+            gc.collect()
+            mx.metal.clear_cache()
+
+    def text_to_image(
+        self,
+        prompt: str,
+        n: int = 1,
+        size: str = "1024*1024",
+        response_format: str = "url",
+        **kwargs,
+    ):
+        import mlx.core as mx
+
+        flux = self._model
+        width, height = map(int, re.split(r"[^\d]+", size))
+
+        # Make the generator
+        latent_size = to_latent_size((height, width))
+        gen_latent_kwargs = {}
+        if (num_steps := kwargs.get("num_inference_steps")) is None:
+            num_steps = 50 if "dev" in self._model_spec.model_name else 2  # type: ignore
+        gen_latent_kwargs["num_steps"] = num_steps
+        if guidance := kwargs.get("guidance_scale"):
+            gen_latent_kwargs["guidance"] = guidance
+        if seed := kwargs.get("seed"):
+            gen_latent_kwargs["seed"] = seed
+
+        with self._release_after():
+            latents = flux.generate_latents(  # type: ignore
+                prompt, n_images=n, latent_size=latent_size, **gen_latent_kwargs
+            )
+
+            # First we get and eval the conditioning
+            conditioning = next(latents)
+            mx.eval(conditioning)
+            peak_mem_conditioning = mx.metal.get_peak_memory() / 1024**3
+            mx.metal.reset_peak_memory()
+
+            progressor: Progressor = kwargs.pop("progressor", None)
+            # Actual denoising loop
+            for i, x_t in enumerate(latents):
+                mx.eval(x_t)
+                progressor.set_progress((i + 1) / num_steps)
+
+            peak_mem_generation = mx.metal.get_peak_memory() / 1024**3
+            mx.metal.reset_peak_memory()
+
+            # Decode them into images
+            decoded = []
+            for i in range(n):
+                decoded.append(flux.decode(x_t[i : i + 1], latent_size))  # type: ignore
+                mx.eval(decoded[-1])
+            peak_mem_decoding = mx.metal.get_peak_memory() / 1024**3
+            peak_mem_overall = max(
+                peak_mem_conditioning, peak_mem_generation, peak_mem_decoding
+            )
+
+            images = []
+            x = mx.concatenate(decoded, axis=0)
+            x = (x * 255).astype(mx.uint8)
+            for i in range(len(x)):
+                im = Image.fromarray(np.array(x[i]))
+                images.append(im)
+
+        logger.debug(
+            f"Peak memory used for the text: {peak_mem_conditioning:.3f}GB"
+        )
+        logger.debug(
+            f"Peak memory used for the generation: {peak_mem_generation:.3f}GB"
+        )
+        logger.debug(f"Peak memory used for the decoding: {peak_mem_decoding:.3f}GB")
+        logger.debug(f"Peak memory used overall: {peak_mem_overall:.3f}GB")
+
+        return handle_image_result(response_format, images)
+
+    def image_to_image(self, **kwargs):
+        raise NotImplementedError
+
+    def inpainting(self, **kwargs):
+        raise NotImplementedError
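The MLX backend above runs FLUX on Apple silicon and quantizes the flow, T5 and CLIP weights by default, matching the new default_model_config entries. A minimal direct-use sketch, assuming an Apple-silicon machine with mlx installed, a placeholder local FLUX.1-schnell snapshot, a SimpleNamespace stand-in for the spec (only model_name and model_ability are read), and a no-op stand-in for the Progressor that the model actor would normally inject:

from types import SimpleNamespace

from xinference.model.image.stable_diffusion.mlx import MLXDiffusionModel

spec = SimpleNamespace(model_name="FLUX.1-schnell", model_ability=["text2image"])

model = MLXDiffusionModel(
    model_uid="my-flux",
    model_path="/path/to/FLUX.1-schnell",  # placeholder: local model snapshot
    model_spec=spec,
    quantize=True,                         # same default as default_model_config
)
model.load()
result = model.text_to_image(
    "a watercolor fox, minimal background",
    n=1,
    size="512*512",
    response_format="b64_json",
    num_inference_steps=2,  # schnell default per the code above
    progressor=SimpleNamespace(set_progress=lambda p: None),  # no-op stand-in
)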
xinference/model/llm/__init__.py

@@ -32,6 +32,7 @@ from .llm_family import (
     BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES,
     BUILTIN_LLM_PROMPT_STYLE,
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
+    BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
     LLAMA_CLASSES,
     LLM_ENGINES,
     LMDEPLOY_CLASSES,
@@ -258,6 +259,36 @@ def _install():
         if "tools" in model_spec.model_ability:
             BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)
 
+    openmind_hub_json_path = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "llm_family_openmind_hub.json"
+    )
+    for json_obj in json.load(
+        codecs.open(openmind_hub_json_path, "r", encoding="utf-8")
+    ):
+        model_spec = LLMFamilyV1.parse_obj(json_obj)
+        BUILTIN_OPENMIND_HUB_LLM_FAMILIES.append(model_spec)
+
+        # register prompt style, in case that we have something missed
+        # if duplicated with huggingface json, keep it as the huggingface style
+
+        if (
+            "chat" in model_spec.model_ability
+            and isinstance(model_spec.chat_template, str)
+            and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE
+        ):
+            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = {
+                "chat_template": model_spec.chat_template,
+                "stop_token_ids": model_spec.stop_token_ids,
+                "stop": model_spec.stop,
+            }
+        # register model family
+        if "chat" in model_spec.model_ability:
+            BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
+        else:
+            BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(model_spec.model_name)
+        if "tools" in model_spec.model_ability:
+            BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)
+
     csghub_json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family_csghub.json"
     )
@@ -288,6 +319,7 @@ def _install():
     for llm_specs in [
         BUILTIN_LLM_FAMILIES,
         BUILTIN_MODELSCOPE_LLM_FAMILIES,
+        BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
         BUILTIN_CSGHUB_LLM_FAMILIES,
     ]:
         for llm_spec in llm_specs:
@@ -298,6 +330,7 @@ def _install():
     for families in [
         BUILTIN_LLM_FAMILIES,
         BUILTIN_MODELSCOPE_LLM_FAMILIES,
+        BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
         BUILTIN_CSGHUB_LLM_FAMILIES,
     ]:
         for family in families:
xinference/model/llm/core.py

@@ -193,7 +193,9 @@ def create_llm_model_instance(
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
     peft_model_config: Optional[PeftModelConfig] = None,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
     model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[LLM, LLMDescription]:
xinference/model/llm/llm_family.json

@@ -8176,6 +8176,15 @@
       ],
       "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct"
     },
+    {
+      "model_format": "gptq",
+      "model_size_in_billions": "7",
+      "quantizations": [
+        "Int4",
+        "Int8"
+      ],
+      "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-{quantization}"
+    },
     {
       "model_format": "ggufv2",
       "model_size_in_billions": "1_5",
xinference/model/llm/llm_family.py

@@ -41,6 +41,7 @@ from ..utils import (
     create_symlink,
     download_from_csghub,
     download_from_modelscope,
+    download_from_openmind_hub,
     is_valid_model_uri,
     parse_uri,
     retry_download,
@@ -239,6 +240,7 @@ LLAMA_CLASSES: List[Type[LLM]] = []
 
 BUILTIN_LLM_FAMILIES: List["LLMFamilyV1"] = []
 BUILTIN_MODELSCOPE_LLM_FAMILIES: List["LLMFamilyV1"] = []
+BUILTIN_OPENMIND_HUB_LLM_FAMILIES: List["LLMFamilyV1"] = []
 BUILTIN_CSGHUB_LLM_FAMILIES: List["LLMFamilyV1"] = []
 
 SGLANG_CLASSES: List[Type[LLM]] = []
@@ -301,6 +303,9 @@ def cache(
     elif llm_spec.model_hub == "modelscope":
         logger.info(f"Caching from Modelscope: {llm_spec.model_id}")
         return cache_from_modelscope(llm_family, llm_spec, quantization)
+    elif llm_spec.model_hub == "openmind_hub":
+        logger.info(f"Caching from openmind_hub: {llm_spec.model_id}")
+        return cache_from_openmind_hub(llm_family, llm_spec, quantization)
     elif llm_spec.model_hub == "csghub":
         logger.info(f"Caching from CSGHub: {llm_spec.model_id}")
         return cache_from_csghub(llm_family, llm_spec, quantization)
@@ -474,7 +479,7 @@ def _skip_download(
     model_revision: Optional[str],
     quantization: Optional[str] = None,
 ) -> bool:
-    if model_format == "pytorch":
+    if model_format in ["pytorch", "mindspore"]:
         model_hub_to_meta_path = {
             "huggingface": _get_meta_path(
                 cache_dir, model_format, "huggingface", quantization
@@ -482,6 +487,9 @@ def _skip_download(
             "modelscope": _get_meta_path(
                 cache_dir, model_format, "modelscope", quantization
             ),
+            "openmind_hub": _get_meta_path(
+                cache_dir, model_format, "openmind_hub", quantization
+            ),
             "csghub": _get_meta_path(cache_dir, model_format, "csghub", quantization),
         }
         if valid_model_revision(model_hub_to_meta_path[model_hub], model_revision):
@@ -702,6 +710,50 @@ def cache_from_modelscope(
     return cache_dir
 
 
+def cache_from_openmind_hub(
+    llm_family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+    quantization: Optional[str] = None,
+) -> str:
+    """
+    Cache model from openmind_hub. Return the cache directory.
+    """
+    from openmind_hub import snapshot_download
+
+    cache_dir = _get_cache_dir(llm_family, llm_spec)
+    if _skip_download(
+        cache_dir,
+        llm_spec.model_format,
+        llm_spec.model_hub,
+        llm_spec.model_revision,
+        quantization,
+    ):
+        return cache_dir
+
+    if llm_spec.model_format in ["pytorch", "mindspore"]:
+        download_dir = retry_download(
+            snapshot_download,
+            llm_family.model_name,
+            {
+                "model_size": llm_spec.model_size_in_billions,
+                "model_format": llm_spec.model_format,
+            },
+            llm_spec.model_id,
+            revision=llm_spec.model_revision,
+        )
+        create_symlink(download_dir, cache_dir)
+
+    else:
+        raise ValueError(f"Unsupported format: {llm_spec.model_format}")
+
+    meta_path = _get_meta_path(
+        cache_dir, llm_spec.model_format, llm_spec.model_hub, quantization
+    )
+    _generate_meta_file(meta_path, llm_family, llm_spec, quantization)
+
+    return cache_dir
+
+
 def cache_from_huggingface(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
@@ -893,7 +945,9 @@ def match_llm(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
     """
     Find an LLM family, spec, and quantization that satisfy given criteria.
@@ -924,6 +978,12 @@ def match_llm(
             + BUILTIN_LLM_FAMILIES
             + user_defined_llm_families
         )
+    elif download_hub == "openmind_hub":
+        all_families = (
+            BUILTIN_OPENMIND_HUB_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
     elif download_hub == "csghub":
         all_families = (
             BUILTIN_CSGHUB_LLM_FAMILIES
@@ -938,6 +998,12 @@ def match_llm(
             + BUILTIN_LLM_FAMILIES
             + user_defined_llm_families
         )
+    elif download_from_openmind_hub():
+        all_families = (
+            BUILTIN_OPENMIND_HUB_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
     elif download_from_csghub():
         all_families = (
             BUILTIN_CSGHUB_LLM_FAMILIES
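With the pieces above in place, a built-in model can be resolved and cached from openmind_hub either implicitly (when download_from_openmind_hub() is true) or by passing download_hub explicitly. A minimal sketch, assuming the openmind_hub package is installed and that the model name used here actually appears in the new llm_family_openmind_hub.json (its entries are not shown in this diff):

from xinference.model.llm.llm_family import cache, match_llm

matched = match_llm(
    model_name="qwen2.5-instruct",  # assumed entry in llm_family_openmind_hub.json
    model_format="pytorch",
    download_hub="openmind_hub",
)
if matched is not None:
    llm_family, llm_spec, quantization = matched
    # dispatches to cache_from_openmind_hub when the matched spec's model_hub is openmind_hub
    cache_dir = cache(llm_family, llm_spec, quantization)
    print("cached at", cache_dir)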
xinference/model/llm/llm_family_modelscope.json

@@ -5880,6 +5880,17 @@
       "model_revision": "master",
       "model_hub": "modelscope"
     },
+    {
+      "model_format": "gptq",
+      "model_size_in_billions": 7,
+      "quantizations": [
+        "Int4",
+        "Int8"
+      ],
+      "model_id": "qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-{quantization}",
+      "model_revision": "master",
+      "model_hub": "modelscope"
+    },
     {
       "model_format": "ggufv2",
       "model_size_in_billions": "1_5",