xinference 1.7.1.post1__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/client/restful/async_restful_client.py +8 -13
- xinference/client/restful/restful_client.py +6 -2
- xinference/core/chat_interface.py +6 -4
- xinference/core/media_interface.py +5 -0
- xinference/core/model.py +1 -5
- xinference/core/supervisor.py +117 -68
- xinference/core/worker.py +49 -37
- xinference/deploy/test/test_cmdline.py +2 -6
- xinference/model/audio/__init__.py +26 -23
- xinference/model/audio/chattts.py +3 -2
- xinference/model/audio/core.py +49 -98
- xinference/model/audio/cosyvoice.py +3 -2
- xinference/model/audio/custom.py +28 -73
- xinference/model/audio/f5tts.py +3 -2
- xinference/model/audio/f5tts_mlx.py +3 -2
- xinference/model/audio/fish_speech.py +3 -2
- xinference/model/audio/funasr.py +17 -4
- xinference/model/audio/kokoro.py +3 -2
- xinference/model/audio/megatts.py +3 -2
- xinference/model/audio/melotts.py +3 -2
- xinference/model/audio/model_spec.json +572 -171
- xinference/model/audio/utils.py +0 -6
- xinference/model/audio/whisper.py +3 -2
- xinference/model/audio/whisper_mlx.py +3 -2
- xinference/model/cache_manager.py +141 -0
- xinference/model/core.py +6 -49
- xinference/model/custom.py +174 -0
- xinference/model/embedding/__init__.py +67 -56
- xinference/model/embedding/cache_manager.py +35 -0
- xinference/model/embedding/core.py +104 -84
- xinference/model/embedding/custom.py +55 -78
- xinference/model/embedding/embed_family.py +80 -31
- xinference/model/embedding/flag/core.py +21 -5
- xinference/model/embedding/llama_cpp/__init__.py +0 -0
- xinference/model/embedding/llama_cpp/core.py +234 -0
- xinference/model/embedding/model_spec.json +968 -103
- xinference/model/embedding/sentence_transformers/core.py +30 -20
- xinference/model/embedding/vllm/core.py +11 -5
- xinference/model/flexible/__init__.py +8 -2
- xinference/model/flexible/core.py +26 -119
- xinference/model/flexible/custom.py +69 -0
- xinference/model/flexible/launchers/image_process_launcher.py +1 -0
- xinference/model/flexible/launchers/modelscope_launcher.py +5 -1
- xinference/model/flexible/launchers/transformers_launcher.py +15 -3
- xinference/model/flexible/launchers/yolo_launcher.py +5 -1
- xinference/model/image/__init__.py +20 -20
- xinference/model/image/cache_manager.py +62 -0
- xinference/model/image/core.py +70 -182
- xinference/model/image/custom.py +28 -72
- xinference/model/image/model_spec.json +402 -119
- xinference/model/image/ocr/got_ocr2.py +3 -2
- xinference/model/image/stable_diffusion/core.py +22 -7
- xinference/model/image/stable_diffusion/mlx.py +6 -6
- xinference/model/image/utils.py +2 -2
- xinference/model/llm/__init__.py +71 -94
- xinference/model/llm/cache_manager.py +292 -0
- xinference/model/llm/core.py +37 -111
- xinference/model/llm/custom.py +88 -0
- xinference/model/llm/llama_cpp/core.py +5 -7
- xinference/model/llm/llm_family.json +16260 -8151
- xinference/model/llm/llm_family.py +138 -839
- xinference/model/llm/lmdeploy/core.py +5 -7
- xinference/model/llm/memory.py +3 -4
- xinference/model/llm/mlx/core.py +6 -8
- xinference/model/llm/reasoning_parser.py +3 -1
- xinference/model/llm/sglang/core.py +32 -14
- xinference/model/llm/transformers/chatglm.py +3 -7
- xinference/model/llm/transformers/core.py +49 -27
- xinference/model/llm/transformers/deepseek_v2.py +2 -2
- xinference/model/llm/transformers/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/cogagent.py +2 -2
- xinference/model/llm/transformers/multimodal/deepseek_vl2.py +2 -2
- xinference/model/llm/transformers/multimodal/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/glm4_1v.py +167 -0
- xinference/model/llm/transformers/multimodal/glm4v.py +2 -2
- xinference/model/llm/transformers/multimodal/intern_vl.py +2 -2
- xinference/model/llm/transformers/multimodal/minicpmv26.py +3 -3
- xinference/model/llm/transformers/multimodal/ovis2.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen-omni.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
- xinference/model/llm/transformers/opt.py +3 -7
- xinference/model/llm/utils.py +34 -49
- xinference/model/llm/vllm/core.py +77 -27
- xinference/model/llm/vllm/xavier/engine.py +5 -3
- xinference/model/llm/vllm/xavier/scheduler.py +10 -6
- xinference/model/llm/vllm/xavier/transfer.py +1 -1
- xinference/model/rerank/__init__.py +26 -25
- xinference/model/rerank/core.py +47 -87
- xinference/model/rerank/custom.py +25 -71
- xinference/model/rerank/model_spec.json +158 -33
- xinference/model/rerank/utils.py +2 -2
- xinference/model/utils.py +115 -54
- xinference/model/video/__init__.py +13 -17
- xinference/model/video/core.py +44 -102
- xinference/model/video/diffusers.py +4 -3
- xinference/model/video/model_spec.json +90 -21
- xinference/types.py +5 -3
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.7d24df53.js +3 -0
- xinference/web/ui/build/static/js/main.7d24df53.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2704ff66a5f73ca78b341eb3edec60154369df9d87fbc8c6dd60121abc5e1b0a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/607dfef23d33e6b594518c0c6434567639f24f356b877c80c60575184ec50ed0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9be3d56173aacc3efd0b497bcb13c4f6365de30069176ee9403b40e717542326.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9f9dd6c32c78a222d07da5987ae902effe16bcf20aac00774acdccc4de3c9ff2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b2ab5ee972c60d15eb9abf5845705f8ab7e1d125d324d9a9b1bcae5d6fd7ffb2.json +1 -0
- xinference/web/ui/src/locales/en.json +0 -1
- xinference/web/ui/src/locales/ja.json +0 -1
- xinference/web/ui/src/locales/ko.json +0 -1
- xinference/web/ui/src/locales/zh.json +0 -1
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/METADATA +9 -11
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/RECORD +119 -119
- xinference/model/audio/model_spec_modelscope.json +0 -231
- xinference/model/embedding/model_spec_modelscope.json +0 -293
- xinference/model/embedding/utils.py +0 -18
- xinference/model/image/model_spec_modelscope.json +0 -375
- xinference/model/llm/llama_cpp/memory.py +0 -457
- xinference/model/llm/llm_family_csghub.json +0 -56
- xinference/model/llm/llm_family_modelscope.json +0 -8700
- xinference/model/llm/llm_family_openmind_hub.json +0 -1019
- xinference/model/rerank/model_spec_modelscope.json +0 -85
- xinference/model/video/model_spec_modelscope.json +0 -184
- xinference/web/ui/build/static/js/main.9b12b7f9.js +0 -3
- xinference/web/ui/build/static/js/main.9b12b7f9.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +0 -1
- /xinference/web/ui/build/static/js/{main.9b12b7f9.js.LICENSE.txt → main.7d24df53.js.LICENSE.txt} +0 -0
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/WHEEL +0 -0
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/top_level.txt +0 -0
xinference/model/image/ocr/got_ocr2.py
CHANGED
@@ -18,7 +18,7 @@ from typing import TYPE_CHECKING, Optional
 import PIL.Image
 
 if TYPE_CHECKING:
-    from ..core import
+    from ..core import ImageModelFamilyV2
 
 logger = logging.getLogger(__name__)
 
@@ -29,9 +29,10 @@ class GotOCR2Model:
         model_uid: str,
         model_path: Optional[str] = None,
         device: Optional[str] = None,
-        model_spec: Optional["
+        model_spec: Optional["ImageModelFamilyV2"] = None,
         **kwargs,
     ):
+        self.model_family = model_spec
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
xinference/model/image/stable_diffusion/core.py
CHANGED
@@ -37,7 +37,7 @@ from ..utils import handle_image_result
 
 if TYPE_CHECKING:
     from ....core.progress_tracker import Progressor
-    from ..core import
+    from ..core import ImageModelFamilyV2
 
 logger = logging.getLogger(__name__)
 
@@ -87,10 +87,11 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         lora_model: Optional[List[LoRA]] = None,
         lora_load_kwargs: Optional[Dict] = None,
         lora_fuse_kwargs: Optional[Dict] = None,
-        model_spec: Optional["
+        model_spec: Optional["ImageModelFamilyV2"] = None,
         gguf_model_path: Optional[str] = None,
         **kwargs,
     ):
+        self.model_family = model_spec
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
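Worth noting across these hunks: GotOCR2Model and DiffusionModel (and MLXDiffusionModel further down) now all stash the incoming model_spec on self.model_family. A minimal sketch of what that gives callers, assuming any image model constructed with model_spec=... (the helper below is illustrative, not part of the diff):

def describe(model) -> str:
    # model.model_family is the ImageModelFamilyV2 passed at construction,
    # or None when the model was built without a spec.
    family = model.model_family
    return family.model_name if family is not None else "unknown"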
@@ -239,10 +240,22 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         logger.debug(
             "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
         )
-
-        self.
-
-
+        try:
+            self._model = AutoPipelineModel.from_pretrained(
+                self._model_path,
+                **self._kwargs,
+            )
+        except ValueError:
+            if "kontext" in self._model_spec.model_name.lower():
+                # TODO: remove this branch when auto pipeline supports
+                # flux.1-kontext-dev
+                from diffusers import FluxKontextPipeline
+
+                self._model = FluxKontextPipeline.from_pretrained(
+                    self._model_path, **self._kwargs
+                )
+            else:
+                raise
         self._load_to_device(self._model)
         self._apply_lora()
 
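The load path above tries the generic auto pipeline first and only falls back to the architecture-specific class when the resolver rejects the checkpoint. A standalone sketch of that pattern against diffusers directly (local path and dtype are illustrative; AutoPipelineModel in the hunk is xinference's own alias):

import torch
from diffusers import AutoPipelineForText2Image, FluxKontextPipeline

model_path = "/models/flux.1-kontext-dev"  # illustrative local checkpoint
try:
    # Generic resolver; covers most diffusion checkpoints.
    pipe = AutoPipelineForText2Image.from_pretrained(
        model_path, torch_dtype=torch.bfloat16
    )
except ValueError:
    # Auto pipeline does not know this architecture yet; fall back.
    pipe = FluxKontextPipeline.from_pretrained(
        model_path, torch_dtype=torch.bfloat16
    )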
@@ -657,7 +670,9 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         response_format: str = "url",
         **kwargs,
     ):
-        if self._kwargs.get("controlnet"):
+        if self._kwargs.get("controlnet") or self._model_spec.model_ability == [  # type: ignore
+            "image2image"
+        ]:
             model = self._model
         else:
             ability = "image2image"
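The widened guard means a family whose ability list is exactly ["image2image"] behaves like a controlnet setup: the already-loaded pipeline is used as-is rather than resolved by ability. A hedged sketch of the dispatch (names mirror the hunk; the resolver dict is illustrative):

def pick_pipeline(kwargs, model_spec, model, models_by_ability):
    # Single-purpose image2image pipelines have no text2image entry point,
    # so they must be used directly, exactly like controlnet pipelines.
    if kwargs.get("controlnet") or model_spec.model_ability == ["image2image"]:
        return model
    return models_by_ability["image2image"]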
xinference/model/image/stable_diffusion/mlx.py
CHANGED
@@ -20,7 +20,6 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
 
 import numpy as np
 from PIL import Image
-from xoscar.utils import classproperty
 
 from ....types import LoRA
 from ..sdapi import SDAPIDiffusionModelMixin
@@ -28,7 +27,7 @@ from ..utils import handle_image_result
 
 if TYPE_CHECKING:
     from ....core.progress_tracker import Progressor
-    from ..core import
+    from ..core import ImageModelFamilyV2
 
 
 logger = logging.getLogger(__name__)
@@ -61,9 +60,10 @@ class MLXDiffusionModel(SDAPIDiffusionModelMixin):
         lora_model: Optional[List[LoRA]] = None,
         lora_load_kwargs: Optional[Dict] = None,
         lora_fuse_kwargs: Optional[Dict] = None,
-        model_spec: Optional["
+        model_spec: Optional["ImageModelFamilyV2"] = None,
         **kwargs,
     ):
+        self.model_family = model_spec
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
@@ -81,9 +81,9 @@ class MLXDiffusionModel(SDAPIDiffusionModelMixin):
     def model_ability(self):
         return self._abilities
 
-    @
-    def
-    return
+    @staticmethod
+    def support_model(model_name: str) -> bool:
+        return "flux" in model_name.lower()
 
     def load(self):
         try:
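With xoscar's classproperty gone, support_model is a plain @staticmethod and can be probed without instantiating the model. For example (model names illustrative):

from xinference.model.image.stable_diffusion.mlx import MLXDiffusionModel

# Case-insensitive substring match on the model name, per the hunk above.
assert MLXDiffusionModel.support_model("FLUX.1-dev")
assert not MLXDiffusionModel.support_model("stable-diffusion-3.5-medium")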
xinference/model/image/utils.py
CHANGED
@@ -24,11 +24,11 @@ from ...constants import XINFERENCE_IMAGE_DIR
 from ...types import Image, ImageList
 
 if TYPE_CHECKING:
-    from .core import
+    from .core import ImageModelFamilyV2
 
 
 def get_model_version(
-    image_model: "
+    image_model: "ImageModelFamilyV2", controlnet: Optional["ImageModelFamilyV2"]
 ) -> str:
     return (
         image_model.model_name
xinference/model/llm/__init__.py
CHANGED
@@ -11,28 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import codecs
 import json
 import os
 import warnings
 
+from ..utils import flatten_quantizations
 from .core import (
     LLM,
-
-
-
-    get_llm_model_descriptions,
+    LLM_VERSION_INFOS,
+    generate_llm_version_info,
+    get_llm_version_infos,
 )
+from .custom import get_user_defined_llm_families, register_llm, unregister_llm
 from .llm_family import (
-    BUILTIN_CSGHUB_LLM_FAMILIES,
     BUILTIN_LLM_FAMILIES,
     BUILTIN_LLM_MODEL_CHAT_FAMILIES,
     BUILTIN_LLM_MODEL_GENERATE_FAMILIES,
     BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES,
     BUILTIN_LLM_PROMPT_STYLE,
-    BUILTIN_MODELSCOPE_LLM_FAMILIES,
-    BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
     LLAMA_CLASSES,
     LLM_ENGINES,
     LMDEPLOY_CLASSES,
@@ -41,17 +38,13 @@ from .llm_family import (
     SUPPORTED_ENGINES,
     TRANSFORMERS_CLASSES,
     VLLM_CLASSES,
-
-
-
+    CustomLLMFamilyV2,
+    LlamaCppLLMSpecV2,
+    LLMFamilyV2,
     LLMSpecV1,
-
-
-    get_cache_status,
-    get_user_defined_llm_families,
+    MLXLLMSpecV2,
+    PytorchLLMSpecV2,
     match_llm,
-    register_llm,
-    unregister_llm,
 )
 
 
@@ -64,69 +57,72 @@ def check_format_with_engine(model_format, engine):
     return True
 
 
-def generate_engine_config_by_model_family(model_family):
+def generate_engine_config_by_model_family(model_family: "LLMFamilyV2"):
     model_name = model_family.model_name
     specs = model_family.model_specs
     engines = LLM_ENGINES.get(model_name, {})  # structure for engine query
     for spec in specs:
         model_format = spec.model_format
         model_size_in_billions = spec.model_size_in_billions
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    break
+        quantization = spec.quantization
+        # traverse all supported engines to match the name, format, size in billions and quantization of model
+        for engine in SUPPORTED_ENGINES:
+            if not check_format_with_engine(
+                model_format, engine
+            ):  # match the format of model with engine
+                continue
+            CLASSES = SUPPORTED_ENGINES[engine]
+            for cls in CLASSES:
+                if cls.match(model_family, spec, quantization):
+                    engine_params = engines.get(engine, [])
+                    already_exists = False
+                    # if the name, format and size in billions of model already exists in the structure, add the new quantization
+                    for param in engine_params:
+                        if (
+                            model_name == param["model_name"]
+                            and model_format == param["model_format"]
+                            and model_size_in_billions
+                            == param["model_size_in_billions"]
+                        ):
+                            if quantization not in param["quantizations"]:
+                                param["quantizations"].append(quantization)
+                            already_exists = True
+                            break
+                    # successfully match the params for the first time, add to the structure
+                    if not already_exists:
+                        engine_params.append(
+                            {
+                                "model_name": model_name,
+                                "model_format": model_format,
+                                "model_size_in_billions": model_size_in_billions,
+                                "quantizations": [quantization],
+                                "llm_class": cls,
+                            }
+                        )
+                        if hasattr(spec, "multimodal_projectors"):
+                            engine_params[-1][
+                                "multimodal_projectors"
+                            ] = spec.multimodal_projectors
+                    engines[engine] = engine_params
+                    break
     LLM_ENGINES[model_name] = engines
 
 
 def register_custom_model():
     from ...constants import XINFERENCE_MODEL_DIR
+    from ..custom import migrate_from_v1_to_v2
+
+    # migrate from v1 to v2 first
+    migrate_from_v1_to_v2("llm", CustomLLMFamilyV2)
 
-    user_defined_llm_dir = os.path.join(XINFERENCE_MODEL_DIR, "llm")
+    user_defined_llm_dir = os.path.join(XINFERENCE_MODEL_DIR, "v2", "llm")
     if os.path.isdir(user_defined_llm_dir):
         for f in os.listdir(user_defined_llm_dir):
             try:
                 with codecs.open(
                     os.path.join(user_defined_llm_dir, f), encoding="utf-8"
                 ) as fd:
-                    user_defined_llm_family =
+                    user_defined_llm_family = CustomLLMFamilyV2.parse_raw(fd.read())
                 register_llm(user_defined_llm_family, persist=False)
             except Exception as e:
                 warnings.warn(f"{user_defined_llm_dir}/{f} has error, {e}")
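For orientation, the function above builds LLM_ENGINES as a per-model mapping from engine name to a list of parameter entries, one per (format, size) pair, with quantizations accumulated. A hedged sketch of the resulting shape (model name, size, and engine are illustrative):

from xinference.model.llm.vllm.core import VLLMChatModel

LLM_ENGINES_EXAMPLE = {
    "my-llm": {  # hypothetical model name
        "vLLM": [
            {
                "model_name": "my-llm",
                "model_format": "pytorch",
                "model_size_in_billions": 7,
                "quantizations": ["none", "4-bit"],  # extended per matching spec
                "llm_class": VLLMChatModel,
                # "multimodal_projectors": [...]  # only when the spec defines them
            }
        ],
    },
}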
@@ -135,7 +131,11 @@ def register_custom_model():
 def load_model_family_from_json(json_filename, target_families):
     json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), json_filename)
     for json_obj in json.load(codecs.open(json_path, "r", encoding="utf-8")):
-
+        flattened = []
+        for spec in json_obj["model_specs"]:
+            flattened.extend(flatten_quantizations(spec))
+        json_obj["model_specs"] = flattened
+        model_spec = LLMFamilyV2.parse_obj(json_obj)
         target_families.append(model_spec)
 
     # register chat_template
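flatten_quantizations comes from xinference/model/utils.py (also changed in this release) and its body is not shown here, so the following is only a plausible sketch of its contract: expand a spec that carries a quantizations list into one v2 spec per quantization, which is what lets the loop above parse llm_family.json with LLMFamilyV2.

from typing import Any, Dict, List

def flatten_quantizations(spec: Dict[str, Any]) -> List[Dict[str, Any]]:
    # Hypothetical reimplementation for illustration; the real helper may use
    # different keys and also handle per-quantization file names and hub metadata.
    quantizations = spec.get("quantizations") or [spec.get("quantization", "none")]
    rest = {k: v for k, v in spec.items() if k != "quantizations"}
    return [{**rest, "quantization": q} for q in quantizations]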
@@ -178,11 +178,7 @@ def _install():
     from .vllm.core import VLLMChatModel, VLLMModel, VLLMVisionModel
 
     # register llm classes.
-    LLAMA_CLASSES.extend(
-        [
-            XllamaCppModel,
-        ]
-    )
+    LLAMA_CLASSES.extend([XllamaCppModel])
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel, SGLANGVisionModel])
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
     MLX_CLASSES.extend([MLXModel, MLXChatModel, MLXVisionModel])
@@ -198,36 +194,17 @@ def _install():
     SUPPORTED_ENGINES["LMDEPLOY"] = LMDEPLOY_CLASSES
 
     load_model_family_from_json("llm_family.json", BUILTIN_LLM_FAMILIES)
-
-
-
-
-        "llm_family_openmind_hub.json", BUILTIN_OPENMIND_HUB_LLM_FAMILIES
-    )
-    load_model_family_from_json("llm_family_csghub.json", BUILTIN_CSGHUB_LLM_FAMILIES)
-
-    for llm_specs in [
-        BUILTIN_LLM_FAMILIES,
-        BUILTIN_MODELSCOPE_LLM_FAMILIES,
-        BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
-        BUILTIN_CSGHUB_LLM_FAMILIES,
-    ]:
-        for llm_spec in llm_specs:
-            if llm_spec.model_name not in LLM_MODEL_DESCRIPTIONS:
-                LLM_MODEL_DESCRIPTIONS.update(generate_llm_description(llm_spec))
+
+    for family in BUILTIN_LLM_FAMILIES:
+        if family.model_name not in LLM_VERSION_INFOS:
+            LLM_VERSION_INFOS.update(generate_llm_version_info(family))
 
     # traverse all families and add engine parameters corresponding to the model name
-    for
-
-        BUILTIN_MODELSCOPE_LLM_FAMILIES,
-        BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
-        BUILTIN_CSGHUB_LLM_FAMILIES,
-    ]:
-        for family in families:
-            generate_engine_config_by_model_family(family)
+    for family in BUILTIN_LLM_FAMILIES:
+        generate_engine_config_by_model_family(family)
 
     register_custom_model()
 
     # register model description
     for ud_llm in get_user_defined_llm_families():
-
+        LLM_VERSION_INFOS.update(generate_llm_version_info(ud_llm))
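Custom definitions follow the same consolidation: register_custom_model (earlier in this file) now migrates v1 definitions and scans a v2 subdirectory. A small sketch of where a user-defined family JSON is picked up after this change (the file name is made up):

import os

from xinference.constants import XINFERENCE_MODEL_DIR

# register_custom_model() parses every JSON here with CustomLLMFamilyV2.parse_raw().
user_defined_llm_dir = os.path.join(XINFERENCE_MODEL_DIR, "v2", "llm")
print(os.path.join(user_defined_llm_dir, "my-custom-llm.json"))  # illustrative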
xinference/model/llm/cache_manager.py
ADDED
@@ -0,0 +1,292 @@
+import logging
+import os
+from typing import TYPE_CHECKING, Optional
+
+from ..cache_manager import CacheManager
+
+if TYPE_CHECKING:
+    from .llm_family import LLMFamilyV2
+
+
+logger = logging.getLogger(__name__)
+
+
+class LLMCacheManager(CacheManager):
+    def __init__(
+        self, llm_family: "LLMFamilyV2", multimodal_projector: Optional[str] = None
+    ):
+        super().__init__(llm_family)
+        self._llm_family = llm_family
+        self._model_name = llm_family.model_name
+        self._model_format = llm_family.model_specs[0].model_format
+        self._model_size_in_billions = getattr(
+            llm_family.model_specs[0], "model_size_in_billions", None
+        )
+        self._quantization = llm_family.model_specs[0].quantization
+        self._model_uri = llm_family.model_specs[0].model_uri
+        self._multimodal_projector = multimodal_projector
+        self._model_id = llm_family.model_specs[0].model_id
+        self._model_hub = llm_family.model_specs[0].model_hub
+        self._model_revision = llm_family.model_specs[0].model_revision
+        self._cache_dir = os.path.join(
+            self._v2_cache_dir_prefix,
+            f"{self._model_name.replace('.', '_')}-{self._model_format}-"
+            f"{self._model_size_in_billions}b-{self._quantization}",
+        )
+
+    def cache_uri(self) -> str:
+        from ..utils import parse_uri
+
+        cache_dir = self.get_cache_dir()
+        assert self._model_uri is not None
+        src_scheme, src_root = parse_uri(self._model_uri)
+        if src_root.endswith("/"):
+            # remove trailing path separator.
+            src_root = src_root[:-1]
+
+        if src_scheme == "file":
+            if not os.path.isabs(src_root):
+                raise ValueError(
+                    f"Model URI cannot be a relative path: {self._model_uri}"
+                )
+            if os.path.exists(cache_dir):
+                logger.info(f"Cache {cache_dir} exists")
+                return cache_dir
+            else:
+                os.symlink(src_root, cache_dir, target_is_directory=True)
+                return cache_dir
+        else:
+            raise ValueError(f"Unsupported URL scheme: {src_scheme}")
+
+    def cache_from_huggingface(self) -> str:
+        """
+        Cache model from Hugging Face. Return the cache directory.
+        """
+        import huggingface_hub
+
+        from ..utils import (
+            IS_NEW_HUGGINGFACE_HUB,
+            create_symlink,
+            generate_model_file_names_with_quantization_parts,
+            merge_cached_files,
+            retry_download,
+            symlink_local_file,
+        )
+
+        cache_dir = self.get_cache_dir()
+        if self.get_cache_status():
+            return cache_dir
+
+        use_symlinks = {}
+        if not IS_NEW_HUGGINGFACE_HUB:
+            use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}
+
+        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
+            download_dir = retry_download(
+                huggingface_hub.snapshot_download,
+                self._model_name,
+                {
+                    "model_size": self._model_size_in_billions,
+                    "model_format": self._model_format,
+                },
+                self._model_id,
+                revision=self._model_revision,
+                **use_symlinks,
+            )
+            if IS_NEW_HUGGINGFACE_HUB:
+                create_symlink(download_dir, cache_dir)
+        elif self._model_format in ["ggufv2"]:
+            file_names, final_file_name, need_merge = (
+                generate_model_file_names_with_quantization_parts(
+                    self._llm_family.model_specs[0], self._multimodal_projector
+                )
+            )
+
+            for file_name in file_names:
+                download_file_path = retry_download(
+                    huggingface_hub.hf_hub_download,
+                    self._model_name,
+                    {
+                        "model_size": self._model_size_in_billions,
+                        "model_format": self._model_format,
+                    },
+                    self._model_id,
+                    revision=self._model_revision,
+                    filename=file_name,
+                    **use_symlinks,
+                )
+                if IS_NEW_HUGGINGFACE_HUB:
+                    symlink_local_file(download_file_path, cache_dir, file_name)
+
+            if need_merge:
+                merge_cached_files(cache_dir, file_names, final_file_name)
+        else:
+            raise ValueError(f"Unsupported model format: {self._model_format}")
+
+        return cache_dir
+
+    def cache_from_modelscope(self) -> str:
+        """
+        Cache model from Modelscope. Return the cache directory.
+        """
+        from modelscope.hub.file_download import model_file_download
+        from modelscope.hub.snapshot_download import snapshot_download
+
+        from ..utils import (
+            create_symlink,
+            generate_model_file_names_with_quantization_parts,
+            merge_cached_files,
+            retry_download,
+            symlink_local_file,
+        )
+
+        cache_dir = self.get_cache_dir()
+        if self.get_cache_status():
+            return cache_dir
+
+        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
+            download_dir = retry_download(
+                snapshot_download,
+                self._model_name,
+                {
+                    "model_size": self._model_size_in_billions,
+                    "model_format": self._model_format,
+                },
+                self._model_id,
+                revision=self._model_revision,
+            )
+            create_symlink(download_dir, cache_dir)
+
+        elif self._model_format in ["ggufv2"]:
+            file_names, final_file_name, need_merge = (
+                generate_model_file_names_with_quantization_parts(
+                    self._llm_family.model_specs[0], self._multimodal_projector
+                )
+            )
+
+            for filename in file_names:
+                download_path = retry_download(
+                    model_file_download,
+                    self._model_name,
+                    {
+                        "model_size": self._model_size_in_billions,
+                        "model_format": self._model_format,
+                    },
+                    self._model_id,
+                    filename,
+                    revision=self._model_revision,
+                )
+                symlink_local_file(download_path, cache_dir, filename)
+
+            if need_merge:
+                merge_cached_files(cache_dir, file_names, final_file_name)
+        else:
+            raise ValueError(f"Unsupported format: {self._model_format}")
+
+        return cache_dir
+
+    def cache_from_openmind_hub(self) -> str:
+        """
+        Cache model from openmind_hub. Return the cache directory.
+        """
+        from openmind_hub import snapshot_download
+
+        from ..utils import create_symlink, retry_download
+
+        cache_dir = self.get_cache_dir()
+        if self.get_cache_status():
+            return cache_dir
+
+        if self._model_format in ["pytorch", "mindspore"]:
+            download_dir = retry_download(
+                snapshot_download,
+                self._model_name,
+                {
+                    "model_size": self._model_size_in_billions,
+                    "model_format": self._model_format,
+                },
+                self._model_id,
+                revision=self._model_revision,
+            )
+            create_symlink(download_dir, cache_dir)
+
+        else:
+            raise ValueError(f"Unsupported format: {self._model_format}")
+        return cache_dir
+
+    def cache_from_csghub(self) -> str:
+        """
+        Cache model from CSGHub. Return the cache directory.
+        """
+        from pycsghub.file_download import file_download
+        from pycsghub.snapshot_download import snapshot_download
+
+        from ...constants import XINFERENCE_CSG_ENDPOINT, XINFERENCE_ENV_CSG_TOKEN
+        from ..utils import (
+            create_symlink,
+            generate_model_file_names_with_quantization_parts,
+            merge_cached_files,
+            retry_download,
+            symlink_local_file,
+        )
+
+        cache_dir = self.get_cache_dir()
+        if self.get_cache_status():
+            return cache_dir
+
+        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
+            download_dir = retry_download(
+                snapshot_download,
+                self._model_name,
+                {
+                    "model_size": self._model_size_in_billions,
+                    "model_format": self._model_format,
+                },
+                self._model_id,
+                endpoint=XINFERENCE_CSG_ENDPOINT,
+                token=os.environ.get(XINFERENCE_ENV_CSG_TOKEN),
+            )
+            create_symlink(download_dir, cache_dir)
+        elif self._model_format in ["ggufv2"]:
+            file_names, final_file_name, need_merge = (
+                generate_model_file_names_with_quantization_parts(
+                    self._llm_family.model_specs[0], self._multimodal_projector
+                )
+            )
+
+            for filename in file_names:
+                download_path = retry_download(
+                    file_download,
+                    self._model_name,
+                    {
+                        "model_size": self._model_size_in_billions,
+                        "model_format": self._model_format,
+                    },
+                    self._model_id,
+                    file_name=filename,
+                    endpoint=XINFERENCE_CSG_ENDPOINT,
+                    token=os.environ.get(XINFERENCE_ENV_CSG_TOKEN),
+                )
+                symlink_local_file(download_path, cache_dir, filename)
+
+            if need_merge:
+                merge_cached_files(cache_dir, file_names, final_file_name)
+        else:
+            raise ValueError(f"Unsupported format: {self._model_format}")
+
+        return cache_dir
+
+    def cache(self) -> str:
+        if self._model_uri is not None:
+            return self.cache_uri()
+        else:
+            if self._model_hub == "huggingface":
+                return self.cache_from_huggingface()
+            elif self._model_hub == "modelscope":
+                return self.cache_from_modelscope()
+            elif self._model_hub == "openmind_hub":
+                return self.cache_from_openmind_hub()
+            elif self._model_hub == "csghub":
+                return self.cache_from_csghub()
+            else:
+                raise ValueError(f"Unknown model hub: {self._model_hub}")
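A hedged usage sketch of the new cache manager: the constructor reads everything from model_specs[0], so the family passed in is assumed to already be narrowed to the single spec the caller wants to fetch.

from xinference.model.llm.cache_manager import LLMCacheManager

def ensure_cached(llm_family) -> str:
    # llm_family: an LLMFamilyV2 whose model_specs[0] is the chosen spec.
    # cache() prefers a local model_uri; otherwise it dispatches on model_hub
    # (huggingface / modelscope / openmind_hub / csghub).
    return LLMCacheManager(llm_family).cache()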