xinference-0.16.0-py3-none-any.whl → xinference-0.16.2-py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Potentially problematic release: this version of xinference has been flagged as possibly problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +48 -0
- xinference/client/restful/restful_client.py +19 -0
- xinference/constants.py +1 -0
- xinference/core/chat_interface.py +5 -1
- xinference/core/image_interface.py +5 -1
- xinference/core/model.py +106 -16
- xinference/core/scheduler.py +1 -1
- xinference/core/worker.py +3 -1
- xinference/deploy/supervisor.py +0 -4
- xinference/model/audio/chattts.py +25 -14
- xinference/model/audio/core.py +6 -2
- xinference/model/audio/model_spec.json +1 -1
- xinference/model/audio/model_spec_modelscope.json +1 -1
- xinference/model/core.py +3 -1
- xinference/model/embedding/core.py +6 -2
- xinference/model/embedding/model_spec.json +1 -1
- xinference/model/image/core.py +65 -6
- xinference/model/image/model_spec.json +24 -3
- xinference/model/image/model_spec_modelscope.json +25 -3
- xinference/model/image/ocr/__init__.py +13 -0
- xinference/model/image/ocr/got_ocr2.py +79 -0
- xinference/model/image/scheduler/flux.py +1 -1
- xinference/model/image/stable_diffusion/core.py +2 -3
- xinference/model/image/stable_diffusion/mlx.py +221 -0
- xinference/model/llm/__init__.py +33 -0
- xinference/model/llm/core.py +3 -1
- xinference/model/llm/llm_family.json +9 -0
- xinference/model/llm/llm_family.py +68 -2
- xinference/model/llm/llm_family_modelscope.json +11 -0
- xinference/model/llm/llm_family_openmind_hub.json +1359 -0
- xinference/model/rerank/core.py +9 -1
- xinference/model/utils.py +7 -0
- xinference/model/video/core.py +6 -2
- xinference/thirdparty/mlx/__init__.py +13 -0
- xinference/thirdparty/mlx/flux/__init__.py +15 -0
- xinference/thirdparty/mlx/flux/autoencoder.py +357 -0
- xinference/thirdparty/mlx/flux/clip.py +154 -0
- xinference/thirdparty/mlx/flux/datasets.py +75 -0
- xinference/thirdparty/mlx/flux/flux.py +247 -0
- xinference/thirdparty/mlx/flux/layers.py +302 -0
- xinference/thirdparty/mlx/flux/lora.py +76 -0
- xinference/thirdparty/mlx/flux/model.py +134 -0
- xinference/thirdparty/mlx/flux/sampler.py +56 -0
- xinference/thirdparty/mlx/flux/t5.py +244 -0
- xinference/thirdparty/mlx/flux/tokenizers.py +185 -0
- xinference/thirdparty/mlx/flux/trainer.py +98 -0
- xinference/thirdparty/mlx/flux/utils.py +179 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.f7da0140.js → main.2f269bb3.js} +3 -3
- xinference/web/ui/build/static/js/main.2f269bb3.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +1 -0
- {xinference-0.16.0.dist-info → xinference-0.16.2.dist-info}/METADATA +16 -9
- {xinference-0.16.0.dist-info → xinference-0.16.2.dist-info}/RECORD +60 -42
- xinference/web/ui/build/static/js/main.f7da0140.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +0 -1
- /xinference/web/ui/build/static/js/{main.f7da0140.js.LICENSE.txt → main.2f269bb3.js.LICENSE.txt} +0 -0
- {xinference-0.16.0.dist-info → xinference-0.16.2.dist-info}/LICENSE +0 -0
- {xinference-0.16.0.dist-info → xinference-0.16.2.dist-info}/WHEEL +0 -0
- {xinference-0.16.0.dist-info → xinference-0.16.2.dist-info}/entry_points.txt +0 -0
- {xinference-0.16.0.dist-info → xinference-0.16.2.dist-info}/top_level.txt +0 -0
xinference/model/image/model_spec.json
CHANGED
@@ -8,7 +8,11 @@
             "text2image",
             "image2image",
             "inpainting"
-        ]
+        ],
+        "default_model_config": {
+            "quantize": true,
+            "quantize_text_encoder": "text_encoder_2"
+        }
     },
     {
         "model_name": "FLUX.1-dev",
@@ -19,7 +23,11 @@
             "text2image",
             "image2image",
             "inpainting"
-        ]
+        ],
+        "default_model_config": {
+            "quantize": true,
+            "quantize_text_encoder": "text_encoder_2"
+        }
     },
     {
         "model_name": "sd3-medium",
@@ -30,7 +38,11 @@
             "text2image",
             "image2image",
             "inpainting"
-        ]
+        ],
+        "default_model_config": {
+            "quantize": true,
+            "quantize_text_encoder": "text_encoder_3"
+        }
     },
     {
         "model_name": "sd-turbo",
@@ -178,5 +190,14 @@
         "model_ability": [
             "inpainting"
         ]
+    },
+    {
+        "model_name": "GOT-OCR2_0",
+        "model_family": "ocr",
+        "model_id": "stepfun-ai/GOT-OCR2_0",
+        "model_revision": "cf6b7386bc89a54f09785612ba74cb12de6fa17c",
+        "model_ability": [
+            "ocr"
+        ]
     }
 ]
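
The new default_model_config block gives each built-in image spec launch-time defaults (here, quantizing the FLUX and SD3 text encoders) while explicit user kwargs still win. A minimal sketch of that merge; the helper name merge_model_config is illustrative, not the actual xinference code:

    def merge_model_config(spec: dict, user_kwargs: dict) -> dict:
        # spec-level defaults form the base ...
        config = dict(spec.get("default_model_config", {}))
        # ... and anything the user passes at launch time overrides them
        config.update(user_kwargs)
        return config

    spec = {
        "model_name": "FLUX.1-schnell",
        "default_model_config": {
            "quantize": True,
            "quantize_text_encoder": "text_encoder_2",
        },
    }
    assert merge_model_config(spec, {})["quantize"] is True
    assert merge_model_config(spec, {"quantize": False})["quantize"] is False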
xinference/model/image/model_spec_modelscope.json
CHANGED
@@ -9,7 +9,11 @@
             "text2image",
             "image2image",
             "inpainting"
-        ]
+        ],
+        "default_model_config": {
+            "quantize": true,
+            "quantize_text_encoder": "text_encoder_2"
+        }
     },
     {
         "model_name": "FLUX.1-dev",
@@ -21,7 +25,11 @@
             "text2image",
             "image2image",
             "inpainting"
-        ]
+        ],
+        "default_model_config": {
+            "quantize": true,
+            "quantize_text_encoder": "text_encoder_2"
+        }
     },
     {
         "model_name": "sd3-medium",
@@ -33,7 +41,11 @@
             "text2image",
             "image2image",
             "inpainting"
-        ]
+        ],
+        "default_model_config": {
+            "quantize": true,
+            "quantize_text_encoder": "text_encoder_3"
+        }
     },
     {
         "model_name": "sd-turbo",
@@ -148,5 +160,15 @@
             "model_revision": "62134b9d8e703b5d6f74f1534457287a8bba77ef"
         }
     ]
+    },
+    {
+        "model_name": "GOT-OCR2_0",
+        "model_family": "ocr",
+        "model_id": "stepfun-ai/GOT-OCR2_0",
+        "model_revision": "master",
+        "model_hub": "modelscope",
+        "model_ability": [
+            "ocr"
+        ]
     }
 ]
xinference/model/image/ocr/__init__.py
CHANGED
@@ -0,0 +1,13 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
xinference/model/image/ocr/got_ocr2.py
CHANGED
@@ -0,0 +1,79 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import TYPE_CHECKING, Optional
+
+import PIL.Image
+
+if TYPE_CHECKING:
+    from ..core import ImageModelFamilyV1
+
+logger = logging.getLogger(__name__)
+
+
+class GotOCR2Model:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: Optional[str] = None,
+        device: Optional[str] = None,
+        model_spec: Optional["ImageModelFamilyV1"] = None,
+        **kwargs,
+    ):
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._device = device
+        # model info when loading
+        self._model = None
+        self._tokenizer = None
+        # info
+        self._model_spec = model_spec
+        self._abilities = model_spec.model_ability or []  # type: ignore
+        self._kwargs = kwargs
+
+    @property
+    def model_ability(self):
+        return self._abilities
+
+    def load(self):
+        from transformers import AutoModel, AutoTokenizer
+
+        self._tokenizer = AutoTokenizer.from_pretrained(
+            self._model_path, trust_remote_code=True
+        )
+        model = AutoModel.from_pretrained(
+            self._model_path,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            device_map="cuda",
+            use_safetensors=True,
+            pad_token_id=self._tokenizer.eos_token_id,
+        )
+        self._model = model.eval().cuda()
+
+    def ocr(
+        self,
+        image: PIL.Image,
+        **kwargs,
+    ):
+        logger.info("Got OCR 2.0 kwargs: %s", kwargs)
+        if "ocr_type" not in kwargs:
+            kwargs["ocr_type"] = "ocr"
+        if image.mode == "RGBA" or image.mode == "CMYK":
+            # convert to RGB
+            image = image.convert("RGB")
+        assert self._model is not None
+        # This chat API limits the max new tokens inside.
+        return self._model.chat(self._tokenizer, image, gradio_input=True, **kwargs)
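
GotOCR2Model wraps GOT-OCR 2.0's own chat API (which caps max new tokens internally) behind a simple "ocr" ability, and the file list above shows restful_client.py gaining a matching entry point. A hedged usage sketch; the exact client method names should be checked against the 0.16.2 client:

    from xinference.client import Client

    client = Client("http://localhost:9997")
    model_uid = client.launch_model(model_name="GOT-OCR2_0", model_type="image")
    model = client.get_model(model_uid)
    with open("receipt.png", "rb") as f:
        # ocr_type defaults to "ocr" on the server side (see got_ocr2.py above)
        text = model.ocr(f.read())
    print(text)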
xinference/model/image/scheduler/flux.py
CHANGED
@@ -124,7 +124,7 @@ class FluxBatchSchedulerActor(xo.StatelessActor):
         self._running_queue: deque[Text2ImageRequest] = deque()  # type: ignore
         self._model = None
         self._available_device = get_available_device()
-        self._id_to_req: Dict[str, Text2ImageRequest] = {}
+        self._id_to_req: Dict[str, Text2ImageRequest] = {}  # type: ignore
 
     def set_model(self, model):
         """
xinference/model/image/stable_diffusion/core.py
CHANGED
@@ -283,9 +283,8 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             model.enable_sequential_cpu_offload()
         elif not self._kwargs.get("device_map"):
             logger.debug("Loading model to available device")
-            model = move_model_to_available_device(
-
-        if self._kwargs.get("attention_slicing", True):
+            model = move_model_to_available_device(model)
+        if self._kwargs.get("attention_slicing", False):
             model.enable_attention_slicing()
         if self._kwargs.get("vae_tiling", False):
             model.enable_vae_tiling()
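
Note the behavior change in this hunk: attention_slicing now defaults to False, so sliced attention (lower peak memory at some speed cost) must be requested explicitly. A hedged sketch of restoring the old behavior at launch time, assuming extra kwargs are forwarded into DiffusionModel's self._kwargs as the code above reads them:

    from xinference.client import Client

    client = Client("http://localhost:9997")
    model_uid = client.launch_model(
        model_name="sd3-medium",
        model_type="image",
        attention_slicing=True,  # the pre-0.16.2 default
    )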
xinference/model/image/stable_diffusion/mlx.py
CHANGED
@@ -0,0 +1,221 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import gc
+import logging
+import re
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+
+import numpy as np
+from PIL import Image
+from xoscar.utils import classproperty
+
+from ....types import LoRA
+from ..sdapi import SDAPIDiffusionModelMixin
+from ..utils import handle_image_result
+
+if TYPE_CHECKING:
+    from ....core.progress_tracker import Progressor
+    from ..core import ImageModelFamilyV1
+
+
+logger = logging.getLogger(__name__)
+
+
+def quantization_predicate(name: str, m) -> bool:
+    return hasattr(m, "to_quantized") and m.weight.shape[1] % 512 == 0
+
+
+def to_latent_size(image_size: Tuple[int, int]):
+    h, w = image_size
+    h = ((h + 15) // 16) * 16
+    w = ((w + 15) // 16) * 16
+
+    if (h, w) != image_size:
+        print(
+            "Warning: The image dimensions need to be divisible by 16px. "
+            f"Changing size to {h}x{w}."
+        )
+
+    return (h // 8, w // 8)
+
+
+class MLXDiffusionModel(SDAPIDiffusionModelMixin):
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: Optional[str] = None,
+        device: Optional[str] = None,
+        lora_model: Optional[List[LoRA]] = None,
+        lora_load_kwargs: Optional[Dict] = None,
+        lora_fuse_kwargs: Optional[Dict] = None,
+        model_spec: Optional["ImageModelFamilyV1"] = None,
+        **kwargs,
+    ):
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._device = device
+        # model info when loading
+        self._model = None
+        self._lora_model = lora_model
+        self._lora_load_kwargs = lora_load_kwargs or {}
+        self._lora_fuse_kwargs = lora_fuse_kwargs or {}
+        # info
+        self._model_spec = model_spec
+        self._abilities = model_spec.model_ability or []  # type: ignore
+        self._kwargs = kwargs
+
+    @property
+    def model_ability(self):
+        return self._abilities
+
+    @classproperty
+    def supported_models(self):
+        return ["FLUX.1-schnell", "FLUX.1-dev"]
+
+    def load(self):
+        try:
+            import mlx.nn as nn
+        except ImportError:
+            error_message = "Failed to import module 'mlx'"
+            installation_guide = [
+                "Please make sure 'mlx' is installed. ",
+                "You can install it by `pip install mlx`\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        from ....thirdparty.mlx.flux import FluxPipeline
+
+        logger.debug(
+            "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
+        )
+        flux = self._model = FluxPipeline(
+            "flux-" + self._model_spec.model_name.split("-")[1],
+            model_path=self._model_path,
+            t5_padding=self._kwargs.get("t5_padding", True),
+        )
+        self._apply_lora()
+
+        quantize = self._kwargs.get("quantize", True)
+        if quantize:
+            nn.quantize(flux.flow, class_predicate=quantization_predicate)
+            nn.quantize(flux.t5, class_predicate=quantization_predicate)
+            nn.quantize(flux.clip, class_predicate=quantization_predicate)
+
+    def _apply_lora(self):
+        if self._lora_model is not None:
+            import mlx.core as mx
+
+            for lora_model in self._lora_model:
+                weights, lora_config = mx.load(
+                    lora_model.local_path, return_metadata=True
+                )
+                rank = int(lora_config.get("lora_rank", 8))
+                num_blocks = int(lora_config.get("lora_blocks", -1))
+                flux = self._model
+                flux.linear_to_lora_layers(rank, num_blocks)
+                flux.flow.load_weights(list(weights.items()), strict=False)
+                flux.fuse_lora_layers()
+            logger.info(f"Successfully loaded the LoRA for model {self._model_uid}.")
+
+    @staticmethod
+    @contextlib.contextmanager
+    def _release_after():
+        import mlx.core as mx
+
+        try:
+            yield
+        finally:
+            gc.collect()
+            mx.metal.clear_cache()
+
+    def text_to_image(
+        self,
+        prompt: str,
+        n: int = 1,
+        size: str = "1024*1024",
+        response_format: str = "url",
+        **kwargs,
+    ):
+        import mlx.core as mx
+
+        flux = self._model
+        width, height = map(int, re.split(r"[^\d]+", size))
+
+        # Make the generator
+        latent_size = to_latent_size((height, width))
+        gen_latent_kwargs = {}
+        if (num_steps := kwargs.get("num_inference_steps")) is None:
+            num_steps = 50 if "dev" in self._model_spec.model_name else 2  # type: ignore
+        gen_latent_kwargs["num_steps"] = num_steps
+        if guidance := kwargs.get("guidance_scale"):
+            gen_latent_kwargs["guidance"] = guidance
+        if seed := kwargs.get("seed"):
+            gen_latent_kwargs["seed"] = seed
+
+        with self._release_after():
+            latents = flux.generate_latents(  # type: ignore
+                prompt, n_images=n, latent_size=latent_size, **gen_latent_kwargs
+            )
+
+            # First we get and eval the conditioning
+            conditioning = next(latents)
+            mx.eval(conditioning)
+            peak_mem_conditioning = mx.metal.get_peak_memory() / 1024**3
+            mx.metal.reset_peak_memory()
+
+            progressor: Progressor = kwargs.pop("progressor", None)
+            # Actual denoising loop
+            for i, x_t in enumerate(latents):
+                mx.eval(x_t)
+                progressor.set_progress((i + 1) / num_steps)
+
+            peak_mem_generation = mx.metal.get_peak_memory() / 1024**3
+            mx.metal.reset_peak_memory()
+
+            # Decode them into images
+            decoded = []
+            for i in range(n):
+                decoded.append(flux.decode(x_t[i : i + 1], latent_size))  # type: ignore
+                mx.eval(decoded[-1])
+            peak_mem_decoding = mx.metal.get_peak_memory() / 1024**3
+            peak_mem_overall = max(
+                peak_mem_conditioning, peak_mem_generation, peak_mem_decoding
+            )
+
+            images = []
+            x = mx.concatenate(decoded, axis=0)
+            x = (x * 255).astype(mx.uint8)
+            for i in range(len(x)):
+                im = Image.fromarray(np.array(x[i]))
+                images.append(im)
+
+        logger.debug(
+            f"Peak memory used for the text: {peak_mem_conditioning:.3f}GB"
+        )
+        logger.debug(
+            f"Peak memory used for the generation: {peak_mem_generation:.3f}GB"
+        )
+        logger.debug(f"Peak memory used for the decoding: {peak_mem_decoding:.3f}GB")
+        logger.debug(f"Peak memory used overall: {peak_mem_overall:.3f}GB")
+
+        return handle_image_result(response_format, images)
+
+    def image_to_image(self, **kwargs):
+        raise NotImplementedError
+
+    def inpainting(self, **kwargs):
+        raise NotImplementedError
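
A worked example of the to_latent_size helper above: dimensions are first padded up to multiples of 16 pixels, then divided by 8 (the VAE downscale factor):

    assert to_latent_size((1024, 1024)) == (128, 128)  # already divisible by 16
    assert to_latent_size((1000, 600)) == (126, 76)    # padded to 1008x608 first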
xinference/model/llm/__init__.py
CHANGED
@@ -32,6 +32,7 @@ from .llm_family import (
     BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES,
     BUILTIN_LLM_PROMPT_STYLE,
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
+    BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
     LLAMA_CLASSES,
     LLM_ENGINES,
     LMDEPLOY_CLASSES,
@@ -258,6 +259,36 @@ def _install():
         if "tools" in model_spec.model_ability:
             BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)
 
+    openmind_hub_json_path = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "llm_family_openmind_hub.json"
+    )
+    for json_obj in json.load(
+        codecs.open(openmind_hub_json_path, "r", encoding="utf-8")
+    ):
+        model_spec = LLMFamilyV1.parse_obj(json_obj)
+        BUILTIN_OPENMIND_HUB_LLM_FAMILIES.append(model_spec)
+
+        # register prompt style, in case that we have something missed
+        # if duplicated with huggingface json, keep it as the huggingface style
+
+        if (
+            "chat" in model_spec.model_ability
+            and isinstance(model_spec.chat_template, str)
+            and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE
+        ):
+            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = {
+                "chat_template": model_spec.chat_template,
+                "stop_token_ids": model_spec.stop_token_ids,
+                "stop": model_spec.stop,
+            }
+        # register model family
+        if "chat" in model_spec.model_ability:
+            BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
+        else:
+            BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(model_spec.model_name)
+        if "tools" in model_spec.model_ability:
+            BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)
+
     csghub_json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family_csghub.json"
     )
@@ -288,6 +319,7 @@ def _install():
     for llm_specs in [
         BUILTIN_LLM_FAMILIES,
         BUILTIN_MODELSCOPE_LLM_FAMILIES,
+        BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
         BUILTIN_CSGHUB_LLM_FAMILIES,
     ]:
         for llm_spec in llm_specs:
@@ -298,6 +330,7 @@ def _install():
     for families in [
         BUILTIN_LLM_FAMILIES,
         BUILTIN_MODELSCOPE_LLM_FAMILIES,
+        BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
         BUILTIN_CSGHUB_LLM_FAMILIES,
    ]:
         for family in families:
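
This mirrors the existing modelscope and csghub registration blocks: the bundled llm_family_openmind_hub.json is parsed at import time and its families join the chat/generate/tool-call registries, with huggingface entries keeping priority for duplicated prompt styles. Hub selection itself goes through download_from_openmind_hub, added to model/utils.py per the file list; a hedged sketch of what that helper plausibly looks like, following the env-var pattern of the existing modelscope helper:

    import os

    def download_from_openmind_hub() -> bool:
        # assumption: keyed off xinference's existing XINFERENCE_MODEL_SRC switch
        return os.environ.get("XINFERENCE_MODEL_SRC") == "openmind_hub"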
xinference/model/llm/core.py
CHANGED
@@ -193,7 +193,9 @@ def create_llm_model_instance(
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
     peft_model_config: Optional[PeftModelConfig] = None,
-    download_hub: Optional[
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
     model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[LLM, LLMDescription]:
xinference/model/llm/llm_family.json
CHANGED
@@ -8176,6 +8176,15 @@
         ],
         "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct"
     },
+    {
+        "model_format": "gptq",
+        "model_size_in_billions": "7",
+        "quantizations": [
+            "Int4",
+            "Int8"
+        ],
+        "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-{quantization}"
+    },
     {
         "model_format": "ggufv2",
         "model_size_in_billions": "1_5",
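
With this spec registered, the {quantization} placeholder in model_id resolves to one of the listed quantizations at download time. A hedged launch sketch; the builtin family for Qwen2.5-Coder-7B-Instruct is assumed to be named qwen2.5-coder-instruct:

    from xinference.client import Client

    client = Client("http://localhost:9997")
    model_uid = client.launch_model(
        model_name="qwen2.5-coder-instruct",
        model_format="gptq",
        model_size_in_billions=7,
        quantization="Int4",  # fetches Qwen2.5-Coder-7B-Instruct-GPTQ-Int4
    )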
xinference/model/llm/llm_family.py
CHANGED
@@ -41,6 +41,7 @@ from ..utils import (
     create_symlink,
     download_from_csghub,
     download_from_modelscope,
+    download_from_openmind_hub,
     is_valid_model_uri,
     parse_uri,
     retry_download,
@@ -239,6 +240,7 @@ LLAMA_CLASSES: List[Type[LLM]] = []
 
 BUILTIN_LLM_FAMILIES: List["LLMFamilyV1"] = []
 BUILTIN_MODELSCOPE_LLM_FAMILIES: List["LLMFamilyV1"] = []
+BUILTIN_OPENMIND_HUB_LLM_FAMILIES: List["LLMFamilyV1"] = []
 BUILTIN_CSGHUB_LLM_FAMILIES: List["LLMFamilyV1"] = []
 
 SGLANG_CLASSES: List[Type[LLM]] = []
@@ -301,6 +303,9 @@ def cache(
     elif llm_spec.model_hub == "modelscope":
         logger.info(f"Caching from Modelscope: {llm_spec.model_id}")
         return cache_from_modelscope(llm_family, llm_spec, quantization)
+    elif llm_spec.model_hub == "openmind_hub":
+        logger.info(f"Caching from openmind_hub: {llm_spec.model_id}")
+        return cache_from_openmind_hub(llm_family, llm_spec, quantization)
     elif llm_spec.model_hub == "csghub":
         logger.info(f"Caching from CSGHub: {llm_spec.model_id}")
         return cache_from_csghub(llm_family, llm_spec, quantization)
@@ -474,7 +479,7 @@ def _skip_download(
     model_revision: Optional[str],
     quantization: Optional[str] = None,
 ) -> bool:
-    if model_format
+    if model_format in ["pytorch", "mindspore"]:
     model_hub_to_meta_path = {
         "huggingface": _get_meta_path(
             cache_dir, model_format, "huggingface", quantization
@@ -482,6 +487,9 @@ def _skip_download(
         "modelscope": _get_meta_path(
             cache_dir, model_format, "modelscope", quantization
         ),
+        "openmind_hub": _get_meta_path(
+            cache_dir, model_format, "openmind_hub", quantization
+        ),
         "csghub": _get_meta_path(cache_dir, model_format, "csghub", quantization),
     }
     if valid_model_revision(model_hub_to_meta_path[model_hub], model_revision):
@@ -702,6 +710,50 @@ def cache_from_modelscope(
     return cache_dir
 
 
+def cache_from_openmind_hub(
+    llm_family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+    quantization: Optional[str] = None,
+) -> str:
+    """
+    Cache model from openmind_hub. Return the cache directory.
+    """
+    from openmind_hub import snapshot_download
+
+    cache_dir = _get_cache_dir(llm_family, llm_spec)
+    if _skip_download(
+        cache_dir,
+        llm_spec.model_format,
+        llm_spec.model_hub,
+        llm_spec.model_revision,
+        quantization,
+    ):
+        return cache_dir
+
+    if llm_spec.model_format in ["pytorch", "mindspore"]:
+        download_dir = retry_download(
+            snapshot_download,
+            llm_family.model_name,
+            {
+                "model_size": llm_spec.model_size_in_billions,
+                "model_format": llm_spec.model_format,
+            },
+            llm_spec.model_id,
+            revision=llm_spec.model_revision,
+        )
+        create_symlink(download_dir, cache_dir)
+
+    else:
+        raise ValueError(f"Unsupported format: {llm_spec.model_format}")
+
+    meta_path = _get_meta_path(
+        cache_dir, llm_spec.model_format, llm_spec.model_hub, quantization
+    )
+    _generate_meta_file(meta_path, llm_family, llm_spec, quantization)
+
+    return cache_dir
+
+
 def cache_from_huggingface(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
@@ -893,7 +945,9 @@ def match_llm(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
-    download_hub: Optional[
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
     """
     Find an LLM family, spec, and quantization that satisfy given criteria.
@@ -924,6 +978,12 @@ def match_llm(
             + BUILTIN_LLM_FAMILIES
             + user_defined_llm_families
         )
+    elif download_hub == "openmind_hub":
+        all_families = (
+            BUILTIN_OPENMIND_HUB_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
     elif download_hub == "csghub":
         all_families = (
             BUILTIN_CSGHUB_LLM_FAMILIES
@@ -938,6 +998,12 @@ def match_llm(
             + BUILTIN_LLM_FAMILIES
             + user_defined_llm_families
         )
+    elif download_from_openmind_hub():
+        all_families = (
+            BUILTIN_OPENMIND_HUB_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
     elif download_from_csghub():
         all_families = (
             BUILTIN_CSGHUB_LLM_FAMILIES
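
Taken together, these hunks let a launch pin a specific hub: an explicit download_hub argument makes match_llm search that hub's families first, otherwise download_from_openmind_hub() and the other env-driven helpers pick the default, and cache_from_openmind_hub then mirrors the modelscope cache path (only pytorch and mindspore formats are supported there). A hedged client-side sketch, assuming launch_model forwards download_hub the same way it does for modelscope:

    from xinference.client import Client

    client = Client("http://localhost:9997")
    model_uid = client.launch_model(
        model_name="qwen2.5-instruct",
        model_size_in_billions=7,
        download_hub="openmind_hub",
    )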
xinference/model/llm/llm_family_modelscope.json
CHANGED
@@ -5880,6 +5880,17 @@
         "model_revision": "master",
         "model_hub": "modelscope"
     },
+    {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+            "Int4",
+            "Int8"
+        ],
+        "model_id": "qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-{quantization}",
+        "model_revision": "master",
+        "model_hub": "modelscope"
+    },
     {
         "model_format": "ggufv2",
         "model_size_in_billions": "1_5",