xinference 0.11.1__py3-none-any.whl → 0.11.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +30 -0
- xinference/client/restful/restful_client.py +29 -0
- xinference/core/cache_tracker.py +12 -1
- xinference/core/supervisor.py +30 -2
- xinference/core/utils.py +12 -0
- xinference/core/worker.py +4 -1
- xinference/deploy/cmdline.py +126 -0
- xinference/deploy/test/test_cmdline.py +24 -0
- xinference/model/llm/__init__.py +2 -0
- xinference/model/llm/llm_family.json +501 -6
- xinference/model/llm/llm_family.py +84 -10
- xinference/model/llm/llm_family_modelscope.json +198 -7
- xinference/model/llm/memory.py +332 -0
- xinference/model/llm/pytorch/core.py +2 -0
- xinference/model/llm/pytorch/intern_vl.py +387 -0
- xinference/model/llm/utils.py +13 -0
- xinference/model/llm/vllm/core.py +5 -2
- xinference/model/rerank/core.py +23 -1
- xinference/model/utils.py +17 -7
- xinference/thirdparty/deepseek_vl/models/processing_vlm.py +1 -1
- xinference/thirdparty/deepseek_vl/models/siglip_vit.py +2 -2
- xinference/thirdparty/llava/mm_utils.py +3 -2
- xinference/thirdparty/llava/model/llava_arch.py +1 -1
- xinference/thirdparty/omnilmm/chat.py +6 -5
- {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/METADATA +8 -7
- {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/RECORD +31 -29
- {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/LICENSE +0 -0
- {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/WHEEL +0 -0
- {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/entry_points.txt +0 -0
- {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/memory.py (new file)
@@ -0,0 +1,332 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NOTE:
+#
+# The algorithm is ported from https://github.com/RahulSChand/gpu_poor
+#
+# Improvement:
+#
+# The original js code only calculates the KV cache in float32, whereas in most cases the model runs with float16.
+#
+# Known Issue:
+#
+# * On vllm, some MHA models use less memory than calculated (qwen1.5-7B-chat-gptq-int4,
+#   qwen1.5-14B-chat-gptq-int4 with large activation_mem).
+#
+# * On vllm, the gemma-it-7B pytorch format model uses more GPU memory than calculated.
+
+import json
+import math
+from dataclasses import dataclass
+from logging import getLogger
+from math import ceil
+from typing import Any, Optional, Union
+
+from .llm_family import convert_model_size_to_float
+
+logger = getLogger(__name__)
+
+
+@dataclass
+class ModelLayersInfo:
+    vocab_size: int
+    heads: int  # num_attention_heads, num_heads or n_head
+    hidden_dim: int  # hidden_size, d_model, or n_embd
+    inter_dim: int  # intermediate_size, n_inner or d_ff
+    num_layers: int  # num_layers, num_hidden_layers or n_layer
+
+
+@dataclass
+class ModelMemInfo:
+    """Memory required by model, unit in MB"""
+
+    model_mem: int
+    kv_cache_mem: int
+    activation_mem: int
+    overhead: int
+    total: int
+
+
+QUANT_NORMALIZE = {"int4": "4-bit", "int8": "8-bit", "4-bit": "4-bit", "8-bit": "8-bit"}
+
+GGML_MULTI_FACTOR_DICT = {
+    "q4_0": 18,
+    "q4_1": 20,
+    "q5_0": 22,
+    "q5_1": 24,
+    "q8_0": 34,
+    "q8_1": 40,
+}
+
+GGML_MULTI_FACTOR_DICT_64 = {
+    "q6_K": 54.0,
+    "q3": 26.0,
+    "q4": 38.0,
+    "q5": 46.0,
+}
+
+GGML_MULTI_FACTOR_DICT_COMBINE = {
+    "q3_K_L": [38.0, 26.0],
+    "q3_K_M": [46.0, 26.0],
+    "q4_K_S": [46.0, 38.0],
+    "q4_K_M": [54.0, 38.0],
+    "q5_K_M": [54.0, 46.0],
+    "q2_K": [26.0, 22.0],
+}
+
+
+# Return GPU memory in MB
+def estimate_llm_gpu_memory(
+    model_size_in_billions: Union[str, int],
+    quantization: Optional[str],
+    context_length: int,  # input+output
+    model_format: str,
+    model_name: Optional[str] = None,
+    kv_cache_dtype: int = 16,
+) -> Optional[ModelMemInfo]:
+    """
+    model_size_in_billions: an int, or a str like 1_8 or 46_7 for fractional sizes, to match the llm.
+    """
+    info = get_model_layers_info(
+        model_size_in_billions,
+        model_name,
+        model_format,
+        quantization,
+    )
+    if info is None:
+        return None
+    size_in_billions = convert_model_size_to_float(model_size_in_billions)
+    return estimate_llm_gpu_memory_details(
+        info,
+        size_in_billions,
+        quantization,
+        context_length,
+        model_format,
+        kv_cache_dtype,
+    )
+
+
+def estimate_llm_gpu_memory_details(
+    info: ModelLayersInfo,
+    size_in_billions: float,
+    quantization: Optional[str],
+    context_length: int,  # input+output
+    model_format: str,
+    kv_cache_dtype: int = 16,
+) -> ModelMemInfo:
+    """Return model_mem, kv_cache_mem, activation_mem, overhead and total, in MB"""
+    if kv_cache_dtype not in [8, 16, 32]:
+        raise ValueError(f"Invalid kv_cache_dtype {kv_cache_dtype}")
+    if kv_cache_dtype == 8:
+        kv_dtype_size = 1
+    elif kv_cache_dtype == 16:
+        kv_dtype_size = 2
+    else:
+        kv_dtype_size = 4
+    overhead = 650.0
+    if model_format == "ggmlv3":
+        assert quantization is not None and quantization != "none"
+        model_size_in_mb = _compute_model_size_ggml(info, quantization)
+        inference_mem = float(
+            context_length * kv_dtype_size * info.hidden_dim * info.num_layers
+        )
+        inference_mem = inference_mem / 1024.0 / 1024.0
+        activation_mem = _compute_inference_only_activation_memory(context_length, info)
+        overhead = overhead + context_length * 0.1
+    else:
+        if quantization is not None:
+            assert isinstance(quantization, str)
+            quantization = QUANT_NORMALIZE[quantization.lower()]
+            assert quantization is not None
+
+        model_size = size_in_billions * 1000000000.0
+        model_size_in_mb = _convert_to_mb_model_size(model_size, quantization)
+        # KV cache
+        inference_mem = float(
+            context_length * 2 * kv_dtype_size * info.hidden_dim * info.num_layers
+        )
+        inference_mem = inference_mem / 1024.0 / 1024.0
+        activation_mem = _compute_inference_only_activation_memory(context_length, info)
+
+    total_mem = ceil(inference_mem + model_size_in_mb + overhead + activation_mem)
+    return ModelMemInfo(
+        model_mem=ceil(model_size_in_mb),
+        kv_cache_mem=ceil(inference_mem),
+        activation_mem=ceil(activation_mem),
+        overhead=ceil(overhead),
+        total=total_mem,
+    )
+
+
+def _load_item_from_json(config_data: Any, *keys: str) -> str:
+    assert len(keys) > 0
+    for key in keys:
+        v = config_data.get(key)
+        if v is not None:
+            return v
+    raise ValueError("load ModelLayersInfo: missing %s" % (keys[0]))
+
+
+def load_model_config_json(config_path: str) -> ModelLayersInfo:
+    with open(config_path, "r") as f:
+        config_data = json.load(f)
+        return ModelLayersInfo(
+            vocab_size=int(_load_item_from_json(config_data, "vocab_size")),
+            heads=int(
+                _load_item_from_json(
+                    config_data, "num_key_value_heads", "num_attention_heads"
+                )
+            ),
+            hidden_dim=int(
+                _load_item_from_json(config_data, "hidden_size", "d_model", "n_embd")
+            ),
+            inter_dim=int(_load_item_from_json(config_data, "intermediate_size")),
+            num_layers=int(
+                _load_item_from_json(
+                    config_data, "num_hidden_layers", "num_layers", "n_layer"
+                )
+            ),
+        )
+
+
+def get_model_layers_info(
+    model_size_in_billions: Union[str, int],
+    model_name: Optional[str],
+    model_format: Optional[str],
+    quantization: Optional[str],
+) -> Optional[ModelLayersInfo]:
+    from . import match_llm
+    from .llm_family import cache_model_config
+
+    if not model_name:
+        logger.debug("get_model_layers_info by default size=%s", model_size_in_billions)
+        size_in_billions = convert_model_size_to_float(model_size_in_billions)
+        return _get_default_layers_from_size(size_in_billions)
+    match_result = match_llm(
+        model_name=model_name,
+        model_format=model_format,
+        model_size_in_billions=model_size_in_billions,
+        quantization=quantization,
+    )
+    if not match_result:
+        return None
+    llm_family, llm_spec, _quant = match_result
+    config_path = cache_model_config(llm_family, llm_spec)
+    return load_model_config_json(config_path)
+
+
+def _get_default_layers_from_size(size_in_billion: float) -> ModelLayersInfo:
+    if size_in_billion < 5:
+        vocab_size = 32000
+        heads = 32
+        num_layers = 24
+    elif size_in_billion < 10:
+        vocab_size = 32000
+        heads = 32
+        num_layers = 32
+    elif size_in_billion < 24:
+        vocab_size = 32000
+        heads = 40
+        num_layers = 40
+    elif size_in_billion < 55:
+        vocab_size = 32000
+        heads = 60
+        num_layers = 48
+    else:
+        vocab_size = 32000
+        heads = 64
+        num_layers = 80
+
+    model_size = int(size_in_billion * 1000000000)
+    A = num_layers * 4 + 3 * 4 * num_layers
+    B = 2 * vocab_size
+    C = -1 * model_size
+    h = (-B + math.sqrt(B**2 - 4 * A * C)) / (2 * A)
+    h = math.ceil(h)
+    return ModelLayersInfo(
+        vocab_size=vocab_size,
+        heads=heads,
+        hidden_dim=h,
+        inter_dim=4 * h,
+        num_layers=num_layers,
+    )
+
+
+def _convert_to_mb_model_size(model_size: float, quantization: Optional[str]) -> float:
+    extra = 0.0
+    fB = 2.0
+    size = (model_size * fB) / (1024.0 * 1024.0)
+    # bnb_q4 == 4-bit ?
+    if quantization == "8-bit" or quantization == "4-bit":
+        extra = 0.06 * size
+    if quantization == "8-bit":
+        size = size / 2
+    if quantization == "4-bit":
+        size = size / 4
+    return size + extra
+
+
+def _compute_inference_only_activation_memory(
+    context_length: int, info: ModelLayersInfo
+) -> float:
+    hidden_dim = info.hidden_dim
+    heads = info.heads
+    ret = (
+        (context_length * hidden_dim * 5 * 2 + (context_length**2) * heads * 2)
+        / 1024
+        / 1024
+    )
+    return ret
+
+
+def _compute_model_size_ggml(info: ModelLayersInfo, quantization: str) -> float:
+    assert quantization is not None
+    vocab_size = info.vocab_size
+    num_layers = info.num_layers
+    hidden_dim = info.hidden_dim
+    inter_dim = info.inter_dim
+    total_params = int(
+        vocab_size * hidden_dim * 2
+        + num_layers * 4 * (hidden_dim**2)
+        + num_layers * 3 * inter_dim * hidden_dim
+    )
+    other_v_down_params = (
+        num_layers * (hidden_dim**2) + num_layers * hidden_dim * inter_dim
+    )
+    other_param_q2k = (
+        total_params - (hidden_dim**2) * num_layers * 2 + 2 * vocab_size * hidden_dim
+    )
+
+    total = 0.0
+    v1 = GGML_MULTI_FACTOR_DICT.get(quantization)
+    if v1 is not None:
+        total = (v1 * total_params) / (32 * 1024 * 1024)
+    v2 = GGML_MULTI_FACTOR_DICT_64.get(quantization)
+    if v2 is not None:
+        total = (v2 * total_params) / (64 * 1024 * 1024)
+    v3 = GGML_MULTI_FACTOR_DICT_COMBINE.get(quantization)
+    if v3 is not None:
+        factors = v3
+        if quantization == "q2_K":
+            total = (
+                (total_params - other_param_q2k) * factors[1]
+                + other_param_q2k * factors[0]
+            ) / (64 * 1024 * 1024)
+        else:
+            total = (
+                (total_params - other_v_down_params) * factors[1]
+                + other_v_down_params * factors[0]
+            ) / (64 * 1024 * 1024)
+    return total