xinference 0.11.1__py3-none-any.whl → 0.11.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (31)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +30 -0
  3. xinference/client/restful/restful_client.py +29 -0
  4. xinference/core/cache_tracker.py +12 -1
  5. xinference/core/supervisor.py +30 -2
  6. xinference/core/utils.py +12 -0
  7. xinference/core/worker.py +4 -1
  8. xinference/deploy/cmdline.py +126 -0
  9. xinference/deploy/test/test_cmdline.py +24 -0
  10. xinference/model/llm/__init__.py +2 -0
  11. xinference/model/llm/llm_family.json +501 -6
  12. xinference/model/llm/llm_family.py +84 -10
  13. xinference/model/llm/llm_family_modelscope.json +198 -7
  14. xinference/model/llm/memory.py +332 -0
  15. xinference/model/llm/pytorch/core.py +2 -0
  16. xinference/model/llm/pytorch/intern_vl.py +387 -0
  17. xinference/model/llm/utils.py +13 -0
  18. xinference/model/llm/vllm/core.py +5 -2
  19. xinference/model/rerank/core.py +23 -1
  20. xinference/model/utils.py +17 -7
  21. xinference/thirdparty/deepseek_vl/models/processing_vlm.py +1 -1
  22. xinference/thirdparty/deepseek_vl/models/siglip_vit.py +2 -2
  23. xinference/thirdparty/llava/mm_utils.py +3 -2
  24. xinference/thirdparty/llava/model/llava_arch.py +1 -1
  25. xinference/thirdparty/omnilmm/chat.py +6 -5
  26. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/METADATA +8 -7
  27. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/RECORD +31 -29
  28. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/LICENSE +0 -0
  29. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/WHEEL +0 -0
  30. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/entry_points.txt +0 -0
  31. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/memory.py (new file)
@@ -0,0 +1,332 @@
+ # Copyright 2022-2023 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # NOTE:
+ #
+ # The algorithm is ported from https://github.com/RahulSChand/gpu_poor
+ #
+ # Improvement:
+ #
+ # The original JS code only sizes the KV cache for float32, whereas in most cases we run models with float16 (hence the kv_cache_dtype parameter).
+ #
+ # Known issues:
+ #
+ # * On vLLM, some MHA models use less memory than calculated (qwen1.5-7B-chat-gptq-int4,
+ #   qwen1.5-14B-chat-gptq-int4 with large activation_mem).
+ #
+ # * On vLLM, the gemma-it-7B pytorch-format model uses more GPU memory than calculated.
+
+ import json
+ import math
+ from dataclasses import dataclass
+ from logging import getLogger
+ from math import ceil
+ from typing import Any, Optional, Union
+
+ from .llm_family import convert_model_size_to_float
+
+ logger = getLogger(__name__)
+
+
+ @dataclass
+ class ModelLayersInfo:
+     vocab_size: int
+     heads: int  # num_attention_heads, num_heads or n_head
+     hidden_dim: int  # hidden_size, d_model, or n_embd
+     inter_dim: int  # intermediate_size, n_inner or d_ff
+     num_layers: int  # num_layers, num_hidden_layers or n_layer
+
+
+ @dataclass
+ class ModelMemInfo:
+     """Memory required by model, unit in MB"""
+
+     model_mem: int
+     kv_cache_mem: int
+     activation_mem: int
+     overhead: int
+     total: int
+
+
+ QUANT_NORMALIZE = {"int4": "4-bit", "int8": "8-bit", "4-bit": "4-bit", "8-bit": "8-bit"}
+
+ GGML_MULTI_FACTOR_DICT = {
+     "q4_0": 18,
+     "q4_1": 20,
+     "q5_0": 22,
+     "q5_1": 24,
+     "q8_0": 34,
+     "q8_1": 40,
+ }
+
+ GGML_MULTI_FACTOR_DICT_64 = {
+     "q6_K": 54.0,
+     "q3": 26.0,
+     "q4": 38.0,
+     "q5": 46.0,
+ }
+
+ GGML_MULTI_FACTOR_DICT_COMBINE = {
+     "q3_K_L": [38.0, 26.0],
+     "q3_K_M": [46.0, 26.0],
+     "q4_K_S": [46.0, 38.0],
+     "q4_K_M": [54.0, 38.0],
+     "q5_K_M": [54.0, 46.0],
+     "q2_K": [26.0, 22.0],
+ }
+
+
+ # Return gpu memory in MB
+ def estimate_llm_gpu_memory(
+     model_size_in_billions: Union[str, int],
+     quantization: Optional[str],
+     context_length: int,  # input+output
+     model_format: str,
+     model_name: Optional[str] = None,
+     kv_cache_dtype: int = 16,
+ ) -> Optional[ModelMemInfo]:
+     """
+     model_size_in_billions: must be str like 1_8 or 46_7, to match llm.
+     """
+     info = get_model_layers_info(
+         model_size_in_billions,
+         model_name,
+         model_format,
+         quantization,
+     )
+     if info is None:
+         return None
+     size_in_billions = convert_model_size_to_float(model_size_in_billions)
+     return estimate_llm_gpu_memory_details(
+         info,
+         size_in_billions,
+         quantization,
+         context_length,
+         model_format,
+         kv_cache_dtype,
+     )
+
+
+ def estimate_llm_gpu_memory_details(
+     info: ModelLayersInfo,
+     size_in_billions: float,
+     quantization: Optional[str],
+     context_length: int,  # input+output
+     model_format: str,
+     kv_cache_dtype: int = 16,
+ ) -> ModelMemInfo:
+     """return model_mem, kv_cache, overhead, activation_mem"""
+     if kv_cache_dtype not in [8, 16, 32]:
+         raise ValueError(f"Invalid kv_cache_dtype {kv_cache_dtype}")
+     if kv_cache_dtype == 8:
+         kv_dtype_size = 1
+     elif kv_cache_dtype == 16:
+         kv_dtype_size = 2
+     else:
+         kv_dtype_size = 4
+     overhead = 650.0
+     if model_format == "ggmlv3":
+         assert quantization is not None and quantization != "none"
+         model_size_in_mb = _compute_model_size_ggml(info, quantization)
+         inference_mem = float(
+             context_length * kv_dtype_size * info.hidden_dim * info.num_layers
+         )
+         inference_mem = inference_mem / 1024.0 / 1024.0
+         activation_mem = _compute_inference_only_activation_memory(context_length, info)
+         overhead = overhead + context_length * 0.1
+     else:
+         if quantization is not None:
+             assert isinstance(quantization, str)
+             quantization = QUANT_NORMALIZE[quantization.lower()]
+             assert quantization is not None
+
+         model_size = size_in_billions * 1000000000.0
+         model_size_in_mb = _convert_to_mb_model_size(model_size, quantization)
+         # KV cache
+         inference_mem = float(
+             context_length * 2 * kv_dtype_size * info.hidden_dim * info.num_layers
+         )
+         inference_mem = inference_mem / 1024.0 / 1024.0
+         activation_mem = _compute_inference_only_activation_memory(context_length, info)
+
+     total_mem = ceil(inference_mem + model_size_in_mb + overhead + activation_mem)
+     return ModelMemInfo(
+         model_mem=ceil(model_size_in_mb),
+         kv_cache_mem=ceil(inference_mem),
+         activation_mem=ceil(activation_mem),
+         overhead=ceil(overhead),
+         total=total_mem,
+     )
+
+
+ def _load_item_from_json(config_data: Any, *keys: str) -> str:
+     assert len(keys) > 0
+     for key in keys:
+         v = config_data.get(key)
+         if v is not None:
+             return v
+     raise ValueError("load ModelLayersInfo: missing %s" % (keys[0]))
+
+
+ def load_model_config_json(config_path: str) -> ModelLayersInfo:
+     with open(config_path, "r") as f:
+         config_data = json.load(f)
+     return ModelLayersInfo(
+         vocab_size=int(_load_item_from_json(config_data, "vocab_size")),
+         heads=int(
+             _load_item_from_json(
+                 config_data, "num_key_value_heads", "num_attention_heads"
+             )
+         ),
+         hidden_dim=int(
+             _load_item_from_json(config_data, "hidden_size", "d_model", "n_embd")
+         ),
+         inter_dim=int(_load_item_from_json(config_data, "intermediate_size")),
+         num_layers=int(
+             _load_item_from_json(
+                 config_data, "num_hidden_layers", "num_layers", "n_layer"
+             )
+         ),
+     )
+
+
+ def get_model_layers_info(
+     model_size_in_billions: Union[str, int],
+     model_name: Optional[str],
+     model_format: Optional[str],
+     quantization: Optional[str],
+ ) -> Optional[ModelLayersInfo]:
+     from . import match_llm
+     from .llm_family import cache_model_config
+
+     if not model_name:
+         logger.debug("get_model_layers_info by default size=%s", model_size_in_billions)
+         size_in_billions = convert_model_size_to_float(model_size_in_billions)
+         return _get_default_layers_from_size(size_in_billions)
+     match_result = match_llm(
+         model_name=model_name,
+         model_format=model_format,
+         model_size_in_billions=model_size_in_billions,
+         quantization=quantization,
+     )
+     if not match_result:
+         return None
+     llm_family, llm_spec, _quant = match_result
+     config_path = cache_model_config(llm_family, llm_spec)
+     return load_model_config_json(config_path)
+
+
+ def _get_default_layers_from_size(size_in_billion: float) -> ModelLayersInfo:
+     if size_in_billion < 5:
+         vocab_size = 32000
+         heads = 32
+         num_layers = 24
+     elif size_in_billion < 10:
+         vocab_size = 32000
+         heads = 32
+         num_layers = 32
+     elif size_in_billion < 24:
+         vocab_size = 32000
+         heads = 40
+         num_layers = 40
+     elif size_in_billion < 55:
+         vocab_size = 32000
+         heads = 60
+         num_layers = 48
+     else:
+         vocab_size = 32000
+         heads = 64
+         num_layers = 80
+
+     model_size = int(size_in_billion * 1000000000)
+     A = num_layers * 4 + 3 * 4 * num_layers
+     B = 2 * vocab_size
+     C = -1 * model_size
+     h = (-B + math.sqrt(B**2 - 4 * A * C)) / (2 * A)
+     h = math.ceil(h)
+     return ModelLayersInfo(
+         vocab_size=vocab_size,
+         heads=heads,
+         hidden_dim=h,
+         inter_dim=4 * h,
+         num_layers=num_layers,
+     )
+
+
+ def _convert_to_mb_model_size(model_size: float, quantization: Optional[str]) -> float:
+     extra = 0.0
+     fB = 2.0
+     size = (model_size * fB) / (1024.0 * 1024.0)
+     # bnb_q4 == 4-bit ?
+     if quantization == "8-bit" or quantization == "4-bit":
+         extra = 0.06 * size
+     if quantization == "8-bit":
+         size = size / 2
+     if quantization == "4-bit":
+         size = size / 4
+     return size + extra
+
+
+ def _compute_inference_only_activation_memory(
+     context_length: int, info: ModelLayersInfo
+ ) -> float:
+     hidden_dim = info.hidden_dim
+     heads = info.heads
+     ret = (
+         (context_length * hidden_dim * 5 * 2 + (context_length**2) * heads * 2)
+         / 1024
+         / 1024
+     )
+     return ret
+
+
+ def _compute_model_size_ggml(info: ModelLayersInfo, quantization: str) -> float:
+     assert quantization is not None
+     vocab_size = info.vocab_size
+     num_layers = info.num_layers
+     hidden_dim = info.hidden_dim
+     inter_dim = info.inter_dim
+     total_params = int(
+         vocab_size * hidden_dim * 2
+         + num_layers * 4 * (hidden_dim**2)
+         + num_layers * 3 * inter_dim * hidden_dim
+     )
+     other_v_down_params = (
+         num_layers * (hidden_dim**2) + num_layers * hidden_dim * inter_dim
+     )
+     other_param_q2k = (
+         total_params - (hidden_dim**2) * num_layers * 2 + 2 * vocab_size * hidden_dim
+     )
+
+     total = 0.0
+     v1 = GGML_MULTI_FACTOR_DICT.get(quantization)
+     if v1 is not None:
+         total = (v1 * total_params) / (32 * 1024 * 1024)
+     v2 = GGML_MULTI_FACTOR_DICT_64.get(quantization)
+     if v2 is not None:
+         total = (v2 * total_params) / (64 * 1024 * 1024)
+     v3 = GGML_MULTI_FACTOR_DICT_COMBINE.get(quantization)
+     if v3 is not None:
+         factors = v3
+         if quantization == "q2_K":
+             total = (
+                 (total_params - other_param_q2k) * factors[1]
+                 + other_param_q2k * factors[0]
+             ) / (64 * 1024 * 1024)
+         else:
+             total = (
+                 (total_params - other_v_down_params) * factors[1]
+                 + other_v_down_params * factors[0]
+             ) / (64 * 1024 * 1024)
+     return total
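
For context, a minimal sketch of how the new estimator might be called (not part of the diff): the module path, function signature, and ModelMemInfo fields come from the hunk above, while the concrete model name, size, and context length are illustrative assumptions.

from xinference.model.llm.memory import estimate_llm_gpu_memory

mem = estimate_llm_gpu_memory(
    model_size_in_billions=7,     # int, or a str such as "1_8" for fractional sizes
    quantization=None,            # None, "int4"/"int8"/"4-bit"/"8-bit", or a GGML quant for ggmlv3
    context_length=4096,          # input + output tokens
    model_format="pytorch",
    model_name="llama-2-chat",    # assumed name; it must match an entry in llm_family.json
    kv_cache_dtype=16,            # float16 KV cache, the default
)
if mem is not None:
    # all fields are in MB
    print(mem.model_mem, mem.kv_cache_mem, mem.activation_mem, mem.overhead, mem.total)

# Rough hand check of the KV-cache term for a llama-7B-like shape
# (hidden_dim=4096, num_layers=32, float16 -> 2 bytes per element):
#     4096 tokens * 2 (K and V) * 2 bytes * 4096 * 32 / 1024 / 1024 = 2048 MB

Note that passing quantization=None for a pytorch-format model skips the QUANT_NORMALIZE lookup, so the 16-bit weight size is used as-is.
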
xinference/model/llm/__init__.py
@@ -60,6 +60,8 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
      "OmniLMM",
      "yi-vl-chat",
      "deepseek-vl-chat",
+     "internvl-chat",
+     "mini-internvl-chat",
  ]
