xinference 1.6.0.post1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +79 -2
- xinference/client/restful/restful_client.py +65 -3
- xinference/conftest.py +0 -7
- xinference/core/media_interface.py +132 -8
- xinference/core/model.py +44 -6
- xinference/core/scheduler.py +1 -10
- xinference/core/supervisor.py +8 -17
- xinference/core/worker.py +5 -27
- xinference/deploy/cmdline.py +6 -2
- xinference/model/audio/chattts.py +24 -39
- xinference/model/audio/cosyvoice.py +18 -30
- xinference/model/audio/funasr.py +42 -0
- xinference/model/audio/model_spec.json +71 -1
- xinference/model/audio/model_spec_modelscope.json +76 -2
- xinference/model/audio/utils.py +75 -0
- xinference/model/core.py +1 -0
- xinference/model/embedding/__init__.py +74 -18
- xinference/model/embedding/core.py +98 -589
- xinference/model/embedding/embed_family.py +133 -0
- xinference/{thirdparty/omnilmm/train → model/embedding/flag}/__init__.py +1 -1
- xinference/model/embedding/flag/core.py +282 -0
- xinference/model/embedding/model_spec.json +24 -0
- xinference/model/embedding/model_spec_modelscope.json +24 -0
- xinference/model/embedding/sentence_transformers/__init__.py +13 -0
- xinference/model/embedding/sentence_transformers/core.py +399 -0
- xinference/model/embedding/vllm/core.py +95 -0
- xinference/model/image/model_spec.json +30 -3
- xinference/model/image/model_spec_modelscope.json +41 -2
- xinference/model/image/stable_diffusion/core.py +144 -53
- xinference/model/llm/__init__.py +6 -54
- xinference/model/llm/core.py +19 -5
- xinference/model/llm/llama_cpp/core.py +59 -3
- xinference/model/llm/llama_cpp/memory.py +457 -0
- xinference/model/llm/llm_family.json +247 -402
- xinference/model/llm/llm_family.py +88 -16
- xinference/model/llm/llm_family_modelscope.json +260 -421
- xinference/model/llm/llm_family_openmind_hub.json +0 -34
- xinference/model/llm/sglang/core.py +8 -0
- xinference/model/llm/transformers/__init__.py +27 -6
- xinference/model/llm/transformers/chatglm.py +4 -2
- xinference/model/llm/transformers/core.py +49 -28
- xinference/model/llm/transformers/deepseek_v2.py +6 -49
- xinference/model/llm/transformers/gemma3.py +119 -164
- xinference/model/llm/transformers/multimodal/__init__.py +13 -0
- xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
- xinference/model/llm/transformers/multimodal/core.py +205 -0
- xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
- xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
- xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
- xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
- xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
- xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
- xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
- xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
- xinference/model/llm/transformers/opt.py +4 -2
- xinference/model/llm/transformers/utils.py +6 -37
- xinference/model/llm/utils.py +11 -0
- xinference/model/llm/vllm/core.py +7 -0
- xinference/model/rerank/core.py +91 -3
- xinference/model/rerank/model_spec.json +24 -0
- xinference/model/rerank/model_spec_modelscope.json +24 -0
- xinference/model/rerank/utils.py +20 -2
- xinference/model/utils.py +38 -1
- xinference/model/video/diffusers.py +65 -3
- xinference/model/video/model_spec.json +31 -4
- xinference/model/video/model_spec_modelscope.json +32 -4
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.013f296b.css +2 -0
- xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
- xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
- xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
- xinference/web/ui/src/locales/en.json +21 -8
- xinference/web/ui/src/locales/ja.json +224 -0
- xinference/web/ui/src/locales/ko.json +224 -0
- xinference/web/ui/src/locales/zh.json +21 -8
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/METADATA +14 -11
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/RECORD +93 -100
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/WHEEL +1 -1
- xinference/model/llm/transformers/cogvlm2.py +0 -442
- xinference/model/llm/transformers/cogvlm2_video.py +0 -333
- xinference/model/llm/transformers/deepseek_vl.py +0 -280
- xinference/model/llm/transformers/glm_edge_v.py +0 -213
- xinference/model/llm/transformers/intern_vl.py +0 -526
- xinference/model/llm/transformers/internlm2.py +0 -94
- xinference/model/llm/transformers/minicpmv25.py +0 -193
- xinference/model/llm/transformers/omnilmm.py +0 -132
- xinference/model/llm/transformers/qwen2_audio.py +0 -179
- xinference/model/llm/transformers/qwen_vl.py +0 -360
- xinference/thirdparty/omnilmm/LICENSE +0 -201
- xinference/thirdparty/omnilmm/chat.py +0 -218
- xinference/thirdparty/omnilmm/constants.py +0 -4
- xinference/thirdparty/omnilmm/conversation.py +0 -332
- xinference/thirdparty/omnilmm/model/__init__.py +0 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
- xinference/thirdparty/omnilmm/model/resampler.py +0 -166
- xinference/thirdparty/omnilmm/model/utils.py +0 -578
- xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
- xinference/thirdparty/omnilmm/utils.py +0 -134
- xinference/web/ui/build/static/css/main.337afe76.css +0 -2
- xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
- xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
- xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
- /xinference/{thirdparty/omnilmm → model/embedding/vllm}/__init__.py +0 -0
- /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/llama_cpp/memory.py (new file)
@@ -0,0 +1,457 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from collections.abc import Sequence
+from dataclasses import dataclass
+from typing import Any
+
+try:
+    from gguf import GGUFReader, GGUFValueType  # noqa: E402
+except ImportError:
+    GGUFReader = GGUFValueType = None
+logger = logging.getLogger(__name__)
+
+
+def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]:
+    file_endian = reader.endianess.name  # codespell:ignore
+    if reader.byte_order == "S":
+        host_endian = "BIG" if file_endian == "LITTLE" else "LITTLE"
+    else:
+        host_endian = file_endian
+    return (host_endian, file_endian)
+
+
+def dump_metadata_json(reader: GGUFReader, model_path: str) -> dict:
+    host_endian, file_endian = get_file_host_endian(reader)
+    metadata: dict[str, Any] = {}
+    tensors: dict[str, Any] = {}
+    result = {
+        "filename": model_path,
+        "endian": file_endian,
+        "metadata": metadata,
+        "tensors": tensors,
+    }
+    for idx, field in enumerate(reader.fields.values()):
+        curr: dict[str, Any] = {
+            "index": idx,
+            "type": field.types[0].name if field.types else "UNKNOWN",
+            "offset": field.offset,
+        }
+        metadata[field.name] = curr
+        if field.types[:1] == [GGUFValueType.ARRAY]:
+            curr["array_types"] = [t.name for t in field.types][1:]
+            curr["value"] = field.contents()
+        else:
+            curr["value"] = field.contents()
+    for i, tensor in enumerate(reader.tensors):
+        tensors[tensor.name] = {
+            "index": i,
+            "shape": tensor.shape.tolist(),
+            "type": tensor.tensor_type.name,
+            "offset": tensor.field.offset,
+            "n_bytes": tensor.n_bytes,
+        }
+    return result
+
+
+@dataclass
+class MemoryEstimate:
+    # How many layers we predict we can load
+    layers: int
+    # The size of the graph which occupies the main GPU
+    graph: int
+    # How much VRAM will be allocated given the number of layers we predict
+    vram_size: int
+    # The total size of the model if loaded into VRAM. If all layers are loaded, vram_size == total_size
+    total_size: int
+    # For multi-GPU scenarios, this provides the tensor split parameter
+    tensor_split: str
+    # For multi-GPU scenarios, this is the size in bytes per GPU
+    gpu_sizes: list[int]
+
+
+def _get_max_min(value):
+    if isinstance(value, Sequence):
+        return max(value), min(value)
+    else:
+        return value, value
+
+
+def graph_size(
+    data: dict,
+    context_length: int,
+    batch_size: int,
+    num_parallel: int,
+    kv_cache_type: str,
+):
+    """
+    Most of the logic comes from `GraphSize` in https://github.com/ollama/ollama/blob/main/fs/ggml/ggml.go
+    """
+    if context_length < batch_size:
+        batch_size = context_length
+
+    metadata = data["metadata"]
+    architecture = metadata["general.architecture"]["value"]
+    embedding_length = metadata[f"{architecture}.embedding_length"]["value"]
+    block_count = metadata[f"{architecture}.block_count"]["value"]
+    head_count_max, head_count_min = _get_max_min(
+        metadata[f"{architecture}.attention.head_count"]["value"]
+    )
+    head_count_kv_max, head_count_kv_min = _get_max_min(
+        metadata[f"{architecture}.attention.head_count_kv"]["value"]
+    )
+    vocab = len(metadata["tokenizer.ggml.tokens"]["value"])
+    embedding_head_count_max = (
+        (embedding_length // head_count_min) if head_count_min > 0 else 0
+    )
+    embedding_head_count_k = metadata.get(
+        f"{architecture}.attention.key_length", {}
+    ).get("value", embedding_head_count_max)
+    embedding_head_count_v = metadata.get(
+        f"{architecture}.attention.value_length", {}
+    ).get("value", embedding_head_count_max)
+
+    # f16(default)
+    bytes_per_kv_element = {
+        "q8_0": 1,  # 1/2 of fp16
+        "q4_0": 0.5,  # 1/4 of fp16
+    }.get(kv_cache_type, 2)
+
+    kv = [0] * block_count
+    for i in range(block_count):
+        kv[i] = (
+            context_length
+            * (embedding_head_count_k + embedding_head_count_v)
+            * head_count_kv_max
+            * bytes_per_kv_element
+        )
+
+    full_offload = 0
+    partial_offload = 0
+    if architecture in ["llama", "llama4"]:
+        full_offload = max(
+            4
+            * batch_size
+            * (1 + 4 * embedding_length + context_length * (1 + head_count_max)),
+            4 * batch_size * (embedding_length + vocab),
+        )
+        partial_offload = 4 * batch_size * embedding_length
+        partial_offload += max(
+            4
+            * batch_size
+            * (1 + embedding_length + max(context_length, embedding_length))
+            + embedding_length * embedding_length * 9 / 16
+            + 4
+            * context_length
+            * (
+                batch_size * head_count_max
+                + embedding_head_count_max * head_count_kv_max
+            ),
+            4 * batch_size * (embedding_length + vocab)
+            + embedding_length * vocab * 105 / 128,
+        )
+    elif architecture in ["gemma", "gemma2", "gemma3"]:
+        full_offload = max(
+            4 * batch_size * (embedding_length + vocab),
+            4
+            * batch_size
+            * (
+                2
+                + context_length
+                + context_length * head_count_max
+                + 2 * embedding_length
+                + 2 * embedding_head_count_k * head_count_max
+            ),
+        )
+        partial_offload = max(
+            4 * embedding_length * batch_size
+            + embedding_length * vocab * 105 / 128
+            + 4 * vocab * batch_size,
+            4
+            * batch_size
+            * (
+                2 * embedding_length
+                + 1
+                + 2 * embedding_head_count_k * head_count_max
+                + context_length
+                + context_length * head_count_max
+            )
+            + 4 * embedding_head_count_k * context_length * 8
+            + embedding_length * embedding_head_count_k * head_count_max * 9 / 16,
+        )
+        if architecture == "gemma3":
+            gemma3_global_cache_count = 6
+            sliding_window = (
+                num_parallel
+                * metadata[f"{architecture}.attention.sliding_window"]["value"]
+                + batch_size
+            )
+            for i in range(block_count):
+                if (i + 1) % gemma3_global_cache_count != 0:
+                    kv[i] = (
+                        sliding_window
+                        * (embedding_head_count_k + embedding_head_count_v)
+                        * head_count_kv_max
+                        * bytes_per_kv_element
+                    )
+    elif architecture == "qwen2":
+        full_offload = max(
+            4 * batch_size * (embedding_length + vocab),
+            4
+            * batch_size
+            * (
+                1
+                + 2 * embedding_length
+                + context_length
+                + context_length * head_count_max
+            ),
+        )
+
+        partial_offload = max(
+            4 * batch_size * (embedding_length + vocab)
+            + embedding_length * vocab * 105 / 128,
+            4
+            * (
+                batch_size
+                * (1 + 2 * embedding_length + context_length * (1 + head_count_max))
+                + embedding_length * (1 + context_length)
+            ),
+        )
+    elif architecture == "stablelm":
+        full_offload = (
+            4
+            * batch_size
+            * (context_length * (1 + head_count_max) + 3 * embedding_length + 2)
+        )
+        partial_offload = max(
+            4 * batch_size * (vocab + 2 * embedding_length), full_offload
+        )
+    elif architecture == "deepseek2":
+        full_offload = max(
+            4 * batch_size * (3 * embedding_length + vocab),
+            4
+            * batch_size
+            * (
+                3 * embedding_length
+                + 2
+                + context_length * (1 + head_count_kv_max)
+                + 2 * embedding_head_count_k * head_count_kv_max
+            ),
+        )
+
+        partial_offload = max(
+            4 * batch_size * (3 * embedding_length + vocab)
+            + embedding_length * vocab * 105 / 128,
+            4
+            * batch_size
+            * (
+                2 * embedding_length
+                + 1
+                + 2 * embedding_head_count_k * head_count_kv_max
+                + context_length
+                + context_length * head_count_kv_max
+            )
+            + 4 * embedding_head_count_k * context_length * head_count_kv_max
+            + embedding_length * embedding_head_count_k * head_count_kv_max * 9 / 16,
+        )
+
+    kv_total = sum(kv)
+    if partial_offload == 0:
+        partial_offload = (
+            head_count_max
+            / (1 if head_count_kv_min <= 0 else head_count_kv_min)
+            * kv_total
+            / 6
+        )
+    if full_offload == 0:
+        full_offload = partial_offload
+
+    return kv, partial_offload, full_offload
+
+
+def projector_memory_requirements(projector: str):
+    reader = GGUFReader(projector, "r")
+    data = dump_metadata_json(reader, projector)
+    return sum(t["n_bytes"] for t in data["tensors"].values())
+
+
+def estimate_gpu_layers(
+    gpus: list[dict],
+    model_path: str,
+    projectors: list[str],
+    context_length: int,
+    batch_size: int,
+    num_parallel: int,
+    kv_cache_type: str,
+):
+    """
+    Most of the logic comes from `EstimateGPULayers` in https://github.com/ollama/ollama/blob/main/llm/memory.go
+    """
+    # Projectors loaded into GPU0 only
+    projector_weights = sum(map(projector_memory_requirements, projectors))
+    if projector_weights > 0:
+        # Multimodal models require at least 2048 context
+        context_length = max(context_length, 2048)
+    reader = GGUFReader(model_path, "r")
+    data = dump_metadata_json(reader, model_path)
+    kv, graph_partial_offload, graph_full_offload = graph_size(
+        data,
+        context_length=context_length,
+        batch_size=batch_size,
+        num_parallel=num_parallel,
+        kv_cache_type=kv_cache_type,
+    )
+    # Get all layer sizes
+    metadata = data["metadata"]
+    architecture = metadata["general.architecture"]["value"]
+    block_count = metadata[f"{architecture}.block_count"]["value"]
+    layer_sizes = [0] * block_count
+    for name, layer in data["tensors"].items():
+        if name.startswith("blk."):
+            index = int(name[len("blk.") :].split(".")[0])
+            layer_sizes[index] += layer["n_bytes"]
+    layer_size = layer_sizes[0] if layer_sizes else 0
+
+    if len(kv) > 0:
+        layer_size += kv[0]
+    # On metal there's no partial offload overhead
+    if gpus[0]["name"] == "Metal":
+        graph_partial_offload = graph_full_offload
+    elif len(gpus) > 1:
+        # Multi gpu should always use the partial graph size
+        graph_full_offload = graph_partial_offload
+
+    # Get output layer size
+    memory_layer_output = 0
+    # Output layer handled at the end if we have space
+    for name, layer in data["tensors"].items():
+        if any(
+            name.startswith(prefix)
+            for prefix in ["output_norm", "output", "token_embd"]
+        ):
+            memory_layer_output += layer["n_bytes"]
+
+    # Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
+    default_memory_min = 512 * 1024**2
+    gpu_allocations = [0] * len(gpus)
+    gpus_with_space: list[int] = []
+    for i in range(len(gpus)):
+        gpu0_overhead = projector_weights if len(gpus_with_space) == 0 else 0
+        minimum_memory = gpus[i].get("memory_min", default_memory_min)
+        if (
+            gpus[i]["memory_free"]
+            < gpu0_overhead
+            + max(graph_partial_offload, graph_full_offload)
+            + minimum_memory
+            + 2 * layer_size
+        ):
+            continue
+        gpus_with_space.append(i)
+        gpu_allocations[i] += gpu0_overhead + minimum_memory + layer_size
+
+    overflow = 0
+    if len(gpus_with_space) == 0:
+        overflow = projector_weights
+
+    # For all the layers, find where they can fit on the GPU(s)
+    layer_count = 0
+    layer_counts = [0] * len(gpus)
+    for i in range(block_count - 1, -1, -1):
+        layer_size = layer_sizes[i]
+        layer_size += kv[i]
+
+        # Distribute the layers across the GPU(s) that have space
+        for j in range(len(gpus_with_space), 0, -1):
+            g = gpus_with_space[i % j]
+            used = gpu_allocations[g] + max(graph_partial_offload, graph_full_offload)
+            if gpus[g]["memory_free"] > used + layer_size:
+                gpu_allocations[g] += layer_size
+                layer_counts[g] += 1
+                layer_count += 1
+                break
+            else:
+                gpus_with_space = (
+                    gpus_with_space[: i % j] + gpus_with_space[i % j + 1 :]
+                )
+
+        if len(gpus_with_space) == 0:
+            overflow += layer_size
+
+    fully_loaded = False
+    if layer_count >= block_count:
+        fully_loaded = True
+
+    # Determine if we need to consider output then find where it fits
+    if memory_layer_output > 0:
+        for j in range(len(gpus_with_space), 0, -1):
+            g = gpus_with_space[layer_count % j]
+            used = gpu_allocations[g] + max(graph_partial_offload, graph_full_offload)
+            if gpus[g]["memory_free"] > used + memory_layer_output:
+                gpu_allocations[g] += memory_layer_output
+                layer_counts[g] += 1
+                layer_count += 1
+                break
+            else:
+                gpus_with_space = (
+                    gpus_with_space[: layer_count % j]
+                    + gpus_with_space[layer_count % j + 1 :]
+                )
+
+        if layer_count < block_count + 1:
+            fully_loaded = False
+            overflow += memory_layer_output
+
+    # Add the applicable (full or partial) graph allocations
+    for i in range(len(gpus)):
+        if layer_counts[i] <= 0:
+            continue
+        if fully_loaded:
+            gpu_allocations[i] += graph_full_offload
+        else:
+            gpu_allocations[i] += graph_partial_offload
+
+    if fully_loaded:
+        graph_offload = graph_full_offload
+    else:
+        graph_offload = graph_partial_offload
+
+    # Summaries
+    memory_required_partial = sum(gpu_allocations)
+    memory_required_total = memory_required_partial + overflow
+
+    tensor_split = ""
+    if len(gpus) > 1:
+        tensor_split = ",".join(str(c) for c in layer_counts)
+
+    estimate = MemoryEstimate(
+        layers=0,
+        graph=0,
+        vram_size=0,
+        total_size=int(memory_required_total),
+        tensor_split="",
+        gpu_sizes=[],
+    )
+    if gpus[0]["name"] == "CPU":
+        return estimate
+    if layer_count == 0:
+        return estimate
+
+    estimate.layers = layer_count
+    estimate.graph = int(graph_offload)
+    estimate.vram_size = int(memory_required_partial)
+    estimate.total_size = int(memory_required_total)
+    estimate.tensor_split = tensor_split
+    estimate.gpu_sizes = [int(i) for i in gpu_allocations]
+    return estimate
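
For orientation, the estimator added above mirrors ollama's `GraphSize` and `EstimateGPULayers` logic for GGUF models. Below is a minimal, hypothetical sketch of how it might be invoked; the import path follows the file location in this diff, while the device name, free-memory figure, model path, and parameter values are placeholders, and the optional gguf package must be installed for GGUFReader to be available.

# Hypothetical usage sketch; names and sizes are placeholders, not part of the diff.
from xinference.model.llm.llama_cpp.memory import estimate_gpu_layers

# One CUDA device with roughly 24 GiB free; keys mirror what estimate_gpu_layers reads.
gpus = [
    {
        "name": "NVIDIA GeForce RTX 4090",  # any name other than "CPU" / "Metal"
        "memory_free": 24 * 1024**3,        # free VRAM in bytes
        "memory_min": 512 * 1024**2,        # optional reserve; defaults to 512 MiB
    }
]

estimate = estimate_gpu_layers(
    gpus=gpus,
    model_path="/models/example-7b-q4_k_m.gguf",  # placeholder GGUF file
    projectors=[],        # no multimodal projector GGUFs
    context_length=8192,
    batch_size=512,
    num_parallel=1,
    kv_cache_type="f16",  # anything but "q8_0"/"q4_0" is costed at 2 bytes per KV element
)
print(estimate.layers, estimate.vram_size, estimate.tensor_split)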