xinference 1.7.1__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- xinference/_version.py +3 -3
- xinference/client/restful/async_restful_client.py +8 -13
- xinference/client/restful/restful_client.py +6 -2
- xinference/core/chat_interface.py +6 -4
- xinference/core/media_interface.py +5 -0
- xinference/core/model.py +1 -5
- xinference/core/supervisor.py +117 -68
- xinference/core/worker.py +49 -37
- xinference/deploy/test/test_cmdline.py +2 -6
- xinference/model/audio/__init__.py +26 -23
- xinference/model/audio/chattts.py +3 -2
- xinference/model/audio/core.py +49 -98
- xinference/model/audio/cosyvoice.py +3 -2
- xinference/model/audio/custom.py +28 -73
- xinference/model/audio/f5tts.py +3 -2
- xinference/model/audio/f5tts_mlx.py +3 -2
- xinference/model/audio/fish_speech.py +3 -2
- xinference/model/audio/funasr.py +17 -4
- xinference/model/audio/kokoro.py +3 -2
- xinference/model/audio/megatts.py +3 -2
- xinference/model/audio/melotts.py +3 -2
- xinference/model/audio/model_spec.json +572 -171
- xinference/model/audio/utils.py +0 -6
- xinference/model/audio/whisper.py +3 -2
- xinference/model/audio/whisper_mlx.py +3 -2
- xinference/model/cache_manager.py +141 -0
- xinference/model/core.py +6 -49
- xinference/model/custom.py +174 -0
- xinference/model/embedding/__init__.py +67 -56
- xinference/model/embedding/cache_manager.py +35 -0
- xinference/model/embedding/core.py +104 -84
- xinference/model/embedding/custom.py +55 -78
- xinference/model/embedding/embed_family.py +80 -31
- xinference/model/embedding/flag/core.py +21 -5
- xinference/model/embedding/llama_cpp/__init__.py +0 -0
- xinference/model/embedding/llama_cpp/core.py +234 -0
- xinference/model/embedding/model_spec.json +968 -103
- xinference/model/embedding/sentence_transformers/core.py +30 -20
- xinference/model/embedding/vllm/core.py +11 -5
- xinference/model/flexible/__init__.py +8 -2
- xinference/model/flexible/core.py +26 -119
- xinference/model/flexible/custom.py +69 -0
- xinference/model/flexible/launchers/image_process_launcher.py +1 -0
- xinference/model/flexible/launchers/modelscope_launcher.py +5 -1
- xinference/model/flexible/launchers/transformers_launcher.py +15 -3
- xinference/model/flexible/launchers/yolo_launcher.py +5 -1
- xinference/model/image/__init__.py +20 -20
- xinference/model/image/cache_manager.py +62 -0
- xinference/model/image/core.py +70 -182
- xinference/model/image/custom.py +28 -72
- xinference/model/image/model_spec.json +402 -119
- xinference/model/image/ocr/got_ocr2.py +3 -2
- xinference/model/image/stable_diffusion/core.py +22 -7
- xinference/model/image/stable_diffusion/mlx.py +6 -6
- xinference/model/image/utils.py +2 -2
- xinference/model/llm/__init__.py +71 -94
- xinference/model/llm/cache_manager.py +292 -0
- xinference/model/llm/core.py +37 -111
- xinference/model/llm/custom.py +88 -0
- xinference/model/llm/llama_cpp/core.py +5 -7
- xinference/model/llm/llm_family.json +16260 -8151
- xinference/model/llm/llm_family.py +138 -839
- xinference/model/llm/lmdeploy/core.py +5 -7
- xinference/model/llm/memory.py +3 -4
- xinference/model/llm/mlx/core.py +6 -8
- xinference/model/llm/reasoning_parser.py +3 -1
- xinference/model/llm/sglang/core.py +32 -14
- xinference/model/llm/transformers/chatglm.py +3 -7
- xinference/model/llm/transformers/core.py +49 -27
- xinference/model/llm/transformers/deepseek_v2.py +2 -2
- xinference/model/llm/transformers/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/cogagent.py +2 -2
- xinference/model/llm/transformers/multimodal/deepseek_vl2.py +2 -2
- xinference/model/llm/transformers/multimodal/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/glm4_1v.py +167 -0
- xinference/model/llm/transformers/multimodal/glm4v.py +2 -2
- xinference/model/llm/transformers/multimodal/intern_vl.py +2 -2
- xinference/model/llm/transformers/multimodal/minicpmv26.py +3 -3
- xinference/model/llm/transformers/multimodal/ovis2.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen-omni.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
- xinference/model/llm/transformers/opt.py +3 -7
- xinference/model/llm/utils.py +34 -49
- xinference/model/llm/vllm/core.py +77 -27
- xinference/model/llm/vllm/xavier/engine.py +5 -3
- xinference/model/llm/vllm/xavier/scheduler.py +10 -6
- xinference/model/llm/vllm/xavier/transfer.py +1 -1
- xinference/model/rerank/__init__.py +26 -25
- xinference/model/rerank/core.py +47 -87
- xinference/model/rerank/custom.py +25 -71
- xinference/model/rerank/model_spec.json +158 -33
- xinference/model/rerank/utils.py +2 -2
- xinference/model/utils.py +115 -54
- xinference/model/video/__init__.py +13 -17
- xinference/model/video/core.py +44 -102
- xinference/model/video/diffusers.py +4 -3
- xinference/model/video/model_spec.json +90 -21
- xinference/types.py +5 -3
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.7d24df53.js +3 -0
- xinference/web/ui/build/static/js/main.7d24df53.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2704ff66a5f73ca78b341eb3edec60154369df9d87fbc8c6dd60121abc5e1b0a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/607dfef23d33e6b594518c0c6434567639f24f356b877c80c60575184ec50ed0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9be3d56173aacc3efd0b497bcb13c4f6365de30069176ee9403b40e717542326.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9f9dd6c32c78a222d07da5987ae902effe16bcf20aac00774acdccc4de3c9ff2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b2ab5ee972c60d15eb9abf5845705f8ab7e1d125d324d9a9b1bcae5d6fd7ffb2.json +1 -0
- xinference/web/ui/src/locales/en.json +0 -1
- xinference/web/ui/src/locales/ja.json +0 -1
- xinference/web/ui/src/locales/ko.json +0 -1
- xinference/web/ui/src/locales/zh.json +0 -1
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/METADATA +9 -11
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/RECORD +119 -119
- xinference/model/audio/model_spec_modelscope.json +0 -231
- xinference/model/embedding/model_spec_modelscope.json +0 -293
- xinference/model/embedding/utils.py +0 -18
- xinference/model/image/model_spec_modelscope.json +0 -375
- xinference/model/llm/llama_cpp/memory.py +0 -457
- xinference/model/llm/llm_family_csghub.json +0 -56
- xinference/model/llm/llm_family_modelscope.json +0 -8700
- xinference/model/llm/llm_family_openmind_hub.json +0 -1019
- xinference/model/rerank/model_spec_modelscope.json +0 -85
- xinference/model/video/model_spec_modelscope.json +0 -184
- xinference/web/ui/build/static/js/main.9b12b7f9.js +0 -3
- xinference/web/ui/build/static/js/main.9b12b7f9.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +0 -1
- /xinference/web/ui/build/static/js/{main.9b12b7f9.js.LICENSE.txt → main.7d24df53.js.LICENSE.txt} +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/WHEEL +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/top_level.txt +0 -0
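The added and removed file lists above can be reproduced directly from the two wheels, since a wheel is a plain zip archive. A minimal sketch, assuming both .whl files have been downloaded locally under their standard wheel names (the paths below are placeholders):

import zipfile

# Wheel archives are zip files; namelist() yields the packaged paths.
old = set(zipfile.ZipFile("xinference-1.7.1-py3-none-any.whl").namelist())
new = set(zipfile.ZipFile("xinference-1.8.0-py3-none-any.whl").namelist())

print("removed:", sorted(old - new))  # e.g. the per-hub *_modelscope.json spec files
print("added:", sorted(new - old))    # e.g. the new per-modality cache_manager.py modules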
--- a/xinference/model/llm/llama_cpp/memory.py
+++ /dev/null
@@ -1,457 +0,0 @@
-# Copyright 2022-2023 XProbe Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-from collections.abc import Sequence
-from dataclasses import dataclass
-from typing import Any
-
-try:
-    from gguf import GGUFReader, GGUFValueType  # noqa: E402
-except ImportError:
-    GGUFReader = GGUFValueType = None
-logger = logging.getLogger(__name__)
-
-
-def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]:
-    file_endian = reader.endianess.name  # codespell:ignore
-    if reader.byte_order == "S":
-        host_endian = "BIG" if file_endian == "LITTLE" else "LITTLE"
-    else:
-        host_endian = file_endian
-    return (host_endian, file_endian)
-
-
-def dump_metadata_json(reader: GGUFReader, model_path: str) -> dict:
-    host_endian, file_endian = get_file_host_endian(reader)
-    metadata: dict[str, Any] = {}
-    tensors: dict[str, Any] = {}
-    result = {
-        "filename": model_path,
-        "endian": file_endian,
-        "metadata": metadata,
-        "tensors": tensors,
-    }
-    for idx, field in enumerate(reader.fields.values()):
-        curr: dict[str, Any] = {
-            "index": idx,
-            "type": field.types[0].name if field.types else "UNKNOWN",
-            "offset": field.offset,
-        }
-        metadata[field.name] = curr
-        if field.types[:1] == [GGUFValueType.ARRAY]:
-            curr["array_types"] = [t.name for t in field.types][1:]
-            curr["value"] = field.contents()
-        else:
-            curr["value"] = field.contents()
-    for i, tensor in enumerate(reader.tensors):
-        tensors[tensor.name] = {
-            "index": i,
-            "shape": tensor.shape.tolist(),
-            "type": tensor.tensor_type.name,
-            "offset": tensor.field.offset,
-            "n_bytes": tensor.n_bytes,
-        }
-    return result
-
-
-@dataclass
-class MemoryEstimate:
-    # How many layers we predict we can load
-    layers: int
-    # The size of the graph which occupies the main GPU
-    graph: int
-    # How much VRAM will be allocated given the number of layers we predict
-    vram_size: int
-    # The total size of the model if loaded into VRAM. If all layers are loaded, vram_size == total_size
-    total_size: int
-    # For multi-GPU scenarios, this provides the tensor split parameter
-    tensor_split: str
-    # For multi-GPU scenarios, this is the size in bytes per GPU
-    gpu_sizes: list[int]
-
-
-def _get_max_min(value):
-    if isinstance(value, Sequence):
-        return max(value), min(value)
-    else:
-        return value, value
-
-
-def graph_size(
-    data: dict,
-    context_length: int,
-    batch_size: int,
-    num_parallel: int,
-    kv_cache_type: str,
-):
-    """
-    Most of the logic comes from `GraphSize` in https://github.com/ollama/ollama/blob/main/fs/ggml/ggml.go
-    """
-    if context_length < batch_size:
-        batch_size = context_length
-
-    metadata = data["metadata"]
-    architecture = metadata["general.architecture"]["value"]
-    embedding_length = metadata[f"{architecture}.embedding_length"]["value"]
-    block_count = metadata[f"{architecture}.block_count"]["value"]
-    head_count_max, head_count_min = _get_max_min(
-        metadata[f"{architecture}.attention.head_count"]["value"]
-    )
-    head_count_kv_max, head_count_kv_min = _get_max_min(
-        metadata[f"{architecture}.attention.head_count_kv"]["value"]
-    )
-    vocab = len(metadata["tokenizer.ggml.tokens"]["value"])
-    embedding_head_count_max = (
-        (embedding_length // head_count_min) if head_count_min > 0 else 0
-    )
-    embedding_head_count_k = metadata.get(
-        f"{architecture}.attention.key_length", {}
-    ).get("value", embedding_head_count_max)
-    embedding_head_count_v = metadata.get(
-        f"{architecture}.attention.value_length", {}
-    ).get("value", embedding_head_count_max)
-
-    # f16(default)
-    bytes_per_kv_element = {
-        "q8_0": 1,  # 1/2 of fp16
-        "q4_0": 0.5,  # 1/4 of fp16
-    }.get(kv_cache_type, 2)
-
-    kv = [0] * block_count
-    for i in range(block_count):
-        kv[i] = (
-            context_length
-            * (embedding_head_count_k + embedding_head_count_v)
-            * head_count_kv_max
-            * bytes_per_kv_element
-        )
-
-    full_offload = 0
-    partial_offload = 0
-    if architecture in ["llama", "llama4"]:
-        full_offload = max(
-            4
-            * batch_size
-            * (1 + 4 * embedding_length + context_length * (1 + head_count_max)),
-            4 * batch_size * (embedding_length + vocab),
-        )
-        partial_offload = 4 * batch_size * embedding_length
-        partial_offload += max(
-            4
-            * batch_size
-            * (1 + embedding_length + max(context_length, embedding_length))
-            + embedding_length * embedding_length * 9 / 16
-            + 4
-            * context_length
-            * (
-                batch_size * head_count_max
-                + embedding_head_count_max * head_count_kv_max
-            ),
-            4 * batch_size * (embedding_length + vocab)
-            + embedding_length * vocab * 105 / 128,
-        )
-    elif architecture in ["gemma", "gemma2", "gemma3"]:
-        full_offload = max(
-            4 * batch_size * (embedding_length + vocab),
-            4
-            * batch_size
-            * (
-                2
-                + context_length
-                + context_length * head_count_max
-                + 2 * embedding_length
-                + 2 * embedding_head_count_k * head_count_max
-            ),
-        )
-        partial_offload = max(
-            4 * embedding_length * batch_size
-            + embedding_length * vocab * 105 / 128
-            + 4 * vocab * batch_size,
-            4
-            * batch_size
-            * (
-                2 * embedding_length
-                + 1
-                + 2 * embedding_head_count_k * head_count_max
-                + context_length
-                + context_length * head_count_max
-            )
-            + 4 * embedding_head_count_k * context_length * 8
-            + embedding_length * embedding_head_count_k * head_count_max * 9 / 16,
-        )
-        if architecture == "gemma3":
-            gemma3_global_cache_count = 6
-            sliding_window = (
-                num_parallel
-                * metadata[f"{architecture}.attention.sliding_window"]["value"]
-                + batch_size
-            )
-            for i in range(block_count):
-                if (i + 1) % gemma3_global_cache_count != 0:
-                    kv[i] = (
-                        sliding_window
-                        * (embedding_head_count_k + embedding_head_count_v)
-                        * head_count_kv_max
-                        * bytes_per_kv_element
-                    )
-    elif architecture == "qwen2":
-        full_offload = max(
-            4 * batch_size * (embedding_length + vocab),
-            4
-            * batch_size
-            * (
-                1
-                + 2 * embedding_length
-                + context_length
-                + context_length * head_count_max
-            ),
-        )
-
-        partial_offload = max(
-            4 * batch_size * (embedding_length + vocab)
-            + embedding_length * vocab * 105 / 128,
-            4
-            * (
-                batch_size
-                * (1 + 2 * embedding_length + context_length * (1 + head_count_max))
-                + embedding_length * (1 + context_length)
-            ),
-        )
-    elif architecture == "stablelm":
-        full_offload = (
-            4
-            * batch_size
-            * (context_length * (1 + head_count_max) + 3 * embedding_length + 2)
-        )
-        partial_offload = max(
-            4 * batch_size * (vocab + 2 * embedding_length), full_offload
-        )
-    elif architecture == "deepseek2":
-        full_offload = max(
-            4 * batch_size * (3 * embedding_length + vocab),
-            4
-            * batch_size
-            * (
-                3 * embedding_length
-                + 2
-                + context_length * (1 + head_count_kv_max)
-                + 2 * embedding_head_count_k * head_count_kv_max
-            ),
-        )
-
-        partial_offload = max(
-            4 * batch_size * (3 * embedding_length + vocab)
-            + embedding_length * vocab * 105 / 128,
-            4
-            * batch_size
-            * (
-                2 * embedding_length
-                + 1
-                + 2 * embedding_head_count_k * head_count_kv_max
-                + context_length
-                + context_length * head_count_kv_max
-            )
-            + 4 * embedding_head_count_k * context_length * head_count_kv_max
-            + embedding_length * embedding_head_count_k * head_count_kv_max * 9 / 16,
-        )
-
-    kv_total = sum(kv)
-    if partial_offload == 0:
-        partial_offload = (
-            head_count_max
-            / (1 if head_count_kv_min <= 0 else head_count_kv_min)
-            * kv_total
-            / 6
-        )
-    if full_offload == 0:
-        full_offload = partial_offload
-
-    return kv, partial_offload, full_offload
-
-
-def projector_memory_requirements(projector: str):
-    reader = GGUFReader(projector, "r")
-    data = dump_metadata_json(reader, projector)
-    return sum(t["n_bytes"] for t in data["tensors"].values())
-
-
-def estimate_gpu_layers(
-    gpus: list[dict],
-    model_path: str,
-    projectors: list[str],
-    context_length: int,
-    batch_size: int,
-    num_parallel: int,
-    kv_cache_type: str,
-):
-    """
-    Most of the logic comes from `EstimateGPULayers` in https://github.com/ollama/ollama/blob/main/llm/memory.go
-    """
-    # Projectors loaded into GPU0 only
-    projector_weights = sum(map(projector_memory_requirements, projectors))
-    if projector_weights > 0:
-        # Multimodal models require at least 2048 context
-        context_length = max(context_length, 2048)
-    reader = GGUFReader(model_path, "r")
-    data = dump_metadata_json(reader, model_path)
-    kv, graph_partial_offload, graph_full_offload = graph_size(
-        data,
-        context_length=context_length,
-        batch_size=batch_size,
-        num_parallel=num_parallel,
-        kv_cache_type=kv_cache_type,
-    )
-    # Get all layer sizes
-    metadata = data["metadata"]
-    architecture = metadata["general.architecture"]["value"]
-    block_count = metadata[f"{architecture}.block_count"]["value"]
-    layer_sizes = [0] * block_count
-    for name, layer in data["tensors"].items():
-        if name.startswith("blk."):
-            index = int(name[len("blk.") :].split(".")[0])
-            layer_sizes[index] += layer["n_bytes"]
-    layer_size = layer_sizes[0] if layer_sizes else 0
-
-    if len(kv) > 0:
-        layer_size += kv[0]
-    # On metal there's no partial offload overhead
-    if gpus[0]["name"] == "Metal":
-        graph_partial_offload = graph_full_offload
-    elif len(gpus) > 1:
-        # Multi gpu should always use the partial graph size
-        graph_full_offload = graph_partial_offload
-
-    # Get output layer size
-    memory_layer_output = 0
-    # Output layer handled at the end if we have space
-    for name, layer in data["tensors"].items():
-        if any(
-            name.startswith(prefix)
-            for prefix in ["output_norm", "output", "token_embd"]
-        ):
-            memory_layer_output += layer["n_bytes"]
-
-    # Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
-    default_memory_min = 512 * 1024**2
-    gpu_allocations = [0] * len(gpus)
-    gpus_with_space: list[int] = []
-    for i in range(len(gpus)):
-        gpu0_overhead = projector_weights if len(gpus_with_space) == 0 else 0
-        minimum_memory = gpus[i].get("memory_min", default_memory_min)
-        if (
-            gpus[i]["memory_free"]
-            < gpu0_overhead
-            + max(graph_partial_offload, graph_full_offload)
-            + minimum_memory
-            + 2 * layer_size
-        ):
-            continue
-        gpus_with_space.append(i)
-        gpu_allocations[i] += gpu0_overhead + minimum_memory + layer_size
-
-    overflow = 0
-    if len(gpus_with_space) == 0:
-        overflow = projector_weights
-
-    # For all the layers, find where they can fit on the GPU(s)
-    layer_count = 0
-    layer_counts = [0] * len(gpus)
-    for i in range(block_count - 1, -1, -1):
-        layer_size = layer_sizes[i]
-        layer_size += kv[i]
-
-        # Distribute the layers across the GPU(s) that have space
-        for j in range(len(gpus_with_space), 0, -1):
-            g = gpus_with_space[i % j]
-            used = gpu_allocations[g] + max(graph_partial_offload, graph_full_offload)
-            if gpus[g]["memory_free"] > used + layer_size:
-                gpu_allocations[g] += layer_size
-                layer_counts[g] += 1
-                layer_count += 1
-                break
-            else:
-                gpus_with_space = (
-                    gpus_with_space[: i % j] + gpus_with_space[i % j + 1 :]
-                )
-
-        if len(gpus_with_space) == 0:
-            overflow += layer_size
-
-    fully_loaded = False
-    if layer_count >= block_count:
-        fully_loaded = True
-
-    # Determine if we need to consider output then find where it fits
-    if memory_layer_output > 0:
-        for j in range(len(gpus_with_space), 0, -1):
-            g = gpus_with_space[layer_count % j]
-            used = gpu_allocations[g] + max(graph_partial_offload, graph_full_offload)
-            if gpus[g]["memory_free"] > used + memory_layer_output:
-                gpu_allocations[g] += memory_layer_output
-                layer_counts[g] += 1
-                layer_count += 1
-                break
-            else:
-                gpus_with_space = (
-                    gpus_with_space[: layer_count % j]
-                    + gpus_with_space[layer_count % j + 1 :]
-                )
-
-        if layer_count < block_count + 1:
-            fully_loaded = False
-            overflow += memory_layer_output
-
-    # Add the applicable (full or partial) graph allocations
-    for i in range(len(gpus)):
-        if layer_counts[i] <= 0:
-            continue
-        if fully_loaded:
-            gpu_allocations[i] += graph_full_offload
-        else:
-            gpu_allocations[i] += graph_partial_offload
-
-    if fully_loaded:
-        graph_offload = graph_full_offload
-    else:
-        graph_offload = graph_partial_offload
-
-    # Summaries
-    memory_required_partial = sum(gpu_allocations)
-    memory_required_total = memory_required_partial + overflow
-
-    tensor_split = ""
-    if len(gpus) > 1:
-        tensor_split = ",".join(str(c) for c in layer_counts)
-
-    estimate = MemoryEstimate(
-        layers=0,
-        graph=0,
-        vram_size=0,
-        total_size=int(memory_required_total),
-        tensor_split="",
-        gpu_sizes=[],
-    )
-    if gpus[0]["name"] == "CPU":
-        return estimate
-    if layer_count == 0:
-        return estimate
-
-    estimate.layers = layer_count
-    estimate.graph = int(graph_offload)
-    estimate.vram_size = int(memory_required_partial)
-    estimate.total_size = int(memory_required_total)
-    estimate.tensor_split = tensor_split
-    estimate.gpu_sizes = [int(i) for i in gpu_allocations]
-    return estimate
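For context, the module removed above implemented ollama-style memory estimation over GGUF metadata. A minimal usage sketch of its entry point as it was shaped in 1.7.1 (the GGUF path and free-VRAM figure are placeholders, and the `gguf` package must be installed for the import inside the module to succeed):

# Sketch against the 1.7.1 module removed above; it no longer exists in 1.8.0.
from xinference.model.llm.llama_cpp.memory import estimate_gpu_layers

# Each GPU is a plain dict; "memory_min" is optional and defaults to 512 MiB.
gpus = [{"name": "CUDA0", "memory_free": 24 * 1024**3}]  # placeholder: 24 GiB free

estimate = estimate_gpu_layers(
    gpus=gpus,
    model_path="/path/to/model.gguf",  # placeholder path to a local GGUF file
    projectors=[],                     # multimodal projector GGUFs, loaded on GPU0 only
    context_length=8192,
    batch_size=512,
    num_parallel=1,
    kv_cache_type="f16",               # only "q8_0"/"q4_0" shrink the per-element KV size
)
print(estimate.layers, estimate.vram_size, estimate.tensor_split)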
--- a/xinference/model/llm/llm_family_csghub.json
+++ /dev/null
@@ -1,56 +0,0 @@
-[
-  {
-    "version": 1,
-    "context_length": 32768,
-    "model_name": "qwen2-instruct",
-    "model_lang": [
-      "en",
-      "zh"
-    ],
-    "model_ability": [
-      "chat",
-      "tools"
-    ],
-    "model_description": "Qwen2 is the new series of Qwen large language models",
-    "model_specs": [
-      {
-        "model_format": "pytorch",
-        "model_size_in_billions": "0_5",
-        "quantizations": [
-          "none"
-        ],
-        "model_id": "Qwen/Qwen2-0.5B-Instruct",
-        "model_hub": "csghub"
-      },
-      {
-        "model_format": "ggufv2",
-        "model_size_in_billions": "0_5",
-        "quantizations": [
-          "q2_k",
-          "q3_k_m",
-          "q4_0",
-          "q4_k_m",
-          "q5_0",
-          "q5_k_m",
-          "q6_k",
-          "q8_0",
-          "fp16"
-        ],
-        "model_id": "qwen/Qwen2-0.5B-Instruct-GGUF",
-        "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf",
-        "model_hub": "csghub"
-      }
-    ],
"chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: <tools> \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" </tools>\" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:\n\" }}\n {{- \"<tool_call>\n\" }}\n {{- '{\"name\": <function-name>, \"arguments\": <args-json-object>}\n' }}\n {{- '</tool_call><|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- endif 
%}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n<tool_call>\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n<tool_response>\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n</tool_response><|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}",
-    "stop_token_ids": [
-      151643,
-      151644,
-      151645
-    ],
-    "stop": [
-      "<|endoftext|>",
-      "<|im_start|>",
-      "<|im_end|>"
-    ]
-  }
-]
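The ggufv2 spec in the removed file above resolves concrete download file names by substituting each entry of "quantizations" into "model_file_name_template". A minimal sketch of that expansion, assuming the removed JSON has been saved to a local copy (the path is a placeholder):

import json

# Load a local copy of the removed 1.7.1 spec file (placeholder path).
with open("llm_family_csghub.json") as f:
    families = json.load(f)

gguf_spec = next(
    spec for spec in families[0]["model_specs"] if spec["model_format"] == "ggufv2"
)
for quantization in gguf_spec["quantizations"]:
    # e.g. "qwen2-0_5b-instruct-q4_k_m.gguf"
    print(gguf_spec["model_file_name_template"].format(quantization=quantization))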