xinference 1.7.1.post1__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (136)
  1. xinference/_version.py +3 -3
  2. xinference/client/restful/async_restful_client.py +8 -13
  3. xinference/client/restful/restful_client.py +6 -2
  4. xinference/core/chat_interface.py +6 -4
  5. xinference/core/media_interface.py +5 -0
  6. xinference/core/model.py +1 -5
  7. xinference/core/supervisor.py +117 -68
  8. xinference/core/worker.py +49 -37
  9. xinference/deploy/test/test_cmdline.py +2 -6
  10. xinference/model/audio/__init__.py +26 -23
  11. xinference/model/audio/chattts.py +3 -2
  12. xinference/model/audio/core.py +49 -98
  13. xinference/model/audio/cosyvoice.py +3 -2
  14. xinference/model/audio/custom.py +28 -73
  15. xinference/model/audio/f5tts.py +3 -2
  16. xinference/model/audio/f5tts_mlx.py +3 -2
  17. xinference/model/audio/fish_speech.py +3 -2
  18. xinference/model/audio/funasr.py +17 -4
  19. xinference/model/audio/kokoro.py +3 -2
  20. xinference/model/audio/megatts.py +3 -2
  21. xinference/model/audio/melotts.py +3 -2
  22. xinference/model/audio/model_spec.json +572 -171
  23. xinference/model/audio/utils.py +0 -6
  24. xinference/model/audio/whisper.py +3 -2
  25. xinference/model/audio/whisper_mlx.py +3 -2
  26. xinference/model/cache_manager.py +141 -0
  27. xinference/model/core.py +6 -49
  28. xinference/model/custom.py +174 -0
  29. xinference/model/embedding/__init__.py +67 -56
  30. xinference/model/embedding/cache_manager.py +35 -0
  31. xinference/model/embedding/core.py +104 -84
  32. xinference/model/embedding/custom.py +55 -78
  33. xinference/model/embedding/embed_family.py +80 -31
  34. xinference/model/embedding/flag/core.py +21 -5
  35. xinference/model/embedding/llama_cpp/__init__.py +0 -0
  36. xinference/model/embedding/llama_cpp/core.py +234 -0
  37. xinference/model/embedding/model_spec.json +968 -103
  38. xinference/model/embedding/sentence_transformers/core.py +30 -20
  39. xinference/model/embedding/vllm/core.py +11 -5
  40. xinference/model/flexible/__init__.py +8 -2
  41. xinference/model/flexible/core.py +26 -119
  42. xinference/model/flexible/custom.py +69 -0
  43. xinference/model/flexible/launchers/image_process_launcher.py +1 -0
  44. xinference/model/flexible/launchers/modelscope_launcher.py +5 -1
  45. xinference/model/flexible/launchers/transformers_launcher.py +15 -3
  46. xinference/model/flexible/launchers/yolo_launcher.py +5 -1
  47. xinference/model/image/__init__.py +20 -20
  48. xinference/model/image/cache_manager.py +62 -0
  49. xinference/model/image/core.py +70 -182
  50. xinference/model/image/custom.py +28 -72
  51. xinference/model/image/model_spec.json +402 -119
  52. xinference/model/image/ocr/got_ocr2.py +3 -2
  53. xinference/model/image/stable_diffusion/core.py +22 -7
  54. xinference/model/image/stable_diffusion/mlx.py +6 -6
  55. xinference/model/image/utils.py +2 -2
  56. xinference/model/llm/__init__.py +71 -94
  57. xinference/model/llm/cache_manager.py +292 -0
  58. xinference/model/llm/core.py +37 -111
  59. xinference/model/llm/custom.py +88 -0
  60. xinference/model/llm/llama_cpp/core.py +5 -7
  61. xinference/model/llm/llm_family.json +16260 -8151
  62. xinference/model/llm/llm_family.py +138 -839
  63. xinference/model/llm/lmdeploy/core.py +5 -7
  64. xinference/model/llm/memory.py +3 -4
  65. xinference/model/llm/mlx/core.py +6 -8
  66. xinference/model/llm/reasoning_parser.py +3 -1
  67. xinference/model/llm/sglang/core.py +32 -14
  68. xinference/model/llm/transformers/chatglm.py +3 -7
  69. xinference/model/llm/transformers/core.py +49 -27
  70. xinference/model/llm/transformers/deepseek_v2.py +2 -2
  71. xinference/model/llm/transformers/gemma3.py +2 -2
  72. xinference/model/llm/transformers/multimodal/cogagent.py +2 -2
  73. xinference/model/llm/transformers/multimodal/deepseek_vl2.py +2 -2
  74. xinference/model/llm/transformers/multimodal/gemma3.py +2 -2
  75. xinference/model/llm/transformers/multimodal/glm4_1v.py +167 -0
  76. xinference/model/llm/transformers/multimodal/glm4v.py +2 -2
  77. xinference/model/llm/transformers/multimodal/intern_vl.py +2 -2
  78. xinference/model/llm/transformers/multimodal/minicpmv26.py +3 -3
  79. xinference/model/llm/transformers/multimodal/ovis2.py +2 -2
  80. xinference/model/llm/transformers/multimodal/qwen-omni.py +2 -2
  81. xinference/model/llm/transformers/multimodal/qwen2_audio.py +2 -2
  82. xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
  83. xinference/model/llm/transformers/opt.py +3 -7
  84. xinference/model/llm/utils.py +34 -49
  85. xinference/model/llm/vllm/core.py +77 -27
  86. xinference/model/llm/vllm/xavier/engine.py +5 -3
  87. xinference/model/llm/vllm/xavier/scheduler.py +10 -6
  88. xinference/model/llm/vllm/xavier/transfer.py +1 -1
  89. xinference/model/rerank/__init__.py +26 -25
  90. xinference/model/rerank/core.py +47 -87
  91. xinference/model/rerank/custom.py +25 -71
  92. xinference/model/rerank/model_spec.json +158 -33
  93. xinference/model/rerank/utils.py +2 -2
  94. xinference/model/utils.py +115 -54
  95. xinference/model/video/__init__.py +13 -17
  96. xinference/model/video/core.py +44 -102
  97. xinference/model/video/diffusers.py +4 -3
  98. xinference/model/video/model_spec.json +90 -21
  99. xinference/types.py +5 -3
  100. xinference/web/ui/build/asset-manifest.json +3 -3
  101. xinference/web/ui/build/index.html +1 -1
  102. xinference/web/ui/build/static/js/main.7d24df53.js +3 -0
  103. xinference/web/ui/build/static/js/main.7d24df53.js.map +1 -0
  104. xinference/web/ui/node_modules/.cache/babel-loader/2704ff66a5f73ca78b341eb3edec60154369df9d87fbc8c6dd60121abc5e1b0a.json +1 -0
  105. xinference/web/ui/node_modules/.cache/babel-loader/607dfef23d33e6b594518c0c6434567639f24f356b877c80c60575184ec50ed0.json +1 -0
  106. xinference/web/ui/node_modules/.cache/babel-loader/9be3d56173aacc3efd0b497bcb13c4f6365de30069176ee9403b40e717542326.json +1 -0
  107. xinference/web/ui/node_modules/.cache/babel-loader/9f9dd6c32c78a222d07da5987ae902effe16bcf20aac00774acdccc4de3c9ff2.json +1 -0
  108. xinference/web/ui/node_modules/.cache/babel-loader/b2ab5ee972c60d15eb9abf5845705f8ab7e1d125d324d9a9b1bcae5d6fd7ffb2.json +1 -0
  109. xinference/web/ui/src/locales/en.json +0 -1
  110. xinference/web/ui/src/locales/ja.json +0 -1
  111. xinference/web/ui/src/locales/ko.json +0 -1
  112. xinference/web/ui/src/locales/zh.json +0 -1
  113. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/METADATA +9 -11
  114. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/RECORD +119 -119
  115. xinference/model/audio/model_spec_modelscope.json +0 -231
  116. xinference/model/embedding/model_spec_modelscope.json +0 -293
  117. xinference/model/embedding/utils.py +0 -18
  118. xinference/model/image/model_spec_modelscope.json +0 -375
  119. xinference/model/llm/llama_cpp/memory.py +0 -457
  120. xinference/model/llm/llm_family_csghub.json +0 -56
  121. xinference/model/llm/llm_family_modelscope.json +0 -8700
  122. xinference/model/llm/llm_family_openmind_hub.json +0 -1019
  123. xinference/model/rerank/model_spec_modelscope.json +0 -85
  124. xinference/model/video/model_spec_modelscope.json +0 -184
  125. xinference/web/ui/build/static/js/main.9b12b7f9.js +0 -3
  126. xinference/web/ui/build/static/js/main.9b12b7f9.js.map +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +0 -1
  129. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +0 -1
  130. xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +0 -1
  131. xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +0 -1
  132. /xinference/web/ui/build/static/js/{main.9b12b7f9.js.LICENSE.txt → main.7d24df53.js.LICENSE.txt} +0 -0
  133. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/WHEEL +0 -0
  134. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/entry_points.txt +0 -0
  135. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/licenses/LICENSE +0 -0
  136. {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/llama_cpp/memory.py
@@ -1,457 +0,0 @@
- # Copyright 2022-2023 XProbe Inc.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import logging
- from collections.abc import Sequence
- from dataclasses import dataclass
- from typing import Any
-
- try:
-     from gguf import GGUFReader, GGUFValueType # noqa: E402
- except ImportError:
-     GGUFReader = GGUFValueType = None
- logger = logging.getLogger(__name__)
-
-
- def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]:
-     file_endian = reader.endianess.name # codespell:ignore
-     if reader.byte_order == "S":
-         host_endian = "BIG" if file_endian == "LITTLE" else "LITTLE"
-     else:
-         host_endian = file_endian
-     return (host_endian, file_endian)
-
-
- def dump_metadata_json(reader: GGUFReader, model_path: str) -> dict:
-     host_endian, file_endian = get_file_host_endian(reader)
-     metadata: dict[str, Any] = {}
-     tensors: dict[str, Any] = {}
-     result = {
-         "filename": model_path,
-         "endian": file_endian,
-         "metadata": metadata,
-         "tensors": tensors,
-     }
-     for idx, field in enumerate(reader.fields.values()):
-         curr: dict[str, Any] = {
-             "index": idx,
-             "type": field.types[0].name if field.types else "UNKNOWN",
-             "offset": field.offset,
-         }
-         metadata[field.name] = curr
-         if field.types[:1] == [GGUFValueType.ARRAY]:
-             curr["array_types"] = [t.name for t in field.types][1:]
-             curr["value"] = field.contents()
-         else:
-             curr["value"] = field.contents()
-     for i, tensor in enumerate(reader.tensors):
-         tensors[tensor.name] = {
-             "index": i,
-             "shape": tensor.shape.tolist(),
-             "type": tensor.tensor_type.name,
-             "offset": tensor.field.offset,
-             "n_bytes": tensor.n_bytes,
-         }
-     return result
-
-
- @dataclass
- class MemoryEstimate:
-     # How many layers we predict we can load
-     layers: int
-     # The size of the graph which occupies the main GPU
-     graph: int
-     # How much VRAM will be allocated given the number of layers we predict
-     vram_size: int
-     # The total size of the model if loaded into VRAM. If all layers are loaded, vram_size == total_size
-     total_size: int
-     # For multi-GPU scenarios, this provides the tensor split parameter
-     tensor_split: str
-     # For multi-GPU scenarios, this is the size in bytes per GPU
-     gpu_sizes: list[int]
-
-
- def _get_max_min(value):
-     if isinstance(value, Sequence):
-         return max(value), min(value)
-     else:
-         return value, value
-
-
- def graph_size(
-     data: dict,
-     context_length: int,
-     batch_size: int,
-     num_parallel: int,
-     kv_cache_type: str,
- ):
-     """
-     Most of the logic comes from `GraphSize` in https://github.com/ollama/ollama/blob/main/fs/ggml/ggml.go
-     """
-     if context_length < batch_size:
-         batch_size = context_length
-
-     metadata = data["metadata"]
-     architecture = metadata["general.architecture"]["value"]
-     embedding_length = metadata[f"{architecture}.embedding_length"]["value"]
-     block_count = metadata[f"{architecture}.block_count"]["value"]
-     head_count_max, head_count_min = _get_max_min(
-         metadata[f"{architecture}.attention.head_count"]["value"]
-     )
-     head_count_kv_max, head_count_kv_min = _get_max_min(
-         metadata[f"{architecture}.attention.head_count_kv"]["value"]
-     )
-     vocab = len(metadata["tokenizer.ggml.tokens"]["value"])
-     embedding_head_count_max = (
-         (embedding_length // head_count_min) if head_count_min > 0 else 0
-     )
-     embedding_head_count_k = metadata.get(
-         f"{architecture}.attention.key_length", {}
-     ).get("value", embedding_head_count_max)
-     embedding_head_count_v = metadata.get(
-         f"{architecture}.attention.value_length", {}
-     ).get("value", embedding_head_count_max)
-
-     # f16(default)
-     bytes_per_kv_element = {
-         "q8_0": 1, # 1/2 of fp16
-         "q4_0": 0.5, # 1/4 of fp16
-     }.get(kv_cache_type, 2)
-
-     kv = [0] * block_count
-     for i in range(block_count):
-         kv[i] = (
-             context_length
-             * (embedding_head_count_k + embedding_head_count_v)
-             * head_count_kv_max
-             * bytes_per_kv_element
-         )
-
-     full_offload = 0
-     partial_offload = 0
-     if architecture in ["llama", "llama4"]:
-         full_offload = max(
-             4
-             * batch_size
-             * (1 + 4 * embedding_length + context_length * (1 + head_count_max)),
-             4 * batch_size * (embedding_length + vocab),
-         )
-         partial_offload = 4 * batch_size * embedding_length
-         partial_offload += max(
-             4
-             * batch_size
-             * (1 + embedding_length + max(context_length, embedding_length))
-             + embedding_length * embedding_length * 9 / 16
-             + 4
-             * context_length
-             * (
-                 batch_size * head_count_max
-                 + embedding_head_count_max * head_count_kv_max
-             ),
-             4 * batch_size * (embedding_length + vocab)
-             + embedding_length * vocab * 105 / 128,
-         )
-     elif architecture in ["gemma", "gemma2", "gemma3"]:
-         full_offload = max(
-             4 * batch_size * (embedding_length + vocab),
-             4
-             * batch_size
-             * (
-                 2
-                 + context_length
-                 + context_length * head_count_max
-                 + 2 * embedding_length
-                 + 2 * embedding_head_count_k * head_count_max
-             ),
-         )
-         partial_offload = max(
-             4 * embedding_length * batch_size
-             + embedding_length * vocab * 105 / 128
-             + 4 * vocab * batch_size,
-             4
-             * batch_size
-             * (
-                 2 * embedding_length
-                 + 1
-                 + 2 * embedding_head_count_k * head_count_max
-                 + context_length
-                 + context_length * head_count_max
-             )
-             + 4 * embedding_head_count_k * context_length * 8
-             + embedding_length * embedding_head_count_k * head_count_max * 9 / 16,
-         )
-         if architecture == "gemma3":
-             gemma3_global_cache_count = 6
-             sliding_window = (
-                 num_parallel
-                 * metadata[f"{architecture}.attention.sliding_window"]["value"]
-                 + batch_size
-             )
-             for i in range(block_count):
-                 if (i + 1) % gemma3_global_cache_count != 0:
-                     kv[i] = (
-                         sliding_window
-                         * (embedding_head_count_k + embedding_head_count_v)
-                         * head_count_kv_max
-                         * bytes_per_kv_element
-                     )
-     elif architecture == "qwen2":
-         full_offload = max(
-             4 * batch_size * (embedding_length + vocab),
-             4
-             * batch_size
-             * (
-                 1
-                 + 2 * embedding_length
-                 + context_length
-                 + context_length * head_count_max
-             ),
-         )
-
-         partial_offload = max(
-             4 * batch_size * (embedding_length + vocab)
-             + embedding_length * vocab * 105 / 128,
-             4
-             * (
-                 batch_size
-                 * (1 + 2 * embedding_length + context_length * (1 + head_count_max))
-                 + embedding_length * (1 + context_length)
-             ),
-         )
-     elif architecture == "stablelm":
-         full_offload = (
-             4
-             * batch_size
-             * (context_length * (1 + head_count_max) + 3 * embedding_length + 2)
-         )
-         partial_offload = max(
-             4 * batch_size * (vocab + 2 * embedding_length), full_offload
-         )
-     elif architecture == "deepseek2":
-         full_offload = max(
-             4 * batch_size * (3 * embedding_length + vocab),
-             4
-             * batch_size
-             * (
-                 3 * embedding_length
-                 + 2
-                 + context_length * (1 + head_count_kv_max)
-                 + 2 * embedding_head_count_k * head_count_kv_max
-             ),
-         )
-
-         partial_offload = max(
-             4 * batch_size * (3 * embedding_length + vocab)
-             + embedding_length * vocab * 105 / 128,
-             4
-             * batch_size
-             * (
-                 2 * embedding_length
-                 + 1
-                 + 2 * embedding_head_count_k * head_count_kv_max
-                 + context_length
-                 + context_length * head_count_kv_max
-             )
-             + 4 * embedding_head_count_k * context_length * head_count_kv_max
-             + embedding_length * embedding_head_count_k * head_count_kv_max * 9 / 16,
-         )
-
-     kv_total = sum(kv)
-     if partial_offload == 0:
-         partial_offload = (
-             head_count_max
-             / (1 if head_count_kv_min <= 0 else head_count_kv_min)
-             * kv_total
-             / 6
-         )
-     if full_offload == 0:
-         full_offload = partial_offload
-
-     return kv, partial_offload, full_offload
-
-
- def projector_memory_requirements(projector: str):
-     reader = GGUFReader(projector, "r")
-     data = dump_metadata_json(reader, projector)
-     return sum(t["n_bytes"] for t in data["tensors"].values())
-
-
- def estimate_gpu_layers(
-     gpus: list[dict],
-     model_path: str,
-     projectors: list[str],
-     context_length: int,
-     batch_size: int,
-     num_parallel: int,
-     kv_cache_type: str,
- ):
-     """
-     Most of the logic comes from `EstimateGPULayers` in https://github.com/ollama/ollama/blob/main/llm/memory.go
-     """
-     # Projectors loaded into GPU0 only
-     projector_weights = sum(map(projector_memory_requirements, projectors))
-     if projector_weights > 0:
-         # Multimodal models require at least 2048 context
-         context_length = max(context_length, 2048)
-     reader = GGUFReader(model_path, "r")
-     data = dump_metadata_json(reader, model_path)
-     kv, graph_partial_offload, graph_full_offload = graph_size(
-         data,
-         context_length=context_length,
-         batch_size=batch_size,
-         num_parallel=num_parallel,
-         kv_cache_type=kv_cache_type,
-     )
-     # Get all layer sizes
-     metadata = data["metadata"]
-     architecture = metadata["general.architecture"]["value"]
-     block_count = metadata[f"{architecture}.block_count"]["value"]
-     layer_sizes = [0] * block_count
-     for name, layer in data["tensors"].items():
-         if name.startswith("blk."):
-             index = int(name[len("blk.") :].split(".")[0])
-             layer_sizes[index] += layer["n_bytes"]
-     layer_size = layer_sizes[0] if layer_sizes else 0
-
-     if len(kv) > 0:
-         layer_size += kv[0]
-     # On metal there's no partial offload overhead
-     if gpus[0]["name"] == "Metal":
-         graph_partial_offload = graph_full_offload
-     elif len(gpus) > 1:
-         # Multi gpu should always use the partial graph size
-         graph_full_offload = graph_partial_offload
-
-     # Get output layer size
-     memory_layer_output = 0
-     # Output layer handled at the end if we have space
-     for name, layer in data["tensors"].items():
-         if any(
-             name.startswith(prefix)
-             for prefix in ["output_norm", "output", "token_embd"]
-         ):
-             memory_layer_output += layer["n_bytes"]
-
-     # Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
-     default_memory_min = 512 * 1024**2
-     gpu_allocations = [0] * len(gpus)
-     gpus_with_space: list[int] = []
-     for i in range(len(gpus)):
-         gpu0_overhead = projector_weights if len(gpus_with_space) == 0 else 0
-         minimum_memory = gpus[i].get("memory_min", default_memory_min)
-         if (
-             gpus[i]["memory_free"]
-             < gpu0_overhead
-             + max(graph_partial_offload, graph_full_offload)
-             + minimum_memory
-             + 2 * layer_size
-         ):
-             continue
-         gpus_with_space.append(i)
-         gpu_allocations[i] += gpu0_overhead + minimum_memory + layer_size
-
-     overflow = 0
-     if len(gpus_with_space) == 0:
-         overflow = projector_weights
-
-     # For all the layers, find where they can fit on the GPU(s)
-     layer_count = 0
-     layer_counts = [0] * len(gpus)
-     for i in range(block_count - 1, -1, -1):
-         layer_size = layer_sizes[i]
-         layer_size += kv[i]
-
-         # Distribute the layers across the GPU(s) that have space
-         for j in range(len(gpus_with_space), 0, -1):
-             g = gpus_with_space[i % j]
-             used = gpu_allocations[g] + max(graph_partial_offload, graph_full_offload)
-             if gpus[g]["memory_free"] > used + layer_size:
-                 gpu_allocations[g] += layer_size
-                 layer_counts[g] += 1
-                 layer_count += 1
-                 break
-             else:
-                 gpus_with_space = (
-                     gpus_with_space[: i % j] + gpus_with_space[i % j + 1 :]
-                 )
-
-         if len(gpus_with_space) == 0:
-             overflow += layer_size
-
-     fully_loaded = False
-     if layer_count >= block_count:
-         fully_loaded = True
-
-     # Determine if we need to consider output then find where it fits
-     if memory_layer_output > 0:
-         for j in range(len(gpus_with_space), 0, -1):
-             g = gpus_with_space[layer_count % j]
-             used = gpu_allocations[g] + max(graph_partial_offload, graph_full_offload)
-             if gpus[g]["memory_free"] > used + memory_layer_output:
-                 gpu_allocations[g] += memory_layer_output
-                 layer_counts[g] += 1
-                 layer_count += 1
-                 break
-             else:
-                 gpus_with_space = (
-                     gpus_with_space[: layer_count % j]
-                     + gpus_with_space[layer_count % j + 1 :]
-                 )
-
-         if layer_count < block_count + 1:
-             fully_loaded = False
-             overflow += memory_layer_output
-
-     # Add the applicable (full or partial) graph allocations
-     for i in range(len(gpus)):
-         if layer_counts[i] <= 0:
-             continue
-         if fully_loaded:
-             gpu_allocations[i] += graph_full_offload
-         else:
-             gpu_allocations[i] += graph_partial_offload
-
-     if fully_loaded:
-         graph_offload = graph_full_offload
-     else:
-         graph_offload = graph_partial_offload
-
-     # Summaries
-     memory_required_partial = sum(gpu_allocations)
-     memory_required_total = memory_required_partial + overflow
-
-     tensor_split = ""
-     if len(gpus) > 1:
-         tensor_split = ",".join(str(c) for c in layer_counts)
-
-     estimate = MemoryEstimate(
-         layers=0,
-         graph=0,
-         vram_size=0,
-         total_size=int(memory_required_total),
-         tensor_split="",
-         gpu_sizes=[],
-     )
-     if gpus[0]["name"] == "CPU":
-         return estimate
-     if layer_count == 0:
-         return estimate
-
-     estimate.layers = layer_count
-     estimate.graph = int(graph_offload)
-     estimate.vram_size = int(memory_required_partial)
-     estimate.total_size = int(memory_required_total)
-     estimate.tensor_split = tensor_split
-     estimate.gpu_sizes = [int(i) for i in gpu_allocations]
-     return estimate
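
For context, the entry point of the removed module is estimate_gpu_layers. The sketch below shows how it could be called against the 1.7.1.post1 package; it is only an illustration, not part of xinference itself. The GPU dicts, model path, and numbers are hypothetical, and the dict keys ("name", "memory_free", optional "memory_min") are the ones the function reads above.

# Hypothetical usage of the module removed in 1.8.0 (all values are illustrative).
from xinference.model.llm.llama_cpp.memory import estimate_gpu_layers

gpus = [
    # Each GPU is described only by the keys the estimator actually reads.
    {"name": "NVIDIA A10", "memory_free": 22 * 1024**3},
    {"name": "NVIDIA A10", "memory_free": 22 * 1024**3},
]
estimate = estimate_gpu_layers(
    gpus=gpus,
    model_path="/models/qwen2-0_5b-instruct-q4_k_m.gguf",  # hypothetical path
    projectors=[],          # mmproj GGUF files for multimodal models, if any
    context_length=32768,
    batch_size=512,
    num_parallel=1,
    kv_cache_type="q8_0",   # unknown cache types fall back to f16 sizing
)
print(estimate.layers, estimate.vram_size, estimate.tensor_split)
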
xinference/model/llm/llm_family_csghub.json
@@ -1,56 +0,0 @@
- [
- {
- "version": 1,
- "context_length": 32768,
- "model_name": "qwen2-instruct",
- "model_lang": [
- "en",
- "zh"
- ],
- "model_ability": [
- "chat",
- "tools"
- ],
- "model_description": "Qwen2 is the new series of Qwen large language models",
- "model_specs": [
- {
- "model_format": "pytorch",
- "model_size_in_billions": "0_5",
- "quantizations": [
- "none"
- ],
- "model_id": "Qwen/Qwen2-0.5B-Instruct",
- "model_hub": "csghub"
- },
- {
- "model_format": "ggufv2",
- "model_size_in_billions": "0_5",
- "quantizations": [
- "q2_k",
- "q3_k_m",
- "q4_0",
- "q4_k_m",
- "q5_0",
- "q5_k_m",
- "q6_k",
- "q8_0",
- "fp16"
- ],
- "model_id": "qwen/Qwen2-0.5B-Instruct-GGUF",
- "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf",
- "model_hub": "csghub"
- }
- ],
- "chat_template": "{%- macro json_to_python_type(json_spec) %}\n {%- set basic_type_map = {\n \"string\": \"str\",\n \"number\": \"float\",\n \"integer\": \"int\",\n \"boolean\": \"bool\"\n} %}\n {%- if basic_type_map[json_spec.type] is defined %}\n {{- basic_type_map[json_spec.type] }}\n {%- elif json_spec.type == \"array\" %}\n {{- \"list[\" + json_to_python_type(json_spec|items) + \"]\" }}\n {%- elif json_spec.type == \"object\" %}\n {%- if json_spec.additionalProperties is defined %}\n {{- \"dict[str, \" + json_to_python_type(json_spec.additionalProperties) + ']' }}\n {%- else %}\n {{- \"dict\" }}\n {%- endif %}\n {%- elif json_spec.type is iterable %}\n {{- \"Union[\" }}\n {%- for t in json_spec.type %}\n {{- json_to_python_type({\"type\": t}) }}\n {%- if not loop.last %}\n {{- \",\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"]\" }}\n {%- else %}\n {{- \"Any\" }}\n {%- endif %}\n{%- endmacro %}\n\n{%- if tools %}\n {{- '<|im_start|>system\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] + '\n\n' }}\n {%- endif %}\n {{- '# Tools\n\n' }}\n {{- \"You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: <tools> \" }}\n {%- for tool in tools %}\n {%- if tool.function is defined %}\n {%- set tool = tool.function %}\n {%- endif %}\n {{- '{\"type\": \"function\", \"function\": ' }}\n {{- '{\"name\": ' + tool.name + '\", ' }}\n {{- '\"description\": \"' + tool.name + '(' }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {{- param_name + \": \" + json_to_python_type(param_fields) }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- if tool.return is defined %}\n {{- \" -> \" + json_to_python_type(tool.return) }}\n {%- endif %}\n {{- \" - \" + tool.description + \"\n\n\" }}\n {%- for param_name, param_fields in tool.parameters.properties|items %}\n {%- if loop.first %}\n {{- \" Args:\n\" }}\n {%- endif %}\n {{- \" \" + param_name + \"(\" + json_to_python_type(param_fields) + \"): \" + param_fields.description|trim }}\n {%- endfor %}\n {%- if tool.return is defined and tool.return.description is defined %}\n {{- \"\n Returns:\n \" + tool.return.description }}\n {%- endif %}\n {{- '\"' }}\n {{- ', \"parameters\": ' }}\n {%- if tool.parameters.properties | length == 0 %}\n {{- \"{}\" }}\n {%- else %}\n {{- tool.parameters|tojson }}\n {%- endif %}\n {{- \"}\" }}\n {%- if not loop.last %}\n {{- \"\n\" }}\n {%- endif %}\n {%- endfor %}\n {{- \" </tools>\" }}\n {{- 'Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}\n' }}\n {{- \"For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:\n\" }}\n {{- \"<tool_call>\n\" }}\n {{- '{\"name\": <function-name>, \"arguments\": <args-json-object>}\n' }}\n {{- '</tool_call><|im_end|>\n' }}\n{%- else %}\n {%- if messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}\n {%- else %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n {%- endif %}\n{%- 
endif %}\n{%- for message in messages %}\n {%- if message.role == \"user\" or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and message.tool_calls is not defined) %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\n<tool_call>\n' }}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '\"}' }}\n {{- '\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if not message.name is defined %}\n {{- raise_exception(\"Tool response dicts require a 'name' key indicating the name of the called function!\") }}\n {%- endif %}\n {{- '<|im_start|>user\n<tool_response>\n' }}\n {{- '{\"name\": \"' }}\n {{- message.name }}\n {{- '\", \"content\": ' }}\n {{- message.content|tojson + '}' }}\n {{- '\n</tool_response><|im_end|>\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}",
- "stop_token_ids": [
- 151643,
- 151644,
- 151645
- ],
- "stop": [
- "<|endoftext|>",
- "<|im_start|>",
- "<|im_end|>"
- ]
- }
- ]
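
The removed spec uses the model_file_name_template convention to map a chosen quantization onto a concrete GGUF file name. A minimal illustration with plain string formatting (not an Xinference API), using the template and a quantization value taken from the spec above:

# Substituting a quantization from the spec into its file-name template.
template = "qwen2-0_5b-instruct-{quantization}.gguf"
print(template.format(quantization="q4_k_m"))  # -> qwen2-0_5b-instruct-q4_k_m.gguf
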