xinference 1.5.0.post2__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +107 -11
- xinference/client/restful/restful_client.py +51 -11
- xinference/constants.py +5 -1
- xinference/core/media_interface.py +758 -0
- xinference/core/model.py +49 -9
- xinference/core/supervisor.py +1 -1
- xinference/core/utils.py +1 -1
- xinference/core/worker.py +33 -39
- xinference/deploy/cmdline.py +17 -0
- xinference/deploy/utils.py +0 -3
- xinference/model/audio/__init__.py +16 -27
- xinference/model/audio/core.py +2 -1
- xinference/model/audio/cosyvoice.py +4 -2
- xinference/model/audio/model_spec.json +63 -46
- xinference/model/audio/model_spec_modelscope.json +31 -14
- xinference/model/embedding/__init__.py +16 -24
- xinference/model/image/__init__.py +15 -25
- xinference/model/llm/__init__.py +40 -115
- xinference/model/llm/core.py +29 -6
- xinference/model/llm/llama_cpp/core.py +30 -347
- xinference/model/llm/llm_family.json +1674 -2203
- xinference/model/llm/llm_family.py +71 -7
- xinference/model/llm/llm_family_csghub.json +0 -32
- xinference/model/llm/llm_family_modelscope.json +1838 -2016
- xinference/model/llm/llm_family_openmind_hub.json +19 -325
- xinference/model/llm/lmdeploy/core.py +7 -2
- xinference/model/llm/mlx/core.py +23 -7
- xinference/model/llm/reasoning_parser.py +281 -5
- xinference/model/llm/sglang/core.py +39 -11
- xinference/model/llm/transformers/chatglm.py +9 -2
- xinference/model/llm/transformers/cogagent.py +10 -12
- xinference/model/llm/transformers/cogvlm2.py +6 -3
- xinference/model/llm/transformers/cogvlm2_video.py +3 -6
- xinference/model/llm/transformers/core.py +58 -60
- xinference/model/llm/transformers/deepseek_v2.py +4 -2
- xinference/model/llm/transformers/deepseek_vl.py +10 -4
- xinference/model/llm/transformers/deepseek_vl2.py +9 -4
- xinference/model/llm/transformers/gemma3.py +4 -5
- xinference/model/llm/transformers/glm4v.py +3 -21
- xinference/model/llm/transformers/glm_edge_v.py +3 -20
- xinference/model/llm/transformers/intern_vl.py +3 -6
- xinference/model/llm/transformers/internlm2.py +1 -1
- xinference/model/llm/transformers/minicpmv25.py +4 -2
- xinference/model/llm/transformers/minicpmv26.py +5 -3
- xinference/model/llm/transformers/omnilmm.py +1 -1
- xinference/model/llm/transformers/opt.py +1 -1
- xinference/model/llm/transformers/ovis2.py +302 -0
- xinference/model/llm/transformers/qwen-omni.py +8 -1
- xinference/model/llm/transformers/qwen2_audio.py +3 -1
- xinference/model/llm/transformers/qwen2_vl.py +5 -1
- xinference/model/llm/transformers/qwen_vl.py +5 -2
- xinference/model/llm/utils.py +96 -45
- xinference/model/llm/vllm/core.py +108 -24
- xinference/model/llm/vllm/distributed_executor.py +8 -7
- xinference/model/llm/vllm/xavier/allocator.py +1 -1
- xinference/model/llm/vllm/xavier/block_manager.py +1 -1
- xinference/model/llm/vllm/xavier/block_tracker.py +3 -3
- xinference/model/llm/vllm/xavier/executor.py +1 -1
- xinference/model/llm/vllm/xavier/test/test_xavier.py +2 -11
- xinference/model/rerank/__init__.py +13 -24
- xinference/model/video/__init__.py +15 -25
- xinference/model/video/core.py +3 -3
- xinference/model/video/diffusers.py +157 -13
- xinference/model/video/model_spec.json +100 -0
- xinference/model/video/model_spec_modelscope.json +104 -0
- xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
- xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
- xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
- xinference/thirdparty/cosyvoice/bin/train.py +7 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
- xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
- xinference/thirdparty/cosyvoice/cli/model.py +140 -155
- xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
- xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
- xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
- xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
- xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
- xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
- xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
- xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
- xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
- xinference/thirdparty/cosyvoice/utils/common.py +1 -1
- xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
- xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
- xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
- xinference/types.py +2 -71
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/{main.0f6523be.css → main.337afe76.css} +2 -2
- xinference/web/ui/build/static/css/main.337afe76.css.map +1 -0
- xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
- xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6798e126f3bc5f95a4c16a9c2ad52ffe77970c62406d83e20604dfda7ffd2247.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b617f7d21a95045fc57b26a9373551740f1978a826134cbf705c3a1bf8714a93.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c1506cb142151366074975f30fa1ff9cd6e5e978b62a4b074dfc16fe08d70d75.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +1 -0
- xinference/web/ui/src/locales/en.json +7 -4
- xinference/web/ui/src/locales/zh.json +7 -4
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/METADATA +56 -36
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/RECORD +120 -121
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/WHEEL +1 -1
- xinference/core/image_interface.py +0 -377
- xinference/model/llm/transformers/compression.py +0 -258
- xinference/model/llm/transformers/yi_vl.py +0 -239
- xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
- xinference/web/ui/build/static/css/main.0f6523be.css.map +0 -1
- xinference/web/ui/build/static/js/main.4b67a723.js +0 -3
- xinference/web/ui/build/static/js/main.4b67a723.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e4ba658c6b3b0490910acdae0c535a892257efb61539a24adf8038fc653bd22f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +0 -1
- /xinference/web/ui/build/static/js/{main.4b67a723.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/llama_cpp/core.py +30 -347

@@ -12,32 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import concurrent.futures
+import importlib.util
 import logging
 import os
 import queue
-import time
-from typing import Dict, Iterator, List, Optional, Union
+from typing import Iterator, List, Optional, Union

 import orjson

-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    Completion,
-    CompletionChunk,
-    CompletionUsage,
-    CreateCompletionLlamaCpp,
-    LlamaCppGenerateConfig,
-    LlamaCppModelConfig,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, ChatModelMixin
+from ..utils import ChatModelMixin

 logger = logging.getLogger(__name__)

-USE_XLLAMACPP = bool(int(os.environ.get("USE_XLLAMACPP", 1)))
-

 class _Done:
     pass
@@ -56,21 +45,16 @@ class XllamaCppModel(LLM, ChatModelMixin):
         model_spec: "LLMSpecV1",
         quantization: str,
         model_path: str,
-        llamacpp_model_config: Optional[LlamaCppModelConfig] = None,
+        llamacpp_model_config: Optional[dict] = None,
     ):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
-
-        self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
-            llamacpp_model_config
-        )
+        self._llamacpp_model_config = self._sanitize_model_config(llamacpp_model_config)
         self._llm = None
         self._executor: Optional[concurrent.futures.ThreadPoolExecutor] = None

-    def _sanitize_model_config(
-        self, llamacpp_model_config: Optional[LlamaCppModelConfig]
-    ) -> LlamaCppModelConfig:
+    def _sanitize_model_config(self, llamacpp_model_config: Optional[dict]) -> dict:
         if llamacpp_model_config is None:
-            llamacpp_model_config = LlamaCppModelConfig()
+            llamacpp_model_config = {}

         if self.model_family.context_length:
             llamacpp_model_config.setdefault("n_ctx", self.model_family.context_length)
@@ -92,31 +76,12 @@ class XllamaCppModel(LLM, ChatModelMixin):

         return llamacpp_model_config

-    def _sanitize_generate_config(
-        self, generate_config: Optional[LlamaCppGenerateConfig]
-    ) -> LlamaCppGenerateConfig:
-        if generate_config is None:
-            generate_config = LlamaCppGenerateConfig(
-                **CreateCompletionLlamaCpp().dict()
-            )
-        else:
-            from llama_cpp import LlamaGrammar
-
-            grammar = generate_config.get("grammar")
-            if grammar is not None and not isinstance(grammar, LlamaGrammar):
-                generate_config["grammar"] = LlamaGrammar.from_string(
-                    generate_config["grammar"]
-                )
-        # Validate generate_config and fill default values to the generate config.
-        generate_config = LlamaCppGenerateConfig(
-            **CreateCompletionLlamaCpp(**generate_config).dict()
-        )
-        # Currently, llama.cpp does not support lora
-        generate_config.pop("lora_name", None)  # type: ignore
-        return generate_config
+    @classmethod
+    def check_lib(cls) -> bool:
+        return importlib.util.find_spec("xllamacpp") is not None

     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["ggufv2"]:
@@ -138,7 +103,10 @@ class XllamaCppModel(LLM, ChatModelMixin):
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

         reasoning_content = self._llamacpp_model_config.pop("reasoning_content")
-        self.prepare_parse_reasoning_content(reasoning_content)
+        enable_thinking = self._llamacpp_model_config.pop("enable_thinking", True)
+        self.prepare_parse_reasoning_content(
+            reasoning_content, enable_thinking=enable_thinking
+        )

         if os.path.isfile(self.model_path):
             # mostly passed from --model_path
@@ -147,7 +115,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
             # handle legacy cache.
             if (
                 self.model_spec.model_file_name_split_template
-                and self.model_spec.quantization_parts
+                and self.quantization in self.model_spec.quantization_parts
             ):
                 part = self.model_spec.quantization_parts[self.quantization]
                 model_path = os.path.join(
@@ -180,7 +148,14 @@ class XllamaCppModel(LLM, ChatModelMixin):
         params.n_parallel = os.cpu_count()
         for k, v in self._llamacpp_model_config.items():
             try:
-                setattr(params, k, v)
+                if "." in k:
+                    parts = k.split(".")
+                    sub_param = params
+                    for p in parts[:-1]:
+                        sub_param = getattr(sub_param, p)
+                    setattr(sub_param, parts[-1], v)
+                else:
+                    setattr(params, k, v)
             except Exception as e:
                 logger.error("Failed to set the param %s = %s, error: %s", k, v, e)
         n_threads = self._llamacpp_model_config.get("n_threads", os.cpu_count())
@@ -198,14 +173,13 @@ class XllamaCppModel(LLM, ChatModelMixin):
             raise RuntimeError(f"Load model {self.model_family.model_name} failed")

     def generate(
-        self, prompt: str, generate_config: Optional[LlamaCppGenerateConfig] = None
+        self, prompt: str, generate_config: Optional[dict] = None
     ) -> Union[Completion, Iterator[CompletionChunk]]:
-        generate_config = self._sanitize_generate_config(generate_config)
+        generate_config = generate_config or {}
         stream = generate_config.get("stream", False)
         q: queue.Queue = queue.Queue()

         def _handle_completion():
-            # TODO(fyrestone): Replace the LlamaCppGenerateConfig with OpenAI params.
             data = generate_config
             data.pop("stopping_criteria", None)
             data.pop("logits_processor", None)
@@ -260,16 +234,15 @@ class XllamaCppModel(LLM, ChatModelMixin):

     def chat(
         self,
-        messages: List[Dict],
-        generate_config: Optional[LlamaCppGenerateConfig] = None,
+        messages: List[dict],
+        generate_config: Optional[dict] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        generate_config = self._sanitize_generate_config(generate_config)
+        generate_config = generate_config or {}
         stream = generate_config.get("stream", False)
         tools = generate_config.pop("tools", []) if generate_config else None
         q: queue.Queue = queue.Queue()

         def _handle_chat_completion():
-            # TODO(fyrestone): Replace the LlamaCppGenerateConfig with OpenAI params.
             data = generate_config
             data.pop("stopping_criteria", None)
             data.pop("logits_processor", None)
@@ -331,293 +304,3 @@ class XllamaCppModel(LLM, ChatModelMixin):
         if type(r) is _Error:
             raise Exception("Got error in chat: %s", r.msg)
         return self._to_chat_completion(r, self.reasoning_parser)
-
-
-class LlamaCppModel(LLM):
-    def __init__(
-        self,
-        model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
-        model_path: str,
-        llamacpp_model_config: Optional[LlamaCppModelConfig] = None,
-    ):
-        super().__init__(model_uid, model_family, model_spec, quantization, model_path)
-
-        self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
-            llamacpp_model_config
-        )
-        self._llm = None
-
-    def _can_apply_cublas(self):
-        # TODO: figure out the quantizations supported.
-        return True
-
-    def _sanitize_model_config(
-        self, llamacpp_model_config: Optional[LlamaCppModelConfig]
-    ) -> LlamaCppModelConfig:
-        if llamacpp_model_config is None:
-            llamacpp_model_config = LlamaCppModelConfig()
-
-        if self.model_family.context_length:
-            llamacpp_model_config.setdefault("n_ctx", self.model_family.context_length)
-        llamacpp_model_config.setdefault("use_mmap", False)
-        llamacpp_model_config.setdefault("use_mlock", True)
-
-        if (
-            "llama-2" in self.model_family.model_name
-            and self.model_spec.model_size_in_billions == 70
-        ):
-            llamacpp_model_config["use_mlock"] = False
-            llamacpp_model_config["n_gqa"] = 8
-
-        if self._is_darwin_and_apple_silicon():
-            llamacpp_model_config.setdefault("n_gpu_layers", -1)
-        elif self._is_linux() and self._can_apply_cublas():
-            llamacpp_model_config.setdefault("n_gpu_layers", -1)
-        llamacpp_model_config.setdefault("reasoning_content", False)
-
-        return llamacpp_model_config
-
-    def _sanitize_generate_config(
-        self, generate_config: Optional[LlamaCppGenerateConfig]
-    ) -> LlamaCppGenerateConfig:
-        if generate_config is None:
-            generate_config = LlamaCppGenerateConfig(
-                **CreateCompletionLlamaCpp().dict()
-            )
-        else:
-            from llama_cpp import LlamaGrammar
-
-            grammar = generate_config.get("grammar")
-            if grammar is not None and not isinstance(grammar, LlamaGrammar):
-                generate_config["grammar"] = LlamaGrammar.from_string(
-                    generate_config["grammar"]
-                )
-        # Validate generate_config and fill default values to the generate config.
-        generate_config = LlamaCppGenerateConfig(
-            **CreateCompletionLlamaCpp(**generate_config).dict()
-        )
-        # Currently, llama.cpp does not support lora
-        generate_config.pop("lora_name", None)  # type: ignore
-        return generate_config
-
-    def load(self):
-        try:
-            import llama_cpp
-            from llama_cpp import Llama
-
-            if llama_cpp.__version__ < "0.2.0":
-                raise ValueError(
-                    "The llama_cpp version must be greater than 0.2.0. "
-                    "Please upgrade your version via `pip install -U llama_cpp` or refer to "
-                    "https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal."
-                )
-        except ImportError:
-            error_message = "Failed to import module 'llama_cpp'"
-            installation_guide = [
-                "Please make sure 'llama_cpp' is installed. ",
-                "You can install it by visiting the installation section of the git repo:\n",
-                "https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal",
-            ]
-
-            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
-        reasoning_content = self._llamacpp_model_config.pop("reasoning_content")
-        self.prepare_parse_reasoning_content(reasoning_content)
-
-        if os.path.isfile(self.model_path):
-            # mostly passed from --model_path
-            model_path = self.model_path
-        else:
-            # handle legacy cache.
-            if (
-                self.model_spec.model_file_name_split_template
-                and self.model_spec.quantization_parts
-            ):
-                part = self.model_spec.quantization_parts[self.quantization]
-                model_path = os.path.join(
-                    self.model_path,
-                    self.model_spec.model_file_name_split_template.format(
-                        quantization=self.quantization, part=part[0]
-                    ),
-                )
-            else:
-                model_path = os.path.join(
-                    self.model_path,
-                    self.model_spec.model_file_name_template.format(
-                        quantization=self.quantization
-                    ),
-                )
-                legacy_model_file_path = os.path.join(self.model_path, "model.bin")
-                if os.path.exists(legacy_model_file_path):
-                    model_path = legacy_model_file_path
-
-        try:
-            self._llm = Llama(
-                model_path=model_path,
-                verbose=True,
-                **self._llamacpp_model_config,
-            )
-        except AssertionError:
-            raise RuntimeError(f"Load model {self.model_family.model_name} failed")
-
-    @classmethod
-    def match(
-        cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
-    ) -> bool:
-        if llm_spec.model_format not in ["ggufv2"]:
-            return False
-        if "qwen" in llm_family.model_name:
-            return False
-        if "generate" not in llm_family.model_ability:
-            return False
-        return True
-
-    def generate(
-        self, prompt: str, generate_config: Optional[LlamaCppGenerateConfig] = None
-    ) -> Union[Completion, Iterator[CompletionChunk]]:
-        def generator_wrapper(
-            _prompt: str,
-            _generate_config: LlamaCppGenerateConfig,
-        ) -> Iterator[CompletionChunk]:
-            assert self._llm is not None
-            prompt_token_ids: List[int] = (
-                (
-                    self._llm.tokenize(prompt.encode("utf-8"), special=True)
-                    if prompt != ""
-                    else [self._llm.token_bos()]
-                )
-                if isinstance(prompt, str)
-                else prompt
-            )
-            prompt_tokens = len(prompt_token_ids)
-            completion_tokens, total_tokens = 0, 0
-            request_id = 0
-            for index, _completion_chunk in enumerate(
-                self._llm(prompt=_prompt, **_generate_config)
-            ):
-                _completion_chunk["model"] = self.model_uid
-                request_id = _completion_chunk["id"]
-                completion_tokens = index + 1
-                total_tokens = prompt_tokens + completion_tokens
-                _completion_chunk["usage"] = CompletionUsage(
-                    prompt_tokens=prompt_tokens,
-                    completion_tokens=completion_tokens,
-                    total_tokens=total_tokens,
-                )
-                yield _completion_chunk
-            if include_usage:
-                chunk = CompletionChunk(
-                    id=request_id,
-                    object="text_completion",
-                    created=int(time.time()),
-                    model=self.model_uid,
-                    choices=[],
-                )
-                chunk["usage"] = CompletionUsage(
-                    prompt_tokens=prompt_tokens,
-                    completion_tokens=completion_tokens,
-                    total_tokens=total_tokens,
-                )
-                yield chunk
-
-        logger.debug(
-            "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
-        )
-
-        generate_config = self._sanitize_generate_config(generate_config)
-        stream = generate_config.get("stream", False)
-        stream_options = generate_config.pop("stream_options", None)
-        include_usage = (
-            stream_options["include_usage"]
-            if isinstance(stream_options, dict)
-            else False
-        )
-
-        if not stream:
-            assert self._llm is not None
-            completion = self._llm(prompt=prompt, **generate_config)
-
-            return completion
-        else:
-            return generator_wrapper(prompt, generate_config)
-
-
-class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
-    def __init__(
-        self,
-        model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
-        model_path: str,
-        llamacpp_model_config: Optional[LlamaCppModelConfig] = None,
-    ):
-        super().__init__(
-            model_uid,
-            model_family,
-            model_spec,
-            quantization,
-            model_path,
-            llamacpp_model_config,
-        )
-
-    @classmethod
-    def match(
-        cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
-    ) -> bool:
-        if llm_spec.model_format not in ["ggufv2"]:
-            return False
-        if "chat" not in llm_family.model_ability:
-            return False
-        return True
-
-    def _sanitize_generate_config(
-        self, generate_config: Optional[LlamaCppGenerateConfig]
-    ) -> LlamaCppGenerateConfig:
-        generate_config = super()._sanitize_generate_config(generate_config)
-        if self.model_family.stop and self.model_family.stop:
-            generate_config["stop"] = self.model_family.stop.copy()
-        return generate_config
-
-    def chat(
-        self,
-        messages: List[Dict],
-        generate_config: Optional[LlamaCppGenerateConfig] = None,
-    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        model_family = self.model_family.model_family or self.model_family.model_name
-        tools = generate_config.pop("tools", []) if generate_config else None
-        full_context_kwargs = {}
-        if tools:
-            if (
-                model_family in QWEN_TOOL_CALL_FAMILY
-                or model_family in DEEPSEEK_TOOL_CALL_FAMILY
-            ):
-                full_context_kwargs["tools"] = tools
-        assert self.model_family.chat_template is not None
-        full_prompt = self.get_full_context(
-            messages, self.model_family.chat_template, **full_context_kwargs
-        )
-
-        generate_config = self._sanitize_generate_config(generate_config)
-
-        stream = generate_config.get("stream", False)
-        if stream:
-            it = self.generate(full_prompt, generate_config)
-            assert isinstance(it, Iterator)
-            return self._to_chat_completion_chunks(it, self.reasoning_parser)
-        else:
-            c = self.generate(full_prompt, generate_config)
-            assert not isinstance(c, Iterator)
-            if tools:
-                return self._post_process_completion(
-                    self.model_family, self.model_uid, c, self.reasoning_parser
-                )
-            return self._to_chat_completion(c, self.reasoning_parser)
-
-
-if USE_XLLAMACPP:
-    LlamaCppModel = XllamaCppModel  # type: ignore  # noqa: F811
-    LlamaCppChatModel = XllamaCppModel  # type: ignore  # noqa: F811