xinference 1.5.0.post2__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (137)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +107 -11
  3. xinference/client/restful/restful_client.py +51 -11
  4. xinference/constants.py +5 -1
  5. xinference/core/media_interface.py +758 -0
  6. xinference/core/model.py +49 -9
  7. xinference/core/supervisor.py +1 -1
  8. xinference/core/utils.py +1 -1
  9. xinference/core/worker.py +33 -39
  10. xinference/deploy/cmdline.py +17 -0
  11. xinference/deploy/utils.py +0 -3
  12. xinference/model/audio/__init__.py +16 -27
  13. xinference/model/audio/core.py +2 -1
  14. xinference/model/audio/cosyvoice.py +4 -2
  15. xinference/model/audio/model_spec.json +63 -46
  16. xinference/model/audio/model_spec_modelscope.json +31 -14
  17. xinference/model/embedding/__init__.py +16 -24
  18. xinference/model/image/__init__.py +15 -25
  19. xinference/model/llm/__init__.py +40 -115
  20. xinference/model/llm/core.py +29 -6
  21. xinference/model/llm/llama_cpp/core.py +30 -347
  22. xinference/model/llm/llm_family.json +1674 -2203
  23. xinference/model/llm/llm_family.py +71 -7
  24. xinference/model/llm/llm_family_csghub.json +0 -32
  25. xinference/model/llm/llm_family_modelscope.json +1838 -2016
  26. xinference/model/llm/llm_family_openmind_hub.json +19 -325
  27. xinference/model/llm/lmdeploy/core.py +7 -2
  28. xinference/model/llm/mlx/core.py +23 -7
  29. xinference/model/llm/reasoning_parser.py +281 -5
  30. xinference/model/llm/sglang/core.py +39 -11
  31. xinference/model/llm/transformers/chatglm.py +9 -2
  32. xinference/model/llm/transformers/cogagent.py +10 -12
  33. xinference/model/llm/transformers/cogvlm2.py +6 -3
  34. xinference/model/llm/transformers/cogvlm2_video.py +3 -6
  35. xinference/model/llm/transformers/core.py +58 -60
  36. xinference/model/llm/transformers/deepseek_v2.py +4 -2
  37. xinference/model/llm/transformers/deepseek_vl.py +10 -4
  38. xinference/model/llm/transformers/deepseek_vl2.py +9 -4
  39. xinference/model/llm/transformers/gemma3.py +4 -5
  40. xinference/model/llm/transformers/glm4v.py +3 -21
  41. xinference/model/llm/transformers/glm_edge_v.py +3 -20
  42. xinference/model/llm/transformers/intern_vl.py +3 -6
  43. xinference/model/llm/transformers/internlm2.py +1 -1
  44. xinference/model/llm/transformers/minicpmv25.py +4 -2
  45. xinference/model/llm/transformers/minicpmv26.py +5 -3
  46. xinference/model/llm/transformers/omnilmm.py +1 -1
  47. xinference/model/llm/transformers/opt.py +1 -1
  48. xinference/model/llm/transformers/ovis2.py +302 -0
  49. xinference/model/llm/transformers/qwen-omni.py +8 -1
  50. xinference/model/llm/transformers/qwen2_audio.py +3 -1
  51. xinference/model/llm/transformers/qwen2_vl.py +5 -1
  52. xinference/model/llm/transformers/qwen_vl.py +5 -2
  53. xinference/model/llm/utils.py +96 -45
  54. xinference/model/llm/vllm/core.py +108 -24
  55. xinference/model/llm/vllm/distributed_executor.py +8 -7
  56. xinference/model/llm/vllm/xavier/allocator.py +1 -1
  57. xinference/model/llm/vllm/xavier/block_manager.py +1 -1
  58. xinference/model/llm/vllm/xavier/block_tracker.py +3 -3
  59. xinference/model/llm/vllm/xavier/executor.py +1 -1
  60. xinference/model/llm/vllm/xavier/test/test_xavier.py +2 -11
  61. xinference/model/rerank/__init__.py +13 -24
  62. xinference/model/video/__init__.py +15 -25
  63. xinference/model/video/core.py +3 -3
  64. xinference/model/video/diffusers.py +157 -13
  65. xinference/model/video/model_spec.json +100 -0
  66. xinference/model/video/model_spec_modelscope.json +104 -0
  67. xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
  68. xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
  69. xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
  70. xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
  71. xinference/thirdparty/cosyvoice/bin/train.py +7 -2
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
  74. xinference/thirdparty/cosyvoice/cli/model.py +140 -155
  75. xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
  76. xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
  77. xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
  78. xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
  79. xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
  80. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
  81. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
  84. xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
  85. xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
  86. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
  87. xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
  88. xinference/thirdparty/cosyvoice/utils/common.py +1 -1
  89. xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
  90. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
  91. xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
  92. xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
  93. xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
  94. xinference/types.py +2 -71
  95. xinference/web/ui/build/asset-manifest.json +6 -6
  96. xinference/web/ui/build/index.html +1 -1
  97. xinference/web/ui/build/static/css/{main.0f6523be.css → main.337afe76.css} +2 -2
  98. xinference/web/ui/build/static/css/main.337afe76.css.map +1 -0
  99. xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
  100. xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
  101. xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
  102. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
  103. xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
  104. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
  105. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
  106. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +1 -0
  107. xinference/web/ui/node_modules/.cache/babel-loader/6798e126f3bc5f95a4c16a9c2ad52ffe77970c62406d83e20604dfda7ffd2247.json +1 -0
  108. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
  109. xinference/web/ui/node_modules/.cache/babel-loader/b617f7d21a95045fc57b26a9373551740f1978a826134cbf705c3a1bf8714a93.json +1 -0
  110. xinference/web/ui/node_modules/.cache/babel-loader/c1506cb142151366074975f30fa1ff9cd6e5e978b62a4b074dfc16fe08d70d75.json +1 -0
  111. xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +1 -0
  112. xinference/web/ui/src/locales/en.json +7 -4
  113. xinference/web/ui/src/locales/zh.json +7 -4
  114. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/METADATA +56 -36
  115. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/RECORD +120 -121
  116. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/WHEEL +1 -1
  117. xinference/core/image_interface.py +0 -377
  118. xinference/model/llm/transformers/compression.py +0 -258
  119. xinference/model/llm/transformers/yi_vl.py +0 -239
  120. xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
  121. xinference/web/ui/build/static/css/main.0f6523be.css.map +0 -1
  122. xinference/web/ui/build/static/js/main.4b67a723.js +0 -3
  123. xinference/web/ui/build/static/js/main.4b67a723.js.map +0 -1
  124. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
  125. xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +0 -1
  126. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +0 -1
  129. xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +0 -1
  130. xinference/web/ui/node_modules/.cache/babel-loader/e4ba658c6b3b0490910acdae0c535a892257efb61539a24adf8038fc653bd22f.json +0 -1
  131. xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +0 -1
  132. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
  133. xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +0 -1
  134. /xinference/web/ui/build/static/js/{main.4b67a723.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
  135. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/entry_points.txt +0 -0
  136. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/licenses/LICENSE +0 -0
  137. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/top_level.txt +0 -0

xinference/model/llm/llama_cpp/core.py

@@ -12,32 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import concurrent.futures
+import importlib.util
 import logging
 import os
 import queue
-import time
-from typing import Dict, Iterator, List, Optional, Union
+from typing import Iterator, List, Optional, Union

 import orjson

-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    Completion,
-    CompletionChunk,
-    CompletionUsage,
-    CreateCompletionLlamaCpp,
-    LlamaCppGenerateConfig,
-    LlamaCppModelConfig,
-)
+from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, ChatModelMixin
+from ..utils import ChatModelMixin

 logger = logging.getLogger(__name__)

-USE_XLLAMACPP = bool(int(os.environ.get("USE_XLLAMACPP", 1)))
-

 class _Done:
     pass
@@ -56,21 +45,16 @@ class XllamaCppModel(LLM, ChatModelMixin):
         model_spec: "LLMSpecV1",
         quantization: str,
         model_path: str,
-        llamacpp_model_config: Optional[LlamaCppModelConfig] = None,
+        llamacpp_model_config: Optional[dict] = None,
     ):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
-
-        self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
-            llamacpp_model_config
-        )
+        self._llamacpp_model_config = self._sanitize_model_config(llamacpp_model_config)
         self._llm = None
         self._executor: Optional[concurrent.futures.ThreadPoolExecutor] = None

-    def _sanitize_model_config(
-        self, llamacpp_model_config: Optional[LlamaCppModelConfig]
-    ) -> LlamaCppModelConfig:
+    def _sanitize_model_config(self, llamacpp_model_config: Optional[dict]) -> dict:
         if llamacpp_model_config is None:
-            llamacpp_model_config = LlamaCppModelConfig()
+            llamacpp_model_config = {}

         if self.model_family.context_length:
             llamacpp_model_config.setdefault("n_ctx", self.model_family.context_length)
@@ -92,31 +76,12 @@ class XllamaCppModel(LLM, ChatModelMixin):

         return llamacpp_model_config

-    def _sanitize_generate_config(
-        self, generate_config: Optional[LlamaCppGenerateConfig]
-    ) -> LlamaCppGenerateConfig:
-        if generate_config is None:
-            generate_config = LlamaCppGenerateConfig(
-                **CreateCompletionLlamaCpp().dict()
-            )
-        else:
-            from llama_cpp import LlamaGrammar
-
-            grammar = generate_config.get("grammar")
-            if grammar is not None and not isinstance(grammar, LlamaGrammar):
-                generate_config["grammar"] = LlamaGrammar.from_string(
-                    generate_config["grammar"]
-                )
-            # Validate generate_config and fill default values to the generate config.
-            generate_config = LlamaCppGenerateConfig(
-                **CreateCompletionLlamaCpp(**generate_config).dict()
-            )
-        # Currently, llama.cpp does not support lora
-        generate_config.pop("lora_name", None)  # type: ignore
-        return generate_config
+    @classmethod
+    def check_lib(cls) -> bool:
+        return importlib.util.find_spec("xllamacpp") is not None

     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["ggufv2"]:
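
Note: the new check_lib classmethod relies on importlib.util.find_spec, which reports whether a module is importable without importing it. A minimal standalone sketch of that kind of availability probe (has_module is an illustrative helper, not an xinference API):

import importlib.util


def has_module(name: str) -> bool:
    """True if `name` can be imported, without actually importing it."""
    return importlib.util.find_spec(name) is not None


# e.g. enable a backend only if its package is installed
if has_module("xllamacpp"):
    print("xllamacpp is available")
else:
    print("pip install xllamacpp to enable this backend")
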
@@ -138,7 +103,10 @@ class XllamaCppModel(LLM, ChatModelMixin):
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

         reasoning_content = self._llamacpp_model_config.pop("reasoning_content")
-        self.prepare_parse_reasoning_content(reasoning_content)
+        enable_thinking = self._llamacpp_model_config.pop("enable_thinking", True)
+        self.prepare_parse_reasoning_content(
+            reasoning_content, enable_thinking=enable_thinking
+        )

         if os.path.isfile(self.model_path):
             # mostly passed from --model_path
@@ -147,7 +115,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
             # handle legacy cache.
             if (
                 self.model_spec.model_file_name_split_template
-                and self.model_spec.quantization_parts
+                and self.quantization in self.model_spec.quantization_parts
             ):
                 part = self.model_spec.quantization_parts[self.quantization]
                 model_path = os.path.join(
@@ -180,7 +148,14 @@ class XllamaCppModel(LLM, ChatModelMixin):
         params.n_parallel = os.cpu_count()
         for k, v in self._llamacpp_model_config.items():
             try:
-                setattr(params, k, v)
+                if "." in k:
+                    parts = k.split(".")
+                    sub_param = params
+                    for p in parts[:-1]:
+                        sub_param = getattr(sub_param, p)
+                    setattr(sub_param, parts[-1], v)
+                else:
+                    setattr(params, k, v)
             except Exception as e:
                 logger.error("Failed to set the param %s = %s, error: %s", k, v, e)
         n_threads = self._llamacpp_model_config.get("n_threads", os.cpu_count())
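
Note: the hunk above lets model-config keys use dotted paths so that attributes on nested parameter objects can be set. A standalone sketch of that dotted-key handling; the _ServerParams/_SamplerParams classes are made up for illustration and stand in for xllamacpp's real parameter objects:

class _SamplerParams:
    def __init__(self):
        self.temp = 0.8
        self.top_k = 40


class _ServerParams:
    def __init__(self):
        self.n_ctx = 4096
        self.sampler = _SamplerParams()


def apply_config(params, config: dict) -> None:
    """Walk dotted keys (e.g. 'sampler.top_k') down to the nested attribute, then set it."""
    for k, v in config.items():
        if "." in k:
            parts = k.split(".")
            sub = params
            for p in parts[:-1]:
                sub = getattr(sub, p)
            setattr(sub, parts[-1], v)
        else:
            setattr(params, k, v)


params = _ServerParams()
apply_config(params, {"n_ctx": 8192, "sampler.top_k": 20})
assert params.n_ctx == 8192 and params.sampler.top_k == 20
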
@@ -198,14 +173,13 @@ class XllamaCppModel(LLM, ChatModelMixin):
             raise RuntimeError(f"Load model {self.model_family.model_name} failed")

     def generate(
-        self, prompt: str, generate_config: Optional[LlamaCppGenerateConfig] = None
+        self, prompt: str, generate_config: Optional[dict] = None
     ) -> Union[Completion, Iterator[CompletionChunk]]:
-        generate_config = self._sanitize_generate_config(generate_config)
+        generate_config = generate_config or {}
         stream = generate_config.get("stream", False)
         q: queue.Queue = queue.Queue()

         def _handle_completion():
-            # TODO(fyrestone): Replace the LlamaCppGenerateConfig with OpenAI params.
             data = generate_config
             data.pop("stopping_criteria", None)
             data.pop("logits_processor", None)
@@ -260,16 +234,15 @@ class XllamaCppModel(LLM, ChatModelMixin):

     def chat(
         self,
-        messages: List[Dict],
-        generate_config: Optional[LlamaCppGenerateConfig] = None,
+        messages: List[dict],
+        generate_config: Optional[dict] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        generate_config = self._sanitize_generate_config(generate_config)
+        generate_config = generate_config or {}
         stream = generate_config.get("stream", False)
         tools = generate_config.pop("tools", []) if generate_config else None
         q: queue.Queue = queue.Queue()

         def _handle_chat_completion():
-            # TODO(fyrestone): Replace the LlamaCppGenerateConfig with OpenAI params.
             data = generate_config
             data.pop("stopping_criteria", None)
             data.pop("logits_processor", None)
@@ -331,293 +304,3 @@ class XllamaCppModel(LLM, ChatModelMixin):
             if type(r) is _Error:
                 raise Exception("Got error in chat: %s", r.msg)
             return self._to_chat_completion(r, self.reasoning_parser)
-
-
-class LlamaCppModel(LLM):
-    def __init__(
-        self,
-        model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
-        model_path: str,
-        llamacpp_model_config: Optional[LlamaCppModelConfig] = None,
-    ):
-        super().__init__(model_uid, model_family, model_spec, quantization, model_path)
-
-        self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
-            llamacpp_model_config
-        )
-        self._llm = None
-
-    def _can_apply_cublas(self):
-        # TODO: figure out the quantizations supported.
-        return True
-
-    def _sanitize_model_config(
-        self, llamacpp_model_config: Optional[LlamaCppModelConfig]
-    ) -> LlamaCppModelConfig:
-        if llamacpp_model_config is None:
-            llamacpp_model_config = LlamaCppModelConfig()
-
-        if self.model_family.context_length:
-            llamacpp_model_config.setdefault("n_ctx", self.model_family.context_length)
-        llamacpp_model_config.setdefault("use_mmap", False)
-        llamacpp_model_config.setdefault("use_mlock", True)
-
-        if (
-            "llama-2" in self.model_family.model_name
-            and self.model_spec.model_size_in_billions == 70
-        ):
-            llamacpp_model_config["use_mlock"] = False
-            llamacpp_model_config["n_gqa"] = 8
-
-        if self._is_darwin_and_apple_silicon():
-            llamacpp_model_config.setdefault("n_gpu_layers", -1)
-        elif self._is_linux() and self._can_apply_cublas():
-            llamacpp_model_config.setdefault("n_gpu_layers", -1)
-        llamacpp_model_config.setdefault("reasoning_content", False)
-
-        return llamacpp_model_config
-
-    def _sanitize_generate_config(
-        self, generate_config: Optional[LlamaCppGenerateConfig]
-    ) -> LlamaCppGenerateConfig:
-        if generate_config is None:
-            generate_config = LlamaCppGenerateConfig(
-                **CreateCompletionLlamaCpp().dict()
-            )
-        else:
-            from llama_cpp import LlamaGrammar
-
-            grammar = generate_config.get("grammar")
-            if grammar is not None and not isinstance(grammar, LlamaGrammar):
-                generate_config["grammar"] = LlamaGrammar.from_string(
-                    generate_config["grammar"]
-                )
-            # Validate generate_config and fill default values to the generate config.
-            generate_config = LlamaCppGenerateConfig(
-                **CreateCompletionLlamaCpp(**generate_config).dict()
-            )
-        # Currently, llama.cpp does not support lora
-        generate_config.pop("lora_name", None)  # type: ignore
-        return generate_config
-
-    def load(self):
-        try:
-            import llama_cpp
-            from llama_cpp import Llama
-
-            if llama_cpp.__version__ < "0.2.0":
-                raise ValueError(
-                    "The llama_cpp version must be greater than 0.2.0. "
-                    "Please upgrade your version via `pip install -U llama_cpp` or refer to "
-                    "https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal."
-                )
-        except ImportError:
-            error_message = "Failed to import module 'llama_cpp'"
-            installation_guide = [
-                "Please make sure 'llama_cpp' is installed. ",
-                "You can install it by visiting the installation section of the git repo:\n",
-                "https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal",
-            ]
-
-            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
-        reasoning_content = self._llamacpp_model_config.pop("reasoning_content")
-        self.prepare_parse_reasoning_content(reasoning_content)
-
-        if os.path.isfile(self.model_path):
-            # mostly passed from --model_path
-            model_path = self.model_path
-        else:
-            # handle legacy cache.
-            if (
-                self.model_spec.model_file_name_split_template
-                and self.model_spec.quantization_parts
-            ):
-                part = self.model_spec.quantization_parts[self.quantization]
-                model_path = os.path.join(
-                    self.model_path,
-                    self.model_spec.model_file_name_split_template.format(
-                        quantization=self.quantization, part=part[0]
-                    ),
-                )
-            else:
-                model_path = os.path.join(
-                    self.model_path,
-                    self.model_spec.model_file_name_template.format(
-                        quantization=self.quantization
-                    ),
-                )
-                legacy_model_file_path = os.path.join(self.model_path, "model.bin")
-                if os.path.exists(legacy_model_file_path):
-                    model_path = legacy_model_file_path
-
-        try:
-            self._llm = Llama(
-                model_path=model_path,
-                verbose=True,
-                **self._llamacpp_model_config,
-            )
-        except AssertionError:
-            raise RuntimeError(f"Load model {self.model_family.model_name} failed")
-
-    @classmethod
-    def match(
-        cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
-    ) -> bool:
-        if llm_spec.model_format not in ["ggufv2"]:
-            return False
-        if "qwen" in llm_family.model_name:
-            return False
-        if "generate" not in llm_family.model_ability:
-            return False
-        return True
-
-    def generate(
-        self, prompt: str, generate_config: Optional[LlamaCppGenerateConfig] = None
-    ) -> Union[Completion, Iterator[CompletionChunk]]:
-        def generator_wrapper(
-            _prompt: str,
-            _generate_config: LlamaCppGenerateConfig,
-        ) -> Iterator[CompletionChunk]:
-            assert self._llm is not None
-            prompt_token_ids: List[int] = (
-                (
-                    self._llm.tokenize(prompt.encode("utf-8"), special=True)
-                    if prompt != ""
-                    else [self._llm.token_bos()]
-                )
-                if isinstance(prompt, str)
-                else prompt
-            )
-            prompt_tokens = len(prompt_token_ids)
-            completion_tokens, total_tokens = 0, 0
-            request_id = 0
-            for index, _completion_chunk in enumerate(
-                self._llm(prompt=_prompt, **_generate_config)
-            ):
-                _completion_chunk["model"] = self.model_uid
-                request_id = _completion_chunk["id"]
-                completion_tokens = index + 1
-                total_tokens = prompt_tokens + completion_tokens
-                _completion_chunk["usage"] = CompletionUsage(
-                    prompt_tokens=prompt_tokens,
-                    completion_tokens=completion_tokens,
-                    total_tokens=total_tokens,
-                )
-                yield _completion_chunk
-            if include_usage:
-                chunk = CompletionChunk(
-                    id=request_id,
-                    object="text_completion",
-                    created=int(time.time()),
-                    model=self.model_uid,
-                    choices=[],
-                )
-                chunk["usage"] = CompletionUsage(
-                    prompt_tokens=prompt_tokens,
-                    completion_tokens=completion_tokens,
-                    total_tokens=total_tokens,
-                )
-                yield chunk
-
-        logger.debug(
-            "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
-        )
-
-        generate_config = self._sanitize_generate_config(generate_config)
-        stream = generate_config.get("stream", False)
-        stream_options = generate_config.pop("stream_options", None)
-        include_usage = (
-            stream_options["include_usage"]
-            if isinstance(stream_options, dict)
-            else False
-        )
-
-        if not stream:
-            assert self._llm is not None
-            completion = self._llm(prompt=prompt, **generate_config)
-
-            return completion
-        else:
-            return generator_wrapper(prompt, generate_config)
-
-
-class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
-    def __init__(
-        self,
-        model_uid: str,
-        model_family: "LLMFamilyV1",
-        model_spec: "LLMSpecV1",
-        quantization: str,
-        model_path: str,
-        llamacpp_model_config: Optional[LlamaCppModelConfig] = None,
-    ):
-        super().__init__(
-            model_uid,
-            model_family,
-            model_spec,
-            quantization,
-            model_path,
-            llamacpp_model_config,
-        )
-
-    @classmethod
-    def match(
-        cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
-    ) -> bool:
-        if llm_spec.model_format not in ["ggufv2"]:
-            return False
-        if "chat" not in llm_family.model_ability:
-            return False
-        return True
-
-    def _sanitize_generate_config(
-        self, generate_config: Optional[LlamaCppGenerateConfig]
-    ) -> LlamaCppGenerateConfig:
-        generate_config = super()._sanitize_generate_config(generate_config)
-        if self.model_family.stop and self.model_family.stop:
-            generate_config["stop"] = self.model_family.stop.copy()
-        return generate_config
-
-    def chat(
-        self,
-        messages: List[Dict],
-        generate_config: Optional[LlamaCppGenerateConfig] = None,
-    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        model_family = self.model_family.model_family or self.model_family.model_name
-        tools = generate_config.pop("tools", []) if generate_config else None
-        full_context_kwargs = {}
-        if tools:
-            if (
-                model_family in QWEN_TOOL_CALL_FAMILY
-                or model_family in DEEPSEEK_TOOL_CALL_FAMILY
-            ):
-                full_context_kwargs["tools"] = tools
-        assert self.model_family.chat_template is not None
-        full_prompt = self.get_full_context(
-            messages, self.model_family.chat_template, **full_context_kwargs
-        )
-
-        generate_config = self._sanitize_generate_config(generate_config)
-
-        stream = generate_config.get("stream", False)
-        if stream:
-            it = self.generate(full_prompt, generate_config)
-            assert isinstance(it, Iterator)
-            return self._to_chat_completion_chunks(it, self.reasoning_parser)
-        else:
-            c = self.generate(full_prompt, generate_config)
-            assert not isinstance(c, Iterator)
-            if tools:
-                return self._post_process_completion(
-                    self.model_family, self.model_uid, c, self.reasoning_parser
-                )
-            return self._to_chat_completion(c, self.reasoning_parser)
-
-
-if USE_XLLAMACPP:
-    LlamaCppModel = XllamaCppModel  # type: ignore  # noqa: F811
-    LlamaCppChatModel = XllamaCppModel  # type: ignore  # noqa: F811