xinference 1.3.0.post1__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (52)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +1 -0
  3. xinference/conftest.py +7 -0
  4. xinference/core/model.py +3 -1
  5. xinference/core/scheduler.py +3 -0
  6. xinference/core/worker.py +1 -1
  7. xinference/deploy/cmdline.py +0 -8
  8. xinference/model/embedding/core.py +12 -5
  9. xinference/model/llm/__init__.py +2 -1
  10. xinference/model/llm/core.py +13 -0
  11. xinference/model/llm/llama_cpp/core.py +260 -3
  12. xinference/model/llm/llm_family.json +306 -17
  13. xinference/model/llm/llm_family_modelscope.json +347 -28
  14. xinference/model/llm/mlx/core.py +15 -4
  15. xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +1 -1
  16. xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py +4 -5
  17. xinference/model/llm/sglang/core.py +7 -2
  18. xinference/model/llm/transformers/chatglm.py +4 -4
  19. xinference/model/llm/transformers/core.py +22 -5
  20. xinference/model/llm/transformers/intern_vl.py +2 -1
  21. xinference/model/llm/transformers/utils.py +1 -1
  22. xinference/model/llm/utils.py +103 -67
  23. xinference/model/llm/vllm/core.py +29 -42
  24. xinference/types.py +4 -0
  25. xinference/web/ui/build/asset-manifest.json +3 -3
  26. xinference/web/ui/build/index.html +1 -1
  27. xinference/web/ui/build/static/js/main.55b70cb7.js +3 -0
  28. xinference/web/ui/build/static/js/main.55b70cb7.js.map +1 -0
  29. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +1 -0
  30. xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +1 -0
  31. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +1 -0
  32. xinference/web/ui/node_modules/.cache/babel-loader/87a9b13f2466f375ae5c6e7c08b279cc38351d29710d7f7626bbb07a85262b79.json +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +1 -0
  36. xinference/web/ui/src/locales/en.json +9 -1
  37. xinference/web/ui/src/locales/zh.json +9 -1
  38. {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/METADATA +7 -3
  39. {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/RECORD +44 -43
  40. xinference/web/ui/build/static/js/main.ad42919c.js +0 -3
  41. xinference/web/ui/build/static/js/main.ad42919c.js.map +0 -1
  42. xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +0 -1
  43. xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +0 -1
  44. xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +0 -1
  45. xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +0 -1
  46. xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +0 -1
  47. xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +0 -1
  48. /xinference/web/ui/build/static/js/{main.ad42919c.js.LICENSE.txt → main.55b70cb7.js.LICENSE.txt} +0 -0
  49. {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/LICENSE +0 -0
  50. {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/WHEEL +0 -0
  51. {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/entry_points.txt +0 -0
  52. {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2025-02-22T00:10:55+0800",
+ "date": "2025-03-09T12:06:50+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "b2004d49ddeda17dc6404473b1f25f8769911e18",
- "version": "1.3.0.post1"
+ "full-revisionid": "5d6ec937ce2aca2511e9e0debc4c2ab06ca41f09",
+ "version": "1.3.1"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py CHANGED
@@ -1330,6 +1330,7 @@ class RESTfulAPI(CancelMixin):
             raise HTTPException(status_code=500, detail=str(e))
 
         try:
+            kwargs["model_uid"] = model_uid
             embedding = await model.create_embedding(body.input, **kwargs)
             return Response(embedding, media_type="application/json")
         except Exception as e:
xinference/conftest.py CHANGED
@@ -304,3 +304,10 @@ def setup_with_auth():
         os.remove(auth_file)
     except:
         pass
+
+
+@pytest.fixture
+def set_use_xllamacpp():
+    os.environ["USE_XLLAMACPP"] = "1"
+    yield
+    del os.environ["USE_XLLAMACPP"]
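
The new set_use_xllamacpp fixture toggles the same USE_XLLAMACPP environment variable that xinference/model/llm/llama_cpp/core.py reads at import time (see the llama_cpp hunks below). A minimal sketch of how the opt-in is expected to behave outside the test suite, assuming xinference 1.3.1 and the xllamacpp package are installed:

    import os

    # The flag is read once at import time, so it must be set before
    # xinference.model.llm.llama_cpp.core is imported (or before the server starts).
    os.environ["USE_XLLAMACPP"] = "1"

    from xinference.model.llm.llama_cpp.core import LlamaCppModel, XllamaCppModel

    # With the flag set, the module aliases LlamaCppModel/LlamaCppChatModel
    # to XllamaCppModel, so gguf models are served by the xllamacpp backend.
    assert LlamaCppModel is XllamaCppModel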
xinference/core/model.py CHANGED
@@ -231,6 +231,7 @@ class ModelActor(xo.StatelessActor, CancelMixin):
         driver_info: Optional[dict] = None,  # for model across workers
     ):
         super().__init__()
+        from ..model.llm.llama_cpp.core import XllamaCppModel
         from ..model.llm.lmdeploy.core import LMDeployModel
         from ..model.llm.sglang.core import SGLANGModel
         from ..model.llm.transformers.core import PytorchModel
@@ -251,7 +252,8 @@ class ModelActor(xo.StatelessActor, CancelMixin):
         self._lock = (
             None
             if isinstance(
-                self._model, (PytorchModel, VLLMModel, SGLANGModel, LMDeployModel)
+                self._model,
+                (PytorchModel, VLLMModel, SGLANGModel, LMDeployModel, XllamaCppModel),
             )
             else asyncio.locks.Lock()
         )
xinference/core/scheduler.py CHANGED
@@ -97,6 +97,9 @@ class InferenceRequest:
         # check the integrity of args passed upstream
         self._check_args()
 
+        # for reasoning_content using
+        self.previous_texts = [""]
+
     def _check_args(self):
         assert len(self._inference_args) == 1
         # generate config
xinference/core/worker.py CHANGED
@@ -1002,7 +1002,7 @@ class WorkerActor(xo.StatelessActor):
         )
         try:
             subpool_address = self._model_uid_to_addr[model_uid]
-            await self._main_pool.remove_sub_pool(subpool_address)
+            await self._main_pool.remove_sub_pool(subpool_address, force=True)
         except Exception as e:
             logger.debug(
                 "Remove sub pool failed, model uid: %s, error: %s", model_uid, e
xinference/deploy/cmdline.py CHANGED
@@ -821,12 +821,6 @@ def remove_cache(
     type=bool,
     help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
 )
-@click.option(
-    "--reasoning-content",
-    default=False,
-    type=bool,
-    help="Whether or not to enable reasoning content in model responses.",
-)
 @click.option(
     "--api-key",
     "-ak",
@@ -855,7 +849,6 @@ def model_launch(
     worker_ip: Optional[str],
     gpu_idx: Optional[str],
     trust_remote_code: bool,
-    reasoning_content: bool,
     api_key: Optional[str],
     model_path: Optional[str],
 ):
@@ -948,7 +941,6 @@ def model_launch(
         gpu_idx=_gpu_idx,
         trust_remote_code=trust_remote_code,
         model_path=model_path,
-        reasoning_content=reasoning_content,
         **kwargs,
     )
 
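The dedicated --reasoning-content launch flag is dropped here; the llama_cpp hunks below instead read reasoning_content from llamacpp_model_config with a default of False. A plausible sketch of passing it as an ordinary launch kwarg, assuming the standard xinference.client API; whether the client forwards this exact keyword is not shown in this diff, and the model name is illustrative:

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")
    # Instead of `xinference launch ... --reasoning-content true`, the option now
    # travels with the model's own keyword arguments.
    model_uid = client.launch_model(
        model_name="deepseek-r1-distill-qwen",  # illustrative model name
        model_format="ggufv2",
        reasoning_content=True,
    )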
xinference/model/embedding/core.py CHANGED
@@ -268,7 +268,7 @@ class EmbeddingModel:
         **kwargs,
     ):
         sentences = self._fix_langchain_openai_inputs(sentences)
-
+        model_uid = kwargs.pop("model_uid", None)
         from sentence_transformers import SentenceTransformer
 
         kwargs.setdefault("normalize_embeddings", True)
@@ -546,8 +546,14 @@ class EmbeddingModel:
                 # when batching, the attention mask 1 means there is a token
                 # thus we just sum up it to get the total number of tokens
                 if "clip" in self._model_spec.model_name.lower():
-                    all_token_nums += features["input_ids"].numel()
-                    all_token_nums += features["pixel_values"].numel()
+                    if "input_ids" in features and hasattr(
+                        features["input_ids"], "numel"
+                    ):
+                        all_token_nums += features["input_ids"].numel()
+                    if "pixel_values" in features and hasattr(
+                        features["pixel_values"], "numel"
+                    ):
+                        all_token_nums += features["pixel_values"].numel()
                 else:
                     all_token_nums += features["attention_mask"].sum().item()
 
@@ -657,7 +663,7 @@ class EmbeddingModel:
                 self._model,
                 objs,
                 convert_to_numpy=False,
-                **self._kwargs,
+                **kwargs,
             )
         else:
             all_embeddings, all_token_nums = encode(
@@ -693,7 +699,8 @@ class EmbeddingModel:
                 if not is_bge_m3_flag_model and not kwargs.get("return_sparse")
                 else "dict"
            ),
-            model=self._model_uid,
+            model=model_uid,  # type: ignore
+            model_replica=self._model_uid,
             data=embedding_list,
             usage=usage,
         )
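
Together with the restful_api.py change above, the base model uid is now threaded through kwargs so the embedding response can report it separately from the replica uid. A small sketch of the resulting response shape, assuming a model named "bge-m3" is already launched; the replica uid value shown is made up:

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")
    embed_model = client.get_model("bge-m3")      # assumes this model is launched
    resp = embed_model.create_embedding("hello")
    # 1.3.1 reports the user-facing uid in "model" and keeps the internal
    # replica uid in the new "model_replica" field (exact replica format may differ).
    print(resp["model"], resp["model_replica"])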
xinference/model/llm/__init__.py CHANGED
@@ -129,7 +129,7 @@ def register_custom_model():
 
 
 def _install():
-    from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel
+    from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel, XllamaCppModel
     from .lmdeploy.core import LMDeployChatModel, LMDeployModel
     from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
     from .sglang.core import SGLANGChatModel, SGLANGModel
@@ -169,6 +169,7 @@ def _install():
         [
             LlamaCppChatModel,
             LlamaCppModel,
+            XllamaCppModel,
         ]
     )
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
xinference/model/llm/core.py CHANGED
@@ -25,6 +25,8 @@ from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
 from ...core.utils import parse_replica_model_uid
 from ...types import PeftModelConfig
 from ..core import ModelDescription
+from .reasoning_parsers import deepseek_r1_reasoning_parser  # noqa: F401
+from .reasoning_parsers.abs_reasoning_parsers import ReasoningParserManager
 
 if TYPE_CHECKING:
     from .llm_family import LLMFamilyV1, LLMSpecV1
@@ -57,6 +59,7 @@ class LLM(abc.ABC):
         self.model_spec = model_spec
         self.quantization = quantization
         self.model_path = model_path
+        self.reasoning_parser = None
         if args:
             raise ValueError(f"Unrecognized positional arguments: {args}")
         if kwargs:
@@ -117,6 +120,16 @@ class LLM(abc.ABC):
     ) -> bool:
         raise NotImplementedError
 
+    def prepare_parse_reasoning_content(self, reasoning_content):
+        # Initialize reasoning parser if model has reasoning ability
+        if "reasoning" in self.model_family.model_ability and reasoning_content:
+            module_name = self.model_family.model_family or self.model_family.model_name
+            self.reasoning_parser = ReasoningParserManager.get_parser(module_name)
+            self.reasoning_parser = self.reasoning_parser(
+                self.model_family.reasoning_start_tag,
+                self.model_family.reasoning_end_tag,
+            )
+
 
 class LLMDescription(ModelDescription):
     def __init__(
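
prepare_parse_reasoning_content wires a family-specific parser onto the model when the family advertises the "reasoning" ability and reasoning_content is enabled. The registered parsers are not reproduced in this diff, so the snippet below is only an illustration of the kind of split such a parser performs on DeepSeek-R1-style output using the configured start/end tags; it is not the library's implementation:

    # Illustrative only: roughly what a reasoning parser does with R1-style output,
    # using the tags carried by reasoning_start_tag / reasoning_end_tag.
    def split_reasoning(text, start_tag="<think>", end_tag="</think>"):
        if start_tag in text and end_tag in text:
            head, _, rest = text.partition(start_tag)
            reasoning, _, answer = rest.partition(end_tag)
            return reasoning.strip(), (head + answer).strip()
        return None, text

    reasoning, content = split_reasoning("<think>2 + 2 = 4</think>The answer is 4.")
    print(reasoning)  # "2 + 2 = 4"
    print(content)    # "The answer is 4."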
xinference/model/llm/llama_cpp/core.py CHANGED
@@ -11,11 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import concurrent.futures
 import logging
 import os
+import queue
 import time
 from typing import Dict, Iterator, List, Optional, Union
 
+import orjson
+
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
@@ -32,6 +36,248 @@ from ..utils import DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, ChatModelM
 
 logger = logging.getLogger(__name__)
 
+USE_XLLAMACPP = bool(int(os.environ.get("USE_XLLAMACPP", 0)))
+
+
+class _Sentinel:
+    pass
+
+
+class XllamaCppModel(LLM):
+    def __init__(
+        self,
+        model_uid: str,
+        model_family: "LLMFamilyV1",
+        model_spec: "LLMSpecV1",
+        quantization: str,
+        model_path: str,
+        llamacpp_model_config: Optional[LlamaCppModelConfig] = None,
+    ):
+        super().__init__(model_uid, model_family, model_spec, quantization, model_path)
+
+        self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
+            llamacpp_model_config
+        )
+        self._llm = None
+        self._executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
+
+    def _sanitize_model_config(
+        self, llamacpp_model_config: Optional[LlamaCppModelConfig]
+    ) -> LlamaCppModelConfig:
+        if llamacpp_model_config is None:
+            llamacpp_model_config = LlamaCppModelConfig()
+
+        if self.model_family.context_length:
+            llamacpp_model_config.setdefault("n_ctx", self.model_family.context_length)
+        llamacpp_model_config.setdefault("use_mmap", False)
+        llamacpp_model_config.setdefault("use_mlock", True)
+
+        if (
+            "llama-2" in self.model_family.model_name
+            and self.model_spec.model_size_in_billions == 70
+        ):
+            llamacpp_model_config["use_mlock"] = False
+            llamacpp_model_config["n_gqa"] = 8
+
+        if self._is_darwin_and_apple_silicon():
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)
+        elif self._is_linux():
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)
+
+        return llamacpp_model_config
+
+    def _sanitize_generate_config(
+        self, generate_config: Optional[LlamaCppGenerateConfig]
+    ) -> LlamaCppGenerateConfig:
+        if generate_config is None:
+            generate_config = LlamaCppGenerateConfig(
+                **CreateCompletionLlamaCpp().dict()
+            )
+        else:
+            from llama_cpp import LlamaGrammar
+
+            grammar = generate_config.get("grammar")
+            if grammar is not None and not isinstance(grammar, LlamaGrammar):
+                generate_config["grammar"] = LlamaGrammar.from_string(
+                    generate_config["grammar"]
+                )
+            # Validate generate_config and fill default values to the generate config.
+            generate_config = LlamaCppGenerateConfig(
+                **CreateCompletionLlamaCpp(**generate_config).dict()
+            )
+        # Currently, llama.cpp does not support lora
+        generate_config.pop("lora_name", None)  # type: ignore
+        return generate_config
+
+    @classmethod
+    def match(
+        cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
+    ) -> bool:
+        if llm_spec.model_format not in ["ggufv2"]:
+            return False
+        if (
+            "chat" not in llm_family.model_ability
+            and "generate" not in llm_family.model_ability
+        ):
+            return False
+        return True
+
+    def load(self):
+        try:
+            from xllamacpp import CommonParams, Server
+        except ImportError:
+            error_message = "Failed to import module 'xllamacpp'"
+            installation_guide = ["Please make sure 'xllamacpp' is installed. "]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        if os.path.isfile(self.model_path):
+            # mostly passed from --model_path
+            model_path = os.path.realpath(self.model_path)
+        else:
+            # handle legacy cache.
+            model_path = os.path.realpath(
+                os.path.join(
+                    self.model_path,
+                    self.model_spec.model_file_name_template.format(
+                        quantization=self.quantization
+                    ),
+                )
+            )
+            legacy_model_file_path = os.path.join(self.model_path, "model.bin")
+            if os.path.exists(legacy_model_file_path):
+                model_path = legacy_model_file_path
+
+        try:
+            params = CommonParams()
+            params.model = model_path
+            if self.model_family.chat_template:
+                params.chat_template = self.model_family.chat_template
+            # This is the default value, could be overwritten by _llamacpp_model_config
+            params.n_parallel = os.cpu_count()
+            for k, v in self._llamacpp_model_config.items():
+                try:
+                    setattr(params, k, v)
+                except Exception as e:
+                    logger.error("Failed to set the param %s = %s, error: %s", k, v, e)
+            n_threads = self._llamacpp_model_config.get("n_threads", os.cpu_count())
+            params.cpuparams.n_threads = n_threads
+            params.cpuparams_batch.n_threads = n_threads
+            if params.n_gpu_layers == -1:
+                # Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
+                # 0x7FFFFFFF is INT32 max, will be auto set to all layers
+                params.n_gpu_layers = 0x7FFFFFFF
+            self._llm = Server(params)
+            self._executor = concurrent.futures.ThreadPoolExecutor(
+                max_workers=max(10, n_threads)
+            )
+        except AssertionError:
+            raise RuntimeError(f"Load model {self.model_family.model_name} failed")
+
+    def generate(
+        self, prompt: str, generate_config: Optional[LlamaCppGenerateConfig] = None
+    ) -> Union[Completion, Iterator[CompletionChunk]]:
+        generate_config = self._sanitize_generate_config(generate_config)
+        stream = generate_config.get("stream", False)
+        q: queue.Queue = queue.Queue()
+
+        def _handle_completion():
+            # TODO(fyrestone): Replace the LlamaCppGenerateConfig with OpenAI params.
+            data = generate_config
+            data.pop("stopping_criteria", None)
+            data.pop("logits_processor", None)
+            data.pop("suffix", None)
+            data.pop("best_of", None)
+            data.update(
+                {
+                    "prompt": prompt,
+                    "stream": stream,
+                }
+            )
+            prompt_json = orjson.dumps(data)
+
+            def _res_callback(ok):
+                try:
+                    res = orjson.loads(ok)
+                    res["model"] = self.model_uid
+                    q.put(res)
+                except Exception as e:
+                    logger.exception("handle_completions callback failed: %s", e)
+
+            try:
+                self._llm.handle_completions(prompt_json, _res_callback, _res_callback)
+            except Exception as ex:
+                logger.exception("handle_completions failed: %s", ex)
+            q.put(_Sentinel)
+
+        assert self._executor
+        self._executor.submit(_handle_completion)
+
+        if stream:
+
+            def _to_iterator():
+                while (r := q.get()) is not _Sentinel:
+                    yield r
+
+            return _to_iterator()
+        else:
+            return q.get()
+
+    def chat(
+        self,
+        messages: List[Dict],
+        generate_config: Optional[LlamaCppGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        generate_config = self._sanitize_generate_config(generate_config)
+        stream = generate_config.get("stream", False)
+        tools = generate_config.pop("tools", []) if generate_config else None
+        q: queue.Queue = queue.Queue()
+
+        def _handle_chat_completion():
+            # TODO(fyrestone): Replace the LlamaCppGenerateConfig with OpenAI params.
+            data = generate_config
+            data.pop("stopping_criteria", None)
+            data.pop("logits_processor", None)
+            data.pop("suffix", None)
+            data.pop("best_of", None)
+            data.update(
+                {
+                    "messages": messages,
+                    "stream": stream,
+                    "tools": tools,
+                }
+            )
+            prompt_json = orjson.dumps(data)
+
+            def _res_callback(ok):
+                try:
+                    res = orjson.loads(ok)
+                    res["model"] = self.model_uid
+                    q.put(res)
+                except Exception as e:
+                    logger.exception("handle_chat_completions callback failed: %s", e)
+
+            try:
+                self._llm.handle_chat_completions(
+                    prompt_json, _res_callback, _res_callback
+                )
+            except Exception as ex:
+                logger.exception("handle_chat_completions failed: %s", ex)
+            q.put(_Sentinel)
+
+        assert self._executor
+        self._executor.submit(_handle_chat_completion)
+
+        if stream:
+
+            def _to_iterator():
+                while (r := q.get()) is not _Sentinel:
+                    yield r
+
+            return _to_iterator()
+        else:
+            return q.get()
+
 
 class LlamaCppModel(LLM):
     def __init__(
@@ -76,6 +322,7 @@ class LlamaCppModel(LLM):
             llamacpp_model_config.setdefault("n_gpu_layers", -1)
         elif self._is_linux() and self._can_apply_cublas():
             llamacpp_model_config.setdefault("n_gpu_layers", -1)
+        llamacpp_model_config.setdefault("reasoning_content", False)
 
         return llamacpp_model_config
 
@@ -123,6 +370,9 @@ class LlamaCppModel(LLM):
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
+        reasoning_content = self._llamacpp_model_config.pop("reasoning_content")
+        self.prepare_parse_reasoning_content(reasoning_content)
+
         if os.path.isfile(self.model_path):
             # mostly passed from --model_path
             model_path = os.path.realpath(self.model_path)
@@ -292,10 +542,17 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
         if stream:
             it = self.generate(full_prompt, generate_config)
             assert isinstance(it, Iterator)
-            return self._to_chat_completion_chunks(it)
+            return self._to_chat_completion_chunks(it, self.reasoning_parser)
         else:
            c = self.generate(full_prompt, generate_config)
            assert not isinstance(c, Iterator)
            if tools:
-                return self._tool_calls_completion(self.model_family, self.model_uid, c)
-            return self._to_chat_completion(c)
+                return self._post_process_completion(
+                    self.model_family, self.model_uid, c, self.reasoning_parser
+                )
+            return self._to_chat_completion(c, self.reasoning_parser)
+
+
+if USE_XLLAMACPP:
+    LlamaCppModel = XllamaCppModel  # type: ignore  # noqa: F811
+    LlamaCppChatModel = XllamaCppModel  # type: ignore  # noqa: F811
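
XllamaCppModel.generate and .chat bridge xllamacpp's callback-based streaming into ordinary Python iterators by pushing each callback result onto a queue.Queue and terminating with a sentinel. A standalone sketch of that pattern, with a fake producer standing in for Server.handle_completions and its callbacks (names here are illustrative, not part of xinference):

    import concurrent.futures
    import queue

    _SENTINEL = object()

    def stream_results(producer, executor):
        q: queue.Queue = queue.Queue()

        def run():
            try:
                producer(q.put)      # producer calls back once per chunk
            finally:
                q.put(_SENTINEL)     # always unblock the consumer

        executor.submit(run)
        while (item := q.get()) is not _SENTINEL:
            yield item

    def fake_producer(emit):
        for token in ("Hel", "lo", "!"):
            emit({"choices": [{"text": token}]})

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        for chunk in stream_results(fake_producer, pool):
            print(chunk["choices"][0]["text"], end="")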