xinference 1.3.1.post1__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (75)
  1. xinference/_compat.py +1 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +4 -0
  4. xinference/core/chat_interface.py +1 -1
  5. xinference/core/model.py +23 -3
  6. xinference/core/supervisor.py +6 -0
  7. xinference/core/worker.py +54 -11
  8. xinference/model/llm/__init__.py +7 -2
  9. xinference/model/llm/core.py +1 -0
  10. xinference/model/llm/llama_cpp/core.py +50 -15
  11. xinference/model/llm/llm_family.json +388 -13
  12. xinference/model/llm/llm_family_modelscope.json +373 -14
  13. xinference/model/llm/mlx/core.py +15 -11
  14. xinference/model/llm/reasoning_parser.py +17 -9
  15. xinference/model/llm/sglang/core.py +112 -12
  16. xinference/model/llm/transformers/core.py +4 -2
  17. xinference/model/llm/transformers/deepseek_vl.py +1 -1
  18. xinference/model/llm/transformers/deepseek_vl2.py +287 -0
  19. xinference/model/llm/transformers/gemma3.py +185 -0
  20. xinference/model/llm/transformers/intern_vl.py +0 -2
  21. xinference/model/llm/utils.py +62 -42
  22. xinference/model/llm/vllm/core.py +157 -11
  23. xinference/model/llm/vllm/distributed_executor.py +314 -0
  24. xinference/model/rerank/core.py +16 -11
  25. xinference/thirdparty/deepseek_vl2/__init__.py +31 -0
  26. xinference/thirdparty/deepseek_vl2/models/__init__.py +26 -0
  27. xinference/thirdparty/deepseek_vl2/models/configuration_deepseek.py +210 -0
  28. xinference/thirdparty/deepseek_vl2/models/conversation.py +310 -0
  29. xinference/thirdparty/deepseek_vl2/models/modeling_deepseek.py +1975 -0
  30. xinference/thirdparty/deepseek_vl2/models/modeling_deepseek_vl_v2.py +697 -0
  31. xinference/thirdparty/deepseek_vl2/models/processing_deepseek_vl_v2.py +675 -0
  32. xinference/thirdparty/deepseek_vl2/models/siglip_vit.py +661 -0
  33. xinference/thirdparty/deepseek_vl2/serve/__init__.py +0 -0
  34. xinference/thirdparty/deepseek_vl2/serve/app_modules/__init__.py +0 -0
  35. xinference/thirdparty/deepseek_vl2/serve/app_modules/gradio_utils.py +83 -0
  36. xinference/thirdparty/deepseek_vl2/serve/app_modules/overwrites.py +81 -0
  37. xinference/thirdparty/deepseek_vl2/serve/app_modules/presets.py +115 -0
  38. xinference/thirdparty/deepseek_vl2/serve/app_modules/utils.py +333 -0
  39. xinference/thirdparty/deepseek_vl2/serve/assets/Kelpy-Codos.js +100 -0
  40. xinference/thirdparty/deepseek_vl2/serve/assets/avatar.png +0 -0
  41. xinference/thirdparty/deepseek_vl2/serve/assets/custom.css +355 -0
  42. xinference/thirdparty/deepseek_vl2/serve/assets/custom.js +22 -0
  43. xinference/thirdparty/deepseek_vl2/serve/assets/favicon.ico +0 -0
  44. xinference/thirdparty/deepseek_vl2/serve/assets/simsun.ttc +0 -0
  45. xinference/thirdparty/deepseek_vl2/serve/inference.py +197 -0
  46. xinference/thirdparty/deepseek_vl2/utils/__init__.py +18 -0
  47. xinference/thirdparty/deepseek_vl2/utils/io.py +80 -0
  48. xinference/types.py +2 -2
  49. xinference/web/ui/build/asset-manifest.json +6 -6
  50. xinference/web/ui/build/index.html +1 -1
  51. xinference/web/ui/build/static/css/main.b494ae7e.css +2 -0
  52. xinference/web/ui/build/static/css/main.b494ae7e.css.map +1 -0
  53. xinference/web/ui/build/static/js/main.5ca4eea1.js +3 -0
  54. xinference/web/ui/build/static/js/main.5ca4eea1.js.map +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/0f0967acaec5df1d45b80010949c258d64297ebbb0f44b8bb3afcbd45c6f0ec4.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/68249645124f37d01eef83b1d897e751f895bea919b6fb466f907c1f87cebc84.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/cc97b49285d7717c63374766c789141a4329a04582ab32756d7e0e614d4c5c7f.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +1 -0
  59. xinference/web/ui/src/locales/en.json +2 -2
  60. xinference/web/ui/src/locales/zh.json +1 -1
  61. {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/METADATA +4 -4
  62. {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/RECORD +67 -41
  63. xinference/web/ui/build/static/css/main.f8177338.css +0 -2
  64. xinference/web/ui/build/static/css/main.f8177338.css.map +0 -1
  65. xinference/web/ui/build/static/js/main.55b70cb7.js +0 -3
  66. xinference/web/ui/build/static/js/main.55b70cb7.js.map +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +0 -1
  71. /xinference/web/ui/build/static/js/{main.55b70cb7.js.LICENSE.txt → main.5ca4eea1.js.LICENSE.txt} +0 -0
  72. {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/LICENSE +0 -0
  73. {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/WHEEL +0 -0
  74. {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/entry_points.txt +0 -0
  75. {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/top_level.txt +0 -0
xinference/_compat.py CHANGED
@@ -102,6 +102,7 @@ class CreateChatCompletionOpenAI(BaseModel):
  frequency_penalty: Optional[float]
  logit_bias: Optional[Dict[str, int]]
  logprobs: Optional[bool]
+ max_completion_tokens: Optional[int]
  max_tokens: Optional[int]
  n: Optional[int]
  parallel_tool_calls: Optional[bool]
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json

  version_json = '''
  {
- "date": "2025-03-11T12:00:36+0800",
+ "date": "2025-04-03T21:26:30+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "2ef99fbb5450a76a6ba07a909f58b8c2e4c22a28",
- "version": "1.3.1.post1"
+ "full-revisionid": "23260be3b917e7a2e8381927721ed3de815c0a99",
+ "version": "1.4.1"
  }
  ''' # END VERSION_JSON

xinference/api/restful_api.py CHANGED
@@ -1952,6 +1952,7 @@ class RESTfulAPI(CancelMixin):
  "logit_bias",
  "logit_bias_type",
  "user",
+ "max_completion_tokens",
  }

  raw_kwargs = {k: v for k, v in raw_body.items() if k not in exclude}
@@ -1964,6 +1965,9 @@ class RESTfulAPI(CancelMixin):
  if body.max_tokens is None:
  kwargs["max_tokens"] = max_tokens_field.default

+ if body.max_completion_tokens is not None:
+ kwargs["max_tokens"] = body.max_completion_tokens
+
  if body.logit_bias is not None:
  raise HTTPException(status_code=501, detail="Not implemented")

xinference/core/chat_interface.py CHANGED
@@ -137,7 +137,7 @@ class GradioInterface:
  ):
  assert isinstance(chunk, dict)
  delta = chunk["choices"][0]["delta"]
- if "content" not in delta:
+ if "content" not in delta or delta["content"] is None:
  continue
  else:
  # some model like deepseek-r1-distill-qwen
xinference/core/model.py CHANGED
@@ -185,7 +185,7 @@ class ModelActor(xo.StatelessActor, CancelMixin):
  )

  if hasattr(self._model, "stop") and callable(self._model.stop):
- self._model.stop()
+ await asyncio.to_thread(self._model.stop)

  if isinstance(self._model, LLMVLLMModel):
  if self._transfer_ref is not None:
@@ -284,6 +284,8 @@ class ModelActor(xo.StatelessActor, CancelMixin):
  async def __post_create__(self):
  self._loop = asyncio.get_running_loop()

+ logger.debug("Starting ModelActor at %s, uid: %s", self.address, self.uid)
+
  self._handle_pending_requests_task = asyncio.create_task(
  self._handle_pending_requests()
  )
@@ -463,7 +465,9 @@ class ModelActor(xo.StatelessActor, CancelMixin):
  while True:
  i += 1
  try:
- self._model.load()
+ if hasattr(self._model, "set_loop"):
+ self._model.set_loop(asyncio.get_running_loop())
+ await asyncio.to_thread(self._model.load)
  if hasattr(self._model, "driver_info"):
  self._driver_info = self._model.driver_info
  break
@@ -490,7 +494,23 @@ class ModelActor(xo.StatelessActor, CancelMixin):

  async def wait_for_load(self):
  if hasattr(self._model, "wait_for_load"):
- self._model.wait_for_load()
+ await asyncio.to_thread(self._model.wait_for_load)
+
+ def need_create_pools(self):
+ return getattr(self._model, "need_create_pools", False)
+
+ def set_pool_addresses(self, pool_addresses: List[str]):
+ if hasattr(self._model, "set_pool_addresses"):
+ self._model.set_pool_addresses(pool_addresses)
+
+ def get_pool_addresses(self) -> Optional[List[str]]:
+ if hasattr(self._model, "get_pool_addresses"):
+ return self._model.get_pool_addresses()
+ return None
+
+ def set_worker_addresses(self, shard: int, worker_addresses: List[str]):
+ if hasattr(self._model, "set_worker_addresses"):
+ self._model.set_worker_addresses(shard, worker_addresses)

  def model_uid(self):
  return (
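ModelActor now pushes blocking calls such as load(), stop(), and wait_for_load() onto a worker thread with asyncio.to_thread, so the actor's event loop keeps serving requests while the model initializes or shuts down. A minimal sketch of the pattern; SlowModel is a made-up stand-in for the real model object:

    import asyncio
    import time


    class SlowModel:
        def load(self):
            # Stand-in for a blocking model load (weights from disk, CUDA init, ...).
            time.sleep(2)
            return "loaded"


    async def main():
        model = SlowModel()
        # The blocking load runs in a thread; the event loop stays responsive.
        result = await asyncio.to_thread(model.load)
        print(result)


    asyncio.run(main())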
xinference/core/supervisor.py CHANGED
@@ -1097,6 +1097,7 @@ class SupervisorActor(xo.StatelessActor):
  xavier_config=xavier_config,
  **kwargs,
  )
+ await worker_ref.wait_for_load(_replica_model_uid)
  self._replica_model_uid_to_worker[_replica_model_uid] = worker_ref
  return subpool_address

@@ -1242,6 +1243,11 @@ class SupervisorActor(xo.StatelessActor):
  available_workers.append(worker_ip)

  async def _launch_model():
+ # Validation of n_worker, intercept if it is greater than the available workers.
+ if n_worker > len(available_workers):
+ raise ValueError(
+ "n_worker cannot be larger than the number of available workers."
+ )
  try:
  for _idx, rep_model_uid in enumerate(
  iter_replica_model_uid(model_uid, replica)
xinference/core/worker.py CHANGED
@@ -874,7 +874,7 @@ class WorkerActor(xo.StatelessActor):
  subpool_address, devices = await self._create_subpool(
  model_uid, model_type, n_gpu=n_gpu, gpu_idx=gpu_idx
  )
-
+ all_subpool_addresses = [subpool_address]
  try:
  xavier_config: Optional[Dict] = kwargs.pop("xavier_config", None)
  if xavier_config is not None:
@@ -885,7 +885,7 @@ class WorkerActor(xo.StatelessActor):
  # add a few kwargs
  model_kwargs.update(
  dict(
- address=self.address,
+ address=subpool_address,
  n_worker=n_worker,
  shard=shard,
  driver_info=driver_info,
@@ -923,11 +923,28 @@ class WorkerActor(xo.StatelessActor):
  shard=shard,
  driver_info=driver_info,
  )
+ if await model_ref.need_create_pools() and (
+ len(devices) > 1 or n_worker > 1  # type: ignore
+ ):
+ coros = []
+ env_name = get_available_device_env_name() or "CUDA_VISIBLE_DEVICES"
+ env_value = ",".join(devices)
+ for device in devices:
+ coros.append(
+ self._main_pool.append_sub_pool(
+ env={env_name: env_value},
+ start_method=self._get_start_method(),
+ )
+ )
+ pool_addresses = await asyncio.gather(*coros)
+ all_subpool_addresses.extend(pool_addresses)
+ await model_ref.set_pool_addresses(pool_addresses)
  await model_ref.load()
  except:
  logger.error(f"Failed to load model {model_uid}", exc_info=True)
  self.release_devices(model_uid=model_uid)
- await self._main_pool.remove_sub_pool(subpool_address)
+ for addr in all_subpool_addresses:
+ await self._main_pool.remove_sub_pool(addr)
  raise
  self._model_uid_to_model[model_uid] = model_ref
  self._model_uid_to_model_spec[model_uid] = model_description
@@ -994,15 +1011,36 @@ class WorkerActor(xo.StatelessActor):
  if model_ref is None:
  logger.debug("Model not found, uid: %s", model_uid)

+ pool_addresses = None
+ if model_ref is not None:
+ try:
+ # pool addresses if model.need_create_pools()
+ pool_addresses = await model_ref.get_pool_addresses()
+ except Exception as e:
+ # process may disappear, we just ignore it.
+ logger.debug("Fail to get pool addresses, error: %s", e)
+
  try:
- await xo.destroy_actor(model_ref)
+ logger.debug("Start to destroy model actor: %s", model_ref)
+ coro = xo.destroy_actor(model_ref)
+ await asyncio.wait_for(coro, timeout=5)
  except Exception as e:
  logger.debug(
  "Destroy model actor failed, model uid: %s, error: %s", model_uid, e
  )
  try:
+ to_remove_addresses = []
  subpool_address = self._model_uid_to_addr[model_uid]
- await self._main_pool.remove_sub_pool(subpool_address, force=True)
+ to_remove_addresses.append(subpool_address)
+ if pool_addresses:
+ to_remove_addresses.extend(pool_addresses)
+ logger.debug("Remove sub pools: %s", to_remove_addresses)
+ coros = []
+ for to_remove_addr in to_remove_addresses:
+ coros.append(
+ self._main_pool.remove_sub_pool(to_remove_addr, force=True)
+ )
+ await asyncio.gather(*coros)
  except Exception as e:
  logger.debug(
  "Remove sub pool failed, model uid: %s, error: %s", model_uid, e
@@ -1204,18 +1242,23 @@ class WorkerActor(xo.StatelessActor):
  model_ref = self._model_uid_to_model[rep_model_uid]
  await model_ref.start_transfer_for_vllm(rank_addresses)

- @log_async(logger=logger, level=logging.INFO)
- async def launch_rank0_model(
- self, rep_model_uid: str, xavier_config: Dict[str, Any]
- ) -> Tuple[str, int]:
- from ..model.llm.vllm.xavier.collective_manager import Rank0ModelActor
-
+ @staticmethod
+ def _get_start_method():
  if os.name != "nt" and platform.system() != "Darwin":
  # Linux
  start_method = "forkserver"
  else:
  # Windows and macOS
  start_method = "spawn"
+ return start_method
+
+ @log_async(logger=logger, level=logging.INFO)
+ async def launch_rank0_model(
+ self, rep_model_uid: str, xavier_config: Dict[str, Any]
+ ) -> Tuple[str, int]:
+ from ..model.llm.vllm.xavier.collective_manager import Rank0ModelActor
+
+ start_method = self._get_start_method()
  subpool_address = await self._main_pool.append_sub_pool(
  start_method=start_method
  )
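On unload or a failed launch, the worker now bounds actor destruction with a timeout and removes the main sub pool plus any per-device pools concurrently. A small sketch of that teardown shape; destroy_actor and remove_pool are illustrative stand-ins for xo.destroy_actor and remove_sub_pool, and the addresses are fabricated:

    import asyncio


    async def destroy_actor() -> None:
        # Stand-in for xo.destroy_actor(model_ref); may hang if the process died.
        await asyncio.sleep(0.1)


    async def remove_pool(addr: str) -> None:
        # Stand-in for self._main_pool.remove_sub_pool(addr, force=True).
        await asyncio.sleep(0.1)
        print(f"removed {addr}")


    async def terminate(addresses: list[str]) -> None:
        try:
            # Bound actor destruction so a dead process cannot block unload forever.
            await asyncio.wait_for(destroy_actor(), timeout=5)
        except Exception as e:
            print(f"destroy failed, continuing: {e}")
        # Remove the main sub pool and any per-device pools concurrently.
        await asyncio.gather(*(remove_pool(a) for a in addresses))


    asyncio.run(terminate(["127.0.0.1:30001", "127.0.0.1:30002"]))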
xinference/model/llm/__init__.py CHANGED
@@ -132,7 +132,7 @@ def _install():
  from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel, XllamaCppModel
  from .lmdeploy.core import LMDeployChatModel, LMDeployModel
  from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
- from .sglang.core import SGLANGChatModel, SGLANGModel
+ from .sglang.core import SGLANGChatModel, SGLANGModel, SGLANGVisionModel
  from .transformers.chatglm import ChatglmPytorchChatModel
  from .transformers.cogagent import CogAgentChatModel
  from .transformers.cogvlm2 import CogVLM2Model
@@ -143,6 +143,8 @@ def _install():
  DeepSeekV2PytorchModel,
  )
  from .transformers.deepseek_vl import DeepSeekVLChatModel
+ from .transformers.deepseek_vl2 import DeepSeekVL2ChatModel
+ from .transformers.gemma3 import Gemma3ChatModel, Gemma3TextChatModel
  from .transformers.glm4v import Glm4VModel
  from .transformers.glm_edge_v import GlmEdgeVModel
  from .transformers.intern_vl import InternVLChatModel
@@ -172,7 +174,7 @@ def _install():
  XllamaCppModel,
  ]
  )
- SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
+ SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel, SGLANGVisionModel])
  VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
  MLX_CLASSES.extend([MLXModel, MLXChatModel, MLXVisionModel])
  LMDEPLOY_CLASSES.extend([LMDeployModel, LMDeployChatModel])
@@ -186,6 +188,7 @@ def _install():
  Qwen2AudioChatModel,
  YiVLChatModel,
  DeepSeekVLChatModel,
+ DeepSeekVL2ChatModel,
  InternVLChatModel,
  PytorchModel,
  CogVLM2Model,
@@ -198,6 +201,8 @@ def _install():
  OptPytorchModel,
  GlmEdgeVModel,
  CogAgentChatModel,
+ Gemma3TextChatModel,
+ Gemma3ChatModel,
  ]
  )
  if OmniLMMModel:  # type: ignore
xinference/model/llm/core.py CHANGED
@@ -54,6 +54,7 @@ class LLM(abc.ABC):
  **kwargs,
  ):
  self.model_uid, self.rep_id = parse_replica_model_uid(replica_model_uid)
+ self.raw_model_uid = replica_model_uid
  self.model_family = model_family
  self.model_spec = model_spec
  self.quantization = quantization
xinference/model/llm/llama_cpp/core.py CHANGED
@@ -39,10 +39,15 @@ logger = logging.getLogger(__name__)
  USE_XLLAMACPP = bool(int(os.environ.get("USE_XLLAMACPP", 0)))


- class _Sentinel:
+ class _Done:
  pass


+ class _Error:
+ def __init__(self, msg):
+ self.msg = msg
+
+
  class XllamaCppModel(LLM, ChatModelMixin):
  def __init__(
  self,
@@ -200,7 +205,14 @@ class XllamaCppModel(LLM, ChatModelMixin):
  )
  prompt_json = orjson.dumps(data)

- def _res_callback(ok):
+ def _error_callback(err):
+ try:
+ msg = orjson.loads(err)
+ q.put(_Error(msg))
+ except Exception as e:
+ q.put(_Error(str(e)))
+
+ def _ok_callback(ok):
  try:
  res = orjson.loads(ok)
  res["model"] = self.model_uid
@@ -209,10 +221,10 @@ class XllamaCppModel(LLM, ChatModelMixin):
  logger.exception("handle_completions callback failed: %s", e)

  try:
- self._llm.handle_completions(prompt_json, _res_callback, _res_callback)
+ self._llm.handle_completions(prompt_json, _error_callback, _ok_callback)
  except Exception as ex:
  logger.exception("handle_completions failed: %s", ex)
- q.put(_Sentinel)
+ q.put(_Done)

  assert self._executor
  self._executor.submit(_handle_completion)
@@ -220,12 +232,17 @@ class XllamaCppModel(LLM, ChatModelMixin):
  if stream:

  def _to_iterator():
- while (r := q.get()) is not _Sentinel:
+ while (r := q.get()) is not _Done:
+ if type(r) is _Error:
+ raise Exception("Got error in generate stream: %s", r.msg)
  yield r

  return _to_iterator()
  else:
- return q.get()
+ r = q.get()
+ if type(r) is _Error:
+ raise Exception("Got error in generate: %s", r.msg)
+ return r

  def chat(
  self,
@@ -253,7 +270,14 @@ class XllamaCppModel(LLM, ChatModelMixin):
  )
  prompt_json = orjson.dumps(data)

- def _res_callback(ok):
+ def _error_callback(err):
+ try:
+ msg = orjson.loads(err)
+ q.put(_Error(msg))
+ except Exception as e:
+ q.put(_Error(str(e)))
+
+ def _ok_callback(ok):
  try:
  res = orjson.loads(ok)
  res["model"] = self.model_uid
@@ -263,11 +287,11 @@ class XllamaCppModel(LLM, ChatModelMixin):

  try:
  self._llm.handle_chat_completions(
- prompt_json, _res_callback, _res_callback
+ prompt_json, _error_callback, _ok_callback
  )
  except Exception as ex:
  logger.exception("handle_chat_completions failed: %s", ex)
- q.put(_Sentinel)
+ q.put(_Done)

  assert self._executor
  self._executor.submit(_handle_chat_completion)
@@ -275,14 +299,24 @@ class XllamaCppModel(LLM, ChatModelMixin):
  if stream:

  def _to_iterator():
- while (r := q.get()) is not _Sentinel:
- yield r
+ while (r := q.get()) is not _Done:
+ if type(r) is _Error:
+ raise Exception("Got error in chat stream: %s", r.msg)
+ # Get valid keys (O(1) lookup)
+ chunk_keys = ChatCompletionChunk.__annotations__
+ # The chunk may contain additional keys (e.g., system_fingerprint),
+ # which might not conform to OpenAI/DeepSeek formats.
+ # Filter out keys that are not part of ChatCompletionChunk.
+ yield {key: r[key] for key in chunk_keys if key in r}

  return self._to_chat_completion_chunks(
  _to_iterator(), self.reasoning_parser
  )
  else:
- return self._to_chat_completion(q.get(), self.reasoning_parser)
+ r = q.get()
+ if type(r) is _Error:
+ raise Exception("Got error in chat: %s", r.msg)
+ return self._to_chat_completion(r, self.reasoning_parser)


  class LlamaCppModel(LLM):
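The hunks above bridge the callback-style handle_completions/handle_chat_completions API into Python generators through a queue guarded by _Done and _Error sentinels. A self-contained sketch of the same queue-and-sentinel pattern; fake_backend and the callback names are illustrative, not taken from this diff:

    import queue
    import threading


    class _Done:
        pass


    class _Error:
        def __init__(self, msg):
            self.msg = msg


    def fake_backend(on_ok, on_done):
        # Stand-in for a callback-driven backend such as handle_completions.
        for i in range(3):
            on_ok({"choices": [{"delta": {"content": f"tok{i}"}}]})
        on_done()


    def generate_stream():
        q = queue.Queue()

        def _ok_callback(chunk):
            q.put(chunk)

        def _done_callback():
            q.put(_Done)

        # The backend runs in a worker thread and feeds results into the queue.
        threading.Thread(
            target=fake_backend, args=(_ok_callback, _done_callback), daemon=True
        ).start()

        # Drain until the sentinel; error objects surface as exceptions.
        while (r := q.get()) is not _Done:
            if isinstance(r, _Error):
                raise RuntimeError(f"Got error in stream: {r.msg}")
            yield r


    for chunk in generate_stream():
        print(chunk["choices"][0]["delta"]["content"])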
@@ -533,10 +567,11 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
  tools = generate_config.pop("tools", []) if generate_config else None
  full_context_kwargs = {}
  if tools:
- if model_family in QWEN_TOOL_CALL_FAMILY:
+ if (
+ model_family in QWEN_TOOL_CALL_FAMILY
+ or model_family in DEEPSEEK_TOOL_CALL_FAMILY
+ ):
  full_context_kwargs["tools"] = tools
- elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
- self._tools_to_messages_for_deepseek(messages, tools)
  assert self.model_family.chat_template is not None
  full_prompt = self.get_full_context(
  messages, self.model_family.chat_template, **full_context_kwargs
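A few hunks earlier, the streaming chat path keeps only the keys declared on the ChatCompletionChunk TypedDict before converting each chunk, dropping extras such as system_fingerprint. A minimal sketch of that filtering idea; ChunkLike is a stand-in for the real TypedDict defined in xinference.types:

    from typing import TypedDict


    class ChunkLike(TypedDict, total=False):
        # Stand-in for ChatCompletionChunk; only declared keys survive filtering.
        id: str
        model: str
        choices: list


    raw = {
        "id": "chunk-1",
        "model": "my-llm",
        "choices": [],
        "system_fingerprint": "b1234",  # extra key a backend may emit
    }

    chunk_keys = ChunkLike.__annotations__  # declared field names
    filtered = {key: raw[key] for key in chunk_keys if key in raw}
    print(filtered)  # system_fingerprint is dropped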