xinference 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (80)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +4 -7
  3. xinference/client/handlers.py +3 -0
  4. xinference/client/restful/restful_client.py +9 -1
  5. xinference/core/model.py +19 -0
  6. xinference/core/resource.py +7 -1
  7. xinference/core/scheduler.py +4 -7
  8. xinference/core/status_guard.py +1 -0
  9. xinference/core/supervisor.py +228 -19
  10. xinference/core/utils.py +1 -29
  11. xinference/core/worker.py +28 -2
  12. xinference/deploy/cmdline.py +33 -3
  13. xinference/deploy/local.py +2 -1
  14. xinference/deploy/test/test_cmdline.py +32 -0
  15. xinference/device_utils.py +43 -1
  16. xinference/model/audio/core.py +5 -0
  17. xinference/model/audio/kokoro.py +122 -0
  18. xinference/model/audio/model_spec.json +8 -0
  19. xinference/model/audio/model_spec_modelscope.json +9 -0
  20. xinference/model/image/stable_diffusion/core.py +15 -6
  21. xinference/model/llm/llama_cpp/core.py +21 -14
  22. xinference/model/llm/llm_family.json +866 -46
  23. xinference/model/llm/llm_family.py +7 -2
  24. xinference/model/llm/llm_family_modelscope.json +873 -16
  25. xinference/model/llm/mlx/core.py +11 -3
  26. xinference/model/llm/reasoning_parsers/__init__.py +13 -0
  27. xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +98 -0
  28. xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py +140 -0
  29. xinference/model/llm/sglang/core.py +99 -11
  30. xinference/model/llm/transformers/core.py +9 -1
  31. xinference/model/llm/transformers/intern_vl.py +23 -14
  32. xinference/model/llm/transformers/qwen2_audio.py +3 -1
  33. xinference/model/llm/transformers/qwen2_vl.py +20 -3
  34. xinference/model/llm/transformers/utils.py +22 -11
  35. xinference/model/llm/utils.py +164 -20
  36. xinference/model/llm/vllm/core.py +36 -4
  37. xinference/model/llm/vllm/xavier/executor.py +2 -2
  38. xinference/model/llm/vllm/xavier/scheduler.py +3 -3
  39. xinference/thirdparty/internvl/conversation.py +26 -17
  40. xinference/types.py +2 -0
  41. xinference/web/ui/build/asset-manifest.json +6 -6
  42. xinference/web/ui/build/index.html +1 -1
  43. xinference/web/ui/build/static/css/main.f8177338.css +2 -0
  44. xinference/web/ui/build/static/css/main.f8177338.css.map +1 -0
  45. xinference/web/ui/build/static/js/main.ad42919c.js +3 -0
  46. xinference/web/ui/build/static/js/main.ad42919c.js.map +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/0acb065326560592b10888234242f94f67efe28458b90f273d4d4fba9daa0cd2.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/6cb9f6c62ab4042f0b11c5d75e51187188e9d6f5f08b1d63e796e051bafdb457.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +1 -0
  59. xinference/web/ui/src/locales/en.json +14 -1
  60. xinference/web/ui/src/locales/zh.json +14 -1
  61. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/METADATA +18 -17
  62. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/RECORD +67 -60
  63. xinference/web/ui/build/static/css/main.51a587ff.css +0 -2
  64. xinference/web/ui/build/static/css/main.51a587ff.css.map +0 -1
  65. xinference/web/ui/build/static/js/main.b0936c54.js +0 -3
  66. xinference/web/ui/build/static/js/main.b0936c54.js.map +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/0c2fb5375667931c4a331c99e0d87dc145e8f327cea3f44d6e56f54c7c1d4020.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +0 -1
  72. xinference/web/ui/node_modules/.cache/babel-loader/a7f1a71f6580dfe810c685a9c1d68e318f71e1fa258fbe50b87a6ac37cc0a598.json +0 -1
  73. xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +0 -1
  74. xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +0 -1
  75. xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +0 -1
  76. /xinference/web/ui/build/static/js/{main.b0936c54.js.LICENSE.txt → main.ad42919c.js.LICENSE.txt} +0 -0
  77. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/LICENSE +0 -0
  78. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/WHEEL +0 -0
  79. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/entry_points.txt +0 -0
  80. {xinference-1.2.1.dist-info → xinference-1.3.0.dist-info}/top_level.txt +0 -0
File: xinference/deploy/cmdline.py

@@ -770,11 +770,17 @@ def remove_cache(
     type=int,
     help="The replica count of the model, default is 1.",
 )
+@click.option(
+    "--n-worker",
+    default=1,
+    type=int,
+    help="The number of workers used by the model, default is 1.",
+)
 @click.option(
     "--n-gpu",
     default="auto",
     type=str,
-    help='The number of GPUs used by the model, default is "auto".',
+    help='The number of GPUs used by the model, if n_worker>1, means number of GPUs per worker, default is "auto".',
 )
 @click.option(
     "--lora-modules",
@@ -815,6 +821,12 @@ def remove_cache(
     type=bool,
     help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
 )
+@click.option(
+    "--reasoning-content",
+    default=False,
+    type=bool,
+    help="Whether or not to enable reasoning content in model responses.",
+)
 @click.option(
     "--api-key",
     "-ak",
@@ -822,6 +834,7 @@ def remove_cache(
     type=str,
     help="Api-Key for access xinference api with authorization.",
 )
+@click.option("--model-path", "-mp", default=None, type=str, help="Model path to run.")
 @click.pass_context
 def model_launch(
     ctx,
@@ -834,6 +847,7 @@ def model_launch(
     model_format: str,
     quantization: str,
     replica: int,
+    n_worker: int,
     n_gpu: str,
     lora_modules: Optional[Tuple],
     image_lora_load_kwargs: Optional[Tuple],
@@ -841,15 +855,28 @@ def model_launch(
     worker_ip: Optional[str],
     gpu_idx: Optional[str],
     trust_remote_code: bool,
+    reasoning_content: bool,
     api_key: Optional[str],
+    model_path: Optional[str],
 ):
     kwargs = {}
     for i in range(0, len(ctx.args), 2):
         if not ctx.args[i].startswith("--"):
             raise ValueError(
-                f"You must specify extra kwargs with `--` prefix. There is an error in parameter passing that is {ctx.args[i]}."
+                f"You must specify extra kwargs with `--` prefix. "
+                f"There is an error in parameter passing that is {ctx.args[i]}."
             )
-        kwargs[ctx.args[i][2:]] = handle_click_args_type(ctx.args[i + 1])
+        param_name = ctx.args[i][2:]
+        param_value = handle_click_args_type(ctx.args[i + 1])
+        if param_name == "model_path":
+            # fix for --model_path which is the old fashion to set model_path,
+            # now model_path is a builtin option, try to make it compatible
+            if model_path is None:
+                model_path = param_value
+                continue
+            else:
+                raise ValueError("Cannot set both for --model-path and --model_path")
+        kwargs[param_name] = param_value
     print(f"Launch model name: {model_name} with kwargs: {kwargs}", file=sys.stderr)

     if model_type == "LLM" and model_engine is None:
@@ -914,11 +941,14 @@ def model_launch(
         model_format=model_format,
         quantization=quantization,
         replica=replica,
+        n_worker=n_worker,
         n_gpu=_n_gpu,
         peft_model_config=peft_model_config,
         worker_ip=worker_ip,
         gpu_idx=_gpu_idx,
         trust_remote_code=trust_remote_code,
+        model_path=model_path,
+        reasoning_content=reasoning_content,
         **kwargs,
     )

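The new --n-worker, --model-path and --reasoning-content options added above are forwarded to client.launch_model(). The snippet below is a minimal usage sketch of the same call from Python; it assumes the RESTful client accepts these keyword arguments exactly as the CLI passes them, and the model name, engine and format are illustrative only, not taken from this diff.

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")

# Model name/engine/format below are illustrative assumptions; the arguments
# marked "new in 1.3.0" mirror the kwargs passed in model_launch() above.
model_uid = client.launch_model(
    model_name="qwen2.5-instruct",
    model_engine="transformers",
    model_type="LLM",
    model_format="pytorch",
    quantization="none",
    replica=1,
    n_worker=1,                # new in 1.3.0: number of workers for the model
    n_gpu="auto",              # when n_worker > 1, this is GPUs per worker
    model_path=None,           # new in 1.3.0: launch from a local path instead of the cache
    reasoning_content=False,   # new in 1.3.0: include reasoning content in responses
)
print(model_uid)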
File: xinference/deploy/local.py

@@ -41,7 +41,8 @@ async def _start_local_cluster(
 ):
     from .utils import create_worker_actor_pool

-    logging.config.dictConfig(logging_conf)  # type: ignore
+    if logging_conf:
+        logging.config.dictConfig(logging_conf)  # type: ignore

     pool = None
     try:
File: xinference/deploy/test/test_cmdline.py

@@ -147,6 +147,38 @@ def test_cmdline(setup, stream, model_uid):
     assert model_uid not in result.stdout


+def test_cmdline_model_path_error(setup):
+    endpoint, _ = setup
+    runner = CliRunner(mix_stderr=False)
+
+    # launch model
+    result = runner.invoke(
+        model_launch,
+        [
+            "--endpoint",
+            endpoint,
+            "--model-name",
+            "tiny-llama",
+            "--size-in-billions",
+            1,
+            "--model-format",
+            "ggufv2",
+            "--quantization",
+            "Q2_K",
+            "--model-path",
+            "/path/to/model",
+            "--model_path",
+            "/path/to/model",
+        ],
+    )
+    assert result.exit_code > 0
+    with pytest.raises(
+        ValueError, match="Cannot set both for --model-path and --model_path"
+    ):
+        t, e, tb = result.exc_info
+        raise e.with_traceback(tb)
+
+
 def test_cmdline_of_custom_model(setup):
     endpoint, _ = setup
     runner = CliRunner()
File: xinference/device_utils.py

@@ -13,9 +13,9 @@
 # limitations under the License.

 import os
+from typing import Dict, Literal, Union

 import torch
-from typing_extensions import Literal, Union

 DeviceType = Literal["cuda", "mps", "xpu", "npu", "cpu"]
 DEVICE_TO_ENV_NAME = {
@@ -122,3 +122,45 @@ def gpu_count():
         return torch.npu.device_count()
     else:
         return 0
+
+
+def _get_nvidia_gpu_mem_info(gpu_id: int) -> Dict[str, float]:
+    from pynvml import (
+        nvmlDeviceGetHandleByIndex,
+        nvmlDeviceGetMemoryInfo,
+        nvmlDeviceGetName,
+        nvmlDeviceGetUtilizationRates,
+    )
+
+    handler = nvmlDeviceGetHandleByIndex(gpu_id)
+    gpu_name = nvmlDeviceGetName(handler)
+    mem_info = nvmlDeviceGetMemoryInfo(handler)
+    utilization = nvmlDeviceGetUtilizationRates(handler)
+    return {
+        "name": gpu_name,
+        "total": mem_info.total,
+        "used": mem_info.used,
+        "free": mem_info.free,
+        "util": utilization.gpu,
+    }
+
+
+def get_nvidia_gpu_info() -> Dict:
+    from pynvml import nvmlDeviceGetCount, nvmlInit, nvmlShutdown
+
+    try:
+        nvmlInit()
+        device_count = nvmlDeviceGetCount()
+        res = {}
+        for i in range(device_count):
+            res[f"gpu-{i}"] = _get_nvidia_gpu_mem_info(i)
+        return res
+    except:
+        # TODO: add log here
+        # logger.debug(f"Cannot init nvml. Maybe due to lack of NVIDIA GPUs or incorrect installation of CUDA.")
+        return {}
+    finally:
+        try:
+            nvmlShutdown()
+        except:
+            pass
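The two helpers added above gather per-GPU name, memory and utilization through pynvml and degrade to an empty dict when NVML cannot be initialized. A minimal usage sketch, assuming pynvml (nvidia-ml-py) is installed and at least one NVIDIA GPU is visible:

from xinference.device_utils import get_nvidia_gpu_info

info = get_nvidia_gpu_info()  # {} when NVML is unavailable (no NVIDIA GPU, broken CUDA setup)
for gpu, stats in info.items():
    # "total", "used" and "free" are byte counts from nvmlDeviceGetMemoryInfo;
    # "util" is the GPU utilization percentage from nvmlDeviceGetUtilizationRates.
    print(gpu, stats["name"], f'{stats["used"]}/{stats["total"]} bytes used', f'{stats["util"]}% util')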
File: xinference/model/audio/core.py

@@ -25,6 +25,7 @@ from .f5tts import F5TTSModel
 from .f5tts_mlx import F5TTSMLXModel
 from .fish_speech import FishSpeechModel
 from .funasr import FunASRModel
+from .kokoro import KokoroModel
 from .melotts import MeloTTSModel
 from .whisper import WhisperModel
 from .whisper_mlx import WhisperMLXModel
@@ -176,6 +177,7 @@ def create_audio_model_instance(
         F5TTSModel,
         F5TTSMLXModel,
         MeloTTSModel,
+        KokoroModel,
     ],
     AudioModelDescription,
 ]:
@@ -192,6 +194,7 @@ def create_audio_model_instance(
         F5TTSModel,
         F5TTSMLXModel,
         MeloTTSModel,
+        KokoroModel,
     ]
     if model_spec.model_family == "whisper":
         if not model_spec.engine:
@@ -212,6 +215,8 @@ def create_audio_model_instance(
         model = F5TTSMLXModel(model_uid, model_path, model_spec, **kwargs)
     elif model_spec.model_family == "MeloTTS":
         model = MeloTTSModel(model_uid, model_path, model_spec, **kwargs)
+    elif model_spec.model_family == "Kokoro":
+        model = KokoroModel(model_uid, model_path, model_spec, **kwargs)
     else:
         raise Exception(f"Unsupported audio model family: {model_spec.model_family}")
     model_description = AudioModelDescription(
File: xinference/model/audio/kokoro.py (new file)

@@ -0,0 +1,122 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from io import BytesIO
+from typing import TYPE_CHECKING, Optional
+
+import numpy as np
+
+from ...device_utils import get_available_device, is_device_available
+
+if TYPE_CHECKING:
+    from .core import AudioModelFamilyV1
+
+logger = logging.getLogger(__name__)
+
+
+class KokoroModel:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: str,
+        model_spec: "AudioModelFamilyV1",
+        device: Optional[str] = None,
+        **kwargs,
+    ):
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._model_spec = model_spec
+        self._device = device
+        self._model = None
+        self._kwargs = kwargs
+
+    @property
+    def model_ability(self):
+        return self._model_spec.model_ability
+
+    def load(self):
+        if self._device is None:
+            self._device = get_available_device()
+        else:
+            if not is_device_available(self._device):
+                raise ValueError(f"Device {self._device} is not available!")
+
+        import os
+
+        from kokoro import KModel, KPipeline
+
+        config_path = os.path.join(self._model_path, "config.json")
+        model_path = os.path.join(self._model_path, "kokoro-v1_0.pth")
+        # LANG_CODES = dict(
+        #     # pip install misaki[en]
+        #     a='American English',
+        #     b='British English',
+        #
+        #     # espeak-ng
+        #     e='es',
+        #     f='fr-fr',
+        #     h='hi',
+        #     i='it',
+        #     p='pt-br',
+        #
+        #     # pip install misaki[ja]
+        #     j='Japanese',
+        #
+        #     # pip install misaki[zh]
+        #     z='Mandarin Chinese',
+        # )
+        lang_code = self._kwargs.get("lang_code", "a")
+        logger.info("Launching Kokoro model with language code: %s", lang_code)
+        self._model = KPipeline(
+            lang_code=lang_code,
+            model=KModel(config=config_path, model=model_path),
+            device=self._device,
+        )
+
+    def speech(
+        self,
+        input: str,
+        voice: str,
+        response_format: str = "mp3",
+        speed: float = 1.0,
+        stream: bool = False,
+        **kwargs,
+    ):
+        import soundfile
+
+        if stream:
+            raise Exception("Kokoro does not support stream mode.")
+        assert self._model is not None
+        if not voice:
+            voice = "af_alloy"
+            logger.info("Auto select speaker: %s", voice)
+        elif voice.endswith(".pt"):
+            logger.info("Using custom voice pt: %s", voice)
+        else:
+            logger.info("Using voice: %s", voice)
+        logger.info("Speech kwargs: %s", kwargs)
+        generator = self._model(text=input, voice=voice, speed=speed, **kwargs)
+        results = list(generator)
+        audio = np.concatenate([r[2] for r in results])
+        # Save the generated audio
+        with BytesIO() as out:
+            with soundfile.SoundFile(
+                out,
+                "w",
+                24000,
+                1,
+                format=response_format.upper(),
+            ) as f:
+                f.write(audio)
+            return out.getvalue()
File: xinference/model/audio/model_spec.json

@@ -338,5 +338,13 @@
         "model_ability": "text-to-audio",
         "multilingual": false,
         "language": "KR"
+    },
+    {
+        "model_name": "Kokoro-82M",
+        "model_family": "Kokoro",
+        "model_id": "hexgrad/Kokoro-82M",
+        "model_revision": "7884269d6fd3f9beabc271b6f1308e5699281fa9",
+        "model_ability": "text-to-audio",
+        "multilingual": true
     }
 ]
File: xinference/model/audio/model_spec_modelscope.json

@@ -100,5 +100,14 @@
         "model_revision": "master",
         "model_ability": "text-to-audio",
         "multilingual": true
+    },
+    {
+        "model_name": "Kokoro-82M",
+        "model_family": "Kokoro",
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/Kokoro-82M",
+        "model_revision": "master",
+        "model_ability": "text-to-audio",
+        "multilingual": true
     }
 ]
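With the KokoroModel class, the dispatch added to audio/core.py and the two spec entries above, Kokoro-82M can be launched like any other audio model. The sketch below is hypothetical client usage: it assumes the audio handle exposes speech() with the same parameters as KokoroModel.speech() in this diff, and that the kokoro package and its misaki language extras are installed on the worker.

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")

# lang_code is forwarded to KPipeline via **kwargs; "a" = American English
model_uid = client.launch_model(
    model_name="Kokoro-82M",
    model_type="audio",
    lang_code="a",
)
model = client.get_model(model_uid)

audio_bytes = model.speech(
    input="Hello from Kokoro!",
    voice="af_alloy",        # an empty voice would auto-select this default
    response_format="wav",   # written by soundfile as 24 kHz mono
    speed=1.0,
)
with open("kokoro.wav", "wb") as f:
    f.write(audio_bytes)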
File: xinference/model/image/stable_diffusion/core.py

@@ -22,7 +22,6 @@ import logging
 import os
 import re
 import sys
-import warnings
 from glob import glob
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union

@@ -412,12 +411,22 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         else:
             raise ValueError(f"Unknown sampler: {sampler_name}")

-    @staticmethod
+    def _need_set_scheduler(self, scheduler: Any) -> bool:
+        """Determine whether it is necessary to set up a scheduler"""
+        if self._model_spec is None:
+            return False
+        if scheduler is None:
+            return False
+        if "FLUX" in self._model_spec.model_name:
+            logger.warning("FLUX model, skipping scheduler setup")
+            return False
+        return True
+
     @contextlib.contextmanager
-    def _reset_when_done(model: Any, sampler_name: str):
-        assert model is not None
+    def _reset_when_done(self, model: Any, sampler_name: str):
         scheduler = DiffusionModel._get_scheduler(model, sampler_name)
-        if scheduler:
+        if self._need_set_scheduler(scheduler):
+            logger.debug("Use scheduler %s", scheduler)
             default_scheduler = model.scheduler
             model.scheduler = scheduler
             try:
@@ -517,7 +526,7 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         for key in list(kwargs):
             allow_key = model_accept_param(key, model)
             if not allow_key:
-                warnings.warn(f"{type(model)} cannot accept `{key}`, will ignore it")
+                logger.warning(f"{type(model)} cannot accept `{key}`, will ignore it")
                 kwargs.pop(key)

     def text_to_image(
File: xinference/model/llm/llama_cpp/core.py

@@ -28,7 +28,7 @@ from ....types import (
 )
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import QWEN_TOOL_CALL_FAMILY, ChatModelMixin
+from ..utils import DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, ChatModelMixin

 logger = logging.getLogger(__name__)

@@ -123,18 +123,22 @@ class LlamaCppModel(LLM):

            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

-        # handle legacy cache.
-        model_path = os.path.realpath(
-            os.path.join(
-                self.model_path,
-                self.model_spec.model_file_name_template.format(
-                    quantization=self.quantization
-                ),
+        if os.path.isfile(self.model_path):
+            # mostly passed from --model_path
+            model_path = os.path.realpath(self.model_path)
+        else:
+            # handle legacy cache.
+            model_path = os.path.realpath(
+                os.path.join(
+                    self.model_path,
+                    self.model_spec.model_file_name_template.format(
+                        quantization=self.quantization
+                    ),
+                )
             )
-        )
-        legacy_model_file_path = os.path.join(self.model_path, "model.bin")
-        if os.path.exists(legacy_model_file_path):
-            model_path = legacy_model_file_path
+            legacy_model_file_path = os.path.join(self.model_path, "model.bin")
+            if os.path.exists(legacy_model_file_path):
+                model_path = legacy_model_file_path

         try:
             self._llm = Llama(
@@ -272,8 +276,11 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
         model_family = self.model_family.model_family or self.model_family.model_name
         tools = generate_config.pop("tools", []) if generate_config else None
         full_context_kwargs = {}
-        if tools and model_family in QWEN_TOOL_CALL_FAMILY:
-            full_context_kwargs["tools"] = tools
+        if tools:
+            if model_family in QWEN_TOOL_CALL_FAMILY:
+                full_context_kwargs["tools"] = tools
+            elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
+                self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
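For models in DEEPSEEK_TOOL_CALL_FAMILY, tool definitions are now folded into the messages via _tools_to_messages_for_deepseek() rather than handed to the chat template, while Qwen-family models keep the old path. From the caller's side nothing changes: tools are still supplied through the OpenAI-compatible endpoint. A hypothetical sketch is below; the model uid and tool schema are illustrative, not taken from this diff.

import openai

client = openai.OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-needed")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Look up current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

# "my-deepseek-gguf" stands in for a running llama.cpp chat model whose family
# is in DEEPSEEK_TOOL_CALL_FAMILY; the server-side branch above decides how the
# tool definitions reach the prompt.
response = client.chat.completions.create(
    model="my-deepseek-gguf",
    messages=[{"role": "user", "content": "What is the weather in Paris?"}],
    tools=tools,
)
print(response.choices[0].message)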