xinference 1.3.1.post1__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry flagged this version of xinference; details are available on the registry page.
- xinference/_compat.py +1 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +4 -0
- xinference/core/chat_interface.py +1 -1
- xinference/core/model.py +23 -3
- xinference/core/supervisor.py +6 -0
- xinference/core/worker.py +54 -11
- xinference/model/llm/__init__.py +7 -2
- xinference/model/llm/core.py +1 -0
- xinference/model/llm/llama_cpp/core.py +50 -15
- xinference/model/llm/llm_family.json +388 -13
- xinference/model/llm/llm_family_modelscope.json +373 -14
- xinference/model/llm/mlx/core.py +15 -11
- xinference/model/llm/reasoning_parser.py +17 -9
- xinference/model/llm/sglang/core.py +112 -12
- xinference/model/llm/transformers/core.py +4 -2
- xinference/model/llm/transformers/deepseek_vl.py +1 -1
- xinference/model/llm/transformers/deepseek_vl2.py +287 -0
- xinference/model/llm/transformers/gemma3.py +185 -0
- xinference/model/llm/transformers/intern_vl.py +0 -2
- xinference/model/llm/utils.py +62 -42
- xinference/model/llm/vllm/core.py +157 -11
- xinference/model/llm/vllm/distributed_executor.py +314 -0
- xinference/model/rerank/core.py +16 -11
- xinference/thirdparty/deepseek_vl2/__init__.py +31 -0
- xinference/thirdparty/deepseek_vl2/models/__init__.py +26 -0
- xinference/thirdparty/deepseek_vl2/models/configuration_deepseek.py +210 -0
- xinference/thirdparty/deepseek_vl2/models/conversation.py +310 -0
- xinference/thirdparty/deepseek_vl2/models/modeling_deepseek.py +1975 -0
- xinference/thirdparty/deepseek_vl2/models/modeling_deepseek_vl_v2.py +697 -0
- xinference/thirdparty/deepseek_vl2/models/processing_deepseek_vl_v2.py +675 -0
- xinference/thirdparty/deepseek_vl2/models/siglip_vit.py +661 -0
- xinference/thirdparty/deepseek_vl2/serve/__init__.py +0 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/__init__.py +0 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/gradio_utils.py +83 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/overwrites.py +81 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/presets.py +115 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/utils.py +333 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/Kelpy-Codos.js +100 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/avatar.png +0 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/custom.css +355 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/custom.js +22 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/favicon.ico +0 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/simsun.ttc +0 -0
- xinference/thirdparty/deepseek_vl2/serve/inference.py +197 -0
- xinference/thirdparty/deepseek_vl2/utils/__init__.py +18 -0
- xinference/thirdparty/deepseek_vl2/utils/io.py +80 -0
- xinference/types.py +2 -2
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.b494ae7e.css +2 -0
- xinference/web/ui/build/static/css/main.b494ae7e.css.map +1 -0
- xinference/web/ui/build/static/js/main.5ca4eea1.js +3 -0
- xinference/web/ui/build/static/js/main.5ca4eea1.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0f0967acaec5df1d45b80010949c258d64297ebbb0f44b8bb3afcbd45c6f0ec4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/68249645124f37d01eef83b1d897e751f895bea919b6fb466f907c1f87cebc84.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/cc97b49285d7717c63374766c789141a4329a04582ab32756d7e0e614d4c5c7f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +1 -0
- xinference/web/ui/src/locales/en.json +2 -2
- xinference/web/ui/src/locales/zh.json +1 -1
- {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/METADATA +4 -4
- {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/RECORD +67 -41
- xinference/web/ui/build/static/css/main.f8177338.css +0 -2
- xinference/web/ui/build/static/css/main.f8177338.css.map +0 -1
- xinference/web/ui/build/static/js/main.55b70cb7.js +0 -3
- xinference/web/ui/build/static/js/main.55b70cb7.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +0 -1
- /xinference/web/ui/build/static/js/{main.55b70cb7.js.LICENSE.txt → main.5ca4eea1.js.LICENSE.txt} +0 -0
- {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/LICENSE +0 -0
- {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/WHEEL +0 -0
- {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.3.1.post1.dist-info → xinference-1.4.1.dist-info}/top_level.txt +0 -0
xinference/_compat.py
CHANGED
@@ -102,6 +102,7 @@ class CreateChatCompletionOpenAI(BaseModel):
     frequency_penalty: Optional[float]
     logit_bias: Optional[Dict[str, int]]
     logprobs: Optional[bool]
+    max_completion_tokens: Optional[int]
     max_tokens: Optional[int]
     n: Optional[int]
     parallel_tool_calls: Optional[bool]
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2025-
+ "date": "2025-04-03T21:26:30+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "
- "version": "1.
+ "full-revisionid": "23260be3b917e7a2e8381927721ed3de815c0a99",
+ "version": "1.4.1"
 }
 '''  # END VERSION_JSON
xinference/api/restful_api.py
CHANGED
@@ -1952,6 +1952,7 @@ class RESTfulAPI(CancelMixin):
             "logit_bias",
             "logit_bias_type",
             "user",
+            "max_completion_tokens",
         }
 
         raw_kwargs = {k: v for k, v in raw_body.items() if k not in exclude}
@@ -1964,6 +1965,9 @@ class RESTfulAPI(CancelMixin):
         if body.max_tokens is None:
             kwargs["max_tokens"] = max_tokens_field.default
 
+        if body.max_completion_tokens is not None:
+            kwargs["max_tokens"] = body.max_completion_tokens
+
         if body.logit_bias is not None:
             raise HTTPException(status_code=501, detail="Not implemented")
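Together with the new max_completion_tokens field in _compat.py, the RESTful API now accepts clients that send max_completion_tokens instead of max_tokens and simply maps it onto the existing max_tokens kwarg. A minimal sketch of how a caller might exercise this, assuming a recent openai client; the endpoint URL and model uid are placeholders:

from openai import OpenAI

# Hypothetical local Xinference endpoint; the api_key is ignored by the server.
client = OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-needed")

resp = client.chat.completions.create(
    model="my-llm",  # placeholder: the model uid you launched in Xinference
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    # Newer OpenAI clients prefer max_completion_tokens over max_tokens;
    # the handler above forwards it internally as max_tokens.
    max_completion_tokens=32,
)
print(resp.choices[0].message.content)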
xinference/core/chat_interface.py
CHANGED

@@ -137,7 +137,7 @@ class GradioInterface:
         ):
             assert isinstance(chunk, dict)
             delta = chunk["choices"][0]["delta"]
-            if "content" not in delta:
+            if "content" not in delta or delta["content"] is None:
                 continue
             else:
                 # some model like deepseek-r1-distill-qwen
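The extra None check matters because streamed deltas are not guaranteed to carry text: the first chunk often only sets the role, and reasoning models such as the deepseek-r1 distills may emit chunks whose content is None while reasoning_content is populated. A defensive consumer in the same spirit (the chunk shapes below are invented for illustration):

def collect_text(chunks):
    """Accumulate streamed text, skipping role-only or content-less deltas."""
    pieces = []
    for chunk in chunks:
        delta = chunk["choices"][0]["delta"]
        if "content" not in delta or delta["content"] is None:
            continue  # e.g. role-only first chunk, or reasoning-only chunk
        pieces.append(delta["content"])
    return "".join(pieces)

stream = [
    {"choices": [{"delta": {"role": "assistant"}}]},
    {"choices": [{"delta": {"content": None, "reasoning_content": "thinking..."}}]},
    {"choices": [{"delta": {"content": "Hello"}}]},
    {"choices": [{"delta": {"content": "!"}}]},
]
print(collect_text(stream))  # -> Hello!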
xinference/core/model.py
CHANGED
@@ -185,7 +185,7 @@ class ModelActor(xo.StatelessActor, CancelMixin):
         )
 
         if hasattr(self._model, "stop") and callable(self._model.stop):
-            self._model.stop
+            await asyncio.to_thread(self._model.stop)
 
         if isinstance(self._model, LLMVLLMModel):
             if self._transfer_ref is not None:
@@ -284,6 +284,8 @@ class ModelActor(xo.StatelessActor, CancelMixin):
     async def __post_create__(self):
         self._loop = asyncio.get_running_loop()
 
+        logger.debug("Starting ModelActor at %s, uid: %s", self.address, self.uid)
+
         self._handle_pending_requests_task = asyncio.create_task(
             self._handle_pending_requests()
         )
@@ -463,7 +465,9 @@ class ModelActor(xo.StatelessActor, CancelMixin):
         while True:
             i += 1
             try:
-                self._model
+                if hasattr(self._model, "set_loop"):
+                    self._model.set_loop(asyncio.get_running_loop())
+                await asyncio.to_thread(self._model.load)
                 if hasattr(self._model, "driver_info"):
                     self._driver_info = self._model.driver_info
                 break
@@ -490,7 +494,23 @@ class ModelActor(xo.StatelessActor, CancelMixin):
 
     async def wait_for_load(self):
         if hasattr(self._model, "wait_for_load"):
-            self._model.wait_for_load
+            await asyncio.to_thread(self._model.wait_for_load)
+
+    def need_create_pools(self):
+        return getattr(self._model, "need_create_pools", False)
+
+    def set_pool_addresses(self, pool_addresses: List[str]):
+        if hasattr(self._model, "set_pool_addresses"):
+            self._model.set_pool_addresses(pool_addresses)
+
+    def get_pool_addresses(self) -> Optional[List[str]]:
+        if hasattr(self._model, "get_pool_addresses"):
+            return self._model.get_pool_addresses()
+        return None
+
+    def set_worker_addresses(self, shard: int, worker_addresses: List[str]):
+        if hasattr(self._model, "set_worker_addresses"):
+            self._model.set_worker_addresses(shard, worker_addresses)
 
     def model_uid(self):
         return (
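The recurring change in this file is pushing blocking model calls (load, stop, wait_for_load) off the actor's event loop with asyncio.to_thread, so the ModelActor keeps serving messages while a large checkpoint loads or shuts down. A self-contained sketch of the idea; SlowModel is invented for illustration:

import asyncio
import time

class SlowModel:
    def load(self):
        time.sleep(2)  # stand-in for loading weights from disk
        return "loaded"

async def main():
    model = SlowModel()
    # Run the blocking load in a worker thread; the event loop keeps running
    # other coroutines (here, a heartbeat) in the meantime.
    load_task = asyncio.create_task(asyncio.to_thread(model.load))
    while not load_task.done():
        print("event loop still responsive...")
        await asyncio.sleep(0.5)
    print(await load_task)

asyncio.run(main())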
xinference/core/supervisor.py
CHANGED
@@ -1097,6 +1097,7 @@ class SupervisorActor(xo.StatelessActor):
             xavier_config=xavier_config,
             **kwargs,
         )
+        await worker_ref.wait_for_load(_replica_model_uid)
         self._replica_model_uid_to_worker[_replica_model_uid] = worker_ref
         return subpool_address
 
@@ -1242,6 +1243,11 @@ class SupervisorActor(xo.StatelessActor):
             available_workers.append(worker_ip)
 
         async def _launch_model():
+            # Validation of n_worker, intercept if it is greater than the available workers.
+            if n_worker > len(available_workers):
+                raise ValueError(
+                    "n_worker cannot be larger than the number of available workers."
+                )
             try:
                 for _idx, rep_model_uid in enumerate(
                     iter_replica_model_uid(model_uid, replica)
xinference/core/worker.py
CHANGED
@@ -874,7 +874,7 @@ class WorkerActor(xo.StatelessActor):
         subpool_address, devices = await self._create_subpool(
             model_uid, model_type, n_gpu=n_gpu, gpu_idx=gpu_idx
         )
-
+        all_subpool_addresses = [subpool_address]
         try:
             xavier_config: Optional[Dict] = kwargs.pop("xavier_config", None)
             if xavier_config is not None:
@@ -885,7 +885,7 @@ class WorkerActor(xo.StatelessActor):
                 # add a few kwargs
                 model_kwargs.update(
                     dict(
-                        address=
+                        address=subpool_address,
                         n_worker=n_worker,
                         shard=shard,
                         driver_info=driver_info,
@@ -923,11 +923,28 @@ class WorkerActor(xo.StatelessActor):
                 shard=shard,
                 driver_info=driver_info,
             )
+            if await model_ref.need_create_pools() and (
+                len(devices) > 1 or n_worker > 1  # type: ignore
+            ):
+                coros = []
+                env_name = get_available_device_env_name() or "CUDA_VISIBLE_DEVICES"
+                env_value = ",".join(devices)
+                for device in devices:
+                    coros.append(
+                        self._main_pool.append_sub_pool(
+                            env={env_name: env_value},
+                            start_method=self._get_start_method(),
+                        )
+                    )
+                pool_addresses = await asyncio.gather(*coros)
+                all_subpool_addresses.extend(pool_addresses)
+                await model_ref.set_pool_addresses(pool_addresses)
             await model_ref.load()
         except:
             logger.error(f"Failed to load model {model_uid}", exc_info=True)
             self.release_devices(model_uid=model_uid)
-
+            for addr in all_subpool_addresses:
+                await self._main_pool.remove_sub_pool(addr)
             raise
         self._model_uid_to_model[model_uid] = model_ref
         self._model_uid_to_model_spec[model_uid] = model_description
@@ -994,15 +1011,36 @@ class WorkerActor(xo.StatelessActor):
         if model_ref is None:
             logger.debug("Model not found, uid: %s", model_uid)
 
+        pool_addresses = None
+        if model_ref is not None:
+            try:
+                # pool addresses if model.need_create_pools()
+                pool_addresses = await model_ref.get_pool_addresses()
+            except Exception as e:
+                # process may disappear, we just ignore it.
+                logger.debug("Fail to get pool addresses, error: %s", e)
+
         try:
-
+            logger.debug("Start to destroy model actor: %s", model_ref)
+            coro = xo.destroy_actor(model_ref)
+            await asyncio.wait_for(coro, timeout=5)
         except Exception as e:
             logger.debug(
                 "Destroy model actor failed, model uid: %s, error: %s", model_uid, e
             )
         try:
+            to_remove_addresses = []
             subpool_address = self._model_uid_to_addr[model_uid]
-
+            to_remove_addresses.append(subpool_address)
+            if pool_addresses:
+                to_remove_addresses.extend(pool_addresses)
+            logger.debug("Remove sub pools: %s", to_remove_addresses)
+            coros = []
+            for to_remove_addr in to_remove_addresses:
+                coros.append(
+                    self._main_pool.remove_sub_pool(to_remove_addr, force=True)
+                )
+            await asyncio.gather(*coros)
         except Exception as e:
             logger.debug(
                 "Remove sub pool failed, model uid: %s, error: %s", model_uid, e
@@ -1204,18 +1242,23 @@ class WorkerActor(xo.StatelessActor):
             model_ref = self._model_uid_to_model[rep_model_uid]
             await model_ref.start_transfer_for_vllm(rank_addresses)
 
-    @
-
-        self, rep_model_uid: str, xavier_config: Dict[str, Any]
-    ) -> Tuple[str, int]:
-        from ..model.llm.vllm.xavier.collective_manager import Rank0ModelActor
-
+    @staticmethod
+    def _get_start_method():
         if os.name != "nt" and platform.system() != "Darwin":
             # Linux
             start_method = "forkserver"
         else:
             # Windows and macOS
             start_method = "spawn"
+        return start_method
+
+    @log_async(logger=logger, level=logging.INFO)
+    async def launch_rank0_model(
+        self, rep_model_uid: str, xavier_config: Dict[str, Any]
+    ) -> Tuple[str, int]:
+        from ..model.llm.vllm.xavier.collective_manager import Rank0ModelActor
+
+        start_method = self._get_start_method()
         subpool_address = await self._main_pool.append_sub_pool(
             start_method=start_method
         )
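The worker now records every sub-pool address it spawns (the primary one plus any per-device pools requested via need_create_pools) so that a failed load, or a later termination, tears all of them down instead of only the first. The shape of that pattern, reduced to plain asyncio with invented create_pool/destroy_pool helpers standing in for the xoscar pool calls:

import asyncio

async def create_pool(device):
    # Stand-in for spawning a subprocess pool bound to one device.
    await asyncio.sleep(0.1)
    return f"pool-for-{device}"

async def destroy_pool(address):
    print("removing", address)

async def launch(devices, simulate_failure=True):
    created = []
    try:
        # Start one pool per device concurrently, remembering every address.
        created = list(await asyncio.gather(*(create_pool(d) for d in devices)))
        if simulate_failure:
            raise RuntimeError("model load failed")
        return created
    except Exception:
        # On failure, tear down every pool created so far, then re-raise.
        await asyncio.gather(*(destroy_pool(a) for a in created))
        raise

try:
    asyncio.run(launch(["cuda:0", "cuda:1"]))
except RuntimeError as exc:
    print("launch failed:", exc)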
xinference/model/llm/__init__.py
CHANGED
@@ -132,7 +132,7 @@ def _install():
     from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel, XllamaCppModel
     from .lmdeploy.core import LMDeployChatModel, LMDeployModel
     from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
-    from .sglang.core import SGLANGChatModel, SGLANGModel
+    from .sglang.core import SGLANGChatModel, SGLANGModel, SGLANGVisionModel
     from .transformers.chatglm import ChatglmPytorchChatModel
     from .transformers.cogagent import CogAgentChatModel
     from .transformers.cogvlm2 import CogVLM2Model
@@ -143,6 +143,8 @@ def _install():
         DeepSeekV2PytorchModel,
     )
     from .transformers.deepseek_vl import DeepSeekVLChatModel
+    from .transformers.deepseek_vl2 import DeepSeekVL2ChatModel
+    from .transformers.gemma3 import Gemma3ChatModel, Gemma3TextChatModel
     from .transformers.glm4v import Glm4VModel
     from .transformers.glm_edge_v import GlmEdgeVModel
     from .transformers.intern_vl import InternVLChatModel
@@ -172,7 +174,7 @@ def _install():
             XllamaCppModel,
         ]
     )
-    SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
+    SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel, SGLANGVisionModel])
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
     MLX_CLASSES.extend([MLXModel, MLXChatModel, MLXVisionModel])
     LMDEPLOY_CLASSES.extend([LMDeployModel, LMDeployChatModel])
@@ -186,6 +188,7 @@ def _install():
             Qwen2AudioChatModel,
             YiVLChatModel,
             DeepSeekVLChatModel,
+            DeepSeekVL2ChatModel,
             InternVLChatModel,
             PytorchModel,
             CogVLM2Model,
@@ -198,6 +201,8 @@ def _install():
             OptPytorchModel,
             GlmEdgeVModel,
             CogAgentChatModel,
+            Gemma3TextChatModel,
+            Gemma3ChatModel,
         ]
     )
     if OmniLMMModel:  # type: ignore
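With DeepSeekVL2ChatModel and the two Gemma-3 classes registered here (and the matching entries added to llm_family.json), these families become launchable like any other built-in model. A hypothetical launch via the Python client; the model name is a placeholder, so check the registered names in llm_family.json for the exact identifier:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")   # local supervisor endpoint (placeholder)
model_uid = client.launch_model(
    model_name="gemma-3-it",               # placeholder: use the registered family name
    model_engine="transformers",           # served by the new Gemma3ChatModel class
)
model = client.get_model(model_uid)
print(model.chat(messages=[{"role": "user", "content": "Hi"}]))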
xinference/model/llm/llama_cpp/core.py
CHANGED
@@ -39,10 +39,15 @@ logger = logging.getLogger(__name__)
 USE_XLLAMACPP = bool(int(os.environ.get("USE_XLLAMACPP", 0)))
 
 
-class
+class _Done:
     pass
 
 
+class _Error:
+    def __init__(self, msg):
+        self.msg = msg
+
+
 class XllamaCppModel(LLM, ChatModelMixin):
     def __init__(
         self,
@@ -200,7 +205,14 @@ class XllamaCppModel(LLM, ChatModelMixin):
         )
         prompt_json = orjson.dumps(data)
 
-        def
+        def _error_callback(err):
+            try:
+                msg = orjson.loads(err)
+                q.put(_Error(msg))
+            except Exception as e:
+                q.put(_Error(str(e)))
+
+        def _ok_callback(ok):
             try:
                 res = orjson.loads(ok)
                 res["model"] = self.model_uid
@@ -209,10 +221,10 @@ class XllamaCppModel(LLM, ChatModelMixin):
                 logger.exception("handle_completions callback failed: %s", e)
 
             try:
-                self._llm.handle_completions(prompt_json,
+                self._llm.handle_completions(prompt_json, _error_callback, _ok_callback)
             except Exception as ex:
                 logger.exception("handle_completions failed: %s", ex)
-                q.put(
+                q.put(_Done)
 
         assert self._executor
         self._executor.submit(_handle_completion)
@@ -220,12 +232,17 @@ class XllamaCppModel(LLM, ChatModelMixin):
         if stream:
 
             def _to_iterator():
-                while (r := q.get()) is not
+                while (r := q.get()) is not _Done:
+                    if type(r) is _Error:
+                        raise Exception("Got error in generate stream: %s", r.msg)
                     yield r
 
             return _to_iterator()
         else:
-
+            r = q.get()
+            if type(r) is _Error:
+                raise Exception("Got error in generate: %s", r.msg)
+            return r
 
     def chat(
         self,
@@ -253,7 +270,14 @@ class XllamaCppModel(LLM, ChatModelMixin):
         )
         prompt_json = orjson.dumps(data)
 
-        def
+        def _error_callback(err):
+            try:
+                msg = orjson.loads(err)
+                q.put(_Error(msg))
+            except Exception as e:
+                q.put(_Error(str(e)))
+
+        def _ok_callback(ok):
             try:
                 res = orjson.loads(ok)
                 res["model"] = self.model_uid
@@ -263,11 +287,11 @@ class XllamaCppModel(LLM, ChatModelMixin):
 
             try:
                 self._llm.handle_chat_completions(
-                    prompt_json,
+                    prompt_json, _error_callback, _ok_callback
                 )
             except Exception as ex:
                 logger.exception("handle_chat_completions failed: %s", ex)
-                q.put(
+                q.put(_Done)
 
         assert self._executor
         self._executor.submit(_handle_chat_completion)
@@ -275,14 +299,24 @@ class XllamaCppModel(LLM, ChatModelMixin):
         if stream:
 
             def _to_iterator():
-                while (r := q.get()) is not
-
+                while (r := q.get()) is not _Done:
+                    if type(r) is _Error:
+                        raise Exception("Got error in chat stream: %s", r.msg)
+                    # Get valid keys (O(1) lookup)
+                    chunk_keys = ChatCompletionChunk.__annotations__
+                    # The chunk may contain additional keys (e.g., system_fingerprint),
+                    # which might not conform to OpenAI/DeepSeek formats.
+                    # Filter out keys that are not part of ChatCompletionChunk.
+                    yield {key: r[key] for key in chunk_keys if key in r}
 
             return self._to_chat_completion_chunks(
                 _to_iterator(), self.reasoning_parser
             )
         else:
-
+            r = q.get()
+            if type(r) is _Error:
+                raise Exception("Got error in chat: %s", r.msg)
+            return self._to_chat_completion(r, self.reasoning_parser)
 
 
 class LlamaCppModel(LLM):
@@ -533,10 +567,11 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
         tools = generate_config.pop("tools", []) if generate_config else None
         full_context_kwargs = {}
         if tools:
-            if
+            if (
+                model_family in QWEN_TOOL_CALL_FAMILY
+                or model_family in DEEPSEEK_TOOL_CALL_FAMILY
+            ):
                 full_context_kwargs["tools"] = tools
-            elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
-                self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
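The XllamaCppModel changes route llama.cpp callbacks through a queue: the worker thread pushes decoded chunks, an _Error wrapper when the native side reports a failure, and a _Done sentinel when it finishes, and the consumer loop re-raises on error. A stripped-down, self-contained sketch of that pattern; the _Done/_Error names mirror the diff, everything else is illustrative:

import queue
import threading

class _Done:          # sentinel: producer is finished
    pass

class _Error:         # carries an error message across the thread boundary
    def __init__(self, msg):
        self.msg = msg

def produce(q):
    try:
        for i in range(3):
            q.put({"choices": [{"delta": {"content": f"tok{i} "}}]})
        # raise RuntimeError("backend failed")  # uncomment to exercise the error path
    except Exception as e:
        q.put(_Error(str(e)))
    finally:
        q.put(_Done)

def consume(q):
    while (r := q.get()) is not _Done:
        if type(r) is _Error:
            raise RuntimeError(f"generation failed: {r.msg}")
        yield r

q = queue.Queue()
threading.Thread(target=produce, args=(q,), daemon=True).start()
for chunk in consume(q):
    print(chunk["choices"][0]["delta"]["content"], end="")
print()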
|