xinference 1.3.0.post1__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of xinference has been flagged as possibly problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +1 -0
- xinference/conftest.py +7 -0
- xinference/core/model.py +3 -1
- xinference/core/scheduler.py +3 -0
- xinference/core/worker.py +1 -1
- xinference/deploy/cmdline.py +0 -8
- xinference/model/embedding/core.py +12 -5
- xinference/model/llm/__init__.py +2 -1
- xinference/model/llm/core.py +13 -0
- xinference/model/llm/llama_cpp/core.py +260 -3
- xinference/model/llm/llm_family.json +306 -17
- xinference/model/llm/llm_family_modelscope.json +347 -28
- xinference/model/llm/mlx/core.py +15 -4
- xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +1 -1
- xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py +4 -5
- xinference/model/llm/sglang/core.py +7 -2
- xinference/model/llm/transformers/chatglm.py +4 -4
- xinference/model/llm/transformers/core.py +22 -5
- xinference/model/llm/transformers/intern_vl.py +2 -1
- xinference/model/llm/transformers/utils.py +1 -1
- xinference/model/llm/utils.py +103 -67
- xinference/model/llm/vllm/core.py +29 -42
- xinference/types.py +4 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.55b70cb7.js +3 -0
- xinference/web/ui/build/static/js/main.55b70cb7.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/87a9b13f2466f375ae5c6e7c08b279cc38351d29710d7f7626bbb07a85262b79.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +1 -0
- xinference/web/ui/src/locales/en.json +9 -1
- xinference/web/ui/src/locales/zh.json +9 -1
- {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/METADATA +7 -3
- {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/RECORD +44 -43
- xinference/web/ui/build/static/js/main.ad42919c.js +0 -3
- xinference/web/ui/build/static/js/main.ad42919c.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +0 -1
- /xinference/web/ui/build/static/js/{main.ad42919c.js.LICENSE.txt → main.55b70cb7.js.LICENSE.txt} +0 -0
- {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/LICENSE +0 -0
- {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/WHEEL +0 -0
- {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.3.0.post1.dist-info → xinference-1.3.1.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
-    "date": "2025-
+    "date": "2025-03-09T12:06:50+0800",
     "dirty": false,
     "error": null,
-    "full-revisionid": "
-    "version": "1.3.
+    "full-revisionid": "5d6ec937ce2aca2511e9e0debc4c2ab06ca41f09",
+    "version": "1.3.1"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py
CHANGED
@@ -1330,6 +1330,7 @@ class RESTfulAPI(CancelMixin):
             raise HTTPException(status_code=500, detail=str(e))
 
         try:
+            kwargs["model_uid"] = model_uid
             embedding = await model.create_embedding(body.input, **kwargs)
             return Response(embedding, media_type="application/json")
         except Exception as e:
xinference/conftest.py
CHANGED
xinference/core/model.py
CHANGED
@@ -231,6 +231,7 @@ class ModelActor(xo.StatelessActor, CancelMixin):
         driver_info: Optional[dict] = None,  # for model across workers
     ):
         super().__init__()
+        from ..model.llm.llama_cpp.core import XllamaCppModel
         from ..model.llm.lmdeploy.core import LMDeployModel
         from ..model.llm.sglang.core import SGLANGModel
         from ..model.llm.transformers.core import PytorchModel
@@ -251,7 +252,8 @@ class ModelActor(xo.StatelessActor, CancelMixin):
         self._lock = (
             None
             if isinstance(
-                self._model,
+                self._model,
+                (PytorchModel, VLLMModel, SGLANGModel, LMDeployModel, XllamaCppModel),
             )
             else asyncio.locks.Lock()
         )
xinference/core/scheduler.py
CHANGED
@@ -97,6 +97,9 @@ class InferenceRequest:
         # check the integrity of args passed upstream
         self._check_args()
 
+        # for reasoning_content using
+        self.previous_texts = [""]
+
     def _check_args(self):
         assert len(self._inference_args) == 1
         # generate config
xinference/core/worker.py
CHANGED
@@ -1002,7 +1002,7 @@ class WorkerActor(xo.StatelessActor):
         )
         try:
             subpool_address = self._model_uid_to_addr[model_uid]
-            await self._main_pool.remove_sub_pool(subpool_address)
+            await self._main_pool.remove_sub_pool(subpool_address, force=True)
         except Exception as e:
             logger.debug(
                 "Remove sub pool failed, model uid: %s, error: %s", model_uid, e
xinference/deploy/cmdline.py
CHANGED
@@ -821,12 +821,6 @@ def remove_cache(
     type=bool,
     help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
 )
-@click.option(
-    "--reasoning-content",
-    default=False,
-    type=bool,
-    help="Whether or not to enable reasoning content in model responses.",
-)
 @click.option(
     "--api-key",
     "-ak",
@@ -855,7 +849,6 @@ def model_launch(
     worker_ip: Optional[str],
     gpu_idx: Optional[str],
     trust_remote_code: bool,
-    reasoning_content: bool,
     api_key: Optional[str],
     model_path: Optional[str],
 ):
@@ -948,7 +941,6 @@ def model_launch(
         gpu_idx=_gpu_idx,
         trust_remote_code=trust_remote_code,
         model_path=model_path,
-        reasoning_content=reasoning_content,
         **kwargs,
     )
 
xinference/model/embedding/core.py
CHANGED
@@ -268,7 +268,7 @@ class EmbeddingModel:
         **kwargs,
     ):
         sentences = self._fix_langchain_openai_inputs(sentences)
-
+        model_uid = kwargs.pop("model_uid", None)
         from sentence_transformers import SentenceTransformer
 
         kwargs.setdefault("normalize_embeddings", True)
@@ -546,8 +546,14 @@ class EmbeddingModel:
             # when batching, the attention mask 1 means there is a token
             # thus we just sum up it to get the total number of tokens
             if "clip" in self._model_spec.model_name.lower():
-
-
+                if "input_ids" in features and hasattr(
+                    features["input_ids"], "numel"
+                ):
+                    all_token_nums += features["input_ids"].numel()
+                if "pixel_values" in features and hasattr(
+                    features["pixel_values"], "numel"
+                ):
+                    all_token_nums += features["pixel_values"].numel()
             else:
                 all_token_nums += features["attention_mask"].sum().item()
 
@@ -657,7 +663,7 @@ class EmbeddingModel:
                 self._model,
                 objs,
                 convert_to_numpy=False,
-                **
+                **kwargs,
             )
         else:
             all_embeddings, all_token_nums = encode(
@@ -693,7 +699,8 @@ class EmbeddingModel:
                 if not is_bge_m3_flag_model and not kwargs.get("return_sparse")
                 else "dict"
             ),
-            model=
+            model=model_uid,  # type: ignore
+            model_replica=self._model_uid,
             data=embedding_list,
             usage=usage,
         )
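Taken together with the restful_api.py change above, the embedding response now reports the launched model uid in "model" and moves the replica-level uid to the new "model_replica" field. A minimal sketch of how a client would observe this, assuming a locally running server on the default port and a hypothetical model uid:

import requests  # assumption: any HTTP client against the OpenAI-compatible endpoint works

resp = requests.post(
    "http://127.0.0.1:9997/v1/embeddings",  # assumed default local endpoint; adjust to your deployment
    json={"model": "my-embedding-model", "input": "hello"},  # "my-embedding-model" is a placeholder uid
)
payload = resp.json()
# With this diff, "model" carries the uid passed down from the API layer,
# while the serving replica's uid appears in the new "model_replica" field.
print(payload["model"], payload.get("model_replica"))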
xinference/model/llm/__init__.py
CHANGED
@@ -129,7 +129,7 @@ def register_custom_model():
 
 
 def _install():
-    from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel
+    from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel, XllamaCppModel
    from .lmdeploy.core import LMDeployChatModel, LMDeployModel
    from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
    from .sglang.core import SGLANGChatModel, SGLANGModel
@@ -169,6 +169,7 @@ def _install():
         [
             LlamaCppChatModel,
             LlamaCppModel,
+            XllamaCppModel,
         ]
     )
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
xinference/model/llm/core.py
CHANGED
@@ -25,6 +25,8 @@ from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
 from ...core.utils import parse_replica_model_uid
 from ...types import PeftModelConfig
 from ..core import ModelDescription
+from .reasoning_parsers import deepseek_r1_reasoning_parser  # noqa: F401
+from .reasoning_parsers.abs_reasoning_parsers import ReasoningParserManager
 
 if TYPE_CHECKING:
     from .llm_family import LLMFamilyV1, LLMSpecV1
@@ -57,6 +59,7 @@ class LLM(abc.ABC):
         self.model_spec = model_spec
         self.quantization = quantization
         self.model_path = model_path
+        self.reasoning_parser = None
         if args:
             raise ValueError(f"Unrecognized positional arguments: {args}")
         if kwargs:
@@ -117,6 +120,16 @@ class LLM(abc.ABC):
     ) -> bool:
         raise NotImplementedError
 
+    def prepare_parse_reasoning_content(self, reasoning_content):
+        # Initialize reasoning parser if model has reasoning ability
+        if "reasoning" in self.model_family.model_ability and reasoning_content:
+            module_name = self.model_family.model_family or self.model_family.model_name
+            self.reasoning_parser = ReasoningParserManager.get_parser(module_name)
+            self.reasoning_parser = self.reasoning_parser(
+                self.model_family.reasoning_start_tag,
+                self.model_family.reasoning_end_tag,
+            )
+
 
 class LLMDescription(ModelDescription):
     def __init__(
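For orientation, prepare_parse_reasoning_content looks up a parser by family name and hands it the family's reasoning_start_tag/reasoning_end_tag. A rough, simplified sketch of what tag-based splitting amounts to (this is not the actual ReasoningParser API from the diff; the "<think>" tags are an assumption for DeepSeek-R1-style output):

def split_reasoning(text: str, start_tag: str = "<think>", end_tag: str = "</think>"):
    # Return (reasoning, content): the text between the tags vs. everything else.
    if start_tag in text and end_tag in text:
        head, rest = text.split(start_tag, 1)
        reasoning, answer = rest.split(end_tag, 1)
        return reasoning.strip(), (head + answer).strip()
    return None, text


# e.g. split_reasoning("<think>2 + 2 = 4</think>The answer is 4.")
# -> ("2 + 2 = 4", "The answer is 4.")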
xinference/model/llm/llama_cpp/core.py
CHANGED
@@ -11,11 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import concurrent.futures
 import logging
 import os
+import queue
 import time
 from typing import Dict, Iterator, List, Optional, Union
 
+import orjson
+
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
@@ -32,6 +36,248 @@ from ..utils import DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, ChatModelMixin
 
 logger = logging.getLogger(__name__)
 
+USE_XLLAMACPP = bool(int(os.environ.get("USE_XLLAMACPP", 0)))
+
+
+class _Sentinel:
+    pass
+
+
+class XllamaCppModel(LLM):
+    def __init__(
+        self,
+        model_uid: str,
+        model_family: "LLMFamilyV1",
+        model_spec: "LLMSpecV1",
+        quantization: str,
+        model_path: str,
+        llamacpp_model_config: Optional[LlamaCppModelConfig] = None,
+    ):
+        super().__init__(model_uid, model_family, model_spec, quantization, model_path)
+
+        self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
+            llamacpp_model_config
+        )
+        self._llm = None
+        self._executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
+
+    def _sanitize_model_config(
+        self, llamacpp_model_config: Optional[LlamaCppModelConfig]
+    ) -> LlamaCppModelConfig:
+        if llamacpp_model_config is None:
+            llamacpp_model_config = LlamaCppModelConfig()
+
+        if self.model_family.context_length:
+            llamacpp_model_config.setdefault("n_ctx", self.model_family.context_length)
+        llamacpp_model_config.setdefault("use_mmap", False)
+        llamacpp_model_config.setdefault("use_mlock", True)
+
+        if (
+            "llama-2" in self.model_family.model_name
+            and self.model_spec.model_size_in_billions == 70
+        ):
+            llamacpp_model_config["use_mlock"] = False
+            llamacpp_model_config["n_gqa"] = 8
+
+        if self._is_darwin_and_apple_silicon():
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)
+        elif self._is_linux():
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)
+
+        return llamacpp_model_config
+
+    def _sanitize_generate_config(
+        self, generate_config: Optional[LlamaCppGenerateConfig]
+    ) -> LlamaCppGenerateConfig:
+        if generate_config is None:
+            generate_config = LlamaCppGenerateConfig(
+                **CreateCompletionLlamaCpp().dict()
+            )
+        else:
+            from llama_cpp import LlamaGrammar
+
+            grammar = generate_config.get("grammar")
+            if grammar is not None and not isinstance(grammar, LlamaGrammar):
+                generate_config["grammar"] = LlamaGrammar.from_string(
+                    generate_config["grammar"]
+                )
+            # Validate generate_config and fill default values to the generate config.
+            generate_config = LlamaCppGenerateConfig(
+                **CreateCompletionLlamaCpp(**generate_config).dict()
+            )
+        # Currently, llama.cpp does not support lora
+        generate_config.pop("lora_name", None)  # type: ignore
+        return generate_config
+
+    @classmethod
+    def match(
+        cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
+    ) -> bool:
+        if llm_spec.model_format not in ["ggufv2"]:
+            return False
+        if (
+            "chat" not in llm_family.model_ability
+            and "generate" not in llm_family.model_ability
+        ):
+            return False
+        return True
+
+    def load(self):
+        try:
+            from xllamacpp import CommonParams, Server
+        except ImportError:
+            error_message = "Failed to import module 'xllamacpp'"
+            installation_guide = ["Please make sure 'xllamacpp' is installed. "]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        if os.path.isfile(self.model_path):
+            # mostly passed from --model_path
+            model_path = os.path.realpath(self.model_path)
+        else:
+            # handle legacy cache.
+            model_path = os.path.realpath(
+                os.path.join(
+                    self.model_path,
+                    self.model_spec.model_file_name_template.format(
+                        quantization=self.quantization
+                    ),
+                )
+            )
+            legacy_model_file_path = os.path.join(self.model_path, "model.bin")
+            if os.path.exists(legacy_model_file_path):
+                model_path = legacy_model_file_path
+
+        try:
+            params = CommonParams()
+            params.model = model_path
+            if self.model_family.chat_template:
+                params.chat_template = self.model_family.chat_template
+            # This is the default value, could be overwritten by _llamacpp_model_config
+            params.n_parallel = os.cpu_count()
+            for k, v in self._llamacpp_model_config.items():
+                try:
+                    setattr(params, k, v)
+                except Exception as e:
+                    logger.error("Failed to set the param %s = %s, error: %s", k, v, e)
+            n_threads = self._llamacpp_model_config.get("n_threads", os.cpu_count())
+            params.cpuparams.n_threads = n_threads
+            params.cpuparams_batch.n_threads = n_threads
+            if params.n_gpu_layers == -1:
+                # Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
+                # 0x7FFFFFFF is INT32 max, will be auto set to all layers
+                params.n_gpu_layers = 0x7FFFFFFF
+            self._llm = Server(params)
+            self._executor = concurrent.futures.ThreadPoolExecutor(
+                max_workers=max(10, n_threads)
+            )
+        except AssertionError:
+            raise RuntimeError(f"Load model {self.model_family.model_name} failed")
+
+    def generate(
+        self, prompt: str, generate_config: Optional[LlamaCppGenerateConfig] = None
+    ) -> Union[Completion, Iterator[CompletionChunk]]:
+        generate_config = self._sanitize_generate_config(generate_config)
+        stream = generate_config.get("stream", False)
+        q: queue.Queue = queue.Queue()
+
+        def _handle_completion():
+            # TODO(fyrestone): Replace the LlamaCppGenerateConfig with OpenAI params.
+            data = generate_config
+            data.pop("stopping_criteria", None)
+            data.pop("logits_processor", None)
+            data.pop("suffix", None)
+            data.pop("best_of", None)
+            data.update(
+                {
+                    "prompt": prompt,
+                    "stream": stream,
+                }
+            )
+            prompt_json = orjson.dumps(data)
+
+            def _res_callback(ok):
+                try:
+                    res = orjson.loads(ok)
+                    res["model"] = self.model_uid
+                    q.put(res)
+                except Exception as e:
+                    logger.exception("handle_completions callback failed: %s", e)
+
+            try:
+                self._llm.handle_completions(prompt_json, _res_callback, _res_callback)
+            except Exception as ex:
+                logger.exception("handle_completions failed: %s", ex)
+            q.put(_Sentinel)
+
+        assert self._executor
+        self._executor.submit(_handle_completion)
+
+        if stream:
+
+            def _to_iterator():
+                while (r := q.get()) is not _Sentinel:
+                    yield r
+
+            return _to_iterator()
+        else:
+            return q.get()
+
+    def chat(
+        self,
+        messages: List[Dict],
+        generate_config: Optional[LlamaCppGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        generate_config = self._sanitize_generate_config(generate_config)
+        stream = generate_config.get("stream", False)
+        tools = generate_config.pop("tools", []) if generate_config else None
+        q: queue.Queue = queue.Queue()
+
+        def _handle_chat_completion():
+            # TODO(fyrestone): Replace the LlamaCppGenerateConfig with OpenAI params.
+            data = generate_config
+            data.pop("stopping_criteria", None)
+            data.pop("logits_processor", None)
+            data.pop("suffix", None)
+            data.pop("best_of", None)
+            data.update(
+                {
+                    "messages": messages,
+                    "stream": stream,
+                    "tools": tools,
+                }
+            )
+            prompt_json = orjson.dumps(data)
+
+            def _res_callback(ok):
+                try:
+                    res = orjson.loads(ok)
+                    res["model"] = self.model_uid
+                    q.put(res)
+                except Exception as e:
+                    logger.exception("handle_chat_completions callback failed: %s", e)
+
+            try:
+                self._llm.handle_chat_completions(
+                    prompt_json, _res_callback, _res_callback
+                )
+            except Exception as ex:
+                logger.exception("handle_chat_completions failed: %s", ex)
+            q.put(_Sentinel)
+
+        assert self._executor
+        self._executor.submit(_handle_chat_completion)
+
+        if stream:
+
+            def _to_iterator():
+                while (r := q.get()) is not _Sentinel:
+                    yield r
+
+            return _to_iterator()
+        else:
+            return q.get()
+
 
 class LlamaCppModel(LLM):
     def __init__(
@@ -76,6 +322,7 @@ class LlamaCppModel(LLM):
             llamacpp_model_config.setdefault("n_gpu_layers", -1)
         elif self._is_linux() and self._can_apply_cublas():
             llamacpp_model_config.setdefault("n_gpu_layers", -1)
+        llamacpp_model_config.setdefault("reasoning_content", False)
 
         return llamacpp_model_config
 
@@ -123,6 +370,9 @@ class LlamaCppModel(LLM):
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
+        reasoning_content = self._llamacpp_model_config.pop("reasoning_content")
+        self.prepare_parse_reasoning_content(reasoning_content)
+
         if os.path.isfile(self.model_path):
             # mostly passed from --model_path
             model_path = os.path.realpath(self.model_path)
@@ -292,10 +542,17 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
         if stream:
             it = self.generate(full_prompt, generate_config)
             assert isinstance(it, Iterator)
-            return self._to_chat_completion_chunks(it)
+            return self._to_chat_completion_chunks(it, self.reasoning_parser)
         else:
            c = self.generate(full_prompt, generate_config)
            assert not isinstance(c, Iterator)
            if tools:
-                return self.
-
+                return self._post_process_completion(
+                    self.model_family, self.model_uid, c, self.reasoning_parser
+                )
+            return self._to_chat_completion(c, self.reasoning_parser)
+
+
+if USE_XLLAMACPP:
+    LlamaCppModel = XllamaCppModel  # type: ignore # noqa: F811
+    LlamaCppChatModel = XllamaCppModel  # type: ignore # noqa: F811
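The new backend is opt-in: the module reads USE_XLLAMACPP once at import time and, when it is set, rebinds both llama.cpp model classes to XllamaCppModel. A minimal sketch of enabling it, assuming xinference 1.3.1 and the xllamacpp package are installed; the variable must be set before the module is imported, for example in the environment that starts the worker:

import os

os.environ["USE_XLLAMACPP"] = "1"  # must happen before xinference imports llama_cpp.core

from xinference.model.llm.llama_cpp import core

# With the flag set, both classes are aliases of the xllamacpp-backed implementation.
assert core.LlamaCppModel is core.XllamaCppModel
assert core.LlamaCppChatModel is core.XllamaCppModel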
|