xinference 1.3.0.post2__py3-none-any.whl → 1.3.1.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +1 -0
- xinference/conftest.py +7 -0
- xinference/core/chat_interface.py +39 -24
- xinference/core/model.py +3 -1
- xinference/core/scheduler.py +3 -0
- xinference/core/worker.py +1 -1
- xinference/model/embedding/core.py +12 -5
- xinference/model/llm/__init__.py +2 -1
- xinference/model/llm/core.py +10 -0
- xinference/model/llm/llama_cpp/core.py +266 -3
- xinference/model/llm/llm_family.json +390 -17
- xinference/model/llm/llm_family_modelscope.json +348 -29
- xinference/model/llm/mlx/core.py +15 -4
- xinference/model/llm/{reasoning_parsers/deepseek_r1_reasoning_parser.py → reasoning_parser.py} +9 -13
- xinference/model/llm/sglang/core.py +7 -2
- xinference/model/llm/transformers/chatglm.py +4 -4
- xinference/model/llm/transformers/core.py +22 -5
- xinference/model/llm/transformers/intern_vl.py +2 -1
- xinference/model/llm/transformers/utils.py +1 -1
- xinference/model/llm/utils.py +134 -60
- xinference/model/llm/vllm/core.py +31 -42
- xinference/types.py +4 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.55b70cb7.js +3 -0
- xinference/web/ui/build/static/js/main.55b70cb7.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/87a9b13f2466f375ae5c6e7c08b279cc38351d29710d7f7626bbb07a85262b79.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +1 -0
- xinference/web/ui/src/locales/en.json +9 -1
- xinference/web/ui/src/locales/zh.json +9 -1
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/METADATA +9 -5
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/RECORD +43 -44
- xinference/model/llm/reasoning_parsers/__init__.py +0 -13
- xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +0 -98
- xinference/web/ui/build/static/js/main.ad42919c.js +0 -3
- xinference/web/ui/build/static/js/main.ad42919c.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +0 -1
- /xinference/web/ui/build/static/js/{main.ad42919c.js.LICENSE.txt → main.55b70cb7.js.LICENSE.txt} +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/LICENSE +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/WHEEL +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/entry_points.txt +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.post1.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
-    "date": "2025-
+    "date": "2025-03-11T12:00:36+0800",
     "dirty": false,
     "error": null,
-    "full-revisionid": "
-    "version": "1.3.
+    "full-revisionid": "2ef99fbb5450a76a6ba07a909f58b8c2e4c22a28",
+    "version": "1.3.1.post1"
 }
 ''' # END VERSION_JSON
 

xinference/api/restful_api.py
CHANGED
@@ -1330,6 +1330,7 @@ class RESTfulAPI(CancelMixin):
             raise HTTPException(status_code=500, detail=str(e))
 
         try:
+            kwargs["model_uid"] = model_uid
             embedding = await model.create_embedding(body.input, **kwargs)
             return Response(embedding, media_type="application/json")
         except Exception as e:

xinference/core/chat_interface.py
CHANGED
@@ -113,6 +113,7 @@ class GradioInterface:
            max_tokens: int,
            temperature: float,
            lora_name: str,
+           stream: bool,
        ) -> Generator:
            from ..client import RESTfulClient
 
@@ -123,29 +124,40 @@ class GradioInterface:
            messages = to_chat(flatten(history))
            messages.append(dict(role="user", content=message))
 
-           … (23 removed lines, old 126-148, not captured in the source view)
+           if stream:
+               response_content = ""
+               for chunk in model.chat(
+                   messages,
+                   generate_config={
+                       "max_tokens": int(max_tokens),
+                       "temperature": temperature,
+                       "stream": True,
+                       "lora_name": lora_name,
+                   },
+               ):
+                   assert isinstance(chunk, dict)
+                   delta = chunk["choices"][0]["delta"]
+                   if "content" not in delta:
+                       continue
+                   else:
+                       # some model like deepseek-r1-distill-qwen
+                       # will generate <think>...</think> ...
+                       # in gradio, no output will be rendered,
+                       # thus escape html tags in advance
+                       response_content += html.escape(delta["content"])
+                       yield response_content
+
+               yield response_content
+           else:
+               result = model.chat(
+                   messages,
+                   generate_config={
+                       "max_tokens": int(max_tokens),
+                       "temperature": temperature,
+                       "lora_name": lora_name,
+                   },
+               )
+               yield html.escape(result["choices"][0]["message"]["content"])  # type: ignore
 
        return gr.ChatInterface(
            fn=generate_wrapper,
@@ -153,7 +165,9 @@ class GradioInterface:
                gr.Slider(
                    minimum=1,
                    maximum=self.context_length,
-                   value=512
+                   value=512
+                   if "reasoning" not in self.model_ability
+                   else self.context_length // 2,
                    step=1,
                    label="Max Tokens",
                ),
@@ -161,6 +175,7 @@ class GradioInterface:
                    minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
                ),
                gr.Text(label="LoRA Name"),
+               gr.Checkbox(label="Stream", value=True),
            ],
            title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
            css="""

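The new Stream checkbox simply switches `generate_wrapper` between a streaming and a blocking call to `model.chat`. For reference, a minimal sketch of the same streaming pattern against the RESTful client, assuming a server at http://localhost:9997 and a launched chat model with uid "my-llm" (both hypothetical values):

import html

from xinference.client import RESTfulClient

client = RESTfulClient("http://localhost:9997")  # hypothetical endpoint
model = client.get_model("my-llm")               # hypothetical model uid

messages = [{"role": "user", "content": "Hello!"}]
response_content = ""
for chunk in model.chat(
    messages, generate_config={"stream": True, "max_tokens": 512}
):
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        # Escape <think>...</think> and other tags so Gradio-style renderers display them.
        response_content += html.escape(delta["content"])
print(response_content)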
xinference/core/model.py
CHANGED
@@ -231,6 +231,7 @@ class ModelActor(xo.StatelessActor, CancelMixin):
        driver_info: Optional[dict] = None,  # for model across workers
    ):
        super().__init__()
+       from ..model.llm.llama_cpp.core import XllamaCppModel
        from ..model.llm.lmdeploy.core import LMDeployModel
        from ..model.llm.sglang.core import SGLANGModel
        from ..model.llm.transformers.core import PytorchModel
@@ -251,7 +252,8 @@ class ModelActor(xo.StatelessActor, CancelMixin):
        self._lock = (
            None
            if isinstance(
-               self._model,
+               self._model,
+               (PytorchModel, VLLMModel, SGLANGModel, LMDeployModel, XllamaCppModel),
            )
            else asyncio.locks.Lock()
        )

xinference/core/scheduler.py
CHANGED
@@ -97,6 +97,9 @@ class InferenceRequest:
        # check the integrity of args passed upstream
        self._check_args()
 
+       # for reasoning_content using
+       self.previous_texts = [""]
+
    def _check_args(self):
        assert len(self._inference_args) == 1
        # generate config

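The new `previous_texts` buffer gives each scheduled request a place to keep the text decoded so far, so that reasoning-content parsing can work on per-step deltas. A hedged sketch of that bookkeeping in isolation (`request` and `new_full_text` are illustrative names, not the scheduler's actual variables):

def compute_delta(request, new_full_text: str) -> str:
    # The last entry holds everything decoded so far; the slice is the new delta.
    previous = request.previous_texts[-1]
    delta = new_full_text[len(previous):]
    request.previous_texts.append(new_full_text)
    return delta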
xinference/core/worker.py
CHANGED
@@ -1002,7 +1002,7 @@ class WorkerActor(xo.StatelessActor):
            )
        try:
            subpool_address = self._model_uid_to_addr[model_uid]
-           await self._main_pool.remove_sub_pool(subpool_address)
+           await self._main_pool.remove_sub_pool(subpool_address, force=True)
        except Exception as e:
            logger.debug(
                "Remove sub pool failed, model uid: %s, error: %s", model_uid, e

xinference/model/embedding/core.py
CHANGED
@@ -268,7 +268,7 @@ class EmbeddingModel:
        **kwargs,
    ):
        sentences = self._fix_langchain_openai_inputs(sentences)
-
+       model_uid = kwargs.pop("model_uid", None)
        from sentence_transformers import SentenceTransformer
 
        kwargs.setdefault("normalize_embeddings", True)
@@ -546,8 +546,14 @@ class EmbeddingModel:
            # when batching, the attention mask 1 means there is a token
            # thus we just sum up it to get the total number of tokens
            if "clip" in self._model_spec.model_name.lower():
-
-
+               if "input_ids" in features and hasattr(
+                   features["input_ids"], "numel"
+               ):
+                   all_token_nums += features["input_ids"].numel()
+               if "pixel_values" in features and hasattr(
+                   features["pixel_values"], "numel"
+               ):
+                   all_token_nums += features["pixel_values"].numel()
            else:
                all_token_nums += features["attention_mask"].sum().item()
 
@@ -657,7 +663,7 @@ class EmbeddingModel:
                self._model,
                objs,
                convert_to_numpy=False,
-               **
+               **kwargs,
            )
        else:
            all_embeddings, all_token_nums = encode(
@@ -693,7 +699,8 @@ class EmbeddingModel:
                if not is_bge_m3_flag_model and not kwargs.get("return_sparse")
                else "dict"
            ),
-           model=
+           model=model_uid,  # type: ignore
+           model_replica=self._model_uid,
            data=embedding_list,
            usage=usage,
        )

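Together with the restful_api.py change above, the embedding result now reports the requested uid as `model` and moves the internal replica uid to `model_replica`. A minimal client-side sketch, assuming a server at http://localhost:9997 and an embedding model with uid "my-embedding" (both hypothetical):

from xinference.client import RESTfulClient

client = RESTfulClient("http://localhost:9997")   # hypothetical endpoint
model = client.get_model("my-embedding")          # hypothetical model uid
result = model.create_embedding(["hello world"])
print(result["model"])              # the uid that was requested
print(result.get("model_replica"))  # the replica uid, per this change
print(result["usage"])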
xinference/model/llm/__init__.py
CHANGED
@@ -129,7 +129,7 @@ def register_custom_model():
 
 
 def _install():
-    from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel
+    from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel, XllamaCppModel
     from .lmdeploy.core import LMDeployChatModel, LMDeployModel
     from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
     from .sglang.core import SGLANGChatModel, SGLANGModel
@@ -169,6 +169,7 @@ def _install():
        [
            LlamaCppChatModel,
            LlamaCppModel,
+           XllamaCppModel,
        ]
    )
    SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])

xinference/model/llm/core.py
CHANGED
@@ -25,6 +25,7 @@ from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
 from ...core.utils import parse_replica_model_uid
 from ...types import PeftModelConfig
 from ..core import ModelDescription
+from .reasoning_parser import ReasoningParser
 
 if TYPE_CHECKING:
     from .llm_family import LLMFamilyV1, LLMSpecV1
@@ -57,6 +58,7 @@ class LLM(abc.ABC):
        self.model_spec = model_spec
        self.quantization = quantization
        self.model_path = model_path
+       self.reasoning_parser = None
        if args:
            raise ValueError(f"Unrecognized positional arguments: {args}")
        if kwargs:
@@ -117,6 +119,14 @@ class LLM(abc.ABC):
    ) -> bool:
        raise NotImplementedError
 
+   def prepare_parse_reasoning_content(self, reasoning_content):
+       # Initialize reasoning parser if model has reasoning ability
+       if "reasoning" in self.model_family.model_ability and reasoning_content:
+           self.reasoning_parser = ReasoningParser(
+               self.model_family.reasoning_start_tag,
+               self.model_family.reasoning_end_tag,
+           )
+
 
 class LLMDescription(ModelDescription):
     def __init__(

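`prepare_parse_reasoning_content` wires up a `ReasoningParser` from the family's `reasoning_start_tag`/`reasoning_end_tag` whenever the model declares the `reasoning` ability and the option is enabled. The parser itself lives in the renamed `reasoning_parser.py`, which is not shown in this view; a rough sketch of what a tag-based splitter of this kind can look like (an illustration, not the actual implementation):

class TagReasoningParser:
    # Illustrative only; the real ReasoningParser in
    # xinference/model/llm/reasoning_parser.py may differ.
    def __init__(self, start_tag: str = "<think>", end_tag: str = "</think>"):
        self.start_tag = start_tag
        self.end_tag = end_tag

    def extract(self, text: str):
        """Split a completion into (reasoning_content, content)."""
        start = text.find(self.start_tag)
        end = text.find(self.end_tag)
        if start == -1 or end == -1:
            return None, text
        reasoning = text[start + len(self.start_tag): end].strip()
        content = text[end + len(self.end_tag):].lstrip()
        return reasoning, content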
xinference/model/llm/llama_cpp/core.py
CHANGED
@@ -11,11 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import concurrent.futures
 import logging
 import os
+import queue
 import time
 from typing import Dict, Iterator, List, Optional, Union
 
+import orjson
+
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
@@ -32,6 +36,254 @@ from ..utils import DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, ChatModelMixin
 
 logger = logging.getLogger(__name__)
 
+USE_XLLAMACPP = bool(int(os.environ.get("USE_XLLAMACPP", 0)))
+
+
+class _Sentinel:
+    pass
+
+
+class XllamaCppModel(LLM, ChatModelMixin):
+    def __init__(
+        self,
+        model_uid: str,
+        model_family: "LLMFamilyV1",
+        model_spec: "LLMSpecV1",
+        quantization: str,
+        model_path: str,
+        llamacpp_model_config: Optional[LlamaCppModelConfig] = None,
+    ):
+        super().__init__(model_uid, model_family, model_spec, quantization, model_path)
+
+        self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
+            llamacpp_model_config
+        )
+        self._llm = None
+        self._executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
+
+    def _sanitize_model_config(
+        self, llamacpp_model_config: Optional[LlamaCppModelConfig]
+    ) -> LlamaCppModelConfig:
+        if llamacpp_model_config is None:
+            llamacpp_model_config = LlamaCppModelConfig()
+
+        if self.model_family.context_length:
+            llamacpp_model_config.setdefault("n_ctx", self.model_family.context_length)
+        llamacpp_model_config.setdefault("use_mmap", False)
+        llamacpp_model_config.setdefault("use_mlock", True)
+
+        if (
+            "llama-2" in self.model_family.model_name
+            and self.model_spec.model_size_in_billions == 70
+        ):
+            llamacpp_model_config["use_mlock"] = False
+            llamacpp_model_config["n_gqa"] = 8
+
+        if self._is_darwin_and_apple_silicon():
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)
+        elif self._is_linux():
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)
+        llamacpp_model_config.setdefault("reasoning_content", False)
+
+        return llamacpp_model_config
+
+    def _sanitize_generate_config(
+        self, generate_config: Optional[LlamaCppGenerateConfig]
+    ) -> LlamaCppGenerateConfig:
+        if generate_config is None:
+            generate_config = LlamaCppGenerateConfig(
+                **CreateCompletionLlamaCpp().dict()
+            )
+        else:
+            from llama_cpp import LlamaGrammar
+
+            grammar = generate_config.get("grammar")
+            if grammar is not None and not isinstance(grammar, LlamaGrammar):
+                generate_config["grammar"] = LlamaGrammar.from_string(
+                    generate_config["grammar"]
+                )
+            # Validate generate_config and fill default values to the generate config.
+            generate_config = LlamaCppGenerateConfig(
+                **CreateCompletionLlamaCpp(**generate_config).dict()
+            )
+        # Currently, llama.cpp does not support lora
+        generate_config.pop("lora_name", None)  # type: ignore
+        return generate_config
+
+    @classmethod
+    def match(
+        cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
+    ) -> bool:
+        if llm_spec.model_format not in ["ggufv2"]:
+            return False
+        if (
+            "chat" not in llm_family.model_ability
+            and "generate" not in llm_family.model_ability
+        ):
+            return False
+        return True
+
+    def load(self):
+        try:
+            from xllamacpp import CommonParams, Server
+        except ImportError:
+            error_message = "Failed to import module 'xllamacpp'"
+            installation_guide = ["Please make sure 'xllamacpp' is installed. "]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        reasoning_content = self._llamacpp_model_config.pop("reasoning_content")
+        self.prepare_parse_reasoning_content(reasoning_content)
+
+        if os.path.isfile(self.model_path):
+            # mostly passed from --model_path
+            model_path = os.path.realpath(self.model_path)
+        else:
+            # handle legacy cache.
+            model_path = os.path.realpath(
+                os.path.join(
+                    self.model_path,
+                    self.model_spec.model_file_name_template.format(
+                        quantization=self.quantization
+                    ),
+                )
+            )
+            legacy_model_file_path = os.path.join(self.model_path, "model.bin")
+            if os.path.exists(legacy_model_file_path):
+                model_path = legacy_model_file_path
+
+        try:
+            params = CommonParams()
+            params.model = model_path
+            if self.model_family.chat_template:
+                params.chat_template = self.model_family.chat_template
+            # This is the default value, could be overwritten by _llamacpp_model_config
+            params.n_parallel = os.cpu_count()
+            for k, v in self._llamacpp_model_config.items():
+                try:
+                    setattr(params, k, v)
+                except Exception as e:
+                    logger.error("Failed to set the param %s = %s, error: %s", k, v, e)
+            n_threads = self._llamacpp_model_config.get("n_threads", os.cpu_count())
+            params.cpuparams.n_threads = n_threads
+            params.cpuparams_batch.n_threads = n_threads
+            if params.n_gpu_layers == -1:
+                # Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
+                # 0x7FFFFFFF is INT32 max, will be auto set to all layers
+                params.n_gpu_layers = 0x7FFFFFFF
+            self._llm = Server(params)
+            self._executor = concurrent.futures.ThreadPoolExecutor(
+                max_workers=max(10, n_threads)
+            )
+        except AssertionError:
+            raise RuntimeError(f"Load model {self.model_family.model_name} failed")
+
+    def generate(
+        self, prompt: str, generate_config: Optional[LlamaCppGenerateConfig] = None
+    ) -> Union[Completion, Iterator[CompletionChunk]]:
+        generate_config = self._sanitize_generate_config(generate_config)
+        stream = generate_config.get("stream", False)
+        q: queue.Queue = queue.Queue()
+
+        def _handle_completion():
+            # TODO(fyrestone): Replace the LlamaCppGenerateConfig with OpenAI params.
+            data = generate_config
+            data.pop("stopping_criteria", None)
+            data.pop("logits_processor", None)
+            data.pop("suffix", None)
+            data.pop("best_of", None)
+            data.update(
+                {
+                    "prompt": prompt,
+                    "stream": stream,
+                }
+            )
+            prompt_json = orjson.dumps(data)
+
+            def _res_callback(ok):
+                try:
+                    res = orjson.loads(ok)
+                    res["model"] = self.model_uid
+                    q.put(res)
+                except Exception as e:
+                    logger.exception("handle_completions callback failed: %s", e)
+
+            try:
+                self._llm.handle_completions(prompt_json, _res_callback, _res_callback)
+            except Exception as ex:
+                logger.exception("handle_completions failed: %s", ex)
+            q.put(_Sentinel)
+
+        assert self._executor
+        self._executor.submit(_handle_completion)
+
+        if stream:
+
+            def _to_iterator():
+                while (r := q.get()) is not _Sentinel:
+                    yield r
+
+            return _to_iterator()
+        else:
+            return q.get()
+
+    def chat(
+        self,
+        messages: List[Dict],
+        generate_config: Optional[LlamaCppGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        generate_config = self._sanitize_generate_config(generate_config)
+        stream = generate_config.get("stream", False)
+        tools = generate_config.pop("tools", []) if generate_config else None
+        q: queue.Queue = queue.Queue()
+
+        def _handle_chat_completion():
+            # TODO(fyrestone): Replace the LlamaCppGenerateConfig with OpenAI params.
+            data = generate_config
+            data.pop("stopping_criteria", None)
+            data.pop("logits_processor", None)
+            data.pop("suffix", None)
+            data.pop("best_of", None)
+            data.update(
+                {
+                    "messages": messages,
+                    "stream": stream,
+                    "tools": tools,
+                }
+            )
+            prompt_json = orjson.dumps(data)
+
+            def _res_callback(ok):
+                try:
+                    res = orjson.loads(ok)
+                    res["model"] = self.model_uid
+                    q.put(res)
+                except Exception as e:
+                    logger.exception("handle_chat_completions callback failed: %s", e)
+
+            try:
+                self._llm.handle_chat_completions(
+                    prompt_json, _res_callback, _res_callback
+                )
+            except Exception as ex:
+                logger.exception("handle_chat_completions failed: %s", ex)
+            q.put(_Sentinel)
+
+        assert self._executor
+        self._executor.submit(_handle_chat_completion)
+
+        if stream:
+
+            def _to_iterator():
+                while (r := q.get()) is not _Sentinel:
+                    yield r
+
+            return self._to_chat_completion_chunks(
+                _to_iterator(), self.reasoning_parser
+            )
+        else:
+            return self._to_chat_completion(q.get(), self.reasoning_parser)
+
 
 class LlamaCppModel(LLM):
     def __init__(
@@ -76,6 +328,7 @@ class LlamaCppModel(LLM):
            llamacpp_model_config.setdefault("n_gpu_layers", -1)
        elif self._is_linux() and self._can_apply_cublas():
            llamacpp_model_config.setdefault("n_gpu_layers", -1)
+       llamacpp_model_config.setdefault("reasoning_content", False)
 
        return llamacpp_model_config
 
@@ -123,6 +376,9 @@ class LlamaCppModel(LLM):
 
            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
+       reasoning_content = self._llamacpp_model_config.pop("reasoning_content")
+       self.prepare_parse_reasoning_content(reasoning_content)
+
        if os.path.isfile(self.model_path):
            # mostly passed from --model_path
            model_path = os.path.realpath(self.model_path)
@@ -292,10 +548,17 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
        if stream:
            it = self.generate(full_prompt, generate_config)
            assert isinstance(it, Iterator)
-           return self._to_chat_completion_chunks(it)
+           return self._to_chat_completion_chunks(it, self.reasoning_parser)
        else:
            c = self.generate(full_prompt, generate_config)
            assert not isinstance(c, Iterator)
            if tools:
-               return self.
-
+               return self._post_process_completion(
+                   self.model_family, self.model_uid, c, self.reasoning_parser
+               )
+           return self._to_chat_completion(c, self.reasoning_parser)
+
+
+if USE_XLLAMACPP:
+    LlamaCppModel = XllamaCppModel  # type: ignore  # noqa: F811
+    LlamaCppChatModel = XllamaCppModel  # type: ignore  # noqa: F811