xinference 1.9.1__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference has been flagged as potentially problematic.
Files changed (34)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +415 -1
  3. xinference/constants.py +2 -0
  4. xinference/core/supervisor.py +29 -1
  5. xinference/model/audio/core.py +5 -0
  6. xinference/model/audio/kokoro.py +1 -1
  7. xinference/model/audio/kokoro_zh.py +124 -0
  8. xinference/model/audio/model_spec.json +20 -0
  9. xinference/model/embedding/sentence_transformers/core.py +4 -4
  10. xinference/model/embedding/vllm/core.py +7 -1
  11. xinference/model/image/model_spec.json +2 -3
  12. xinference/model/llm/core.py +10 -0
  13. xinference/model/llm/llama_cpp/core.py +1 -0
  14. xinference/model/llm/llm_family.json +40 -20
  15. xinference/model/llm/llm_family.py +1 -0
  16. xinference/model/llm/mlx/core.py +52 -33
  17. xinference/model/llm/sglang/core.py +2 -44
  18. xinference/model/llm/tool_parsers/__init__.py +58 -0
  19. xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
  20. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
  21. xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
  22. xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
  23. xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
  24. xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
  25. xinference/model/llm/transformers/core.py +1 -1
  26. xinference/model/llm/utils.py +127 -45
  27. xinference/model/llm/vllm/core.py +2 -61
  28. xinference/types.py +105 -2
  29. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/METADATA +7 -3
  30. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/RECORD +34 -26
  31. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
  32. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
  33. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
  34. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
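
The hunks reproduced below cover three themes: a new Chinese Kokoro text-to-speech model (xinference/model/audio/kokoro_zh.py plus its model_spec.json entry), a pluggable tool-call parser framework (a new tool_parsers package, a tool_parser field on LLM families, and per-family assignments in llm_family.json), and MLX memory cleanup around generation, with SGLang's bespoke streaming tool-call handling removed in favor of the shared parsers.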
xinference/model/audio/kokoro_zh.py (new file)
@@ -0,0 +1,124 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from io import BytesIO
+from typing import TYPE_CHECKING, Optional
+
+import numpy as np
+
+from ...device_utils import get_available_device, is_device_available
+
+if TYPE_CHECKING:
+    from .core import AudioModelFamilyV2
+
+logger = logging.getLogger(__name__)
+
+REPO_ID = "hexgrad/Kokoro-82M-v1.1-zh"
+
+
+class KokoroZHModel:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: str,
+        model_spec: "AudioModelFamilyV2",
+        device: Optional[str] = None,
+        **kwargs,
+    ):
+        self.model_family = model_spec
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._model_spec = model_spec
+        self._device = device
+        self._model = None
+        self._kwargs = kwargs
+        self._en_pipeline = None
+
+    def _en_callable(self, text):
+        """
+        Fixing the issue of English words being skipped in the Chinese model.
+        From https://hf-mirror.com/hexgrad/Kokoro-82M-v1.1-zh/blob/main/samples/make_zh.py
+        """
+        if text == "Kokoro":
+            return "kˈOkəɹO"
+        elif text == "Sol":
+            return "sˈOl"
+        return next(self._en_pipeline(text)).phonemes
+
+    @property
+    def model_ability(self):
+        return self._model_spec.model_ability
+
+    def load(self):
+        if self._device is None:
+            self._device = get_available_device()
+        else:
+            if not is_device_available(self._device):
+                raise ValueError(f"Device {self._device} is not available!")
+
+        import os
+
+        from kokoro import KModel, KPipeline
+
+        self._en_pipeline = KPipeline(lang_code="a", repo_id=REPO_ID, model=False)
+
+        config_path = os.path.join(self._model_path, "config.json")
+        model_path = os.path.join(self._model_path, "kokoro-v1_1-zh.pth")
+        lang_code = self._kwargs.get("lang_code", "z")
+        logger.info("Launching Kokoro model with language code: %s", lang_code)
+
+        self._model = KPipeline(
+            lang_code=lang_code,
+            model=KModel(config=config_path, model=model_path).to(self._device),
+            repo_id=REPO_ID,
+            en_callable=self._en_callable,
+            device=self._device,
+        )
+
+    def speech(
+        self,
+        input: str,
+        voice: str,
+        response_format: str = "mp3",
+        speed: float = 1.0,
+        stream: bool = False,
+        **kwargs,
+    ):
+        import soundfile
+
+        if stream:
+            raise Exception("Kokoro does not support stream mode.")
+        assert self._model is not None
+        if not voice:
+            voice = "zf_001"
+            logger.info("Auto select speaker: %s", voice)
+        elif voice.endswith(".pt"):
+            logger.info("Using custom voice pt: %s", voice)
+        else:
+            logger.info("Using voice: %s", voice)
+        logger.info("Speech kwargs: %s", kwargs)
+        generator = self._model(text=input, voice=voice, speed=speed, **kwargs)
+        results = list(generator)
+        audio = np.concatenate([r[2] for r in results])
+        # Save the generated audio
+        with BytesIO() as out:
+            with soundfile.SoundFile(
+                out,
+                "w",
+                24000,
+                1,
+                format=response_format.upper(),
+            ) as f:
+                f.write(audio)
+            return out.getvalue()
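
The wrapper follows the same shape as the existing Kokoro model: load() builds a KPipeline on the detected device (routing embedded English words through a secondary English pipeline), and speech() concatenates the generated segments into one audio buffer. Once the model_spec.json entry added below is registered, the model can be driven like any other built-in audio model; a hedged sketch against a running xinference endpoint (the endpoint URL, voice, and output path are illustrative, not taken from the diff):

```python
# Sketch: launch the newly registered Chinese Kokoro model through the client API.
# Assumes a local xinference supervisor is already running at this endpoint.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="Kokoro-82M-v1.1-zh",
    model_type="audio",
)
tts = client.get_model(model_uid)
audio = tts.speech("你好，世界", voice="zf_001", response_format="wav")
with open("hello.wav", "wb") as f:
    f.write(audio)
```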
xinference/model/audio/model_spec.json
@@ -862,6 +862,26 @@
         "model_revision": "master"
       }
     }
+  },
+  {
+    "version": 2,
+    "model_name": "Kokoro-82M-v1.1-zh",
+    "model_family": "Kokoro-zh",
+    "model_ability": [
+      "text2audio",
+      "text2audio_zero_shot"
+    ],
+    "multilingual": false,
+    "model_src": {
+      "huggingface": {
+        "model_id": "hexgrad/Kokoro-82M-v1.1-zh",
+        "model_revision": "01e7505bd6a7a2ac4975463114c3a7650a9f7218"
+      },
+      "modelscope": {
+        "model_id": "AI-ModelScope/Kokoro-82M-v1.1-zh",
+        "model_revision": "master"
+      }
+    }
   },
   {
     "version": 2,
xinference/model/embedding/sentence_transformers/core.py
@@ -265,10 +265,10 @@ class SentenceTransformerEmbeddingModel(EmbeddingModel):
                 "clip" in self.model_family.model_name.lower()
                 or "jina-embeddings-v4" in self.model_family.model_name.lower()
             ):
-                if "input_ids" in features and hasattr(
-                    features["input_ids"], "numel"
-                ):
-                    all_token_nums += features["input_ids"].numel()
+                # support input_ids and text_input_ids
+                for key in ["input_ids", "text_input_ids"]:
+                    if key in features and hasattr(features[key], "numel"):
+                        all_token_nums += features[key].numel()
                 if "pixel_values" in features and hasattr(
                     features["pixel_values"], "numel"
                 ):
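
This generalizes token accounting: processors that emit `text_input_ids` rather than `input_ids` (as some multimodal embedding models do) were previously not counted at all. A standalone illustration of the new counting pattern; the feature dict below is fabricated for the example:

```python
import torch

# Fabricated feature dict: a text batch exposed as "text_input_ids" plus image tensors.
features = {
    "text_input_ids": torch.ones(2, 16, dtype=torch.long),
    "pixel_values": torch.zeros(2, 3, 224, 224),
}

all_token_nums = 0
for key in ["input_ids", "text_input_ids"]:
    if key in features and hasattr(features[key], "numel"):
        all_token_nums += features[key].numel()
print(all_token_nums)  # 32 text tokens counted; pixel_values are handled separately
```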
xinference/model/embedding/vllm/core.py
@@ -13,6 +13,7 @@
 # limitations under the License.

 import importlib.util
+import json
 import logging
 from typing import List, Union

@@ -54,13 +55,18 @@ class VLLMEmbeddingModel(EmbeddingModel):
                 self._kwargs["hf_overrides"].update(
                     is_matryoshka=True,
                 )
+            elif isinstance(self._kwargs["hf_overrides"], str):
+                self._kwargs["hf_overrides"] = json.loads(self._kwargs["hf_overrides"])
+                self._kwargs["hf_overrides"].update(
+                    is_matryoshka=True,
+                )

         self._model = LLM(model=self._model_path, task="embed", **self._kwargs)
         self._tokenizer = self._model.get_tokenizer()

     @staticmethod
     def _get_detailed_instruct(task_description: str, query: str) -> str:
-        return f"Instruct: {task_description}\nQuery:{query}"
+        return f"Instruct: {task_description}\nQuery:{query}"  # noqa: E231

     @cache_clean
     def create_embedding(
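
The extra `elif` handles callers that pass `hf_overrides` as a raw JSON string (for example, forwarded unchanged from a launch request) rather than a dict. A small standalone sketch of the normalization; `normalize_hf_overrides` is a hypothetical helper, not part of the diff:

```python
import json

def normalize_hf_overrides(kwargs: dict) -> dict:
    """Accept hf_overrides as a dict or a JSON string, then force is_matryoshka."""
    overrides = kwargs.get("hf_overrides", {})
    if isinstance(overrides, str):
        overrides = json.loads(overrides)
    overrides.update(is_matryoshka=True)
    kwargs["hf_overrides"] = overrides
    return kwargs

# e.g. a value that arrived through the REST API as a plain string:
print(normalize_hf_overrides({"hf_overrides": '{"max_model_len": 8192}'}))
# {'hf_overrides': {'max_model_len': 8192, 'is_matryoshka': True}}
```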
xinference/model/image/model_spec.json
@@ -824,13 +824,12 @@
         "deepspeed==0.12.3",
         "peft==0.4.0",
         "tiktoken==0.6.0",
-        "bitsandbytes==0.41.0",
-        "scikit-learn==1.2.2",
         "sentencepiece==0.1.99",
         "einops==0.6.1",
         "einops-exts==0.0.4",
         "timm==0.6.13",
-        "numpy==1.26.4"
+        "#system_numpy#",
+        "#system_torch#"
       ]
     },
     "model_src": {
xinference/model/llm/core.py
@@ -27,6 +27,7 @@ from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union
 from ...core.utils import parse_replica_model_uid
 from ...types import PeftModelConfig
 from .reasoning_parser import ReasoningParser
+from .tool_parsers import TOOL_PARSERS

 if TYPE_CHECKING:
     from .llm_family import LLMFamilyV2, LLMSpecV1
@@ -59,6 +60,7 @@ class LLM(abc.ABC):
         self.quantization = model_family.model_specs[0].quantization
         self.model_path = model_path
         self.reasoning_parser = None
+        self.tool_parser = None
         if args:
             raise ValueError(f"Unrecognized positional arguments: {args}")
         if kwargs:
@@ -171,6 +173,14 @@ class LLM(abc.ABC):
             enable_thinking=enable_thinking,
         )

+    def prepare_parse_tool_calls(self):
+        if self.model_family.tool_parser is None:
+            return
+        if self.model_family.tool_parser not in TOOL_PARSERS:
+            return
+        tool_parser = TOOL_PARSERS[self.model_family.tool_parser]
+        self.tool_parser = tool_parser()
+

 # Context variable for passing per-request chat context (e.g., chat_template_kwargs).
 # This variable should be set at the beginning of each chat or stream_chat call.
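
prepare_parse_tool_calls() mirrors the existing reasoning-parser wiring: the family's `tool_parser` string is looked up in the `TOOL_PARSERS` registry exported by the new tool_parsers package, and a parser instance is attached to the model. A hedged sketch of how a name-keyed registry like this typically works; the decorator and toy parser below are illustrative, not the shipped implementations:

```python
from typing import Callable, Dict

TOOL_PARSERS: Dict[str, type] = {}  # illustrative registry keyed by the tool_parser name

def register_tool_parser(name: str) -> Callable[[type], type]:
    def wrapper(cls: type) -> type:
        TOOL_PARSERS[name] = cls
        return cls
    return wrapper

@register_tool_parser("qwen")
class QwenToolParser:
    """Toy stand-in; the real parser extracts tool-call blocks from model output."""
    def extract_tool_calls(self, model_output: str):
        return []  # placeholder behavior

# What prepare_parse_tool_calls effectively does for a family with tool_parser="qwen":
parser_cls = TOOL_PARSERS.get("qwen")
tool_parser = parser_cls() if parser_cls else None
```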
xinference/model/llm/llama_cpp/core.py
@@ -122,6 +122,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()

         if os.path.isfile(self.model_path):
             # mostly passed from --model_path
xinference/model/llm/llm_family.json
@@ -1008,7 +1008,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -1070,7 +1071,8 @@
       "<|end_of_text|>",
       "<|eot_id|>",
       "<|eom_id|>"
-    ]
+    ],
+    "tool_parser": "llama3"
   },
   {
     "version": 2,
@@ -1133,7 +1135,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -1946,7 +1949,8 @@
       "<|im_end|>"
     ],
     "reasoning_start_tag": "<think>",
-    "reasoning_end_tag": "</think>"
+    "reasoning_end_tag": "</think>",
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -2209,7 +2213,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -5772,7 +5777,8 @@
       "<|end▁of▁sentence|>"
     ],
     "reasoning_start_tag": "<think>",
-    "reasoning_end_tag": "</think>"
+    "reasoning_end_tag": "</think>",
+    "tool_parser": "deepseek_r1"
   },
   {
     "version": 2,
@@ -6620,7 +6626,8 @@
     ],
     "stop": [
       "<|end▁of▁sentence|>"
-    ]
+    ],
+    "tool_parser": "deepseek_v3"
   },
   {
     "version": 2,
@@ -7920,7 +7927,8 @@
       "<|endoftext|>",
       "<|user|>",
       "<|observation|>"
-    ]
+    ],
+    "tool_parser":"glm4"
   },
   {
     "version": 2,
@@ -8027,7 +8035,8 @@
       "<|endoftext|>",
       "<|user|>",
       "<|observation|>"
-    ]
+    ],
+    "tool_parser":"glm4"
   },
   {
     "version": 2,
@@ -9189,7 +9198,8 @@
       "<|end_of_text|>",
       "<|eot_id|>",
       "<|eom_id|>"
-    ]
+    ],
+    "tool_parser": "llama3"
   },
   {
     "version": 2,
@@ -11918,7 +11928,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -11981,7 +11992,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -12705,7 +12717,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -12826,7 +12839,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -14008,7 +14022,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -15518,7 +15533,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -17428,7 +17444,8 @@
        "mlx-lm>=0.24.0 ; sys_platform=='darwin'",
        "#system_numpy#"
      ]
-    }
+    },
+    "tool_parser": "qwen"
   },
   {
     "version": 2,
@@ -18043,7 +18060,8 @@
       "<|endoftext|>",
       "<|im_start|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -18655,7 +18673,8 @@
       "<|im_end|>"
     ],
     "reasoning_start_tag": "<think>",
-    "reasoning_end_tag": "</think>"
+    "reasoning_end_tag": "</think>",
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
@@ -19438,7 +19457,8 @@
     "stop": [
       "<|endoftext|>",
       "<|im_end|>"
-    ]
+    ],
+    "tool_parser":"qwen"
   },
   {
     "version": 2,
xinference/model/llm/llm_family.py
@@ -154,6 +154,7 @@ class LLMFamilyV2(BaseModel, ModelInstanceInfoMixin):
     reasoning_end_tag: Optional[str]
     cache_config: Optional[dict]
     virtualenv: Optional[VirtualEnvSettings]
+    tool_parser: Optional[str]

     class Config:
         extra = "allow"
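
The new optional field is what the llm_family.json assignments above populate, and the same key can be set when registering a custom model family to opt a fine-tune into one of the shipped parsers. An illustrative fragment (the model name and abilities are made up; only the fields relevant to tool parsing are shown):

```python
import json

custom_family = {
    "version": 2,
    "model_name": "my-qwen-finetune",      # hypothetical custom model
    "model_ability": ["chat", "tools"],
    "stop": ["<|endoftext|>", "<|im_start|>", "<|im_end|>"],
    "reasoning_start_tag": "<think>",
    "reasoning_end_tag": "</think>",
    "tool_parser": "qwen",                 # qwen, llama3, glm4, deepseek_r1 or deepseek_v3
}
print(json.dumps(custom_family, indent=2, ensure_ascii=False))
```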
xinference/model/llm/mlx/core.py
@@ -148,6 +148,16 @@ class MLXModel(LLM):
         # to call aynsc method with asyncio.run_coroutine_threadsafe
         self._loop = loop  # type: ignore

+    def _cleanup_memory(self):
+        import gc
+
+        import mlx.core as mx
+
+        # mandatory recycling
+        gc.collect()
+        # clear the MLX cache
+        mx.clear_cache()
+
     @property
     def driver_info(self) -> Optional[dict]:
         return self._driver_info
@@ -333,6 +343,7 @@ class MLXModel(LLM):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()

         kwargs = {}
         kwargs["revision"] = self._model_config.get(
@@ -458,14 +469,18 @@ class MLXModel(LLM):
             repetition_penalty=kwargs.pop("repetition_penalty"),
             repetition_context_size=kwargs.pop("repetition_context_size"),
         )
-        yield from stream_generate(
-            self._model,
-            self._tokenizer,
-            prompt_token_ids,
-            sampler=sampler,
-            logits_processors=logits_processors,
-            **kwargs,
-        )
+        try:
+            yield from stream_generate(
+                self._model,
+                self._tokenizer,
+                prompt_token_ids,
+                sampler=sampler,
+                logits_processors=logits_processors,
+                **kwargs,
+            )
+        finally:
+            # after completing the inference, clear the memory.
+            self._cleanup_memory()

     def _prepare_inputs(
         self, prompt: Union[str, Dict[str, Any]], kwargs
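
Wrapping the generator in try/finally guarantees the cleanup runs even when the caller abandons the stream early (client disconnect, exception in the consumer), since closing a generator raises GeneratorExit at the suspended yield. A reduced sketch of the pattern; the token source below is a stand-in for stream_generate, not the MLX backend itself:

```python
import gc

def generate_with_cleanup(tokens):
    """Always release caches when the stream ends, however it ends."""
    try:
        for t in tokens:            # stand-in for mlx_lm's stream_generate(...)
            yield t
    finally:
        gc.collect()                # mandatory recycling
        try:
            import mlx.core as mx
            mx.clear_cache()        # release MLX's cached buffers
        except ImportError:
            pass                    # mlx not installed on this platform

gen = generate_with_cleanup(range(10))
print(next(gen))                    # 0
gen.close()                         # GeneratorExit -> finally -> cleanup still runs
```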
@@ -755,7 +770,7 @@ class MLXChatModel(MLXModel, ChatModelMixin):
         assert not isinstance(c, Iterator)
         if tools:
             return self._post_process_completion(
-                self.model_family, self.model_uid, c, self.reasoning_parser
+                self.model_family, self.model_uid, c
             )
         return self._to_chat_completion(c, self.reasoning_parser)

@@ -831,18 +846,32 @@ class MLXVisionModel(MLXModel, ChatModelMixin):

         detokenizer.reset()
         tic = time.perf_counter()
-        for n, (token, logprobs) in enumerate(
-            generate_step(input_ids, self._model, pixel_values, mask, **kwargs),
-        ):
-            if n == 0:
-                prompt_time = time.perf_counter() - tic
-                prompt_tps = len(input_ids) / prompt_time
-                tic = time.perf_counter()
-            if token == tokenizer.eos_token_id:
-                break
-            detokenizer.add_token(token)
+        try:
+            for n, (token, logprobs) in enumerate(
+                generate_step(input_ids, self._model, pixel_values, mask, **kwargs),
+            ):
+                if n == 0:
+                    prompt_time = time.perf_counter() - tic
+                    prompt_tps = len(input_ids) / prompt_time
+                    tic = time.perf_counter()
+                if token == tokenizer.eos_token_id:
+                    break
+                detokenizer.add_token(token)
+
+                # Yield the last segment if streaming
+                yield GenerationResponse(
+                    text=detokenizer.last_segment,
+                    token=token,
+                    logprobs=logprobs,
+                    from_draft=False,
+                    prompt_tokens=len(input_ids),
+                    prompt_tps=prompt_tps,
+                    generation_tokens=n + 1,
+                    generation_tps=(n + 1) / (time.perf_counter() - tic),
+                    peak_memory=mx.metal.get_peak_memory() / 1e9,
+                )

-            # Yield the last segment if streaming
+            detokenizer.finalize()
             yield GenerationResponse(
                 text=detokenizer.last_segment,
                 token=token,
@@ -854,19 +883,9 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
                 generation_tps=(n + 1) / (time.perf_counter() - tic),
                 peak_memory=mx.metal.get_peak_memory() / 1e9,
             )
-
-        detokenizer.finalize()
-        yield GenerationResponse(
-            text=detokenizer.last_segment,
-            token=token,
-            logprobs=logprobs,
-            from_draft=False,
-            prompt_tokens=len(input_ids),
-            prompt_tps=prompt_tps,
-            generation_tokens=n + 1,
-            generation_tps=(n + 1) / (time.perf_counter() - tic),
-            peak_memory=mx.metal.get_peak_memory() / 1e9,
-        )
+        finally:
+            # after completing the inference, clear the memory
+            self._cleanup_memory()

     def _prepare_inputs(
         self, prompt: Union[str, Dict[str, Any]], kwargs
xinference/model/llm/sglang/core.py
@@ -175,6 +175,7 @@ class SGLANGModel(LLM):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()

         # Fix: GH#2169
         if sgl.__version__ >= "0.2.14":
@@ -646,49 +647,6 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
     def is_tool_call_chunk_end(chunk):
         return chunk["choices"][0]["text"].endswith(QWEN_TOOL_CALL_SYMBOLS[1])

-    async def _async_to_tool_completion_chunks(
-        self,
-        chunks: AsyncGenerator[CompletionChunk, None],
-    ) -> AsyncGenerator[ChatCompletionChunk, None]:
-        i = 0
-        previous_texts = [""]
-        tool_call = False
-        tool_call_texts = [""]
-        if self.reasoning_parser:
-            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
-        async for chunk in chunks:
-            if i == 0:
-                for first_chunk in self._get_first_chat_completion_chunk(
-                    chunk, self.reasoning_parser
-                ):
-                    yield first_chunk
-            # usage
-            choices = chunk.get("choices")
-            if not choices:
-                yield self._get_final_chat_completion_chunk(chunk)
-            else:
-                if self.is_tool_call_chunk_start(chunk):
-                    tool_call = True
-                if tool_call:
-                    tool_call_text = tool_call_texts[-1]
-                    tool_call_text += chunk["choices"][0]["text"]
-                    tool_call_texts.append(tool_call_text)
-                    if self.is_tool_call_chunk_end(chunk):
-                        yield self._post_process_completion_chunk(
-                            self.model_family,
-                            self.model_uid,
-                            chunk,
-                            reasoning_parser=self.reasoning_parser,
-                            tool_call_text=tool_call_text,
-                        )
-                        tool_call = False
-                        tool_call_texts = [""]
-                else:
-                    yield self._to_chat_completion_chunk(
-                        chunk, self.reasoning_parser, previous_texts
-                    )
-            i += 1
-
     async def async_chat(
         self,
         messages: List[Dict],
@@ -731,7 +689,7 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         assert not isinstance(c, AsyncGenerator)
         if tools:
             return self._post_process_completion(
-                self.model_family, self.model_uid, c, self.reasoning_parser
+                self.model_family, self.model_uid, c
             )
         return self._to_chat_completion(c, self.reasoning_parser)

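
With the SGLang-specific streaming tool-call path deleted, tool-call extraction for SGLang (like llama.cpp and MLX above) goes through the shared per-family tool parsers, so a tools-enabled chat looks the same from the client regardless of backend. A hedged usage sketch; the endpoint, model uid, and tool schema are illustrative, not taken from the diff:

```python
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model = client.get_model("qwen2.5-instruct")   # uid of an already-launched chat model

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Look up the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

response = model.chat(
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
)
# If the family's tool_parser recognizes a call, it surfaces as tool_calls:
print(response["choices"][0]["message"].get("tool_calls"))
```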